2
subroutines that operates on trees, inserted into other programs
3
such as baseml, basemlg, codeml, and pamp.
6
extern char BASEs[], *EquateBASE[], AAs[], BINs[], CODONs[][4], nChara[], CharaMap[][64];
16
#define RECONSTRUCTION
26
#define RECONSTRUCTION
46
#define EqPartition(p1,p2,ns) (p1==p2||p1+p2+1==(1<<ns))
50
int hasbase (char *str)
52
char *p=str, *eqdel=".-?";
54
if (*p==eqdel[0] || *p==eqdel[1] || *p==eqdel[2] || isalpha(*p++))
60
int GetSeqFileType(FILE *fseq, int *paupseq);
61
int IdenticalSeqs(void);
62
void RemoveEmptySequences(void);
64
int GetSeqFileType(FILE *fseq, int *format)
66
/* paupstart="begin data" and paupend="matrix" identify paup seq files.
69
int lline=1000, ch, aligned;
70
char fastastarter='>';
71
char line[1000], *paupstart="begin data",*paupend="matrix", *p;
72
char *ntax="ntax",*nchar="nchar";
74
while (isspace(ch=fgetc(fseq)))
77
if(ch == fastastarter) {
79
ScanFastaFile(fseq, &com.ns, &com.ls, &aligned);
83
error2("The seq file appears to be in fasta format, but not aligned?");
85
if(fscanf(fseq,"%d%d", &com.ns, &com.ls)==2) {
86
*format = 0; return(0);
89
printf("\nseq file is not paml/phylip format. Trying nexus format.");
92
if(fgets(line,lline,fseq)==NULL) error2("seq err1: EOF");
94
if(strstr(line,paupstart)) break;
97
if(fgets(line,lline,fseq)==NULL) error2("seq err2: EOF");
99
if((p=strstr(line,ntax))!=NULL) {
100
while (*p != '=') { if(*p==0) error2("seq err"); p++; }
101
sscanf(p+1,"%d", &com.ns);
102
if((p=strstr(line,nchar))==NULL) error2("expect nchar");
103
while (*p != '=') { if(*p==0) error2("expect ="); p++; }
104
sscanf(p+1,"%d", &com.ls);
108
/* printf("\nns: %d\tls: %d\n", com.ns, com.ls); */
110
if(fgets(line,lline,fseq)==NULL) error2("seq err1: EOF");
112
if (strstr(line,paupend)) break;
117
int PopupComment(FILE *fseq)
119
int ch, comment1=']';
122
if(ch==EOF) error2("expecting ]");
123
if(ch==comment1) break;
124
if(noisy) putchar(ch);
130
int ReadSeq (FILE *fout, FILE *fseq, int cleandata)
132
/* read in sequence, translate into protein (CODON2AAseq), and
133
This counts ngene but does not initialize lgene[].
134
It also codes (transforms) the sequences.
135
com.seqtype: 0=nucleotides; 1=codons; 2:AAs; 3:CODON2AAs; 4:BINs
136
com.pose[] is used to store gene or site-partition labels.
137
ls/3 gene marks for codon sequences.
139
A:alpha given. K:kappa given
140
G:many genes, I:interlaved format
142
Use cleandata=1 to clean up ambiguities. In return, com.cleandata=1 if the
143
data are clean or are cleaned, and com.cleandata=0 is the data are unclean.
145
char *p,*p1, eq='.', comment0='[', *line;
146
int format=0; /* 0: paml/phylip, 1: fasta; 2: paup/nexus */
147
int i,j,k, ch, noptline=0, lspname=LSPNAME, miss=0, nb;
148
int lline=10000,lt[NS], igroup, Sequential=1,basecoding=0;
149
int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);
150
int gap=(n31==3?3:10), nchar=(com.seqtype==AAseq?20:4);
152
char *pch=((com.seqtype<=1||com.seqtype==CODON2AAseq)?BASEs:(com.seqtype==2?AAs:BINs));
156
str[0]=0; h=-1; b[0]=-1; /* avoid warning */
158
if (com.seqtype==4) error2("seqtype==BINs, check with author");
159
if (noisy>=9 && (com.seqtype<=CODONseq||com.seqtype==CODON2AAseq)) {
160
puts("\n\nAmbiguity character definition table:\n");
161
for(i=0; i<(int)strlen(BASEs); i++) {
162
nb = strlen(EquateBASE[i]);
163
printf("%c (%d): ", BASEs[i], nb);
164
for(j=0; j<nb; j++) printf("%c ", EquateBASE[i][j]);
168
GetSeqFileType(fseq, &format);
170
if (com.ns>NS) error2("too many sequences.. raise NS?");
172
printf ("\n%d nucleotides, not a multiple of 3!", com.ls); exit(-1);
174
if (noisy) printf ("\nns = %d \tls = %d\n", com.ns, com.ls);
176
for(j=0; j<com.ns; j++) {
177
if(com.spname[j]) free(com.spname[j]);
178
com.spname[j] = (char*)malloc((lspname+1)*sizeof(char));
179
for(i=0; i<lspname+1; i++) com.spname[j][i]=0;
180
if((com.z[j]=(char*)realloc(com.z[j],com.ls*sizeof(char))) == NULL)
183
com.rgene[0] = 1; com.ngene = 1;
184
lline = max2(lline, com.ls/n31*(n31+1)+lspname+50);
185
if((line=(char*)malloc(lline*sizeof(char))) == NULL) error2("oom line");
189
if(!fgets(line,lline,fseq)) error2("ReadSeq: first line");
190
com.readpattern = (strchr(line,'P') || strchr(line,'p'));
192
if(!com.readpattern) {
193
if((com.pose=(int*)realloc(com.pose, com.ls/n31*sizeof(int)))==NULL)
195
for(j=0; j<com.ls/n31; j++) com.pose[j]=0; /* gene #1, default */
198
if(com.pose) free(com.pose);
201
if(format) goto readseq;
203
for (j=0; j<lline && line[j] && line[j]!='\n'; j++) {
204
if (!isalnum(line[j])) continue;
205
line[j]=(char)toupper(line[j]);
207
case 'G': noptline++; break;
208
case 'C': basecoding=1; break;
209
case 'S': Sequential=1; break;
210
case 'I': Sequential=0; break;
211
case 'P': break; /* already dealt with. */
213
printf ("\nBad option '%c' in first line of seqfile\n", line[j]);
217
if (strchr(line,'C')) { /* protein-coding DNA sequences */
218
if(com.seqtype==2) error2("option C?");
220
if (com.ls%3!=0 || noptline<1) error2("option C?");
222
for(i=0;i<3;i++) com.lgene[i]=com.ls/3;
223
#if(defined(BASEML) || defined(BASEMLG))
226
error2("partterns for coding sequences (G C P) not implemented.");
228
for (i=0;i<com.ls;i++) com.pose[i]=(char)(i%3);
236
for(j=0; j<noptline; j++) {
238
ch = (char)fgetc(fseq);
241
if(isalnum(ch)) break;
244
ch = (char)toupper(ch);
247
if(basecoding) error2("Error in sequence data file: incorrect option format, use GC?\n");
248
if (fscanf(fseq,"%d",&com.ngene)!=1) error2("expecting #gene here..");
249
if (com.ngene>NGENE) error2("raise NGENE?");
251
fgets(line,lline,fseq);
252
if (!blankline(line)) { /* #sites in genes on the 2nd line */
253
for (i=0,p=line; i<com.ngene; i++) {
254
while (*p && !isalnum(*p)) p++;
255
if (sscanf(p,"%d",&com.lgene[i])!=1) break;
256
while (*p && isalnum(*p)) p++;
258
/* if ngene is large and some lgene is on the next line */
259
for (; i<com.ngene; i++)
260
if (fscanf(fseq,"%d", &com.lgene[i])!=1) error2("EOF at lgene");
263
for(i=0,k=0; i<com.ngene; k+=com.lgene[i],i++)
264
for(j=0; j<com.lgene[i]; j++)
267
for(i=0,k=0; i<com.ngene; i++)
270
matIout(F0, com.lgene, 1, com.ngene);
271
printf("\n%6d != %d", com.ls/n31, k);
272
puts("\nOption G: total length over genes is not correct");
274
puts("Note: gene length is in number of codons.");
276
puts("Sequence length in number of nucleotides.");
280
else { /* site marks on later line(s) */
282
error2("option PG: use number of patterns in each gene and not site marks");
283
for(k=0; k<com.ls/n31; ) {
284
if (com.ngene>9) fscanf(fseq,"%d", &ch);
286
do ch=fgetc(fseq); while (!isdigit(ch));
287
ch=ch-(int)'1'+1; /* assumes 1,2,...,9 are consecutive */
289
if (ch<1 || ch>com.ngene)
290
{ printf("\ngene mark %d at %d?\n", ch, k+1); exit (-1); }
293
if(!fgets(line,lline,fseq)) error2("sequence file, gene marks");
297
printf ("Bad option '%c' in option lines in seqfile\n", line[0]);
304
if (Sequential) { /* sequential */
305
if (noisy) printf ("Reading sequences, sequential format..\n");
306
for (j=0; j<com.ns; j++) {
308
for (i=0; i<2*lspname; i++) line[i]='\0';
309
if (!fgets (line, lline, fseq)) error2("EOF?");
310
if (blankline(line)) {
311
if (PopEmptyLines (fseq, lline, line))
312
{ printf("error in sequence data file: empty line (seq %d)\n",j+1); exit(-1); }
314
p = line+(line[0]=='=' || line[0]=='>') ;
315
while(isspace(*p)) p++;
316
if ((ch=strstr(p," ")-p)<lspname && ch>0) lspname=ch;
317
strncpy (com.spname[j], p, lspname);
318
k = strlen(com.spname[j]);
319
p += (k<lspname?k:lspname);
321
for (; k>0; k--) /* trim spaces */
322
if (!isgraph(com.spname[j][k])) com.spname[j][k]=0;
325
if (noisy>=2) printf ("Reading seq #%2d: %s \n", j+1, com.spname[j]);
326
for (k=0; k<com.ls; p++) {
327
while (*p=='\n' || *p=='\0') {
328
p=fgets(line, lline, fseq);
330
{ printf("\nEOF at site %d, seq %d\n", k+1,j+1); exit(-1); }
332
*p = (char)toupper(*p);
333
if((com.seqtype==BASEseq || com.seqtype==CODONseq) && *p=='U')
335
p1 = strchr(pch, *p);
336
if (p1 && p1-pch>=nchar)
339
if (j==0) error2("Error in sequence data file: . in 1st seq.?");
340
com.z[j][k] = com.z[0][k]; k++;
344
else if (isalpha(*p)) {
345
printf("\nError in sequence data file: %c at %d seq %d.\n",*p,k+1,j+1);
346
puts("Perhaps you did not separate the sequence from its name by >2 spaces?");
349
else if (*p == (char)EOF) error2("EOF?");
351
if(strchr(p,'\n')==NULL) /* pop up line return */
352
while((ch=fgetc(fseq))!='\n' && ch!=EOF) ;
353
} /* for (j,com.ns) */
355
else { /* interlaved */
356
if (noisy) printf ("Reading sequences, interlaved format..\n");
357
FOR (j, com.ns) lt[j]=0; /* temporary seq length */
358
for (igroup=0; ; igroup++) {
360
printf ("\nreading block %d ", igroup+1); matIout(F0,lt,1,com.ns);*/
362
FOR (j, com.ns) if (lt[j]<com.ls) break;
363
if (j==com.ns) break;
365
if (!fgets(line,lline,fseq)) {
366
printf("\nerr reading site %d, seq %d group %d\nsites read in each seq:",
367
lt[j]+1,j+1,igroup+1);
370
if (!hasbase(line)) {
372
printf ("\n%d, seq %d group %d", lt[j]+1, j+1, igroup+1);
373
error2("empty line.");
376
if (PopEmptyLines(fseq,lline,line)==-1) {
377
printf ("\n%d, seq %d group %d", lt[j]+1, j+1, igroup+1);
384
while(isspace(*p)) p++;
385
if ((ch=strstr(p," ")-p)<lspname && ch>0) lspname=ch;
386
strncpy (com.spname[j], p, lspname);
387
k=strlen(com.spname[j]);
388
p+=(k<lspname?k:lspname);
390
for (; k>0; k--) /* trim spaces */
391
if (!isgraph(com.spname[j][k])) com.spname[j][k]=0;
393
if(noisy>=2) printf("Reading seq #%2d: %s \r",j+1,com.spname[j]);
395
for (; *p && *p!='\n'; p++) {
396
if (lt[j]==com.ls) break;
397
*p=(char)toupper(*p);
398
if((com.seqtype==BASEseq || com.seqtype==CODONseq) && *p=='U')
401
if (p1 && p1-pch>=nchar)
405
printf("err: . in 1st seq, group %d.\n",igroup);
408
com.z[j][lt[j]] = com.z[0][lt[j]];
412
com.z[j][lt[j]++]=*p;
413
else if (isalpha(*p)) {
414
printf("\nerr:%c at %d seq %d block %d.",
415
*p,lt[j]+1,j+1,igroup+1);
418
else if (*p==(char)EOF) error2("EOF");
420
} /* for (j,com.ns) */
423
printf("\nblock %3d:", igroup+1);
424
for(j=0;j<com.ns;j++) printf(" %6d",lt[j]);
433
else if (cleandata) { /* forced removal of ambiguity characters */
434
if(noisy>2) puts("\nSites with gaps or missing data are removed.");
436
fprintf(fout,"\nBefore deleting alignment gaps\n");
437
fprintf(fout, " %6d %6d\n", com.ns, com.ls);
438
printsma(fout,com.spname,com.z,com.ns,com.ls,com.ls,gap,com.seqtype,0,0,NULL);
441
if(fout) fprintf(fout,"\nAfter deleting gaps. %d sites\n",com.ls);
444
if(fout && !com.readpattern) {/* verbose=1, listing sequences again */
445
fprintf(fout, " %6d %6d\n", com.ns, com.ls);
446
printsma(fout,com.spname,com.z,com.ns,com.ls,com.ls,gap,com.seqtype,0,0,NULL);
449
if(n31==3) com.ls/=n31;
451
/* IdenticalSeqs(); */
454
if(com.seqtype==1 && com.verbose) Get4foldSites();
456
if(com.seqtype==CODON2AAseq) {
457
if (noisy>2) puts("\nTranslating into AA sequences\n");
458
for(j=0; j<com.ns; j++) {
459
if (noisy>2) printf("Translating sequence %d\n",j+1);
460
DNA2protein(com.z[j], com.z[j], com.ls,com.icode);
465
fputs("\nTranslated AA Sequences\n",fout);
466
fprintf(fout,"%4d %6d",com.ns,com.ls);
467
printsma(fout,com.spname,com.z,com.ns,com.ls,com.ls,10,com.seqtype,0,0,NULL);
472
#if (defined CODEML || defined BASEML)
473
if(com.ngene==1 && com.Mgene==1) com.Mgene=0;
474
if(com.ngene>1 && com.Mgene==1 && com.verbose) printSeqsMgenes ();
476
if(com.bootstrap) { BootstrapSeq("boot.txt"); exit(0); }
479
if(noisy>=2) printf ("\nSequences read..\n");
481
puts("no sites. Got nothing to do");
485
#if (defined MCMCTREE)
486
/* Check and remove empty sequences. */
489
RemoveEmptySequences();
495
else { /* read pattern counts */
497
if((com.fpatt=(double*)realloc(com.fpatt, com.npatt*sizeof(double))) == NULL)
499
for(h=0,lst=0; h<com.npatt; h++) {
500
fscanf(fseq, "%lf", &com.fpatt[h]);
502
if(com.fpatt[h]<0 || com.fpatt[h]>1e6)
503
printf("fpatth[%d] = %.6g\n", h+1, com.fpatt[h]);
507
if(noisy) printf("\n%d site patterns read, %d sites\n", com.npatt, com.ls);
510
com.lgene[0] = com.ls;
512
com.posG[1] = com.npatt;
515
for(j=0,com.posG[0]=0; j<com.ngene; j++)
516
com.posG[j+1] = com.posG[j] + com.lgene[j];
518
for(j=0; j<com.ngene; j++) {
519
com.lgene[j] = (j==0 ? 0 : com.lgene[j-1]);
520
for(h=com.posG[j]; h<com.posG[j+1]; h++)
521
com.lgene[j] += (int)com.fpatt[h];
529
fprintf(fout,"\nPrinting out site pattern counts\n\n");
536
void RemoveEmptySequences(void)
538
/* this removes empty sequences (? or - only) and adjust com.ns
543
for(j=0; j<com.ns; j++) {
545
for(h=0; h<com.ls*(com.seqtype==1?3:1); h++)
546
if(com.z[j][h] != '?' && com.z[j][h] != '-') {
551
for(j=0,nsnew=0; j<com.ns; j++) {
553
printf("seq #%3d: %-30s is removed\n", j+1, com.spname[j]);
558
com.z[nsnew] = com.z[j];
559
com.spname[nsnew] = com.spname[j];
562
for(j=nsnew; j<com.ns; j++) {
564
com.spname[j] = NULL;
570
int printPatterns(FILE *fout)
572
int j,h, n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);
573
int gap=(n31==3?3:10), nchar=(com.seqtype==AAseq?20:4);
575
fprintf(fout,"\n%10d %10d P", com.ns, com.npatt*n31);
577
fprintf (fout," G\nG %d ", com.ngene);
578
for(j=0; j<com.ngene; j++)
579
fprintf(fout,"%7d", com.posG[j+1]-com.posG[j]);
583
if(com.seqtype==1 && com.cleandata) {
584
; /* nothing is printed out for yn00, as the coding is different. */
585
#if(defined CODEML || defined YN00)
586
printsmaCodon (fout, com.z, com.ns, com.npatt, com.npatt, 1);
590
printsma(fout,com.spname,com.z,com.ns, com.npatt,com.npatt, gap, com.seqtype, 1, 0, NULL);
593
for(h=0; h<com.npatt; h++) {
594
fprintf(fout," %4.0f", com.fpatt[h]);
595
if((h+1)%15==0) FPN(fout);
597
fprintf(fout, "\n\n");
604
void EncodeSeqs (void)
606
/* This encodes sequences and set up com.TipMap[][], called after sites are collapsed
609
For codonml, codons are coded into 0, 1, ..., 60 for the universal code.
610
For yn00, codons are coded into 0, 1, ..., 63 for the universal code.
611
This does not look like a good idea, and perhaps should be changed.
613
int n=com.ncode, nA, is,h, i, j, k,ic, indel=0, ch, b[3];
614
char *pchar = ((com.seqtype==0||com.seqtype==1) ? BASEs : (com.seqtype==2 ? AAs : BINs));
615
unsigned char c[4]="", str[4]=" ";
617
if(com.seqtype==0 || com.seqtype==2) {
618
for(is=0; is<com.ns; is++) {
619
for (h=0; h<com.npatt; h++) {
621
k = strchr(pchar, ch) - pchar;
623
printf("strange character %c in seq %d site %d\n", ch, is+1, h+1);
630
#if (defined CODEML || defined YN00)
631
else if(com.seqtype==1) {
632
/* collect all observed codons into CODONs */
633
memset(&CODONs[0][0], 0, 256*4*sizeof(char));
634
for(nA=0; nA<n; nA++) {
635
ic=FROM61[nA]; b[0]=ic/16; b[1]=(ic/4)%4; b[2]=ic%4;
636
for(i=0; i<3; i++) CODONs[nA][i] = BASEs[b[i]];
638
for(j=0,nA=n; j<com.ns; j++) {
639
for(h=0; h<com.npatt; h++) {
641
c[k] = com.z[j][h*3+k];
642
b[k] = strchr(BASEs,c[k]) - BASEs;
643
if(b[k]<0) printf("strange nucleotide %c in seq %d\n", c[k], j+1);
645
if(b[0]<4 && b[1]<4 && b[2]<4) {
646
k = FROM64[b[0]*16+b[1]*4+b[2]];
648
printf("\nstop codon %s in seq #%2d: %s\n", c, j+1,com.spname[j]);
652
else { /* an ambiguous codon */
654
if(strcmp(CODONs[k], c) == 0) break;
658
error2("too many ambiguity codons in the data. Contact author");
659
strcpy(CODONs[nA-1], c);
661
com.z[j][h] = (unsigned char)k;
663
com.z[j] = (unsigned char*)realloc(com.z[j], com.npatt);
666
printf("%d ambiguous codons are seen in the data:\n", nA - n);
667
for(k=n; k<nA; k++) printf("%4s", CODONs[k]);
675
void SetMapAmbiguity (void)
677
/* This sets up CharaMap, the map from the ambiguity characters to resolved characters.
679
int n=com.ncode, i,j, i0,i1,i2, nb[3], ib[3][4], ic;
680
char *pchar = (com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs : BINs));
687
if(com.seqtype==0 || com.seqtype==2) {
688
for(j=n,pchar+=n; *pchar; j++,pchar++) {
690
nChara[j] = strlen(EquateBASE[j]);
691
for(i=0; i<nChara[j]; i++)
692
CharaMap[j][i] = (char)(strchr(BASEs, EquateBASE[j][i]) - BASEs);
702
else if(com.seqtype==1) {
703
for(j=n; j<256 && CODONs[j][0]; j++) {
706
NucListall(CODONs[j][i], &nb[i], ib[i]);
707
for(i0=0; i0<nb[0]; i0++) {
708
for(i1=0; i1<nb[1]; i1++)
709
for(i2=0; i2<nb[2]; i2++) {
710
ic = ib[0][i0]*16+ib[1][i1]*4+ib[2][i2];
711
if(GeneticCode[com.icode][ic] != -1)
712
CharaMap[j][nChara[j]++] = FROM64[ic];
716
printf("\ncodon %s is stop codon", CODONs[j]);
725
int IdenticalSeqs(void)
727
/* This checks for identical sequences and create a data set of unique
728
sequences. The file name is <SeqDataFile.unique. This is casually
729
written and need more testing.
730
The routine is called right after the sequence data are read.
731
For codon sequences, com.ls has the number of codons, which are NOT
734
char tmpf[96], keep[NS];
736
int is,js,h, same,nkept=com.ns;
737
int ls1=com.ls*(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);
739
puts("\nIdenticalSeqs: not tested\a");
740
for(is=0; is<com.ns; is++)
742
for(is=0; is<com.ns; is++) {
743
if(!keep[is]) continue;
744
for(js=0; js<is; js++) {
745
for(h=0,same=1; h<ls1; h++)
746
if(com.z[is][h] != com.z[js][h]) break;
748
printf("Seqs. %3d & %3d (%s & %s) are identical!\n",
749
js+1,is+1,com.spname[js],com.spname[is]);
754
for(is=0; is<com.ns; is++)
755
if(!keep[is]) nkept--;
757
strcpy(tmpf, com.seqf);
758
strcat(tmpf, ".unique");
759
if((ftmp=fopen(tmpf,"w"))==NULL) error2("IdenticalSeqs: file error");
760
printSeqs(ftmp, NULL, keep, 1);
762
printf("\nUnique sequences collected in %s.\n", tmpf);
768
void AllPatterns (FILE* fout)
770
/* This prints out an alignment containting all possible site patterns, and then exits.
771
This alignment may be useful to generate a dataset of infinitely long sequences,
772
summarized in the site pattern probabilities.
773
Because the PatternWeight() function changes the order of patterns, this routine
774
prints out the alignment as one of patterns, with lots of 1's below it, to avoid
775
baseml or codeml calling that routine to collaps sites.
776
You then replace those 1'with the calculated pattern probabilities for further
780
char codon[4]=" ", b[3];
781
int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);
782
int gap=(n31==3?3:10);
785
for(j=0,com.npatt=1; j<com.ns; j++) com.npatt*=com.ncode;
786
printf ("%3d species, %d site patterns\n", com.ns, com.npatt);
788
for(j=0; j<com.ns; j++) {
789
com.spname[j] = (char*)realloc(com.spname[j], 11*sizeof(char));
790
sprintf(com.spname[j], "%c ", 'a'+j);
792
for(j=0; j<com.ns; j++)
793
if((com.z[j]=(char*) malloc(com.npatt*sizeof(char))) == NULL)
794
error2("oom in AllPatterns");
795
for (h=0; h<com.npatt; h++) {
796
for (j=0,it=h; j<com.ns; j++) {
799
com.z[com.ns-1-j][h] = (char)ic;
804
fprintf(fout, " %6d %6d P\n", com.ns, com.ls*n31);
806
#if(defined CODEML || defined YN00)
807
printsmaCodon (fout, com.z, com.ns, com.ls, com.ls, 0);
811
printsma(fout,com.spname,com.z,com.ns, com.ls, com.ls, gap, com.seqtype, 1, 0, NULL);
813
for(h=0; h<com.npatt; h++) {
815
if((h+1)%40==0) FPN(fout);
822
int PatternWeight (void)
824
/* This collaps sites into patterns, for nucleotide, amino acid, or codon sequences.
825
This relies on \0 being the end of the string so that sequences should not be
826
encoded before this routine is called.
827
com.pose[i] has labels for genes as input and maps sites to patterns in return.
828
com.fpatt, a vector of doubles, wastes space as site pattern counts are integers.
829
Sequences z[ns*ls] are copied into patterns zt[ls*lpatt], and bsearch is used
830
twice to avoid excessive copying, to count npatt first & to generate fpatt etc.
832
int maxnpatt=com.ls, h, ip,l,u, j, k, same, ig, *poset;
833
int gap = (com.seqtype==CODONseq ? 3 : 10);
834
int n31 = (com.seqtype==CODONseq ? 3 : 1);
835
int lpatt=com.ns*n31+1; /* extra 0 used for easy debugging, can be voided */
836
int *p2s; /* point patterns to sites in zt */
837
char *zt, *p, timestr[36];
838
double nc = (com.seqtype == 1 ? 64 : com.ncode) + !com.cleandata+1;
843
Collect and sort patterns. Get com.npatt, com.lgene, com.posG.
844
Move sequences com.z[ns][ls] into sites zt[ls*lpatt].
845
Use p2s to map patterns to sites in zt to avoid copying.
847
if(noisy) printf("Counting site patterns.. %s\n", printtime(timestr));
849
if((com.seqtype==1 && com.ns<5) || (com.seqtype!=1 && com.ns<7))
850
maxnpatt = (int)(pow(nc, (double)com.ns) + 0.5) * com.ngene;
851
if(maxnpatt>com.ls) maxnpatt = com.ls;
852
p2s = (int*)malloc(maxnpatt*sizeof(int));
853
zt = (char*)malloc((com.ns+1)*com.ls*n31*sizeof(char));
854
if(p2s==NULL || zt==NULL) error2("oom p2s or zt");
855
memset(zt, 0, (com.ns+1)*com.ls*n31*sizeof(char));
856
for(j=0; j<com.ns; j++)
857
for(h=0; h<com.ls; h++)
859
zt[h*lpatt+j*n31+k] = com.z[j][h*n31+k];
861
for(j=0; j<com.ns; j++) free(com.z[j]);
863
for(ig=0; ig<com.ngene; ig++) com.lgene[ig] = 0;
864
for(ig=0,com.npatt=0; ig<com.ngene; ig++) {
865
com.posG[ig] = l = u = ip = com.npatt;
866
for(h=0; h<com.ls; h++) {
867
if(com.pose[h] != ig) continue;
868
if(debug) printf("\nh %3d %s", h, zt+h*lpatt);
870
/* bsearch in existing patterns. Knuth 1998 Vol3 Ed2 p.410
871
ip is the loc for match or insertion. [l,u] is the search interval.
874
if(com.lgene[ig]++ != 0) { /* not 1st pattern? */
875
for(l=com.posG[ig], u=com.npatt-1; ; ) {
878
k = strcmp(zt+h*lpatt, zt+p2s[ip]*lpatt);
880
else if(k>0) l = ip + 1;
881
else { same = 1; break; }
885
if(com.npatt>maxnpatt)
886
error2("npatt > maxnpatt");
887
if(l > ip) ip++; /* last comparison in bsearch had k > 0. */
888
/* Insert new pattern at ip. This is the expensive step. */
891
memmove(p2s+ip+1, p2s+ip, (com.npatt-ip)*sizeof(int));
894
for(j=com.npatt; j>ip; j--)
902
printf(": %3d (%c ilu %3d%3d%3d) ", com.npatt, DS[same], ip, l, u);
903
for(j=0; j<com.npatt; j++)
904
printf(" %s", zt+p2s[j]*lpatt);
906
if(noisy && ((h+1)%10000==0 || h+1==com.ls))
907
printf("\r%12d patterns at %8d / %8d sites (%.1f%%), %s",
908
com.npatt, h+1, com.ls, (h+1.)*100/com.ls, printtime(timestr));
914
/* (B) count pattern frequencies and collect pose[] */
915
com.posG[com.ngene] = com.npatt;
916
for(j=0; j<com.ngene; j++)
918
error2("some gene labels are missing");
919
for(j=1; j<com.ngene; j++)
920
com.lgene[j] += com.lgene[j-1];
922
com.fpatt = (double*)realloc(com.fpatt, com.npatt*sizeof(double));
923
poset = (int*)malloc(com.ls*sizeof(int));
924
if(com.fpatt==NULL || poset==NULL) error2("oom poset");
925
for(ip=0; ip<com.npatt; ip++) com.fpatt[ip] = 0;
927
for(ig=0; ig<com.ngene; ig++) {
928
for(h=0; h<com.ls; h++) {
929
if(com.pose[h] != ig) continue;
930
for(same=0, l=com.posG[ig], u=com.posG[ig+1]-1; ; ) {
933
k = strcmp(zt+h*lpatt, zt+p2s[ip]*lpatt);
935
else if(k>0) l = ip + 1;
936
else { same = 1; break; }
939
error2("ghost pattern?");
945
if(com.seqtype==CODONseq && com.ngene==3 &&com.lgene[0]==com.ls/3) {
946
puts("\nCheck option G in data file? (Enter)\n");
949
for(j=0; j<com.ns; j++) {
950
com.z[j] = (char*)malloc(com.npatt*n31*sizeof(char));
951
for(ip=0,p=com.z[j]; ip<com.npatt; ip++)
953
*p++ = zt[p2s[ip]*lpatt + j*n31 + k];
955
memcpy(com.pose, poset, com.ls*sizeof(int));
956
free(poset); free(p2s); free(zt);
962
void AddFreqSeqGene(int js,int ig,double pi0[],double pi[]);
965
void Chi2FreqHomo(double f[], int ns, int nc, double X2G[2])
967
/* This calculates a chi-square like statistic for testing that the base
968
or amino acid frequencies are identical among sequences.
969
f[ns*nc] where ns is #sequences (rows) and nc is #states (columns).
972
double mf[64]={0}, small=1e-50;
979
for(i=0; i<ns; i++) {
980
for(j=0; j<nc; j++) {
982
X2G[0] += square(f[i*nc+j]-mf[j])/mf[j];
984
X2G[1] += 2*f[i*nc+j]*log(f[i*nc+j]/mf[j]);
990
int InitializeBaseAA (FILE *fout)
992
/* Count site patterns (com.fpatt) and calculate base or amino acid frequencies
993
in genes and species. This works on raw (uncoded) data.
994
Ambiguity characters in sequences are resolved by iteration.
995
For frequencies in each species, they are resolved within that sequence.
996
For average base frequencies among species, they are resolved over all
999
This routine is called by baseml and aaml. codonml uses another
1000
routine InitializeCodon()
1002
char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs)), indel[]="-?";
1003
int wname=30, h,js,k, ig, nconstp, nc=com.ncode;
1005
double pi0[20], t,lmax=0, X2G[2], *pisg; /* freq for species & gene, for X2 & G */
1007
if(noisy) printf("Counting frequencies..");
1008
if(fout) fprintf(fout,"\nFrequencies..");
1009
if((pisg=(double*)malloc(com.ns*nc*sizeof(double))) == NULL)
1011
for(h=0,nconstp=0; h<com.npatt; h++) {
1012
for (js=1; js<com.ns; js++)
1013
if(com.z[js][h] != com.z[0][h]) break;
1014
if (js==com.ns && com.z[0][h]!=indel[0] && com.z[0][h]!=indel[1])
1015
nconstp += (int)com.fpatt[h];
1017
for (ig=0,zero(com.pi,nc); ig<com.ngene; ig++) {
1019
fprintf (fout,"\n\nGene %2d (len %4d)", ig+1, com.lgene[ig]-(ig==0?0:com.lgene[ig-1]));
1020
fprintf(fout,"\n%*s",wname, "");
1021
for(k=0; k<nc; k++) fprintf(fout,"%7c", pch[k]);
1023
/* The following block calculates freqs in each species for each gene.
1024
Ambiguities are resolved in each species. com.pi and com.piG are
1025
used for output only, and are not be used later with missing data.
1027
zero(com.piG[ig], nc);
1028
zero(pisg, com.ns*nc);
1029
for(js=0; js<com.ns; js++) {
1030
fillxc(pi0, 1.0/nc, nc);
1031
for(irf=0; irf<nrf; irf++) {
1033
AddFreqSeqGene(js, ig, pi0, com.pi);
1034
t = sum(com.pi, nc);
1036
printf("Some sequences are empty.\n");
1037
fillxc(com.pi, 1.0/nc, nc);
1040
abyx(1/t, com.pi, nc);
1041
if(com.cleandata || com.cleandata || (t=distance(com.pi,pi0,nc))<1e-8)
1043
xtoy(com.pi, pi0, nc);
1045
fprintf(fout,"\n%-*s", wname, com.spname[js]);
1046
for(k=0; k<nc; k++) fprintf(fout, "%7.4f", com.pi[k]);
1047
for(k=0; k<nc; k++) com.piG[ig][k] += com.pi[k]/com.ns;
1048
xtoy(com.pi, pisg+js*nc, nc);
1051
fprintf(fout,"\n\n%-*s",wname,"Mean");
1052
for(k=0; k<nc; k++) fprintf(fout,"%7.4f",com.piG[ig][k]);
1055
Chi2FreqHomo(pisg, com.ns, nc, X2G);
1057
fprintf(fout,"\n\nHomogeneity statistic: X2 = %.5f G = %.5f ",X2G[0], X2G[1]);
1059
/* fprintf(frst1,"\t%.5f", X2G[1]); */
1062
if(noisy) printf("\n");
1064
/* If there are missing data, the following block calculates freqs
1065
in each gene (com.piG[]), as well as com.pi[] for the entire sequence.
1066
Ambiguities are resolved over entire data sets across species (within
1067
each gene for com.piG[]). These are used in ML calculation later.
1070
for (ig=0,zero(com.pi,nc); ig<com.ngene; ig++) {
1071
t = (ig==0 ? com.lgene[0] : com.lgene[ig]-com.lgene[ig-1])/(double)com.ls;
1072
for(k=0; k<nc; k++) com.pi[k] += com.piG[ig][k]*t;
1076
for (ig=0; ig<com.ngene; ig++) {
1077
xtoy(com.piG[ig], pi0, nc);
1078
for(irf=0; irf<nrf; irf++) { /* com.piG[] */
1079
zero(com.piG[ig], nc);
1080
for(js=0; js<com.ns; js++)
1081
AddFreqSeqGene(js, ig, pi0, com.piG[ig]);
1082
t = sum(com.piG[ig], nc);
1084
puts("empty sequences?");
1085
abyx(1/t, com.piG[ig], nc);
1086
if(distance(com.piG[ig], pi0, nc)<1e-8) break;
1087
xtoy(com.piG[ig], pi0, nc);
1091
for(k=0; k<nc; k++) for(ig=0; ig<com.ngene; ig++)
1092
pi0[k] += com.piG[ig][k]/com.ngene;
1093
for(irf=0; irf<nrf; irf++) { /* com.pi[] */
1095
for(ig=0; ig<com.ngene; ig++) for(js=0; js<com.ns; js++)
1096
AddFreqSeqGene(js, ig, pi0, com.pi);
1097
abyx(1/sum(com.pi,nc), com.pi, nc);
1098
if(distance(com.pi, pi0, nc)<1e-8) break;
1099
xtoy(com.pi,pi0,nc);
1102
fprintf (fout, "\n\n%-*s", wname, "Average");
1103
for(k=0; k<nc; k++) fprintf(fout," %7.4f", com.pi[k]);
1104
if(!com.cleandata) fputs("\n(Ambiguity characters are used to calculate freqs.)\n",fout);
1106
fprintf (fout,"\n\n# constant sites: %6d (%.2f%%)",
1107
nconstp, (double)nconstp*100./com.ls);
1109
if (com.model==0 || (com.seqtype==BASEseq && com.model==1)) {
1110
fillxc(com.pi, 1./nc, nc);
1111
FOR(ig,com.ngene) xtoy (com.pi, com.piG[ig], nc);
1113
if (com.seqtype==BASEseq && com.model==5) { /* T92 model */
1114
com.pi[0]=com.pi[2]=(com.pi[0]+com.pi[2])/2;
1115
com.pi[1]=com.pi[3]=(com.pi[1]+com.pi[3])/2;
1116
for(ig=0; ig<com.ngene; ig++) {
1117
com.piG[ig][0] = com.piG[ig][2] = (com.piG[ig][0] + com.piG[ig][2])/2;
1118
com.piG[ig][1] = com.piG[ig][3] = (com.piG[ig][1] + com.piG[ig][3])/2;
1122
/* this is used only for REV & REVu in baseml and model==3 in aaml */
1123
if(com.seqtype==AAseq) {
1124
for (k=0,t=0; k<nc; k++) t+=(com.pi[k]>0);
1126
puts("\n\a\t\tAre these a.a. sequences?");
1128
if(com.cleandata && com.ngene==1) {
1129
for(h=0,lmax=-(double)com.ls*log((double)com.ls); h<com.npatt; h++)
1130
if(com.fpatt[h]>1) lmax+=com.fpatt[h]*log((double)com.fpatt[h]);
1133
if(lmax) fprintf(fout, "\nln Lmax (unconstrained) = %.6f\n", lmax);
1142
void AddFreqSeqGene(int js, int ig, double pi0[], double pi[])
1144
/* This adds the character counts in sequence js in gene ig to pi,
1145
using pi0, by resolving ambiguities. The data are coded. com.cleandata==1 or 0.
1146
This is for nucleotide and amino acid sequences only.
1148
char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs));
1149
int k, h, b, nc=com.ncode;
1153
for(h=com.posG[ig]; h<com.posG[ig+1]; h++)
1154
pi[com.z[js][h]] += com.fpatt[h];
1157
for(h=com.posG[ig]; h<com.posG[ig+1]; h++) {
1160
pi[b] += com.fpatt[h];
1163
if(com.seqtype==BASEseq) {
1164
NucListall(BASEs[b], &nb, ib);
1165
for(k=0,t=0; k<nb; k++) t += pi0[ib[k]];
1167
pi[ib[k]] += pi0[ib[k]]/t * com.fpatt[h];
1170
if(com.seqtype==BASEseq) {
1171
for(k=0,t=0; k<nChara[b]; k++)
1172
t += pi0[CharaMap[b][k]];
1173
for(k=0; k<nChara[b]; k++)
1174
pi[CharaMap[b][k]] += pi0[CharaMap[b][k]]/t * com.fpatt[h];
1176
else if(com.seqtype==AAseq) /* unrecognized AAs are treated as "?". */
1177
for(k=0; k<nc; k++) pi[k] += pi0[k]*com.fpatt[h];
1184
int RemoveIndel(void)
1186
/* Remove ambiguity characters and indels in the untranformed sequences,
1187
Changing com.ls and com.pose[] (site marks for multiple genes).
1188
For codonml, com.ls is still 3*#codons
1189
Called at the end of ReadSeq, when com.pose[] are still site marks.
1190
All characters in com.z[][] not found in the character string pch are
1191
considered ambiguity characters and are removed.
1193
int h,k, j,js,lnew,nindel, n31,nchar;
1194
char b, *pch, *miss; /* miss[h]=1 if site (codon) h is missing, 0 otherwise */
1196
if(com.seqtype==CODONseq||com.seqtype==CODON2AAseq)
1197
{ n31=3; nchar=4; pch=BASEs; }
1200
if(com.seqtype==AAseq) { nchar=20; pch=AAs; }
1201
else if(com.seqtype==BASEseq) { nchar=4; pch=BASEs; }
1202
else { nchar=2; pch=BINs; }
1205
if (com.ls%n31) error2("ls in RemoveIndel.");
1206
if((miss=(char*)malloc(com.ls/n31 *sizeof(char)))==NULL)
1208
FOR (h,com.ls/n31) miss[h]=0;
1209
for (js=0; js<com.ns; js++) {
1210
for (h=0,nindel=0; h<com.ls/n31; h++) {
1211
for (k=0; k<n31; k++) {
1212
b=(char)toupper(com.z[js][h*n31+k]);
1213
FOR(j,nchar) if(b==pch[j]) break;
1214
if(j==nchar) { miss[h]=1; nindel++; }
1217
if (noisy>2 && nindel)
1218
printf("\n%6d ambiguity characters in seq. %d", nindel,js+1);
1221
for(h=0,k=0; h<com.ls/n31; h++) if(miss[h]) k++;
1222
printf("\n%d sites are removed. ", k);
1224
for(h=0; h<com.ls/n31; h++) if(miss[h]) printf(" %2d", h+1);
1227
for (h=0,lnew=0; h<com.ls/n31; h++) {
1228
if(miss[h]) continue;
1229
for (js=0; js<com.ns; js++) {
1230
for (k=0; k<n31; k++)
1231
com.z[js][lnew*n31+k]=com.z[js][h*n31+k];
1233
com.pose[lnew]=com.pose[h];
1243
int MPInformSites (void)
1245
/* Outputs parsimony informative and noninformative sites into
1246
two files named MPinf.seq and MPninf.seq
1247
Uses transformed sequences.
1248
Not used for a long time. Does not work if com.pose is NULL.
1250
char *imark, *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs));
1251
int h, i, markb[NS], inf, lsinf;
1254
puts("\nMPInformSites: missing data not dealt with yet?\n");
1256
finf=fopen("MPinf.seq","w");
1257
fninf=fopen("MPninf.seq","w");
1258
if (finf==NULL || fninf==NULL) error2("MPInformSites: file creation error");
1260
puts ("\nSorting parsimony-informative sites: MPinf.seq & MPninf.seq");
1261
if ((imark=(char*)malloc(com.ls*sizeof(char)))==NULL) error2("oom imark");
1262
for (h=0,lsinf=0; h<com.ls; h++) {
1263
for (i=0; i<com.ns; i++) markb[i]=0;
1264
for (i=0; i<com.ns; i++) markb[(int)com.z[i][com.pose[h]]]++;
1266
for (i=0,inf=0; i<com.ncode; i++) if (markb[i]>=2) inf++;
1267
if (inf>=2) { imark[h]=1; lsinf++; }
1270
fprintf (finf, "%6d%6d\n", com.ns, lsinf);
1271
fprintf (fninf, "%6d%6d\n", com.ns, com.ls-lsinf);
1272
for (i=0; i<com.ns; i++) {
1273
fprintf (finf, "\n%s\n", com.spname[i]);
1274
fprintf (fninf, "\n%s\n", com.spname[i]);
1275
for (h=0; h<com.ls; h++)
1276
fprintf ((imark[h]?finf:fninf), "%c", pch[(int)com.z[i][com.pose[h]]]);
1277
FPN (finf); FPN(fninf);
1280
fclose(finf); fclose(fninf);
1285
int PatternWeightJC69like (FILE *fout)
1287
/* This collaps site patterns further for JC69-like models, called after
1288
PatternWeight(). This is used for JC and poisson amino acid models.
1289
The routine could be merged into PatternWeight(), which should lead to
1290
faster computation, but this is not done because right now
1291
InitializeBaseAA() prints out base or amino acid frequencies after
1292
PatternWeight() and before this routine.
1294
If the data have no ambiguities (com.cleanddata=1), the routine recodes
1295
the data, for example, changing data at a site 1120 (CCAT) into 0012
1296
(TTCA) before checking against old patterns already found. If the data
1297
contain ambiguities, they are not encoded. In that case, for every
1298
site, the routine changes ? or N into - first. It then checks whether there
1299
are any other ambibiguities and will recode if and only if there are not
1300
any other ambiguities. For example, a site with data CC?T will be
1301
changed into CC-T first and then recoded into TT-C and checked against
1302
old patterns found. A site with data CCRT will not be recoded. In theory
1303
such sites may be packed as well, but perhaps the effort is not worthwhile.
1304
The routine checks data like CCRT against old patterns already found,
1306
If com.pose is not NULL, the routine also updates com.pose. This allows
1307
the program to work if com.readpattern==1.
1309
char zh[NS], b, gap, *pch=(com.seqtype==0 ? BASEs : AAs);
1310
int npatt0=com.npatt, h, ht, j,k, same=0, ig, recode;
1313
error2("PatternWeightJC69like does not work for codon seqs");
1314
if(noisy) printf("Counting site patterns again, for JC69.\n");
1315
gap = strchr(pch, (int)'-') - pch;
1316
for (h=0,com.npatt=0,ig=-1; h<npatt0; h++) {
1317
if (ig<com.ngene-1 && h==com.posG[ig+1])
1318
com.posG[++ig] = com.npatt;
1320
if(com.cleandata) { /* clean data, always recode */
1323
for (j=1; j<com.ns; j++) {
1325
if (com.z[j][h]==com.z[k][h]) break;
1326
zh[j] = (k<j ? zh[k] : b++);
1329
else { /* recode only if there are no non-gap ambiguity characters */
1330
for(j=0; j<com.ns; j++)
1331
zh[j] = com.z[j][h];
1333
/* After this loop, recode = 0 or 1 decides whether to recode. */
1334
for (j=0,recode=1; j<com.ns; j++) {
1335
if (zh[j] < com.ncode)
1337
if (nChara[zh[j]] == com.ncode) {
1348
for (j=1; j<com.ns; j++) {
1351
if (zh[j] == com.z[k][h]) break;
1352
if(k<j) zh[j] = zh[k];
1359
for (ht=com.posG[ig],same=0; ht<com.npatt; ht++) {
1360
for (j=0,same=1; j<com.ns; j++)
1361
if (zh[j]!=com.z[j][ht]) {
1367
com.fpatt[ht] += com.fpatt[h];
1369
for(j=0; j<com.ns; j++) com.z[j][com.npatt] = zh[j];
1370
com.fpatt[com.npatt++] = com.fpatt[h];
1373
for(k=0; k<com.ls; k++)
1374
if(com.pose[k]==h) com.pose[k] = ht;
1376
com.posG[com.ngene] = com.npatt;
1377
if (noisy) printf ("\nnew no. site patterns:%7d\n", com.npatt);
1380
fprintf(fout, "\n\nPrinting out site pattern counts\n");
1381
printPatterns(fout);
1386
int Site2Pattern (FILE *fout)
1389
fprintf(fout,"\n\nMapping site to pattern (i.e. site %d has pattern %d):\n",
1390
com.ls-1, com.pose[com.ls-2]+1);
1392
fprintf (fout, "%6d", com.pose[h]+1);
1393
if ((h+1)%10==0) FPN (fout);
1404
int print1seq (FILE*fout, char *z, int ls, int pose[])
1406
/* This prints out one sequence, and the sequences are encoded.
1407
z[] contains patterns if (pose!=NULL)
1408
This uses com.seqtype.
1411
char *pch = (com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs)), str[4]="";
1412
int nb = (com.seqtype==CODONseq?3:1);
1414
for(h=0; h<ls; h++) {
1415
hp = (pose ? pose[h] : h);
1416
if(com.seqtype != CODONseq) {
1417
fprintf(fout, "%c", pch[z[hp]]);
1418
if((h+1)%gap==0) fputc(' ', fout);
1421
fprintf(fout, "%s ", CODONs[z[hp]]);
1426
void printSeqs (FILE *fout, int *pose, char keep[], int format)
1428
/* Print sequences into fout, using paml (format=0 or 1) or paup (format=2)
1430
Use pose=NULL if called before site patterns are collapsed.
1431
keep[] marks the sequences to be printed. Use NULL for keep if all sequences
1433
Sequences may (com.cleandata==1) and may not (com.cleandata=0) be coded.
1434
com.z[] has site patterns if pose!=NULL.
1435
This uses com.seqtype, and com.ls is the number of codons for codon seqs.
1436
See notes in print1seq()
1438
format = 0,1: PAML sites or patterns
1439
2: PAUP Nexus format.
1441
This is used by evolver. Check and merge with printsma().
1444
int h, j, ls1, n31=(com.seqtype==1?3:1), nskept=com.ns, wname=30;
1445
char *dt=(com.seqtype==AAseq?"protein":"dna");
1447
ls1 = (format==1 ? com.npatt : com.ls);
1449
for(j=0; j<com.ns; j++) nskept -= !keep[j];
1450
if(format==0 || format==1)
1451
fprintf(fout, "\n\n%6d %7d %s\n\n", nskept, ls1*n31, (format==1?" P":""));
1452
else if(format==2) { /* NEXUS format */
1453
fprintf(fout,"\nbegin data;\n");
1454
fprintf(fout," dimensions ntax=%d nchar=%d;\n", nskept, ls1*n31);
1455
fprintf(fout," format datatype=%s missing=? gap=-;\n matrix\n",dt);
1458
for(j=0; j<com.ns; j++,FPN(fout)) {
1459
if(keep && !keep[j]) continue;
1460
fprintf(fout,"%s%-*s ", (format==2?" ":""), wname, com.spname[j]);
1461
print1seq(fout, com.z[j], (format==1?com.npatt:com.ls), pose);
1463
if(format==2) fprintf(fout, " ;\nend;");
1464
else if (format==1) {
1465
for(h=0,FPN(fout); h<com.npatt; h++) {
1466
/* fprintf(fout," %12.8f", com.fpatt[h]/(double)com.ls); */
1467
fprintf(fout," %4.0f", com.fpatt[h]);
1468
if((h+1)%15==0) FPN(fout);
1472
fprintf(fout,"\n\n");
1476
#define gammap(x,alpha) (alpha*(1-pow(x,-1.0/alpha)))
1477
/* DistanceREV () used to be here, moved to pamp.
1480
#if (defined BASEML || defined BASEMLG || defined MCMCTREE || defined PROBTREE || defined YULETREE)
1482
double SeqDivergence (double x[], int model, double alpha, double *kappa)
1484
/* alpha=0 if no gamma
1485
return -1 if in error.
1486
Check DistanceF84() if variances are wanted.
1489
double p[4], Y,R, a1,a2,b, P1,P2,Q,fd,tc,ag, GC;
1490
double small=1e-10/com.ls,largek=999, larged=9;
1493
matout(F0, x, 4, 4);
1494
printf("\nfrequency matrix error, setting distance to large d");
1497
for (i=0,fd=1,zero(p,4); i<4; i++) {
1499
FOR (j,4) { p[i]+=x[i*4+j]/2; p[j]+=x[i*4+j]/2; }
1501
P1=x[0*4+1]+x[1*4+0];
1502
P2=x[2*4+3]+x[3*4+2];
1503
Q = x[0*4+2]+x[0*4+3]+x[1*4+2]+x[1*4+3]+ x[2*4+0]+x[2*4+1]+x[3*4+0]+x[3*4+1];
1509
Y=p[0]+p[1]; R=p[2]+p[3]; tc=p[0]*p[1]; ag=p[2]*p[3];
1515
for (i=0,b=0; i<4; i++) b += p[i]*(1-p[i]);
1516
if (1-fd/b<=0) return (larged);
1518
if (alpha<=0) return (-b*log (1-fd/b));
1519
else return (-b*gammap(1-fd/b,alpha));
1522
printf("\nP Q = %.6f %.6f\n", P1+P2,Q);
1523
printf("\nP1 P2 Q = %.6f %.6f %.6f\n", P1,P2,Q);
1525
a1=1-2*(P1+P2)-Q; b=1-2*Q;
1526
/* if (a1<=0 || b<=0) return (-1); */
1527
if (a1<=0 || b<=0) return (larged);
1528
if (alpha<=0) { a1=-log(a1); b=-log(b); }
1529
else { a1=-gammap(a1,alpha); b=-gammap(b,alpha); }
1530
a1=.5*a1-.25*b; b=.25*b;
1531
if(b>small) *kappa = a1/b; else *kappa=largek;
1534
if(Y<small || R<small)
1535
error2("Y or R = 0.");
1537
a1=(2*(tc+ag)+2*(tc*R/Y+ag*Y/R)*(1-Q/(2*Y*R)) -P1-P2) / (2*tc/Y+2*ag/R);
1539
/* if (a1<=0 || b<=0) return (-1); */
1540
if (a1<=0 || b<=0) return (larged);
1541
if (alpha<=0) { a1=-log(a1); b=-log(b); }
1542
else { a1=-gammap(a1,alpha); b=-gammap(b,alpha); }
1545
*kappa = max2(*kappa, -.5);
1546
return 4*b*(tc*(1+ *kappa/Y)+ag*(1+ *kappa/R)+Y*R);
1547
case (HKY85): /* HKY85, from Rzhetsky & Nei (1995 MBE 12, 131-51) */
1548
if(Y<small || R<small)
1549
error2("Y or R = 0.");
1552
a1=1-Y*P1/(2*tc)-Q/(2*Y);
1553
a2=1-R*P2/(2*ag)-Q/(2*R);
1555
if (a1<=0 || a2<=0 || b<=0) return (larged);
1556
if (alpha<=0) { a1=-log(a1); a2=-log(a2); b=-log(b); }
1557
else { a1=-gammap(a1,alpha); a2=-gammap(a2,alpha); b=-gammap(b,alpha);}
1560
if (b>0) *kappa = min2((a1+a2)/(2*b), largek);
1561
return 2*(p[0]*p[1] + p[2]*p[3])*(a1+a2)/2 + 2*Y*R*b;
1565
a1 = 1 - Q - (P1+P2)/(2*GC*(1-GC)); b=1-2*Q;
1566
if (a1<=0 || b<=0) return (larged);
1567
if (alpha<=0) { a1=-log(a1); b=-log(b); }
1568
else { a1=-gammap(a1,alpha); b=-gammap(b,alpha);}
1569
if(Q>0) *kappa = 2*a1/b-1;
1570
return 2*GC*(1-GC)*a1 + (1-2*GC*(1-GC))/2*b;
1571
case (TN93): /* TN93 */
1572
if(Y<small || R<small)
1573
error2("Y or R = 0.");
1574
a1=1-Y*P1/(2*tc)-Q/(2*Y);
1575
a2=1-R*P2/(2*ag)-Q/(2*R);
1577
/* if (a1<=0 || a2<=0 || b<=0) return (-1); */
1578
if (a1<=0 || a2<=0 || b<=0) return (larged);
1579
if (alpha<=0) { a1=-log(a1); a2=-log(a2); b=-log(b); }
1580
else { a1=-gammap(a1,alpha); a2=-gammap(a2,alpha); b=-gammap(b,alpha);}
1581
a1=.5/Y*(a1-R*b); a2=.5/R*(a2-Y*b); b=.5*b;
1584
printf("\nk1&k2 = %.6f %.6f\n", a1/b,a2/b);
1586
if (b>0) *kappa = min2((a1+a2)/(2*b), largek);
1587
return 4*p[0]*p[1]*a1 + 4*p[2]*p[3]*a2 + 4*Y*R*b;
1593
double DistanceIJ (int is, int js, int model, double alpha, double *kappa)
1595
/* Distance between sequences is and js.
1596
See DistanceMatNuc() for more details.
1599
int h, n=4, missing=0;
1600
double x[16], sumx, larged=9;
1604
for (h=0; h<com.npatt; h++)
1605
x[com.z[is][h]*n+com.z[js][h]] += com.fpatt[h];
1608
for (h=0; h<com.npatt; h++) {
1612
x[b0*n+b1] += com.fpatt[h];
1619
if(sumx<=0) return(larged); /* questionable??? */
1620
abyx(1./sum(x,16),x,16);
1621
return SeqDivergence(x, model, alpha, kappa);
1625
#if (defined LSDISTANCE && defined REALSEQUENCE)
1627
extern double *SeqDistance;
1629
int DistanceMatNuc (FILE *fout, FILE*f2base, int model, double alpha)
1631
/* This calculates pairwise distances. The data may be clean and coded
1632
(com.cleandata==1) or not. In the latter case, ambiguity sites are not
1633
used (pairwise deletion). Site patterns are used.
1635
int is,js, status=0;
1636
double kappat=0, t,bigD=9;
1638
if(f2base) fprintf(f2base,"%6d\n", com.ns);
1639
if(model>=REV) model=TN93; /* TN93 here */
1641
fprintf(fout,"\nDistances:%5s", models[model]);
1642
if (model!=JC69 && model!=F81) fprintf (fout, " (kappa) ");
1643
fprintf(fout," (alpha set at %.2f)\n", alpha);
1644
fprintf(fout,"This matrix is not used in later m.l. analysis.\n");
1645
if(!com.cleandata) fprintf(fout, "\n(Pairwise deletion.)");
1647
for(is=0; is<com.ns; is++) {
1648
if(fout) fprintf(fout,"\n%-15s ", com.spname[is]);
1649
if(f2base) fprintf(f2base,"%-15s ", com.spname[is]);
1650
for(js=0; js<is; js++) {
1651
t = DistanceIJ(is, js, model, alpha, &kappat);
1652
if(t<0) { t=bigD; status=-1; }
1653
SeqDistance[is*(is-1)/2+js] = t;
1654
if(f2base) fprintf(f2base," %7.4f", t);
1655
if(fout) fprintf(fout,"%8.4f", t);
1656
if(fout && (model==K80 || model>=F84))
1657
fprintf(fout,"(%7.4f)", kappat);
1659
if(f2base) FPN(f2base);
1662
if(status) puts("\ndistance formula sometimes inapplicable..");
1672
extern int CijkIs0[];
1676
extern double Cijk[], Root[];
1678
int RootTN93 (int model, double kappa1, double kappa2, double pi[],
1679
double *scalefactor, double Root[])
1681
double T=pi[0],C=pi[1],A=pi[2],G=pi[3],Y=T+C,R=A+G;
1683
if (model==F84) { kappa2=1+kappa1/R; kappa1=1+kappa1/Y; }
1685
*scalefactor = 1/(2*T*C*kappa1+2*A*G*kappa2 + 2*Y*R);
1688
Root[1] = - (*scalefactor);
1689
Root[2] = -(Y+R*kappa2) * (*scalefactor);
1690
Root[3] = -(Y*kappa1+R) * (*scalefactor);
1695
int EigenTN93 (int model, double kappa1, double kappa2, double pi[],
1696
int *nR, double Root[], double Cijk[])
1698
/* initialize Cijk[] & Root[], which are the only part to be changed
1699
for a new substitution model
1700
for JC69, K80, F81, F84, HKY85, TN93
1701
Root: real Root divided by v, the number of nucleotide substitutions.
1704
double scalefactor, U[16],V[16], t;
1705
double T=pi[0],C=pi[1],A=pi[2],G=pi[3],Y=T+C,R=A+G;
1707
if (model==JC69 || model==F81) kappa1=kappa2=com.kappa=1;
1708
else if (com.model<TN93) kappa2=kappa1;
1709
RootTN93(model, kappa1, kappa2, pi, &scalefactor, Root);
1711
*nR=nr = 2+(model==K80||model>=F84)+(model>=HKY85);
1712
U[0*4+0]=U[1*4+0]=U[2*4+0]=U[3*4+0]=1;
1713
U[0*4+1]=U[1*4+1]=1/Y; U[2*4+1]=U[3*4+1]=-1/R;
1714
U[0*4+2]=U[1*4+2]=0; U[2*4+2]=G/R; U[3*4+2]=-A/R;
1715
U[2*4+3]=U[3*4+3]=0; U[0*4+3]=C/Y; U[1*4+3]=-T/Y;
1718
V[1*4+0]=R*T; V[1*4+1]=R*C;
1719
V[1*4+2]=-Y*A; V[1*4+3]=-Y*G;
1720
V[2*4+0]=V[2*4+1]=0; V[2*4+2]=1; V[2*4+3]=-1;
1721
V[3*4+0]=1; V[3*4+1]=-1; V[3*4+2]=V[3*4+3]=0;
1723
FOR (i,4) FOR (j,4) {
1724
Cijk[i*4*nr+j*nr+0]=U[i*4+0]*V[0*4+j];
1728
for (k=1,t=0; k<4; k++) t += U[i*4+k]*V[k*4+j];
1729
Cijk[i*4*nr+j*nr+1] = t;
1733
Cijk[i*4*nr+j*nr+1]=U[i*4+1]*V[1*4+j];
1734
for (k=2,t=0; k<4; k++) t += U[i*4+k]*V[k*4+j];
1735
Cijk[i*4*nr+j*nr+2]=t;
1737
case HKY85: case T92: case TN93:
1738
for (k=1; k<4; k++) Cijk[i*4*nr+j*nr+k] = U[i*4+k]*V[k*4+j];
1741
error2("model in EigenTN93");
1745
FOR (i,64) CijkIs0[i] = (Cijk[i]==0);
1755
#if (defined(CODEML) || defined(YN00))
1757
int printfcode (FILE *fout, double fb61[], double space[])
1761
int i, n=Nsensecodon;
1763
fprintf (fout, "\nCodon freq., x 10000\n");
1765
for(i=0; i<n; i++) space[FROM61[i]] = fb61[i]*10000;
1766
printcu(fout, space, com.icode);
1771
int printsmaCodon (FILE *fout,char * z[],int ns,int ls,int lline,int simple)
1773
/* print, in blocks, multiple aligned and transformed codon sequences.
1775
This is needed as codons are coded 0,1, 2, ..., 60, and
1776
printsma won't work.
1778
int ig, ngroup, lt, il,is, i,b, lspname=20;
1779
char equal='.',*pz, c0[4],c[4];
1781
if(ls==0) return(1);
1782
ngroup = (ls-1)/lline + 1;
1783
for (ig=0,FPN(fout); ig<ngroup; ig++) {
1784
/* fprintf (fout,"%-8d\n", ig*lline+1); */
1785
for (is=0; is<ns; is++) {
1786
fprintf(fout,"%-*s ", lspname,com.spname[is]);
1788
for(il=ig*lline,pz=z[is]+il; lt<lline && il<ls; il++,lt++,pz++) {
1791
c[0] = (char)(b/16);
1792
c[1] = (char)((b%16)/4);
1796
c[i] = BASEs[(int)c[i]];
1800
c0[0]=(char)(b/16); c0[1]=(char)((b%16)/4); c0[2]=(char)(b%4);
1802
if (c[i]==BASEs[(int)c0[i]]) c[i]=equal;
1804
fprintf(fout,"%3s ", c);
1813
int setmark_61_64 (void)
1815
/* This sets two matrices FROM61[], and FROM64[], which translate between two
1816
codings of codons. In one coding, codons go from 0, 1, ..., 63 while in
1817
the other codons range from 0, 1, ..., 61 with the three stop codons removed.
1818
FROM61[] translates from the 61-state coding to the 64-state coding, while
1819
FROM64[] translates from the 64-state coding to the 61-state coding.
1821
This routine also sets up FourFold[4][4], which defines the 4-fold codon
1824
int i,j,k, *code=GeneticCode[com.icode];
1825
int c[3],aa0,aa1, by[3]={16,4,1};
1826
double nSilent, nStop, nRepl;
1829
for (i=0; i<64; i++) {
1830
if (code[i]==-1) FROM64[i]=-1;
1831
else { FROM61[Nsensecodon]=i; FROM64[i]=Nsensecodon++; }
1833
com.ncode=Nsensecodon;
1835
for(i=0; i<4; i++) for(j=0; j<4; j++) {
1837
FourFold[i][j] = (code[k]==code[k+1] && code[k]==code[k+2] && code[k]==code[k+3]);
1840
for (i=0,nSilent=nStop=nRepl=0; i<64; i++) {
1841
c[0]=i/16; c[1]=(i/4)%4; c[2]=i%4;
1842
if((aa0=code[i])==-1) continue;
1843
for(j=0; j<3; j++) for(k=0; k<3; k++) {
1844
aa1 = code[i + ((c[j]+k+1)%4 - c[j])*by[j]];
1845
if(aa1==-1) nStop++;
1846
else if(aa0==aa1) nSilent++;
1851
printf("\ncode Stop Silent Replace\n");
1852
printf("%3d (%d) %6.0f%6.0f%6.0f %12.6f%12.6f\n",
1853
com.icode, 64-com.ncode, nStop,nSilent,nRepl,nStop*3/(com.ncode*9),nSilent*3/(com.ncode*9));
1858
int DistanceMatNG86 (FILE *fout, FILE*fds, FILE*fdn, FILE*ft, double alpha)
1860
/* Estimation of dS and dN by the method of Nei & Gojobori (1986)
1861
This works with both coded (com.cleandata==1) and uncoded data.
1862
In the latter case (com.cleandata==0), the method does pairwise delection.
1864
alpha for gamma rates is used for dN only.
1867
int is,js, i,k,h, wname=20, status=0, ndiff,nsd[4];
1868
int nb[3],ib[3][4], missing;
1869
double ns,na, nst,nat, S,N, St,Nt, dS,dN,dN_dS,y, bigD=3, lst;
1870
double SEds, SEdn, p;
1873
fputs("\n\n\nNei & Gojobori 1986. dN/dS (dN, dS)",fout);
1874
if(com.cleandata==0) fputs("\n(Pairwise deletion)",fout);
1875
fputs("\n(Note: This matrix is not used in later ML. analysis.\n",fout);
1876
fputs("Use runmode = -2 for ML pairwise comparison.)\n",fout);
1880
fprintf(fds,"%6d\n",com.ns);
1881
fprintf(fdn,"%6d\n",com.ns);
1882
fprintf(ft,"%6d\n",com.ns);
1884
if(noisy>1 && com.ns>10) puts("NG distances for seqs.:");
1885
for(is=0; is<com.ns; is++) {
1887
fprintf(fout,"\n%-*s", wname,com.spname[is]);
1889
fprintf(fds, "%-*s ",wname,com.spname[is]);
1890
fprintf(fdn, "%-*s ",wname,com.spname[is]);
1891
fprintf(ft, "%-*s ",wname,com.spname[is]);
1893
for(js=0; js<is; js++) {
1894
for(k=0; k<4; k++) nsd[k] = 0;
1895
for (h=0,lst=0,nst=nat=S=N=0; h<com.npatt; h++) {
1896
if(com.z[is][h]>=com.ncode || com.z[js][h]>=com.ncode)
1898
codon[0] = CODONs[com.z[is][h]];
1899
codon[1] = CODONs[com.z[js][h]];
1900
lst += com.fpatt[h];
1901
ndiff = difcodonNG(codon[0], codon[1], &St, &Nt, &ns, &na, 0, com.icode);
1902
nsd[ndiff] += (int)com.fpatt[h];
1903
S += St*com.fpatt[h];
1904
N += Nt*com.fpatt[h];
1905
nst += ns*com.fpatt[h];
1906
nat += na*com.fpatt[h];
1910
else { /* rescale for stop codons */
1916
printf("\n%3d %3d:Sites %7.1f +%7.1f =%7.1f\tDiffs %7.1f +%7.1f =%7.1f",
1917
is+1,js+1,S,N,S+N,nst,nat, nst+nat);
1919
dS = (S<=0 ? 0 : 1-4./3*nst/S);
1920
dN = (N<=0 ? 0 : 1-4./3*nat/N);
1921
if(noisy>=9 && (dS<=0 || dN<=0))
1922
{ puts("\nNG86 unusable."); status=-1;}
1924
else dS = (dS<=0 ? -1 : 3./4*(-log(dS)));
1926
else dN = (dN<=0 ? -1 : 3./4*(alpha==0?-log(dN):alpha*(pow(dN,-1/alpha)-1)));
1928
dN_dS = (dS>0 ? dN/dS : -1);
1929
if(fout) fprintf(fout,"%7.4f (%5.4f %5.4f)", dN_dS, dN, dS);
1931
if(N>0 && dN<0) dN = bigD;
1932
if(S>0&&dS<0) dS = bigD;
1935
SeqDistance[is*(is-1)/2+js] = (S<=0||N<=0 ? 0 : (S*dS+N*dN)*3/(S+N));
1939
fprintf(fds," %7.4f", dS);
1940
fprintf(fdn," %7.4f", dN);
1941
fprintf(ft," %7.4f", (S*dS+N*dN)*3/(S+N));
1943
if(alpha==0 && dS<bigD) { p=nst/S; SEds=sqrt(9*p*(1-p)/(square(3-4*p)*S)); }
1944
if(alpha==0 && dN<bigD) { p=nat/N; SEdn=sqrt(9*p*(1-p)/(square(3-4*p)*N)); }
1947
FPN(fds); FPN(fdn); FPN(ft);
1949
if(noisy>1 && com.ns>10) printf(" %3d", is+1);
1953
if(status) fprintf (fout, "NOTE: -1 means that NG86 is inapplicable.\n");
1964
int EigenQREVbase (FILE* fout, double kappa[],
1965
double pi[], int *nR, double Root[], double Cijk[])
1969
int i,j,k, nr=(com.ngene>1&&com.Mgene>=3?com.nrate/com.ngene:com.nrate);
1970
double Q[16], U[16], V[16], mr, space_pisqrt[4];
1975
if(com.model==REV) {
1976
for(i=0,k=0,Q[3*4+2]=Q[2*4+3]=1; i<3; i++) for (j=i+1; j<4; j++)
1977
if(i*4+j!=11) Q[i*4+j]=Q[j*4+i]=kappa[k++];
1979
else /* (model==REVu) */
1980
FOR(i,3) for(j=i+1; j<4; j++)
1981
Q[i*4+j]=Q[j*4+i] = (StepMatrix[i*4+j] ? kappa[StepMatrix[i*4+j]-1] : 1);
1983
FOR(i,4) FOR(j,4) Q[i*4+j] *= pi[j];
1985
for (i=0,mr=0; i<4; i++)
1986
{ Q[i*4+i]=0; Q[i*4+i]=-sum(Q+i*4, 4); mr-=pi[i]*Q[i*4+i]; }
1990
mr=2*(pi[0]*Q[0*4+1]+pi[2]*Q[2*4+3]);
1992
fprintf(fout, "\nRate parameters: ");
1994
fprintf(fout, " %8.5f", kappa[j]);
1995
fprintf(fout, "\nBase frequencies: ");
1997
fprintf(fout," %8.5f", pi[j]);
1999
fprintf (fout, "\nRate matrix Q, Average Ts/Tv =%9.4f", mr/(1-mr));
2000
matout (fout, Q, 4,4);
2003
eigenQREV(Q, pi, 4, Root, U, V, space_pisqrt);
2004
FOR (i,4) FOR(j,4) FOR(k,4) Cijk[i*4*4+j*4+k] = U[i*4+k]*V[k*4+j];
2010
int QUNREST (FILE *fout, double Q[], double rate[], double pi[])
2012
/* This constructs the rate matrix Q for the unrestricted model.
2013
pi[] is changed in the routine.
2016
double mr, space[20];
2018
if(com.model==UNREST) {
2019
for (i=0,k=0,Q[14]=1; i<4; i++) FOR(j,4)
2020
if (i!=j && i*4+j != 14) Q[i*4+j]=rate[k++];
2022
else /* (model==UNRESTu) */
2025
Q[i*4+j] = (StepMatrix[i*4+j] ? rate[StepMatrix[i*4+j]-1] : 1);
2027
FOR(i,4) { Q[i*4+i]=0; Q[i*4+i]=-sum(Q+i*4, 4); }
2031
QtoPi (Q, com.pi, 4, space);
2033
for (i=0,mr=0; i<4; i++) mr -= pi[i]*Q[i*4+i];
2034
for (i=0; i<4*4; i++) Q[i]/=mr;
2037
mr=pi[0]*Q[0*4+1]+pi[1]*Q[1*4+0]+pi[2]*Q[2*4+3]+pi[3]*Q[3*4+2];
2039
fprintf(fout, "Rate parameters: ");
2040
FOR(j,com.nrate) fprintf(fout, " %8.5f", rate[j]);
2041
fprintf(fout, "\nBase frequencies: ");
2042
FOR(j,4) fprintf(fout," %8.5f", pi[j]);
2043
fprintf (fout,"\nrate matrix Q, Average Ts/Tv (similar to kappa/2) =%9.4f",mr/(1-mr));
2044
matout (fout, Q, 4, 4);
2054
double *SeqDistance=NULL;
2059
/* This finds the most recent common ancestor of species is and js.
2061
int is, js, it, a1, a2;
2063
for(is=0; is<com.ns; is++) for(js=0; js<is; js++) {
2064
it = is*(is-1)/2+js;
2066
for (a1=is; a1!=-1; a1=nodes[a1].father) {
2067
for (a2=js; a2!=-1; a2=nodes[a2].father)
2068
if (a1==a2) { ancestor[it] = a1; break; }
2069
if (ancestor[it] != -1) break;
2071
if (ancestor[it] == -1) error2("no ancestor");
2076
int fun_LS (double x[], double diff[], int np, int npair);
2078
int fun_LS (double x[], double diff[], int np, int npair)
2080
int i,j, aa, it=-np;
2083
if (SetBranch(x) && noisy>2) puts ("branch len.");
2084
if (npair != com.ns*(com.ns-1)/2) error2("# seq pairs err.");
2085
for(i=0; i<com.ns; i++) for(j=0; j<i; j++) {
2087
for (aa=i,dexp=0; aa!=ancestor[it]; aa=nodes[aa].father)
2088
dexp += nodes[aa].branch;
2089
for (aa=j; aa!=ancestor[it]; aa=nodes[aa].father)
2090
dexp += nodes[aa].branch;
2091
diff[it] = SeqDistance[it] - dexp;
2093
if(fabs(diff[it])>1000) {
2094
printf("\ndistances very different: diff = %12.6f ", diff[it]);
2101
int LSDistance (double *ss,double x[],int (*testx)(double x[],int np))
2103
/* get Least Squares estimates of branch lengths for a given tree topology
2104
This uses nls2, a general least squares algorithm for nonlinear programming
2105
to estimate branch lengths, and it thus inefficient.
2109
if ((*testx)(x, com.ntime)) {
2110
matout (F0, x, 1, com.ntime);
2111
puts ("initial err in LSDistance()");
2114
i = nls2((com.ntime>20&&noisy>=3?F0:NULL),
2115
ss,x,com.ntime,fun_LS,NULL,testx,com.ns*(com.ns-1)/2,1e-6);
2120
double PairDistanceML(int is, int js)
2122
/* This calculates the ML distance between is and js, the sum of ML branch
2123
lengths along the path between is and js.
2124
LSdistance() has to be called once to set ancestor before calling this
2130
if(is==js) return(0);
2131
if(is<js) { it=is; is=js; js=it; }
2134
for (a=is; a!=ancestor[it]; a=nodes[a].father)
2135
dij += nodes[a].branch;
2136
for (a=js; a!=ancestor[it]; a=nodes[a].father)
2137
dij += nodes[a].branch;
2142
int GroupDistances()
2144
/* This calculates average group distances by summing over the ML
2146
int newancestor=0, i,j, ig,jg;
2147
/* int ngroup=2, Ningroup[10], group[200]={1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2148
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2149
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2150
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2151
1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2152
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2153
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2154
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2155
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; */ /* dloop for HC200.paup */
2156
int ngroup=10, Ningroup[10], group[115]={
2157
10, 9, 9, 9, 9, 9, 9, 9, 9, 10,
2158
9, 9, 3, 3, 1, 1, 1, 1, 1, 1,
2159
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2160
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2161
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2162
1, 2, 2, 2, 2, 2, 2, 4, 4, 4,
2163
4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
2164
4, 4, 4, 4, 4, 4, 5, 5, 5, 5,
2165
5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
2166
5, 5, 5, 5, 6, 6, 6, 6, 6, 6,
2167
6, 7, 7, 7, 7, 7, 7, 7, 7, 7,
2168
8, 8, 8, 8, 8}; /* dloop data for Anne Yoder, ns=115 */
2169
double dgroup, npairused;
2171
/* ngroup=2; for(j=0;j<com.ns; j++) group[j]=1+(group[j]>2); */
2173
for(j=0;j<ngroup;j++) Ningroup[j]=0;
2174
for(j=0;j<com.ns; j++) Ningroup[group[j]-1]++;
2175
printf("\n# sequences in group:");
2176
matIout(F0,Ningroup,1,ngroup);
2177
if(ancestor==NULL) {
2179
ancestor=(int*)realloc(ancestor, com.ns*(com.ns-1)/2*sizeof(int));
2180
if(ancestor==NULL) error2("oom ancestor");
2184
for(ig=0; ig<ngroup; ig++) {
2185
printf("\ngroup %2d",ig+1);
2186
for(jg=0; jg<ig+1; jg++) {
2187
dgroup=0; npairused=0;
2188
for(i=0;i<com.ns;i++) for(j=0;j<com.ns;j++) {
2189
if(i!=j && group[i]==ig+1 && group[j]==jg+1) {
2190
dgroup += PairDistanceML(i, j);
2195
printf("%9.4f", dgroup);
2197
/* printf("%6.1f", dgroup/0.2604*5); */ /* 0.2604, 0.5611 */
2200
if(newancestor==1) free(ancestor);
2206
#ifdef NODESTRUCTURE
2208
void BranchToNode (void)
2210
/* tree.root need to be specified before calling this
2214
tree.nnode=tree.nbranch+1;
2215
for(i=0; i<tree.nnode; i++)
2216
{ nodes[i].father=nodes[i].ibranch=-1; nodes[i].nson=0; }
2217
for (i=0; i<tree.nbranch; i++) {
2218
from=tree.branches[i][0];
2219
to =tree.branches[i][1];
2220
nodes[from].sons[nodes[from].nson++]=to;
2221
nodes[to].father=from;
2222
nodes[to].ibranch=i;
2224
/* nodes[tree.root].branch=0; this breaks method=1 */
2227
void NodeToBranchSub (int inode);
2229
void NodeToBranchSub (int inode)
2233
for(i=0; i<nodes[inode].nson; i++) {
2234
tree.branches[tree.nbranch][0] = inode;
2235
tree.branches[tree.nbranch][1] = ison = nodes[inode].sons[i];
2236
nodes[ison].ibranch = tree.nbranch++;
2237
if(nodes[ison].nson>0) NodeToBranchSub(ison);
2241
void NodeToBranch (void)
2244
NodeToBranchSub (tree.root);
2245
if(tree.nnode != tree.nbranch+1)
2246
error2("nnode != nbranch + 1?");
2250
void ClearNode (int inode)
2252
/* a source of confusion. Try not to use this routine.
2254
nodes[inode].father = nodes[inode].ibranch = -1;
2255
nodes[inode].nson = 0;
2256
nodes[inode].branch = nodes[inode].age = 0;
2257
/* nodes[inode].label = -1; */
2258
/* nodes[inode].branch = 0; clear node structure only, not branch lengths */
2259
/* for(i=0; i<com.ns; i++) nodes[inode].sons[i]=-1; */
2262
int ReadTreeB (FILE *ftree, int popline)
2265
int nodemark[NS*2-1]={0}; /* 0: absent; 1: father only (root); 2: son */
2266
int i,j, state=0, YoungAncestor=0;
2269
puts("\nbranch representation of tree might not work with clock model.");
2273
fscanf (ftree, "%d", &tree.nbranch);
2274
for(j=0; j<tree.nbranch; j++) {
2275
for(i=0; i<2; i++) {
2276
if (fscanf (ftree, "%d", & tree.branches[j][i]) != 1) state=-1;
2277
tree.branches[j][i]--;
2278
if(tree.branches[j][i]<0 || tree.branches[j][i]>com.ns*2-1)
2279
error2("ReadTreeB: node numbers out of range");
2281
nodemark[tree.branches[j][1]]=2;
2282
if(nodemark[tree.branches[j][0]]!=2) nodemark[tree.branches[j][0]]=1;
2283
if (tree.branches[j][0]<com.ns) YoungAncestor=1;
2285
printf ("\nBranch #%3d: %3d -> %3d",j+1,tree.branches[j][0]+1,tree.branches[j][1]+1);
2288
if(popline) fgets(line, 254, ftree);
2289
for(i=0,tree.root=-1; i<tree.nbranch; i++)
2290
if(nodemark[tree.branches[i][0]]!=2) tree.root=tree.branches[i][0];
2291
if(tree.root==-1) error2("root err");
2292
for(i=0; i<com.ns; i++)
2293
if(nodemark[i]==0) {
2294
matIout(F0,nodemark,1,com.ns);
2295
error2("branch specification of tree");
2299
puts("\nAncestors in the data? Take care.");
2300
if(!com.cleandata) {
2301
puts("This kind of tree does not work with unclean data.");
2307
com.ntime = com.clock ? (tree.nbranch+1)-com.ns+(tree.root<com.ns)
2316
int OutTreeB (FILE *fout)
2319
char *fmt[]={" %3d..%-3d", " %2d..%-2d"};
2320
FOR (j, tree.nbranch)
2321
fprintf(fout, fmt[0], tree.branches[j][0]+1,tree.branches[j][1]+1);
2325
int GetTreeFileType(FILE *ftree, int *ntree, int *pauptree, int shortform);
2327
int GetTreeFileType(FILE *ftree, int *ntree, int *pauptree, int shortform)
2329
/* paupstart="begin trees" and paupend="translate" identify paup tree files.
2330
paupch=";" will be the last character before the list of trees.
2331
Modify if necessary.
2333
int i,k, lline=32000, ch=0, paupch=';';
2335
char *paupstart="begin tree", *paupend="translate";
2338
k=fscanf(ftree,"%d%d",&i,ntree);
2340
if(i==com.ns) return(0); /* old paml style */
2341
else error2("Number of sequences different in tree and seq files.");
2343
else if(k==1) { *ntree=i; return(0); } /* phylip & molphy style */
2344
while(ch!='(' && !isalnum(ch) && ch!=EOF) ch=fgetc(ftree); /* treeview style */
2345
if(ch=='(') { *ntree=-1; ungetc(ch,ftree); return(0); }
2347
puts("\n# seqs in tree file does not match. Read as the nexus format.");
2349
if(fgets(line,lline,ftree)==NULL) error2("tree err1: EOF");
2351
if (strstr(line,paupstart)) { *pauptree=1; *ntree=-1; break; }
2353
if(shortform) return(0);
2355
if(fgets(line,lline,ftree)==NULL) error2("tree err2: EOF");
2357
if (strstr(line,paupend)) break;
2360
if((ch=fgetc(ftree))==EOF) error2("tree err3: EOF");
2361
if (ch==paupch) break;
2363
if(fgets(line,lline,ftree)==NULL) error2("tree err4: EOF");
2368
int PopPaupTreeRubbish(FILE *ftree);
2369
int PopPaupTreeRubbish(FILE *ftree)
2371
/* This reads out the string in front of the tree in the nexus format,
2372
typically "tree PAUP_1 = [&U]" with "[&U]" optional
2378
if(ch=='(') { ungetc(ch,ftree); return(0); }
2379
else if(ch==EOF) return(-1);
2385
static int *CladeLabel = NULL;
2387
void DownTreeCladeLabel (int inode, int cLabel)
2389
/* This goes down the tree to change $ labels (stored in CladeLabel[]) into
2390
# labels (stored in nodes[].label). To deal with nested clade labels,
2391
branches within a clade are labeled by negative numbers initially, and
2392
converted to positive labels at the end of the algorithm.
2394
nodes[].label and CladeLabel[] are initialized to -1 before this routine
2400
if(CladeLabel[inode] != -1)
2401
label = CladeLabel[inode];
2402
if(inode != tree.root && nodes[inode].label == -1)
2403
nodes[inode].label = label;
2404
for(i=0; i<nodes[inode].nson; i++)
2405
DownTreeCladeLabel(nodes[inode].sons[i], label);
2408
int IsNameNumber(char line[])
2410
/* returns 0 if line has species number; 1 if name; 2 if both number and name
2412
int isname=0, j,k, ns=com.ns;
2413
int SeparatorFixed=(int)'_';
2415
if(ns<1) error2("ns=0 in IsNameNumber");
2416
/* both name and number? */
2417
k = strchr(line, SeparatorFixed) - line;
2419
if(!isdigit(line[j])) break;
2423
for(j=0; line[j]; j++) /* name or number? */
2424
if(!isdigit(line[j])) { isname=1; break; }
2426
if(isname==0 || isname==2) {
2427
sscanf(line,"%d",&k);
2429
printf("species number %d outside range.", k);
2438
int ReadTreeN (FILE *ftree, int *haslength, int *haslabel, int copyname, int popline)
2440
/* Read a tree from ftree, using the parenthesis node representation of trees.
2441
Branch lengths are read in nodes[].branch, and branch (node) labels
2442
(integers) are preceeded by # and read in nodes[].label. If the clade label
2443
$ is used, the label is read into CladeLabel[] first and then moved into
2444
nodes[].label in the routine DownTreeCladeLabel().
2446
This assumes that com.ns is known.
2447
Species names are considered case-sensitive, with trailing spaces ignored.
2449
copyname = 0: species numbers and names are both accepted, but names have
2450
to match the names in com.spname[], which are from the
2451
sequence data file. Used by baseml and codeml, for example.
2452
1: species names are copied into com.spname[], but species
2453
numbers are accepted. Used by evolver for simulation,
2454
in which case no species names were read before.
2455
2: the tree must have species names, which are copied into com.spname[].
2456
Note that com.ns is assumed known. To remove this restrition,
2457
one has to consider the space for nodes[], CladeLabel, starting
2460
isname = 0: species number; 1: species name; 2:both number and name
2462
int cnode, cfather=-1; /* current node and father */
2463
int inodeb=0; /* node number that will have the next branch length */
2464
int i,j,k, level=0, isname, ch=' ', icurspecies=0;
2465
char check[NS], delimiters[]="(),:#$=@><;", quote[]="\"\'";
2467
char line[32000], *pch;
2469
if(com.ns<=0) error2("need to know ns before reading tree.");
2471
if((CladeLabel=(int*)malloc((com.ns*2-1)*sizeof(int)))==NULL)
2472
error2("oom trying to get space for cladelabel");
2473
for(i=0; i<2*com.ns-1; i++)
2476
/* initialization */
2477
for(i=0; i<com.ns; i++) check[i]=0;
2478
*haslength = 0; *haslabel = 0;
2479
tree.nnode = com.ns; tree.nbranch = 0;
2480
for(i=0; i<2*com.ns-1; i++) {
2481
nodes[i].father = nodes[i].ibranch = -1;
2482
nodes[i].nson = 0; nodes[i].label = -1; nodes[i].branch = 0;
2483
nodes[i].age = 0; /* TipDate models set this for each tree later. */
2484
#if (defined(BASEML) || defined(CODEML))
2485
nodes[i].fossil = 0;
2489
ch=fgetc(ftree); /* skip spaces */
2492
{ ReadTreeB(ftree,popline); return(0); }
2494
PopPaupTreeRubbish(ftree);
2498
if (ch==EOF) return(-1);
2499
else if (ch == ';') {
2500
if(level!=0) error2("; in treefile");
2504
else if (!isgraph(ch))
2506
else if (ch == '(') { /* left ( */
2509
if(tree.nnode>2*com.ns-1)
2510
error2("check #seqs and tree: perhaps too many '('?");
2512
if(nodes[cfather].nson >= MAXNSONS) {
2513
printf("there are at least %d daughter nodes, raise MAXNSONS?", nodes[cfather].nson);
2516
nodes[cfather].sons[nodes[cfather].nson++] = cnode;
2517
nodes[cnode].father = cfather;
2518
tree.branches[tree.nbranch][0] = cfather;
2519
tree.branches[tree.nbranch][1] = cnode;
2520
nodes[cnode].ibranch = tree.nbranch++;
2526
/* treating : and > in the same way is risky. */
2527
else if (ch==')') { level--; inodeb=cfather; cfather=nodes[cfather].father; }
2528
else if (ch==':'||ch=='>') {
2529
if(ch==':') *haslength=1;
2530
fscanf(ftree,"%lf",&nodes[inodeb].branch);
2532
else if (ch==quote[0] || ch==quote[1]) {
2533
for (k=0; ; k++) { /* read notes into line[] */
2534
line[k] = (char)fgetc(ftree);
2535
if((int)line[k] == EOF)
2536
error2("EOF when reading node label");
2537
if(line[k] == quote[0] || line[k] == quote[1])
2541
nodes[inodeb].nodeStr = (char*)malloc(k*sizeof(char));
2542
if (nodes[inodeb].nodeStr == NULL) error2("oom nodeStr");
2543
strcpy(nodes[inodeb].nodeStr, line);
2544
if((pch = strchr(line,'#')) || (pch = strchr(line,'<'))) {
2545
*haslabel=1; sscanf(pch+1, "%lf", &nodes[inodeb].label);
2547
if((pch = strchr(line,'>'))) {
2548
sscanf(pch+1, "%lf", &nodes[inodeb].branch);
2550
if((pch = strchr(line,'$'))) {
2551
*haslabel=1; sscanf(pch+1, "%d", &CladeLabel[inodeb]);
2553
if((pch = strchr(line,'=')) || (pch = strchr(line,'@'))) {
2554
sscanf(pch+1, "%lf", &nodes[inodeb].age);
2555
#if (defined(BASEML) || defined(CODEML))
2556
if(com.clock) nodes[inodeb].fossil = 1;
2558
#if (defined(CODEML))
2559
nodes[inodeb].omega = 0;
2563
else if (ch=='#'||ch=='<') { *haslabel=1; fscanf(ftree,"%lf",&nodes[inodeb].label); }
2564
else if (ch=='$') { *haslabel=1; fscanf(ftree,"%d",&CladeLabel[inodeb]); }
2565
else if (ch=='@'||ch=='=') {
2566
fscanf(ftree,"%lf",&nodes[inodeb].age);
2567
#if (defined(BASEML) || defined(CODEML))
2568
if(com.clock) nodes[inodeb].fossil = 1;
2570
#if (defined(CODEML))
2571
nodes[inodeb].omega = 0;
2574
else { /* read species name or number */
2575
line[0]=(char)ch; line[1]=(char)fgetc(ftree);
2576
/* if(line[1]==(char)EOF) error2("eof in tree file"); */
2578
for (i=1; i<lline; ) { /* read species name into line[] until delimiter */
2579
if ((strchr(delimiters,line[i]) && line[i]!='@')
2580
|| line[i]==(char)EOF || line[i]=='\n')
2581
{ ungetc(line[i],ftree); line[i]=0; break; }
2582
line[++i]=(char)fgetc(ftree);
2584
for(j=i-1;j>0;j--) /* trim spaces*/
2585
if(isgraph(line[j])) break; else line[j]=0;
2586
isname = IsNameNumber(line);
2588
if (isname==2) { /* both number and name */
2589
sscanf(line, "%d", &cnode); cnode--;
2590
strcpy(com.spname[cnode], line);
2592
else if (isname==0) { /* number */
2593
if(copyname==2) error2("Use names in tree.");
2594
sscanf(line, "%d", &cnode);
2599
for(i=0; i<com.ns; i++) if (!strcmp(line,com.spname[i])) break;
2600
if((cnode=i)==com.ns) { printf("\nSpecies %s?\n", line); exit(-1); }
2603
if(icurspecies>com.ns-1) {
2604
error2("error in tree: too many species in tree");
2606
strcpy(com.spname[cnode=icurspecies++], line);
2609
nodes[cnode].father=cfather;
2610
if(nodes[cfather].nson>=MAXNSONS)
2611
error2("too many daughter nodes, raise MAXNSONS");
2613
nodes[cfather].sons[nodes[cfather].nson++] = cnode;
2614
tree.branches[tree.nbranch][0] = cfather;
2615
tree.branches[tree.nbranch][1] = cnode;
2616
nodes[cnode].ibranch = tree.nbranch++;
2623
fgets(line, lline, ftree);
2624
for(i=0; i<com.ns; i++) {
2626
printf("\nSeq #%d occurs more than once in the tree\n",i+1); exit(-1);
2628
else if(check[i]<1) {
2629
printf("\nSeq #%d (%s) is missing in the tree\n",i+1,com.spname[i]); exit(-1);
2632
if(tree.nbranch>2*com.ns-2) {
2633
printf("nbranch %d", tree.nbranch); puts("too many branches in tree?");
2635
if (tree.nnode != tree.nbranch+1) {
2636
printf ("\nnnode%6d != nbranch%6d + 1\n", tree.nnode, tree.nbranch);
2640
/* check that it is o.k. to comment out this line
2641
com.ntime = com.clock ? (tree.nbranch+1)-com.ns+(tree.root<com.ns)
2645
#if(defined(BASEML) || defined(CODEML))
2646
/* check and convert clade labels $ */
2647
if(com.clock>1 || (com.seqtype==1 && com.model>=2)) {
2648
for(i=0,j=0; i<tree.nnode; i++) {
2649
if(CladeLabel[i] != -1) j++;
2651
if(j) {/* j is number of clade labels */
2652
DownTreeCladeLabel(tree.root, 0);
2655
for(i=0; i<tree.nnode; i++)
2656
if(i!=tree.root && nodes[i].label==-1) nodes[i].label = 0;
2658
/* OutTreeN(F0,1,PrBranch|PrNodeNum); FPN(F0); */
2659
/* FPN(F0); OutTreeN(F0,1,PrLabel); FPN(F0); */
2661
for(i=0,com.nbtype=0; i<tree.nnode; i++) {
2662
if(i == tree.root) continue;
2663
j = (int)nodes[i].label;
2664
if(j+1 > com.nbtype) com.nbtype=j+1;
2665
if(j<0 || j>tree.nbranch-1)
2666
error2("branch label in the tree (note labels start from 0 and are consecutive)");
2669
error2("need branch labels in the tree for the model.");
2671
printf("\n%d branch types are in tree. Stop if wrong.", com.nbtype);
2674
#if(defined(CODEML))
2675
if(com.seqtype==1 && com.NSsites==2 && com.model==3 && com.nbtype>NBTYPE)
2676
error2("nbtype too large. Raise NBTYPE");
2677
else if(com.seqtype==1 && com.NSsites && com.model==2 && com.nbtype!=2)
2678
error2("only two branch types are allowed for branch models.");
2690
int OutSubTreeN (FILE *fout, int inode, int spnames, int printopt, char *labelfmt);
2692
int OutSubTreeN (FILE *fout, int inode, int spnames, int printopt, char *labelfmt)
2694
int i, dad = nodes[inode].father, nsib = (inode==tree.root ? 0 : nodes[dad].nson);
2696
if(inode != tree.root && inode == nodes[dad].sons[0])
2699
for(i=0; i<nodes[inode].nson; i++)
2700
OutSubTreeN(fout, nodes[inode].sons[i], spnames, printopt, labelfmt);
2702
if(nodes[inode].nson==0) { /* inode is tip */
2704
if(printopt&PrNodeNum) fprintf(fout, "%d_",inode+1);
2705
fprintf(fout, "%s",com.spname[inode]);
2708
fprintf(fout, "%d", inode+1);
2710
if((printopt & PrNodeNum) && nodes[inode].nson)
2711
fprintf(fout," %d ", inode+1);
2712
if((printopt & PrLabel) && nodes[inode].label>0)
2713
fprintf(fout, labelfmt, nodes[inode].label);
2714
if((printopt & PrAge) && nodes[inode].age)
2715
fprintf(fout, " @%.3f", nodes[inode].age);
2717
/* Add branch labels to be read by Rod Page's TreeView. */
2718
#if (defined CODEML)
2719
if((printopt & PrOmega) && inode != tree.root)
2720
fprintf(fout," '#%.4f' ", nodes[inode].omega);
2721
#elif (defined (EVOLVER) || defined (MCMCTREE))
2722
if((printopt & PrLabel) && nodes[inode].nodeStr && nodes[inode].nodeStr[0])
2723
fprintf(fout," '%s'", nodes[inode].nodeStr);
2726
if((printopt & PrBranch) && (inode!=tree.root || nodes[inode].branch>0))
2727
fprintf(fout,": %.6f", nodes[inode].branch);
2728
if(nsib == 0) /* root */
2730
else if (inode == nodes[dad].sons[nsib-1]) /* last sib */
2732
else /* not last sib */
2733
fprintf(fout, ", ");
2739
int OutTreeN (FILE *fout, int spnames, int printopt)
2741
/* print the current tree.
2742
Can the block of print statements be moved inside the recursive function?
2745
char* labelfmt[2]={"'#%.5f'", "'#%.0f'"};
2747
if(printopt & PrLabel) {
2748
for(i=0; i<tree.nnode; i++)
2749
if(nodes[i].label-(int)nodes[i].label != 0) intlabel=0;
2752
OutSubTreeN(fout, tree.root, spnames, printopt, labelfmt[intlabel]);
2760
/* This cnages the bifurcation at the root into a trifurcation, but setting one of
2761
the sons to be the new root. The new root is the first son that is not a tip.
2762
tree.nnode is updated, but the routine does not re-number the nodes, so the new
2763
node labels do not go from ns, ns + 1, ..., as they normally should.
2765
int i, ison, sib, root = tree.root;
2767
if(nodes[root].nson!=2) error2("in DeRoot?");
2769
ison = nodes[root].sons[i = 0];
2770
if(nodes[ison].nson==0)
2771
ison = nodes[root].sons[i = 1];
2772
sib = nodes[root].sons[1 - i];
2773
nodes[sib].branch += nodes[ison].branch;
2774
nodes[sib].father = tree.root = ison;
2775
nodes[tree.root].father = -1;
2776
nodes[tree.root].sons[nodes[tree.root].nson++] = sib; /* sib added as the last child of the new root */
2777
nodes[tree.root].branch = 0;
2778
tree.nnode --; /* added 2007/4/9 */
2784
int PruneSubTreeN (int inode, int keep[])
2786
/* This prunes tips from the tree, using keep[com.ns]. Removed nodes in the
2787
big tree has nodes[].father=-1 and nodes[].nson=0.
2788
Do not change nodes[inode].nson and nodes[inode].sons[] until after the
2789
node's descendent nodes are all processed. So when a son is deleted,
2790
only the father node's nson is changed, but not
2792
int i,j, ison, father=nodes[inode].father, nson0=nodes[inode].nson;
2794
for(i=0; i<nson0; i++)
2795
PruneSubTreeN(nodes[inode].sons[i], keep);
2797
/* remove inode because of no descendents.
2798
Note this does not touch the father node */
2799
if(inode<com.ns && keep[inode]==0)
2800
nodes[inode].father=-1;
2801
else if(inode>=com.ns) {
2802
for(i=0,nodes[inode].nson=0; i<nson0; i++) {
2803
ison=nodes[inode].sons[i];
2804
if(nodes[ison].father!=-1)
2805
nodes[inode].sons[ nodes[inode].nson++ ] = nodes[inode].sons[i];
2807
if(nodes[inode].nson==0)
2808
nodes[inode].father=-1;
2811
/* remove inode if it has a single descendent ison */
2812
if(inode>=com.ns && nodes[inode].nson==1 && inode!=tree.root) {
2813
ison=nodes[inode].sons[0];
2814
nodes[ison].father=father;
2815
nodes[ison].branch+=nodes[inode].branch;
2816
for(j=0;j<nodes[father].nson;j++) {
2817
if(nodes[father].sons[j]==inode)
2818
{ nodes[father].sons[j]=ison; break; }
2820
nodes[inode].nson=0;
2821
nodes[inode].father=-1;
2823
/* move down the root if the root has only one descendent */
2824
else if(inode==tree.root) {
2825
if(nodes[inode].nson==1) {
2826
for(; ; inode=nodes[inode].sons[0]) {
2827
nodes[inode].father=-1;
2828
if(nodes[inode].nson>1) break;
2829
nodes[inode].nson=0;
2832
/* collapse down the root. ison is new root */
2833
if(!com.clock && Nsonroot>=3 && nodes[inode].nson==2) DeRoot();
2840
int GetSubTreeN (int keep[], int space[])
2842
/* This removes some tips to generate the subtree. Branch lengths are
2843
preserved by summing them up when some nodes are removed.
2844
The algorithm use post-order tree traversal to remove tips and nodes. It
2845
then switches to the branch representation to renumber nodes.
2846
space[] can be NULL. If not, it returns newnodeNO[], which holds the
2847
new node numbers; for exmaple, newnodeNO[12]=5 means that old node 12 now
2850
The routine does not change com.ns or com.spname[], which have to be updated
2853
CHANGE OF ROOT happens if the root in the old tree had >=3 sons, but has 2
2854
sons in the new tree and if (!com.clock). In that case, the tree is derooted.
2856
This routine does not work if a current seq is ancestral to some others
2857
and if that sequence is removed. (***check this comment ***)
2859
Different formats for keep[] are used. Suppose the current tree is for
2860
nine species: a b c d e f g h i.
2862
(A) keep[]={1,0,1,1,1,0,0,1,0} means that a c d e h are kept in the tree.
2863
The old tip numbers are not changed, so that OutTreeN(?,1,?) gives the
2864
correct species names or OutTreeN(?,0,?) gives the old species numbers.
2866
(B) keep[]={1,0,2,3,4,0,0,5,0} means that a c d e h are kept in the tree, and
2867
they are renumbered 0 1 2 3 4 and all the internal nodes are renumbered
2868
as well to be consecutive. Note that the positive numbers have to be
2869
consecutive natural numbers.
2871
keep[]={5,0,2,1,4,0,0,3,0} means that a c d e h are kept in the tree.
2872
However, the order of the sequences are changed to d c h e a, so that the
2873
numbers are now 0 1 2 3 4 for d c h e a. This is useful when the subtree
2874
is extracted from a big tree for a subset of the sequence data, while the
2875
species are odered d c h e a in the sequence data file.
2876
This option can be used to renumber the tips in the complete tree.
2878
int nsnew, i,j,k, nnode0=tree.nnode, sumnumber=0, newnodeNO[2*NS-1];
2882
Nsonroot=nodes[tree.root].nson;
2884
if(debug) { FOR(i,com.ns) printf("%-15s %2d\n", com.spname[i], keep[i]); }
2885
for(i=0,nsnew=0;i<com.ns;i++)
2886
if(keep[i]) { nsnew++; sumnumber+=keep[i]; }
2887
if(nsnew<2) return(-1);
2889
/* mark removed nodes in the big tree by father=-1 && nson=0 */
2890
PruneSubTreeN(tree.root, keep);
2891
if(debug) printtree(1);
2893
for(i=0,k=1; i<tree.nnode; i++) if(nodes[i].father!=-1) k++;
2897
if(sumnumber>nsnew) {
2898
if(sumnumber!=nsnew*(nsnew+1)/2) error2("keep[] not right in GetSubTreeN");
2899
if((branch0=(double*)malloc(nnode0*sizeof(double)))==NULL) error2("oom#");
2900
FOR(i,nnode0) branch0[i]=nodes[i].branch;
2901
FOR(i,nnode0) newnodeNO[i]=-1;
2902
FOR(i,com.ns) if(keep[i]) newnodeNO[i]=keep[i]-1;
2904
newnodeNO[tree.root] = k = nsnew; tree.root=k++;
2905
for( ; i<nnode0; i++) {
2906
if(nodes[i].father==-1) continue;
2907
for(j=0; j<tree.nbranch; j++) if(i==tree.branches[j][1]) break;
2908
if(j==tree.nbranch) error2("strange here");
2911
for(j=0; j<tree.nbranch; j++) FOR(i,2)
2912
tree.branches[j][i] = newnodeNO[tree.branches[j][i]];
2914
for(i=0;i<nnode0;i++) {
2916
nodes[newnodeNO[i]].branch=branch0[i];
2920
if(space) memmove(space, newnodeNO, (com.ns*2-1)*sizeof(int));
2925
void printtree (int timebranches)
2929
printf("\nns = %d nnode = %d", com.ns, tree.nnode);
2930
printf("\n%7s%7s", "father","node");
2931
if(timebranches) printf("%10s%10s%10s", "time","branch","label");
2932
printf(" %7s%7s", "nson:","sons");
2933
FOR (i, tree.nnode) {
2934
printf ("\n%7d%7d", nodes[i].father, i);
2936
printf(" %9.6f %9.6f %9.0f", nodes[i].age, nodes[i].branch,nodes[i].label);
2938
printf ("%7d: ", nodes[i].nson);
2939
FOR(j,nodes[i].nson) printf(" %2d", nodes[i].sons[j]);
2942
OutTreeN(F0,0,0); FPN(F0);
2943
OutTreeN(F0,1,0); FPN(F0);
2944
OutTreeN(F0,1,1); FPN(F0);
2948
void PointconPnodes (void)
2950
/* This points the nodes[com.ns+inode].conP to the right space in com.conP.
2951
The space is different depending on com.cleandata (0 or 1)
2952
This routine updates internal nodes com.conP only.
2953
End nodes (com.conP0) are updated in InitConditionalPNode().
2955
size_t nintern=0, i;
2957
for(i=0; i<tree.nbranch+1; i++)
2958
if(nodes[i].nson>0) /* more thinking */
2959
nodes[i].conP = com.conP + com.ncode*com.npatt*nintern ++;
2963
int SetxInitials (int np, double x[], double xb[][2])
2965
/* This forces initial values into the boundary of the space
2969
for (i=com.ntime; i<np; i++) {
2970
if (x[i]<xb[i][0]*1.05) x[i]=xb[i][0]*1.05;
2971
if (x[i]>xb[i][1]/1.05) x[i]=xb[i][1]/1.05;
2973
for (i=0; i<com.np; i++) {
2974
if (x[i]<xb[i][0]) x[i]=xb[i][0]*1.2;
2975
if (x[i]>xb[i][1]) x[i]=xb[i][1]*.8;
2981
#if(defined(BASEML) || defined(CODEML))
2983
double *AgeLow=NULL;
2984
int NFossils=0, AbsoluteRate=0;
2985
double ScaleTimes_TipDate=1, TipDate=0;
2987
MutationRate = mut/ScaleTimes_TipDate;
2988
age=age*ScaleTimes_TipDate
2991
void SetAge(int inode, double x[]);
2992
void GetAgeLow (int inode);
2993
/* number of internal node times, usd to deal with known ancestors. Broken? */
2994
static int innode_time=0;
2996
/* Ziheng Yang, 25 January 2003
2997
The following routines deal with clock and local clock models, including
2998
Andrew Rambaut's TipDate models (Rambaut 2000 Bioinformatics 16:395-399;
2999
Yoder & Yang 2000 Mol Biol Evol 17:1081-1090; Yang & Yoder 2003 Syst Biol).
3000
The tree is rooted. The routine SetAge assumes that ancestral nodes are
3001
arranged in the increasing order and so works only if the input tree uses
3002
the parenthesis notation and not the branch notation. The option of known
3003
ancestors is probably broken.
3005
The flag AbsoluteRate=1 if(TipDate || NFossils). This could be removed
3006
as the flags TipDate and NFossils are sufficient.
3008
clock = 1: global clock, deals with TipDate with no or many fossils,
3009
ignores branch rates (#) in tree if any.
3010
= 2: local clock models, as above, but requires branch rates #
3012
= 3: as 2, but requires Mgene and option G in sequence file.
3014
Order of variables in x[]: divergence times, rates for branches, rgene, ...
3015
In the following ngene=4, com.nbtype=3, with r_ij to be the rate
3016
of gene i and branch class j.
3019
[times, r00(if absolute) r01 r02 rgene1 rgene2 rgene3]
3020
NOTE: rgene[] has relative rates
3022
[times, r00(if absolute) r01 r02 r11 r12 r21 r22 r31 r32 rgene1 rgene2 rgene3]
3023
NOTE: rgene1=r10, rgene2=r20, rgene3=r30
3025
If(nodes[tree.root].fossil==0) x[0] has absolute time for the root.
3026
Otherwise x[0] has proportional ages.
3030
double GetBranchRate(int igene, int ibrate, double x[], int *ix)
3032
/* This finds the right branch rate in x[]. The rate is absolute if AbsoluteRate.
3033
ibrate=0,1,..., indicates the branch rate class.
3034
This routine is used in the likeihood calculation and in formatting output.
3035
ix (k) has the position in x[] for the branch rate if the rate is a parameter.
3036
and is -1 if the rate is not a parameter in the ML iteration. This is
3039
int nage=tree.nnode-com.ns-NFossils, k=nage+AbsoluteRate;
3040
double rate00=(AbsoluteRate?x[nage]:1), brate=rate00;
3042
if(igene==0 && ibrate==0)
3043
k = (AbsoluteRate?nage:-1);
3044
else if(com.clock==GlobalClock) {
3045
brate = x[k=com.ntime+igene-1]; /* igene>0, rgene[] has absolute rates */
3047
else if(com.clock==LocalClock) { /* rgene[] has relative rates */
3048
if(igene==0 && ibrate) { brate = x[k+=ibrate-1]; }
3049
else if(igene && ibrate==0){ brate = rate00*x[com.ntime+igene-1]; k=-1; }
3050
else if(igene && ibrate) { brate = x[k+ibrate-1]*x[com.ntime+igene-1]; k=-1; }
3052
else if(com.clock==ClockCombined) {
3053
if(ibrate==0 && igene) brate = x[k=com.ntime+igene-1];
3054
else brate = x[k+=ibrate-1+igene*(com.nbtype-1)]; /* ibrate>0 */
3061
int GetTipDate (void)
3063
/* This scans sequences for @ to collect dates if (com.clock), for Andrew
3064
Rambaut's TipDate models. This routine is called from GetInitialsTimes()
3066
Divergence times are rescaled by using ScaleTimes_TipDate.
3068
int i, ndates=0, mark='@';
3069
double young=-1,old=-1;
3073
ScaleTimes_TipDate=1;
3074
for(i=0,ndates=0; i<com.ns; i++) {
3076
p=strchr(com.spname[i], mark);
3077
if(p==NULL) continue;
3079
sscanf(p+1, "%lf", &nodes[i].age);
3080
if(nodes[i].age<0) error2("tip date<0");
3081
if(i==0) young=old=nodes[i].age;
3082
else { old=min2(old,nodes[i].age); young=max2(young,nodes[i].age); }
3084
if(ndates==0) return(0);
3086
/* TipDate models */
3088
error2("TipDate model: each sequence must have a date");
3090
ScaleTimes_TipDate=(TipDate-old)*5;
3091
if(ScaleTimes_TipDate==0) error2("All sequences of the same age?");
3092
for(i=0; i<tree.nnode; i++) {
3093
if(i<com.ns || nodes[i].fossil)
3094
nodes[i].age=(TipDate-nodes[i].age)/ScaleTimes_TipDate;
3097
if(noisy) printf("\nTipDate model: Date range: (%.2f, %.2f), (0, %.2f) after scaling\n",
3098
young, old, (young-old)/ScaleTimes_TipDate);
3104
void SetAge (int inode, double x[])
3106
/* This is called from SetBranch(), to set up age for nodes under clock
3107
models (clock=1,2,3).
3108
if(TipDate||NFossil), that is, if(AbsoluteRate), this routine sets up
3109
times (nodes[].age) and then SetBranch() sets up branch lengths by
3110
multiplying times with rate:
3111
[].age[i] = AgeLov[i]+([father].age-AgeLov[i])*x[i]
3113
The routine assumes that times are arranged in the order of node numbers,
3114
and should work if parenthesis notation of tree is used in the tree file,
3115
but not if the branch notation is used.
3119
FOR (i,nodes[inode].nson) {
3120
ison=nodes[inode].sons[i];
3121
if(nodes[ison].nson) {
3123
if(!nodes[ison].fossil)
3124
nodes[ison].age = AgeLow[ison]
3125
+(nodes[inode].age-AgeLow[ison])*x[innode_time++];
3128
nodes[ison].age=nodes[inode].age*x[innode_time++];
3134
void GetAgeLow (int inode)
3136
/* This sets AgeLow[], the minimum age of each node. It moves down the tree to
3137
scan [].age, which has tip dates and fossil dates. It is needed if(AbsoluteRate)
3138
and is called by GetInitialsTimes().
3143
FOR(i, nodes[inode].nson) {
3144
ison=nodes[inode].sons[i];
3145
if(nodes[ison].nson)
3147
tlow = max2(tlow, nodes[ison].age);
3149
if(nodes[inode].fossil) {
3150
if(nodes[inode].age<tlow)
3151
error2("age in tree is in conflict.");
3152
AgeLow[inode]=nodes[inode].age;
3155
AgeLow[inode]=nodes[inode].age=tlow;
3160
int SetBranch (double x[])
3162
/* if(AbsoluteRate), mutation rate is not multiplied here, but during the
3163
likelihood calculation. It is copied into com.rgene[0].
3169
for(i=0; i<tree.nnode; i++) {
3171
if((nodes[i].branch=x[nodes[i].ibranch])<small) status = -1;
3176
if(!LASTROUND) { /* transformed variables (proportions) are used */
3177
if(!nodes[tree.root].fossil) /* note order of times in x[] */
3178
nodes[tree.root].age = x[innode_time++];
3179
SetAge(tree.root, x);
3181
else { /* times are used */
3182
for(i=com.ns; i<tree.nnode; i++)
3183
if(!nodes[i].fossil) nodes[i].age = x[innode_time++];
3186
for(i=0; i<tree.nnode; i++) { /* [].age to [].branch */
3187
if(i==tree.root) continue;
3188
nodes[i].branch = nodes[nodes[i].father].age-nodes[i].age;
3189
if(nodes[i].branch<small)
3196
int GetInitialsTimes (double x[])
3198
/* this counts com.ntime and initializes x[] under clock and local clock models,
3199
including TipDate and ClockCombined models. See above for notes.
3200
Under local clock models, com.ntime includes both times and rates for
3202
A recursive algorithm is used to specify initials if(TipDate||NFossil).
3208
if(com.fix_blength==2)
3209
{ com.ntime=0; com.method=0; return(0); }
3210
else if(com.clock==0) {
3211
com.ntime = tree.nbranch;
3212
if(com.fix_blength==1) return(0);
3213
for(i=0; i<com.ntime; i++)
3214
x[i] = rndu()*0.1+0.01;
3216
if(com.fix_blength==0 && com.clock<5 && ancestor && com.ntime<100)
3217
LSDistance (&t, x, testx);
3222
/* clock models: check branch rate labels and fossil dates first */
3226
for(i=0; i<tree.nnode; i++) nodes[i].label=0;
3228
for(i=0; i<tree.nnode; i++) {
3229
if(i!=tree.root && (j=(int)nodes[i].label+1)>com.nbtype) {
3231
if(j<0 || j>tree.nbranch-1) error2("branch label in the tree.");
3234
for(j=0; j<com.nbtype; j++) {
3235
for(i=0; i<tree.nnode; i++)
3236
if(i!=tree.root && j==(int)nodes[i].label) break;
3238
printf("\nNot all branch labels (0, ..., %d) are found on tree?", com.nbtype-1);
3240
if(noisy) printf("\nfound %d branch rates in tree.\n", com.nbtype);
3241
if(com.nbtype<=1) error2("use clock = 1 or add branch rate labels in tree");
3243
for(i=0; i<tree.nbranch; i++)
3244
printf("%3.0f",nodes[tree.branches[i][1]].label); FPN(F0);
3247
for(i=0,NFossils=0,maxage=0; i<tree.nnode; i++) {
3248
if(nodes[i].nson && nodes[i].fossil) {
3250
maxage=max2(maxage,nodes[i].age);
3253
if(NFossils && maxage>10)
3254
error2("Change time unit so that fossil dates fall in (0.00001, 10).");
3257
AbsoluteRate=(TipDate || NFossils);
3258
if(com.clock>=5 && AbsoluteRate==0)
3259
error2("needs fossil calibrations");
3261
com.ntime = AbsoluteRate+(tree.nnode-com.ns-NFossils)+(com.nbtype-1);
3262
if(com.clock == ClockCombined) com.ntime += (com.ngene-1)*(com.nbtype-1);
3263
com.ntime += (tree.root<com.ns); /* root is a known sequence. Broken? */
3265
/* DANGER! AgeLow is not freed in the program. Fix this? */
3268
AgeLow = (double*)realloc(AgeLow, tree.nnode*sizeof(double));
3269
GetAgeLow(tree.root);
3271
if(!nodes[tree.root].fossil)
3272
x[k++] = (AbsoluteRate?nodes[tree.root].age*(1.2+rndu()) : rndu()*.5+.1); /* root age */
3273
for(; k<tree.nnode-com.ns-NFossils; k++) /* relative times */
3275
if(com.clock!=6) /* branch rates */
3276
for( ; k<com.ntime; k++)
3277
x[k]=0.1*(.5+rndu());
3279
for(j=0,k=com.ntime-1; j<data.ngene; j++,k++)
3280
x[k]=0.1*(.5+rndu());
3284
int OutputTimesRates (FILE *fout, double x[], double var[])
3286
/* SetBranch() has been called before calling this, so that [].age is up
3289
int i,j,k=AbsoluteRate+tree.nnode-com.ns-NFossils, jeffnode;
3290
double scale=(TipDate?ScaleTimes_TipDate:1);
3293
if(AbsoluteRate && com.clock<5) {
3294
fputs("\nSubstitution rate is per time unit\n", fout);
3295
if(com.nbtype>1) fprintf(fout,"Rates for branch groups\n");
3296
for(i=0; i<com.ngene; i++,FPN(fout)) {
3297
if(com.ngene>1) fprintf(fout,"Gene %2d: ", i+1);
3298
for(j=0; j<com.nbtype; j++) {
3299
fprintf(fout,"%12.6f", GetBranchRate(i,j,x,&k)/scale);
3300
if(i==0 && j==0 && !AbsoluteRate) continue;
3301
if((com.clock!=LocalClock||com.ngene==1) && com.getSE) {
3302
if(k==-1) error2("we are in trouble. k should not be -1 here.");
3303
fprintf(fout," +- %8.6f", sqrt(var[k*com.np+k])/scale);
3310
fprintf (fout,"rates for branches: 1");
3311
for(k=tree.nnode-com.ns; k<com.ntime; k++) fprintf(fout," %8.5f",x[k]);
3317
fputs("\nNodes and Times\n",fout);
3318
fputs("(JeffNode is for Thorne's multidivtime. ML analysis uses ingroup data only.)\n\n",fout);
3320
if(TipDate) { /* DANGER! SE not printed if(TipDate && NFossil). */
3321
for(i=0,k=0; i<tree.nnode; i++,FPN(fout)) {
3322
jeffnode=(i<com.ns?i:tree.nnode-1+com.ns-i);
3323
fprintf(fout,"Node %3d (Jeffnode %3d) Time %7.2f ",i+1, jeffnode,
3324
TipDate-nodes[i].age*scale);
3325
if(com.getSE && i>=com.ns && !nodes[i].fossil) {
3326
fprintf(fout," +- %6.2f", sqrt(var[k*com.np+k])*scale);
3331
else if(AbsoluteRate) {
3332
for(i=com.ns,k=0; i<tree.nnode; i++,FPN(fout)) {
3333
jeffnode=tree.nnode-1+com.ns-i;
3334
fprintf(fout,"Node %3d (Jeffnode %3d) Time %9.5f", i+1, tree.nnode-1+com.ns-i,
3336
if(com.getSE && i>=com.ns && !nodes[i].fossil) {
3337
fprintf(fout," +- %7.5f", sqrt(var[k*com.np+k]));
3338
if(fabs(nodes[i].age-x[k])>1e-5) error2("node order wrong.");
3347
int SetxBoundTimes (double xb[][2])
3349
/* This sets bounds for times (or branch lengths) and branch rates
3352
double tb[]={4e-6,50}, rateb[]={1e-4,99}, pb[]={.000001,.999999};
3355
for(i=0;i<com.ntime;i++) {
3361
k=0; xb[0][0]=tb[0]; xb[0][1]=tb[1];
3362
if(!nodes[tree.root].fossil) {
3363
if(AbsoluteRate) xb[0][0]=AgeLow[tree.root];
3366
for( ; k<tree.nnode-com.ns-NFossils; k++) /* proportional ages */
3367
{ xb[k][0]=pb[0]; xb[k][1]=pb[1]; }
3368
for(; k<com.ntime; k++) /* rate and branch rates */
3369
FOR(j,2) xb[k][j]=rateb[j];
3377
#if(defined(BASEML) || defined(BASEMLG) || defined(CODEML))
3380
int readx(double x[], int *fromfile)
3382
/* this reads parameters from file, used as initial values
3383
if(runmode>0), this reads common substitution parameters only into x[], which
3384
should be copied into another place before heuristic tree search. This is broken
3385
right now. Ziheng, 9 July 2003.
3386
fromfile=0: if nothing read from file, 1: read from file, -1:fix parameters
3392
times++; *fromfile=0;
3393
if(finitials==NULL || (com.runmode>0 && times>1)) return(0);
3394
if(com.runmode<=0) { npin=com.np; xin=x; }
3395
else { npin=com.np-com.ntime; xin=x+com.ntime; }
3397
if(npin<=0) return(0);
3398
if(com.runmode>0&&com.seqtype==1&&com.model) error2("option or in.codeml");
3399
printf("\nReading initials/paras from file (np=%d). Stop if wrong.\n",npin);
3400
fscanf(finitials,"%lf",&xin[i=0]);
3402
if(xin[0]==-1) { *fromfile=-1; LASTROUND=1; }
3404
for( ; i<npin; i++) if(fscanf(finitials,"%lf",&xin[i])!=1) break;
3406
{ printf("err at #%d. Edit or remove it.\n",i+1); exit(-1); }
3408
matout(F0,xin,1,npin);
3409
puts("Those are fixed for tree search. Stop if wrong.");
3416
#if(defined(BASEML) || defined(CODEML))
3418
int CollapsNode (int inode, double x[])
3420
/* Merge inode to its father. Update the first com.ntime elments of
3421
x[] only if (x!=NULL), by using either x[] if clock=1 or
3422
nodes[].branch if clock=0. So when clock=0, the routine works
3423
properly only if SetBranch() is called before this routine, which
3424
is true if m.l. or l.s. has been used to estimate branch lengths.
3426
int i,j, ifather, ibranch, ison;
3428
if (inode==tree.root || inode<com.ns) error2("err CollapsNode");
3429
ibranch=nodes[inode].ibranch; ifather=nodes[inode].father;
3430
for (i=0; i<nodes[inode].nson; i++) {
3431
ison=nodes[inode].sons[i];
3432
tree.branches[nodes[ison].ibranch][0]=ifather;
3434
for (i=ibranch+1; i<tree.nbranch; i++)
3435
for (j=0; j<2; j++) tree.branches[i-1][j]=tree.branches[i][j];
3436
tree.nbranch--; com.ntime--;
3437
for (i=0; i<tree.nbranch; i++) for (j=0; j<2; j++)
3438
if (tree.branches[i][j]>inode) tree.branches[i][j]--;
3443
for (i=inode+1; i<tree.nnode+1; i++) x[i-1-com.ns]=x[i-com.ns];
3445
for (i=ibranch+1; i<tree.nbranch+1; i++) x[i-1]=x[i];
3456
void DescentGroup (int inode);
3457
void BranchPartition (char partition[], int parti2B[]);
3459
static char *PARTITION;
3461
void DescentGroup (int inode)
3464
for (i=0; i<nodes[inode].nson; i++)
3465
if (nodes[inode].sons[i]<com.ns)
3466
PARTITION[nodes[inode].sons[i]]=1;
3468
DescentGroup (nodes[inode].sons[i]);
3471
void BranchPartition (char partition[], int parti2B[])
3473
/* calculates branch partitions.
3474
partition[0,...,ns-1] marks the species bi-partition by the first interior
3475
branch. It uses 0 and 1 to indicate which side of the branch each species
3477
partition[ns,...,2*ns-1] marks the second interior branch.
3478
parti2B[0] maps the partition (internal branch) to the branch in tree.
3479
Use NULL for parti2B if this information is not needed.
3480
partition[nib*com.ns]. nib: # of interior branches.
3482
int i,j, nib; /* number of internal branches */
3484
for (i=0,nib=0; i<tree.nbranch; i++) {
3485
if (tree.branches[i][1]>=com.ns){
3486
PARTITION=partition+nib*com.ns;
3487
FOR (j,com.ns) PARTITION[j]=0;
3488
DescentGroup (tree.branches[i][1]);
3489
if (parti2B) parti2B[nib]=i;
3491
/* set first species to 0 */
3492
if(PARTITION[0]) FOR(j,com.ns) PARTITION[j]=(char)!PARTITION[j];
3495
if (nib!=tree.nbranch-com.ns) error2("err BranchPartition");
3499
int NSameBranch (char partition1[],char partition2[], int nib1,int nib2,
3502
/* counts the number of correct (identical) bipartitions.
3503
nib1 and nib2 are the numbers of interior branches in the two trees
3504
correctIB[0,...,(correctbranch-1)] lists the correct interior branches,
3505
that is, interior branches in tree 1 that is also in tree 2.
3506
IBsame[i]=1 if interior branch i is correct.
3508
int i,j,k, nsamebranch,nsamespecies;
3510
for (i=0,nsamebranch=0; i<nib1; i++) for(j=0,IBsame[i]=0; j<nib2; j++) {
3511
for (k=0,nsamespecies=0;k<com.ns;k++)
3512
if(partition1[i*com.ns+k]!=partition2[j*com.ns+k]) break;
3514
{ nsamebranch++; IBsame[i]=1; break; }
3516
return (nsamebranch);
3521
int AddSpecies (int is, int ib)
3523
/* Add species (is) to tree at branch ib. The tree currently has
3524
is+1-1 species. Interior node numbers are increased by 2 to make
3525
room for the new nodes.
3526
if(com.clock && ib==tree.nbranch), the new species is added as an
3527
outgroup to the rooted tree.
3531
if(ib>tree.nbranch+1 || (ib==tree.nbranch && !com.clock)) return(-1);
3533
if(ib==tree.nbranch && com.clock) {
3534
FOR(i,tree.nbranch) FOR(j,2)
3535
if (tree.branches[i][j]>=is) tree.branches[i][j]+=2;
3536
it=tree.root; if(tree.root>=is) it+=2;
3537
FOR(i,2) tree.branches[tree.nbranch+i][0]=tree.root=is+1;
3538
tree.branches[tree.nbranch++][1]=it;
3539
tree.branches[tree.nbranch++][1]=is;
3542
FOR(i,tree.nbranch) FOR(j,2)
3543
if (tree.branches[i][j]>=is) tree.branches[i][j]+=2;
3544
it=tree.branches[ib][1];
3545
tree.branches[ib][1]=is+1;
3546
tree.branches[tree.nbranch][0]=is+1;
3547
tree.branches[tree.nbranch++][1]=it;
3548
tree.branches[tree.nbranch][0]=is+1;
3549
tree.branches[tree.nbranch++][1]=is;
3550
if (tree.root>=is) tree.root+=2;
3560
{struct TREEB tree; struct TREEN nodes[2*NS-1]; double x[NP]; }
3564
{struct TREEB tree; struct TREEN nodes[2*NS-1];} treestar;
3567
int Perturbation(FILE* fout, int initialMP, double space[]);
3569
int Perturbation(FILE* fout, int initialMP, double space[])
3571
/* heuristic tree search by the NNI tree perturbation algorithm.
3572
Some trees are evaluated multiple times as no trees are kept.
3573
This needs more work.
3575
int step=0, ntree=0, nmove=0, improve=0, ineighb, i,j;
3576
int sizetree=(2*com.ns-1)*sizeof(struct TREEN);
3577
double *x=treestar.x;
3580
if(com.clock) error2("\n\aerr: pertubation does not work with a clock yet.\n");
3581
if(initialMP&&!com.cleandata)
3582
error2("\ncannot get initial parsimony tree for gapped data yet.");
3584
fprintf(fout, "\n\nHeuristic tree search by NNI perturbation\n");
3586
if (noisy) printf("\nInitial tree from stepwise addition with MP:\n");
3587
fprintf(fout, "\nInitial tree from stepwise addition with MP:\n");
3588
StepwiseAdditionMP (space);
3591
if (noisy) printf ("\nInitial tree read from file %s:\n", com.treef);
3592
fprintf(fout, "\nInitial tree read from file.\n");
3593
if ((ftree=fopen (com.treef,"r"))==NULL) error2("treefile not exist?");
3594
fscanf (ftree, "%d%d", &i, &ntree);
3595
if (i!=com.ns) error2("ns in the tree file");
3596
if(ReadTreeN(ftree, &i, &j, 0, 1)) error2("err tree..");
3599
if (noisy) { FPN (F0); OutTreeN(F0,0,0); FPN(F0); }
3600
tree.lnL=TreeScore(x, space);
3601
if (noisy) { OutTreeN(F0,0,1); printf("\n lnL = %.4f\n",-tree.lnL); }
3602
OutTreeN(fout,1,1); fprintf(fout, "\n lnL = %.4f\n",-tree.lnL);
3603
if (com.np>com.ntime) {
3604
fprintf(fout, "\tparameters:");
3605
for(i=com.ntime; i<com.np; i++) fprintf(fout, "%9.5f", x[i]);
3609
treebest.tree=tree; memcpy(treebest.nodes, nodes, sizetree);
3611
for (step=0; ; step++) {
3612
for (ineighb=0,improve=0; ineighb<(tree.nbranch-com.ns)*2; ineighb++) {
3613
tree=treebest.tree; memcpy (nodes, treebest.nodes, sizetree);
3614
NeighborNNI (ineighb);
3616
printf("\nTrying tree # %d (%d move[s]) \n", ++ntree,nmove);
3617
OutTreeN(F0,0,0); FPN(F0);
3619
tree.lnL=TreeScore(x, space);
3620
if (noisy) { OutTreeN(F0,1,1); printf("\n lnL = %.4f\n",-tree.lnL);}
3621
if (noisy && com.np>com.ntime) {
3622
printf("\tparameters:");
3623
for(i=com.ntime; i<com.np; i++) printf("%9.5f", x[i]);
3626
if (tree.lnL<=treebest.tree.lnL) {
3627
treebest.tree=tree; memcpy (treebest.nodes, nodes, sizetree);
3629
if (noisy) printf(" moving to this tree\n");
3631
fprintf(fout, "\nA better tree:\n");
3632
OutTreeN(fout,0,0); FPN(fout); OutTreeN(fout,1,1); FPN(fout);
3633
fprintf(fout, "\nlnL = %.4f\n", tree.lnL);
3634
if (com.np>com.ntime) {
3635
fprintf(fout,"\tparameters:");
3636
for(i=com.ntime; i<com.np; i++) fprintf(fout,"%9.5f", x[i]);
3643
if (!improve) break;
3645
tree=treebest.tree; memcpy (nodes, treebest.nodes, sizetree);
3647
printf("\n\nBest tree found:\n");
3648
OutTreeN(F0,0,0); FPN(F0); OutTreeN(F0,1,1); FPN(F0);
3649
printf("\nlnL = %.4f\n", tree.lnL);
3652
fprintf(fout, "\n\nBest tree found:\n");
3653
OutTreeN(fout,0,0); FPN(fout); OutTreeN(fout,1,1); FPN(fout);
3654
fprintf(fout, "\nlnL = %.4f\n", tree.lnL);
3660
static int *_U0, *_step0, _mnnode;
3661
/* up pass characters and changes for the star tree: each of size npatt*nnode*/
3663
int StepwiseAdditionMP (double space[])
3665
/* tree search by species addition.
3668
int ns0=com.ns, is, i,j,h, tiestep=0,tie,bestbranch=0;
3669
int sizetree=(2*com.ns-1)*sizeof(struct TREEN);
3670
double bestscore=0,score;
3673
_U0=(int*)malloc(com.npatt*_mnnode*sizeof(int));
3674
_step0=(int*)malloc(com.npatt*_mnnode*sizeof(int));
3676
printf("\n%9ld bytes for MP (U0 & N0)\n", 2*com.npatt*_mnnode*sizeof(int));
3677
if (_U0==NULL || _step0==NULL) error2("oom U0&step0");
3679
FOR (i,ns0) z0[i]=com.z[i];
3680
tree.nbranch=tree.root=com.ns=3;
3681
FOR (i, tree.nbranch) { tree.branches[i][0]=com.ns; tree.branches[i][1]=i; }
3685
{ _U0[h*_mnnode+i]=1<<(com.z[i][h]-1); _step0[h*_mnnode+i]=0; }
3686
for (is=com.ns,tie=0; is<ns0; is++) {
3687
treestar.tree=tree; memcpy (treestar.nodes, nodes, sizetree);
3689
for (j=0; j<treestar.tree.nbranch; j++,com.ns--) {
3690
tree=treestar.tree; memcpy (nodes, treestar.nodes, sizetree);
3693
score=MPScoreStepwiseAddition(is, space, 0);
3696
printf(" Add sp %d (ns=%d) at branch %d, score %.0f\n", is+1,com.ns,j+1,score);
3698
if (j && score==bestscore) tiestep=1;
3699
if (j==0 || score<bestscore || (score==bestscore&&rndu()<.1)) {
3701
bestscore=score; bestbranch=j;
3705
tree=treestar.tree; memcpy (nodes, treestar.nodes, sizetree);
3707
AddSpecies (is, bestbranch);
3708
score=MPScoreStepwiseAddition(is, space, 1);
3711
{ printf("\r Added %d [%5.0f steps]",is+1,-bestscore); fflush(F0);}
3713
if (noisy>2) printf(" %d stages with ties, ", tie);
3715
free(_U0); free(_step0);
3719
double MPScoreStepwiseAddition (int is, double space[], int save)
3721
/* this changes only the part of the tree affected by the newly added
3723
save=1 for the best tree, so that _U0 & _step0 are updated
3725
int *U,*N,U3[3], h,ist, i,father,son2,*pU0=_U0,*pN0=_step0;
3728
U=(int*)space; N=U+_mnnode;
3729
for (h=0,score=0; h<com.npatt; h++,pU0+=_mnnode,pN0+=_mnnode) {
3730
FOR (i, tree.nnode) { U[i]=pU0[i-2*(i>=is)]; N[i]=pN0[i-2*(i>=is)]; }
3731
U[is]=1<<(com.z[is][h]-1); N[is]=0;
3732
for (ist=is; (father=nodes[ist].father)!=tree.root; ist=father) {
3733
if ((son2=nodes[father].sons[0])==ist) son2=nodes[father].sons[1];
3734
N[father]=N[ist]+N[son2];
3735
if ((U[father]=U[ist]&U[son2])==0)
3736
{ U[father]=U[ist]|U[son2]; N[father]++; }
3738
FOR (i,3) U3[i]=U[nodes[tree.root].sons[i]];
3740
if (U3[0]&U3[1]&U3[2]) N[tree.root]=0;
3741
else if (U3[0]&U3[1] || U3[1]&U3[2] || U3[0]&U3[2]) N[tree.root]=1;
3742
FOR(i,3) N[tree.root]+=N[nodes[tree.root].sons[i]];
3745
memcpy (pU0, U, tree.nnode*sizeof(int));
3746
memcpy (pN0, N, tree.nnode*sizeof(int));
3748
score+=N[tree.root]*com.fpatt[h];
3754
double TreeScore(double x[], double space[])
3756
static int fromfile=0;
3758
double xb[NP][2], e=1e-9, lnL=0;
3760
if(com.clock==2) error2("local clock in TreeScore");
3761
com.ntime = com.clock ? tree.nnode-com.ns : tree.nbranch;
3763
GetInitials(x, &i); /* this shoulbe be improved??? */
3767
if(com.method==0 || !fromfile) SetxBound(com.np, xb);
3770
lnL = com.plfun(x,com.np);
3774
if(com.method==0 || com.ntime==0)
3775
ming2(NULL,&lnL,com.plfun,NULL,x,xb, space,e,com.np);
3777
minB(NULL, &lnL, x, xb, e, space);
3783
int StepwiseAddition (FILE* fout, double space[])
3785
/* heuristic tree search by species addition. Species are added in the order
3786
of occurrence in the data.
3787
Try to get good initial values.
3789
char *z0[NS], *spname0[NS];
3790
int ns0=com.ns, is, i,j, bestbranch=0, randadd=0, order[NS];
3791
int sizetree=(2*com.ns-1)*sizeof(struct TREEN);
3792
double bestscore=0,score, *x=treestar.x;
3794
if(com.ns>50) printf("if this crashes, increase com.sspace?");
3796
if(com.ns<3) error2("2 sequences, no need for tree search");
3797
if (noisy) printf("\n\nHeuristic tree search by stepwise addition\n");
3798
if (fout) fprintf(fout, "\n\nHeuristic tree search by stepwise addition\n");
3799
FOR (i,ns0) { z0[i]=com.z[i]; spname0[i]=com.spname[i]; }
3800
tree.nbranch=tree.root=com.ns=(com.clock?2:3);
3802
FOR(i,ns0) order[i]=i;
3805
{ j=(int)(ns0*rndu()); is=order[i]; order[i]=order[j]; order[j]=is; }
3806
if(noisy) FOR(i,ns0) printf(" %d", order[i]+1);
3808
fputs("\nOrder of species addition:\n",fout);
3809
FOR(i,ns0)fprintf(fout,"%3d %-s\n", order[i]+1,com.spname[order[i]]);
3811
for(i=0; i<ns0; i++) {
3812
com.z[i]=z0[order[i]];
3813
com.spname[i]=spname0[order[i]];
3817
for(i=0; i<tree.nbranch; i++) {
3818
tree.branches[i][0]=com.ns; tree.branches[i][1]=i;
3821
for (is=com.ns; is<ns0; is++) { /* add the is_th species */
3822
treestar.tree=tree; memcpy (treestar.nodes, nodes, sizetree);
3824
for (j=0; j<treestar.tree.nbranch+(com.clock>0); j++,com.ns--) {
3825
tree=treestar.tree; memcpy(nodes, treestar.nodes, sizetree);
3828
score=TreeScore(x, space);
3830
{ printf("\n "); OutTreeN(F0, 0, 0); printf("%12.3f",-score); }
3832
if (j==0 || score<bestscore || (score==bestscore&&rndu()<.2)) {
3833
treebest.tree=tree; memcpy(treebest.nodes, nodes, sizetree);
3834
xtoy (x, treebest.x, com.np);
3835
bestscore=score; bestbranch=j;
3838
tree=treebest.tree; memcpy(nodes,treebest.nodes, sizetree);
3839
xtoy (treebest.x, x, com.np);
3843
printf("\n\nAdded sp. %d, %s [%.3f]\n",is+1,com.spname[is],-bestscore);
3844
OutTreeN(F0,0,0); FPN(F0); OutTreeN(F0,1,0); FPN(F0);
3845
if (com.np>com.ntime) {
3846
printf("\tparameters:");
3847
for(i=com.ntime; i<com.np; i++) printf("%9.5f", x[i]);
3852
fprintf(fout,"\n\nAdded sp. %d, %s [%.3f]\n",
3853
is+1, com.spname[is], -bestscore);
3854
OutTreeN(fout,0,0); FPN(fout);
3855
OutTreeN(fout,1,1); FPN(fout);
3856
if (com.np>com.ntime) {
3857
fprintf(fout, "\tparameters:");
3858
for(i=com.ntime; i<com.np; i++) fprintf(fout, "%9.5f", x[i]);
3870
int DecompTree (int inode, int ison1, int ison2);
3871
#define hdID(i,j) (max2(i,j)*(max2(i,j)-1)/2+min2(i,j))
3873
int StarDecomposition (FILE *fout, double space[])
3875
/* automatic tree search by star decomposition, nhomo<=1
3876
returns (0,1,2,3) for the 4s problem.
3878
int status=0,stage=0, i,j, itree,ntree=0,ntreet,best=0,improve=1,collaps=0;
3879
int inode, nson=0, ison1,ison2, son1, son2;
3880
int sizetree=(2*com.ns-1)*sizeof(struct TREEN);
3882
FILE *ftree, *fsum=frst;
3884
if (com.runmode==1) { /* read the star-like tree from tree file */
3885
if ((ftree=fopen (com.treef,"r"))==NULL) error2("no treefile");
3886
fscanf (ftree, "%d%d", &i, &ntree);
3887
if (ReadTreeN(ftree, &i, &j, 0, 1)) error2("err tree file");
3890
else { /* construct the star tree of ns species */
3891
tree.nnode = (tree.nbranch=tree.root=com.ns)+1;
3892
for (i=0; i<tree.nbranch; i++)
3893
{ tree.branches[i][0]=com.ns; tree.branches[i][1]=i; }
3894
com.ntime = com.clock?1:tree.nbranch;
3897
if (noisy) { printf("\n\nstage 0: "); OutTreeN(F0,0,0); }
3898
if (fsum) { fprintf(fsum,"\n\nstage 0: "); OutTreeN(fsum,0,0); }
3899
if (fout) { fprintf(fout,"\n\nstage 0: "); OutTreeN(fout,0,0); }
3901
tree.lnL=TreeScore(x,space);
3903
if (noisy) printf("\nlnL:%14.6f%6d", -tree.lnL, NFunCall);
3904
if (fsum) fprintf(fsum,"\nlnL:%14.6f%6d", -tree.lnL, NFunCall);
3906
fprintf(fout,"\nlnL(ntime:%3d np:%3d):%14.6f\n",
3907
com.ntime, com.np, -tree.lnL);
3908
OutTreeB (fout); FPN(fout);
3909
FOR (i, com.np) fprintf (fout,"%9.5f", x[i]); FPN (fout);
3911
treebest.tree=tree; memcpy(treebest.nodes,nodes,sizetree);
3912
FOR (i,com.np) treebest.x[i]=x[i];
3913
for (ntree=0,stage=1; ; stage++) {
3914
for (inode=treebest.tree.nnode-1; inode>=0; inode--) {
3915
nson=treebest.nodes[inode].nson;
3917
if (com.clock) { if (nson>2) break; }
3918
else if (nson>2+(inode==treebest.tree.root)) break;
3920
if (inode==-1 || /*stage>com.ns-3+com.clock ||*/ !improve) { /* end */
3921
tree=treebest.tree; memcpy (nodes, treebest.nodes, sizetree);
3924
printf("\n\nbest tree: "); OutTreeN(F0,0,0);
3925
printf(" lnL:%14.6f\n", -tree.lnL);
3928
fprintf(fsum, "\n\nbest tree: "); OutTreeN(fsum,0,0);
3929
fprintf(fsum, " lnL:%14.6f\n", -tree.lnL);
3932
fprintf(fout, "\n\nbest tree: "); OutTreeN(fout,0,0);
3933
fprintf(fout, " lnL:%14.6f\n", -tree.lnL);
3934
OutTreeN(fout,1,1); FPN(fout);
3938
treestar=treebest; memcpy(nodes,treestar.nodes,sizetree);
3940
if (collaps && stage) {
3941
printf ("\ncollapsing nodes\n");
3942
OutTreeN(F0, 1, 1); FPN(F0);
3944
tree=treestar.tree; memcpy(nodes, treestar.nodes, sizetree);
3945
for (i=com.ns,j=0; i<tree.nnode; i++)
3946
if (i!=tree.root && nodes[i].branch<1e-7)
3947
{ CollapsNode (i, treestar.x); j++; }
3948
treestar.tree=tree; memcpy(treestar.nodes, nodes, sizetree);
3951
fprintf (fout, "\n%d node(s) collapsed\n", j);
3952
OutTreeN(fout, 1, 1); FPN(fout);
3955
printf ("\n%d node(s) collapsed\n", j);
3956
OutTreeN(F0, 1, 1); FPN(F0);
3957
/* if (j) getchar (); */
3961
ntreet = nson*(nson-1)/2;
3962
if (!com.clock && inode==treestar.tree.root && nson==4) ntreet=3;
3963
com.ntime++; com.np++;
3966
printf ("\n\nstage %d:%6d trees, ntime:%3d np:%3d\nstar tree: ",
3967
stage, ntreet, com.ntime, com.np);
3969
printf (" lnL:%10.3f\n", -treestar.tree.lnL);
3972
fprintf (fsum, "\n\nstage %d:%6d trees, ntime:%3d np:%3d\nstar tree: ",
3973
stage, ntreet, com.ntime, com.np);
3974
OutTreeN(fsum, 0, 0);
3975
fprintf (fsum, " lnL:%10.6f\n", -treestar.tree.lnL);
3978
fprintf (fout,"\n\nstage %d:%6d trees\nstar tree: ", stage, ntreet);
3979
OutTreeN(fout, 0, 0);
3980
fprintf (fout, " lnL:%14.6f\n", -treestar.tree.lnL);
3981
OutTreeN(fout, 1, 1); FPN (fout);
3984
for (ison1=0,itree=improve=0; ison1<nson; ison1++)
3985
for (ison2=ison1+1; ison2<nson&&itree<ntreet; ison2++,itree++,ntree++) {
3986
DecompTree (inode, ison1, ison2);
3987
son1=nodes[tree.nnode-1].sons[0];
3988
son2=nodes[tree.nnode-1].sons[1];
3990
for(i=com.np-1; i>0; i--) x[i]=treestar.x[i-1];
3992
for (i=0; i<tree.nbranch; i++)
3993
x[i]=max2(nodes[tree.branches[i][1]].branch*0.99, 0.0001);
3995
for (i=1,x[0]=max2(x[0],.01); i<com.ntime; i++) x[i]=.5;
3998
printf("\nS=%d:%3d/%d T=%4d ", stage,itree+1,ntreet,ntree+1);
4002
fprintf(fsum, "\nS=%d:%3d/%d T=%4d ", stage,itree+1,ntreet,ntree+1);
4003
OutTreeN(fsum, 0, 0);
4006
fprintf(fout,"\nS=%d:%4d/%4d T=%4d ",stage,itree+1,ntreet,ntree+1);
4007
OutTreeN(fout, 0, 0);
4009
tree.lnL=TreeScore(x, space);
4011
if (tree.lnL<treebest.tree.lnL) {
4012
treebest.tree=tree; memcpy (treebest.nodes, nodes, sizetree);
4013
FOR(i,com.np) treebest.x[i]=x[i];
4014
best=itree+1; improve=1;
4016
if (noisy) printf("%6d%2c %+8.6f",
4017
NFunCall,(status?'?':'X'),treestar.tree.lnL-tree.lnL);
4019
fprintf(fsum, "%6d%2c", NFunCall, (status?'?':'X'));
4020
for (i=com.ntime; i<com.np; i++) fprintf(fsum, "%7.3f", x[i]);
4021
fprintf(fsum, " %+8.6f", treestar.tree.lnL-tree.lnL);
4025
fprintf(fout,"\nlnL(ntime:%3d np:%3d):%14.6f\n",
4026
com.ntime, com.np, -tree.lnL);
4027
OutTreeB (fout); FPN(fout);
4028
FOR (i,com.np) fprintf(fout,"%9.5f", x[i]);
4029
FPN(fout); fflush(fout);
4032
son1=treebest.nodes[tree.nnode-1].sons[0];
4033
son2=treebest.nodes[tree.nnode-1].sons[1];
4036
if (com.ns<=4 && !improve && best) error2("strange");
4038
if (com.ns<=4) return (best);
4042
int DecompTree (int inode, int ison1, int ison2)
4044
/* decompose treestar at NODE inode into tree and nodes[]
4047
int sizetree=(2*com.ns-1)*sizeof(struct TREEN);
4048
double bt, fmid=0.001, fclock=0.0001;
4050
tree=treestar.tree; memcpy (nodes, treestar.nodes, sizetree);
4051
for (i=0,bt=0; i<tree.nnode; i++)
4052
if (i!=tree.root) bt+=nodes[i].branch/tree.nbranch;
4054
nodes[tree.nnode].nson=2;
4055
nodes[tree.nnode].sons[0]=son1=nodes[inode].sons[ison1];
4056
nodes[tree.nnode].sons[1]=son2=nodes[inode].sons[ison2];
4057
nodes[tree.nnode].father=inode;
4058
nodes[son1].father=nodes[son2].father=tree.nnode;
4060
nodes[inode].sons[ison1]=tree.nnode;
4061
for (i=ison2; i<nodes[inode].nson; i++)
4062
nodes[inode].sons[i]=nodes[inode].sons[i+1];
4063
nodes[inode].nson--;
4068
nodes[tree.nnode-1].branch=bt*fmid;
4070
nodes[tree.nnode-1].age=nodes[inode].age*(1-fclock);
4079
int MultipleGenes (FILE* fout, FILE*fpair[], double space[])
4081
/* This does the separate analysis of multiple-gene data.
4082
Note that com.pose[] is not correct and so RateAncestor = 0 should be set
4083
in baseml and codeml.
4085
int ig=0, j, ngene0, npatt0, lgene0[NGENE], posG0[NGENE+1];
4086
int nb = ((com.seqtype==1 && !com.cleandata) ? 3 : 1);
4088
if(com.ndata>1) error2("multiple data sets & multiple genes?");
4090
ngene0=com.ngene; npatt0=com.npatt;
4091
FOR (ig, ngene0) lgene0[ig]=com.lgene[ig];
4092
FOR (ig, ngene0+1) posG0[ig]=com.posG[ig];
4096
printf("\nStart from gene (1-%d)? ", com.ngene);
4101
for ( ; ig<ngene0; ig++) {
4104
com.ls=com.lgene[0]= ig==0?lgene0[0]:lgene0[ig]-lgene0[ig-1];
4105
com.npatt = ig==ngene0-1 ? npatt0-posG0[ig] : posG0[ig+1]-posG0[ig];
4106
com.posG[0]=0; com.posG[1]=com.npatt;
4107
FOR (j,com.ns) com.z[j]+=posG0[ig]*nb; com.fpatt+=posG0[ig];
4108
xtoy (com.piG[ig], com.pi, com.ncode);
4110
printf ("\n\nGene %2d ls:%4d npatt:%4d\n",ig+1,com.ls,com.npatt);
4111
fprintf(fout,"\nGene %2d ls:%4d npatt:%4d\n",ig+1,com.ls,com.npatt);
4112
fprintf(frst,"\nGene %2d ls:%4d npatt:%4d\n",ig+1,com.ls,com.npatt);
4113
fprintf(frst1,"%d\t%d\t%d",ig+1,com.ls,com.npatt);
4116
if(com.seqtype==CODONseq) {
4117
DistanceMatNG86(fout,fpair[0],fpair[1],fpair[2],0);
4118
if(com.codonf>=F1x4MG) com.pf3x4 = com.f3x4[ig];
4122
DistanceMatNuc(fout,fpair[0],com.model,com.alpha);
4125
if (com.runmode==0) Forestry(fout);
4127
else if (com.runmode==-2) {
4128
if(com.seqtype==CODONseq) PairwiseCodon(fout,fpair[3],fpair[4],fpair[5],space);
4129
else PairwiseAA(fout,fpair[0]);
4132
else StepwiseAddition(fout, space);
4134
for(j=0; j<com.ns; j++) com.z[j] -= posG0[ig]*nb;
4135
com.fpatt -= posG0[ig];
4140
com.ls = lgene0[ngene0-1];
4141
for(ig=0; ig<ngene0; ig++)
4142
com.lgene[ig] = lgene0[ig];
4143
for(ig=0; ig<ngene0+1; ig++)
4144
com.posG[ig] = posG0[ig];
4148
void printSeqsMgenes (void)
4150
/* separate sites from different partitions (genes) into different files.
4151
called before sequences are coded.
4152
Note that this is called before PatternWeight and so posec or posei is used
4153
and com.pose is not yet allocated.
4154
In case of codons, com.ls is the number of codons.
4159
int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);
4161
puts("Separating sites in genes into different files.\n");
4162
for (ig=0, FPN(F0); ig<com.ngene; ig++) {
4163
for (h=0,lg=0; h<com.ls; h++)
4166
sprintf(seqf, "Gene%d.seq", ig+1);
4167
if((fseq=fopen(seqf,"w"))==NULL) error2("file creation err.");
4168
printf("%d sites in gene %d go to file %s\n", lg, ig+1,seqf);
4170
fprintf (fseq, "%8d%8d\n", com.ns, lg*n31);
4171
for (j=0; j<com.ns; j++) {
4173
/* fprintf(fseq,"*\n>\n%s\n", com.spname[j]); */
4174
fprintf(fseq,"%-20s ", com.spname[j]);
4175
if (n31==1) { /* nucleotide or aa sequences */
4178
fprintf(fseq, "%c", com.z[j][h]);
4180
else { /* codon sequences */
4182
if(com.pose[h]==ig) {
4183
FOR (i,3) fprintf(fseq,"%c", com.z[j][h*3+i]);
4194
void printSeqsMgenes2 (void)
4196
/* This print sites from certain genes into one file.
4197
called before sequences are coded.
4198
In case of codons, com.ls is the number of codons.
4201
char seqf[20]="newseqs";
4203
int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);
4206
char *genenames[44]={"atpa", "atpb", "atpe", "atpf", "atph", "petb", "petg", "psaa",
4207
"psab", "psac", "psaj", "psba", "psbb", "psbc", "psbd", "psbe",
4208
"psbf", "psbh", "psbi", "psbj", "psbk", "psbl", "psbn", "psbt",
4209
"rl14", "rl16", "rl2", "rl20", "rl36", "rpob", "rpoc", "rpod", "rs11",
4210
"rs12", "rs14", "rs18", "rs19", "rs2", "rs3", "rs4", "rs7", "rs8",
4212
int wantgene[44]={0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4213
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4216
for(ig=0,lg=0; ig<com.ngene; ig++) wantgene[ig]=!wantgene[ig];
4219
if(com.ngene!=44) error2("ngene!=44");
4221
printf("%3d",com.pose[h]);
4222
if((h+1)%20==0) FPN(F0); if((h+1)%500==0) getchar();
4224
matIout(F0,com.lgene,1,com.ngene);
4225
matIout(F0,wantgene,1,com.ngene);
4227
for(ig=0,lg=0; ig<com.ngene; ig++)
4228
if(wantgene[ig]) { ngenekept++; lg+=com.lgene[ig]; }
4230
if((fseq=fopen(seqf,"w"))==NULL) error2("file creation err.");
4231
fprintf(fseq,"%4d %4d G\nG %d ", com.ns, lg*n31, ngenekept);
4232
FOR(ig,com.ngene) if(wantgene[ig]) fprintf(fseq," %3d", com.lgene[ig]);
4235
for (j=0; j<com.ns; FPN(fseq),j++) {
4236
fprintf(fseq,"%-20s ", com.spname[j]);
4237
if (n31==1) { /* nucleotide or aa sequences */
4239
if(wantgene[ig=com.pose[h]]) fprintf(fseq,"%c",com.z[j][h]);
4241
else { /* codon sequences */
4243
if (wantgene[ig=com.pose[h]]) {
4244
FOR (i,3) fprintf(fseq,"%c", com.z[j][h*3+i]);
4250
FOR(ig,com.ngene) if(wantgene[ig]) fprintf(fseq," %s", genenames[ig]);
4257
#endif /* ifdef REALSEQUENCE */
4258
#endif /* ifdef TREESEARCH */
4259
#endif /* ifdef NODESTRUCTURE */
4265
void UpPassScoreOnly (int inode);
4266
void UpPassScoreOnlyB (int inode);
4268
static int *Nsteps, *chUB; /* MM */
4269
static char *Kspace, *chU, *NchU;
4270
/* Elements of chU are character states (there are NchU of them). This
4271
representation is used to speed up calculation for large trees.
4272
Bit operations on chUB are performed for binary trees
4275
void UpPassScoreOnly (int inode)
4277
/* => VU, VL, & MM, theorem 2 */
4279
char *K=Kspace, maxK; /* chMark (VV) not used in up pass */
4281
FOR (i,nodes[inode].nson)
4282
if (nodes[nodes[inode].sons[i]].nson>0)
4283
UpPassScoreOnly (nodes[inode].sons[i]);
4285
FOR (i,com.ncode) K[i]=0;
4286
FOR (i,nodes[inode].nson)
4287
for (j=0,ison=nodes[inode].sons[i]; j<NchU[ison]; j++)
4288
K[(int)chU[ison*com.ncode+j]]++;
4289
for (i=0,maxK=0; i<com.ncode; i++) if (K[i]>maxK) maxK=K[i];
4290
for (i=0,NchU[inode]=0; i<com.ncode; i++)
4291
if (K[i]==maxK) chU[inode*com.ncode+NchU[inode]++]=(char)i;
4292
Nsteps[inode]=nodes[inode].nson-maxK;
4293
FOR (i, nodes[inode].nson) Nsteps[inode]+=Nsteps[nodes[inode].sons[i]];
4296
void UpPassScoreOnlyB (int inode)
4298
/* uses bit operation, for binary trees only
4300
int ison1,ison2, i, change=0;
4302
FOR (i,nodes[inode].nson)
4303
if (nodes[nodes[inode].sons[i]].nson>0)
4304
UpPassScoreOnlyB (nodes[inode].sons[i]);
4306
ison1=nodes[inode].sons[0]; ison2=nodes[inode].sons[1];
4307
if ((chUB[inode]=(chUB[ison1] & chUB[ison2]))==0)
4308
{ chUB[inode]=(chUB[ison1] | chUB[ison2]); change=1; }
4309
Nsteps[inode]=change+Nsteps[ison1]+Nsteps[ison2];
4313
double MPScore (double space[])
4315
/* calculates MP score for a given tree using Hartigan's (1973) algorithm.
4316
sizeof(space) = nnode*sizeof(int)+(nnode+2)*ncode*sizeof(char).
4317
Uses Nsteps[nnode], chU[nnode*ncode], NchU[nnode].
4318
if(BitOperation), bit operations are used on binary trees.
4320
int h,i, BitOperation,U[3],change;
4324
BitOperation=(tree.nnode==2*com.ns-1 - (nodes[tree.root].nson==3));
4325
BitOperation=(BitOperation&&com.ncode<32);
4326
if (BitOperation) chUB=Nsteps+tree.nnode;
4328
chU=(char*)(Nsteps+tree.nnode);
4329
NchU=chU+tree.nnode*com.ncode; Kspace=NchU+tree.nnode;
4331
for (h=0,score=0; h<com.npatt; h++) {
4332
FOR (i,tree.nnode) Nsteps[i]=0;
4334
FOR (i,com.ns) chUB[i]=1<<(com.z[i][h]);
4335
UpPassScoreOnlyB (tree.root);
4336
if (nodes[tree.root].nson>2) {
4337
FOR (i,3) U[i]=chUB[nodes[tree.root].sons[i]];
4339
if (U[0]&U[1]&U[2]) change=0;
4340
else if (U[0]&U[1] || U[1]&U[2] || U[0]&U[2]) change=1;
4341
for (i=0,Nsteps[tree.root]=change; i<3; i++)
4342
Nsteps[tree.root]+=Nsteps[nodes[tree.root].sons[i]];
4345
else { /* polytomies, use characters */
4347
{chU[i*com.ncode]=(char)(com.z[i][h]); NchU[i]=(char)1; }
4348
for (i=com.ns; i<tree.nnode; i++) NchU[i]=0;
4349
UpPassScoreOnly (tree.root);
4351
score+=Nsteps[tree.root]*com.fpatt[h];
4353
printf("\nh %3d: ", h+1);
4354
FOR(i,com.ns) printf("%2d ", com.z[i][h]);
4355
printf(" %6d ", Nsteps[tree.root]);
4356
if((h+1)%10==0) exit(1);
4363
double RemoveMPNinfSites (double *nsiteNinf)
4365
/* Removes parsimony-noninformative sites and return the number of changes
4367
Changes .z[], .fpatt[], .npatt, etc.
4369
int h,j, it, npatt0=com.npatt, markb[NCODE], gt2;
4372
for (h=0,com.npatt=0,MPScoreNinf=0,*nsiteNinf=0; h<npatt0; h++) {
4373
FOR (j, com.ncode) markb[j]=0;
4374
FOR (j, com.ns) markb[(int)com.z[j][h]]++;
4375
for (j=0,it=gt2=0; j<com.ncode; j++)
4376
if (markb[j]>=2) { it++; gt2=1; }
4377
if (it<2) { /* non-informative */
4378
*nsiteNinf+=com.fpatt[h];
4379
FOR (j,com.ncode) if(markb[j]==1) MPScoreNinf+=com.fpatt[h];
4380
if (!gt2) MPScoreNinf-=com.fpatt[h];
4383
FOR (j, com.ns) com.z[j][com.npatt]=com.z[j][h];
4384
com.fpatt[com.npatt++]=com.fpatt[h];
4387
return (MPScoreNinf);
4393
#ifdef RECONSTRUCTION
4395
static char *chMark, *chMarkU, *chMarkL; /* VV, VU, VL */
4396
/* chMark, chMarkU, chMarkL (VV, VU, VL) have elements 0 or 1, marking
4397
whether the character state is present in the set */
4398
static char *PATHWay, *NCharaCur, *ICharaCur, *CharaCur;
4399
/* PATHWay, NCharaCur, ICharaCur, CharaCur are for the current
4403
int UpPass (int inode);
4404
int DownPass (int inode);
4406
int UpPass (int inode)
4408
/* => VU, VL, & MM, theorem 2 */
4409
int n=com.ncode, i, j;
4410
char *K=chMark, maxK; /* chMark (VV) not used in up pass */
4412
FOR (i,nodes[inode].nson)
4413
if (nodes[nodes[inode].sons[i]].nson>0) UpPass (nodes[inode].sons[i]);
4416
FOR (i,nodes[inode].nson)
4417
FOR (j, n) if(chMarkU[nodes[inode].sons[i]*n+j]) K[j]++;
4418
for (i=0,maxK=0; i<n; i++) if (K[i]>maxK) maxK=K[i];
4419
for (i=0; i<n; i++) {
4420
if (K[i]==maxK) chMarkU[inode*n+i]=1;
4421
else if (K[i]==maxK-1) chMarkL[inode*n+i]=1;
4423
Nsteps[inode]=nodes[inode].nson-maxK;
4424
FOR (i, nodes[inode].nson) Nsteps[inode]+=Nsteps[nodes[inode].sons[i]];
4428
int DownPass (int inode)
4430
/* VU, VL => VV, theorem 3 */
4431
int n=com.ncode, i, j, ison;
4433
FOR (i,nodes[inode].nson) {
4434
ison=nodes[inode].sons[i];
4435
FOR (j,n) if (chMark[inode*n+j]>chMarkU[ison*n+j]) break;
4437
FOR (j,n) chMark[ison*n+j]=chMark[inode*n+j];
4441
(char)(chMarkU[ison*n+j]||(chMark[inode*n+j]&&chMarkL[ison*n+j]));
4443
FOR (i,nodes[inode].nson)
4444
if (nodes[nodes[inode].sons[i]].nson>0) DownPass (nodes[inode].sons[i]);
4449
int DownStates (int inode)
4451
/* VU, VL => NCharaCur, CharaCur, theorem 4 */
4454
FOR (i,nodes[inode].nson)
4455
if (nodes[inode].sons[i]>=com.ns)
4456
DownStatesOneNode (nodes[inode].sons[i], inode);
4460
int DownStatesOneNode (int ison, int father)
4462
/* States down inode, given father */
4463
char chi=PATHWay[father-com.ns];
4464
int n=com.ncode, j, in;
4466
if((in=ison-com.ns)<0) return (0);
4467
if (chMarkU[ison*n+chi]) {
4468
NCharaCur[in]=1; CharaCur[in*n+0]=chi;
4470
else if (chMarkL[ison*n+chi]) {
4471
for (j=0,NCharaCur[in]=0; j<n; j++)
4472
if (chMarkU[ison*n+j] || j==chi) CharaCur[in*n+NCharaCur[in]++]=(char)j;
4475
for (j=0,NCharaCur[in]=0; j<n; j++)
4476
if (chMarkU[ison*n+j]) CharaCur[in*n+NCharaCur[in]++]=(char)j;
4478
PATHWay[in]=CharaCur[in*n+(ICharaCur[in]=0)];
4479
FOR (j, nodes[ison].nson) if (nodes[ison].sons[j]>=com.ns) break;
4480
if (j<nodes[ison].nson) DownStates (ison);
4485
int InteriorStatesMP (int job, int h, int *nchange, char NChara[NS-1],
4486
char Chara[(NS-1)*NCODE], double space[]);
4488
int InteriorStatesMP (int job, int h, int *nchange, char NChara[NS-1],
4489
char Chara[(NS-1)*NCODE], double space[])
4491
/* sizeof(space) = nnode*sizeof(int)+3*nnode*ncode*sizeof(char)
4492
job: 0=# of changes; 1:equivocal states
4494
int n=com.ncode, i,j;
4496
Nsteps=(int*)space; chMark=(char*)(Nsteps+tree.nnode);
4497
chMarkU=chMark+tree.nnode*n; chMarkL=chMarkU+tree.nnode*n;
4498
FOR (i,tree.nnode) Nsteps[i]=0;
4499
FOR (i,3*n*tree.nnode) chMark[i]=0;
4500
FOR (i,com.ns) chMark[i*n+com.z[i][h]]=chMarkU[i*n+com.z[i][h]]=1;
4502
*nchange=Nsteps[tree.root];
4503
if (job==0) return (0);
4504
FOR (i,n) chMark[tree.root*n+i]=chMarkU[tree.root*n+i];
4505
DownPass (tree.root);
4506
FOR (i,tree.nnode-com.ns)
4507
for (j=0,NChara[i]=0; j<n; j++)
4508
if (chMark[(i+com.ns)*n+j]) Chara[i*n+NChara[i]++]=(char)j;
4513
int PathwayMP (FILE *fout, double space[])
4515
/* Hartigan, JA. 1973. Minimum mutation fits to a given tree.
4516
Biometrics, 29:53-65.
4518
char *pch=(com.seqtype==0?BASEs:AAs), visit[NS-1];
4519
int n=com.ncode, nid=tree.nbranch-com.ns+1, it, i,j,k, h, npath;
4520
int nchange, nchange0;
4521
char nodeb[NNODE], Equivoc[NS-1];
4523
PATHWay=(char*)malloc(nid*(n+3)*sizeof(char));
4524
NCharaCur=PATHWay+nid; ICharaCur=NCharaCur+nid; CharaCur=ICharaCur+nid;
4526
for (j=0,visit[i=0]=(char)(tree.root-com.ns); j<tree.nbranch; j++)
4527
if (tree.branches[j][1]>=com.ns)
4528
visit[++i]=(char)(tree.branches[j][1]-com.ns);
4530
printf ("\nOrder in nodes: ");
4531
FOR (j, nid) printf ("%4d", visit[j]+1+com.ns); FPN(F0);
4533
for (h=0; h<com.npatt; h++) {
4534
fprintf (fout, "\n%4d%6.0f ", h+1, com.fpatt[h]);
4535
FOR (j, com.ns) fprintf (fout, "%c", pch[(int)com.z[j][h]]);
4536
fprintf (fout, ": ");
4538
FOR (j,com.ns) nodeb[j]=(char)(com.z[j][h]);
4540
InteriorStatesMP (1, h, &nchange, NCharaCur, CharaCur, space);
4541
ICharaCur[j=tree.root-com.ns]=0; PATHWay[j]=CharaCur[j*n+0];
4542
FOR (j,nid) Equivoc[j]=(char)(NCharaCur[j]>1);
4543
DownStates (tree.root);
4546
for (j=0,k=visit[nid-1]; j<NCharaCur[k]; j++) {
4547
PATHWay[k]=CharaCur[k*n+j]; npath++;
4548
FOR (i, nid) fprintf (fout, "%c", pch[(int)PATHWay[i]]);
4549
fprintf (fout, " ");
4551
FOR (i,nid) nodeb[i+com.ns]=PATHWay[i];
4552
for (i=0,nchange0=0; i<tree.nbranch; i++)
4553
nchange0+=(nodeb[tree.branches[i][0]]!=nodeb[tree.branches[i][1]]);
4554
if (nchange0!=nchange)
4555
{ puts("\a\nerr:PathwayMP"); fprintf(fout,".%d. ", nchange0);}
4558
for (j=nid-2; j>=0; j--) {
4559
if(Equivoc[k=visit[j]] == 0) continue;
4560
if (ICharaCur[k]+1<NCharaCur[k]) {
4561
PATHWay[k] = CharaCur[k*n + (++ICharaCur[k])];
4562
DownStates (k+com.ns);
4565
else { /* if (next equivocal node is not ancestor) update node k */
4566
for (i=j-1; i>=0; i--) if (Equivoc[(int)visit[i]]) break;
4568
for (it=k+com.ns,i=visit[i]+com.ns; ; it=nodes[it].father)
4569
if (it==tree.root || nodes[it].father==i) break;
4571
DownStatesOneNode(k+com.ns, nodes[k+com.ns].father);
4577
fprintf (fout, " |%4d (%d)", npath, nchange);
4587
#if(BASEML || CODEML)
4590
int BootstrapSeq (char* seqf)
4592
/* This is called from within ReadSeq(), right after the sequences are read
4593
and before the data are coded.
4594
jackknife if(lsb<com.ls && com.ngene==1).
4595
gmark[start+19] marks the position of the 19th site in that gene.
4597
int iboot,nboot=com.bootstrap, h,is,ig,lg[NGENE]={0},j, start;
4598
int lsb=com.ls, n31=1,gap=10, gpos[NGENE];
4599
int *sites=(int*)malloc(com.ls*sizeof(int)), *gmark=NULL;
4600
FILE *fseq=(FILE*)gfopen(seqf,"w");
4601
enum {PAML=0, PAUP};
4602
char *dt=(com.seqtype==AAseq?"protein":"dna");
4603
char *paupstart="paupstart",*paupblock="paupblock",*paupend="paupend";
4604
int format=0; /* 0: paml-phylip; 1:paup-nexus */
4606
if(com.readpattern) error2("work on bootstrapping pattern data.");
4608
printf("\nGenerating bootstrap samples in file %s\n", seqf);
4610
printf("%s, %s, & %s will be appended if existent.\n",
4611
paupstart,paupblock,paupend);
4612
appendfile(fseq,paupstart);
4615
if(com.seqtype==CODONseq||com.seqtype==CODON2AAseq) { n31=3; gap=1; }
4616
if(sites==NULL) error2("oom in BootstrapSeq");
4618
if(lsb<com.ls) error2("jackknife when #gene>1");
4619
if((gmark=(int*)malloc(com.ls*sizeof(int)))==NULL)
4620
error2("oom in BootstrapSeq");
4622
for(ig=0; ig<com.ngene; ig++) com.lgene[ig] = gpos[ig] = 0;
4623
for(h=0; h<com.ls; h++) com.lgene[com.pose[h]]++;
4624
for(j=0; j<com.ngene; j++) lg[j] = com.lgene[j];
4625
for(j=1; j<com.ngene; j++) com.lgene[j] += com.lgene[j-1];
4627
if(noisy && com.ngene>1) {
4628
printf("Bootstrap uses stratefied sampling for %d partitions.",com.ngene);
4629
printf("\nnumber of sites in each partition: ");
4630
FOR(ig,com.ngene) printf(" %4d", lg[ig]);
4634
for(h=0; h<com.ls; h++) { /* create gmark[] */
4636
start = (ig==0 ? 0 : com.lgene[ig-1]);
4637
gmark[start + gpos[ig]++] = h;
4641
for (iboot=0; iboot<nboot; iboot++,FPN(fseq)) {
4643
for(h=0; h<lsb; h++) sites[h] = (int)(rndu()*com.ls);
4645
for(ig=0; ig<com.ngene; ig++) {
4646
start = (ig==0 ? 0 : com.lgene[ig-1]);
4647
for(h=0; h<lg[ig]; h++)
4648
sites[start+h] = gmark[start+(int)(rndu()*lg[ig])];
4652
/* print out the bootstrap sample */
4654
fprintf(fseq,"\n\n[Replicate # %d]\n", iboot+1);
4655
fprintf(fseq,"\nbegin data;\n");
4656
fprintf(fseq," dimensions ntax=%d nchar=%d;\n", com.ns, lsb*n31);
4657
fprintf(fseq," format datatype=%s missing=? gap=-;\n matrix\n",dt);
4659
for(is=0;is<com.ns;is++,FPN(fseq)) {
4660
fprintf(fseq,"%-20s ", com.spname[is]);
4661
for(h=0; h<lsb; h++) {
4662
for(j=0; j<n31; j++) fprintf(fseq,"%c", com.z[is][sites[h]*n31+j]);
4663
if((h+1)%gap==0) fprintf(fseq," ");
4667
fprintf(fseq, " ;\nend;");
4668
/* site partitions */
4670
fprintf(fseq, "\n\nbegin paup;\n");
4671
for(ig=0; ig<com.ngene; ig++)
4672
fprintf(fseq, " charset partition%-2d = %-4d - %-4d;\n",
4673
ig+1, (ig==0?1:com.lgene[ig-1]+1),com.lgene[ig]);
4674
fprintf(fseq, "end;\n");
4676
appendfile(fseq, paupblock);
4680
fprintf(fseq,"%6d %6d\n", com.ns, lsb*n31);
4682
fprintf(fseq,"%6d %6d G\nG %d ", com.ns, lsb*n31, com.ngene);
4683
for(ig=0; ig<com.ngene; ig++)
4684
fprintf(fseq," %4d", lg[ig]);
4685
fprintf(fseq,"\n\n");
4687
for(is=0;is<com.ns;is++,FPN(fseq)) {
4688
fprintf(fseq,"%-20s ", com.spname[is]);
4689
for(h=0; h<lsb; h++) {
4690
for(j=0; h<n31; h++)
4691
fprintf(fseq,"%c", com.z[is][sites[h]*n31+j]);
4692
if((h+1)%gap==0) fprintf(fseq," ");
4697
if(noisy && (iboot+1)%10==0) printf("\rdid sample #%d", iboot+1);
4699
free(sites); if(com.ngene>1) free(gmark);
4705
int rell (FILE*flnf, FILE*fout, int ntree)
4707
/* This implements three methods for tree topology comparison. The first
4708
tests the log likelihood difference using a normal approximation
4709
(Kishino and Hasegawa 1989). The second does approximate bootstrap sampling
4710
(the RELL method, Kishino and Hasegawa 1989, 1993). The third is a
4711
modification of the K-H test with a correction for multiple comparison
4712
(Shimodaira and Hasegawa 1999) .
4713
The routine reads input from the file lnf.
4715
fpattB[npatt] stores the counts of site patterns in the bootstrap sample,
4716
with sitelist[ls] listing sites by gene, for stratefied sampling.
4718
com.space[ntree*(npatt+nr+5)]:
4719
lnf[ntree*npatt] lnL0[ntree] lnL[ntree*nr] pRELL[ntree] pSH[ntree] vdl[ntree]
4722
char *line, timestr[64];
4723
int nr=(com.ls<100000?10000:(com.ls<10000?5000:500));
4724
int lline=16000, ntree0,ns0=com.ns, ls0,npatt0;
4725
int itree, h,ir,j,k, ig, mltree, nbtree, *btrees, status=0;
4726
int *sitelist, *fpattB, *lgeneB, *psitelist;
4727
double *lnf, *lnL0, *lnL, *pRELL, *lnLmSH, *pSH, *vdl, y, mdl, small=1e-5;
4731
puts( "\nTree comparisons (Kishino & Hasegawa 1989; Shimodaira & Hasegawa 1999)");
4732
fputs("\nTree comparisons (Kishino & Hasegawa 1989; Shimodaira & Hasegawa 1999)\n",fout);
4733
fprintf(fout,"Number of replicates: %d\n", nr);
4735
fscanf(flnf,"%d%d%d", &ntree0, &ls0, & npatt0);
4736
if(ntree0!=-1 && ntree0!=ntree) error2("rell: input data file strange. Check.");
4737
if (ls0!=com.ls || npatt0!=com.npatt)
4738
error2("rell: input data file incorrect.");
4739
s = ntree*(com.npatt+nr+5)*sizeof(double);
4740
if(com.sspace < s) {
4741
if(noisy) printf("resetting space to %lu bytes in rell.\n",s);
4743
if((com.space=(double*)realloc(com.space,com.sspace))==NULL)
4744
error2("oom space");
4746
lnf=com.space; lnL0=lnf+ntree*com.npatt; lnL=lnL0+ntree; pRELL=lnL+ntree*nr;
4747
pSH=pRELL+ntree; vdl=pSH+ntree; btrees=(int*)(vdl+ntree);
4748
fpattB=(int*)malloc((com.npatt+com.ls+com.ngene)*sizeof(int));
4749
if(fpattB==NULL) error2("oom fpattB in rell.");
4750
sitelist=fpattB+com.npatt; lgeneB=sitelist+com.ls;
4752
lline = (com.seqtype==1 ? ns0*8 : ns0) + 100;
4753
lline = max2(16000, lline);
4754
if((line=(char*)malloc((lline+1)*sizeof(char)))==NULL) error2("oom rell");
4756
/* read lnf from file flnf, calculates lnL0[] & find ML tree */
4757
for(itree=0,mltree=0; itree<ntree; itree++) {
4758
printf("\r\tReading lnf for tree # %d", itree+1);
4759
fscanf(flnf, "%d", &j);
4761
{ printf("\nerr: lnf, reading tree %d.",itree+1); return(-1); }
4762
for(h=0,lnL0[itree]=0; h<com.npatt; h++) {
4763
fscanf (flnf, "%d%d%lf", &j, &k, &y);
4765
{ printf("\nlnf, patt %d.",h+1); return(-1); }
4766
fgets(line,lline,flnf);
4767
lnL0[itree]+=com.fpatt[h]*(lnf[itree*com.npatt+h]=y);
4769
if(itree && lnL0[itree]>lnL0[mltree]) mltree=itree;
4771
printf(", done.\n");
4774
/* calculates SEs (vdl) by sitewise comparison */
4777
printf("\r\tCalculating SEs by sitewise comparison");
4779
if(itree==mltree) { vdl[itree]=0; continue; }
4780
mdl=(lnL0[itree]-lnL0[mltree])/com.ls;
4781
for(h=0,vdl[itree]=0; h<com.npatt; h++) {
4782
y=lnf[itree*com.npatt+h]-lnf[mltree*com.npatt+h];
4783
vdl[itree]+=com.fpatt[h]*(y-mdl)*(y-mdl);
4785
vdl[itree]=sqrt(vdl[itree]);
4787
printf(", %s\n", printtime(timestr));
4789
/* bootstrap resampling */
4790
for(ig=0; ig<com.ngene; ig++)
4791
lgeneB[ig]=(ig?com.lgene[ig]-com.lgene[ig-1]:com.lgene[ig]);
4792
for(h=0,k=0;h<com.npatt;h++)
4793
FOR(j,(int)com.fpatt[h]) sitelist[k++]=h;
4795
zero(pRELL,ntree); zero(pSH,ntree); zero(lnL,ntree*nr);
4796
for(ir=0; ir<nr; ir++) {
4797
for(h=0; h<com.npatt; h++) fpattB[h]=0;
4798
for(ig=0,psitelist=sitelist; ig<com.ngene; psitelist+=lgeneB[ig++]) {
4799
for(k=0; k<lgeneB[ig]; k++) {
4800
j=(int)(lgeneB[ig]*rndu());
4805
for(h=0; h<com.npatt; h++) {
4807
for(itree=0; itree<ntree; itree++)
4808
lnL[itree*nr+ir] += fpattB[h]*lnf[itree*com.npatt+h];
4811
/* y is the lnL for the best tree from replicate ir. */
4812
for(j=1,nbtree=1,btrees[0]=0,y=lnL[ir]; j<ntree; j++) {
4813
if(fabs(lnL[j*nr+ir]-y)<small)
4815
else if (lnL[j*nr+ir]>y)
4816
{ nbtree=1; btrees[0]=j; y=lnL[j*nr+ir]; }
4819
for(j=0; j<nbtree; j++)
4820
pRELL[btrees[j]]+=1./(nr*nbtree);
4821
if(nr>100 && (ir+1)%(nr/100)==0)
4822
printf("\r\tRELL Bootstrapping.. replicate: %6d / %d %s",ir+1,nr, printtime(timestr));
4827
if(fabs(1-sum(pRELL,ntree))>1e-6) error2("sum pRELL != 1.");
4829
/* Shimodaira & Hasegawa correction (1999), working on lnL[ntree*nr] */
4830
printf("\nnow doing S-H test");
4831
if((lnLmSH=(double*)malloc(nr*sizeof(double))) == NULL) error2("oom in rell");
4832
for(j=0; j<ntree; j++) /* step 3: centering */
4833
for(ir=0,y=sum(lnL+j*nr,nr)/nr; ir<nr; ir++) lnL[j*nr+ir] -= y;
4834
for(ir=0; ir<nr; ir++) {
4835
for(j=1,lnLmSH[ir]=lnL[ir]; j<ntree; j++)
4836
if(lnL[j*nr+ir]>lnLmSH[ir]) lnLmSH[ir] = lnL[j*nr+ir];
4838
for(itree=0; itree<ntree; itree++) { /* steps 4 & 5 */
4839
for(ir=0; ir<nr; ir++)
4840
if(lnLmSH[ir]-lnL[itree*nr+ir] > lnL0[mltree]-lnL0[itree])
4841
pSH[itree] += 1./nr;
4844
fprintf(fout,"\n%6s %12s %9s %9s%8s%10s%9s\n\n",
4845
"tree","li","Dli"," +- SE","pKH","pSH","pRELL");
4847
mdl=lnL0[j]-lnL0[mltree];
4848
if(j==mltree || fabs(vdl[j])<1e-6) { y=-1; pSH[j]=-1; status=-1; }
4849
else y=1-CDFNormal(-mdl/vdl[j]);
4850
fprintf(fout,"%6d%c%12.3f %9.3f %9.3f%8.3f%10.3f%9.3f\n",
4851
j+1,(j==mltree?'*':' '),lnL0[j],mdl,vdl[j],y,pSH[j],pRELL[j]);
4854
fprintf(frst1,"%3d %12.6f",mltree+1, lnL0[mltree]);
4855
for(j=0;j<ntree;j++) fprintf(frst1," %5.3f",pRELL[j]);
4857
for(j=0;j<ntree;j++) if(j!=mltree) fprintf(frst1,"%9.6f",pSH[j]);
4860
fputs("\npKH: P value for KH normal test (Kishino & Hasegawa 1989)\n",fout);
4861
fputs("pRELL: RELL bootstrap proportions (Kishino & Hasegawa 1989)\n",fout);
4862
fputs("pSH: P value with multiple-comparison correction (MC in table 1 of Shimodaira & Hasegawa 1999)\n",fout);
4863
if(status) fputs("(-1 for P values means N/A)\n",fout);
4876
#ifdef RECONSTRUCTION
4879
void ListAncestSeq(FILE *fout, char *zanc);
4881
void ListAncestSeq(FILE *fout, char *zanc)
4883
/* zanc[nintern*com.npatt] holds ancestral sequences.
4884
Extant sequences are coded if cleandata.
4886
int wname=15, j,h, n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);
4887
int lst=(com.readpattern?com.npatt:com.ls);
4889
fputs("\n\n\nList of extant and reconstructed sequences\n\n",fout);
4890
if(!com.readpattern) fprintf(fout, "%6d %6d\n\n", tree.nnode, lst*n31);
4891
else fprintf(fout, "%6d %6d P\n\n", tree.nnode, lst*n31);
4892
for(j=0;j<com.ns;j++,FPN(fout)) {
4893
fprintf(fout,"%-*s ", wname,com.spname[j]);
4894
print1seq(fout, com.z[j], lst, com.pose);
4896
for(j=0;j<tree.nnode-com.ns;j++,FPN(fout)) {
4897
fprintf(fout,"node #%-*d ", wname-5,com.ns+j+1);
4898
print1seq(fout, zanc+j*com.npatt, lst, com.pose);
4900
if(com.readpattern) {
4901
for(h=0,FPN(fout); h<com.npatt; h++) {
4902
fprintf(fout," %4.0f", com.fpatt[h]);
4903
if((h+1)%15==0) FPN(fout);
4905
fprintf(fout,"\n\n");
4909
int ProbSitePattern(double x[], double *lnL, double fhsiteAnc[], double ScaleC[]);
4910
int AncestralMarginal(FILE *fout, double x[], double fhsiteAnc[], double Sir[]);
4911
int AncestralJointPPSG2000(FILE *fout, double x[]);
4914
int ProbSitePattern (double x[], double *lnL, double fhsiteAnc[], double ScaleC[])
4916
/* This calculates probabilities for observing site patterns fhsite[].
4917
The following notes are for ncatG>1 and method = 0.
4918
The routine calculates the scale factor common to all site classes (ir),
4919
that is, the greatest of the scale factors among the ir classes.
4920
The common scale factors will be used in scaling nodes[].conP for all site
4921
classes for all nodes in PostProbNode(). Small conP for some site classes
4922
will be essentially set to 0, which is fine.
4927
Ziheng Yang, 7 Sept, 2001
4932
if(com.ncatG>1 && com.method==1) error2("don't need this?");
4933
if (SetParameters(x)) puts ("par err.");
4934
for(h=0; h<com.npatt; h++)
4937
for (ig=0,*lnL=0; ig<com.ngene; ig++) {
4938
if(com.Mgene>1) SetPGene(ig, 1, 1, 0, x);
4939
ConditionalPNode (tree.root, ig, x);
4940
for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
4941
for (i=0; i<com.ncode; i++)
4942
fhsiteAnc[h] += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];
4943
*lnL -= log(fhsiteAnc[h])*com.fpatt[h];
4945
for(k=0; k<com.NnodeScale; k++)
4946
*lnL -= com.nodeScaleF[k*com.npatt+h]*com.fpatt[h];
4951
for (ig=0; ig<com.ngene; ig++) {
4952
if(com.Mgene>1 || com.nalpha>1)
4953
SetPGene(ig, com.Mgene>1, com.Mgene>1, com.nalpha>1, x);
4954
for (ir=0; ir<com.ncatG; ir++) {
4956
if(com.seqtype==1 && com.NSsites /* && com.model */) IClass=ir;
4958
SetPSiteClass(ir, x);
4959
ConditionalPNode (tree.root, ig, x);
4961
for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
4962
for (i=0,fh=0; i<com.ncode; i++)
4963
fh += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];
4965
if(com.NnodeScale) {
4966
for(k=0,S=0; k<com.NnodeScale; k++) S += com.nodeScaleF[k*com.npatt+h];
4968
if(ir==0) ScaleC[h]=S;
4969
else if(S<=ScaleC[h]) y=exp(S-ScaleC[h]);
4970
else /* change of scale factor */
4971
{ fhsiteAnc[h] *= exp(ScaleC[h]-S); ScaleC[h]=S; }
4973
fhsiteAnc[h] += com.freqK[ir]*fh*y;
4977
for(h=0, *lnL=0; h<com.npatt; h++)
4978
*lnL -= log(fhsiteAnc[h])*com.fpatt[h];
4980
for(h=0; h<com.npatt; h++)
4981
*lnL -= ScaleC[h]*com.fpatt[h];
4983
/* if(noisy) printf("\nlnL = %12.6f from ProbSitePattern.\n", - *lnL); */
4989
int updateconP(double x[], int inode);
4991
int PostProbNode (int inode, double x[], double fhsiteAnc[], double ScaleC[],
4992
double *lnL, double pChar1node[], char za[], double pnode[])
4994
/* This calculates the full posterior distribution for node inode at each site.
4995
Below are special comments on gamma models and method = 0.
4997
Marginal reconstruction under gamma models, with complications arising from
4998
scaling on large trees (com.NnodeScale) and the use of two iteration algorithms
5002
The algorithm is different depending on method, which makes the code clumsy.
5004
gamma method=0 or 2 (simultaneous updating):
5005
nodes[].conP overlap and get destroyed for different site classes (ir)
5006
The same for scale factors com.nodeScaleF.
5007
fhsite[npatt] and common scale factors ScaleC[npatt] are calculated for all
5008
nodes before this routine is called. The common scale factors are then
5009
used to adjust nodes[].conP before they are summed across ir classes.
5011
gamma method=1 (one branch at a time):
5012
nodes[].conP (and com.nodeScaleF if node scaling is on) are separately
5013
allocated for different site classes (ir), so that all info needed is
5014
available. Use of updateconP() saves computation on large trees.
5015
Scale factor Sir[] is of size ncatG and reused for each h.
5017
int n=com.ncode, i,k,h, ir,it=-1,best, ig;
5018
double fh, y,pbest, *Sir=ScaleC, S;
5021
zero(pChar1node,com.npatt*n);
5023
/* nodes[].conP are reused for different ir, with or without node scaling */
5024
if (com.ncatG>1 && com.method!=1) {
5026
for (ig=0; ig<com.ngene; ig++) {
5027
if(com.Mgene>1 || com.nalpha>1)
5028
SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,x);
5029
for (ir=0; ir<com.ncatG; ir++) {
5031
if(com.seqtype==1 && com.NSsites) IClass=ir;
5033
SetPSiteClass(ir, x);
5034
ConditionalPNode (tree.root, ig, x);
5036
for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
5037
if(!com.NnodeScale) S=1;
5039
for(k=0,S=0; k<com.NnodeScale; k++)
5040
S += com.nodeScaleF[k*com.npatt+h];
5043
for (i=0,fh=0; i<n; i++) {
5044
y = com.freqK[ir]*com.pi[i]*nodes[tree.root].conP[h*n+i] * S;
5046
pChar1node[h*n+i] += y ;
5051
for (h=0; h<com.npatt; h++) {
5052
for(i=0,y=0;i<n;i++) y += (pChar1node[h*n+i]/=fhsiteAnc[h]);
5054
error2("PostProbNode: sum!=1");
5055
for (i=0,best=-1,pbest=-1; i<n; i++)
5056
if (pChar1node[h*n+i]>pbest) {
5058
pbest=pChar1node[h*n+i];
5060
za[(inode-com.ns)*com.npatt+h] = (char)best;
5061
pnode[(inode-com.ns)*com.npatt+h] = pbest;
5062
*lnL -= log(fhsiteAnc[h])*com.fpatt[h];
5063
if(com.NnodeScale) *lnL -= ScaleC[h]*com.fpatt[h];
5066
else { /* all other cases: (alpha==0 || method==1) */
5067
for(i=0; i<tree.nnode; i++) com.oldconP[i] = 1;
5069
updateconP(x,inode);
5070
if (com.alpha==0 && com.ncatG<=1) { /* (alpha==0) (ngene>1 OK) */
5071
for (ig=0; ig<com.ngene; ig++) {
5072
if(com.Mgene==2 || com.Mgene==4)
5073
xtoy(com.piG[ig], com.pi, n);
5074
for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
5075
for (i=0,fh=0,pbest=0,best=-1; i<n; i++) {
5076
y = com.pi[i]*nodes[tree.root].conP[h*n+i];
5079
{ pbest=y; best=i; }
5080
pChar1node[h*n+i] = y;
5082
za[(inode-com.ns)*com.npatt+h] = (char)best;
5083
pnode[(inode-com.ns)*com.npatt+h] = (pbest/=fh);
5085
pChar1node[h*n+i] /= fh;
5086
*lnL -= log(fh)*(double)com.fpatt[h];
5087
for(i=0; i<com.NnodeScale; i++)
5088
*lnL -= com.nodeScaleF[i*com.npatt+h]*com.fpatt[h];
5092
else { /* (ncatG>1 && method = 1) This should work for NSsites? */
5093
for (ig=0; ig<com.ngene; ig++) {
5094
if(com.Mgene==2 || com.Mgene==4)
5095
xtoy(com.piG[ig], com.pi, n);
5096
for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
5098
for(ir=0,it=0; ir<com.ncatG; ir++) { /* Sir[it] is the biggest */
5099
for(k=0,Sir[ir]=0; k<com.NnodeScale; k++)
5100
Sir[ir] += com.nodeScaleF[ir*com.NnodeScale*com.npatt + k*com.npatt+h];
5101
if(Sir[ir]>Sir[it]) it = ir;
5103
for (i=0,fh=0; i<n; i++) {
5104
for(ir=0; ir<com.ncatG; ir++) {
5106
y = nodes[tree.root].conP[ir*(tree.nnode-com.ns)*com.npatt*n+h*n+i];
5108
y = nodes[tree.root].conP[h*n+i]; /* wrong right now */
5109
y *= com.pi[i]*com.freqK[ir];
5110
if(com.NnodeScale) y *= exp(Sir[ir]-Sir[it]);
5112
pChar1node[h*n+i] += y;
5116
for (i=0,best=0; i<n; i++) {
5117
pChar1node[h*n+i] /= fh;
5118
if(i && pChar1node[h*n+best]<pChar1node[h*n+i])
5121
za[(inode-com.ns)*com.npatt+h] = (char)best;
5122
pnode[(inode-com.ns)*com.npatt+h] = pChar1node[h*n+best];
5123
*lnL -= log(fh)*com.fpatt[h];
5124
if(com.NnodeScale) *lnL -= Sir[it]*com.fpatt[h];
5133
void getCodonNode1Site(char codon[], char zanc[], int inode, int site);
5135
int AncestralMarginal (FILE *fout, double x[], double fhsiteAnc[], double Sir[])
5137
/* Ancestral reconstruction for each interior node. This works under both
5138
the one rate and gamma rates models.
5139
pnode[npatt*nid] stores the prob for the best chara at a node and site.
5140
The best character is kept in za[], coded as 0,...,n-1.
5141
The data may be coded (com.cleandata==1) or not (com.cleandata==0).
5142
Call ProbSitePatt() before running this routine.
5143
pMAPnode[NS-1], pMAPnodeA[] stores the MAP probabilities (accuracy)
5144
for a site and for the entire sequence, respectively.
5146
The routine PostProbNode calculates pChar1node[npatt*ncode], which stores
5147
prob for each char at each pattern at each given node inode. The rest of
5148
the routine is to output the results in different ways.
5150
Deals with node scaling to avoid underflows. See above
5151
(Z. Yang, 2 Sept 2001)
5153
char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs)), *zanc;
5154
char str[4]="",codon[2][4]={" "," "}, aa[4]="";
5155
char *sitepatt=(com.readpattern?"pattern":"site");
5156
int n=com.ncode, inode, ic=0,b[3],i,j,k1=-1,k2=-1,c1,c2,k3, lsc=com.ls;
5157
int lst=(com.readpattern?com.npatt:com.ls);
5158
int h,hp,ig, best, oldroot=tree.root;
5159
int nid=tree.nnode-com.ns, nchange;
5160
double lnL=0, fh, y, pbest, *pChar1node, *pnode, p1=-1,p2=-1;
5161
double pMAPnode[NS-1], pMAPnodeA[NS-1], smallp=0.001;
5163
char coding=0, *bestAA=NULL;
5164
double pAA[21], *pbestAA=NULL, ns,na, nst,nat,S,N;
5165
/* bestAA[nid*npatt], pbestAA[nid*npatt]:
5166
To reconstruct aa seqs using codon or nucleotide seqs, universal code */
5168
if(noisy) puts("Marginal reconstruction.");
5170
fprintf (fout,"\n(1) Marginal reconstruction of ancestral sequences\n");
5171
fprintf (fout,"(eqn. 4 in Yang et al. 1995 Genetics 141:1641-1650).\n");
5172
pChar1node = (double*)malloc(com.npatt*n*sizeof(double));
5173
pnode = (double*)malloc((nid*com.npatt+1)*(sizeof(double)+sizeof(char)));
5174
if (pnode==NULL||pChar1node==NULL)
5175
error2("oom pnode");
5176
zanc = (char*)(pnode+nid*com.npatt);
5179
if(com.seqtype==0 && com.ls%3==0 && com.coding) { coding=1; lsc=com.ls/3; }
5181
if(com.seqtype==1) { coding=1; lsc=com.npatt; }
5183
if((pbestAA=(double*)malloc(nid*lsc*2*sizeof(double)))==NULL)
5184
error2("oom pbestAA");
5185
bestAA = (char*)(pbestAA+nid*lsc);
5188
if(SetParameters(x)) puts("par err.");
5191
fprintf(fout,"\nProb distribs at nodes, those with p < %.3f not listed\n", smallp);
5193
/* This loop reroots the tree at inode & reconstructs sequence at inode */
5194
for (inode=com.ns; inode<tree.nnode; inode++) {
5196
PostProbNode (inode, x, fhsiteAnc, Sir, &lnL, pChar1node, zanc, pnode);
5197
if(noisy) printf ("\tNode %3d: lnL = %12.6f\n", inode+1, -lnL);
5199
/* print Prob distribution at inode if com.verbose>1 */
5200
if (com.verbose>1) {
5201
fprintf(fout,"\nProb distribution at node %d, by %s\n", inode+1, sitepatt);
5202
fprintf(fout,"\n%7s Freq Data\n\n", sitepatt);
5203
for(h=0;h<lst;h++,FPN(fout)) {
5204
hp = (!com.readpattern ? com.pose[h] : h);
5205
fprintf (fout,"%7d%7.0f ", h+1, com.fpatt[hp]);
5206
print1site(fout, hp);
5208
for(j=0; j<n; j++) {
5209
if (com.seqtype!=CODONseq) {
5214
strcpy(str, CODONs[j]);
5215
fprintf(fout,"%s(%5.3f) ", str, pChar1node[hp*n+j]);
5218
} /* if (verbose) */
5221
/* find the best amino acid for coding seqs */
5223
if(com.seqtype==CODONseq)
5224
for(h=0; h<com.npatt; h++) {
5225
for(j=0; j<20; j++) pAA[j]=0;
5226
for(j=0; j<n; j++) {
5227
i = GeneticCode[com.icode][FROM61[j]];
5228
pAA[i] += pChar1node[h*n+j];
5230
/* matout(F0,pAA,1,20); */
5231
for(j=0,best=0,pbest=0; j<20; j++)
5232
if(pAA[j]>pbest) { pbest=pAA[j]; best=j; }
5233
bestAA[(inode-com.ns)*com.npatt+h] = (char)best;
5234
pbestAA[(inode-com.ns)*com.npatt+h] = pbest;
5237
if(com.seqtype==0 && coding) { /* coding seqs analyzed by baseml */
5238
for(h=0; h<lsc; h++) { /* h-th codon */
5239
/* sums up probs for the 20 AAs for each node. Stop codons are
5240
ignored, and so those probs are approxiamte. */
5241
for(j=0,y=0; j<20; j++) pAA[j]=0;
5242
for(k1=0; k1<4; k1++) for(k2=0; k2<4; k2++) for(k3=0; k3<4; k3++) {
5244
b[0] = com.pose[h*3+0]*n+k1;
5245
b[1] = com.pose[h*3+1]*n+k2;
5246
b[2] = com.pose[h*3+2]*n+k3;
5247
fh = pChar1node[b[0]]*pChar1node[b[1]]*pChar1node[b[2]];
5248
if((ic=GeneticCode[com.icode][ic])==-1)
5253
if(fabs(1-y-sum(pAA,20))>1e-6) error2("AncestralMarginal strange?");
5255
for(j=0,best=0,pbest=0; j<20; j++)
5256
if(pAA[j]>pbest) { pbest=pAA[j]; best=j; }
5258
bestAA[(inode-com.ns)*com.ls/3+h] = (char)best;
5259
pbestAA[(inode-com.ns)*com.ls/3+h] = pbest/(1-y);
5262
} /* for (inode), This closes the big loop */
5264
for(i=0; i<tree.nnode; i++)
5266
ReRootTree(oldroot);
5268
if(com.seqtype==0 && coding && !com.readpattern) { /* coding seqs analyzed by baseml */
5269
fputs("\nBest amino acids reconstructed from nucleotide model.\n",fout);
5270
fputs("Prob at each node listed by amino acid (codon) site\n",fout);
5271
fputs("(Please ignore if not relevant)\n\n",fout);
5272
for(h=0;h<com.ls/3;h++,FPN(fout)) {
5273
fprintf(fout,"%4d ", h+1);
5274
for(j=0; j<com.ns; j++) {
5275
getCodonNode1Site(codon[0], NULL, j, h);
5276
Codon2AA(codon[0], aa, com.icode, &i);
5277
fprintf(fout," %s(%c)",codon[0],AAs[i]);
5280
for (j=0; j<tree.nnode-com.ns; j++) {
5281
fprintf(fout," %1c (%5.3f)", AAs[bestAA[j*com.ls/3+h]], pbestAA[j*com.ls/3+h]);
5286
/* calculate accuracy measures */
5287
zero(pMAPnode,nid); fillxc(pMAPnodeA, 1., nid);
5288
for (inode=0; inode<tree.nnode-com.ns; inode++) {
5289
for(h=0; h<com.npatt; h++) {
5290
pMAPnode[inode] += com.fpatt[h]*pnode[inode*com.npatt+h]/com.ls;
5291
pMAPnodeA[inode] *= pow(pnode[inode*com.npatt+h], com.fpatt[h]);
5295
fprintf(fout,"\nProb of best state at each node, listed by %s", sitepatt);
5296
if (com.ngene>1) fprintf(fout,"\n\n%7s (g) Freq Data: \n", sitepatt);
5297
else fprintf(fout,"\n\n%7s Freq Data: \n", sitepatt);
5299
for(h=0; h<lst; h++) {
5300
hp = (!com.readpattern ? com.pose[h] : h);
5301
fprintf(fout,"\n%4d ",h+1);
5302
if (com.ngene>1) { /* which gene the site is from */
5303
for(ig=1; ig<com.ngene; ig++)
5304
if(hp<com.posG[ig]) break;
5305
fprintf(fout,"(%d)",ig);
5307
fprintf(fout," %5.0f ", com.fpatt[hp]);
5308
print1site(fout, hp);
5309
fprintf(fout, ": ");
5311
for(j=0; j<nid; j++) {
5312
if (com.seqtype!=CODONseq)
5313
fprintf(fout,"%c(%5.3f) ", pch[(int)zanc[j*com.npatt+hp]],pnode[j*com.npatt+hp]);
5316
ic = zanc[j*com.npatt+hp];
5317
Codon2AA(CODONs[ic], aa, com.icode, &i);
5318
fprintf(fout," %s %1c %5.3f (%1c %5.3f)",
5319
CODONs[ic], AAs[i], pnode[j*com.npatt+hp], AAs[(int)bestAA[j*com.npatt+hp]], pbestAA[j*com.npatt+hp]);
5323
if(noisy && (h+1)%100000==0) printf("\r\tprinting, %d sites done", h+1);
5325
if(noisy && h>=100000) printf("\n");
5327
/* Map changes onto branches
5328
k1 & k2 are the two characters; p1 and p2 are the two probs. */
5330
if(!com.readpattern) {
5331
fputs("\n\nSummary of changes along branches.\n",fout);
5332
fputs("Check root of tree for directions of change.\n",fout);
5333
if(!com.cleandata && com.seqtype==1)
5334
fputs("Counts of n & s are incorrect along tip branches with ambiguity data.\n",fout);
5335
for(j=0; j<tree.nbranch; j++,FPN(fout)) {
5336
inode = tree.branches[j][1];
5338
fprintf(fout,"\nBranch %d:%5d..%-2d",j+1,tree.branches[j][0]+1,inode+1);
5339
if(inode<com.ns) fprintf(fout," (%s) ",com.spname[inode]);
5342
lsc = (com.seqtype==1 ? com.ls : com.ls/3);
5343
for (h=0,nst=nat=0; h<lsc; h++) {
5344
getCodonNode1Site(codon[0], zanc, inode, h);
5345
getCodonNode1Site(codon[1], zanc, tree.branches[j][0], h);
5346
difcodonNG(codon[0], codon[1], &S, &N, &ns,&na, 0, com.icode);
5350
fprintf(fout," (n=%4.1f s=%4.1f)",nat,nst);
5352
fprintf(fout,"\n\n");
5353
for(h=0; h<lst; h++) {
5354
hp = (!com.readpattern ? com.pose[h] : h);
5355
if (com.seqtype!=CODONseq) {
5357
k2 = pch[(int)com.z[inode][hp]];
5359
k2 = pch[(int)zanc[(inode-com.ns)*com.npatt+hp]];
5360
p2 = pnode[(inode-com.ns)*com.npatt+hp];
5362
k1 = pch[ zanc[(tree.branches[j][0]-com.ns)*com.npatt+hp] ];
5363
p1 = pnode[(tree.branches[j][0]-com.ns)*com.npatt+hp];
5368
strcpy(codon[1], CODONs[com.z[inode][hp]]);
5369
k2 = GetAASiteSpecies(inode, hp);
5372
strcpy(codon[1], CODONs[(int)zanc[(inode-com.ns)*com.npatt+hp]]);
5373
k2 = AAs[(int)bestAA[(inode-com.ns)*com.npatt+hp]];
5374
p2 = pbestAA[(inode-com.ns)*com.npatt+hp];
5376
strcpy(codon[0], CODONs[(int)zanc[(tree.branches[j][0]-com.ns)*com.npatt+hp]]);
5377
k1 = AAs[(int)bestAA[(tree.branches[j][0]-com.ns)*com.npatt+hp]];
5378
p1 = pbestAA[(tree.branches[j][0]-com.ns)*com.npatt+hp];
5380
if(strcmp(codon[0],codon[1])) {
5382
fprintf(fout,"\t%4d %s (%c) %.3f -> %s (%c)\n", h+1,codon[0],k1,p1, codon[1],k2);
5384
fprintf(fout,"\t%4d %s (%c) %.3f -> %s (%c) %.3f\n",h+1,codon[0],k1,p1, codon[1],k2,p2);
5389
if(k1==k2) continue;
5390
fprintf(fout,"\t%4d ",h+1);
5393
if(sitelabels) fprintf(fout," %5s ",sitelabels[h]);
5395
if(inode<com.ns) fprintf(fout,"%c %.3f -> %1c\n",k1,p1,k2);
5396
else fprintf(fout,"%c %.3f -> %1c %.3f\n",k1,p1,k2,p2);
5402
ListAncestSeq(fout, zanc);
5403
fprintf(fout,"\n\nOverall accuracy of the %d ancestral sequences:", nid);
5404
matout2(fout,pMAPnode, 1, nid, 9,5); fputs("for a site.\n",fout);
5405
matout2(fout,pMAPnodeA, 1, nid, 9,5); fputs("for the sequence.\n", fout);
5407
/* best amino acid sequences from codonml */
5409
if(com.seqtype==1) {
5410
fputs("\n\nAmino acid sequences inferred by codonml.\n",fout);
5412
fputs("Results unreliable for sites with alignment gaps.\n",fout);
5413
for(inode=0; inode<nid; inode++) {
5414
fprintf(fout,"\nNode #%-10d ",com.ns+inode+1);
5415
for(h=0; h<lst; h++) {
5416
hp = (!com.readpattern ? com.pose[h] : h);
5417
fprintf(fout, "%c", AAs[(int)bestAA[inode*com.npatt+hp]]);
5418
if((h+1)%10==0) fputc(' ', fout);
5424
ChangesSites(fout, coding, zanc);
5428
if(coding) free(pbestAA);
5433
void getCodonNode1Site(char codon[], char zanc[], int inode, int site)
5435
/* this is used to retrive the codon from a codon sequence for codonml
5436
or coding sequence in baseml, used in ancestral reconstruction
5437
zanc has ancestral sequences
5442
for(i=0; i<3; i++) /* to force crashes */
5444
if(com.seqtype==CODONseq) {
5445
hp = (!com.readpattern ? com.pose[site] : site);
5448
strcpy(codon, CODONs[zanc[(inode-com.ns)*com.npatt+hp]]);
5450
strcpy(codon, CODONs[com.z[inode][hp]]);
5453
else { /* baseml coding reconstruction */
5456
codon[i] = BASEs[(int)zanc[(inode-com.ns)*com.npatt+com.pose[site*3+i]]];
5458
for(i=0; i<3; i++) codon[i] = BASEs[ com.z[inode][com.pose[site*3+i]] ];
5463
int ChangesSites(FILE*frst, int coding, char *zanc)
5465
/* this lists and counts changes at sites from reconstructed ancestral sequences
5466
com.z[] has the data, and zanc[] has the ancestors
5467
For codon sequences (codonml or baseml with com.coding), synonymous and
5468
nonsynonymous changes are counted separately.
5471
char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs));
5472
char codon[2][4]={" "," "};
5473
int h,hp,inode,k1,k2,d, ls1=(com.readpattern?com.npatt:com.ls);
5474
double S,N,Sd,Nd, S1,N1,Sd1,Nd1, b,btotal=0, p,C;
5476
if(com.seqtype==0 && coding) ls1/=3;
5478
fprintf(frst,"\n\nCounts of changes at sites, listed by %s\n\n",
5479
(com.readpattern?"pattern":"site"));
5480
fprintf(frst1,"\nList of sites with changes according to ancestral reconstruction\n");
5481
fprintf(frst1,"Suzuki-Gojobori (1999) style test\n");
5483
fprintf(frst, "(Counts of n & s are incorrect at sites with ambiguity data)\n\n");
5485
for(inode=0; inode<tree.nnode; inode++)
5486
if(inode!=tree.root) btotal += nodes[inode].branch;
5487
for(h=0; h<ls1; h++) {
5488
fprintf(frst,"%4d ",h+1);
5489
for(inode=0,S=N=Sd=Nd=0; inode<tree.nnode; inode++) {
5490
if(inode==tree.root) continue;
5491
b = nodes[inode].branch;
5492
getCodonNode1Site(codon[0], zanc, nodes[inode].father, h);
5493
getCodonNode1Site(codon[1], zanc, inode, h);
5495
difcodonNG(codon[0], codon[1], &S1, &N1, &Sd1, &Nd1, 0, com.icode);
5501
fprintf(frst," %3s.%3s ",codon[0],codon[1]);
5504
b = S+N; S /= b; N /= b;
5505
fprintf(frst,"(S N: %7.3f%7.3f Sd Nd: %6.1f %5.1f)\n", S*3,N*3,Sd,Nd);
5506
fprintf(frst1,"%4d S N: %7.3f%7.3f Sd Nd: %6.1f %5.1f ", h+1,S*3,N*3,Sd,Nd);
5509
for(d=0,p=0,C=1; d<=Nd; d++) {
5510
p += C*pow(N,d) * pow(1-N,Sd+Nd-d);
5511
C *= (Sd+Nd-d)/(d+1);
5513
fprintf(frst1," - p =%6.3f %s", p,(p<.01?"**":(p<.05?"*":"")));
5516
for(d=0,p=0,C=1; d<=Sd; d++) {
5517
p += C*pow(S,d)*pow(1-S,Sd+Nd-d);
5518
C *= (Sd+Nd-d)/(d+1);
5520
fprintf(frst1," + p =%6.3f %s", p,(p<.01?"**":(p<.05?"*":"")));
5523
fprintf(frst1,"\n");
5526
else { /* noncoding nucleotide or aa sequences */
5527
fprintf(frst,"\n\nCounts of changes at sites%s\n\n",
5528
(com.readpattern?", listed by pattern":""));
5529
for(h=0; h<ls1; h++) {
5530
hp=(!com.readpattern ? com.pose[h] : h);
5531
fprintf(frst,"%4d ",h+1);
5532
for(inode=0,d=0;inode<tree.nnode;inode++) {
5533
if(inode==tree.root) continue;
5534
k1=pch[(int) zanc[(nodes[inode].father-com.ns)*com.npatt+hp] ];
5536
k2 = pch[com.z[inode][hp]];
5538
k2 = pch[(int) zanc[(inode-com.ns)*com.npatt+hp] ];
5541
fprintf(frst," %c%c", k1,k2);
5544
fprintf(frst," (%d)\n", d);
5552
#define NBESTANC 4 /* use 1 2 3 or 4 */
5553
int parsimony=0, *nBestScore, *icharNode[NBESTANC], *combIndex;
5554
double *fhsiteAnc, *lnPanc[NBESTANC], *PMatTips, *combScore;
5555
char *charNode[NBESTANC], *ancSeq, *ancState1site;
5557
int largeReconstruction;
5559
void DownPassPPSG2000OneSite (int h, int inode, int inodestate, int ipath);
5560
void PrintAncState1site (char ancState1site[], double prob);
5563
double P0[16]={0, 1, 1.5, 1.5,
5568
double piroot[NCODE]={0};
5570
/* combIndex[] uses two bits for each son to record the path that is taken by
5571
each reconstruction; for 32-bit integers, the maximum number of sons for
5574
lnPanc[3][(tree.nnode-com.ns)*npatt*n] uses the space of com.conP.
5575
It holds the ln(Pr) for the best reconstructions at the subtree down inode
5576
given the state of the father node.
5577
charNode[0,1,2] holds the corresponding state at inode.
5579
int nBestScore[maxnson];
5580
int combIndex[2*n*ncomb];
5581
double *combScore[n*ncomb];
5582
char ancSeq[nintern*npatt], ancState1site[nintern];
5583
int icharNode[NBESTANC][nintern*npatt*n];
5584
char charNode[NBESTANC][nintern*npatt*n];
5587
void UpPassPPSG2000 (int inode, int igene, double x[])
5589
/* The algorithm of PPSG2000, modified. This routine is based on ConditionalPNode().
5590
lnPanc[h*n+i] is the best lnP, given that inode has state i.
5591
charNode[] stores the characters that achieved the best lnP.
5594
int n=com.ncode, it,ibest,i,j,k,h, ison, nson=nodes[inode].nson, *pc;
5595
int pos0=com.posG[igene],pos1=com.posG[igene+1], ichar,jchar;
5596
int ncomb=1,icomb, ipath;
5597
double t, y, psum1site=-1;
5599
if(com.ncode!=4) debug=0;
5601
for(i=0; i<nson; i++)
5602
if(nodes[nodes[inode].sons[i]].nson>0)
5603
UpPassPPSG2000(nodes[inode].sons[i], igene, x);
5604
for(i=0,ncomb=1; i<nson; i++)
5605
ncomb *= (nBestScore[i] = (nodes[nodes[inode].sons[i]].nson>0 ? NBESTANC : 1));
5607
printf("\n\nNode %2d has sons ", inode+1);
5608
for(i=0; i<nson; i++) printf(" %2d", nodes[inode].sons[i]+1);
5609
printf(" ncomb=%2d: ", ncomb);
5610
for(i=0; i<nson; i++) printf(" %2d", nBestScore[i]); FPN(F0);
5613
if(inode!=tree.root) { /* calculate log{P(t)} from father to inode */
5614
t=nodes[inode].branch*_rateSite;
5616
if(com.clock) t *= GetBranchRate(igene,(int)nodes[inode].label,x,NULL);
5617
else t *= com.rgene[igene];
5619
GetPMatBranch(PMat, x, t, inode);
5620
for(j=0; j<n*n; j++)
5621
PMat[j] = (PMat[j]<1e-300 ? 300 : -log(PMat[j]));
5624
for(h=pos0; h<pos1; h++) { /* loop through site patterns */
5626
/* The last round for inode==tree.root, shares some code with other nodes,
5627
and is thus embedded in the same loop. Alternatively this round can be
5628
taken out of the loop with some code duplicated.
5630
for(ichar=0; ichar<(inode!=tree.root?n:1); ichar++) { /* ichar for father */
5631
/* given ichar for the father, what are the best reconstructions at
5632
inode? Look at n*ncomb possibilities, given father state ichar.
5635
if(inode==tree.root) printf("\n\nfather is root\n");
5636
else printf("\n\nichar = %2d %c for father\n", ichar+1,BASEs[ichar]);
5639
for(icomb=0; icomb<n*ncomb; icomb++) {
5640
jchar = icomb/ncomb; /* jchar is for inode */
5641
if(inode==tree.root)
5642
combScore[icomb] = -log(com.pi[jchar]+1e-300);
5644
combScore[icomb] = PMat[ichar*n+jchar];
5646
if(inode==tree.root && parsimony) combScore[icomb] = 0;
5648
if(debug) printf("comb %2d %c", icomb+1,BASEs[jchar]);
5650
for(i=0,it=icomb%ncomb; i<nson; i++) { /* The ibest-th state in ison. */
5651
ison = nodes[inode].sons[i];
5652
ibest = it%nBestScore[i];
5653
it /= nBestScore[i];
5655
if(nodes[ison].nson) /* internal node */
5656
y = lnPanc[ibest][(ison-com.ns)*com.npatt*n+h*n+jchar];
5657
else if (com.cleandata) /* tip clean: PMatTips[] has log{P(t)}. */
5658
y = PMatTips[ ison*n*n + jchar*n + com.z[ison][h] ];
5659
else { /* tip unclean: PMatTips[] has P(t). */
5660
for(k=0,y=0; k<nChara[com.z[ison][h]]; k++)
5661
y += PMatTips[ ison*n*n+jchar*n + CharaMap[com.z[ison][h]][k] ];
5665
combScore[icomb] += y;
5666
if(debug) printf("%*s son %2d #%2d %7.1f\n", (i?10:1),"", ison+1, ibest+1,y);
5670
if(debug) { printf("score "); for(i=0;i<n*ncomb; i++) printf(" %4.1f",combScore[i]); FPN(F0); }
5671
indexing(combScore, n*ncomb, combIndex, 0, combIndex+n*ncomb);
5672
if(debug) { printf("index "); for(i=0;i<n*ncomb; i++) printf(" %4d",combIndex[i]); FPN(F0); }
5675
/* print out reconstructions at the site if inode is root. */
5676
if(inode==tree.root) {
5677
fprintf(fanc,"%4d ", h+1);
5678
if(com.ngene>1) fprintf(fanc,"(%d) ", igene+1);
5679
fprintf(fanc," %6.0f ",com.fpatt[h]);
5680
print1site(fanc, h);
5681
fprintf(fanc, ": ");
5683
psum1site=0; /* used if inode is root */
5685
for(j=0; j<(inode!=tree.root ? NBESTANC : n*ncomb); j++) {
5686
jchar=(it=combIndex[j])/ncomb; it%=ncomb;
5688
lnPanc[j][(inode-com.ns)*com.npatt*n+h*n+ichar]=combScore[combIndex[j]];
5689
charNode[j][(inode-com.ns)*com.npatt*n+h*n+ichar]=jchar;
5691
if(debug) printf("\t#%d: %6.1f %c ", j+1, combScore[combIndex[j]], BASEs[jchar]);
5693
for(i=0,ipath=0; i<nson; i++) {
5694
ison=nodes[inode].sons[i];
5695
ibest=it%nBestScore[i];
5697
ipath |= ibest<<(2*i);
5698
if(debug) printf("%2d", ibest+1);
5701
icharNode[j][(inode-com.ns)*com.npatt*n+h*n+ichar]=ipath;
5703
if(debug) printf(" (%o)", ipath);
5705
/* print if inode is root. */
5706
if(inode==tree.root) {
5707
ancState1site[inode-com.ns]=jchar;
5708
if(parsimony) y = combScore[combIndex[j]];
5709
else psum1site += y = exp(-combScore[combIndex[j]]-fhsiteAnc[h]);
5711
for(i=0; i<nson; i++) {
5712
if(nodes[ison=nodes[inode].sons[i]].nson)
5713
DownPassPPSG2000OneSite(h, tree.root, jchar, ipath);
5715
PrintAncState1site(ancState1site, y);
5716
if(j>NBESTANC && y<.001) break;
5720
if(inode==tree.root) fprintf(fanc," (total %6.3f)\n", psum1site);
5722
if(largeReconstruction && (h+1)%2000==0)
5723
printf("\r\tUp pass for gene %d node %d sitepatt %d.", igene+1,inode+1,h+1);
5726
if(largeReconstruction)
5727
printf("\r\tUp pass for gene %d node %d.", igene+1,inode+1);
5731
void DownPassPPSG2000OneSite (int h, int inode, int inodestate, int ipath)
5733
/* this puts the state in ancState1site[nintern], using
5734
int icharNode[NBESTANC][nintern*npatt*n],
5735
char charNode[NBESTANC][nintern*npatt*n].
5736
jchar is the state at inode, and ipath is the ipath code for inode.
5738
int n=com.ncode, i, ison, ibest, sonstate;
5740
for(i=0; i<nodes[inode].nson; i++) {
5741
ison=nodes[inode].sons[i];
5742
if(nodes[ison].nson>1) {
5743
ibest = (ipath & (3<<(2*i))) >> (2*i);
5744
ancState1site[ison-com.ns] = sonstate =
5745
charNode[ibest][(ison-com.ns)*com.npatt*n+h*n+inodestate];
5746
DownPassPPSG2000OneSite(h, ison, sonstate,
5747
icharNode[ibest][(ison-com.ns)*com.npatt*n+h*n+inodestate]);
5753
void PrintAncState1site (char ancState1site[], double prob)
5756
char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs)),codon[4]="";
5758
for(i=0; i<tree.nnode-com.ns; i++) {
5759
if(com.seqtype==1) {
5761
fprintf(fanc,"%s ",getcodon(codon,FROM61[(int)ancState1site[i]]));
5765
fprintf(fanc,"%c",pch[(int)ancState1site[i]]);
5767
fprintf(fanc," (%5.3f) ", prob);
5770
void DownPassPPSG2000 (int inode)
5772
/* this reads out the best chara for inode from charNode[] into ancSeq[].
5777
for(h=0; h<com.npatt; h++) {
5778
if(inode!=tree.root)
5779
c0=ancSeq[(nodes[inode].father-com.ns)*com.npatt+h];
5780
ancSeq[(inode-com.ns)*com.npatt+h]
5781
= charNode[0][(inode-com.ns)*com.npatt*com.ncode+h*com.ncode+c0];
5783
for(i=0; i<nodes[inode].nson; i++)
5784
if(nodes[ison=nodes[inode].sons[i]].nson > 1)
5785
DownPassPPSG2000(ison);
5790
int AncestralJointPPSG2000 (FILE *fout, double x[])
5792
/* Ziheng Yang, 8 June 2000, rewritten on 8 June 2005.
5793
Joint ancestral reconstruction, taking character states for all nodes at a
5794
site as one entity, based on the algorithm of Pupko et al. (2000
5795
Mol. Biol. Evol. 17:890-896).
5797
fhsiteAns[]: fh[] for each site pattern
5798
nodes[].conP[] are destroyed and restored at the end of the routine.
5799
ancSeq[] stores the ancestral seqs, the best reconstruction.
5801
This outputs results by pattern. I tried to print results by sites, but gave up as
5802
some variables use the same memory (e.g., combIndex) for different site patterns.
5804
char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs)),codon[4]="";
5805
int n=com.ncode,nintern=tree.nnode-com.ns, i,j,k,h,hp,igene;
5806
int maxnson=0, maxncomb, lst=(com.readpattern?com.npatt:com.ls);
5807
char *sitepatt=(com.readpattern?"pattern":"site");
5809
size_t sconPold = com.sconP, s;
5811
largeReconstruction = (noisy && (com.ns>300 || com.ls>1000000));
5813
if(noisy) puts("Joint reconstruction.");
5815
for(i=0; i<tree.nnode; i++) maxnson=max2(maxnson,nodes[i].nson);
5816
if(maxnson>16 || NBESTANC>4) /* for int at least 32 bits */
5817
error2("NBESTANC too large or too many sons.");
5818
for(i=0,maxncomb=1; i<maxnson; i++) maxncomb*=NBESTANC;
5819
if((PMatTips=(double*)malloc(com.ns*n*n*sizeof(double)))==NULL)
5820
error2("oom PMatTips");
5821
s = NBESTANC*nintern*(size_t)com.npatt*n*sizeof(double);
5824
printf("\n%9lu bytes for conP, adjusted\n", com.sconP);
5825
if((com.conP=(double*)realloc(com.conP,com.sconP))==NULL)
5828
s = NBESTANC*nintern*com.npatt*n;
5829
s = ((s*sizeof(int)+(s+nintern)*sizeof(char)+16)/sizeof(double))*sizeof(double);
5830
if(s > com.sspace) {
5832
printf("\n%9lu bytes for space, adjusted\n",com.sspace);
5833
if((com.space=(double*)realloc(com.space,com.sspace))==NULL) error2("oom space");
5835
for(i=0; i<NBESTANC; i++) {
5836
lnPanc[i]= com.conP+i*nintern*com.npatt*n;
5837
icharNode[i] = (int*)com.space+i*nintern*com.npatt*n;
5838
charNode[i] = (char*)((int*)com.space+NBESTANC*nintern*com.npatt*n)
5839
+ i*nintern*com.npatt*n;
5840
ancState1site = charNode[0]+NBESTANC*nintern*com.npatt*n;
5842
if((ancSeq=(char*)malloc(nintern*com.npatt*n*sizeof(char)))==NULL)
5843
error2("oom charNode");
5845
if((combScore=(double*)malloc((3*n*maxncomb+com.ns)*sizeof(double)))==NULL)
5846
error2("oom combScore");
5847
nBestScore = (int*)(combScore+n*maxncomb);
5848
combIndex = nBestScore + com.ns; /* combIndex[2*n*ncomb] contains work space */
5851
fprintf(fout, "\n\n(2) Joint reconstruction of ancestral sequences\n");
5852
fprintf(fout, "(eqn. 2 in Yang et al. 1995 Genetics 141:1641-1650), using ");
5853
fprintf(fout, "the algorithm of Pupko et al. (2000 Mol Biol Evol 17:890-896),\n");
5854
fprintf(fout, "modified to generate sub-optimal reconstructions.\n");
5855
fprintf(fout, "\nReconstruction (prob.), listed by pattern (use the observed data to find the right site).\n");
5856
fprintf(fout, "\nPattern Freq Data:\n\n");
5858
for(igene=0; igene<com.ngene; igene++) {
5859
if(com.Mgene>1) SetPGene(igene,1,1,0,x);
5860
for(i=0; i<com.ns; i++) {
5861
t=nodes[i].branch*_rateSite;
5863
if(com.clock) t *= GetBranchRate(igene,(int)nodes[i].label,x,NULL);
5864
else t *= com.rgene[igene];
5866
GetPMatBranch(PMatTips+i*n*n, x, t, i);
5870
for(i=0; i<com.ns*n*n; i++)
5871
PMatTips[i] = (PMatTips[i]<1e-20 ? 300 : -log(PMatTips[i]));
5873
if(parsimony) for(i=0; i<com.ns; i++)
5874
xtoy(P0, PMatTips+i*n*n, n*n);
5876
UpPassPPSG2000(tree.root, igene, x); /* this prints into frst as well */
5879
if(largeReconstruction) puts("\n\tDown pass.");
5880
DownPassPPSG2000(tree.root);
5882
ListAncestSeq(fout, ancSeq);
5887
com.sconP = sconPold;
5888
if((com.conP=(double*)realloc(com.conP,com.sconP))==NULL)
5896
int AncestralSeqs (FILE *fout, double x[])
5898
/* Ancestral sequence reconstruction using likelihood (Yang et al. 1995).
5899
Marginal works with constant rate and variable rates among sites.
5900
Joint works only with constant rate among sites (ncatG=1).
5903
double lnL, *ScaleC=NULL; /* collected scale factors */
5906
error2("When Mgene=1, use RateAncestor = 0.");
5907
if (tree.nnode==com.ns)
5908
{ puts("\nNo ancestral nodes to reconstruct..\n"); return(0); }
5909
if (noisy) printf ("\nReconstructed ancestral states go into file rst.\n");
5910
fprintf(fout, "\nAncestral reconstruction by %sML.\n",
5911
(com.seqtype==0?"BASE":(com.seqtype==1?"CODON":"AA")));
5912
FPN(fout); OutTreeN(fout,1,1); FPN(fout); FPN(fout);
5913
OutTreeN(fout,0,0); FPN(fout); FPN(fout);
5914
OutTreeB(fout); FPN(fout);
5916
fputs("\ntree with node labels for Rod Page's TreeView\n",fout);
5917
OutTreeN(fout,1,PrNodeNum); FPN(fout);
5919
fprintf (fout, "\nNodes %d to %d are ancestral\n", com.ns+1,tree.nnode);
5920
if((fhsiteAnc=(double*)malloc(com.npatt*sizeof(double)))==NULL)
5921
error2("oom fhsiteAnc");
5922
if(com.NnodeScale && com.ncatG>1)
5923
if((ScaleC=(double*)malloc(max2(com.npatt,com.ncatG) *sizeof(double)))==NULL)
5924
error2("oom ScaleC in AncestralSeqs");
5927
puts("Rates are variable among sites, marginal reconstructions only.");
5928
if(!com.verbose) fputs("Constant sites not listed for verbose=0\n",fout);
5929
if(!com.cleandata) fputs("Unreliable at sites with alignment gaps\n",fout);
5931
if (com.ncatG<=1 || com.method!=1)
5932
ProbSitePattern (x, &lnL, fhsiteAnc, ScaleC);
5934
AncestralMarginal(fout, x, fhsiteAnc, ScaleC);
5937
/* fhsiteAnc[] is modified by both Marginal and Joint. */
5938
if(com.ncatG<=1 && tree.nnode>com.ns+1) {
5939
ProbSitePattern (x, &lnL, fhsiteAnc, ScaleC);
5940
for(h=0; h<com.npatt; h++) {
5941
fhsiteAnc[h] = log(fhsiteAnc[h]);
5942
for(k=0; k<com.NnodeScale; k++)
5943
fhsiteAnc[h] += com.nodeScaleF[k*com.npatt+h];
5945
/* AncestralJointPPSG2000 corrupts com.conP[] and fhsiteAnc[].
5947
AncestralJointPPSG2000(fout, x);
5951
if(com.NnodeScale && com.ncatG>1) free(ScaleC);
5959
int SetNodeScale(int inode);
5960
int NodeScale(int inode, int pos0, int pos1);
5962
void InitializeNodeScale(void)
5964
/* This allocates memory to hold scale factors for nodes and also decide on the
5965
nodes for scaling by calling SetNodeScale().
5966
The scaling node is chosen before the iteration by counting the number of
5967
nodes visited in the post-order tree travesal algorithm (see the routine
5969
See Yang (2000 JME 51:423-432) for details.
5970
The memory required is com.NnodeScale*com.npatt*sizeof(double).
5975
if(com.clock>=5) return;
5978
com.nodeScale = (char*)realloc(com.nodeScale, tree.nnode*sizeof(char));
5979
if(com.nodeScale==NULL) error2("oom");
5980
for(i=0; i<tree.nnode; i++) com.nodeScale[i] = 0;
5981
SetNodeScale(tree.root);
5982
nS = com.NnodeScale*com.npatt;
5983
if(com.conPSiteClass) nS *= com.ncatG;
5984
if(com.NnodeScale) {
5985
if((com.nodeScaleF=(double*)realloc(com.nodeScaleF, nS*sizeof(double)))==NULL)
5986
error2("oom nscale");
5987
for(i=0; i<nS; i++) com.nodeScaleF[i] = 0;
5990
printf("\n%d node(s) used for scaling (Yang 2000 J Mol Evol 51:423-432):\n",com.NnodeScale);
5991
for(i=0; i<tree.nnode; i++)
5992
if(com.nodeScale[i]) printf(" %2d",i+1);
5999
int SetNodeScale (int inode)
6001
/* This marks nodes for applying scaling factors when calculating f[h].
6003
int i,ison, d=0, every;
6005
if(com.seqtype==0) every=100; /* baseml */
6006
else if(com.seqtype==1) every=15; /* codonml */
6007
else every=50; /* aaml */
6009
for(i=0; i<nodes[inode].nson; i++) {
6010
ison = nodes[inode].sons[i];
6011
d += (nodes[ison].nson ? SetNodeScale(ison) : 1);
6013
if(inode!=tree.root && d>every) {
6014
com.nodeScale[inode] = 1;
6022
int NodeScale (int inode, int pos0, int pos1)
6024
/* scale to avoid underflow
6026
int h,k,j, n=com.ncode;
6027
double t, smallw=1e-12;
6029
for(j=0,k=0; j<tree.nnode; j++) /* k-th node for scaling */
6031
else if(com.nodeScale[j]) k++;
6033
for(h=pos0; h<pos1; h++) {
6034
for(j=0,t=0;j<n;j++)
6035
if(nodes[inode].conP[h*n+j]>t) t=nodes[inode].conP[h*n+j];
6038
for(j=0;j<n;j++) nodes[inode].conP[h*n+j]=1; /* both 0 and 1 fine */
6039
com.nodeScaleF[k*com.npatt+h]=-800; /* this is problematic? */
6042
for(j=0;j<n;j++) nodes[inode].conP[h*n+j]/=t;
6043
com.nodeScaleF[k*com.npatt+h]=log(t);
6051
static double *dfsites;
6053
int fx_r(double x[], int np);
6056
#if (BASEML || CODEML)
6058
int HessianSKT2004 (double xmle[], double lnLm, double g[], double H[])
6060
/* this calculates the hessian matrix of branch lengths using the approximation
6061
of Seo et al. (2004), especially useful for approximate likelihood calcualtion
6062
in divergence time estimation.
6063
df[0][i*com.npatt+h] has d log(f_h)/d b_i.
6064
method = 0 uses difference approximation to first derivatives.
6065
method = 1 uses analytical calculation of first derivatives (Yang 2000).
6066
I am under the impression that method = 1 may be useful for very large datasets
6067
with >10M sites, but I have not implemented this method because the analytical
6068
calculation of first derivatives is possible for branch lengths only, and not
6069
available for other parameters. Right now with method = 0, H and the SEs are
6070
calculated for all parameters although the H matrix in rst2 is a subset for
6071
branch lengths only. More thought about what to do. Ziheng's note on 8 March 2010.
6073
int method=0, backforth, h, i, j, lastround0=LASTROUND;
6074
double *x, *lnL[2], *df[2], eh0=Small_Diff*2, eh, small;
6076
if(com.np!=tree.nbranch && method==1)
6077
error2("I think HessianSKT2004 works for branch lengths only");
6078
df[0] = (double*)malloc((com.npatt*2+1)*com.np*sizeof(double));
6079
if(df[0]==NULL) error2("oom space in HessianSKT2004");
6080
df[1] = df[0] + com.npatt*com.np;
6081
x = df[1] + com.npatt*com.np;
6082
lnL[0] = (double*)malloc(com.np*2*sizeof(double));
6083
lnL[1] = lnL[0]+com.np;
6087
for(backforth=0; backforth<2; backforth++) {
6088
for(i=0; i<com.np; i++) {
6089
xtoy(xmle, x, com.np);
6090
eh = eh0*(fabs(xmle[i]) + 1);
6091
if(backforth==0) x[i] = xmle[i] - eh;
6092
else x[i] = xmle[i] + eh;
6094
printf("HessianSKT2004 warning: x[%d] = %8.5g < 0\n", i+1, x[i]);
6095
dfsites = df[backforth] + i*com.npatt;
6096
lnL[backforth][i] = -com.plfun(x, com.np);
6100
for(i=0; i<com.np; i++) {
6101
eh = eh0*(fabs(xmle[i]) + 1);
6102
g[i] = (lnL[1][i] - lnL[0][i])/(eh*2);
6105
printf("\nx gL g H");
6106
matout(F0, xmle, 1, com.np);
6107
matout(F0, g, 1, com.np);
6109
zero(H, com.np*com.np);
6110
for(i=0; i<com.np; i++) {
6111
eh = eh0*(fabs(xmle[i]) + 1);
6112
for(h=0; h<com.npatt; h++)
6113
df[0][i*com.npatt+h] = (df[1][i*com.npatt+h] - df[0][i*com.npatt+h])/(eh*2);
6116
for(i=0; i<com.np; i++) {
6117
for(j=0; j<com.np; j++)
6118
for(h=0; h<com.npatt; h++)
6119
H[i*com.np+j] -= df[0][i*com.npatt+h] * df[0][j*com.npatt+h] * com.fpatt[h];
6122
LASTROUND = lastround0;
6130
int lfunRates (FILE* fout, double x[], int np)
6132
/* for dG, AdG or similar non-parametric models
6133
This distroys com.fhK[], and in return,
6134
fhK[<npatt] stores rates for conditional mean (re), and
6135
fhK[<2*npatt] stores the most probable rate category number.
6136
fhsite[npatt] stores fh=log(fh).
6138
int ir,il,it, h,hp,j, nscale=1, direction=-1;
6139
int lst=(com.readpattern?com.npatt:com.ls);
6140
double lnL=0,fh,fh1, t, re,mre,vre, b1[NCATG],b2[NCATG],*fhsite;
6142
if (noisy) printf("\nEstimated rates for sites go into file %s\n",ratef);
6143
if (SetParameters(x)) puts ("par err. lfunRates");
6145
fprintf(fout, "\nEstimated rates for sites from %sML.\n",
6146
(com.seqtype==0?"BASE":(com.seqtype==1?"CODON":"AA")));
6147
OutTreeN(fout,1,1); FPN(fout);
6148
fprintf (fout,"\nFrequencies and rates for categories (K=%d)", com.ncatG);
6149
fprintf(fout, "\nrate:"); FOR(j,com.ncatG) fprintf(fout," %8.5f",com.rK[j]);
6150
fprintf(fout, "\nfreq:"); FOR(j,com.ncatG) fprintf(fout," %8.5f",com.freqK[j]);
6154
fprintf(fout,"\nTransition prob matrix over sites");
6155
matout2(fout,com.MK,com.ncatG,com.ncatG,8,4);
6158
if((fhsite=(double*)malloc(com.npatt*sizeof(double)))==NULL) error2("oom fhsite");
6160
if(com.NnodeScale) {
6162
for(ir=1,it=0; ir<com.ncatG; ir++)
6163
if(com.fhK[ir*com.npatt+h] > com.fhK[it*com.npatt+h])
6165
t = com.fhK[it*com.npatt+h];
6166
lnL -= com.fpatt[h]*t;
6167
for(ir=0; ir<com.ncatG; ir++)
6168
com.fhK[ir*com.npatt+h] = exp(com.fhK[ir*com.npatt+h] - t);
6171
for(h=0; h<com.npatt; h++) {
6172
for(ir=0,fhsite[h]=0; ir<com.ncatG; ir++)
6173
fhsite[h] += com.freqK[ir]*com.fhK[ir*com.npatt+h];
6176
if (com.rho==0) { /* dG model */
6178
fprintf(fout,"\nPosterior probabilities for site classes, by %s\n\n",
6179
(com.readpattern?"pattern":"site"));
6180
for (h=0; h<lst; h++,FPN(fout)) {
6181
fprintf(fout, " %5d ", h+1);
6182
hp = (!com.readpattern ? com.pose[h] : h);
6183
for (ir=0; ir<com.ncatG; ir++)
6184
fprintf(fout, " %9.4f", com.freqK[ir]*com.fhK[ir*com.npatt+hp]/fhsite[hp]);
6188
fprintf(fout,"\n%7s Freq Data Rate (posterior mean & category)\n\n",
6189
(com.readpattern?"Pattern":"Site"));
6190
for (h=0,mre=vre=0; h<com.npatt; h++) {
6191
for (ir=0,it=0,t=re=0; ir<com.ncatG; ir++) {
6192
fh1 = com.freqK[ir]*com.fhK[ir*com.npatt+h];
6193
if(fh1>t) { t=fh1; it=ir; }
6194
re += fh1*com.rK[ir];
6196
lnL -= com.fpatt[h]*log(fhsite[h]);
6199
mre += com.fpatt[h]*re/com.ls;
6200
vre += com.fpatt[h]*re*re/com.ls;
6202
com.fhK[com.npatt+h] = it+1.;
6205
for(h=0; h<lst; h++) {
6206
hp=(!com.readpattern ? com.pose[h] : h);
6207
fprintf(fout,"%7d %5.0f ",h+1, com.fpatt[hp]);
6208
print1site(fout, hp);
6209
fprintf(fout," %8.3f%6.0f\n", com.fhK[hp], com.fhK[com.npatt+hp]);
6212
else { /* Auto-dGamma model */
6213
fputs("\nSite Freq Data Rates\n\n",fout);
6214
h = (direction==1?com.ls-1:0);
6215
for (il=0,mre=vre=0; il<lst; h-=direction,il++) {
6216
hp=(!com.readpattern ? com.pose[h] : h);
6218
FOR(ir,com.ncatG) b1[ir]=com.fhK[ir*com.npatt+hp];
6220
for (ir=0; ir<com.ncatG; ir++) {
6221
for (j=0,fh=0; j<com.ncatG; j++)
6222
fh+=com.MK[ir*com.ncatG+j]*b1[j];
6223
b2[ir] = fh*com.fhK[ir*com.npatt+hp];
6225
xtoy (b2, b1, com.ncatG);
6227
if ((il+1)%nscale==0)
6228
{ fh=sum(b1,com.ncatG); abyx(1/fh,b1,com.ncatG); lnL-=log(fh); }
6230
for (ir=0,it=-1,re=fh1=t=0; ir<com.ncatG; ir++) {
6231
re+=com.freqK[ir]*b1[ir]*com.rK[ir];
6232
fh1+=com.freqK[ir]*b1[ir];
6233
if (b1[ir]>t) {it=ir; t=b1[ir]; }
6237
vre += re*re/com.ls;
6239
fprintf(fout,"%4d %5.0f ",h+1, com.fpatt[hp]);
6240
print1site(fout, hp);
6241
fprintf(fout," %8.3f%6.0f\n", re, it+1.);
6244
for (ir=0,fh=0; ir<com.ncatG; ir++) fh += com.freqK[ir]*b1[ir];
6247
if (noisy) printf ("lnL =%14.6f\n", -lnL);
6248
fprintf (fout,"\nlnL =%14.6f\n", -lnL);
6250
fprintf (fout,"\nmean(r^)=%9.4f var(r^)=%9.4f", mre, vre);
6251
fprintf (fout,"\nAccuracy of rate prediction: corr(r^,r) =%9.4f\n",
6252
sqrt(com.alpha*vre));
6259
double lfunAdG (double x[], int np)
6261
/* Auto-Discrete-Gamma rates for sites
6262
See notes in lfundG().
6264
int nscale=1, h,il, ir, j, FPE=0;
6265
int direction=-1; /* 1: n->1; -1: 1->n */
6266
double lnL=0, b1[NCATG], b2[NCATG], fh;
6272
fh=com.fhK[0*com.npatt+h];
6273
lnL-=fh*com.fpatt[h];
6274
for(ir=1,com.fhK[h]=1; ir<com.ncatG; ir++)
6275
com.fhK[ir*com.npatt+h]=exp(com.fhK[ir*com.npatt+h]-fh);
6277
h = (direction==1?com.ls-1:0);
6278
for (il=0; il<com.ls; h-=direction,il++) {
6280
FOR(ir,com.ncatG) b1[ir]=com.fhK[ir*com.npatt+com.pose[h]];
6282
for (ir=0; ir<com.ncatG; ir++) {
6283
for (j=0,fh=0; j<com.ncatG; j++)
6284
fh+=com.MK[ir*com.ncatG+j]*b1[j];
6285
b2[ir]=fh*com.fhK[ir*com.npatt+com.pose[h]];
6287
xtoy(b2,b1,com.ncatG);
6289
if((il+1)%nscale==0) {
6290
fh=sum(b1,com.ncatG);
6293
FPE=1; printf ("h,fh%6d %12.4e\n", h+1,fh);
6299
abyx(1/fh,b1,com.ncatG); lnL-=log(fh);
6302
for (ir=0,fh=0; ir<com.ncatG; ir++) fh+=com.freqK[ir]*b1[ir];
6312
#if (defined(BASEML))
6314
int GetPMatBranch (double Pt[], double x[], double t, int inode)
6316
/* P(t) for branch leading to inode, called by routines ConditionalPNode()
6317
and AncestralSeq() in baseml and codeml. x[] is not used by baseml.
6319
double space[16] = {0};
6322
PMatK80(Pt, t, (com.nhomo==2?nodes[inode].kappa:com.kappa));
6325
EigenTN93(com.model,nodes[inode].kappa, -1, com.pi,&nR,Root,Cijk);
6326
else if (com.nhomo>2) /* need kappa on each node if fix_kappa ==0 */
6327
EigenTN93(com.model,nodes[inode].kappa, -1, nodes[inode].pi,&nR,Root,Cijk);
6328
if(com.model<=REV||com.model==REVu)
6331
QUNREST(NULL, Pt, x+com.ntime+com.nrgene, com.pi);
6332
matexp (Pt, t, 4, 10, space);
6338
#elif (defined(CODEML))
6340
int GetPMatBranch (double Pt[], double x[], double t, int inode)
6342
/* P(t) for branch leading to inode, called by routines ConditionalPNode()
6343
and AncestralSeq() in baseml and codeml.
6345
Qfactor in branch & site models (model = 2 or 3 and NSsites = 2 or 3):
6346
Qfactor scaling is applied here and not inside EigenQcodon().
6348
int iUVR=0, nUVR=NBTYPE+2, ib = (int)nodes[inode].label, updateUVR=0;
6349
double *pkappa, w, mr=1, Qfactor=1;
6350
double *pomega = com.pomega; /* x+com.ntime+com.nrgene+com.nkappa; */
6352
pkappa = (com.hkyREV||com.codonf==FMutSel?x+com.ntime+com.nrgene:&com.kappa);
6354
if(com.seqtype==CODONseq && com.NSsites && com.model) {
6355
/* branch&site models (both NSsites & model):
6356
Usual likelihood calculation, no need to re-calculate UVRoot.
6357
Only need to point to the right place.
6359
iUVR = Set_UVR_BranchSite (IClass, ib);
6360
Qfactor = Qfactor_NS_branch[ib];
6362
else if (com.seqtype==CODONseq && BayesEB==2 && com.model>1) { /* BEB for A&C */
6363
/* branch&site models (both NSsites & model) BEB calculation:
6364
Need to calculate UVRoot, as w is different. com.pomega points to wbranches[]
6365
in get_grid_para_like_M2M8() or get_grid_para_like_AC().
6367
Qfactor_NS_branch[] is fixed at the MLE:
6368
"we fix the branch lengths at the synonymous sites (i.e., the expected
6369
number of synonymous substitutions per codon) at their MLEs."
6372
EigenQcodon(0,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, w, Pt);
6373
Qfactor = Qfactor_NS_branch[ib];
6375
else if (com.seqtype==CODONseq && (com.model==1 ||com.model==2) && com.nbtype<=nUVR) {
6376
/* branch model, also for AAClasses */
6377
iUVR = (int)nodes[inode].label;
6378
U=_UU[iUVR]; V=_VV[iUVR]; Root=_Root[iUVR];
6380
else if (com.seqtype==CODONseq && com.model) {
6382
if(com.aaDist==AAClasses) { /* AAClass model */
6383
com.pomega = PointOmega(x+com.ntime, -1, inode, -1);
6384
EigenQcodon(0,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, -1, Pt);
6386
else if(com.nbtype>nUVR) { /* branch models, with more than 8 omega */
6387
EigenQcodon(0,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, nodes[inode].omega, Pt);
6391
if (com.seqtype == AAseq && com.model == Poisson)
6392
PMatJC69like(Pt, t, com.ncode);
6395
PMatUVRoot(Pt, t, com.ncode, U, V, Root);
6405
void print_lnf_site (int h, double logfh)
6407
#if(defined BASEML || defined CODEML)
6410
fprintf(frst, " %12.10f", exp(logfh));
6412
fprintf(frst, "\n");
6414
fprintf(flnf, "\n%6d %6.0f %16.10f %16.12f %12.4f ",
6415
h+1, com.fpatt[h], logfh, exp(logfh), com.ls*exp(logfh));
6416
print1site(flnf, h);
6421
double lfundG (double x[], int np)
6423
/* likelihood function for site-class models.
6424
This deals with scaling for nodes to avoid underflow if(com.NnodeScale).
6425
The routine calls fx_r() to calculate com.fhK[], which holds log{f(x|r)}
6426
when scaling or f(x|r) when not. Scaling factors are set and used for each
6427
site class (ir) to calculate log(f(x|r). When scaling is used, the routine
6428
converts com.fhK[] into f(x|r), by collecting scaling factors into lnL.
6429
The rest of the calculation then becomes the same and relies on f(x|r).
6430
Check notes in fx_r.
6431
This is also used for NSsites models in codonml.
6432
Note that scaling is used between fx_r() and ConditionalPNode()
6434
int h,ir, it, FPE=0;
6435
double lnL=0, fh=0,t;
6440
for(h=0; h<com.npatt; h++) {
6441
if (com.fpatt[h]<=0 && com.print>=0) continue;
6442
if(com.NnodeScale) { /* com.fhK[] has log{f(x|r}. Note the scaling for nodes */
6443
for(ir=1,it=0; ir<com.ncatG; ir++) /* select term for scaling */
6444
if(com.fhK[ir*com.npatt+h] > com.fhK[it*com.npatt+h]) it = ir;
6445
t = com.fhK[it*com.npatt+h];
6446
for(ir=0,fh=0; ir<com.ncatG; ir++)
6447
fh += com.freqK[ir]*exp(com.fhK[ir*com.npatt+h]-t);
6451
for(ir=0,fh=0; ir<com.ncatG;ir++)
6452
fh += com.freqK[ir]*com.fhK[ir*com.npatt+h];
6455
FPE=1; matout(F0,x,1,np);
6456
printf("\nlfundG: h=%4d fhK=%9.6e\ndata: ", h+1, fh);
6464
lnL -= fh*com.fpatt[h];
6465
if(LASTROUND==2) dfsites[h] = fh;
6466
if (com.print<0) print_lnf_site(h,fh);
6473
int SetPSiteClass(int iclass, double x[])
6475
/* This sets parameters for the iclass-th site class
6476
This is used by ConditionalPNode() and also updateconP in both algorithms
6479
int k = com.nrgene + !com.fix_kappa;
6480
double *pkappa=NULL, *xcom=x+com.ntime, mr;
6482
_rateSite = com.rK[iclass];
6486
pkappa = (com.hkyREV||com.codonf==FMutSel ? xcom+com.nrgene : &com.kappa);
6487
if(com.seqtype == CODONseq && com.NSsites) {
6491
if(com.aaDist<10) com.pomega = xcom + k + com.ncatG - 1 + 2*iclass;
6492
else if(com.aaDist==11) com.pomega = xcom + k + com.ncatG - 1 + 4*iclass;
6493
else if(com.aaDist==12) com.pomega = xcom + k + com.ncatG - 1 + 5*iclass;
6495
EigenQcodon(0,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, com.rK[iclass], PMat);
6502
extern int prt, Locus, Ir;
6505
int fx_r (double x[], int np)
6507
/* This calculates f(x|r) if(com.NnodeScale==0) or log{f(x|r)}
6508
if(com.NnodeScale>0), that is, the (log) probability of observing data x
6509
at a site, given the rate r or dN/dS ratio for the site. This is used by
6510
the discrete-gamma models in baseml and codeml as well as the NSsites models
6512
The results are stored in com.fhK[com.ncatG*com.npatt].
6513
This deals with underflows with large trees using global variables
6514
com.nodeScale and com.nodeScaleF[com.NnodeScale*com.npatt].
6516
int h, ir, i,k, ig, FPE=0;
6517
double fh, smallw=1e-12; /* for testing site class with w=0 */
6520
if(SetParameters(x)) puts("\npar err..");
6522
for(ig=0; ig<com.ngene; ig++) { /* alpha may differ over ig */
6523
if(com.Mgene>1 || com.nalpha>1)
6524
SetPGene(ig, com.Mgene>1, com.Mgene>1, com.nalpha>1, x);
6525
for(ir=0; ir<com.ncatG; ir++) {
6526
if(ir && com.conPSiteClass) { /* shift com.nodeScaleF & conP */
6528
com.nodeScaleF += (size_t)com.npatt*com.NnodeScale;
6529
for(i=com.ns; i<tree.nnode; i++)
6530
nodes[i].conP += (tree.nnode-com.ns)*com.ncode*(size_t)com.npatt;
6532
SetPSiteClass(ir,x);
6533
ConditionalPNode(tree.root,ig, x);
6535
for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
6536
if (com.fpatt[h]<=0 && com.print>=0) continue;
6537
for (i=0,fh=0; i<com.ncode; i++)
6538
fh += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];
6540
if(fh<-1e-10 /* && !FPE */) { /* note that 0 may be o.k. here */
6541
FPE=1; matout(F0,x,1,np);
6542
printf("\nfx_r: h = %d r = %d fhK = %.5e ", h+1,ir+1,fh);
6543
if(com.seqtype==0||com.seqtype==2) {
6552
com.fhK[ir*com.npatt+h] = fh;
6554
for(k=0,com.fhK[ir*com.npatt+h]=log(fh); k<com.NnodeScale; k++)
6555
com.fhK[ir*com.npatt+h] += com.nodeScaleF[k*com.npatt+h];
6559
if(com.conPSiteClass) { /* shift pointers conP back */
6561
com.nodeScaleF -= (com.ncatG-1)*com.NnodeScale*(size_t)com.npatt;
6562
for(i=com.ns; i<tree.nnode; i++)
6563
nodes[i].conP -= (com.ncatG-1)*(tree.nnode-com.ns)*com.ncode*(size_t)com.npatt;
6570
double lfun (double x[], int np)
6572
/* likelihood function for models of one rate for all sites including
6575
int h,i,k, ig, FPE=0;
6579
if(SetParameters(x)) puts ("\npar err..");
6580
for(ig=0; ig<com.ngene; ig++) {
6582
SetPGene(ig,1,1,0,x);
6583
ConditionalPNode (tree.root, ig, x);
6585
for(h=com.posG[ig]; h<com.posG[ig+1]; h++) {
6586
if (com.fpatt[h]<=0 && com.print>=0) continue;
6587
for(i=0,fh=0; i<com.ncode; i++)
6588
fh += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];
6590
if(fh<-1e-5 && noisy) {
6591
printf("\nfh = %.6f negative\n",fh);
6595
FPE=1; matout(F0,x,1,np);
6596
printf("lfun: h=%4d fh=%9.6e\nData: ", h+1,fh);
6603
for(k=0; k<com.NnodeScale; k++)
6604
fh += com.nodeScaleF[k*com.npatt+h];
6606
lnL -= fh*com.fpatt[h];
6607
if(LASTROUND==2) dfsites[h] = fh;
6609
print_lnf_site(h,fh);
6618
int print1site (FILE*fout, int h)
6620
/* This print out one site in the sequence data, com.z[]. It may be the h-th
6621
site in the original data file or the h-th pattern. The data are coded.
6622
naa > 1 if the codon codes for more than one amino acid.
6624
char *pch = (com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs)), compatibleAAs[20]="";
6625
int n=com.ncode, i, b, aa=0;
6627
for(i=0; i<com.ns; i++) {
6629
if(com.seqtype==0 || com.seqtype==2)
6630
fprintf(fout,"%c", pch[b]);
6632
else if(com.seqtype==1) {
6633
aa = GetAASiteSpecies(i, h);
6634
fprintf(fout, "%s (%c) ", CODONs[b], aa);
6642
#if(defined MINIMIZATION)
6644
/* November, 1999, Minimization branch by branch */
6645
int noisy_minbranches;
6646
double *space_minbranches, *g_minbranches, *varb_minbranches, e_minbranches;
6648
double minbranches(double xcom[], int np);
6649
int lfunt(double t, int a,int b,double x[],double *l, double space[]);
6650
int lfuntdd(double t, int a,int b,double x[], double *l,double*dl,double*ddl,
6652
int lfunt_SiteClass(double t, int a,int b,double x[],double *l,double space[]);
6653
int lfuntdd_SiteClass(double t, int a,int b,double x[],
6654
double *l,double*dl,double*ddl,double space[]);
6656
int minB (FILE*fout, double *lnL,double x[],double xb[][2],double e0, double space[])
6658
/* This calculates lnL for given values of common parameters by optimizing
6659
branch lengths, cycling through them.
6660
Z. Yang, November 1999
6661
This calls minbranches to optimize branch lengths and ming2 to
6662
estimate other paramters.
6663
At the end of the routine, there is a call to lfun to restore nodes[].conP.
6664
Returns variances of branch lengths in space[].
6665
space[] is com.space[]. com.space may be reallocated here, which may be unsafe
6666
as the pointers in the calling routine may not be pointing to the right places.
6668
return value: 0 convergent; -1: not convergent.
6670
int ntime0=com.ntime, fix_blength0=com.fix_blength;
6671
int status=0, i, npcom=com.np-com.ntime;
6673
double *xcom=x+com.ntime, lnL0= *lnL, dl, e=1e-5;
6674
double (*xbcom)[2]=xb+ntime0;
6675
int small_times=0, max_small_times=100, ir,maxr=(npcom?200:1);
6676
double small_improvement=0.001;
6679
if(com.conPSiteClass) {
6680
s = (2*com.ncode*com.ncode+com.ncode*(size_t)com.npatt)*sizeof(double);
6681
if(com.sspace < s) { /* this assumes that space is com.space */
6682
printf("\n%lu bytes in space, %lu bytes needed\n", com.sspace, s);
6683
printf("minB: reallocating memory for working space.\n");
6684
com.space = (double*)realloc(com.space, s);
6685
if(com.space==NULL) error2("oom space");
6689
g_minbranches = com.space;
6690
varb_minbranches = com.space + com.np;
6691
s = (3*com.ncode*com.ncode + (com.conPSiteClass) * 4 *(size_t)com.npatt) *sizeof(double);
6692
if((space_minbranches=(double*)malloc(s))==NULL)
6694
if(com.ntime==0) error2("minB: should not come here");
6696
if(*lnL<=0) *lnL = com.plfun(x,com.np);
6697
e = e_minbranches = (npcom ? 5.0 : e0);
6698
com.ntime = 0; com.fix_blength = 2;
6700
if(com.NSsites==0) com.pomega = xcom+com.nrgene+!com.fix_kappa;
6703
for(ir=0; (npcom==0||com.method) && ir<maxr; ir++) {
6705
if(noisy>2) printf("\n\nRound %da: Paras (%d) (e=%.6g)",ir+1,npcom,e);
6706
ming2(NULL,lnL,com.plfun,NULL,xcom, xbcom, com.space,e,npcom);
6708
FPN(F0); FOR(i,npcom) printf("%12.6f",xcom[i]);
6709
printf("%8s%s\n", "", printtime(timestr));
6713
noisy_minbranches = noisy;
6715
printf("\nRound %db: Blengths (%d, e=%.6g)\n",ir+1,tree.nbranch,e_minbranches);
6717
*lnL = minbranches(xcom, -1);
6718
for(i=0; i<tree.nnode; i++)
6720
x[nodes[i].ibranch] = nodes[i].branch;
6721
if(noisy>2) printf("\n%s\n", printtime(timestr));
6723
if((dl=fabs(*lnL-lnL0))<e0 && e<=0.02) break;
6724
if(dl<small_improvement) small_times++;
6726
if((small_times>max_small_times && ntime0<200) || (com.method==2&&ir==1)) {
6727
if(noisy && com.method!=2) puts("\nToo slow, switching algorithm.");
6731
if(noisy && small_times>5)
6732
printf("\n%d rounds of small improvement.",small_times);
6734
e/=2; if(dl<1) e/=2;
6735
if(dl<0.5) e = min2(e,1e-3);
6736
else if(dl>10) e = max2(e,0.1);
6737
e_minbranches = max2(e, 1e-6);
6742
fprintf(fout,"%4d %12.5f x ", ir+1,*lnL);
6743
for(i=0;i<com.np;i++) fprintf(fout,"%9.5f",x[i]);
6744
FPN(fout); fflush(fout);
6747
if (npcom && ir==maxr) status=-1;
6749
if(npcom && status==2) {
6750
noisy_minbranches = 0;
6752
com.fix_blength = fix_blength0;
6753
ming2(NULL,lnL,com.plfun,NULL,x,xb, com.space,e0,com.np);
6754
for(i=0; i<tree.nnode; i++) space[i] = -1;
6757
for(i=0; i<tree.nnode; i++)
6758
if(i!=tree.root) x[nodes[i].ibranch] = nodes[i].branch;
6760
if(noisy>2) printf("\nlnL = %12.6f\n",- *lnL);
6763
com.fix_blength = fix_blength0;
6764
*lnL = com.plfun(x,com.np); /* restore things, for e.g. AncestralSeqs */
6765
if(fabs(*lnL-lnL0) > 1e-5)
6766
printf("%.6f != %.6f lnL error. Something is wrong in minB\n", *lnL, lnL0);
6767
free(space_minbranches);
6769
return (status==-1 ? -1 : 0);
6773
/********************* START: Testing iteration algorithm ******************/
6775
int minB2 (FILE*fout, double *lnL,double x[],double xb[][2],double e0, double space[])
6779
int ntime0=com.ntime, fix_blength0=com.fix_blength;
6780
int status=0, i, npcom=com.np-com.ntime;
6782
double *xcom=x+com.ntime, lnL0= *lnL;
6783
double (*xbcom)[2]=xb+ntime0;
6785
s = (3*com.ncode*com.ncode + (com.conPSiteClass) * 4*(size_t)com.npatt) * sizeof(double);
6786
if((space_minbranches=(double*)malloc(s))==NULL) error2("oom minB2");
6787
if(com.ntime==0 || npcom==0) error2("minB2: should not come here");
6789
noisy_minbranches=0;
6790
/* if(*lnL<=0) *lnL=com.plfun(x,com.np); */
6791
com.ntime=0; com.fix_blength=2;
6793
if(com.NSsites==0) com.pomega=xcom+com.nrgene+!com.fix_kappa;
6796
ming2(NULL, lnL, minbranches, NULL, xcom, xbcom, space, e0, npcom);
6799
com.ntime = ntime0; com.fix_blength = fix_blength0;
6800
for(i=0; i<tree.nnode; i++)
6801
if(i!=tree.root) x[nodes[i].ibranch] = nodes[i].branch;
6802
*lnL = com.plfun(x,com.np); /* restore things, for e.g. AncestralSeqs */
6803
free(space_minbranches);
6805
return (status==-1 ? -1 : 0);
6808
/********************* END: Testing iteration algorithm ******************/
6812
int updateconP (double x[], int inode)
6814
/* update conP for inode.
6816
Confusing decision about x[] follows. Think about redesign.
6818
(1) Called by PostProbNode for ancestral reconstruction, with com.clock = 0,
6819
1, 2: x[] is passed over and com.ntime is used to get xcom in
6821
(2) Called from minbranches(), with com.clock = 0. xcom[] is passed
6822
over by minbranches and com.ntime=0 is set. So SetPSiteClass()
6823
can still get the correct substitution parameters.
6824
Also look at ConditionalPNode().
6826
Note that com.nodeScaleF and nodes[].conP are shifted if(com.conPSiteClass).
6830
if(com.conPSiteClass==0)
6831
for(ig=0; ig<com.ngene; ig++) {
6832
if(com.Mgene>1 || com.nalpha>1)
6833
SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,x);
6834
/* x[] needed by local clock models and if(com.aaDist==AAClasses).
6835
This is called from PostProbNode */
6836
ConditionalPNode(inode, ig, x);
6838
else { /* site-class models */
6845
com.nodeScaleF += com.NnodeScale*(size_t)com.npatt;
6846
for(i=com.ns; i<tree.nnode; i++)
6847
nodes[i].conP += (tree.nnode-com.ns)*com.ncode*(size_t)com.npatt;
6849
SetPSiteClass(ir, x);
6850
for(ig=0; ig<com.ngene; ig++) {
6851
if(com.Mgene>1 || com.nalpha>1)
6852
SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,x);
6853
if(com.nalpha>1) SetPSiteClass(ir, x);
6854
ConditionalPNode(inode,ig, x);
6858
/* shift positions */
6859
com.nodeScaleF-=(com.ncatG-1)*com.NnodeScale*com.npatt;
6860
for(i=com.ns; i<tree.nnode; i++)
6861
nodes[i].conP -= (com.ncatG-1)*(tree.nnode-com.ns)*com.ncode*(size_t)com.npatt;
6867
double minbranches (double x[], int np)
6869
/* Z. Yang, November 1999.
6870
optimizing one branch at a time
6872
for each branch a..b, reroot the tree at b, and
6873
then calculate conditional probability for node a.
6874
For each branch, this routine determines the Newton search direction
6875
p = -dl/dll. It then halves the steplength to make sure -lnL is decreased.
6876
When the Newton solution is correct, this strategy will waste one
6877
extra call to lfunt. It does not seem possible to remove calculation of
6878
l (lnL) in lfuntddl().
6879
lfun or lfundG and thus SetParameters are called once beforehand to set up
6880
globals like com.pomega.
6881
This works with NSsites and NSbranch models.
6883
com.oldconP[] marks nodes that need to be updated when the tree is rerooted.
6884
The array is declared in baseml and codeml and used in the following
6885
routines: ReRootTree, minbranches, and ConditionalPNode.
6887
Note: At the end of the routine, nodes[].conP are not updated.
6889
int ib,oldroot=tree.root, a,b;
6890
int icycle, maxcycle=1000, icycleb, ncycleb=10, i;
6891
double lnL, lnL0=0, l0,l,dl,ddl=-1, t,t0,t00, p,step=1, small=1e-20,y;
6892
double tb[2]={1e-8,50}, e=e_minbranches, *space=space_minbranches;
6893
double *xcom=x+com.ntime; /* this is incorrect as com.ntime=0 */
6894
double smallddl=0.25/com.ls*(1-0.25/com.ls)/com.ls;
6896
if(com.ntime) error2("ntime should be 0 in minbranches");
6897
lnL0 = l0 = l = lnL = com.plfun(xcom,-1);
6899
if(noisy_minbranches>2) printf("\tlnL0 = %14.6f\n",-lnL0);
6901
for(icycle=0; icycle<maxcycle; icycle++) {
6902
for(ib=0; ib<tree.nbranch; ib++) {
6903
t = t0 = t00 = nodes[tree.branches[ib][1]].branch;
6905
a = tree.branches[ib][0];
6906
b = tree.branches[ib][1];
6907
for(i=0; i<tree.nnode; i++)
6912
for(icycleb=0; icycleb<ncycleb; icycleb++) { /* iterating a branch */
6913
if(!com.conPSiteClass)
6914
lfuntdd(t, a, b, xcom, &y, &dl, &ddl, space);
6916
lfuntdd_SiteClass(t, a, b, xcom, &y, &dl, &ddl, space);
6918
if(fabs(y-l)>1e-3 && noisy_minbranches>2)
6919
printf("\nWarning rounding error? b=%d cycle=%d lnL=%12.7f != %12.7f\n",ib,icycleb,l,y);
6921
/* p = -dl/ddl; newton direction */
6922
if (fabs(p)<small) step = 0;
6923
else if(p<0) step = min2(1,(tb[0]-t0)/p);
6924
else step = min2(1,(tb[1]-t0)/p);
6926
if(icycle==0 && step!=1 && step!=0) step *= 0.99; /* avoid border */
6927
for (i=0; step>small; i++,step/=4) {
6929
if(!com.conPSiteClass) lfunt(t, a, b, xcom, &l, space);
6930
else lfunt_SiteClass(t, a, b, xcom, &l, space);
6933
if(step<=small) { t=t0; l=l0; break; }
6934
if(fabs(t-t0)<e*fabs(1+t) && fabs(l-l0)<e) break;
6937
nodes[a].branch = t;
6939
g_minbranches[ib] = -dl;
6940
varb_minbranches[ib] = -ddl;
6943
if(noisy_minbranches>2) printf("\tCycle %2d: %14.6f\n",icycle+1, -l);
6944
if(fabs(lnL-lnL0) < e) break;
6946
} /* for (icycle) */
6947
ReRootTree(oldroot); /* did not update conP */
6948
FOR(i,tree.nnode) com.oldconP[i]=0;
6954
int lfunt(double t, int a, int b, double xcom[], double *l, double space[])
6956
/* See notes for lfunt_dd and minbranches
6958
int i,j,k, h,ig, n=com.ncode, nroot=n;
6959
int n1 = (com.cleandata&&b<com.ns ? 1 : n), xb;
6960
double expt,uexpt=0,multiply;
6961
double *P=space, piqi,pqj, fh, mr=0;
6965
pkappa=(com.hkyREV||com.codonf==FMutSel ? xcom+com.nrgene : &com.kappa);
6966
if (com.seqtype==CODONseq && com.model) {
6967
if(com.model==2 && com.nOmega<=5) {
6968
U = _UU[(int)nodes[a].label];
6969
V = _VV[(int)nodes[a].label];
6970
Root = _Root[(int)nodes[a].label];
6973
EigenQcodon(0,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, nodes[a].omega, PMat);
6980
EigenTN93(com.model,nodes[a].kappa,1,com.pi,&nR,Root,Cijk);
6985
for (ig=0; ig<com.ngene; ig++) {
6986
if(com.Mgene>1) SetPGene(ig,1,1,0,xcom); /* com.ntime=0 */
6987
for(i=0; i<n*n; i++) P[i] = 0;
6989
for(k=0,expt=1; k<nroot; k++) {
6990
multiply = com.rgene[ig]*Root[k];
6991
if(k) expt = exp(t*multiply);
6993
#if (CODEML) /* uses U & V */
6995
for(j=0,uexpt=U[i*n+k]*expt; j<n; j++)
6996
P[i*n+j] += uexpt*V[k*n+j];
6997
#elif (BASEML) /* uses Cijk */
6998
for(i=0; i<n; i++) for(j=0; j<n; j++)
6999
P[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt;
7003
for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
7004
n1 = (b<com.ns ? nChara[com.z[b][h]] : n);
7005
for(i=0,fh=0; i<n1; i++) {
7007
if(b<com.ns) piqi = com.pi[ xb = CharaMap[com.z[b][h]][i] ];
7008
else piqi = com.pi[i] * nodes[b].conP[h*n+i];
7010
for(j=0,pqj=0; j<n; j++)
7011
pqj += P[xb*n+j]*nodes[a].conP[h*n+j];
7014
if(noisy && fh<1e-250)
7015
printf("a bit too small: fh[%d] = %10.6e\n",h,fh);
7019
*l -= fh*com.fpatt[h];
7020
for(i=0; i<com.NnodeScale; i++)
7021
*l -= com.nodeScaleF[i*com.npatt+h]*com.fpatt[h];
7028
int lfuntdd(double t, int a, int b, double xcom[], double *l, double*dl, double*ddl, double space[])
7030
/* Calculates lnL for branch length t for branch b->a.
7031
See notes in minbranches().
7032
Conditional probability updated correctly already.
7036
int i,j,k, h,ig,n=com.ncode, nroot=n;
7037
int n1 = (com.cleandata&&b<com.ns ? 1 : n), xb;
7038
double expt, uexpt = 0, multiply;
7039
double *P=space, *dP=P+n*n,*ddP=dP+n*n, piqi,pqj,dpqj,ddpqj, fh, dfh, ddfh;
7040
double *pkappa, mr=0;
7043
pkappa=(com.hkyREV||com.codonf==FMutSel ? xcom+com.nrgene : &com.kappa);
7044
if (com.seqtype==CODONseq && com.model) {
7045
if(com.model==2 && com.nOmega<=5) {
7046
U = _UU[(int)nodes[a].label];
7047
V = _VV[(int)nodes[a].label];
7048
Root = _Root[(int)nodes[a].label];
7051
EigenQcodon(0,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, nodes[a].omega,PMat);
7058
EigenTN93(com.model,nodes[a].kappa,1,com.pi,&nR,Root,Cijk);
7061
*l = *dl = *ddl = 0;
7062
for(ig=0; ig<com.ngene; ig++) {
7063
if(com.Mgene>1) SetPGene(ig,1,1,0,xcom); /* com.ntime=0 */
7064
for(i=0; i<n*n; i++) P[i] = dP[i] = ddP[i] = 0;
7066
for(k=0,expt=1; k<nroot; k++) {
7067
multiply = com.rgene[ig]*Root[k];
7068
if(k) expt = exp(t*multiply);
7070
#if (CODEML) /* uses U & V */
7072
for(j=0,uexpt=U[i*n+k]*expt; j<n; j++) {
7073
P[i*n+j] += uexpt*V[k*n+j];
7075
dP[i*n+j] += uexpt*V[k*n+j]*multiply;
7076
ddP[i*n+j] += uexpt*V[k*n+j]*multiply*multiply;
7079
#elif (BASEML) /* uses Cijk */
7080
for(i=0; i<n; i++) for(j=0; j<n; j++) {
7081
P[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt;
7083
dP[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt*multiply;
7084
ddP[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt*multiply*multiply;
7089
for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
7090
n1 = (b<com.ns ? nChara[com.z[b][h]] : n);
7091
for(i=0,fh=dfh=ddfh=0; i<n1; i++) {
7093
if(b<com.ns) piqi = com.pi[ xb = CharaMap[com.z[b][h]][i] ];
7094
else piqi = com.pi[i] * nodes[b].conP[h*n+i];
7095
for(j=0,pqj=dpqj=ddpqj=0; j<n; j++) {
7096
pqj += P[xb*n+j] * nodes[a].conP[h*n+j];
7097
dpqj += dP[xb*n+j] * nodes[a].conP[h*n+j];
7098
ddpqj += ddP[xb*n+j] * nodes[a].conP[h*n+j];
7104
if(noisy && fh<1e-250) {
7105
printf("too small: fh[%d] = %10.6e\n",h,fh);
7108
*l -= log(fh)*com.fpatt[h];
7109
for(i=0; i<com.NnodeScale; i++)
7110
*l -= com.nodeScaleF[i*com.npatt+h]*com.fpatt[h];
7111
*dl -= dfh/fh * com.fpatt[h];
7112
*ddl -= (fh*ddfh - dfh*dfh)/(fh*fh) * com.fpatt[h];
7119
int lfunt_SiteClass(double t, int a, int b, double xcom[], double *l, double space[])
7121
/* see notes in lfuntdd_SiteClass
7122
For branch&site models, look at the notes in GetPMatBranch()
7124
int i,j,k, h,ig,ir,it, n=com.ncode, nroot=n;
7125
int n1=(com.cleandata&&b<com.ns?1:n), xb;
7126
double y,expt,uexpt=0,multiply, piqi,pqj;
7127
double *P=space, *fh=P+n*n;
7128
double *Sh=fh+com.npatt; /* scale factor for each site pattern*/
7129
double *pK=com.fhK; /* proportion for each site class after scaling */
7130
double smallw=1e-12;
7134
EigenTN93(com.model,nodes[a].kappa,1,com.pi,&nR,Root,Cijk);
7138
if(com.NnodeScale==0)
7139
for(ir=0; ir<com.ncatG; ir++)
7140
for (h=0; h<com.npatt; h++)
7141
pK[ir*com.npatt+h] = com.freqK[ir];
7143
for(h=0; h<com.npatt; h++) {
7144
for(ir=0,it=0; ir<com.ncatG; ir++) {
7145
for(k=0,y=0; k<com.NnodeScale; k++)
7146
y += com.nodeScaleF[ir*com.NnodeScale*com.npatt + k*com.npatt+h];
7147
if((pK[ir*com.npatt+h]=y) > pK[it*com.npatt+h])
7150
Sh[h] = pK[it*com.npatt+h];
7151
for(ir=0; ir<com.ncatG; ir++)
7152
pK[ir*com.npatt+h] = com.freqK[ir]*exp(pK[ir*com.npatt+h]-Sh[h]);
7156
for(h=0; h<com.npatt; h++) fh[h] = 0;
7157
for(ir=0; ir<com.ncatG; ir++) {
7158
SetPSiteClass(ir, xcom); /* com.ntime=0 */
7160
#if CODEML /* branch b->a */
7161
/* branch&site models */
7162
if(com.seqtype==CODONseq && com.NSsites && com.model)
7163
Set_UVR_BranchSite (ir, (int)nodes[a].label);
7167
for(i=com.ns;i<tree.nnode;i++)
7168
nodes[i].conP += (tree.nnode-com.ns)*n*(size_t)com.npatt;
7170
for (ig=0; ig<com.ngene; ig++) {
7171
if(com.Mgene>1 || com.nalpha>1)
7172
SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,xcom); /* com.ntime=0 */
7173
if(com.nalpha>1) SetPSiteClass(ir, xcom); /* com.ntime=0 */
7175
for(i=0; i<n*n; i++) P[i] = 0;
7176
for(k=0,expt=1; k<nroot; k++) {
7177
multiply = com.rgene[ig]*Root[k]*_rateSite;
7179
if(com.seqtype==1 && com.model>=2)
7180
multiply *= Qfactor_NS_branch[(int)nodes[a].label];
7182
if(k) expt = exp(t*multiply);
7184
#if (CODEML) /* uses U & V */
7186
for(j=0,uexpt=U[i*n+k]*expt; j<n; j++)
7187
P[i*n+j] += uexpt*V[k*n+j];
7188
#elif (BASEML) /* uses Cijk */
7191
P[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt;
7193
} /* for (k), look through eigenroots */
7194
for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
7195
n1 = (b<com.ns ? nChara[com.z[b][h]] : n);
7196
for(i=0; i<n1; i++) {
7198
if(b<com.ns) piqi = pK[ir*com.npatt+h] * com.pi[ xb = CharaMap[com.z[b][h]][i] ];
7199
else piqi = pK[ir*com.npatt+h] * com.pi[i] * nodes[b].conP[h*n+i];
7201
for(j=0,pqj=0; j<n; j++)
7202
pqj += P[xb*n+j]*nodes[a].conP[h*n+j];
7209
for(i=com.ns; i<tree.nnode; i++) /* shift position */
7210
nodes[i].conP -= (com.ncatG-1)*(tree.nnode-com.ns)*n*(size_t)com.npatt;
7211
for(h=0,*l=0; h<com.npatt; h++) {
7213
printf("small (lfunt_SiteClass): fh[%d] = %10.6e\n",h,fh[h]);
7215
*l -= log(fh[h])*com.fpatt[h];
7216
if(com.NnodeScale) *l -= Sh[h]*com.fpatt[h];
7222
int lfuntdd_SiteClass(double t, int a,int b,double xcom[],
7223
double *l,double*dl,double*ddl,double space[])
7225
/* dt and ddt for site-class models, modified from lfuntdd()
7226
nodes[].conP (and com.nodeScaleF if scaling is used) is shifted for ir,
7227
and moved back to the rootal place at the end of the routine.
7229
At the start of this routine, nodes[].conP has the conditional probabilties
7230
for each node, each site pattern, for each site class (ir).
7231
Scaling: When scaling is used, scale factors
7232
com.nodeScaleF[ir*com.NnodeScale*com.npatt + k*com.npatt+h] for all nodes
7233
are collected into Sh[h], after adjusting for rate classes, since the
7234
sum is taken over ir. Sh[h] and pK[ir*com.npatt+h] together store the
7235
scale factors and proportions for site classes. com.freqK[ir] is not
7236
used in this routine beyond this point.
7237
if(com.Malpha), com.freqK[]=1/com.ncatG and does not change with ig,
7238
and so the collection of Sh for sites at the start of the routine is o.k.
7240
The space for com.fhK[] is used.
7241
space[2*ncode*ncode + 4*npatt]:
7242
dP[ncode*ncode],ddP[ncode*ncode],fh[npatt],dfh[npatt],ddfh[npatt],Sh[npatt]
7243
pK[ncatG*npatt]=com.fhK[]
7245
int i,j,k, h,ig,ir,it, n=com.ncode, nroot=n;
7246
int n1=(com.cleandata&&b<com.ns?1:n), xb;
7247
double y,expt,uexpt=0,multiply, piqi,pqj,dpqj,ddpqj;
7248
double *P=PMat, *dP=space,*ddP=dP+n*n;
7249
double *fh=ddP+n*n, *dfh=fh+com.npatt, *ddfh=dfh+com.npatt;
7250
double *Sh=ddfh+com.npatt; /* scale factor for each site pattern */
7251
double *pK=com.fhK; /* proportion for each site class after scaling */
7252
double smallw=1e-12;
7257
EigenTN93(com.model, nodes[a].kappa, 1, com.pi, &nR, Root, Cijk);
7260
if(com.NnodeScale==0)
7261
for(ir=0; ir<com.ncatG; ir++)
7262
for(h=0; h<com.npatt; h++)
7263
pK[ir*com.npatt+h] = com.freqK[ir];
7265
for(h=0; h<com.npatt; h++) {
7266
for(ir=0,it=0; ir<com.ncatG; ir++) {
7267
for(k=0,y=0; k<com.NnodeScale; k++)
7268
y += com.nodeScaleF[ir*com.NnodeScale*com.npatt + k*com.npatt+h];
7269
if((pK[ir*com.npatt+h]=y) > pK[it*com.npatt+h])
7272
Sh[h] = pK[it*com.npatt+h];
7273
for(ir=0; ir<com.ncatG; ir++)
7274
pK[ir*com.npatt+h] = com.freqK[ir] * exp(pK[ir*com.npatt+h]-Sh[h]);
7278
for(h=0; h<com.npatt; h++)
7279
fh[h] = dfh[h] = ddfh[h] = 0;
7280
for(ir=0; ir<com.ncatG; ir++) {
7281
SetPSiteClass(ir, xcom); /* com.ntime=0 */
7283
#if CODEML /* branch b->a */
7284
/* branch&site models */
7285
if(com.seqtype==CODONseq && com.NSsites && com.model)
7286
Set_UVR_BranchSite (ir, (int)nodes[a].label);
7290
for(i=com.ns; i<tree.nnode; i++)
7291
nodes[i].conP += (tree.nnode-com.ns)*n*(size_t)com.npatt;
7293
for (ig=0; ig<com.ngene; ig++) {
7294
if(com.Mgene>1 || com.nalpha>1)
7295
SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,xcom); /* com.ntime=0 */
7296
if(com.nalpha>1) SetPSiteClass(ir, xcom); /* com.ntime=0 */
7298
for(i=0; i<n*n; i++)
7299
P[i] = dP[i] = ddP[i]=0;
7300
for(k=0,expt=1; k<nroot; k++) { /* k loops through eigenroots */
7301
multiply = com.rgene[ig]*Root[k]*_rateSite;
7303
if(com.seqtype==1 && com.model>=2)
7304
multiply *= Qfactor_NS_branch[(int)nodes[a].label];
7306
if(k) expt = exp(t*multiply);
7308
#if (CODEML) /* uses U & V */
7310
for(j=0,uexpt=U[i*n+k]*expt; j<n; j++) {
7311
P[i*n+j] += uexpt*V[k*n+j];
7313
dP[i*n+j] += uexpt*V[k*n+j]*multiply;
7314
ddP[i*n+j] += uexpt*V[k*n+j]*multiply*multiply;
7317
#elif (BASEML) /* uses Cijk */
7318
for(i=0; i<n; i++) for(j=0; j<n; j++) {
7319
P[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt;
7321
dP[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt*multiply;
7322
ddP[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt*multiply*multiply;
7328
for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
7329
n1 = (b<com.ns ? nChara[com.z[b][h]] : n);
7330
for(i=0; i<n1; i++) {
7333
piqi = pK[ir*com.npatt+h] * com.pi[ xb = CharaMap[com.z[b][h]][i] ];
7335
piqi = pK[ir*com.npatt+h] * com.pi[i] * nodes[b].conP[h*n+i];
7337
for(j=0,pqj=dpqj=ddpqj=0; j<n; j++) {
7338
pqj += P[xb*n+j]*nodes[a].conP[h*n+j];
7339
dpqj += dP[xb*n+j]*nodes[a].conP[h*n+j];
7340
ddpqj += ddP[xb*n+j]*nodes[a].conP[h*n+j];
7343
dfh[h] += piqi*dpqj;
7344
ddfh[h] += piqi*ddpqj;
7350
for(i=com.ns; i<tree.nnode; i++)
7351
nodes[i].conP -= (com.ncatG-1)*(tree.nnode-com.ns)*n*(size_t)com.npatt;
7352
for(h=0,*l=*dl=*ddl=0; h<com.npatt; h++) {
7354
printf("small fh[%d] = %10.6e\n",h,fh[h]);
7356
*l -= log(fh[h])*com.fpatt[h];
7357
if(com.NnodeScale) *l -= Sh[h]*com.fpatt[h];
7358
*dl -= dfh[h]/fh[h] * com.fpatt[h];
7359
*ddl -= (fh[h]*ddfh[h] - dfh[h]*dfh[h])/(fh[h]*fh[h]) * com.fpatt[h];
7368
#endif /* #ifdef LFUNCTIONS */
7372
void BranchLengthBD(int rooted, double birth, double death, double sample,
7375
/* Generate random branch lengths (nodes[].branch) using the birth and
7376
death process with species sampling, or the Yule (coalescent?) process
7377
if sample=0, when only parameter mut is used.
7378
Note: older interior nodes have larger node numbers, so root is at
7379
node com.ns*2-2 with time t[ns-2], while the youngest node is at
7380
node com.ns with time t[0]. When unrooted=0, the root is removed with
7381
branch lengths adjusted.
7382
This works with the tree generated from RandomLHistory().
7384
int i,j, it, imin,fixt0=1;
7385
double la=birth, mu=death, rho=sample, tmin, r, t[NS-1];
7388
if (sample==0) /* coalescent model. Check this!!! */
7389
for (i=com.ns,y=0; i>1; i--)
7390
nodes[com.ns*2-i].age=y += -log(rndu())/(i*(i-1.)/2.)*mut/2;
7391
else { /* BD with sampling */
7392
if (fixt0) t[com.ns-2]=1;
7393
if (fabs(la-mu)>1e-6) {
7395
phi = (rho*la*(eml-1)+(mu-la)*eml)/(eml-1);
7396
for (i=0; i<com.ns-1-(fixt0); i++) {
7397
r=rndu(); t[i]=log((phi-r*rho*la)/(phi-r*rho*la+r*(la-mu)))/(mu-la);
7401
for (i=0; i<com.ns-1-(fixt0); i++)
7402
{ r=rndu(); t[i]=r/(1+la*rho*(1-r)); }
7404
for (i=0; i<com.ns-1-1; i++) {
7405
for (j=i+1,tmin=t[i],imin=i; j<com.ns-1; j++)
7406
if (tmin>t[j]) { tmin=t[j]; imin=j; }
7410
for (i=com.ns; i>1; i--) nodes[com.ns*2-i].age=t[com.ns-i]*mut;
7412
for(i=0; i<com.ns; i++) nodes[i].age = 0;
7413
for (i=0; i<tree.nnode; i++)
7415
nodes[i].branch = nodes[nodes[i].father].age - nodes[i].age;
7417
it = nodes[tree.root].sons[2];
7418
nodes[it].branch = 2*nodes[2*com.ns-2].age - nodes[tree.root].age - nodes[it].age;
7425
#ifdef NODESTRUCTURE
7428
int RandomLHistory (int rooted, double space[])
7430
/* random coalescence tree, with each labeled history having equal probability.
7431
interior nodes are numbered ns, ns+1, ..., 2*ns-1-!rooted
7433
int ns=com.ns, i, j, it=0, *nodea=(int*)space;
7436
for (i=0; i<2*ns-1-!rooted; i++) ClearNode(i);
7438
for (i=0; i<ns; i++) nodea[i]=i;
7439
for (i=ns,t=0; i>(1+!rooted); i--) {
7440
nodes[it=2*ns-i].nson = 2;
7441
j = (int)(i*rndu());
7442
nodes[nodea[j]].father = it;
7443
nodes[it].sons[0] = nodea[j];
7444
nodea[j] = nodea[i-1];
7445
j = (int)((i-1)*rndu());
7446
nodes[nodea[j]].father = it;
7447
nodes[it].sons[1] = nodea[j];
7449
if (!rooted && i==3) {
7451
nodes[nodea[1-j]].father = it;
7452
nodes[it].sons[2] = nodea[1-j];
7456
tree.nnode = ns*2-1-!rooted;
7463
#endif /* NODESTRUCTURE */
7467
/* routines for dating analysis of heterogeneous data */
7468
#if (defined BASEML || defined CODEML || defined MCMCTREE)
7471
#if (defined MCMCTREE)
7473
int ProcessFossilInfo()
7475
/* This processes fossil calibration information that has been read into
7476
nodes[].nodeStr. It uses both sptree and nodes[], before it is destroyed.
7477
This is called before sequence alignments at loci are read.
7479
Possible confusions:
7480
Simple lower and upper bounds can be specified using <, >, or both < and > in
7481
the tree either with or without quotation marks. These are read in ReadTreeN()
7482
and processed in ReadTreeSeqs().
7483
Other distributions such as G, SN, ST must be specified using the format 'G(alpha, beta)',
7484
say, and are processed here. Simple bounds can also be specified using the format
7485
'L(0.5)', 'U(1.0)', or 'B(0.5, 1.0)', in which case they are processed here.
7486
I kept this complexity, (i) to keep the option of using <, >, which is intuitive,
7487
(ii) for ReadTreeN to be able to read other node labels such as #, $, either with
7490
int i,j,k, nfossiltype=8;
7492
double tailL=0.025, tailR=0.025, p_LOWERBOUND=0.1, c_LOWERBOUND=1.0;
7494
for(i=sptree.nspecies; i<tree.nnode; i++) {
7495
if(nodes[i].nodeStr == NULL)
7497
if(sptree.nodes[i].fossil) { /* fossila specified using <, >, already processed. */
7498
free(nodes[i].nodeStr);
7501
for(j=1; j<nfossiltype+1; j++)
7502
if((pch = strstr(nodes[i].nodeStr, fossils[j]))) break;
7503
if(j == nfossiltype+1)
7504
printf("\nunrecognized fossil calibration: %s\n", nodes[i].nodeStr);
7506
sptree.nodes[i].fossil = j;
7507
pch = strchr(nodes[i].nodeStr, '(') + 1;
7511
/* truncated Cauchy default prior L(tL, p, c) */
7512
sptree.nodes[i].pfossil[1] = p_LOWERBOUND;
7513
sptree.nodes[i].pfossil[2] = c_LOWERBOUND;
7514
sptree.nodes[i].pfossil[3] = tailL;
7515
sscanf(pch, "%lf,%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1],
7516
&sptree.nodes[i].pfossil[2], &sptree.nodes[i].pfossil[3]);
7519
sptree.nodes[i].pfossil[2] = tailR;
7520
sscanf(pch, "%lf,%lf", &sptree.nodes[i].pfossil[1], &sptree.nodes[i].pfossil[2]);
7523
sptree.nodes[i].pfossil[2] = tailL;
7524
sptree.nodes[i].pfossil[3] = tailR;
7525
sscanf(pch, "%lf,%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1],
7526
&sptree.nodes[i].pfossil[2], &sptree.nodes[i].pfossil[3]);
7527
if(sptree.nodes[i].pfossil[0] > sptree.nodes[i].pfossil[1]) {
7528
printf("fossil bounds (%.4f, %.4f)", sptree.nodes[i].pfossil[0], sptree.nodes[i].pfossil[1]);
7529
error2("fossil bounds in tree incorrect");
7533
sscanf(pch, "%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1]);
7536
sscanf(pch, "%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1], &sptree.nodes[i].pfossil[2]);
7539
sscanf(pch, "%lf,%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1], &sptree.nodes[i].pfossil[2], &sptree.nodes[i].pfossil[3]);
7542
sscanf(pch, "%lf,%lf,%lf,%lf,%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1],
7543
&sptree.nodes[i].pfossil[2], &sptree.nodes[i].pfossil[3], &sptree.nodes[i].pfossil[4],
7544
&sptree.nodes[i].pfossil[5], &sptree.nodes[i].pfossil[6]);
7549
sptree.nodes[i].usefossil = 1;
7550
nodes[i].branch = nodes[i].label = 0;
7551
free(nodes[i].nodeStr);
7560
int GenerateGtree (int locus);
7562
int ReadTreeSeqs (FILE*fout)
7564
/* This reads the combined species tree, the fossil calibration information,
7565
and sequence data at each locus. sptree.nodes[].pfossil[] has tL, tU for
7566
bounds or alpha and beta for the gamma prior.
7568
This routine also processes fossil calibration information specified using
7569
<, >, or both. More complex specifications are stored in nodes[].nodeStr and
7570
processed in ProcessFossilInfo(). See notes in that routine.
7572
This also constructs the gene tree at each locus, by pruning the master
7576
int i,j, h, locus, clean0=com.cleandata;
7577
double tailL=0.025, tailR=0.025, p_LOWERBOUND=0.1, c_LOWERBOUND=1.0;
7579
ftree = gfopen(com.treef,"r");
7581
/* read master species tree and process fossil calibration info */
7582
fscanf(ftree, "%d%d", &sptree.nspecies, &i);
7583
com.ns = sptree.nspecies;
7584
if(com.ns>NS) error2("raise NS?");
7585
/* to read master species names into sptree.nodes[].name */
7586
if(noisy) puts("Reading master tree.");
7587
for(j=0; j<sptree.nspecies; j++)
7588
com.spname[j] = sptree.nodes[j].name;
7591
ReadTreeN(ftree, &i, &j, 1, 1);
7593
for(i=j=0; i<tree.nnode; i++)
7594
if(i!=tree.root && nodes[i].branch>0) j++;
7596
printf("\aTree with fossil calibrations should not have branch lengths!");
7598
if(com.clock==5 || com.clock==6)
7599
for(i=0; i<tree.nnode; i++) nodes[i].branch = nodes[i].label = 0;
7600
for(i=0; i<tree.nnode; i++)
7601
if(nodes[i].label<0) nodes[i].label = 0; /* change -1 into 0 */
7603
/* OutTreeN(F0,0,0); FPN(F0); */
7604
OutTreeN(F0,1,0); FPN(F0);
7605
/* OutTreeN(F0,1,1); FPN(F0); */
7606
/* copy master tree into sptree */
7607
if(tree.nnode != 2*com.ns-1)
7608
error2("check and think about multificating trees.");
7609
sptree.nnode = tree.nnode; sptree.nbranch = tree.nbranch;
7610
sptree.root = tree.root; sptree.nfossil = 0;
7611
for(i=0; i<sptree.nspecies*2-1; i++) {
7612
sptree.nodes[i].father = nodes[i].father;
7613
sptree.nodes[i].nson = nodes[i].nson;
7614
if(nodes[i].nson!=0 && nodes[i].nson!=2)
7615
error2("master tree has to be binary.");
7616
for(j=0; j<sptree.nodes[i].nson; j++)
7617
sptree.nodes[i].sons[j] = nodes[i].sons[j];
7619
sptree.nodes[i].fossil = nodes[i].fossil;
7620
sptree.nodes[i].age = nodes[i].age;
7621
sptree.nodes[i].pfossil[0] = nodes[i].branch; /* ">": Lower bound */
7622
sptree.nodes[i].pfossil[1] = nodes[i].label; /* "<": Upper bound */
7624
if(nodes[i].branch && nodes[i].label > 0) { /* fossil calibration */
7625
if(nodes[i].age == 0) {
7626
sptree.nodes[i].fossil = BOUND_F;
7627
sptree.nodes[i].pfossil[2] = tailL;
7628
sptree.nodes[i].pfossil[3] = tailR;
7631
error2("\nUse 'G(alpha, beta)' to specify the gamma calibration");
7635
else if(nodes[i].branch) {
7636
sptree.nodes[i].fossil = LOWER_F;
7638
/* truncated Cauchy default prior L(tL, p, c) */
7639
sptree.nodes[i].pfossil[1] = p_LOWERBOUND;
7640
sptree.nodes[i].pfossil[2] = c_LOWERBOUND;
7641
sptree.nodes[i].pfossil[3] = tailL;
7643
else if(nodes[i].label > 0) {
7644
sptree.nodes[i].fossil = UPPER_F;
7646
sptree.nodes[i].pfossil[2] = tailR;
7649
if(sptree.nodes[i].fossil)
7650
sptree.nodes[i].usefossil = 1;
7652
nodes[i].branch = nodes[i].label = 0;
7655
#if (defined MCMCTREE)
7656
ProcessFossilInfo();
7659
/* read sequences at each locus, construct gene tree by pruning sptree */
7660
data.ngene = com.ndata;
7662
fseq = gfopen(com.seqf,"r");
7663
if((gnodes=(struct TREEN**)malloc(sizeof(struct TREEN*)*data.ngene)) == NULL)
7666
printf("\nReading sequence data.. %d loci\n", data.ngene);
7667
for(locus=0; locus<data.ngene; locus++) {
7668
fprintf(fout, "\n\n*** Locus %d ***\n", locus+1);
7669
printf("\n\n*** Locus %d ***\n", locus+1);
7671
com.cleandata=(char)clean0;
7672
for(j=0; j<sptree.nspecies; j++)
7673
com.spname[j] = NULL; /* points to nowhere */
7674
#if (defined CODEML)
7675
if(com.seqtype==1) {
7676
com.icode = data.icode[locus];
7680
ReadSeq (fout, fseq, clean0); /* allocates com.spname[] */
7681
#if (defined CODEML)
7682
if(com.seqtype == 1) {
7683
if(com.sspace < max2(com.ngene+1,com.ns)*(64+12+4)*sizeof(double)) {
7684
com.sspace = max2(com.ngene+1,com.ns)*(64+12+4)*sizeof(double);
7685
if((com.space = (double*)realloc(com.space,com.sspace))==NULL)
7686
error2("oom space for #c");
7688
InitializeCodon(fout,com.space);
7692
if(com.seqtype==0 || com.seqtype==2)
7693
InitializeBaseAA(fout);
7695
if((com.seqtype==0 || com.seqtype==2) && com.model==0)
7696
PatternWeightJC69like(fout);
7697
xtoy(com.pi, data.pi[locus], com.ncode);
7699
data.cleandata[locus] = (char)com.cleandata;
7701
data.ns[locus] = com.ns;
7702
data.ls[locus] = com.ls;
7703
data.npatt[locus] = com.npatt;
7704
data.fpatt[locus] = com.fpatt; com.fpatt=NULL;
7705
for(i=0; i<com.ns; i++) {
7706
data.z[locus][i] = com.z[i];
7710
printf("%3d patterns, %s\n", com.npatt,(com.cleandata?"clean":"messy"));
7711
GenerateGtree(locus); /* free com.spname[] */
7713
for(i=0,com.cleandata=1; i<data.ngene; i++)
7714
if(data.cleandata[i]==0)
7717
fclose(ftree); fclose(fseq);
7724
int GenerateGtree (int locus)
7726
/* construct the gene tree at locus by pruning tips in the master species
7727
tree. com.spname[] have names of species at the current locus and
7728
the routine use them to compare with sptree.nodes[].name to decide which
7729
species to keep for the locus. See GetSubTreeN() for more info.
7731
int ns=data.ns[locus], i,j, ipop[NS], keep[NS], newnodeNO[2*NS-1];
7733
for(j=0;j<sptree.nspecies;j++) keep[j]=0;
7735
for(j=0;j<sptree.nspecies;j++)
7736
if(!strcmp(com.spname[i], sptree.nodes[j].name)) break;
7737
if(j==sptree.nspecies) {
7738
printf("species %s not found in master tree\n", com.spname[i]);
7741
keep[j]=i+1; ipop[i]=j;
7742
free(com.spname[i]);
7744
/* copy master species tree and then prune it. */
7746
GetSubTreeN(keep, newnodeNO);
7749
for(i=0;i<sptree.nnode;i++)
7750
if(newnodeNO[i]!=-1) nodes[newnodeNO[i]].ipop = i;
7751
/* printGtree(0); */
7753
gnodes[locus] = (struct TREEN*)malloc((ns*2-1)*sizeof(struct TREEN));
7754
if(gnodes[locus] == NULL) error2("oom gtree");
7755
memcpy(gnodes[locus], nodes, (ns*2-1)*sizeof(struct TREEN));
7756
data.root[locus]=tree.root;
7762
void printGtree (int printBlength)
7766
for(i=0; i<com.ns; i++)
7767
com.spname[i]=sptree.nodes[nodes[i].ipop].name;
7768
for(i=0;i<tree.nnode;i++)
7770
nodes[i].branch=nodes[nodes[i].father].age-nodes[i].age;
7771
printf("\nns = %d nnode = %d", com.ns, tree.nnode);
7772
printf("\n%7s%7s %8s %7s%7s","father","node","(ipop)","nson:","sons");
7773
for(i=0; i<tree.nnode; i++) {
7774
printf ("\n%7d%7d (%2d) %7d ",
7775
nodes[i].father+1, i+1, nodes[i].ipop+1, nodes[i].nson);
7776
for(j=0; j<nodes[i].nson; j++) printf (" %2d", nodes[i].sons[j]+1);
7778
FPN(F0); OutTreeN(F0,0,0); FPN(F0); OutTreeN(F0,1,0); FPN(F0);
7779
if(printBlength) { OutTreeN(F0,1,1); FPN(F0); }
7783
void copySptree (void)
7785
/* This copies sptree into nodes = nodes_t, for printing or editing
7790
com.ns = sptree.nspecies; tree.root = sptree.root;
7791
tree.nnode = sptree.nnode; tree.nbranch = sptree.nbranch;
7792
for(i=0; i<sptree.nnode; i++) {
7793
if(i<com.ns) com.spname[i] = sptree.nodes[i].name;
7794
nodes[i].father =sptree.nodes[i].father;
7795
nodes[i].nson = sptree.nodes[i].nson;
7796
for(j=0;j<nodes[i].nson;j++)
7797
nodes[i].sons[j] = sptree.nodes[i].sons[j];
7798
nodes[i].fossil = sptree.nodes[i].fossil;
7799
nodes[i].age = sptree.nodes[i].age;
7801
nodes[i].branch = sptree.nodes[nodes[i].father].age-sptree.nodes[i].age;
7805
void printSptree (void)
7809
printf("\n************\nSpecies tree\nns = %d nnode = %d", sptree.nspecies, sptree.nnode);
7810
printf("\n%7s%7s %-8s %12s %12s%16s\n","father","node","name","time","fossil","sons");
7811
for (i=0; i<sptree.nnode; i++) {
7812
printf("%7d%7d %-14s %9.5f",
7813
sptree.nodes[i].father+1, i+1, sptree.nodes[i].name, sptree.nodes[i].age);
7816
if((k = sptree.nodes[i].fossil)) {
7817
printf(" %s ( ", fossils[k]);
7818
for(j=0; j<npfossils[k]; j++) {
7819
printf("%6.4f", sptree.nodes[i].pfossil[j + (k==UPPER_F)]);
7820
printf("%s", (j==npfossils[k]-1 ? " ) " : ", "));
7825
if(sptree.nodes[i].nson)
7826
printf(" (%2d %2d)", sptree.nodes[i].sons[0]+1, sptree.nodes[i].sons[1]+1);
7830
FPN(F0); OutTreeN(F0,0,0); FPN(F0); OutTreeN(F0,1,0); FPN(F0);
7831
OutTreeN(F0,1,1); FPN(F0);
7837
#if (defined BASEML || defined CODEML)
7839
#if (defined CODEML)
7841
int GetMemPUVR(int nc, int nUVR)
7843
/* this gets mem for nUVR sets of matrices
7847
PMat=(double*)malloc((nc*nc+nUVR*nc*nc*2+nUVR*nc)*sizeof(double));
7848
if(PMat==NULL) error2("oom getting P&U&V&Root");
7849
U=_UU[0]=PMat+nc*nc; V=_VV[0]=_UU[0]+nc*nc; Root=_Root[0]=_VV[0]+nc*nc;
7850
for(i=1; i<nUVR; i++) {
7851
_UU[i]=_UU[i-1]+nc*nc*2+nc; _VV[i]=_VV[i-1]+nc*nc*2+nc;
7852
_Root[i]=_Root[i-1]+nc*nc*2+nc;
7857
void FreeMemPUVR(void)
7863
int GetUVRoot_codeml (void)
7865
/* This uses data.daafile[] to set up the eigen matrices U, V, Root for
7866
combined clock analyses of multiple protein data sets (clock = 5 or 6).
7868
int locus, nc=(com.seqtype==1?64:20), nUVR=data.ngene;
7871
if(com.seqtype==1 && (!com.fix_kappa || !com.fix_omega)) nUVR=1;
7872
GetMemPUVR(nc, nUVR);
7874
if(nUVR>6) error2("The maximum number of proteins is set to 6.");
7875
if(com.seqtype==2) {
7876
for(locus=0; locus<data.ngene; locus++) {
7878
strcpy(com.daafile, data.daafile[locus]);
7879
GetDaa(NULL, com.daa);
7880
if(com.model==Empirical_F)
7881
xtoy(data.pi[locus], com.pi, nc);
7882
EigenQaa(NULL, _Root[locus], _UU[locus], _VV[locus], NULL);
7884
printf("Protein # %2d uses %-20s\n", locus+1,data.daafile[locus]);
7885
matout(F0, com.pi, 1, nc);
7886
matout(F0, _Root[locus], 1, nc);
7889
else if(com.seqtype==1 && com.fix_kappa & com.fix_omega) {
7890
for(locus=0; locus<data.ngene; locus++) {
7891
if(com.seqtype==1) {
7892
com.icode=data.icode[locus];
7895
com.kappa=data.kappa[locus];
7896
com.omega=data.omega[locus];
7897
xtoy(data.pi[locus], com.pi, com.ncode);
7898
EigenQcodon(0,-1,NULL,NULL,NULL, _Root[locus], _UU[locus], _VV[locus], &mr,
7899
&com.kappa, com.omega, PMat);
7909
int UseLocus (int locus, int copycondP, int setmodel, int setSeqName)
7911
/* This point nodes to the gene tree at locus gnodes[locus] and set com.z[]
7912
etc. for likelihood calculation for the locus.
7918
com.ns=data.ns[locus]; com.ls=data.ls[locus];
7919
tree.root=data.root[locus];
7920
tree.nnode=2*com.ns-1; /* assumes binary tree */
7921
tree.nbranch=tree.nnode-1;
7923
nodes=gnodes[locus];
7925
com.cleandata=data.cleandata[locus];
7926
com.npatt=com.posG[1]=data.npatt[locus]; com.posG[0]=0;
7927
com.fpatt=data.fpatt[locus];
7928
for(i=0; i<com.ns; i++) com.z[i]=data.z[locus][i];
7930
/* The following is model-dependent */
7933
com.kappa=data.kappa[locus];
7934
com.omega=data.omega[locus];
7935
com.alpha=data.alpha[locus];
7938
if(com.seqtype==1) {
7939
com.icode=data.icode[locus];
7945
if(com.seqtype==0 && com.model!=0 && com.model!=1)
7946
xtoy(data.pi[locus], com.pi, com.ncode);
7948
EigenTN93(com.model, com.kappa, com.kappa, com.pi, &nR, Root, Cijk);
7949
else if (com.model==REV)
7950
EigenQREVbase (NULL, &com.kappa, com.pi, &nR, Root, Cijk);
7952
if((com.seqtype==1 && com.codonf) || (com.seqtype==2 && com.model==3))
7953
xtoy(data.pi[locus], com.pi, com.ncode);
7955
if((com.seqtype==2 && (com.model==2 || com.model==3))
7956
|| (com.seqtype==1 && com.fix_kappa && com.fix_omega)) {
7957
Root=_Root[locus]; U=_UU[locus]; V=_VV[locus];
7960
EigenQcodon(0,-1,NULL,NULL,NULL,Root,U,V, &mr, &com.kappa, com.omega,PMat);
7965
DiscreteGamma (com.freqK,com.rK,com.alpha,com.alpha,com.ncatG,DGammaMean);
7967
com.NnodeScale = data.NnodeScale[locus];
7968
com.nodeScale = data.nodeScale[locus];
7969
nS = com.NnodeScale*com.npatt * (com.conPSiteClass ? com.ncatG : 1);
7970
for(i=0; i<nS; i++) com.nodeScaleF[i] = 0;
7973
for(i=0; i<com.ns; i++)
7974
com.spname[i] = sptree.nodes[nodes[i].ipop].name;
7979
void GetMemBC (void)
7981
/* This gets memory for baseml and codeml under local clock models for analysis
7982
of combined data from multiple loci.
7983
com.conP[] is shared across loci.
7984
fhK[] uses shared space for loci.
7986
int j, locus, nc = (com.seqtype==1?64:com.ncode);
7987
size_t maxsizeScale=0, nS, sfhK=0, s1, snode;
7990
for(locus=0,com.sconP=0; locus<data.ngene; locus++) {
7991
snode = nc*data.npatt[locus];
7992
s1 = snode*(data.ns[locus]-1)*sizeof(double);
7993
if(com.alpha) { /* this is for step 1, using method = 1 */
7994
com.conPSiteClass = 1;
7997
if(s1>com.sconP) com.sconP = s1;
7998
if(com.alpha && (size_t)data.npatt[locus]>sfhK)
7999
sfhK = data.npatt[locus];
8002
com.conP = (double*)malloc(com.sconP);
8003
printf("\n%5lu bytes for conP\n", com.sconP);
8007
sfhK *= com.ncatG*sizeof(double);
8008
if((com.fhK=(double*)realloc(com.fhK,sfhK))==NULL) error2("oom");
8011
/* set gnodes[locus][].conP for internal nodes */
8012
for(locus=0; locus<data.ngene; locus++) {
8013
snode = nc*data.npatt[locus];
8014
for(j=data.ns[locus]; j<data.ns[locus]*2-1; j++)
8015
gnodes[locus][j].conP = com.conP + (j-data.ns[locus])*snode;
8017
for(locus=0; locus<data.ngene; locus++) {
8018
if(!data.cleandata[locus]) {
8019
UseLocus(locus, -1, 0, 0);
8023
if(sptree.nspecies>20) {
8024
for(locus=0; locus<data.ngene; locus++) {
8025
UseLocus(locus, -1, 0, 0);
8027
com.nodeScale = data.nodeScale[locus]=(char*)malloc(tree.nnode*sizeof(char));
8028
if(com.nodeScale==NULL) error2("oom");
8029
for(j=0; j<tree.nnode; j++) com.nodeScale[j] = 0;
8031
SetNodeScale(tree.root);
8033
data.NnodeScale[locus] = com.NnodeScale;
8034
nS = com.NnodeScale*com.npatt;
8035
if(com.conPSiteClass) nS *= com.ncatG;
8036
maxsizeScale = max2(maxsizeScale, nS);
8038
if(com.NnodeScale) {
8039
printf("\n%d node(s) used for scaling at locus %d: \n",com.NnodeScale,locus+1);
8040
FOR(j,tree.nnode) if(com.nodeScale[j]) printf(" %2d",j+1);
8045
if((com.nodeScaleF=(double*)malloc(maxsizeScale*sizeof(double)))==NULL)
8046
error2("oom nscale");
8047
for(j=0; j<maxsizeScale; j++) com.nodeScaleF[j] = 0;
8053
void FreeMemBC (void)
8057
for(locus=0; locus<data.ngene; locus++)
8058
free(gnodes[locus]);
8061
for(locus=0; locus<data.ngene; locus++) {
8062
free(data.fpatt[locus]);
8063
for(j=0;j<data.ns[locus]; j++)
8064
free(data.z[locus][j]);
8069
if(sptree.nspecies>20) {
8070
for(locus=0; locus<data.ngene; locus++)
8071
free(data.nodeScale[locus]);
8072
if(com.nodeScaleF) free(com.nodeScaleF);
8079
double nu_AHRS=0.001, *varb_AHRS;
8082
double funSS_AHRS(double x[], int np);
8085
double lnLfunHeteroData (double x[], int np)
8087
/* This calculates the log likelihood, the log of the probability of the data
8088
given gtree[] for each locus. This is for step 3 of Yang (2004. Acta
8089
Zoologica Sinica 50:645-656)
8090
x[0,1,...s-k] has node ages in the species tree, followed by branch rates
8091
for genes 1, 2, ..., then kappa for genes, then alpha for genes
8094
double lnL=0, lnLt, *pbrate;
8096
/* ??? need more work for codon sequences */
8097
for(locus=0,k=com.ntime-1; locus<data.ngene; locus++)
8098
k+=data.nbrate[locus];
8099
if(!com.fix_kappa) FOR(locus,data.ngene) data.kappa[locus]=x[k++];
8100
if(!com.fix_omega) FOR(locus,data.ngene) data.omega[locus]=x[k++];
8101
if(!com.fix_alpha) FOR(locus,data.ngene) data.alpha[locus]=x[k++];
8103
/* update node ages in species tree */
8106
FOR(i,tree.nnode) sptree.nodes[i].age=nodes[i].age;
8108
for(locus=0,pbrate=x+com.ntime-1; locus<data.ngene; locus++) {
8110
UseLocus(locus, -1, 1, 1);
8111
/* copy node ages to gene tree */
8112
FOR(i,tree.nnode) nodes[i].age=sptree.nodes[nodes[i].ipop].age;
8115
nodes[i].branch = (nodes[nodes[i].father].age-nodes[i].age)
8116
* pbrate[(int)nodes[i].label];
8117
if(nodes[i].branch<-1e-4)
8121
lnL += lnLt = com.plfun(x, -1);
8122
pbrate += data.nbrate[locus];
8128
double funSS_AHRS (double x[], int np)
8130
/* Function to be minimized in the ad hoc rate smoothing procedure:
8132
nodes[].label has node rate.
8133
lnLb is weighted sum of squares using approximate variances for branch lengths.
8135
lnLr is the log of the prior of rates under the geometric Brownian motion
8136
model of rate evolution. There is no need for recursion as the order at
8137
which sptree.nodes are visited is unimportant. The rates are stored in
8139
The root rate is fixed to be the weighted average rate of its two sons,
8140
inversely weighted by the divergence times.
8142
int locus, j,k, root, pa, son0, son1;
8143
double lnLb, lnLr, lnLbi, lnLri; /* lnLb & lnLr are sum of squares for b and r */
8144
double b,be,t, t0,t1, r,rA, w,y, small=1e-20, smallage=AgeLow[sptree.root]*small;
8145
double nu = nu_AHRS, *varb=varb_AHRS;
8147
/* set up node ages in species tree */
8150
for(j=0; j<tree.nnode; j++)
8151
sptree.nodes[j].age = nodes[j].age;
8154
for(locus=0,lnLb=lnLr=0; locus<data.ngene; varb+=com.ns*2-1,locus++) {
8155
UseLocus(locus, -1, 0, 0);
8156
if(data.fix_nu==2) nu = x[np-1];
8157
else if(data.fix_nu==3) nu = x[np-1-(data.ngene-1-locus)];
8160
son0 = nodes[root].sons[0];
8161
son1 = nodes[root].sons[1];
8162
/* copy node ages and rates into gene tree nodes[]. */
8163
for(j=0; j<tree.nnode; j++) { /* age and rates */
8164
nodes[j].age=sptree.nodes[nodes[j].ipop].age;
8166
nodes[j].label = x[k++];
8168
t0 = nodes[root].age-nodes[son0].age;
8169
t1 = nodes[root].age-nodes[son1].age;
8171
error2("small root branch. Think about what to do.");
8172
nodes[root].label = (nodes[son0].label*t1+nodes[son1].label*t0)/(t0+t1);
8174
for(j=0,lnLbi=0; j<tree.nnode; j++) {
8175
if(j==son0 || j==son1) continue;
8176
pa = nodes[j].father;
8178
b = nodes[son0].branch+nodes[son1].branch;
8179
be = (nodes[j].age-nodes[son0].age) * (nodes[root].label+nodes[son0].label)/2
8180
+ (nodes[j].age-nodes[son1].age) * (nodes[root].label+nodes[son1].label)/2;
8183
b = nodes[j].branch;
8184
be = (nodes[pa].age-nodes[j].age) * (nodes[pa].label+nodes[j].label)/2;
8188
puts("small variance");
8189
lnLbi -= square(be-b)/(2*w);
8192
for(j=0,lnLri=0; j<tree.nnode; j++) {
8193
if(j==root) continue;
8194
pa = nodes[j].father;
8195
t = nodes[pa].age - nodes[j].age;
8196
t = max2(t,smallage);
8198
rA= nodes[pa].label;
8200
if(rA<small || t<small || r<small) puts("small r, rA, or t");
8201
y = log(r/rA)+t*nu/2;
8202
lnLri -= y*y/(2*t*nu) - log(r) - log(2*Pi*t*nu)/2;
8205
if(data.fix_nu>1) lnLri += -nu/nu_AHRS-log(nu); /* exponential prior */
8209
return (lnLb + lnLr);
8213
void SetBranchRates(int inode)
8215
/* this uses node rates to set branch rates, and is used only after the ad hoc
8216
rate smoothing iteration is finished.
8220
nodes[inode].label = (nodes[inode].label + nodes[nodes[inode].father].label)/2;
8222
for(i=0; i<nodes[inode].nson; i++)
8223
SetBranchRates(nodes[inode].sons[i]);
8227
int GetInitialsClock6Step1 (double x[], double xb[][2])
8229
/* This is for clock 6 step 1.
8232
double tb[]={.0001, 999};
8234
com.ntime=k=tree.nbranch;
8235
GetInitialsTimes (x);
8237
com.plfun = (com.alpha==0 ? lfun : lfundG);
8238
com.conPSiteClass = (com.method && com.plfun==lfundG);
8240
/* InitializeNodeScale(); */
8242
if(com.seqtype==0) com.nrate = !com.fix_kappa;
8244
com.np=com.ntime+!com.fix_kappa+!com.fix_alpha;
8245
if(com.seqtype==1 && !com.fix_omega) com.np++;
8247
if(!com.fix_kappa) x[k++]=com.kappa;
8248
if(!com.fix_omega) x[k++]=com.omega;
8249
if(!com.fix_alpha) x[k++]=com.alpha;
8252
for(i=0; i<com.ntime; i++)
8253
{ xb[i][0]=tb[0]; xb[i][1]=tb[1]; }
8254
for( ; i<com.np; i++)
8255
{ xb[i][0]=.001; xb[i][1]=999; }
8257
if(noisy>3 && com.np<200) {
8258
printf("\nInitials (np=%d)\n", com.np);
8259
for(i=0; i<com.np; i++) printf(" %10.5f", x[i]); FPN(F0);
8260
for(i=0; i<com.np; i++) printf(" %10.5f", xb[i][0]); FPN(F0);
8261
for(i=0; i<com.np; i++) printf(" %10.5f", xb[i][1]); FPN(F0);
8268
int GetInitialsClock56Step3 (double x[])
8270
/* This is for clock 5 or clock 6 step 3
8272
int i, j,k=0, naa=20;
8275
GetInitialsTimes (x);
8277
com.plfun = (com.alpha==0 ? lfun : lfundG);
8278
com.conPSiteClass = (com.method && com.plfun==lfundG);
8280
/* InitializeNodeScale(); */
8282
com.np = com.ntime-1 + (1+!com.fix_kappa+!com.fix_omega+!com.fix_alpha)*data.ngene;
8284
for(i=com.ntime-1;i<com.np;i++) x[i]=.2+rndu();
8285
else if(com.clock==6) {
8286
for(j=0,k=com.ntime-1; j<data.ngene; k+=data.nbrate[j],j++)
8287
com.np += data.nbrate[j]-1;
8289
for(j=0; j<data.ngene; j++) x[k++]=data.kappa[j];
8291
for(j=0; j<data.ngene; j++) x[k++]=data.omega[j];
8293
for(j=0; j<data.ngene; j++) x[k++]=data.alpha[j];
8294
for(i=k;i<com.np;i++) x[i]=(.5+rndu())/2;
8300
double GetMeanRate (void)
8302
/* This gets the rough average rate for the locus
8304
int inode, i,j,k, ipop, nleft,nright,marks[NS], sons[2], nfossil;
8308
for(inode=com.ns; inode<tree.nnode; inode++) {
8309
ipop = nodes[inode].ipop;
8310
if(sptree.nodes[ipop].fossil == 0) continue;
8311
sons[0] = nodes[inode].sons[0];
8312
sons[1] = nodes[inode].sons[1];
8313
for(i=0,nleft=nright=0; i<com.ns; i++) {
8314
for(j=i,marks[i]=0; j!=tree.root; j=nodes[j].father) {
8315
if(j==sons[0]) { marks[i]=1; nleft++; break; }
8316
else if (j==sons[1]) { marks[i]=2; nright++; break; }
8319
if(nleft==0 || nright==0) {
8320
puts("this calibration is not in gene tree.");
8325
for(i=0,md=0; i<com.ns; i++) {
8326
for(j=0; j<com.ns; j++) {
8327
if(marks[i]==1 && marks[j]==2) {
8328
for(k=i; k!=inode; k=nodes[k].father)
8329
md+=nodes[k].branch;
8330
for(k=j; k!=inode; k=nodes[k].father)
8331
md+=nodes[k].branch;
8335
md /= (nleft*nright);
8336
mr += md/(sptree.nodes[ipop].age*2);
8339
printf("node age & mr n%-4d %9.5f%9.5f ", inode, sptree.nodes[ipop].age, md);
8340
if(com.ns<100) FOR(i,com.ns) printf("%d",marks[i]);
8346
{ printf("need fossils for this locus\n"); exit(-1); }
8352
int AdHocRateSmoothing (FILE*fout, double x[NS*3], double xb[NS*3][2], double space[])
8354
/* ad hoc rate smoothing for likelihood estimation of divergence times.
8355
Step 1: Use JC69 to estimate branch lengths under no-clock model.
8356
Step 2: ad hoc rate smoothing, estimating one set of divergence times
8357
and many sets of branch rates for loci. Rate at root is set to
8358
weighted average of rate at the two sons.
8360
int model0=com.model, ntime0=com.ntime; /* is this useful? */
8361
int fix_kappa0=com.fix_kappa, fix_omega0=com.fix_omega, fix_alpha0=com.fix_alpha;
8363
double kappa0=com.kappa, omega0=com.omega, alpha0=com.alpha, t0,t1, *varb;
8364
double f, e=1e-8, pb=0.00001, rb[]={0.001,99}, lnL,lnLsum=0;
8365
double mbrate[20], Rj[20], r,minr,maxr, beta, *pnu=&nu_AHRS,nu, mr[NGENE];
8366
int i,j,k,k0, locus, nbrate[20],maxnbrate=20;
8368
FILE *fBV = gfopen("in.BV","w");
8369
FILE *fdist = gfopen("RateDist.txt","w");
8370
FILE *finStep1 = fopen("in.ClockStep1","r"),
8371
*finStep2 = fopen("in.ClockStep2","r");
8374
for(locus=0,k=0; locus<data.ngene; locus++)
8375
k += 2*data.ns[locus]-1;
8376
if((varb_AHRS=(double*)malloc(k*sizeof(double)))==NULL)
8378
for(i=0; i<k;i++) varb_AHRS[i]=-1;
8381
/* Step 1: Estimate branch lengths without clock. */
8382
printf("\nStep 1: Estimate branch lengths under no clock.\n");
8383
fprintf(fout,"\n\nStep 1: Estimate branch lengths under no clock.\n");
8384
com.clock=0; com.method=1;
8386
com.model=0; com.fix_kappa=1; com.kappa=1;
8387
com.fix_alpha=1; com.alpha=0;
8389
for(locus=0; locus<data.ngene; locus++) {
8390
if(!com.fix_kappa) data.kappa[locus]=com.kappa;
8391
if(!com.fix_omega) data.omega[locus]=com.omega;
8392
if(!com.fix_alpha) data.alpha[locus]=com.alpha;
8394
for(locus=0,varb=varb_AHRS; locus<data.ngene; varb+=com.ns*2-1,locus++) {
8395
UseLocus(locus, -1, 1, 1);
8397
fprintf(fout,"\nLocus %d (%d sequences)\n", locus+1, com.ns);
8399
son0 = nodes[tree.root].sons[0];
8400
son1 = nodes[tree.root].sons[1];
8402
GetInitialsClock6Step1 (x, xb);
8405
if(com.ns>30) fprintf(frub, "\n\nLocus %d\n", locus+1);
8407
puts("read MLEs from step 1 from file");
8408
for(i=0; i<com.np; i++)
8409
fscanf(finStep1,"%lf",&x[i]);
8412
j = minB((com.ns>30?frub:NULL), &lnL, x, xb, e, space);
8413
for(j=0; j<com.ns*2-1; j++) {
8414
ib = nodes[j].ibranch;
8415
if(j!=tree.root) varb[j] = (x[ib]>1e-8 ? -1/varb_minbranches[ib] : 999);
8418
matout(F0, x, 1, com.ntime);
8419
matout2(F0, varb, 1, tree.nnode, 10, 7);
8425
if(!com.fix_kappa) data.kappa[locus] = x[com.ntime];
8426
if(!com.fix_omega) data.omega[locus] = x[com.ntime + !com.fix_kappa];
8427
if(!com.fix_alpha) data.alpha[locus] = x[com.ntime + !com.fix_kappa + !com.fix_omega];
8431
t0 = nodes[son0].branch;
8432
t1 = nodes[son1].branch;
8433
varb[tree.root] = varb[t0>t1?son0:son1];
8434
nodes[son0].branch = nodes[son1].branch = (t0+t1)/2; /* arbitrary */
8435
mr[locus] = GetMeanRate();
8437
printf(" Locus %d: %d sequences, %d blengths, lnL = %15.6f mr=%.5f%10s\n",
8438
locus+1, com.ns, com.np-1,-lnL,mr[locus], printtime(timestr));
8439
fprintf(fout,"\nlnL = %.6f\n\n", -lnL);
8440
OutTreeB(fout); FPN(fout);
8441
for(i=0; i<com.np; i++) fprintf(fout," %8.5f",x[i]); FPN(fout);
8442
for(i=0; i<tree.nbranch; i++) fprintf(fout," %8.5f", sqrt(varb[tree.branches[i][1]])); FPN(fout);
8443
FPN(fout); OutTreeN(fout,1,1); FPN(fout); fflush(fout);
8445
fprintf(fBV, "\n\nLocus %d: %d sequences, %d+1 branches\nlnL = %15.6f\n\n",
8446
locus+1, com.ns, tree.nbranch-1, -lnL);
8447
OutTreeB(fBV); FPN(fBV);
8448
for(i=0; i<tree.nbranch; i++) fprintf(fBV," %12.9f",x[i]); FPN(fBV);
8449
for(i=0; i<tree.nbranch; i++) fprintf(fBV," %12.9f", sqrt(varb[tree.branches[i][1]])); FPN(fBV);
8450
FPN(fBV); OutTreeN(fBV,1,1); FPN(fBV); fflush(fBV);
8453
if(data.ngene>1) fprintf(fout,"\nSum of lnL over loci = %15.6f\n", -lnLsum);
8455
/* Step 2: ad hoc rate smoothing to estimate branch rates. */
8456
printf("\nStep 2: Ad hoc rate smoothing to estimate branch rates.\n");
8457
fprintf(fout, "\n\nStep 2: Ad hoc rate smoothing to estimate branch rates.\n");
8458
/* s - 1 - NFossils node ages, (2*s_i - 2) rates for branches at each locus */
8461
GetInitialsTimes (x);
8463
for(locus=0,com.np=com.ntime-1; locus<data.ngene; locus++)
8464
com.np += data.ns[locus]*2-2;
8465
if(data.fix_nu==2) com.np++;
8466
if(data.fix_nu==3) com.np+=data.ngene;
8468
if(com.np>NS*6) error2("change NP for ad hoc rate smoothing.");
8469
for(i=0; i<com.ntime-1; i++)
8470
{ xb[i][0]=pb; xb[i][1]=1-pb; }
8471
if(!nodes[tree.root].fossil)
8472
{ xb[0][0]=AgeLow[tree.root]*1.0001; xb[0][1]=max2(AgeLow[tree.root]*10,50); }
8473
for( ; i<com.np; i++) { /* for rates */
8474
xb[i][0]=rb[0]; xb[i][1]=rb[1];
8476
for(locus=0,i=com.ntime-1; locus<data.ngene; locus++)
8477
for(j=0; j<data.ns[locus]*2-2; j++)
8478
x[i++]=mr[locus]*(.8+.4*rndu());
8479
for( ; i<com.np; i++) /* nu */
8480
x[i]=0.001+0.1*rndu();
8483
for(i=0; i<com.np; i++)
8484
{ printf(" %10.5f", x[i]); if(i==com.ntime-2) FPN(F0); } FPN(F0);
8486
for(i=0; i<com.np; i++) printf(" %10.5f", xb[i][0]); FPN(F0);
8487
for(i=0; i<com.np; i++) printf(" %10.5f", xb[i][1]); FPN(F0);
8492
pnu = x+com.np-(data.fix_nu==2 ? 1 : data.ngene);
8493
printf(" %d times, %d rates, %d parameters, ", com.ntime-1,k,com.np);
8496
f = funSS_AHRS(x, com.np);
8497
if(noisy>2) printf("\nf0 = %12.6f\n",f );
8500
puts("read MLEs from step 2 from file");
8501
for(i=0; i<com.np; i++) fscanf(finStep2,"%lf",&x[i]);
8502
matout(F0,x,1,com.np);
8505
j = ming2(frub, &f, funSS_AHRS, NULL, x, xb, space, 1e-9, com.np);
8507
/* generate output to in.clockStep2
8508
matout(fout,x,1,com.np);
8512
{ puts("\nad hoc rate smoothing iteration may not have converged.\nEnter to continue; Ctrl-C to break.");
8517
fputs("\nEstimated divergence times from ad hoc rate smoothing\n\n",fout);
8519
FOR(i,tree.nnode) nodes[i].branch*=100;
8520
for(i=com.ns; i<tree.nnode; i++)
8521
fprintf(fout, "Node %2d Time %9.5f\n", i+1, nodes[i].age*100);
8522
FPN(fout); OutTreeN(fout,1,1); FPN(fout);
8524
fprintf(fout, "\nEstimated rates from ad hoc rate smoothing\n");
8525
for(locus=0,k=k0=com.ntime-1; locus<data.ngene; k0+=data.nbrate[locus++]) {
8527
UseLocus(locus, -1, 0, 1);
8528
for(i=0; i<tree.nnode; i++)
8529
if(i!=tree.root) nodes[i].label=x[k++];
8530
son0=nodes[tree.root].sons[0]; son1=nodes[tree.root].sons[1];
8531
t0=nodes[tree.root].age-nodes[son0].age;
8532
t1=nodes[tree.root].age-nodes[son1].age;
8533
nodes[tree.root].label = (nodes[son0].label*t1+nodes[son1].label*t0)/(t0+t1);
8534
SetBranchRates(tree.root); /* node rates -> branch rates */
8536
nu = (data.fix_nu==3 ? *(pnu+locus) : *pnu);
8537
fprintf(fout,"\nLocus %d (%d sequences)\n\n", locus+1, com.ns);
8538
fprintf(fout,"nu = %.6g\n", nu);
8540
/* this block can be deleted? */
8541
fprintf(fout, "\nnode \tage \tlength \trate\n");
8542
for(i=0; i<tree.nnode; i++,FPN(fout)) {
8543
fprintf(fout, "%02d\t%.3f", i+1,nodes[i].age);
8545
fprintf(fout, "\t%.5f\t%.5f", nodes[i].branch,nodes[i].label);
8548
fprintf(fout,"\nRates as labels in tree:\n");
8549
OutTreeN(fout,1,PrLabel); FPN(fout); fflush(fout);
8551
if(data.nbrate[locus]>maxnbrate) error2("too many rate classes? Change source.");
8552
for(i=0,minr=1e6,maxr=0; i<tree.nnode; i++)
8556
puts("node label<0?");
8557
minr = min2(minr,r);
8558
maxr = max2(maxr,r);
8561
fprintf(fdist, "\n%6d\n", tree.nnode-1);
8562
for(i=0; i<tree.nnode; i++) {
8563
if(i==tree.root) continue;
8564
fprintf(fdist, "R%-10.7f ", nodes[i].label);
8567
fprintf(fdist, " %9.6f", fabs(nodes[i].label-nodes[j].label));
8572
for(j=0; j<data.nbrate[locus]; j++)
8573
Rj[j]=minr+(j+1)*(maxr-minr)/data.nbrate[locus];
8575
beta = pow(1/(data.nbrate[locus]+1.), 1/(data.nbrate[locus]-1.));
8576
beta = 0.25+0.25*log((double)data.nbrate[locus]);
8577
if(beta>1) beta=0.99;
8578
for(j=0; j<data.nbrate[locus]; j++)
8579
Rj[j]=minr+(maxr-minr)*pow(beta, data.nbrate[locus]-1.-j);
8581
printf("\nLocus %d: nu = %.6f, rate range (%.6f, %.6f)\n", locus+1,nu,minr,maxr);
8582
printf("Cutting points:\n");
8583
for(j=0; j<data.nbrate[locus]; j++)
8584
printf(" < %.6f, ", Rj[j]);
8585
printf("\nThe number of rate groups (0 for no change)? ");
8586
/* scanf("%d", &j); */
8589
data.nbrate[locus]=j;
8590
printf("input %d cutting points? ", data.nbrate[locus]-1);
8591
for(j=0,Rj[data.nbrate[locus]-1]=maxr; j<data.nbrate[locus]-1; j++)
8592
scanf("%lf", &Rj[j]);
8595
for(i=0;i<data.nbrate[locus];i++) { mbrate[i]=0; nbrate[i]=0; }
8596
for(i=0; i<tree.nnode; i++) {
8597
if(i==tree.root) continue;
8599
for(j=0; j<data.nbrate[locus]-1; j++)
8605
nodes[tree.root].label=-1;
8606
for(i=0;i<data.nbrate[locus];i++)
8607
mbrate[i] = (nbrate[i]?mbrate[i]/nbrate[i]:-1);
8609
fprintf(fout,"\nCollapsing rates into groups\nRate range: (%.6f, %.6f)\n", minr,maxr);
8610
/* fprintf(fout,"\nCollapsing rates into groups\nbeta = %.6g Rate range: (%.6f, %.6f)\n", beta, minr,maxr);
8612
for(j=0; j<data.nbrate[locus]; j++)
8613
fprintf(fout,"rate group %d (%2d): <%9.6f, mean %9.6f\n",
8614
j, nbrate[j], Rj[j], mbrate[j]);
8616
FPN(fout); OutTreeN(fout,1,PrLabel); FPN(fout);
8617
fprintf(fout, "\n\nRough rates for branch groups at locus %d\n", locus+1);
8618
for(i=0; i<data.nbrate[locus]; i++)
8619
x[k0+i] = mbrate[i];
8622
printf("\n\n%d times, %d timerates from AHRS:\n", com.ntime-1,k0);
8623
fprintf(fout,"\n\n%d times, %d timerates from AHRS\n", com.ntime-1,k0);
8624
for(i=0; i<k0; i++) {
8625
printf("%12.6f", x[i]);
8626
if(i==com.ntime-2) FPN(F0);
8627
fprintf(fout,"%12.6f", x[i]);
8628
if(i==com.ntime-2) FPN(fout);
8632
for(i=0; i<k0; i++) x[i]*=0.9+0.2*rndu();
8634
com.model=model0; com.clock=6;
8637
com.fix_kappa=fix_kappa0; com.kappa=kappa0;
8638
com.fix_omega=fix_omega0; com.omega=omega0;
8639
com.fix_alpha=fix_alpha0; com.alpha=alpha0;
8642
/* fix parameters: value > 0, precise value unimportant */
8643
if(!fix_kappa0) { com.fix_kappa=1; com.kappa=0.1; }
8644
if(!fix_omega0) { com.fix_omega=1; com.omega=0.1; }
8645
if(!fix_alpha0) { com.fix_alpha=1; com.alpha=0.1; }
8650
printf(" %10s\n", printtime(timestr));
8652
if(finStep1) fclose(finStep1);
8653
if(finStep2) fclose(finStep2);
8659
void DatingHeteroData (FILE* fout)
8661
/* This is for clock and local-clock dating using heterogeneous data from
8662
multiple loci. Some species might be missing at some loci. Thus
8663
gnodes[locus] stores the gene tree at locus. Branch lengths in the gene
8664
tree are constructed using the divergence times in the master species tree,
8665
and the rates for genes and branches.
8667
com.clock = 5: global clock
8671
int i,j,k, s, np, sconP0=0, locus;
8672
double x[NS*6],xb[NS*6][2], lnL,e=1e-7, *var=NULL;
8674
size_t maxnpML, maxnpADRS;
8679
printf("nu (1:fix; 2:estimate one for all genes; 3:estimate one for every gene)? ");
8680
scanf("%d", &data.fix_nu);
8681
if(data.fix_nu==1) scanf("%lf", &nu_AHRS);
8686
for(j=0; j<sptree.nnode; j++) {
8687
sptree.nodes[j].pfossil[0] = sptree.nodes[j].pfossil[1] = -1;
8689
for(j=sptree.nspecies, com.ntime=j-1, sptree.nfossil=0; j<sptree.nnode; j++) {
8690
if(sptree.nodes[j].fossil) {
8693
printf("node %2d age fixed at %.3f\n", j, sptree.nodes[j].age);
8697
s = sptree.nspecies;
8698
maxnpML = s-1 + (5+2)*data.ngene;
8699
maxnpADRS = s-1 + (2*s-1)*data.ngene + 2*data.ngene;
8700
com.sspace = max2(com.sspace, spaceming2(maxnpADRS));
8701
com.sspace = max2(com.sspace, maxnpML*(maxnpML+1)*sizeof(double));
8702
if((com.space = (double*)realloc(com.space,com.sspace))==NULL)
8703
error2("oom space");
8705
#if (defined CODEML)
8706
GetUVRoot_codeml ();
8709
if(data.fix_nu<=1) {
8710
printf("nu & nbrate? ");
8711
scanf("%lf%d? ", &nu_AHRS, &nbrate);
8713
for(locus=0; locus<data.ngene; locus++)
8714
data.nbrate[locus] = nbrate;
8715
AdHocRateSmoothing(fout, x, xb, com.space);
8717
printf("\nStep 3: ML estimation of times and rates.");
8718
fprintf(fout,"\n\nStep 3: ML estimation of times and rates.\n");
8720
else { /* clock = 5, global clock */
8721
for(locus=0; locus<data.ngene; locus++)
8722
for(i=0,data.nbrate[locus]=1; i<data.ns[locus]*2-1; i++)
8723
gnodes[locus][i].label=0;
8729
GetInitialsClock56Step3(x);
8732
SetxBound (com.np, xb);
8733
lnL = lnLfunHeteroData(x,np);
8736
printf("\nntime & nrate & np:%6d%6d%6d\n",com.ntime-1,com.nrate,com.np);
8738
printf("\nlnL0 = %12.6f\n",-lnL);
8741
j = ming2(noisy>2?frub:NULL,&lnL,lnLfunHeteroData,NULL,x,xb, com.space,e,np);
8743
if(noisy) printf("Out...\nlnL = %12.6f\n", -lnL);
8746
for(i=0,j=!sptree.nodes[sptree.root].fossil; i<sptree.nnode; i++)
8747
if(i!=sptree.root && sptree.nodes[i].nson && !sptree.nodes[i].fossil)
8748
x[j++]=sptree.nodes[i].age; /* copy node ages into x[] */
8751
if(np>100 || (com.seqtype && np>20)) puts("Calculating SE's");
8753
Hessian (np,x,lnL,com.space,var,lnLfunHeteroData,var+np*np);
8754
matinv(var,np,np,var+np*np);
8758
fprintf(fout,"\n\nTree: "); OutTreeN(fout,0,0);
8759
fprintf(fout,"\nlnL(ntime:%3d np:%3d):%14.6f\n", com.ntime-1,np,-lnL);
8760
OutTreeB(fout); FPN (fout);
8761
for(i=0;i<np;i++) fprintf(fout," %9.5f",x[i]); FPN(fout); fflush(fout);
8764
fprintf(fout,"SEs for parameters:\n");
8765
for(i=0;i<np;i++) fprintf(fout," %9.5f",(var[i*np+i]>0.?sqrt(var[i*np+i]):-1));
8767
if (com.getSE==2) matout2(fout, var, np, np, 15, 10);
8770
fprintf(fout,"\nTree with node ages for TreeView\n");
8771
FOR(i,tree.nnode) nodes[i].branch*=100;
8772
FPN(fout); OutTreeN(fout,1,1); FPN(fout);
8773
FPN(fout); OutTreeN(fout,1,PrNodeNum); FPN(fout);
8774
FPN(fout); OutTreeN(fout,1,PrLabel|PrAge); FPN(fout);
8775
FPN(fout); OutTreeN(fout,1,0); FPN(fout);
8776
OutputTimesRates(fout, x, var);
8778
fprintf(fout,"\nSubstititon rates for genes (per time unit)\n");
8779
for(j=0,k=com.ntime-1; j<data.ngene; j++,FPN(fout)) {
8780
fprintf(fout," Gene %2d: ", j+1);
8781
for(i=0; i<data.nbrate[j]; i++,k++) {
8782
fprintf(fout,"%10.5f", x[k]);
8783
if(com.getSE) fprintf(fout," +- %.5f", sqrt(var[k*np+k]));
8785
if(com.clock==6) fprintf(fout," ");
8787
if(!com.fix_kappa) {
8788
fprintf(fout,"\nkappa for genes\n");
8789
for(j=0; j<data.ngene; j++,k++) {
8790
fprintf(fout,"%10.5f", data.kappa[j]);
8791
if(com.getSE) fprintf(fout," +- %.5f", sqrt(var[k*np+k]));
8794
if(!com.fix_omega) {
8795
fprintf(fout,"\nomega for genes\n");
8796
for(j=0; j<data.ngene; j++,k++) {
8797
fprintf(fout,"%10.5f", data.omega[j]);
8798
if(com.getSE) fprintf(fout," +- %.5f", sqrt(var[k*np+k]));
8801
if(!com.fix_alpha) {
8802
fprintf(fout,"\nalpha for genes\n");
8803
for(j=0; j<data.ngene; j++,k++) {
8804
fprintf(fout,"%10.5f", data.alpha[j]);
8805
if(com.getSE) fprintf(fout," +- %.5f", sqrt(var[k*np+k]));
8810
printf("\nTime used: %s\n", printtime(timestr));