384
384
while(isspace(*p)) p++;
385
if ((ch=strstr(p," ")-p)<lspname && ch>0) lspname=ch;
385
if ((ch=strstr(p," ")-p)<lspname && ch>0)
386
387
strncpy (com.spname[j], p, lspname);
387
k=strlen(com.spname[j]);
388
p+=(k<lspname?k:lspname);
388
k = strlen(com.spname[j]);
389
p += (k<lspname?k:lspname);
390
391
for (; k>0; k--) /* trim spaces */
391
if (!isgraph(com.spname[j][k])) com.spname[j][k]=0;
392
if (!isgraph(com.spname[j][k]))
393
396
if(noisy>=2) printf("Reading seq #%2d: %s \r",j+1,com.spname[j]);
395
398
for (; *p && *p!='\n'; p++) {
396
399
if (lt[j]==com.ls) break;
397
*p=(char)toupper(*p);
400
*p = (char)toupper(*p);
398
401
if((com.seqtype==BASEseq || com.seqtype==CODONseq) && *p=='U')
403
p1 = strchr(pch, *p);
401
404
if (p1 && p1-pch>=nchar)
547
void MarkStopCodons(void)
549
/* this converts the whole column into ??? if there is a stop codon in one sequence.
550
Data in com.z[] are just read in and not encoded yet.
552
int i,j,h,k, fixed=0;
553
char codon[4]="", stops[10][4]={"","",""}, nstops=0;
555
if(com.seqtype!=1) error2("should not be here");
558
if(GeneticCode[com.icode][i]==-1)
559
getcodon(stops[nstops++], i);
561
for(h=0; h<com.ls/3; h++) {
562
for(i=0; i<com.ns; i++) {
563
codon[0] = com.z[i][h*3+0];
564
codon[1] = com.z[i][h*3+1];
565
codon[2] = com.z[i][h*3+2];
566
for(j=0; j<nstops; j++)
567
if(strcmp(codon, stops[j])==0) break;
571
for(i=0; i<com.ns; i++)
572
com.z[i][h*3+0] = com.z[i][h*3+1] = com.z[i][h*3+2] = '?';
577
printf("\n%2d columns are converted into ??? because of stop codons\nPress Enter to continue", fixed);
536
585
void RemoveEmptySequences(void)
538
587
/* this removes empty sequences (? or - only) and adjust com.ns
606
655
/* This encodes sequences and set up com.TipMap[][], called after sites are collapsed
609
For codonml, codons are coded into 0, 1, ..., 60 for the universal code.
610
For yn00, codons are coded into 0, 1, ..., 63 for the universal code.
611
This does not look like a good idea, and perhaps should be changed.
613
658
int n=com.ncode, nA, is,h, i, j, k,ic, indel=0, ch, b[3];
614
char *pchar = ((com.seqtype==0||com.seqtype==1) ? BASEs : (com.seqtype==2 ? AAs : BINs));
659
char *pch = ((com.seqtype==0||com.seqtype==1) ? BASEs : (com.seqtype==2 ? AAs : (com.seqtype==5 ? BASEs5: BINs)));
615
660
unsigned char c[4]="", str[4]=" ";
617
if(com.seqtype==0 || com.seqtype==2) {
662
if(com.seqtype != 1) {
618
663
for(is=0; is<com.ns; is++) {
619
664
for (h=0; h<com.npatt; h++) {
620
665
ch = com.z[is][h];
621
k = strchr(pchar, ch) - pchar;
666
com.z[is][h] = (char)(k = strchr(pch, ch) - pch);
623
668
printf("strange character %c in seq %d site %d\n", ch, is+1, h+1);
643
687
if(b[k]<0) printf("strange nucleotide %c in seq %d\n", c[k], j+1);
645
689
if(b[0]<4 && b[1]<4 && b[2]<4) {
646
k = FROM64[b[0]*16+b[1]*4+b[2]];
690
k = FROM64[b[0]*16 + b[1]*4 + b[2]];
648
printf("\nstop codon %s in seq #%2d: %s\n", c, j+1,com.spname[j]);
692
printf("\nstop codon %s in seq #%2d: %s\n", c, j+1, com.spname[j]);
693
printf("\ncodons in other sequences are\n");
694
for(i=0; i<com.ns; i++) {
695
for(k=0; k<3; k++) c[k] = com.z[i][h*3+k];
696
printf("seq #%2d %-30s %s\n", i+1, com.spname[i], c);
677
726
/* This sets up CharaMap, the map from the ambiguity characters to resolved characters.
679
728
int n=com.ncode, i,j, i0,i1,i2, nb[3], ib[3][4], ic;
680
char *pchar = (com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs : BINs));
729
char *pch = (com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs : (com.seqtype==5 ? BASEs5: BINs)));
730
char *pbases = (com.seqtype==0 ? BASEs : (com.seqtype==5 ? BASEs5: NULL));
731
char **pEquateBASE = (com.seqtype==0 ? EquateBASE : (com.seqtype==5 ? EquateBASE5 : NULL));
734
for(j=0; j<n; j++) { /* basic characters, coded according to the definition in pch. */
736
CharaMap[j][0] = (char)j;
687
if(com.seqtype==0 || com.seqtype==2) {
688
for(j=n,pchar+=n; *pchar; j++,pchar++) {
690
nChara[j] = strlen(EquateBASE[j]);
739
if(com.seqtype != 1) {
740
for(j=n,pch+=n; *pch; j++,pch++) {
741
if(com.seqtype==0 || com.seqtype==5) { /* ambiguities are allowed for those 2 types */
742
nChara[j] = (char)strlen(pEquateBASE[j]);
691
743
for(i=0; i<nChara[j]; i++)
692
CharaMap[j][i] = (char)(strchr(BASEs, EquateBASE[j][i]) - BASEs);
744
CharaMap[j][i] = (char)(strchr(pbases, pEquateBASE[j][i]) - pbases);
746
else { /* for non-nucleotide characters, ambiguity characters must be ? or -. */
696
748
for(i=0; i<n; i++)
749
CharaMap[j][i] = (char)i;
752
printf("character %c (%d): ", pbases[j], nChara[j]);
753
for(i=0; i<nChara[j]; i++)
754
printf("%c", pbases[CharaMap[j][i]]);
702
else if(com.seqtype==1) {
703
761
for(j=n; j<256 && CODONs[j][0]; j++) {
705
763
for(i=0; i<3; i++)
706
764
NucListall(CODONs[j][i], &nb[i], ib[i]);
707
765
for(i0=0; i0<nb[0]; i0++) {
999
1057
This routine is called by baseml and aaml. codonml uses another
1000
1058
routine InitializeCodon()
1002
char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs)), indel[]="-?";
1003
int wname=30, h,js,k, ig, nconstp, nc=com.ncode;
1060
char *pch = (com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs : (com.seqtype==5 ? BASEs5: BINs)));
1062
int wname=30, h,js,k, ig, nconstp, n=com.ncode;
1004
1063
int irf, nrf=20;
1005
1064
double pi0[20], t,lmax=0, X2G[2], *pisg; /* freq for species & gene, for X2 & G */
1007
1066
if(noisy) printf("Counting frequencies..");
1008
1067
if(fout) fprintf(fout,"\nFrequencies..");
1009
if((pisg=(double*)malloc(com.ns*nc*sizeof(double))) == NULL)
1068
if((pisg=(double*)malloc(com.ns*n*sizeof(double))) == NULL)
1010
1069
error2("oom pisg");
1011
1070
for(h=0,nconstp=0; h<com.npatt; h++) {
1012
1071
for (js=1; js<com.ns; js++)
1014
1073
if (js==com.ns && com.z[0][h]!=indel[0] && com.z[0][h]!=indel[1])
1015
1074
nconstp += (int)com.fpatt[h];
1017
for (ig=0,zero(com.pi,nc); ig<com.ngene; ig++) {
1076
for (ig=0,zero(com.pi,n); ig<com.ngene; ig++) {
1018
1077
if (com.ngene>1)
1019
1078
fprintf (fout,"\n\nGene %2d (len %4d)", ig+1, com.lgene[ig]-(ig==0?0:com.lgene[ig-1]));
1020
1079
fprintf(fout,"\n%*s",wname, "");
1021
for(k=0; k<nc; k++) fprintf(fout,"%7c", pch[k]);
1080
for(k=0; k<n; k++) fprintf(fout,"%7c", pch[k]);
1023
1082
/* The following block calculates freqs in each species for each gene.
1024
1083
Ambiguities are resolved in each species. com.pi and com.piG are
1025
1084
used for output only, and are not be used later with missing data.
1027
zero(com.piG[ig], nc);
1028
zero(pisg, com.ns*nc);
1086
zero(com.piG[ig], n);
1087
zero(pisg, com.ns*n);
1029
1088
for(js=0; js<com.ns; js++) {
1030
fillxc(pi0, 1.0/nc, nc);
1089
fillxc(pi0, 1.0/n, n);
1031
1090
for(irf=0; irf<nrf; irf++) {
1033
1092
AddFreqSeqGene(js, ig, pi0, com.pi);
1034
t = sum(com.pi, nc);
1036
1095
printf("Some sequences are empty.\n");
1037
fillxc(com.pi, 1.0/nc, nc);
1096
fillxc(com.pi, 1.0/n, n);
1040
abyx(1/t, com.pi, nc);
1041
if(com.cleandata || com.cleandata || (t=distance(com.pi,pi0,nc))<1e-8)
1099
abyx(1/t, com.pi, n);
1100
if(com.cleandata || com.cleandata || (t=distance(com.pi,pi0,n))<1e-8)
1043
xtoy(com.pi, pi0, nc);
1102
xtoy(com.pi, pi0, n);
1044
1103
} /* for(irf) */
1045
1104
fprintf(fout,"\n%-*s", wname, com.spname[js]);
1046
for(k=0; k<nc; k++) fprintf(fout, "%7.4f", com.pi[k]);
1047
for(k=0; k<nc; k++) com.piG[ig][k] += com.pi[k]/com.ns;
1048
xtoy(com.pi, pisg+js*nc, nc);
1105
for(k=0; k<n; k++) fprintf(fout, "%8.5f", com.pi[k]);
1106
for(k=0; k<n; k++) com.piG[ig][k] += com.pi[k]/com.ns;
1107
xtoy(com.pi, pisg+js*n, n);
1049
1108
} /* for(js,ns) */
1050
1109
if(com.ngene>1) {
1051
1110
fprintf(fout,"\n\n%-*s",wname,"Mean");
1052
for(k=0; k<nc; k++) fprintf(fout,"%7.4f",com.piG[ig][k]);
1111
for(k=0; k<n; k++) fprintf(fout,"%7.4f",com.piG[ig][k]);
1055
Chi2FreqHomo(pisg, com.ns, nc, X2G);
1114
Chi2FreqHomo(pisg, com.ns, n, X2G);
1057
1116
fprintf(fout,"\n\nHomogeneity statistic: X2 = %.5f G = %.5f ",X2G[0], X2G[1]);
1067
1126
each gene for com.piG[]). These are used in ML calculation later.
1069
1128
if(com.cleandata) {
1070
for (ig=0,zero(com.pi,nc); ig<com.ngene; ig++) {
1129
for (ig=0,zero(com.pi,n); ig<com.ngene; ig++) {
1071
1130
t = (ig==0 ? com.lgene[0] : com.lgene[ig]-com.lgene[ig-1])/(double)com.ls;
1072
for(k=0; k<nc; k++) com.pi[k] += com.piG[ig][k]*t;
1131
for(k=0; k<n; k++) com.pi[k] += com.piG[ig][k]*t;
1076
1135
for (ig=0; ig<com.ngene; ig++) {
1077
xtoy(com.piG[ig], pi0, nc);
1136
xtoy(com.piG[ig], pi0, n);
1078
1137
for(irf=0; irf<nrf; irf++) { /* com.piG[] */
1079
zero(com.piG[ig], nc);
1138
zero(com.piG[ig], n);
1080
1139
for(js=0; js<com.ns; js++)
1081
1140
AddFreqSeqGene(js, ig, pi0, com.piG[ig]);
1082
t = sum(com.piG[ig], nc);
1141
t = sum(com.piG[ig], n);
1084
1143
puts("empty sequences?");
1085
abyx(1/t, com.piG[ig], nc);
1086
if(distance(com.piG[ig], pi0, nc)<1e-8) break;
1087
xtoy(com.piG[ig], pi0, nc);
1144
abyx(1/t, com.piG[ig], n);
1145
if(distance(com.piG[ig], pi0, n)<1e-8) break;
1146
xtoy(com.piG[ig], pi0, n);
1088
1147
} /* for(irf) */
1089
1148
} /* for(ig) */
1091
for(k=0; k<nc; k++) for(ig=0; ig<com.ngene; ig++)
1150
for(k=0; k<n; k++) for(ig=0; ig<com.ngene; ig++)
1092
1151
pi0[k] += com.piG[ig][k]/com.ngene;
1093
1152
for(irf=0; irf<nrf; irf++) { /* com.pi[] */
1095
1154
for(ig=0; ig<com.ngene; ig++) for(js=0; js<com.ns; js++)
1096
1155
AddFreqSeqGene(js, ig, pi0, com.pi);
1097
abyx(1/sum(com.pi,nc), com.pi, nc);
1098
if(distance(com.pi, pi0, nc)<1e-8) break;
1099
xtoy(com.pi,pi0,nc);
1156
abyx(1/sum(com.pi,n), com.pi, n);
1157
if(distance(com.pi, pi0, n)<1e-8) break;
1158
xtoy(com.pi, pi0, n);
1100
1159
} /* for(ig) */
1102
1161
fprintf (fout, "\n\n%-*s", wname, "Average");
1103
for(k=0; k<nc; k++) fprintf(fout," %7.4f", com.pi[k]);
1162
for(k=0; k<n; k++) fprintf(fout,"%8.5f", com.pi[k]);
1104
1163
if(!com.cleandata) fputs("\n(Ambiguity characters are used to calculate freqs.)\n",fout);
1106
1165
fprintf (fout,"\n\n# constant sites: %6d (%.2f%%)",
1107
1166
nconstp, (double)nconstp*100./com.ls);
1109
1168
if (com.model==0 || (com.seqtype==BASEseq && com.model==1)) {
1110
fillxc(com.pi, 1./nc, nc);
1111
FOR(ig,com.ngene) xtoy (com.pi, com.piG[ig], nc);
1169
fillxc(com.pi, 1./n, n);
1170
for(ig=0; ig<com.ngene; ig++)
1171
xtoy(com.pi, com.piG[ig], n);
1113
1173
if (com.seqtype==BASEseq && com.model==5) { /* T92 model */
1114
com.pi[0]=com.pi[2]=(com.pi[0]+com.pi[2])/2;
1115
com.pi[1]=com.pi[3]=(com.pi[1]+com.pi[3])/2;
1174
com.pi[0] = com.pi[2] = (com.pi[0] + com.pi[2])/2;
1175
com.pi[1] = com.pi[3] = (com.pi[1] + com.pi[3])/2;
1116
1176
for(ig=0; ig<com.ngene; ig++) {
1117
1177
com.piG[ig][0] = com.piG[ig][2] = (com.piG[ig][0] + com.piG[ig][2])/2;
1118
1178
com.piG[ig][1] = com.piG[ig][3] = (com.piG[ig][1] + com.piG[ig][3])/2;
1122
1182
/* this is used only for REV & REVu in baseml and model==3 in aaml */
1123
1183
if(com.seqtype==AAseq) {
1124
for (k=0,t=0; k<nc; k++) t+=(com.pi[k]>0);
1184
for (k=0,t=0; k<n; k++) t += (com.pi[k]>0);
1126
1186
puts("\n\a\t\tAre these a.a. sequences?");
1128
1188
if(com.cleandata && com.ngene==1) {
1129
1189
for(h=0,lmax=-(double)com.ls*log((double)com.ls); h<com.npatt; h++)
1130
if(com.fpatt[h]>1) lmax+=com.fpatt[h]*log((double)com.fpatt[h]);
1190
if(com.fpatt[h]>1) lmax += com.fpatt[h]*log((double)com.fpatt[h]);
1133
1193
if(lmax) fprintf(fout, "\nln Lmax (unconstrained) = %.6f\n", lmax);
1190
1250
All characters in com.z[][] not found in the character string pch are
1191
1251
considered ambiguity characters and are removed.
1193
int h,k, j,js,lnew,nindel, n31,nchar;
1194
char b, *pch, *miss; /* miss[h]=1 if site (codon) h is missing, 0 otherwise */
1253
int n=com.ncode, h,k, j,js,lnew,nindel, n31=1;
1254
char b, *miss; /* miss[h]=1 if site (codon) h is missing, 0 otherwise */
1255
char *pch=((com.seqtype<=1||com.seqtype==CODON2AAseq)?BASEs:(com.seqtype==2?AAs: (com.seqtype==5?BASEs5:BINs)));
1196
if(com.seqtype==CODONseq||com.seqtype==CODON2AAseq)
1197
{ n31=3; nchar=4; pch=BASEs; }
1200
if(com.seqtype==AAseq) { nchar=20; pch=AAs; }
1201
else if(com.seqtype==BASEseq) { nchar=4; pch=BASEs; }
1202
else { nchar=2; pch=BINs; }
1257
if(com.seqtype==CODONseq || com.seqtype==CODON2AAseq) {
1205
1261
if (com.ls%n31) error2("ls in RemoveIndel.");
1206
1262
if((miss=(char*)malloc(com.ls/n31 *sizeof(char)))==NULL)
1207
1263
error2("oom miss");
1208
FOR (h,com.ls/n31) miss[h]=0;
1264
for(h=0; h<com.ls/n31; h++)
1209
1266
for (js=0; js<com.ns; js++) {
1210
1267
for (h=0,nindel=0; h<com.ls/n31; h++) {
1211
1268
for (k=0; k<n31; k++) {
1212
b=(char)toupper(com.z[js][h*n31+k]);
1213
FOR(j,nchar) if(b==pch[j]) break;
1214
if(j==nchar) { miss[h]=1; nindel++; }
1269
b = (char)toupper(com.z[js][h*n31+k]);
1271
if(b==pch[j]) break;
1273
miss[h]=1; nindel++;
1217
1277
if (noisy>2 && nindel)
1306
1367
If com.pose is not NULL, the routine also updates com.pose. This allows
1307
1368
the program to work if com.readpattern==1.
1309
char zh[NS], b, gap, *pch=(com.seqtype==0 ? BASEs : AAs);
1370
char zh[NS], b, gap;
1371
char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));
1310
1372
int npatt0=com.npatt, h, ht, j,k, same=0, ig, recode;
1312
1374
if(com.seqtype==1)
1313
1375
error2("PatternWeightJC69like does not work for codon seqs");
1314
1376
if(noisy) printf("Counting site patterns again, for JC69.\n");
1315
gap = strchr(pch, (int)'-') - pch;
1377
gap = (char) (strchr(pch, (int)'-') - pch);
1316
1378
for (h=0,com.npatt=0,ig=-1; h<npatt0; h++) {
1317
1379
if (ig<com.ngene-1 && h==com.posG[ig+1])
1318
1380
com.posG[++ig] = com.npatt;
1771
int printsmaCodon (FILE *fout,char * z[],int ns,int ls,int lline,int simple)
1834
int printsmaCodon (FILE *fout, unsigned char * z[],int ns,int ls,int lline,int simple)
1773
1836
/* print, in blocks, multiple aligned and transformed codon sequences.
1774
1837
indels removed.
1775
1838
This is needed as codons are coded 0,1, 2, ..., 60, and
1776
1839
printsma won't work.
1778
int ig, ngroup, lt, il,is, i,b, lspname=20;
1841
int ig, ngroup, lt, il,is, i,b, lspname=30;
1779
1842
char equal='.',*pz, c0[4],c[4];
1781
1844
if(ls==0) return(1);
1964
int EigenQREVbase (FILE* fout, double kappa[],
1965
double pi[], int *nR, double Root[], double Cijk[])
2027
int eigenQREVbase (FILE* fout, double kappa[], double pi[], int *nR, double Root[], double Cijk[])
1967
2029
/* pi[] is constant
1969
int i,j,k, nr=(com.ngene>1&&com.Mgene>=3?com.nrate/com.ngene:com.nrate);
1970
double Q[16], U[16], V[16], mr, space_pisqrt[4];
2031
int n=com.ncode, i,j,k;
2032
int nr = (com.ngene>1 && com.Mgene>=3 ? com.nrate/com.ngene : com.nrate);
2033
double Q[NCODE*NCODE], U[NCODE*NCODE], V[NCODE*NCODE], mr, space_pisqrt[NCODE*NCODE];
1975
2038
if(com.model==REV) {
1976
for(i=0,k=0,Q[3*4+2]=Q[2*4+3]=1; i<3; i++) for (j=i+1; j<4; j++)
1977
if(i*4+j!=11) Q[i*4+j]=Q[j*4+i]=kappa[k++];
2039
if(n!=4) error2("ncode != 4 for REV");
2040
Q[3*n+2] = Q[2*n+3] = 1; /* r_AG = r_GA = 1. */
2041
for(i=0,k=0; i<n-1; i++) for (j=i+1; j<n; j++)
2043
Q[i*n+j] = Q[j*n+i] = kappa[k++];
1979
2045
else /* (model==REVu) */
1980
FOR(i,3) for(j=i+1; j<4; j++)
1981
Q[i*4+j]=Q[j*4+i] = (StepMatrix[i*4+j] ? kappa[StepMatrix[i*4+j]-1] : 1);
1983
FOR(i,4) FOR(j,4) Q[i*4+j] *= pi[j];
1985
for (i=0,mr=0; i<4; i++)
1986
{ Q[i*4+i]=0; Q[i*4+i]=-sum(Q+i*4, 4); mr-=pi[i]*Q[i*4+i]; }
2046
for(i=0; i<n-1; i++) for(j=i+1; j<n; j++)
2047
Q[i*n+j]=Q[j*n+i] = (StepMatrix[i*n+j] ? kappa[StepMatrix[i*n+j]-1] : 1);
2049
for(i=0; i<n; i++) for(j=0; j<n; j++)
2052
for (i=0,mr=0; i<n; i++) {
2054
Q[i*n+i] = -sum(Q+i*n, n);
2055
mr -= pi[i]*Q[i*n+i];
1990
mr=2*(pi[0]*Q[0*4+1]+pi[2]*Q[2*4+3]);
2060
mr = 2*pi[0]*Q[0*n+1] + 2*pi[2]*Q[2*n+3];
1991
2061
if(com.nhomo==0) {
1992
2062
fprintf(fout, "\nRate parameters: ");
1993
2063
for(j=0; j<nr; j++)
1994
2064
fprintf(fout, " %8.5f", kappa[j]);
1995
2065
fprintf(fout, "\nBase frequencies: ");
1997
2067
fprintf(fout," %8.5f", pi[j]);
1999
2069
fprintf (fout, "\nRate matrix Q, Average Ts/Tv =%9.4f", mr/(1-mr));
2000
matout (fout, Q, 4,4);
2070
matout (fout, Q, n, n);
2003
eigenQREV(Q, pi, 4, Root, U, V, space_pisqrt);
2004
FOR (i,4) FOR(j,4) FOR(k,4) Cijk[i*4*4+j*4+k] = U[i*4+k]*V[k*4+j];
2073
eigenQREV(Q, pi, n, Root, U, V, space_pisqrt);
2074
for(i=0; i<n; i++) for(j=0; j<n; j++) for(k=0; k<n; k++)
2075
Cijk[i*n*n+j*n+k] = U[i*n+k]*V[k*n+j];
2012
2083
/* This constructs the rate matrix Q for the unrestricted model.
2013
2084
pi[] is changed in the routine.
2016
double mr, space[20];
2086
int n=com.ncode, i,j,k;
2087
double mr, ts, space[20];
2018
2089
if(com.model==UNREST) {
2019
for (i=0,k=0,Q[14]=1; i<4; i++) FOR(j,4)
2020
if (i!=j && i*4+j != 14) Q[i*4+j]=rate[k++];
2090
if(n!=4) error2("ncode != 4 for UNREST");
2091
for (i=0,k=0,Q[14]=1; i<n; i++) for(j=0; j<n; j++)
2092
if (i!=j && i*n+j != 14) Q[i*n+j] = rate[k++];
2022
2094
else /* (model==UNRESTu) */
2025
Q[i*4+j] = (StepMatrix[i*4+j] ? rate[StepMatrix[i*4+j]-1] : 1);
2095
for(i=0; i<n; i++) for(j=0; j<n; j++)
2097
Q[i*n+j] = (StepMatrix[i*n+j] ? rate[StepMatrix[i*n+j]-1] : 1);
2027
FOR(i,4) { Q[i*4+i]=0; Q[i*4+i]=-sum(Q+i*4, 4); }
2099
for(i=0; i<n; i++) {
2101
Q[i*n+i] = -sum(Q+i*n, n);
2031
QtoPi (Q, com.pi, 4, space);
2033
for (i=0,mr=0; i<4; i++) mr -= pi[i]*Q[i*4+i];
2034
for (i=0; i<4*4; i++) Q[i]/=mr;
2105
QtoPi(Q, com.pi, n, space);
2107
for (i=0,mr=0; i<n; i++) mr -= pi[i]*Q[i*n+i];
2108
for (i=0; i<n*n; i++) Q[i] /= mr;
2037
mr=pi[0]*Q[0*4+1]+pi[1]*Q[1*4+0]+pi[2]*Q[2*4+3]+pi[3]*Q[3*4+2];
2111
ts = pi[0]*Q[0*n+1] + pi[1]*Q[1*n+0] + pi[2]*Q[2*n+3] + pi[3]*Q[3*n+2];
2039
2113
fprintf(fout, "Rate parameters: ");
2040
FOR(j,com.nrate) fprintf(fout, " %8.5f", rate[j]);
2114
for(j=0; j<com.nrate; j++) fprintf(fout, " %8.5f", rate[j]);
2041
2115
fprintf(fout, "\nBase frequencies: ");
2042
FOR(j,4) fprintf(fout," %8.5f", pi[j]);
2043
fprintf (fout,"\nrate matrix Q, Average Ts/Tv (similar to kappa/2) =%9.4f",mr/(1-mr));
2044
matout (fout, Q, 4, 4);
2116
for(j=0; j<n; j++) fprintf(fout," %8.5f", pi[j]);
2118
fprintf (fout,"\nrate matrix Q, Average Ts/Tv (similar to kappa/2) =%9.4f", ts/(1-ts));
2120
fprintf (fout,"\nrate matrix Q");
2121
matout (fout, Q, n, n);
2458
2535
node number etc.
2460
2537
isname = 0: species number; 1: species name; 2:both number and name
2539
Ziheng note (18/12/2011): I have changed the code so that sequence number is not used
2540
anymore. isname = 1 always.
2462
2542
int cnode, cfather=-1; /* current node and father */
2463
2543
int inodeb=0; /* node number that will have the next branch length */
2464
int i,j,k, level=0, isname, ch=' ', icurspecies=0;
2544
int cladeLabels=0, i,j,k, level=0, isname, ch=' ', icurspecies=0;
2465
2545
char check[NS], delimiters[]="(),:#$=@><;", quote[]="\"\'";
2466
2546
int lline=32000;
2467
2547
char line[32000], *pch;
2563
else if (ch=='#'||ch=='<') { *haslabel=1; fscanf(ftree,"%lf",&nodes[inodeb].label); }
2564
else if (ch=='$') { *haslabel=1; fscanf(ftree,"%d",&CladeLabel[inodeb]); }
2565
else if (ch=='@'||ch=='=') {
2566
fscanf(ftree,"%lf",&nodes[inodeb].age);
2643
else if (ch=='#' || ch=='<') { *haslabel=1; fscanf(ftree, "%lf", &nodes[inodeb].label); }
2644
else if (ch=='$') { *haslabel=1; fscanf(ftree, "%d", &CladeLabel[inodeb]); }
2645
else if (ch=='@' || ch=='=') {
2646
fscanf(ftree,"%lf", &nodes[inodeb].age);
2567
2647
#if (defined(BASEML) || defined(CODEML))
2568
2648
if(com.clock) nodes[inodeb].fossil = 1;
2642
2727
: tree.nbranch;
2645
#if(defined(BASEML) || defined(CODEML))
2646
2731
/* check and convert clade labels $ */
2647
if(com.clock>1 || (com.seqtype==1 && com.model>=2)) {
2732
#if(defined(BASEML) || defined(CODEML))
2733
#if(defined(BASEML))
2734
if(com.seqtype==0 && com.nhomo==5) cladeLabels = 1;
2736
if(com.clock>1 || (com.seqtype==1 && com.model>=2)) cladeLabels = 1;
2648
2738
for(i=0,j=0; i<tree.nnode; i++) {
2649
2739
if(CladeLabel[i] != -1) j++;
2651
if(j) {/* j is number of clade labels */
2741
if(j) { /* j is number of clade labels */
2652
2742
DownTreeCladeLabel(tree.root, 0);
2655
for(i=0; i<tree.nnode; i++)
2656
if(i!=tree.root && nodes[i].label==-1) nodes[i].label = 0;
2745
/*** Somehow some labels are still -1 after this, so I changed this. Needs checking. ***/
2746
for(i=0; i<tree.nnode; i++)
2747
if(i!=tree.root && nodes[i].label==-1) nodes[i].label = 0;
2658
2749
/* OutTreeN(F0,1,PrBranch|PrNodeNum); FPN(F0); */
2659
/* FPN(F0); OutTreeN(F0,1,PrLabel); FPN(F0); */
2750
/* FPN(F0); OutTreeN(F0,1,PrLabel); FPN(F0); */
2661
2752
for(i=0,com.nbtype=0; i<tree.nnode; i++) {
2662
2753
if(i == tree.root) continue;
2663
2754
j = (int)nodes[i].label;
2664
if(j+1 > com.nbtype) com.nbtype=j+1;
2755
if(j+1 > com.nbtype) com.nbtype = j+1;
2665
2756
if(j<0 || j>tree.nbranch-1)
2666
2757
error2("branch label in the tree (note labels start from 0 and are consecutive)");
4062
4152
FOR(i,com.np) treebest.x[i]=x[i];
4063
4153
best=itree+1; improve=1;
4065
if (noisy) printf("%6d%2c %+8.6f",
4066
NFunCall,(status?'?':'X'),treestar.tree.lnL-tree.lnL);
4156
printf("%6d%2c %+8.6f", NFunCall,(status?'?':'X'),treestar.tree.lnL-tree.lnL);
4068
4158
fprintf(fsum, "%6d%2c", NFunCall, (status?'?':'X'));
4069
4159
for (i=com.ntime; i<com.np; i++) fprintf(fsum, "%7.3f", x[i]);
4643
4733
jackknife if(lsb<com.ls && com.ngene==1).
4644
4734
gmark[start+19] marks the position of the 19th site in that gene.
4646
int iboot,nboot=com.bootstrap, h,is,ig,lg[NGENE]={0},j, start;
4736
int iboot,nboot=com.bootstrap, h, is, ig, lg[NGENE]={0}, j, start;
4647
4737
int lsb=com.ls, n31=1,gap=10, gpos[NGENE];
4648
4738
int *sites=(int*)malloc(com.ls*sizeof(int)), *gmark=NULL;
4649
FILE *fseq=(FILE*)gfopen(seqf,"w");
4739
FILE *fseq=(FILE*)gfopen(seqf, "w");
4650
4740
enum {PAML=0, PAUP};
4651
char *dt=(com.seqtype==AAseq?"protein":"dna");
4652
char *paupstart="paupstart",*paupblock="paupblock",*paupend="paupend";
4653
int format=0; /* 0: paml-phylip; 1:paup-nexus */
4741
char *datatype = (com.seqtype==AAseq?"protein":"dna");
4742
char *paupstart="paupstart", *paupblock="paupblock", *paupend="paupend";
4743
int format=0; /* 0: paml-phylip; 1:paup-nexus */
4655
4745
if(com.readpattern) error2("work on bootstrapping pattern data.");
4703
4793
fprintf(fseq,"\n\n[Replicate # %d]\n", iboot+1);
4704
4794
fprintf(fseq,"\nbegin data;\n");
4705
4795
fprintf(fseq," dimensions ntax=%d nchar=%d;\n", com.ns, lsb*n31);
4706
fprintf(fseq," format datatype=%s missing=? gap=-;\n matrix\n",dt);
4796
fprintf(fseq," format datatype=%s missing=? gap=-;\n matrix\n", datatype);
4708
4798
for(is=0;is<com.ns;is++,FPN(fseq)) {
4709
4799
fprintf(fseq,"%-20s ", com.spname[is]);
5199
5290
Deals with node scaling to avoid underflows. See above
5200
5291
(Z. Yang, 2 Sept 2001)
5202
char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs)), *zanc;
5203
char str[4]="",codon[2][4]={" "," "}, aa[4]="";
5293
char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));
5294
char *zanc, str[4]="",codon[2][4]={" "," "}, aa[4]="";
5204
5295
char *sitepatt=(com.readpattern?"pattern":"site");
5205
5296
int n=com.ncode, inode, ic=0,b[3],i,j,k1=-1,k2=-1,c1,c2,k3, lsc=com.ls;
5206
5297
int lst=(com.readpattern?com.npatt:com.ls);
5517
5608
nonsynonymous changes are counted separately.
5518
5609
Added in Nov 2000.
5520
char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs));
5611
char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));
5521
5612
char codon[2][4]={" "," "};
5522
5613
int h,hp,inode,k1,k2,d, ls1=(com.readpattern?com.npatt:com.ls);
5523
5614
double S,N,Sd,Nd, S1,N1,Sd1,Nd1, b,btotal=0, p,C;
5850
5940
This outputs results by pattern. I tried to print results by sites, but gave up as
5851
5941
some variables use the same memory (e.g., combIndex) for different site patterns.
5853
char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs)),codon[4]="";
5943
char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));
5854
5945
int n=com.ncode,nintern=tree.nnode-com.ns, i,j,k,h,hp,igene;
5855
5946
int maxnson=0, maxncomb, lst=(com.readpattern?com.npatt:com.ls);
5856
5947
char *sitepatt=(com.readpattern?"pattern":"site");
6085
6175
for(h=pos0; h<pos1; h++) {
6086
6176
for(j=0,t=0;j<n;j++)
6087
if(nodes[inode].conP[h*n+j]>t) t=nodes[inode].conP[h*n+j];
6177
if(nodes[inode].conP[h*n+j]>t)
6178
t = nodes[inode].conP[h*n+j];
6090
for(j=0;j<n;j++) nodes[inode].conP[h*n+j]=1; /* both 0 and 1 fine */
6091
com.nodeScaleF[k*com.npatt+h]=-800; /* this is problematic? */
6182
nodes[inode].conP[h*n+j]=1; /* both 0 and 1 fine */
6183
com.nodeScaleF[k*com.npatt+h] = -800; /* this is problematic? */
6094
for(j=0;j<n;j++) nodes[inode].conP[h*n+j]/=t;
6095
com.nodeScaleF[k*com.npatt+h]=log(t);
6187
nodes[inode].conP[h*n+j]/=t;
6188
com.nodeScaleF[k*com.npatt+h] = log(t);
6369
6462
/* P(t) for branch leading to inode, called by routines ConditionalPNode()
6370
6463
and AncestralSeq() in baseml and codeml. x[] is not used by baseml.
6372
double space[16] = {0};
6466
double space[NCODE*NCODE*3] = {0};
6374
6468
if (com.model<=K80)
6375
PMatK80(Pt, t, (com.nhomo==2?nodes[inode].kappa:com.kappa));
6469
PMatK80(Pt, t, (com.nhomo==2 ? *nodes[inode].pkappa : com.kappa));
6377
6471
if (com.nhomo==2)
6378
EigenTN93(com.model,nodes[inode].kappa, -1, com.pi,&nR,Root,Cijk);
6379
else if (com.nhomo>2) /* need kappa on each node if fix_kappa ==0 */
6380
EigenTN93(com.model,nodes[inode].kappa, -1, nodes[inode].pi,&nR,Root,Cijk);
6472
eigenTN93(com.model, *nodes[inode].pkappa, -1, com.pi, &nR, Root, Cijk);
6473
else if (com.nhomo>2 && com.model<=TN93)
6474
eigenTN93(com.model, *nodes[inode].pkappa, *(nodes[inode].pkappa+1), nodes[inode].pi, &nR, Root, Cijk);
6475
else if (com.nhomo>2 && com.model==REV)
6476
eigenQREVbase(NULL, nodes[inode].pkappa, nodes[inode].pi, &nR, Root, Cijk);
6381
6478
if(com.model<=REV||com.model==REVu)
6384
6481
QUNREST(NULL, Pt, x+com.ntime+com.nrgene, com.pi);
6385
matexp (Pt, t, 4, 10, space);
6482
for(i=0; i<n*n; i++) Pt[i] *= t;
6483
matexp (Pt, n, 7, 5, space);
6422
6520
number of synonymous substitutions per codon) at their MLEs."
6424
6522
w = com.pomega[ib];
6425
EigenQcodon(0,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, w, Pt);
6523
eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, w, Pt);
6426
6524
Qfactor = Qfactor_NS_branch[ib];
6428
6526
else if (com.seqtype==CODONseq && (com.model==1 ||com.model==2) && com.nbtype<=nUVR) {
6435
6533
if(com.aaDist==AAClasses) { /* AAClass model */
6436
6534
com.pomega = PointOmega(x+com.ntime, -1, inode, -1);
6437
EigenQcodon(0,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, -1, Pt);
6535
eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, -1, Pt);
6439
6537
else if(com.nbtype>nUVR) { /* branch models, with more than 8 omega */
6440
EigenQcodon(0,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, nodes[inode].omega, Pt);
6538
eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, nodes[inode].omega, Pt);
6542
6640
else if(com.aaDist==11) com.pomega = xcom + k + com.ncatG - 1 + 4*iclass;
6543
6641
else if(com.aaDist==12) com.pomega = xcom + k + com.ncatG - 1 + 5*iclass;
6545
EigenQcodon(0,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, com.rK[iclass], PMat);
6643
eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, com.rK[iclass], PMat);
6941
7040
Note: At the end of the routine, nodes[].conP are not updated.
6943
7042
int ib,oldroot=tree.root, a,b;
6944
int icycle, maxcycle=1000, icycleb, ncycleb=10, i;
7043
int icycle, maxcycle=500, icycleb, ncycleb=10, i;
6945
7044
double lnL, lnL0=0, l0,l,dl,ddl=-1, t,t0,t00, p,step=1, small=1e-20,y;
6946
7045
double tb[2]={1e-8,50}, e=e_minbranches, *space=space_minbranches;
6947
7046
double *xcom=x+com.ntime; /* this is incorrect as com.ntime=0 */
7026
7125
Root = _Root[(int)nodes[a].label];
7029
EigenQcodon(0,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, nodes[a].omega, PMat);
7128
eigenQcodon(1, -1, NULL, NULL, NULL, Root, U, V, &mr, pkappa, nodes[a].omega, PMat);
7035
7134
if (com.nhomo==2)
7036
EigenTN93(com.model,nodes[a].kappa,1,com.pi,&nR,Root,Cijk);
7135
eigenTN93(com.model, *nodes[a].pkappa, 1, com.pi, &nR, Root, Cijk);
7105
7204
Root = _Root[(int)nodes[a].label];
7108
EigenQcodon(0,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, nodes[a].omega, PMat);
7207
eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, nodes[a].omega, PMat);
7114
7213
if (com.nhomo==2)
7115
EigenTN93(com.model,nodes[a].kappa,1,com.pi,&nR,Root,Cijk);
7214
eigenTN93(com.model, *nodes[a].pkappa, 1, com.pi, &nR, Root, Cijk);
7118
7217
*l = *dl = *ddl = 0;
7796
7895
int GenerateGtree (int locus)
7798
7897
/* construct the gene tree at locus by pruning tips in the master species
7799
tree. com.spname[] have names of species at the current locus and
7800
the routine use them to compare with sptree.nodes[].name to decide which
7801
species to keep for the locus. See GetSubTreeN() for more info.
7898
tree. com.spname[] have names of species at the current locus (probably read
7899
from the sequence alignment at the locus). They are used by the routine to compare
7900
with sptree.nodes[].name to decide which species to keep for the locus.
7901
See GetSubTreeN() for more details.
7803
7903
int ns=data.ns[locus], i,j, ipop[NS], keep[NS], newnodeNO[2*NS-1];
7805
for(j=0;j<sptree.nspecies;j++) keep[j]=0;
7905
for(j=0; j<sptree.nspecies; j++) keep[j]=0;
7806
7906
for(i=0;i<ns;i++) {
7807
7907
for(j=0;j<sptree.nspecies;j++)
7808
7908
if(!strcmp(com.spname[i], sptree.nodes[j].name)) break;
7912
8017
int ResetFinetuneSteps(FILE *fout, double Pjump[], double finetune[], int nsteps)
7914
8019
int j, verybadstep=0;
7915
double PjumpOpt = 0.30; /* this is the optimum for the 2NormalSym move. */
7916
double P0Change[2]={0.2, 0.5}; /* no change if in interval? */
7918
printf("\n\nCurrent Pjump: ");
7919
for(j=0; j<nsteps; j++)
7920
printf(" %8.5f", Pjump[j]);
7921
printf("\nCurrent finetune: ");
7922
for(j=0; j<nsteps; j++)
7923
printf(" %8.5f", finetune[j]);
8020
double PjumpOpt = 0.30; /* this is the optimum for the Bactrian move. */
8023
printf("\n\nCurrent Pjump: ");
8024
for(j=0; j<nsteps; j++)
8025
printf(" %8.5f", Pjump[j]);
8026
printf("\nCurrent finetune: ");
8027
for(j=0; j<nsteps; j++)
8028
printf(" %8.5f", finetune[j]);
7926
8031
fprintf(fout, "\nCurrent Pjump: ");
7927
8032
for(j=0; j<nsteps; j++)
8011
8113
GetDaa(NULL, com.daa);
8012
8114
if(com.model==Empirical_F)
8013
8115
xtoy(data.pi[locus], com.pi, nc);
8014
EigenQaa(NULL, _Root[locus], _UU[locus], _VV[locus], NULL);
8016
printf("Protein # %2d uses %-20s\n", locus+1,data.daafile[locus]);
8017
matout(F0, com.pi, 1, nc);
8018
matout(F0, _Root[locus], 1, nc);
8116
eigenQaa(NULL, _Root[locus], _UU[locus], _VV[locus], NULL);
8021
8119
else if(com.seqtype==1 && com.fix_kappa & com.fix_omega) {
8027
8125
com.kappa=data.kappa[locus];
8028
8126
com.omega=data.omega[locus];
8029
8127
xtoy(data.pi[locus], com.pi, com.ncode);
8030
EigenQcodon(0,-1,NULL,NULL,NULL, _Root[locus], _UU[locus], _VV[locus], &mr,
8128
eigenQcodon(1,-1,NULL,NULL,NULL, _Root[locus], _UU[locus], _VV[locus], &mr,
8031
8129
&com.kappa, com.omega, PMat);
8076
8174
if(com.seqtype==0 && com.model!=0 && com.model!=1)
8077
8175
xtoy(data.pi[locus], com.pi, com.ncode);
8078
8176
if(com.model<=TN93)
8079
EigenTN93(com.model, com.kappa, com.kappa, com.pi, &nR, Root, Cijk);
8177
eigenTN93(com.model, com.kappa, com.kappa, com.pi, &nR, Root, Cijk);
8080
8178
else if (com.model==REV)
8081
EigenQREVbase (NULL, &com.kappa, com.pi, &nR, Root, Cijk);
8179
eigenQREVbase (NULL, &com.kappa, com.pi, &nR, Root, Cijk);
8083
8181
if((com.seqtype==1 && com.codonf) || (com.seqtype==2 && com.model==3))
8084
8182
xtoy(data.pi[locus], com.pi, com.ncode);
8088
8186
Root=_Root[locus]; U=_UU[locus]; V=_VV[locus];
8091
EigenQcodon(0,-1,NULL,NULL,NULL,Root,U,V, &mr, &com.kappa, com.omega,PMat);
8189
eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, &com.kappa, com.omega,PMat);