4
Released under GPL - see the 'COPYING' file
6
Copyright (C) 2006 Timo Lassmann <timolassmann@gmail.com>
8
This program is free software; you can redistribute it and/or modify
9
it under the terms of the GNU General Public License as published by
10
the Free Software Foundation; either version 2 of the License, or
13
This program is distributed in the hope that it will be useful,
14
but WITHOUT ANY WARRANTY; without even the implied warranty of
15
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
GNU General Public License for more details.
18
You should have received a copy of the GNU General Public License
19
along with this program; if not, write to the Free Software
20
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22
Please send bug reports, comments etc. to:
23
timolassmann@gmail.com
30
void add_feature_information_from_alignment(int* path,int* fprof1,int* fprof2,int weight)
56
float* update2(const float* profa, const float* profb,float* newp,int* path,int sipa,int sipb,float internal_gap_weight)
62
gap_len = malloc(sizeof(int)* (path[0]+1));
65
//fprintf(stderr,"%d len,,,,\n",path[0]);
66
for(i = 1; i <= path[0];i++){
67
// fprintf(stderr,"%d,%d ",i,path[i]);
68
gap_len[i] = (path[i] >> 16);
69
path[i] = path[i] & 0x0000ffff;
70
// fprintf(stderr,"%d %d\n",path[i],gap_len[i]);
72
//gap_len[path[0]] = 0;
76
/*while(path[c] != 3){
77
fprintf(stderr,"%d %d %d\n",c,path[c],gap_len[c]);
87
while(!path[c] && path[c] != 3){
88
// fprintf(stderr,"Align %d %d\n",c,path[c]);
90
newp[i] = profa[i] + profb[i];
97
}else if (path[c] & 1){
98
//fprintf(stderr,"%d\n",gap_len[c]);
99
if(path[c] & 128){//N terminal gap !!!!!!1
100
for (i = 0; i < gap_len[c]-1;i++){
101
gap_cost += profb[29+64*i];
102
// fprintf(stderr,"i:%d %d\n",i,gap_cost);
104
gap_cost += profb[27+64*i];
105
// fprintf(stderr,"i:%d %d\n",i,gap_cost);
106
}else if(path[c] & 64){//c terminal gap !!!!!!1
107
// fprintf(stderr,"c terminal gap\n");
108
gap_cost += profb[27+64];
109
// fprintf(stderr,"i:%d %d\n",0,gap_cost);
110
for (i = 1; i < gap_len[c];i++){
111
gap_cost += profb[29+64*i];
112
// fprintf(stderr,"i:%d %d\n",i,gap_cost);
115
// fprintf(stderr,"middle gap\n");
116
gap_cost += profb[27+64];
117
// fprintf(stderr,"i:%d %d\n",0,gap_cost);
118
for (i = 1; i < gap_len[c]-1;i++){
119
gap_cost += profb[28+64*i];
120
// fprintf(stderr,"i:%d %d\n",i,gap_cost);
122
gap_cost += profb[27+64*i];
123
// fprintf(stderr,"i:%d %d\n",i,gap_cost);
125
//fprintf(stderr,"gap_A %d %d length:%d cost:%d\n",c,path[c],gap_len[c],gap_cost);
126
gap_cost /= gap_len[c];
127
gap_cost *= internal_gap_weight;
129
while(path[c] & 1 && path[c] != 3){
130
// fprintf(stderr,"gap_A %d %d cost:%d\n",c,path[c],gap_cost);
134
newp[23] += gap_cost;
135
for (i = 32; i < 55;i++){
142
}else if (path[c] & 2){
143
//fprintf(stderr,"%d\n",gap_len[c]);
144
if(path[c] & 128){//N terminal gap !!!!!!1
145
for (i = 0; i < gap_len[c]-1;i++){
146
gap_cost += profa[29+64*i];
147
// fprintf(stderr,"i:%d %d\n",i,gap_cost);
149
gap_cost += profa[27+64*i];
150
// fprintf(stderr,"i:%d %d\n",i,gap_cost);
151
}else if(path[c] & 64){//c terminal gap !!!!!!1
152
// fprintf(stderr,"c terminal gap\n");
153
gap_cost += profa[27+64];
154
// fprintf(stderr,"i:%d %d\n",c-1,gap_cost);
155
for (i = 1; i < gap_len[c];i++){
156
gap_cost += profa[29+64*i];
157
// fprintf(stderr,"i:%d %d\n",i,gap_cost);
160
// fprintf(stderr,"middle gap\n");
161
gap_cost += profa[27+64];
162
// fprintf(stderr,"i:%d %d\n",c-1,gap_cost);
163
for (i = 1; i < gap_len[c]-1;i++){
164
gap_cost += profa[28+64*i];
165
// fprintf(stderr,"i:%d %d\n",i,gap_cost);
167
gap_cost += profa[27+64*i];
168
// fprintf(stderr,"i:%d %d\n",i,gap_cost);
171
gap_cost /= gap_len[c];
173
gap_cost *= internal_gap_weight;
175
while(path[c] & 2 && path[c] != 3){
176
// fprintf(stderr,"gap_b %d %d cost:%d\n",c,path[c],gap_cost);
180
newp[23] += gap_cost;
181
for (i = 32;i < 55;i++){
191
newp[i] = profa[i] + profb[i];
201
void smooth_gaps(float* prof,int len,int window,float strength)
210
for ( i = (window/2); i < len - (window/2);i++){
214
for (j = -(window/2); j < (window/2);j++){
215
tmp_gpo += (float)prof[27+((i+j)*64)]*strength;
216
tmp_gpe += (float) prof[28+((i+j)*64)]*strength;
217
tmp_tgpe += (float) prof[29+((i+j)*64)]*strength;
222
prof[27+(i*64)] = prof[27+(i*64)]*(1.0-strength) + tmp_gpo;
223
prof[28+(i*64)] = prof[28+(i*64)]*(1.0-strength) + tmp_gpe;
224
prof[29+(i*64)] = prof[29+(i*64)]*(1.0-strength) + tmp_tgpe;
229
void increase_gaps(float* prof,int len,int window,float strength)
236
mod = malloc(sizeof(float)*window);
237
for ( i = 0; i < window;i++){
238
mod[i] = (strength - i*(float)strength / (float) window) - (0.5*strength);
241
for ( i = 0; i < len;i++){
242
// // fprintf(stderr,"(%0.2f:%0.2f) ",prof[26],prof[23]);
250
for ( i = 0; i < len;i++){
254
start_pos = i-window;
256
c = start_pos + window;
262
prof[26 - (64*(j+1))] += mod[j];
270
//fprintf(stderr,"%d %d\n",i,c);
271
for (j = 0;j < c;j++){
272
prof[26 +(64*(j+1))] += mod[j];
279
for ( i = 0; i < len;i++){
280
// // fprintf(stderr,"(%0.2f:%0.2f) ",prof[26],prof[23]);
281
prof[27] = prof[27] * (prof[26]+1.0);
282
prof[28] = prof[28] * (prof[26]+1.0);
283
prof[29] = prof[29] * (prof[26]+1.0);
292
void set_gap_penalties2(float* prof,int len,int nsip,int window,float strength)
301
prof[27] = prof[55]*nsip*-gpo;
302
prof[28] = prof[55]*nsip*-gpe;
303
prof[29] = prof[55]*nsip*-tgpe;
308
prof[27] = prof[55]*nsip*-gpo;
309
prof[28] = prof[55]*nsip*-gpe;
311
prof[29] = prof[55]*nsip*-tgpe;
318
for ( i = (window/2); i < len - (window/2);i++){
322
for (j = -(window/2); j < (window/2);j++){
323
tmp_gpo += (float)prof[27+((i+j)*64)]*strength;
324
tmp_gpe += (float) prof[28+((i+j)*64)]*strength;
325
tmp_tgpe += (float) prof[29+((i+j)*64)]*strength;
330
prof[27+(i*64)] = prof[27+(i*64)]*(1-strength) + tmp_gpo;
331
prof[28+(i*64)] = prof[28+(i*64)]*(1-strength) + tmp_gpe;
332
prof[29+(i*64)] = prof[29+(i*64)]*(1-strength) + tmp_tgpe;
335
/*for ( i = 2; i < len-2;i++){
336
prof[27+(i*64)] = (prof[27+((i-2)*64)] +prof[27+((i-1)*64)] + prof[27+(i*64)] + prof[27+((i+1)*64)] +prof[27+((i+2)*64)])/ 5;
338
/* for ( i = 2; i < len-2;i++){
339
prof[28+(i*64)] = (prof[28+((i-2)*64)] + prof[28+((i-1)*64)] + prof[28+(i*64)] + prof[28+((i+1)*64)] +prof[28+((i+2)*64)])/ 5;
341
for ( i = 2; i < len-2;i++){
342
prof[29+(i*64)] = (prof[29+((i-2)*64)] + prof[29+((i-1)*64)] + prof[29+(i*64)] + prof[29+((i+1)*64)] +prof[29+((i+2)*64)])/ 5;
349
float* make_profile2(float* prof, int* seq,int len,float** subm)
352
prof = malloc(sizeof(float)*(len+1)*64);
355
for (i = 0;i < 64;i++){
364
for (j = 0;j < 64;j++){
373
prof[j] = subm[c][j];
384
float* feature_update(const float* profa, const float* profb,float* newp,int* path,int stride)
390
for (i = stride; i--;){
391
newp[i] = profa[i] + profb[i];
397
for (i = stride; i--;){
404
for (i = stride; i--;){
412
for (i = stride; i--;){
413
newp[i] = profa[i] + profb[i];
415
newp -= path[0] *stride;
419
float* make_wu_profile(float* prof,float* wu,int len)
424
prof = malloc(sizeof(float)*(len+1)*2);
427
for (i = 0;i < (len+1)*2;i++){
430
for (i = 0; i < len;i++){
436
prof[i<<1] = wu[i]+1;
437
prof[(i<<1)+1] = wu[i]+1;
444
float* make_feature_profile(float* prof,struct feature* f,int len,struct feature_matrix* fm)
449
prof = malloc(sizeof(int)*(len+1)*fm->stride);
452
for (i = 0;i < (len+1)*fm->stride;i++){
458
if(f->start < len && f->end < len){
459
for (i = f->start-1;i < f->end;i++){
460
prof[i*fm->stride + f->color] += 1;
461
for ( j =fm->mdim ;j < fm->stride;j++){
462
prof[i*fm->stride+j] += fm->m[f->color][j-fm->mdim];
474
float* make_profile(float* prof, int* seq,int len, float** subm)
477
prof = malloc(sizeof(float)*(len+2)*64);
478
prof += (64 *(len+1));
480
for (i = 0;i < 64;i++){
492
for (j = 0;j < 64;j++){
502
prof[j] = subm[c][j];
511
for (i = 0;i < 64;i++){
520
float* dna_make_profile(float* prof, int* seq,int len,float** subm)
521
//int* make_profile(int* prof, int* seq,int len)
524
prof = malloc(sizeof(float)*(len+2)*22);
525
prof += (22 *(len+1));
526
//fprintf(stderr,"Len:%d %d\n",len,64*len);
528
for (i = 0;i < 22;i++){
539
//fprintf(stderr,"-64\n");
540
//for (j = 64; j--;){
541
for (j = 0;j < 22;j++){
553
prof[j] = subm[c][j];
561
for (i = 0;i < 22;i++){
574
float* update(const float* profa, const float* profb,float* newp,int* path,int sipa,int sipb)
578
newp[i] = profa[i] + profb[i];
588
//Idea: limit the 'virtual' number of residues of one type to x.
589
// i.e. only allow a maximum of 10 alanines to be registered in each column
590
// the penalty for aligning a 'G' to this column will stay stable even when many (>10) alanines are present.
591
// the difference in score between the 'correct' (all alanine) and incorrect (alanines + glycine) will not increase
592
// with the number of sequences. -> see Durbin pp 140
595
//fprintf(stderr,"Align %d\n",c);
597
newp[i] = profa[i] + profb[i];
606
//fprintf(stderr,"Gap_A:%d\n",c);
607
//printf("open:%d ext:%d %d %d\n",si->nsip[a] * gpo,si->nsip[a] * gpe,si->nsip[a] * profb[41],si->nsip[a] * profb[46]);
615
newp[25] += sipa;//1;
618
newp[24] += sipa;//1;
622
for (j = 32; j < 55;j++){
627
// fprintf(stderr,"close_open");
629
newp[25] += sipa;//1;
631
newp[23] += sipa;//1;
634
newp[23] += sipa;//1;
638
for (j = 32; j < 55;j++){
643
// fprintf(stderr,"Gap_open");
645
newp[25] += sipa;//1;
647
newp[23] += sipa;//1;
650
newp[23] += sipa;//1;
653
for (j = 32; j < 55;j++){
663
//fprintf(stderr,"Gap_B:%d\n",c);
664
//printf("open:%d ext:%d %d %d\n",si->nsip[b] * gpo,si->nsip[b] * gpe,profa[26],profa[27]);
672
newp[25] += sipb;//1;
675
newp[24] += sipb;//1;
678
for (j = 32; j < 55;j++){
683
// fprintf(stderr,"close_open");
685
newp[25] += sipb;//1;
687
newp[23] += sipb;//1;
690
newp[23] += sipb;//1;
693
for (j = 32; j < 55;j++){
698
// fprintf(stderr,"Gap_open");
700
newp[25] += sipb;//1;
702
newp[23] += sipb;//1;
705
newp[23] += sipb;//1;
709
for (j = 32; j < 55;j++){
721
newp[i] = profa[i] + profb[i];
723
newp -= (path[0]+1) *64;
727
float* dna_update(const float* profa, const float* profb, float* newp,int* path,int sipa,int sipb)
732
newp[i] = profa[i] + profb[i];
742
//Idea: limit the 'virtual' number of residues of one type to x.
743
// i.e. only allow a maximum of 10 alanines to be registered in each column
744
// the penalty for aligning a 'G' to this column will stay stable even when many (>10) alanines are present.
745
// the difference in score between the 'correct' (all alanine) and incorrect (alanines + glycine) will not increase
746
// with the number of sequences. -> see Durbin pp 140
749
//fprintf(stderr,"Align %d\n",c);
751
newp[i] = profa[i] + profb[i];
759
//fprintf(stderr,"Gap_A:%d\n",c);
760
//printf("open:%d ext:%d %d %d\n",si->nsip[a] * gpo,si->nsip[a] * gpe,si->nsip[a] * profb[41],si->nsip[a] * profb[46]);
774
for (j = 11; j < 16;j++){
779
// fprintf(stderr,"close_open");
790
for (j = 11; j < 16;j++){
795
// fprintf(stderr,"Gap_open");
805
for (j = 11; j < 16; j++){
814
//fprintf(stderr,"Gap_B:%d\n",c);
815
//printf("open:%d ext:%d %d %d\n",si->nsip[b] * gpo,si->nsip[b] * gpe,profa[26],profa[27]);
828
for (j = 11; j < 16;j++){
833
// fprintf(stderr,"close_open");
843
for (j = 11; j < 16;j++){
848
// fprintf(stderr,"Gap_open");
859
for (j = 11; j < 16;j++){
870
newp[i] = profa[i] + profb[i];
872
newp -= (path[0]+1) *22;
877
void dna_set_gap_penalties(float* prof,int len,int nsip)
881
prof += (22 *(len+1));
882
prof[8] = prof[16]*nsip;//gap open or close
883
prof[9] = prof[17]*nsip;//gap extention
885
prof[10] = prof[18]*nsip;//gap open or close
886
//prof[30] = prof[58]*nsip;//gap extention
892
prof[8] = prof[16]*nsip;//gap open or close
893
prof[9] = prof[17]*nsip;//gap extention
895
prof[10] = prof[18]*nsip;//gap open or close
896
// prof[30] = prof[58]*nsip;//gap extention
900
void set_gap_penalties(float* prof,int len,int nsip)
904
prof += (64 *(len+1));
905
prof[27] = prof[55]*nsip;//gap open or close
906
prof[28] = prof[56]*nsip;//gap extention
908
prof[29] = prof[57]*nsip;//gap open or close
912
prof[27] = prof[55]*nsip;//gap open or close
913
prof[28] = prof[56]*nsip;//gap extention
915
prof[29] = prof[57]*nsip;//gap open or close