1
/*****************************************************************
2
* Unipro UGENE - Integrated Bioinformatics Suite
3
* Copyright (C) 2008 Unipro, Russia (http://ugene.unipro.ru)
6
* This source code is distributed under the terms of the
7
* GNU General Public License. See the files COPYING and LICENSE
9
*****************************************************************/
11
/*****************************************************************
12
* HMMER - Biological sequence analysis with profile HMMs
13
* Copyright (C) 1992-2003 Washington University School of Medicine
16
* This source code is distributed under the terms of the
17
* GNU General Public License. See the files COPYING and LICENSE
19
*****************************************************************/
22
* SRE, Thu Mar 3 07:56:01 1994
24
* Calculate weights for sequences in an alignment.
25
* RCS $Id: weight.c,v 1.11 2003/04/14 16:00:16 eddy Exp $
31
static void upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node);
32
static void downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt,
33
float *fwt, int node);
34
static float simple_distance(char *s1, char *s2);
35
static int simple_diffmx(char **aseqs,int num, float ***ret_dmx);
37
/* Function: GSCWeights()
39
* Purpose: Use Erik's tree-based algorithm to set weights for
40
* sequences in an alignment. upweight() and downweight()
41
* are derived from Graeme Mitchison's code.
43
* Args: aseq - array of (0..nseq-1) aligned sequences
44
* nseq - number of seqs in alignment
45
* alen - length of alignment
46
* wgt - allocated [0..nseq-1] array of weights to be returned
52
GSCWeights(char **aseq, int nseq, int alen, float *wgt)
54
float **dmx; /* distance (difference) matrix */
56
float *lwt, *rwt; /* weight on left, right of this tree node */
57
float *fwt; /* final weight assigned to this node */
62
if (nseq == 1) { wgt[0] = 1.0; return; }
64
/* I use a simple fractional difference matrix derived by
65
* pairwise identity. Perhaps I should include a Poisson
66
* distance correction.
68
MakeDiffMx(aseq, nseq, &dmx);
69
if (! Cluster(dmx, nseq, CLUSTER_MIN, &tree)) Die("Cluster() failed");
73
lwt = (float*)MallocOrDie (sizeof(float) * (2 * nseq - 1));
74
rwt = (float*)MallocOrDie (sizeof(float) * (2 * nseq - 1));
75
fwt = (float*)MallocOrDie (sizeof(float) * (2 * nseq - 1));
77
/* lwt and rwt are the total branch weight to the left and
78
* right of a node or sequence. They are 0..2N-2. 0..N-1 are
79
* the sequences; these have weight 0. N..2N-2 are the actual
82
for (i = 0; i < nseq; i++)
83
lwt[i] = rwt[i] = 0.0;
84
/* recursively calculate rwt, lwt, starting
85
at node nseq (the root) */
86
upweight(tree, nseq, lwt, rwt, nseq);
88
/* recursively distribute weight across the
91
downweight(tree, nseq, lwt, rwt, fwt, nseq);
92
/* collect the weights */
93
for (i = 0; i < nseq; i++)
97
FreePhylo(tree, nseq);
98
free(lwt); free(rwt); free(fwt);
102
upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node)
106
ld = tree[node-nseq].left;
107
if (ld >= nseq) upweight(tree, nseq, lwt, rwt, ld);
108
rd = tree[node-nseq].right;
109
if (rd >= nseq) upweight(tree, nseq, lwt, rwt, rd);
110
lwt[node] = lwt[ld] + rwt[ld] + tree[node-nseq].lblen;
111
rwt[node] = lwt[rd] + rwt[rd] + tree[node-nseq].rblen;
116
downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, float *fwt, int node)
121
ld = tree[node-nseq].left;
122
rd = tree[node-nseq].right;
123
if (lwt[node] + rwt[node] > 0.0)
125
fwt[ld] = fwt[node] * (lwt[node] / (lwt[node] + rwt[node]));
126
fwt[rd] = fwt[node] * (rwt[node] / (lwt[node] + rwt[node]));
130
lnum = (ld >= nseq) ? tree[ld-nseq].incnum : 1.0;
131
rnum = (rd >= nseq) ? tree[rd-nseq].incnum : 1.0;
132
fwt[ld] = fwt[node] * lnum / (lnum + rnum);
133
fwt[rd] = fwt[node] * rnum / (lnum + rnum);
136
if (ld >= nseq) downweight(tree, nseq, lwt, rwt, fwt, ld);
137
if (rd >= nseq) downweight(tree, nseq, lwt, rwt, fwt, rd);
143
/* Function: VoronoiWeights()
145
* Purpose: Calculate weights using the scheme of Sibbald &
146
* Argos (JMB 216:813-818 1990). The scheme is
147
* slightly modified because the original algorithm
148
* actually doesn't work on gapped alignments.
149
* The sequences are assumed to be protein.
151
* Args: aseq - array of (0..nseq-1) aligned sequences
152
* nseq - number of sequences
153
* alen - length of alignment
154
* wgt - allocated [0..nseq-1] array of weights to be returned
160
VoronoiWeights(char **aseq, int nseq, int alen, float *wgt)
162
float **dmx; /* distance (difference) matrix */
163
float *halfmin; /* 1/2 minimum distance to other seqs */
164
char **psym; /* symbols seen in each column */
165
int *nsym; /* # syms seen in each column */
166
int symseen[27]; /* flags for observed syms */
167
char *randseq; /* randomly generated sequence */
168
int acol; /* pos in aligned columns */
169
int idx; /* index in sequences */
170
int symidx; /* 0..25 index for symbol */
171
int i; /* generic counter */
172
float min; /* minimum distance */
173
float dist; /* distance between random and real */
174
float challenge, champion; /* for resolving ties */
175
int itscale; /* how many iterations per seq */
177
int best; /* index of nearest real sequence */
179
/* Sanity check first
181
if (nseq == 1) { wgt[0] = 1.0; return; }
185
/* Precalculate 1/2 minimum distance to other
186
* sequences for each sequence
188
if (! simple_diffmx(aseq, nseq, &dmx))
189
Die("simple_diffmx() failed");
190
halfmin = (float*)MallocOrDie (sizeof(float) * nseq);
191
for (idx = 0; idx < nseq; idx++)
193
for (min = 1.0, i = 0; i < nseq; i++)
195
if (i == idx) continue;
196
if (dmx[idx][i] < min) min = dmx[idx][i];
198
halfmin[idx] = min / 2.0;
200
Free2DArray((void **) dmx, nseq);
202
/* Set up the random sequence generating model.
204
psym = (char**)MallocOrDie (alen * sizeof(char *));
205
nsym = (int*)MallocOrDie (alen * sizeof(int));
206
for (acol = 0; acol < alen; acol++)
207
psym[acol] = (char*)MallocOrDie (27 * sizeof(char));
209
/* #ifdef ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
210
for (acol = 0; acol < alen; acol++)
212
memset(symseen, 0, sizeof(int) * 27);
213
for (idx = 0; idx < nseq; idx++)
214
if (! isgap(aseq[idx][acol]))
216
if (isupper((int) aseq[idx][acol]))
217
symidx = aseq[idx][acol] - 'A';
219
symidx = aseq[idx][acol] - 'a';
220
if (symidx >= 0 && symidx < 26)
224
symseen[26] = 1; /* a gap */
226
for (nsym[acol] = 0, i = 0; i < 26; i++)
229
psym[acol][nsym[acol]] = 'A'+i;
232
if (symseen[26]) { psym[acol][nsym[acol]] = ' '; nsym[acol]++; }
234
/* #endif ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
236
/* Note: the original Sibbald&Argos algorithm calls for
237
* bounding the sampled space using a template-like random
238
* sequence generator. However, this leads to one minor
239
* and one major problem. The minor problem is that
240
* exceptional amino acids in a column can have a
241
* significant effect by altering the amount of sampled
242
* sequence space; the larger the data set, the worse
243
* this problem becomes. The major problem is that
244
* there is no reasonable way to deal with gaps.
245
* Gapped sequences simply inhabit a different dimensionality
246
* and it's pretty painful to imagine calculating Voronoi
247
* volumes when the N in your N-space is varying.
248
* Note that all the examples shown by Sibbald and Argos
249
* are *ungapped* examples.
251
* The best way I've found to circumvent this problem is
252
* just not to bound the sampled space; count gaps as
253
* symbols and generate completely random sequences.
255
#ifdef ALL_SEQUENCE_SPACE
256
for (acol = 0; acol < alen; acol++)
258
strcpy(psym[acol], "ACDEFGHIKLMNPQRSTVWY ");
263
/* Sibbald and Argos algorithm:
264
* 1) assign all seqs weight 0.
265
* 2) generate a "random" sequence
266
* 3) calculate distance to every other sequence
267
* (if we get a distance < 1/2 minimum distance
268
* to other real seqs, we can stop)
269
* 4) if unique closest sequence, increment its weight 1.
270
* if multiple closest seq, choose one randomly
271
* 5) repeat 2-4 for lots of iterations
272
* 6) normalize all weights to sum to nseq.
274
randseq = (char*)MallocOrDie ((alen+1) * sizeof(char));
276
best = 42.; /* solely to silence GCC uninit warnings. */
277
FSet(wgt, nseq, 0.0);
278
for (iteration = 0; iteration < itscale * nseq; iteration++)
280
for (acol = 0; acol < alen; acol++)
281
randseq[acol] = (nsym[acol] == 0) ? ' ' : psym[acol][CHOOSE(nsym[acol])];
282
randseq[acol] = '\0';
284
champion = sre_random();
285
for (min = 1.0, idx = 0; idx < nseq; idx++)
287
dist = simple_distance(aseq[idx], randseq);
288
if (dist < halfmin[idx])
294
{ champion = sre_random(); best = idx; min = dist; }
295
else if (dist == min)
297
challenge = sre_random();
298
if (challenge > champion)
299
{ champion = challenge; best = idx; min = dist; }
305
for (idx = 0; idx < nseq; idx++)
306
wgt[idx] = wgt[idx] / (float) itscale;
311
Free2DArray((void **) psym, alen);
315
/* Function: simple_distance()
317
* Purpose: For two identical-length null-terminated strings, return
318
* the fractional difference between them. (0..1)
319
* (Gaps don't count toward anything.)
322
simple_distance(char *s1, char *s2)
327
for (; *s1 != '\0'; s1++, s2++)
329
if (isgap(*s1) || isgap(*s2)) continue;
330
if (*s1 != *s2) diff++;
333
return (valid > 0 ? ((float) diff / (float) valid) : 0.0);
336
/* Function: simple_diffmx()
338
* Purpose: Given a set of flushed, aligned sequences, construct
339
* an NxN fractional difference matrix using the
340
* simple_distance rule.
342
* Args: aseqs - flushed, aligned sequences
343
* num - number of aseqs
344
* ret_dmx - RETURN: difference matrix (caller must free)
346
* Return: 1 on success, 0 on failure.
349
simple_diffmx(char **aseqs,
353
float **dmx; /* RETURN: distance matrix */
354
int i,j; /* counters over sequences */
358
if ((dmx = (float **) malloc (sizeof(float *) * num)) == NULL)
359
Die("malloc failed");
360
for (i = 0; i < num; i++)
361
if ((dmx[i] = (float *) malloc (sizeof(float) * num)) == NULL)
362
Die("malloc failed");
364
/* Calculate distances, symmetric matrix
366
for (i = 0; i < num; i++)
367
for (j = i; j < num; j++)
368
dmx[i][j] = dmx[j][i] = simple_distance(aseqs[i], aseqs[j]);
378
/* Function: BlosumWeights()
379
* Date: SRE, Fri Jul 16 17:33:59 1999 (St. Louis)
381
* Purpose: Assign weights to a set of aligned sequences
382
* using the BLOSUM rule:
383
* - do single linkage clustering at some pairwise identity
384
* - in each cluster, give each sequence 1/clustsize
387
* The clusters have no pairwise link >= maxid.
389
* O(N) in memory. Probably ~O(NlogN) in time; O(N^2)
390
* in worst case, which is no links between sequences
391
* (e.g., values of maxid near 1.0).
393
* Args: aseqs - alignment
394
* nseq - number of seqs in alignment
395
* alen - # of columns in alignment
396
* maxid - fractional identity (e.g. 0.62 for BLOSUM62)
397
* wgt - [0..nseq-1] array of weights to be returned
400
BlosumWeights(char **aseqs, int nseq, int alen, float maxid, float *wgt)
403
int *nmem; /* number of seqs in each cluster */
404
int i; /* loop counter */
406
SingleLinkCluster(aseqs, nseq, alen, maxid, &c, &nc);
408
FSet(wgt, nseq, 1.0);
409
nmem = (int*)MallocOrDie(sizeof(int) * nc);
411
for (i = 0; i < nc; i++) nmem[i] = 0;
412
for (i = 0; i < nseq; i++) nmem[c[i]]++;
413
for (i = 0; i < nseq; i++) wgt[i] = 1. / (float) nmem[c[i]];
421
/* Function: PositionBasedWeights()
422
* Date: SRE, Fri Jul 16 17:47:22 1999 [St. Louis]
424
* Purpose: Implementation of Henikoff and Henikoff position-based
425
* weights (JMB 243:574-578, 1994) [Henikoff94b].
427
* A significant advantage of this approach that Steve and Jorja
428
* don't point out is that it is O(N) in memory, unlike
429
* many other approaches like GSC weights or Voronoi.
431
* A potential disadvantage that they don't point out
432
* is that in the theoretical limit of infinite sequences
433
* in the alignment, weights go flat: eventually every
434
* column has at least one representative of each of 20 aa (or 4 nt)
437
* They also don't give a rule for how to handle gaps.
438
* The rule used here seems the obvious and sensible one
439
* (ignore them). This means that longer sequences
440
* initially get more weight; hence a "double
441
* normalization" in which the weights are first divided
442
* by sequence length (to compensate for that effect),
443
* then normalized to sum to nseq.
446
* Implemented in a way that's alphabet-independent:
447
* it uses the 26 upper case letters as "residues".
448
* Any alphabetic character in aseq is interpreted as
449
* a unique "residue" (case insensitively; lower case
450
* mapped to upper case). All other characters are
451
* interpreted as gaps.
453
* This way, we don't have to pass around any alphabet
454
* type info (DNA vs. RNA vs. protein) and don't have
455
* to deal with remapping IUPAC degenerate codes
456
* probabilistically. However, on the down side,
457
* a sequence with a lot of degenerate IUPAC characters
458
* will get an artifactually high PB weight.
460
* Args: aseq - sequence alignment to weight
461
* nseq - number of sequences in alignment
462
* alen - length of alignment
463
* wgt - RETURN: weights filled in (pre-allocated 0..nseq-1)
466
* wgt is allocated (0..nseq-1) by caller, and filled in here.
469
PositionBasedWeights(char **aseq, int nseq, int alen, float *wgt)
471
int rescount[26]; /* count of A-Z residues in a column */
472
int nres; /* number of different residues in col */
473
int idx, pos; /* indices into aseq */
477
FSet(wgt, nseq, 0.0);
478
for (pos = 0; pos < alen; pos++)
480
for (x = 0; x < 26; x++) rescount[x] = 0;
481
for (idx = 0; idx < nseq; idx++)
482
if (isalpha((int) aseq[idx][pos]))
483
rescount[toupper((int) aseq[idx][pos]) - 'A'] ++;
486
for (x = 0; x < 26; x++)
487
if (rescount[x] > 0) nres++;
489
for (idx = 0; idx < nseq; idx++)
490
if (isalpha((int) aseq[idx][pos]))
491
wgt[idx] += 1. / (float) (nres * rescount[toupper((int) aseq[idx][pos]) - 'A']);
494
for (idx = 0; idx < nseq; idx++)
495
wgt[idx] /= (float) DealignedLength(aseq[idx]);
496
norm = (float) nseq / FSum(wgt, nseq);
497
FScale(wgt, nseq, norm);
504
/* Function: FilterAlignment()
505
* Date: SRE, Wed Jun 30 09:19:30 1999 [St. Louis]
507
* Purpose: Constructs a new alignment by removing near-identical
508
* sequences from a given alignment (where identity is
509
* calculated *based on the alignment*).
510
* Does not affect the given alignment.
511
* Keeps earlier sequence, discards later one.
513
* Usually called as an ad hoc sequence "weighting" mechanism.
516
* Unparsed Stockholm markup is not propagated into the
519
* Args: msa -- original alignment
520
* cutoff -- fraction identity cutoff. 0.8 removes sequences > 80% id.
521
* ret_new -- RETURN: new MSA, usually w/ fewer sequences
524
* ret_new must be free'd by caller: MSAFree().
527
FilterAlignment(MSA *msa, float cutoff, MSA **ret_new)
529
int nnew; /* number of seqs in new alignment */
536
/* find which seqs to keep (list) */
537
/* diff matrix; allow ragged ends */
538
list = (int*) MallocOrDie(sizeof(int) * msa->nseq);
539
useme = (int*) MallocOrDie(sizeof(int) * msa->nseq);
540
for (i = 0; i < msa->nseq; i++) useme[i] = FALSE;
543
for (i = 0; i < msa->nseq; i++)
546
for (j = 0; j < nnew; j++)
548
ident = PairwiseIdentity(msa->aseq[i], msa->aseq[list[j]]);
555
if (remove == FALSE) {
561
MSASmallerAlignment(msa, useme, ret_new);
568
/* Function: SampleAlignment()
569
* Date: SRE, Wed Jun 30 10:13:56 1999 [St. Louis]
571
* Purpose: Constructs a new, smaller alignment by sampling a given
572
* number of sequences at random. Does not change the
573
* alignment nor the order of the sequences.
575
* If you ask for a sample that is larger than nseqs,
576
* it silently returns the original alignment.
578
* Not really a weighting method, but this is as good
579
* a place as any to keep it, since it's similar in
580
* construction to FilterAlignment().
582
* Args: msa -- original alignment
583
* sample -- number of sequences in new alignment (0 < sample <= nseq)
584
* ret_new -- RETURN: new MSA
587
* ret_new must be free'd by caller: MSAFree().
590
SampleAlignment(MSA *msa, int sample, MSA **ret_new)
592
int *list; /* array for random selection w/o replace */
593
int *useme; /* array of flags 0..nseq-1: TRUE to use */
599
list = (int *) MallocOrDie(sizeof(int) * msa->nseq);
600
useme = (int *) MallocOrDie (sizeof(int) * msa->nseq);
601
for (i = 0; i < msa->nseq; i++)
609
if (sample >= msa->nseq) sample = msa->nseq;
611
/* random selection w/o replacement */
612
for (len = msa->nseq, i = 0; i < sample; i++)
615
useme[list[idx]] = TRUE;
616
list[idx] = list[--len];
619
MSASmallerAlignment(msa, useme, ret_new);
626
/* Function: SingleLinkCluster()
627
* Date: SRE, Fri Jul 16 15:02:57 1999 [St. Louis]
629
* Purpose: Perform simple single link clustering of seqs in a
630
* sequence alignment. A pairwise identity threshold
631
* defines whether two sequences are linked or not.
633
* Important: runs in O(N) memory, unlike standard
634
* graph decomposition algorithms that use O(N^2)
635
* adjacency matrices or adjacency lists. Requires
636
* O(N^2) time in worst case (which is when you have
637
* no links at all), O(NlogN) in "average"
638
* case, and O(N) in best case (when there is just
639
* one cluster in a completely connected graph.
641
* (Developed because hmmbuild could no longer deal
642
* with GP120, a 16,013 sequence alignment.)
645
* CASE-SENSITIVE. Assumes aseq have been put into
646
* either all lower or all upper case; or at least,
647
* within a column, there's no mixed case.
650
* I don't know if this algorithm is published. I
651
* haven't seen it in graph theory books, but that might
652
* be because it's so obvious that nobody's bothered.
654
* In brief, we're going to do a breadth-first search
655
* of the graph, and we're going to calculate links
656
* on the fly rather than precalculating them into
657
* some sort of standard adjacency structure.
659
* While working, we keep two stacks of maximum length N:
660
* a : list of vertices that are still unconnected.
661
* b : list of vertices that we've connected to
662
* in our current breadth level, but we haven't
663
* yet tested for other connections to a.
664
* The current length (number of elements in) a and b are
667
* We store our results in an array of length N:
668
* c : assigns each vertex to a component. for example
669
* c[4] = 1 means that vertex 4 is in component 1.
670
* nc is the number of components. Components
671
* are numbered from 0 to nc-1. We return c and nc
677
* a <-- all the vertices
684
* while (a is not empty)
685
* pop a vertex off a, push onto b
686
* while (b is not empty)
689
* for each vertex w in a:
690
* compare v,w. If w is linked to v, remove w
691
* from a, push onto b.
695
* Args: aseq - aligned sequences
696
* nseq - number of sequences in aseq
697
* alen - alignment length
698
* maxid - fractional identity threshold 0..1. if id >= maxid, seqs linked
699
* ret_c - RETURN: 0..nseq-1 assignments of seqs to components (clusters)
700
* ret_nc - RETURN: number of components
703
* ret_c is allocated here. Caller free's with free(*ret_c)
706
SingleLinkCluster(char **aseq, int nseq, int alen, float maxid,
707
int **ret_c, int *ret_nc)
709
int *a, na; /* stack of available vertices */
710
int *b, nb; /* stack of working vertices */
711
int *c; /* array of results */
712
int nc; /* total number of components */
713
int v,w; /* index of a working vertices */
714
int i; /* loop counter */
716
/* allocations and initializations
718
a = (int*)MallocOrDie (sizeof(int) * nseq);
719
b = (int*)MallocOrDie (sizeof(int) * nseq);
720
c = (int*)MallocOrDie (sizeof(int) * nseq);
721
for (i = 0; i < nseq; i++) a[i] = i;
730
v = a[na-1]; na--; /* pop a vertex off a, */
731
b[nb] = v; nb++; /* and push onto b */
734
v = b[nb-1]; nb--; /* pop vertex off b */
735
c[v] = nc; /* assign it to component nc */
736
for (i = na-1; i >= 0; i--)/* backwards, because of deletion/swapping we do*/
737
if (simple_distance(aseq[v], aseq[a[i]]) <= 1. - maxid) /* linked? */
739
w = a[i]; a[i] = a[na-1]; na--; /* delete w from a (note swap) */
740
b[nb] = w; nb++; /* push w onto b */
746
/* Cleanup and return