1
/*****************************************************************
2
* SQUID - a library of functions for biological sequence analysis
3
* Copyright (C) 1992-2002 Washington University School of Medicine
5
* This source code is freely distributed under the terms of the
6
* GNU General Public License. See the files COPYRIGHT and LICENSE
8
*****************************************************************/
11
* SRE, Mon Jul 12 11:57:37 1993
12
* RCS $Id: alignio.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: alignio.c,v 1.11 2002/10/09 14:26:09 eddy Exp)
14
* Input/output of sequence alignments.
22
#include "sre_random.h"
24
/* Function: AllocAlignment()
26
* Purpose: Allocate space for an alignment, given the number
27
* of sequences and the alignment length in columns.
29
* Args: nseq - number of sequences
30
* alen - width of alignment
31
* ret_aseq - RETURN: alignment itself
32
* ainfo - RETURN: other info associated with alignment
35
* aseq, ainfo free'd by caller: FreeAlignment(aseq, &ainfo).
36
* note that ainfo itself is alloc'ed in caller, usually
37
* just by a "AINFO ainfo" definition.
40
AllocAlignment(int nseq, int alen, char ***ret_aseq, AINFO *ainfo)
47
aseq = (char **) MallocOrDie (sizeof(char *) * nseq);
48
for (idx = 0; idx < nseq; idx++)
49
aseq[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1));
54
ainfo->wgt = (float *) MallocOrDie (sizeof(float) * nseq);
55
FSet(ainfo->wgt, nseq, 1.0);
57
ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq);
58
for (idx = 0; idx < nseq; idx++)
59
ainfo->sqinfo[idx].flags = 0;
65
/* Function: InitAinfo()
66
* Date: SRE, Tue Jan 19 10:16:02 1999 [St. Louis]
68
* Purpose: Initialize the fields in ainfo structure to
69
* default (null) values. Does nothing with
70
* fields that are dependent on nseq or alen.
72
* Args: ainfo - optional info structure for an alignment
74
* Returns: (void). ainfo is modified.
77
InitAinfo(AINFO *ainfo)
87
ainfo->tc1 = ainfo->tc2 = 0.0;
88
ainfo->nc1 = ainfo->nc2 = 0.0;
89
ainfo->ga1 = ainfo->ga2 = 0.0;
93
/* Function: FreeAlignment()
95
* Purpose: Free the space allocated to alignment, names, and optional
98
* Args: aseqs - sequence alignment
99
* ainfo - associated alignment data.
102
FreeAlignment(char **aseqs, AINFO *ainfo)
106
for (i = 0; i < ainfo->nseq; i++)
108
if (ainfo->sqinfo[i].flags & SQINFO_SS) free(ainfo->sqinfo[i].ss);
109
if (ainfo->sqinfo[i].flags & SQINFO_SA) free(ainfo->sqinfo[i].sa);
111
if (ainfo->cs != NULL) free(ainfo->cs);
112
if (ainfo->rf != NULL) free(ainfo->rf);
113
if (ainfo->name != NULL) free(ainfo->name);
114
if (ainfo->desc != NULL) free(ainfo->desc);
115
if (ainfo->acc != NULL) free(ainfo->acc);
116
if (ainfo->au != NULL) free(ainfo->au);
120
Free2DArray((void **) aseqs, ainfo->nseq);
125
/* Function: SAMizeAlignment()
126
* Date: SRE, Tue Jun 30 09:49:40 1998 [St. Louis]
128
* Purpose: Make a "best effort" attempt to convert an alignment
129
* to SAM gap format: - in delete col, . in insert col.
130
* Only works if alignment adheres to SAM's upper/lower
131
* case convention, which is true for instance of old
134
* Args: aseq - alignment to convert
135
* nseq - number of seqs in alignment
136
* alen - length of alignment
141
SAMizeAlignment(char **aseq, int nseq, int alen)
143
int col; /* counter for aligned columns */
144
int i; /* counter for seqs */
145
int sawlower, sawupper, sawgap;
148
for (col = 0; col < alen; col++)
150
sawlower = sawupper = sawgap = 0;
151
/* pass 1: do we see only upper or lower? */
152
for (i = 0; i < nseq; i++)
154
if (isgap(aseq[i][col])) { sawgap = 1; continue; }
155
if (isupper((int) aseq[i][col])) { sawupper = 1; continue; }
156
if (islower((int) aseq[i][col])) sawlower = 1;
158
/* select gap character for column */
159
gapchar = '-'; /* default */
160
if (sawlower && ! sawupper) gapchar = '.';
162
/* pass 2: set gap char */
163
for (i = 0; i < nseq; i++)
164
if (isgap(aseq[i][col])) aseq[i][col] = gapchar;
169
/* Function: SAMizeAlignmentByGapFrac()
170
* Date: SRE, Tue Jun 30 10:58:38 1998 [St. Louis]
172
* Purpose: Convert an alignment to SAM's gap and case
173
* conventions, using gap fraction in a column
174
* to choose match versus insert columns. In match columns,
175
* residues are upper case and gaps are '-'.
176
* In insert columns, residues are lower case and
179
* Args: aseq - aligned sequences
180
* nseq - number of sequences
181
* alen - length of alignment
182
* maxgap - if more gaps than this fraction, column is insert.
184
* Returns: (void) Characters in aseq may be altered.
187
SAMizeAlignmentByGapFrac(char **aseq, int nseq, int alen, float maxgap)
189
int apos; /* counter over columns */
190
int idx; /* counter over sequences */
191
int ngap; /* number of gaps seen */
193
for (apos = 0; apos < alen; apos++)
197
for (idx = 0; idx < nseq; idx++)
198
if (isgap(aseq[idx][apos])) ngap++;
200
/* convert to SAM conventions */
201
if ((float) ngap / (float) nseq > maxgap)
202
{ /* insert column */
203
for (idx = 0; idx < nseq; idx++)
204
if (isgap(aseq[idx][apos])) aseq[idx][apos] = '.';
205
else aseq[idx][apos] = (char) tolower((int) aseq[idx][apos]);
209
for (idx = 0; idx < nseq; idx++)
210
if (isgap(aseq[idx][apos])) aseq[idx][apos] = '-';
211
else aseq[idx][apos] = (char) toupper((int) aseq[idx][apos]);
219
/* Function: MakeAlignedString()
221
* Purpose: Given a raw string of some type (secondary structure, say),
222
* align it to a given aseq by putting gaps wherever the
225
* Args: aseq: template for alignment
226
* alen: length of aseq
227
* ss: raw string to align to aseq
228
* ret_s: RETURN: aligned ss
230
* Return: 1 on success, 0 on failure (and squid_errno is set.)
231
* ret_ss is malloc'ed here and must be free'd by caller.
234
MakeAlignedString(char *aseq, int alen, char *ss, char **ret_s)
239
new = (char *) MallocOrDie ((alen+1) * sizeof(char));
240
for (apos = rpos = 0; apos < alen; apos++)
241
if (! isgap(aseq[apos]))
243
new[apos] = ss[rpos];
250
if (rpos != strlen(ss))
251
{ squid_errno = SQERR_PARAMETER; free(new); return 0; }
257
/* Function: MakeDealignedString()
259
* Purpose: Given an aligned string of some type (either sequence or
260
* secondary structure, for instance), dealign it relative
261
* to a given aseq. Return a ptr to the new string.
263
* Args: aseq : template alignment
264
* alen : length of aseq
265
* ss: : string to make dealigned copy of; same length as aseq
266
* ret_s : RETURN: dealigned copy of ss
268
* Return: 1 on success, 0 on failure (and squid_errno is set)
269
* ret_s is alloc'ed here and must be freed by caller
272
MakeDealignedString(char *aseq, int alen, char *ss, char **ret_s)
277
new = (char *) MallocOrDie ((alen+1) * sizeof(char));
278
for (apos = rpos = 0; apos < alen; apos++)
279
if (! isgap(aseq[apos]))
281
new[rpos] = ss[apos];
285
if (alen != strlen(ss))
286
{ squid_errno = SQERR_PARAMETER; free(new); return 0; }
292
/* Function: DealignedLength()
294
* Purpose: Count the number of non-gap symbols in seq.
295
* (i.e. find the length of the unaligned sequence)
297
* Args: aseq - aligned sequence to count symbols in, \0 terminated
299
* Return: raw length of seq.
302
DealignedLength(char *aseq)
305
for (rlen = 0; *aseq; aseq++)
306
if (! isgap(*aseq)) rlen++;
311
/* Function: WritePairwiseAlignment()
313
* Purpose: Write a nice formatted pairwise alignment out,
314
* with a BLAST-style middle line showing identities
315
* as themselves (single letter) and conservative
318
* Args: ofp - open fp to write to (stdout, perhaps)
319
* aseq1, aseq2 - alignments to write (not necessarily
320
* flushed right with gaps)
321
* name1, name2 - names of sequences
322
* spos1, spos2 - starting position in each (raw) sequence
323
* pam - PAM matrix; positive values define
324
* conservative changes
325
* indent - how many extra spaces to print on left
327
* Return: 1 on success, 0 on failure
330
WritePairwiseAlignment(FILE *ofp,
331
char *aseq1, char *name1, int spos1,
332
char *aseq2, char *name2, int spos2,
333
int **pam, int indent)
335
char sname1[11]; /* shortened name */
337
int still_going; /* True if writing another block */
338
char buf1[61]; /* buffer for writing seq1; CPL+1*/
339
char bufmid[61]; /* buffer for writing consensus */
341
char *s1, *s2; /* ptrs into each sequence */
342
int count1, count2; /* number of symbols we're writing */
343
int rpos1, rpos2; /* position in raw seqs */
344
int rawcount1, rawcount2; /* number of nongap symbols written */
347
strncpy(sname1, name1, 10);
349
strtok(sname1, WHITESPACE);
351
strncpy(sname2, name2, 10);
353
strtok(sname2, WHITESPACE);
365
/* get next line's worth from both */
366
strncpy(buf1, s1, 60); buf1[60] = '\0';
367
strncpy(buf2, s2, 60); buf2[60] = '\0';
368
count1 = strlen(buf1);
369
count2 = strlen(buf2);
371
/* is there still more to go? */
372
if ((count1 == 60 && s1[60] != '\0') ||
373
(count2 == 60 && s2[60] != '\0'))
376
/* shift seq ptrs by a line */
380
/* assemble the consensus line */
381
for (apos = 0; apos < count1 && apos < count2; apos++)
383
if (!isgap(buf1[apos]) && !isgap(buf2[apos]))
385
if (buf1[apos] == buf2[apos])
386
bufmid[apos] = buf1[apos];
387
else if (pam[buf1[apos] - 'A'][buf2[apos] - 'A'] > 0)
398
for (apos = 0; apos < count1; apos++)
399
if (!isgap(buf1[apos])) rawcount1++;
402
for (apos = 0; apos < count2; apos++)
403
if (!isgap(buf2[apos])) rawcount2++;
405
(void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "",
406
sname1, rpos1, buf1, rpos1 + rawcount1 -1);
407
(void) fprintf(ofp, "%*s %s\n", indent, "",
409
(void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "",
410
sname2, rpos2, buf2, rpos2 + rawcount2 -1);
411
(void) fprintf(ofp, "\n");
421
/* Function: MingapAlignment()
423
* Purpose: Remove all-gap columns from a multiple sequence alignment
424
* and its associated data. The alignment is assumed to be
425
* flushed (all aseqs the same length).
428
MingapAlignment(char **aseqs, AINFO *ainfo)
430
int apos; /* position in original alignment */
431
int mpos; /* position in new alignment */
434
/* We overwrite aseqs, using its allocated memory.
436
for (apos = 0, mpos = 0; aseqs[0][apos] != '\0'; apos++)
438
/* check for all-gap in column */
439
for (idx = 0; idx < ainfo->nseq; idx++)
440
if (! isgap(aseqs[idx][apos]))
442
if (idx == ainfo->nseq) continue;
444
/* shift alignment and ainfo */
447
for (idx = 0; idx < ainfo->nseq; idx++)
448
aseqs[idx][mpos] = aseqs[idx][apos];
450
if (ainfo->cs != NULL) ainfo->cs[mpos] = ainfo->cs[apos];
451
if (ainfo->rf != NULL) ainfo->rf[mpos] = ainfo->rf[apos];
455
/* null terminate everything */
456
for (idx = 0; idx < ainfo->nseq; idx++)
457
aseqs[idx][mpos] = '\0';
458
ainfo->alen = mpos; /* set new length */
459
if (ainfo->cs != NULL) ainfo->cs[mpos] = '\0';
460
if (ainfo->rf != NULL) ainfo->rf[mpos] = '\0';
466
/* Function: RandomAlignment()
468
* Purpose: Create a random alignment from raw sequences.
470
* Ideally, we would like to sample an alignment from the
471
* space of possible alignments according to its probability,
472
* given a prior probability distribution for alignments.
473
* I don't see how to describe such a distribution, let alone
476
* This is a rough approximation that tries to capture some
477
* desired properties. We assume the alignment is generated
478
* by a simple HMM composed of match and insert states.
479
* Given parameters (pop, pex) for the probability of opening
480
* and extending an insertion, we can find the expected number
481
* of match states, M, in the underlying model for each sequence.
482
* We use an average M taken over all the sequences (this is
483
* an approximation. The expectation of M given all the sequence
484
* lengths is a nasty-looking summation.)
486
* M = len / ( 1 + pop ( 1 + 1/ (1-pex) ) )
488
* Then, we assign positions in each raw sequence onto the M match
489
* states and M+1 insert states of this "HMM", by rolling random
490
* numbers and inserting the (rlen-M) inserted positions randomly
491
* into the insert slots, taking into account the relative probability
492
* of open vs. extend.
494
* The resulting alignment has two desired properties: insertions
495
* tend to follow the HMM-like exponential distribution, and
496
* the "sparseness" of the alignment is controllable through
499
* Args: rseqs - raw sequences to "align", 0..nseq-1
500
* sqinfo - array of 0..nseq-1 info structures for the sequences
501
* nseq - number of sequences
502
* pop - probability to open insertion (0<pop<1)
503
* pex - probability to extend insertion (0<pex<1)
504
* ret_aseqs - RETURN: alignment (flushed)
505
* ainfo - fill in: alignment info
507
* Return: 1 on success, 0 on failure. Sets squid_errno to indicate cause
511
RandomAlignment(char **rseqs, SQINFO *sqinfo, int nseq, float pop, float pex,
512
char ***ret_aseqs, AINFO *ainfo)
514
char **aseqs; /* RETURN: alignment */
515
int alen; /* length of alignment */
516
int *rlen; /* lengths of each raw sequence */
517
int M; /* length of "model" */
518
int **ins; /* insertion counts, 0..nseq-1 by 0..M */
519
int *master_ins; /* max insertion counts, 0..M */
525
/* calculate expected length of model, M
527
rlen = (int *) MallocOrDie (sizeof(int) * nseq);
530
for (idx = 0; idx < nseq; idx++)
532
rlen[idx] = strlen(rseqs[idx]);
534
minlen = (rlen[idx] < minlen) ? rlen[idx] : minlen;
536
M = (int) ((float) M / (1.0 + pop * (1.0 + 1.0 / (1.0 - pex))));
538
if (M > minlen) M = minlen;
540
/* make arrays that count insertions in M+1 possible insert states
542
ins = (int **) MallocOrDie (sizeof(int *) * nseq);
543
master_ins = (int *) MallocOrDie (sizeof(int) * (M+1));
544
for (idx = 0; idx < nseq; idx++)
546
ins[idx] = (int *) MallocOrDie (sizeof(int) * (M+1));
547
for (rpos = 0; rpos <= M; rpos++)
551
pop = pop / (pop+pex);
553
/* make insertions for individual sequences */
554
for (idx = 0; idx < nseq; idx++)
557
for (rpos = 0; rpos < rlen[idx]-M; rpos++)
559
if (sre_random() < pop || apos == -1) /* open insertion */
560
apos = CHOOSE(M+1); /* choose 0..M */
564
/* calculate master_ins, max inserts */
566
for (apos = 0; apos <= M; apos++)
568
master_ins[apos] = 0;
569
for (idx = 0; idx < nseq; idx++)
570
if (ins[idx][apos] > master_ins[apos])
571
master_ins[apos] = ins[idx][apos];
572
alen += master_ins[apos];
576
/* Now, construct alignment
578
aseqs = (char **) MallocOrDie (sizeof (char *) * nseq);
579
for (idx = 0; idx < nseq; idx++)
580
aseqs[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1));
581
for (idx = 0; idx < nseq; idx++)
585
for (statepos = 0; statepos <= M; statepos++)
587
for (count = 0; count < ins[idx][statepos]; count++)
588
aseqs[idx][apos++] = rseqs[idx][rpos++];
589
for (; count < master_ins[statepos]; count++)
590
aseqs[idx][apos++] = ' ';
593
aseqs[idx][apos++] = rseqs[idx][rpos++];
595
aseqs[idx][alen] = '\0';
600
ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq);
601
for (idx = 0; idx < nseq; idx++)
602
SeqinfoCopy(&(ainfo->sqinfo[idx]), &(sqinfo[idx]));
606
Free2DArray((void **) ins, nseq);
611
/* Function: AlignmentHomogenousGapsym()
612
* Date: SRE, Sun Mar 19 19:37:12 2000 [wren, St. Louis]
614
* Purpose: Sometimes we've got to convert alignments to
615
* a lowest common denominator, and we need
616
* a single specific gap character -- for example,
617
* PSI-BLAST blastpgp -B takes a very simplistic
618
* alignment input format which appears to only
619
* allow '-' as a gap symbol.
621
* Anything matching the isgap() macro is
624
* Args: aseq - aligned character strings, [0..nseq-1][0..alen-1]
625
* nseq - number of aligned strings
626
* alen - length of alignment
627
* gapsym - character to use for gaps.
629
* Returns: void ("never fails")
632
AlignmentHomogenousGapsym(char **aseq, int nseq, int alen, char gapsym)
636
for (i = 0; i < nseq; i++)
637
for (apos = 0; apos < alen; apos++)
638
if (isgap(aseq[i][apos])) aseq[i][apos] = gapsym;