1
/*****************************************************************
2
* SQUID - a library of functions for biological sequence analysis
3
* Copyright (C) 1992-2002 Washington University School of Medicine
5
* This source code is freely distributed under the terms of the
6
* GNU General Public License. See the files COPYRIGHT and LICENSE
8
*****************************************************************/
11
* SRE, Sun Jul 11 16:17:32 1993
13
* Import/export of GCG MSF multiple sequence alignment
14
* formatted files. Designed using format specifications
15
* kindly provided by Steve Smith of Genetics Computer Group.
17
* RCS $Id: msf.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: msf.c,v 1.4 2001/04/23 00:35:33 eddy Exp)
29
/*****************************************************************
31
* cc -DTESTDRIVE_MSF -g -O2 -Wall -o test msf.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c sqio.c alignio.c selex.c interleaved.c types.c -lm
35
main(int argc, char **argv)
43
if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL)
44
Die("Couldn't open %s\n", file);
46
while ((msa = ReadMSF(afp)) != NULL)
48
WriteMSF(stdout, msa);
55
/******************************************************************/
56
#endif /* testdrive_msf */
60
/* Function: ReadMSF()
61
* Date: SRE, Tue Jun 1 08:07:22 1999 [St. Louis]
63
* Purpose: Parse an alignment read from an open MSF format
64
* alignment file. (MSF is a single-alignment format.)
65
* Return the alignment, or NULL if we've already
68
* Args: afp - open alignment file
70
* Returns: MSA * - an alignment object
71
* caller responsible for an MSAFree()
72
* NULL if no more alignments
75
* Will Die() here with a (potentially) useful message
76
* if a parsing error occurs.
93
if (feof(afp->f)) return NULL;
94
if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
96
/* The first line is the header.
97
* This is a new-ish GCG feature. Don't count on it, so
98
* we can be a bit more tolerant towards non-GCG software
99
* generating "MSF" files.
101
msa = MSAAlloc(10, 0);
102
if (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0) {
104
if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
105
} else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) {
107
if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
110
/* Now we're in the free text comment section of the MSF file.
111
* It ends when we see the "MSF: Type: Check: .." line.
112
* This line must be present.
116
if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) &&
117
Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3))
119
alleged_alen = atoi(sqd_parse[0]);
120
switch (*(sqd_parse[1])) {
121
case 'N' : alleged_type = kRNA; break;
122
case 'P' : alleged_type = kAmino; break;
123
case 'X' : alleged_type = kOtherSeq; break;
124
default : alleged_type = kOtherSeq;
126
alleged_checksum = atoi(sqd_parse[3]);
127
if (msa->type == kOtherSeq) msa->type = alleged_type;
128
break; /* we're done with comment section. */
130
if (! IsBlankline(s))
131
MSAAddComment(msa, s);
132
} while ((s = MSAFileGetLine(afp)) != NULL);
134
/* Now we're in the name section.
135
* GCG has a relatively poorly documented feature: only sequences that
136
* appear in this list will be read from the alignment section. Commenting
137
* out sequences in the name list (by preceding them with "!") is
138
* allowed as a means of manually defining subsets of sequences in
139
* the alignment section. We can support this feature reasonably
140
* easily because of the hash table for names in the MSA: we
141
* only add names to the hash table when we see 'em in the name section.
143
while ((s = MSAFileGetLine(afp)) != NULL)
145
while ((*s == ' ' || *s == '\t') && *s) s++; /* skip leading whitespace */
147
if (*s == '\n') continue; /* skip blank lines */
148
else if (*s == '!') MSAAddComment(msa, s);
149
else if ((sp = strstr(s, "Name:")) != NULL)
151
/* We take the name and the weigh, and that's it */
153
tok = sre_strtok(&sp, " \t", &slen); /* <sequence name> */
154
sqidx = GKIStoreKey(msa->index, tok);
155
if (sqidx >= msa->nseqalloc) MSAExpand(msa);
156
msa->sqname[sqidx] = sre_strdup(tok, slen);
159
if ((sp = strstr(sp, "Weight:")) == NULL)
160
Die("No Weight: on line %d for %s in name section of MSF file %s\n",
161
afp->linenumber, msa->sqname[sqidx], afp->fname);
163
tok = sre_strtok(&sp, " \t", &slen);
164
msa->wgt[sqidx] = atof(tok);
165
msa->flags |= MSA_SET_WGT;
167
else if (strncmp(s, "//", 2) == 0)
171
Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n",
172
afp->linenumber, afp->fname, s);
173
squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */
179
/* And now we're in the sequence section.
180
* As discussed above, if we haven't seen a sequence name, then we
181
* don't include the sequence in the alignment.
182
* Also, watch out for coordinate-only lines.
184
while ((s = MSAFileGetLine(afp)) != NULL)
187
if ((name = sre_strtok(&sp, " \t", NULL)) == NULL) continue;
188
if ((seq = sre_strtok(&sp, "\n", &slen)) == NULL) continue;
190
/* The test for a coord line: digits starting both fields
192
if (isdigit(*name) && isdigit(*seq))
195
/* It's not blank, and it's not a coord line: must be sequence
197
sqidx = GKIKeyIndex(msa->index, name);
198
if (sqidx < 0) continue; /* not a sequence we recognize */
200
msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen);
203
/* We've left blanks in the aseqs; take them back out.
205
for (sqidx = 0; sqidx < msa->nseq; sqidx++)
207
if (msa->aseq[sqidx] == NULL)
208
Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname);
210
for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++)
212
if (*s == ' ' || *s == '\t') {
222
MSAVerifyParse(msa); /* verifies, and also sets alen and wgt. */
227
/* Function: WriteMSF()
228
* Date: SRE, Mon May 31 11:25:18 1999 [St. Louis]
230
* Purpose: Write an alignment in MSF format to an open file.
232
* Args: fp - file that's open for writing.
233
* msa - alignment to write.
235
* Note that msa->type, usually optional, must be
236
* set for WriteMSF to work. If it isn't, a fatal
237
* error is generated.
242
WriteMSF(FILE *fp, MSA *msa)
244
time_t now; /* current time as a time_t */
245
char date[64]; /* today's date in GCG's format "October 3, 1996 15:57" */
246
char **gcg_aseq; /* aligned sequences with gaps converted to GCG format */
247
char **gcg_sqname; /* sequence names with GCG-valid character sets */
248
int idx; /* counter for sequences */
249
char *s; /* pointer into sqname or seq */
250
int len; /* tmp variable for name lengths */
251
int namelen; /* maximum name length used */
252
int pos; /* position counter */
253
char buffer[51]; /* buffer for writing seq */
254
int i; /* another position counter */
256
/*****************************************************************
257
* Make copies of sequence names and sequences.
258
* GCG recommends that name characters should only contain
259
* alphanumeric characters, -, or _
260
* Some GCG and GCG-compatible software is sensitive to this.
261
* We silently convert all other characters to '_'.
263
* For sequences, GCG allows only ~ and . for gaps.
264
* Otherwise, everthing is interpreted as a residue;
265
* so squid's IUPAC-restricted chars are fine. ~ means
266
* an external gap. . means an internal gap.
267
*****************************************************************/
269
/* make copies that we can edit */
270
gcg_aseq = MallocOrDie(sizeof(char *) * msa->nseq);
271
gcg_sqname = MallocOrDie(sizeof(char *) * msa->nseq);
272
for (idx = 0; idx < msa->nseq; idx++)
274
gcg_aseq[idx] = sre_strdup(msa->aseq[idx], msa->alen);
275
gcg_sqname[idx] = sre_strdup(msa->sqname[idx], -1);
277
/* alter names as needed */
278
for (idx = 0; idx < msa->nseq; idx++)
279
for (s = gcg_sqname[idx]; *s != '\0'; s++)
280
if (! isalnum((int) *s) && *s != '-' && *s != '_')
282
/* alter gap chars in seq */
283
for (idx = 0; idx < msa->nseq; idx++)
285
for (s = gcg_aseq[idx]; *s != '\0' && isgap(*s); s++)
287
for (; *s != '\0'; s++)
288
if (isgap(*s)) *s = '.';
289
for (pos = msa->alen-1; pos > 0 && isgap(gcg_aseq[idx][pos]); pos--)
290
gcg_aseq[idx][pos] = '~';
292
/* calculate max namelen used */
294
for (idx = 0; idx < msa->nseq; idx++)
295
if ((len = strlen(msa->sqname[idx])) > namelen)
298
/*****************************************************
299
* Write the MSF header
300
*****************************************************/
301
/* required file type line */
302
if (msa->type == kOtherSeq)
303
msa->type = GuessAlignmentSeqtype(msa->aseq, msa->nseq);
305
if (msa->type == kRNA) fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
306
else if (msa->type == kDNA) fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
307
else if (msa->type == kAmino) fprintf(fp, "!!AA_MULTIPLE_ALIGNMENT 1.0\n");
308
else if (msa->type == kOtherSeq)
309
Die("WriteMSF(): couldn't guess whether that alignment is RNA or protein.\n");
311
Die("Invalid sequence type %d in WriteMSF()\n", msa->type);
313
/* free text comments */
314
if (msa->ncomment > 0)
316
for (idx = 0; idx < msa->ncomment; idx++)
317
fprintf(fp, "%s\n", msa->comment[idx]);
320
/* required checksum line */
322
if (strftime(date, 64, "%B %d, %Y %H:%M", localtime(&now)) == 0)
323
Die("What time is it on earth? strftime() failed in WriteMSF().\n");
324
fprintf(fp, " %s MSF: %d Type: %c %s Check: %d ..\n",
325
msa->name != NULL ? msa->name : "squid.msf",
327
msa->type == kRNA ? 'N' : 'P',
329
GCGMultchecksum(gcg_aseq, msa->nseq));
332
/*****************************************************
333
* Names/weights section
334
*****************************************************/
336
for (idx = 0; idx < msa->nseq; idx++)
338
fprintf(fp, " Name: %-*.*s Len: %5d Check: %4d Weight: %.2f\n",
342
GCGchecksum(gcg_aseq[idx], msa->alen),
348
/*****************************************************
349
* Write the sequences
350
*****************************************************/
352
for (pos = 0; pos < msa->alen; pos += 50)
354
fprintf(fp, "\n"); /* Blank line between sequence blocks */
356
/* Coordinate line */
357
len = (pos + 50) > msa->alen ? msa->alen - pos : 50;
359
fprintf(fp, "%*s %-6d%*s%6d\n", namelen, "",
361
len + ((len-1)/10) - 12, "",
364
fprintf(fp, "%*s %-6d\n", namelen, "", pos+1);
366
for (idx = 0; idx < msa->nseq; idx++)
368
fprintf(fp, "%-*s ", namelen, gcg_sqname[idx]);
369
/* get next line's worth of 50 from seq */
370
strncpy(buffer, gcg_aseq[idx] + pos, 50);
372
/* draw the sequence line */
373
for (i = 0; i < len; i++)
375
if (! (i % 10)) fputc(' ', fp);
376
fputc(buffer[i], fp);
382
Free2DArray((void **) gcg_aseq, msa->nseq);
383
Free2DArray((void **) gcg_sqname, msa->nseq);