2
* ===========================================================================
5
* National Center for Biotechnology Information
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's official duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government have not placed any restriction on its use or reproduction.
14
* Although all reasonable efforts have been taken to ensure the accuracy
15
* and reliability of the software and data, the NLM and the U.S.
16
* Government do not and cannot warrant the performance or results that
17
* may be obtained by using this software or data. The NLM and the U.S.
18
* Government disclaim all warranties, express or implied, including
19
* warranties of performance, merchantability or fitness for any particular
22
* Please cite the author in any work or product based on this material.
24
* ===========================================================================
27
/*****************************************************************************
31
Author: Alejandro Schaffer
33
Contents: main routines for makematrices program to convert
34
PSI-BLAST checkpoints into score matrices.
36
*****************************************************************************/
53
/*counts the number of items in sequencesFile and matricesFile, assumed to
54
be one per line, and checks that the numbers are equal.
55
returns the number if equal, 0 if unequal, rewinds the file descriptors
57
static Int4 countProfiles(FILE *sequencesFile, FILE *profilesFile)
59
Int4 sequencesCount = 0; /*count for sequencesFile*/
60
Int4 matricesCount = 0; /*count for profilesFile*/
61
Char oneFileName[MAXLINELEN]; /*for reading one line per file*/
63
while (fgets(oneFileName,MAXLINELEN,sequencesFile))
65
while (fgets(oneFileName,MAXLINELEN,profilesFile))
68
rewind(sequencesFile);
69
if (sequencesCount == matricesCount) {
70
return(sequencesCount);
72
ErrPostEx(SEV_FATAL, 0, 0, "profiles: Sequences file has %ld entries; Matrices file has %d entries; these should be equal\n", (long) sequencesCount,matricesCount);
78
/*converts name of profile file to matrix file by changing
79
suffix to mtx or appending suffix mtx*/
80
static Char *makeMatrixName(Char *profileName)
82
int length; /*length of a name*/
83
Char *returnName; /*string to treturn*/
84
int c, lastc; /*loop indices*/
86
length = strlen(profileName);
87
returnName = (Char *) MemNew((length + 5) * sizeof(Char));
89
for(c = 0; c < length; c++) {
90
returnName[c] = profileName[c];
91
if(('.' == profileName[c]) && ('c' == profileName[c+1])
92
&& ('h' == profileName[c+2]))
95
returnName[lastc] = '.';
96
returnName[lastc+1] = 'm';
97
returnName[lastc+2] = 't';
98
returnName[lastc+3] = 'x';
99
returnName[lastc+4] = '\0';
103
/*print out some parameters associated with a Karlin-Alschul
105
checkFile is the file descriptor to write to
106
kbp is the pointer to a structure with the parameters
107
scaling determines whether scores are being scaled or not
108
scalingDown is 1/scalingFactor because if scores are
109
scaled up, then Lambda is to be scaled down*/
110
static void putMatrixKbp(FILE * checkFile, BLAST_KarlinBlkPtr kbp, Boolean scaling, Nlm_FloatHi scalingDown)
113
fprintf(checkFile,"%le\n",kbp->Lambda * scalingDown);
115
fprintf(checkFile,"%le\n",kbp->Lambda);
117
fprintf(checkFile,"%le\n",kbp->K);
118
fprintf(checkFile,"%le\n",kbp->logK);
119
fprintf(checkFile,"%le\n",kbp->H);
122
/*print out a score matrix into the file descriptor checkfile
123
compactSerarch and psoSearch stroe information about the
124
matrix and the associated sequence
125
scaleScores determines whether scores are scaled or not*/
126
static void putMatrixMatrix(FILE *checkFile, compactSearchItems * compactSearch, posSearchItems *posSearch, Boolean scaleScores)
128
Int4 i, j; /*loop indices*/
131
for(i = 0; i < compactSearch->qlength; i++) {
132
for(j = 0; j < compactSearch->alphabetSize; j++)
133
fprintf(checkFile,"%ld ", (long) posSearch->posPrivateMatrix[i][j]);
134
fprintf(checkFile,"\n");
138
for(i = 0; i < compactSearch->qlength; i++) {
139
for(j = 0; j < compactSearch->alphabetSize; j++)
140
fprintf(checkFile,"%ld ", (long) posSearch->posMatrix[i][j]);
141
fprintf(checkFile,"\n");
146
/*Write out the matrix
147
compactSearch and PosSearch include fields that store the matrix
149
sbp includes information about the underlying
151
fileName is where the matrix is to be written
152
error_return holds error messages
153
scaleScores indicates whether scores in the matrix are to be scaled
154
scalingFactor is the multiplicative factor to use if scaleScores
156
static Boolean takeMatrixCheckpoint(compactSearchItems * compactSearch,
157
posSearchItems *posSearch, BLAST_ScoreBlkPtr sbp,
158
Char *fileName,ValNodePtr *error_return, Boolean scaleScores, Nlm_FloatHi
162
FILE * checkFile; /*file in which to take the checkpoint*/
163
Int4 length; /*length of query sequence, and an index for it*/
164
Int4 i; /*indices to position and alphabet */
165
Char localChar; /*temporary character*/
167
checkFile = FileOpen(fileName, "w");
169
if (NULL == checkFile) {
170
ErrPostEx(SEV_ERROR, 0,0, "Could not open checkpoint file");
174
length = compactSearch->qlength;
175
fprintf(checkFile,"%ld\n",(long) length);
177
for(i = 0; i < length; i++) {
178
localChar = getRes(compactSearch->query[i]);
180
fprintf(checkFile,"%c",localChar);
181
posSearch->posMatrix[i][Xchar] = Xscore;
182
posSearch->posPrivateMatrix[i][Xchar] = Xscore * scalingFactor;
185
fprintf(checkFile,"\n");
186
putMatrixKbp(checkFile, compactSearch->kbp_gap_std[0], scaleScores, 1/scalingFactor);
187
putMatrixKbp(checkFile, compactSearch->kbp_gap_psi[0], scaleScores, 1/scalingFactor);
188
putMatrixKbp(checkFile, sbp->kbp_ideal, scaleScores, 1/scalingFactor);
189
putMatrixMatrix(checkFile, compactSearch, posSearch, scaleScores);
191
FileClose(checkFile);
195
/*convert to matrices is the high-level procedure to convert a set of
196
PSI-BLAST checkpoints into a corresponding set of score matrices
197
profilesFile is a descriptor to a file listing the file names of files
198
containing checkpoints, one file name per line
199
sequencesFile is a descriptor to a file listing the file names of sequences
200
containing the master sequences for checkpoints, one file name per line
201
matricesFile is an output file to print out the names of the newly
202
produced files containing score matrices, one file name per line
203
auxiliaryFile is an outputput file to contain some general information
204
about the library of matrices and some parameters for each matrix
205
count is the number of checkpoints/sequences
206
gap_open is the cost of opening a gap
207
gap_extend is the cost of extending a gap
208
effectiveSize is the the size of the original sequence database used to
209
make the PSI-BLAST matrices
210
underlyignMatrixName is the original score matrix used to make the
212
scaleScores indicates whether scores should be scaled
213
scalingFactor indicates by how much scores are scaled, if at all*/
215
static Int4 convertToMatrices(FILE *profilesFile, FILE *sequencesFile, FILE *matricesFile, FILE *auxiliaryFile, Int4 count, Int4 gap_open, Int4 gap_extend,
216
Int4 effectiveSize, Char *underlyingMatrixName, Boolean scaleScores,
217
Nlm_FloatHi scalingFactor, Char *directoryPrefix)
219
int i; /*loop index over profiles*/
220
FILE *thisProfileFile, *thisSequenceFile; /*file
221
descriptors for a single profile*/
222
Char profileFileName[MAX_NAME_LENGTH], sequenceFileName[MAX_NAME_LENGTH];
223
/*file names for profiles*/
224
Char * matrixFileName, *relativeMatrixFileName; /*file name for corresponding matrix file*/
225
Char relativeProfileFileName[MAX_NAME_LENGTH], relativeSequenceFileName[MAX_NAME_LENGTH];
226
Int4 prefixLength; /*length of directoryPrefix*/
227
Int4 c1,c2; /*indices over characters in names*/
228
posSearchItems *posSearch; /*used to store matrix*/
229
Uint1Ptr query =NULL; /*query sequence read in*/
230
Int4 queryLength; /*length of query sequence*/
231
Int4 c; /*index over query*/
232
compactSearchItems *compactSearch; /*stores query related items*/
233
ValNodePtr error_return; /*stores error messages*/
234
Boolean success; /*did one checkpoint recovery succeed*/
235
BLAST_ResFreqPtr stdrfp; /* gets standard frequencies in prob field */
236
Int4 a; /*index over characters*/
237
SeqCodeTablePtr sctp;
238
BLAST_ScoreBlkPtr sbp;
239
BioseqPtr query_bsp; /*structure to hold query information*/
240
SeqEntryPtr sep; /*structure to hold query retrieval result*/
241
Int4 array_size; /*holds returns from computing the Karlin-Altschul
243
Int4Ptr open, extend; /*gap open and extension costs*/
244
Nlm_FloatHiPtr lambda, K, H; /*Karlin_Altschul score paramters*/
245
Int4 index; /*loop index for array_size*/
246
Int4 *lengthArray; /*array of sequence lengths*/
247
Nlm_FloatHi *KArray; /*array of K values, one per sequence*/
248
Int4 maxLength; /*maximum length of a sequence*/
249
Int4 KarlinReturn; /*return value from calls to set up matrix parameters*/
252
posSearch = (posSearchItems *) MemNew (1 * sizeof(posSearchItems));
253
compactSearch = (compactSearchItems *) MemNew (1 * sizeof(compactSearchItems));
254
sctp = SeqCodeTableFindObj(Seq_code_ncbistdaa);
255
compactSearch->alphabetSize = sctp->num;
257
fprintf(auxiliaryFile,"%s\n",underlyingMatrixName);
258
fprintf(auxiliaryFile,"%ld\n",(long) gap_open);
259
fprintf(auxiliaryFile,"%ld\n",(long) gap_extend);
261
lengthArray = (Int4 *) MemNew(count * sizeof(Int4));
262
KArray = (Nlm_FloatHi *) MemNew(count * sizeof(Nlm_FloatHi));
265
if ('\0' != directoryPrefix[0]) {
266
strcpy(profileFileName, directoryPrefix);
267
strcpy(sequenceFileName, directoryPrefix);
268
prefixLength = strlen(directoryPrefix);
271
for(i = 0; i < count; i++) {
272
if ('\0' == directoryPrefix[0])
273
fscanf(profilesFile,"%s", profileFileName);
275
fscanf(profilesFile,"%s", relativeProfileFileName);
276
for(c1 = prefixLength, c2 = 0; relativeProfileFileName[c2] != '\0';
278
profileFileName[c1] = relativeProfileFileName[c2];
279
profileFileName[c1] = '\0';
281
if ((thisProfileFile = FileOpen(profileFileName, "rb")) == NULL) {
282
ErrPostEx(SEV_FATAL, 0, 0, "Unable to open file %s\n", profileFileName);
285
if ('\0' == directoryPrefix[0])
286
fscanf(sequencesFile,"%s", sequenceFileName);
288
fscanf(sequencesFile,"%s", relativeSequenceFileName);
289
for(c1 = prefixLength, c2 = 0; relativeSequenceFileName[c2] != '\0';
291
sequenceFileName[c1] = relativeSequenceFileName[c2];
292
sequenceFileName[c1] = '\0';
294
if ((thisSequenceFile = FileOpen(sequenceFileName, "r")) == NULL) {
295
ErrPostEx(SEV_FATAL, 0, 0, "Unable to open file %s\n", sequenceFileName);
299
sep = FastaToSeqEntryEx(thisSequenceFile, FALSE, NULL, FALSE);
302
SeqEntryExplore(sep, &query_bsp, FindProt);
303
if (query_bsp == NULL) {
304
ErrPostEx(SEV_FATAL, 0, 0, "Unable to obtain bioseq\n");
307
query = BlastGetSequenceFromBioseq(query_bsp, &queryLength);
309
compactSearch->query = query;
310
for (c= 0; c < queryLength; c++)
311
query[c] = ResToInt(query[c]);
312
compactSearch->qlength = queryLength;
315
sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa, 1);
316
sbp->read_in_matrix = TRUE;
317
sbp->protein_alphabet = TRUE;
318
sbp->posMatrix = NULL;
319
sbp->number_of_contexts = 1;
320
BlastScoreBlkMatFill(sbp, underlyingMatrixName);
321
compactSearch->matrix = sbp->matrix;
322
compactSearch->gapped_calculation = TRUE;
323
compactSearch->pseudoCountConst = 10;
324
compactSearch->ethresh = 0.001;
325
BlastScoreBlkFill(sbp, (CharPtr) query, queryLength, 0);
328
fprintf(auxiliaryFile, "%le\n", sbp->kbp_std[0]->K);
329
fprintf(auxiliaryFile, "%le\n", sbp->kbp_std[0]->H);
332
sbp->kbp_gap_std[0] = BlastKarlinBlkCreate();
333
KarlinReturn = BlastKarlinBlkGappedCalc(sbp->kbp_gap_std[0], gap_open, gap_extend,
334
sbp->name, &error_return);
335
if (1 == KarlinReturn) {
336
BlastErrorPrint(error_return);
339
sbp->kbp_gap_psi[0] = BlastKarlinBlkCreate();
340
KarlinReturn = BlastKarlinBlkGappedCalc(sbp->kbp_gap_psi[0], gap_open, gap_extend,
341
sbp->name, &error_return);
342
if (1 == KarlinReturn) {
343
BlastErrorPrint(error_return);
347
array_size = BlastKarlinGetMatrixValues(sbp->name, &open, &extend, &lambda, &K, &H, NULL);
348
if (array_size > 0) {
349
for (index=0; index<array_size; index++) {
350
if (open[index] == INT2_MAX && extend[index] == INT2_MAX) {
351
sbp->kbp_ideal = BlastKarlinBlkCreate();
352
sbp->kbp_ideal->Lambda = lambda[index];
353
sbp->kbp_ideal->K = K[index];
354
sbp->kbp_ideal->H = H[index];
363
if (sbp->kbp_ideal == NULL)
364
sbp->kbp_ideal = BlastKarlinBlkStandardCalcEx(sbp);
365
compactSearch->lambda = sbp->kbp_gap_std[0]->Lambda;
366
compactSearch->kbp_std = sbp->kbp_std;
367
compactSearch->kbp_psi = sbp->kbp_psi;
368
compactSearch->kbp_gap_psi = sbp->kbp_gap_psi;
369
compactSearch->kbp_gap_std = sbp->kbp_gap_std;
370
compactSearch->lambda_ideal = sbp->kbp_ideal->Lambda;
371
compactSearch->K_ideal = sbp->kbp_ideal->K;
373
stdrfp = BlastResFreqNew(sbp);
374
BlastResFreqStdComp(sbp,stdrfp);
375
compactSearch->standardProb = MemNew(compactSearch->alphabetSize * sizeof(Nlm_FloatHi));
377
if (NULL == compactSearch->standardProb)
379
for(a = 0; a < compactSearch->alphabetSize; a++)
380
compactSearch->standardProb[a] = stdrfp->prob[a];
381
stdrfp = BlastResFreqDestruct(stdrfp);
383
posSearch->posInformation = NULL;
384
success = impalaReadCheckpoint(posSearch, compactSearch, profileFileName, &error_return, scalingFactor);
386
ErrPostEx(SEV_FATAL,0,0, "Unable to recover checkpoint from %s\n",profileFileName);
389
/*conversion to matrix and scaling is done in impalaReadCheckpopint*/
390
if ('\0' == directoryPrefix[0]) {
391
matrixFileName = makeMatrixName(profileFileName);
392
fprintf(matricesFile,"%s\n",matrixFileName);
394
matrixFileName = makeMatrixName(profileFileName);
395
relativeMatrixFileName = makeMatrixName(relativeProfileFileName);
396
fprintf(matricesFile,"%s\n",relativeMatrixFileName);
399
success = takeMatrixCheckpoint(compactSearch, posSearch, sbp, matrixFileName, &error_return, scaleScores, scalingFactor);
402
ErrPostEx(SEV_FATAL,0,0, "Unable to take matrix checkpoint from %s\n",profileFileName);
406
lengthArray[i] = queryLength;
407
KArray[i] = sbp->kbp_gap_psi[0]->K;
409
if (lengthArray[i] > maxLength)
410
maxLength = lengthArray[i];
412
posCheckpointFreeMemory(posSearch, queryLength);
413
FileClose(thisProfileFile);
414
thisProfileFile = NULL;
415
FileClose(thisSequenceFile);
416
thisSequenceFile = NULL;
419
sbp = BLAST_ScoreBlkDestruct(sbp);
420
compactSearch->standardProb = MemFree(compactSearch->standardProb);
423
MemFree(matrixFileName);
424
if ('\0' != directoryPrefix[0])
425
MemFree(relativeMatrixFileName);
429
fprintf(auxiliaryFile, "%ld\n", (long) maxLength);
430
fprintf(auxiliaryFile, "%ld\n", (long) effectiveSize);
431
fprintf(auxiliaryFile, "%lf\n", scalingFactor);
433
for(i = 0; i < count; i++) {
434
fprintf(auxiliaryFile, "%ld\n", (long) lengthArray[i]);
435
fprintf(auxiliaryFile, "%le\n", KArray[i]);
438
MemFree(lengthArray);
439
FileClose(profilesFile);
440
FileClose(sequencesFile);
441
FileClose(matricesFile);
442
FileClose(auxiliaryFile);
443
compactSearchDestruct(compactSearch);
445
BLAST_ScoreBlkDestruct(sbp);
451
static Args myargs [NUMARG] = {
452
{ "Database name for profile database",
453
"stdin", NULL, NULL, FALSE, 'P', ARG_FILE_IN, 0.0, 0, NULL},
454
{ "Cost to open a gap",
455
"11", NULL, NULL, FALSE, 'G', ARG_INT, 0.0, 0, NULL},
456
{ "Cost to extend a gap",
457
"1", NULL, NULL, FALSE, 'E', ARG_INT, 0.0, 0, NULL},
458
{ "Underlying Matrix",
459
"BLOSUM62", NULL, NULL, FALSE, 'U', ARG_STRING, 0.0, 0, NULL},
460
{ "Underlying sequence database used to make profiles",
461
"nr", NULL, NULL, FALSE, 'd', ARG_STRING, 0.0, 0, NULL},
462
{ "Effective length of the profile database (0 for length of -d option)",
463
"0", NULL, NULL, FALSE, 'z', ARG_INT, 0.0, 0, NULL},
464
{ "Scaling factor for matrix outputs to avoid round-off problems",
465
"100.0", NULL, NULL, FALSE, 'S', ARG_FLOAT, 0.0, 0, NULL},
466
{ "Print help; overrides all other arguments",
467
"F", NULL, NULL, FALSE, 'H', ARG_BOOLEAN, 0.0, 0, NULL}
473
Char *profilesDatabase;
474
Char profilesFileName[MAX_NAME_LENGTH];
475
Char sequencesFileName[MAX_NAME_LENGTH];
476
Char matricesFileName[MAX_NAME_LENGTH];
477
Char auxiliaryFileName[MAX_NAME_LENGTH];
478
Char mmapFileName[MAX_NAME_LENGTH];
479
FILE *profilesFile, *sequencesFile, *matricesFile, *auxiliaryFile;
480
Int4 count; /*how many profiles*/
481
Int4 effSize; /*effective database size to use*/
483
ReadDBFILEPtr rdpt=NULL; /*holds result of attempt to read database*/
484
Boolean scaling; /*are score matrix values going to be scaled*/
485
Char *directoryPrefix; /*directory where profile library is kept, used
486
to reach other directories indirectly*/
488
if (! GetArgs ("makematrices", NUMARG, myargs)) {
492
if (! SeqEntryLoad())
495
UseLocalAsnloadDataAndErrMsg();
496
ErrSetLogLevel(SEV_WARNING);
498
if ((Boolean) myargs[7].intvalue) {
499
IMPALAPrintHelp(FALSE, 80, "makemat", stdout);
502
profilesDatabase = myargs[0].strvalue;
503
directoryPrefix = (Char *) MemNew(MAX_NAME_LENGTH *sizeof(char));
504
strcpy(directoryPrefix,profilesDatabase);
505
impalaMakeFileNames(profilesDatabase,auxiliaryFileName,
506
mmapFileName,sequencesFileName,matricesFileName,
507
profilesFileName, directoryPrefix);
509
if ((profilesFile = FileOpen(profilesFileName, "r")) == NULL) {
510
ErrPostEx(SEV_FATAL, 0, 0, "Unable to open profiles file %s\n", profilesFileName);
514
if ((sequencesFile = FileOpen(sequencesFileName, "r")) == NULL) {
515
ErrPostEx(SEV_FATAL, 0, 0, "Unable to open sequences file %s\n", sequencesFileName);
519
if ((matricesFile = FileOpen(matricesFileName, "w")) == NULL) {
520
ErrPostEx(SEV_FATAL, 0, 0, "Unable to open matrices file %s\n", matricesFileName);
524
if ((auxiliaryFile = FileOpen(auxiliaryFileName, "w")) == NULL) {
525
ErrPostEx(SEV_FATAL, 0, 0, "Unable to open auxiliary file %s\n", auxiliaryFileName);
529
effSize = myargs[5].intvalue;
531
rdpt = readdb_new(myargs[4].strvalue, TRUE);
532
effSize = readdb_get_dblen(rdpt);
533
rdpt = readdb_destruct(rdpt);
536
count = countProfiles(sequencesFile, profilesFile);
537
scaling = ((myargs[6].floatvalue < 0.99) ||
538
(myargs[6].floatvalue > 1.01));
540
retcode = convertToMatrices(profilesFile, sequencesFile, matricesFile,
541
auxiliaryFile, count, myargs[1].intvalue,
542
myargs[2].intvalue, effSize, myargs[3].strvalue,
543
scaling, myargs[6].floatvalue, directoryPrefix);
545
MemFree(directoryPrefix);