1
/* $Id: seedtop.c,v 6.6 1999/09/22 17:54:21 shavirin Exp $ */
2
/**************************************************************************
6
* This software/database is categorized as "United States Government *
7
* Work" under the terms of the United States Copyright Act. It was *
8
* produced as part of the author's official duties as a Government *
9
* employee and thus can not be copyrighted. This software/database is *
10
* freely available to the public for use without a copyright notice. *
11
* Restrictions can not be placed on its present or future use. *
13
* Although all reasonable efforts have been taken to ensure the accuracy *
14
* and reliability of the software and data, the National Library of *
15
* Medicine (NLM) and the U.S. Government do not and can not warrant the *
16
* performance or results that may be obtained by using this software, *
17
* data, or derivative works thereof. The NLM and the U.S. Government *
18
* disclaim any and all warranties, expressed or implied, as to the *
19
* performance, merchantability or fitness for any particular purpose or *
22
* In any work or product derived from this material, proper attribution *
23
* of the author(s) as the source of the software or data would be *
26
**************************************************************************/
27
/*****************************************************************************
31
Authors: Zheng Zhang and Alejandro Schaffer
32
Maintainer: Alejandro Schaffer
34
Contents: main routine for pseed3, stand-alone counterpart to PHI-BLAST.
39
Revision 6.6 1999/09/22 17:54:21 shavirin
40
Now functions will collect messages in ValNodePtr before printing out.
42
*****************************************************************************/
68
static Args myargs [NUMARG] = {
70
"nr", NULL, NULL, FALSE, 'd', ARG_STRING, 0.0, 0, NULL},
72
"stdin", NULL, NULL, FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
74
"hit_file", NULL, NULL, FALSE, 'k', ARG_FILE_IN, 0.0, 0, NULL},
75
{ "Output File for Alignment",
76
"stdout", NULL, NULL, TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
77
{ "Cost to open a gap",
78
"11", NULL, NULL, FALSE, 'G', ARG_INT, 0.0, 0, NULL},
79
{ "Cost to extend a gap",
80
"1", NULL, NULL, FALSE, 'E', ARG_INT, 0.0, 0, NULL},
81
{ "Cost decline to align",
82
"99999", NULL, NULL, FALSE, 'D', ARG_INT, 0.0, 0, NULL},
83
{ "X dropoff value for gapped alignment (in bits)",
84
"15", NULL, NULL, FALSE, 'X', ARG_INT, 0.0, 0, NULL},
86
"30", NULL, NULL, FALSE, 'S', ARG_INT, 0.0, 0, NULL},
87
{ "Score only or not",
88
"1", NULL, NULL, FALSE, 'C', ARG_INT, 0.0, 0, NULL},
89
{ "Show GI's in deflines",
90
"F", NULL, NULL, FALSE, 'I', ARG_BOOLEAN, 0.0, 0, NULL},
91
{ "Expectation value(E)",
92
"10.0", NULL, NULL, FALSE, 'e', ARG_FLOAT, 0.0, 0, NULL},
93
{ "Believe the query defline",
94
"F", NULL, NULL, FALSE, 'J', ARG_BOOLEAN, 0.0, 0, NULL},
95
{ "SeqAlign file ('Believe the query defline' must be TRUE)",
96
NULL, NULL, NULL, TRUE, 'O', ARG_FILE_OUT, 0.0, 0, NULL},
98
"BLOSUM62", NULL, NULL, FALSE, 'M', ARG_STRING, 0.0, 0, NULL},
99
{ "Number of one-line descriptions (V)",
100
"500", NULL, NULL, FALSE, 'v', ARG_INT, 0.0, 0, NULL},
101
{ "Number of alignments to show (B)",
102
"250", NULL, NULL, FALSE, 'b', ARG_INT, 0.0, 0, NULL},
104
"patmatchp", NULL, NULL, FALSE, 'p', ARG_STRING, 0.0, 0, NULL},
105
{ "Cost for a match",
106
"10", NULL, NULL, FALSE, 'r', ARG_INT, 0.0, 0, NULL},
107
{ "Cost for a mismatch",
108
"-10", NULL, NULL, FALSE, 'q', ARG_INT, 0.0, 0, NULL},
109
{ "Filter query sequence with SEG",
110
"F", NULL, NULL, FALSE, 'F', ARG_BOOLEAN, 0.0, 0, NULL}
116
BioseqPtr query_bsp; /*structure to hold query information*/
117
SeqEntryPtr sep; /*structure to hold query retrieval result*/
118
Int4 queryLength; /*length of query sequence*/
119
Int4 queryPos; /*index over query sequence*/
120
Int4 i, j; /*indices over DNA characters*/
121
Nlm_FloatHi dbLength, adjustdbLength; /*total number of characters in database*/
122
Uint1Ptr query =NULL; /*query sequence read in*/
123
Uint1Ptr unfilter_query =NULL; /*needed if seg will filter query*/
124
SeqLocPtr seg_slp; /*pointer to structure for seg filtering*/
125
Uint1Ptr seqFromDb; /*newly read sequence from database*/
126
Uint1Ptr reverseSeqFromDb = NULL; /*reverse of seqFromDb*/
127
Int4 lenSeqFromDb; /* length of seqFromDb */
128
Char *pattern; /*string description of a pettern*/
129
Char *pname; /*name of pattern*/
130
CharPtr queryfile; /*name of file containing query string*/
131
CharPtr database; /*name of database to use*/
132
CharPtr patfile; /*file describing patterns to match*/
133
CharPtr outputfile; /*file to which output should be written*/
134
Int4 seed; /*position in sequence where pattern match starts*/
135
Int4 lenPatMatch; /*number of positions taken by pattern match*/
136
Int4 num_seq; /*number of sequences in database*/
137
Int4 program_flag; /*which program is being called*/
138
hit_ptr hit_list; /*list of matches to one database sequence*/
139
Int4 list[MAX_HIT]; /*list of lengths and start positions where
140
pattern matches sequence*/
141
Int4 *occurArray; /*places in query where pattern occurs*/
142
Int4 *hitArray; /* beginning and end of pattern in query. */
143
Int4 numPatOccur; /*number of pattern occurrences in query string*/
144
Int4 effectiveOccurrences; /*number of occurrences not overlapping
145
in more than half the pattern*/
146
Int4 occurIndex; /*index over pattern ocuurences*/
147
Int4 twiceNumMatches; /*stores return value from find_hits*/
148
Int4 matchIndex; /*index for matches to a single sequence*/
149
Int4 totalOccurrences = 0, newOccurrences; /*total occurrences of pattern in database*/
150
Nlm_FloatHi eThresh; /*e-value threshold for hits*/
151
ReadDBFILEPtr rdpt=NULL; /*holds result of attempt to read database*/
152
GapAlignBlkPtr gap_align; /*structure to keep track of gapped
153
alingment information*/
154
BLAST_ScoreBlkPtr sbp; /*BLAST structure used to hold matrix*/
155
FILE *infp, *patfp; /*file descriptors for query file and pattern file*/
156
FILE *outfp; /*file descriptor for output file*/
157
Boolean is_dna; /*are we working with DNA or protein sequences*/
158
qseq_ptr query_seq; /*query sequence in record format*/
159
seedSearchItems *seedSearch; /*holds parameters related to seed*/
160
seedResultItems *seedResults = NULL; /*holds list of matching sequences*/
161
patternSearchItems *patternSearch = NULL; /*holds parameters
162
related to pattern1.c*/
163
BLAST_OptionsBlkPtr options; /*used as placeholder for fillCandLambda*/
164
ValNodePtr error_returns=NULL; /*store error messages*/
165
ValNodePtr info_vnp = NULL; /* store information messages */
167
seedSearch = (seedSearchItems *) ckalloc(sizeof(seedSearchItems));
168
seedResults = (seedResultItems *) ckalloc(sizeof(seedResultItems));
169
patternSearch = (patternSearchItems *) ckalloc(sizeof(patternSearchItems));
170
if (! GetArgs ("pseed3", NUMARG, myargs))
175
if (! SeqEntryLoad())
178
database = myargs[0].strvalue;
179
queryfile = myargs[1].strvalue;
180
if ((infp = FileOpen(queryfile, "r")) == NULL)
182
ErrPostEx(SEV_FATAL, 0, 0, "seed: Unable to open input file %s\n", queryfile);
186
patfile = myargs[2].strvalue;
187
if ((patfp = FileOpen(patfile, "r")) == NULL)
189
ErrPostEx(SEV_FATAL, 0, 0, "seed: Unable to open pattern file %s\n", patfile);
192
outputfile = myargs[3].strvalue;
194
if (outputfile != NULL)
196
if ((outfp = FileOpen(outputfile, "w")) == NULL)
198
ErrPostEx(SEV_FATAL, 0, 0, "seed: Unable to open output file %s\n", outputfile);
203
program_flag = convertProgramToFlag(myargs[17].strvalue, &is_dna);
205
gap_align = GapAlignBlkNew(1, 1);
206
gap_align->gap_open = myargs[4].intvalue;
207
gap_align->gap_extend = myargs[5].intvalue;
208
gap_align->decline_align = myargs[6].intvalue;
211
options = BLASTOptionNew("blastp", TRUE);
212
BLASTOptionSetGapParams(options, myargs[14].strvalue, 0, 0);
213
options->gap_open = myargs[4].intvalue;
214
options->gap_extend = myargs[5].intvalue;
215
fillCandLambda(seedSearch, myargs[14].strvalue, options);
216
gap_align->x_parameter = myargs[7].intvalue*NCBIMATH_LN2/seedSearch->paramLambda;
218
/* seedSearch->cutoffScore = myargs[8].intvalue; */
219
eThresh = (Nlm_FloatHi) myargs[11].floatvalue;
220
if (eThresh > MAX_EVALUE) {
221
ErrPostEx(SEV_FATAL, 0, 0, "E-value threshold is too high\n");
226
initProbs(seedSearch);
228
if (program_flag != PATTERN_FLAG) {
229
if (program_flag != PAT_MATCH_FLAG) {
230
sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa, 1);
231
sbp->read_in_matrix = TRUE;
232
BlastScoreBlkMatFill(sbp, myargs[14].strvalue);
233
gap_align->matrix = sbp->matrix;
236
for (i = 0; i < DNA_ALPHABET_SIZE; i++)
237
for (j = 0; j < DNA_ALPHABET_SIZE; j++)
238
if (i==j) /*characters match */
239
gap_align->matrix[i][j] = myargs[18].intvalue;
240
else /* characters mismatch*/
241
gap_align->matrix[i][j] = myargs[19].intvalue;
245
sep = FastaToSeqEntryEx(infp, is_dna, NULL, (Boolean) myargs[12].intvalue);
251
SeqEntryExplore(sep, &query_bsp, FindNuc);
255
SeqEntryExplore(sep, &query_bsp, FindProt);
257
if (query_bsp == NULL)
259
ErrPostEx(SEV_FATAL, 0, 0, "Unable to obtain bioseq\n");
262
query = BlastGetSequenceFromBioseq(query_bsp, &queryLength);
263
seg_slp = BlastBioseqFilter(query_bsp, options->filter_string);
265
unfilter_query = MemNew((queryLength + 1) * sizeof(Uint1));
266
for (i = 0; i < queryLength; i++)
267
unfilter_query[i] = query[i];
268
BlastMaskTheResidues(query,queryLength,21,seg_slp,FALSE, 0);
271
init_order(gap_align->matrix, program_flag, is_dna, seedSearch);
272
/*translation from ASCII to 0,1,2,3 alphabet*/
273
for (queryPos = 0; queryPos < queryLength; queryPos++)
274
query[queryPos] = seedSearch->order[query[queryPos]];
276
for (queryPos = 0; queryPos < queryLength; queryPos++)
277
unfilter_query[queryPos] = seedSearch->order[unfilter_query[queryPos]];
281
init_order(NULL, program_flag, is_dna, seedSearch);
283
rdpt = readdb_new(database, !is_dna);
284
if (program_flag == PATTERN_FLAG) {
285
search_pat(rdpt, patfile, is_dna, seedSearch, patternSearch, &error_returns, &info_vnp);
286
PGPOutTextMessages(info_vnp, outfp);
287
info_vnp = ValNodeFreeData(info_vnp);
289
BlastErrorPrint(error_returns);
290
rdpt = readdb_destruct(rdpt);
293
occurArray = (Int4 *) ckalloc(sizeof(Int4)*queryLength*2);
294
hitArray = (Int4 *) MemNew(sizeof(Int4)*queryLength*2);
297
num_seq = readdb_get_num_entries(rdpt);
298
dbLength = (Nlm_FloatHi) readdb_get_dblen(rdpt);
299
/*correct the effective size of the database to take out
300
the number of positions at the end of each sequence where
301
pattern match cannot start*/
303
while (pattern = get_a_pat(patfp, &pname, occurArray, hitArray, &numPatOccur,
304
&effectiveOccurrences,
305
program_flag, unfilter_query, query,
307
patternSearch, seedSearch, TRUE, &error_returns, &info_vnp)) {
309
PGPOutTextMessages(info_vnp, outfp);
310
info_vnp = ValNodeFreeData(info_vnp);
312
BlastErrorPrint(error_returns);
313
if (patternSearch->patternProbability > PAT_PROB_THRESH &&
314
(patternSearch->patternProbability * dbLength > EXPECT_MATCH_THRESH)) {
315
fprintf(outfp,"Pattern %s is too likely to occur in the database to be informative\n",pname);
318
if (patternSearch->wildcardProduct > WILDCARD_THRESH) {
319
fprintf(outfp, "Due to variable wildcards pattern %s is likely to occur too many times in a single sequence\n",pname);
323
adjustdbLength = dbLength - (num_seq * patternSearch->minPatternMatchLength);
325
for (occurIndex = 0; occurIndex < numPatOccur; occurIndex++) {
326
seed = occurArray[occurIndex];
327
totalOccurrences = 0;
328
fprintf(outfp,"Name %sPattern %s At position %d of query sequence\n",
329
pname, pattern, seed);
330
if ((twiceNumMatches=find_hits(list, &query[seed-1], queryLength-seed+1, FALSE, patternSearch)) < 2 ||
332
fprintf(outfp,"twiceNumMatches=%d list[1]=%d\n", i, list[1]);
333
ErrPostEx(SEV_FATAL, 0, 0, "pattern does not match the query at the place\n");
336
if (program_flag != PAT_MATCH_FLAG) {
337
lenPatMatch = list[0]+1;
340
query_seq = split_target_seq(query, seed, lenPatMatch, queryLength);
342
fprintf(outfp, "effective database length=%.1e\n pattern probability=%.1e\nlengthXprobability=%.1e\n", (Nlm_FloatHi) adjustdbLength,
343
patternSearch->patternProbability,
344
patternSearch->patternProbability * adjustdbLength);
346
/*extra caution about what values are tolerated*/
347
seedSearch->cutoffScore = eValueFit(
348
MIN(MAX_EVALUE, 10 * eThresh), adjustdbLength,
349
seedSearch, effectiveOccurrences, patternSearch->patternProbability);
352
seedSearch->cutoffScore = myargs[8].intvalue;
354
for (num_seq = 0; num_seq < rdpt->num_seqs; num_seq++) {
355
lenSeqFromDb = readdb_get_sequence(rdpt, num_seq, &seqFromDb);
356
/*if (num_seq == 3530)
357
printf("\n Stop here");*/
358
hit_list = get_hits(query_seq, lenPatMatch, seqFromDb,
359
lenSeqFromDb, gap_align, is_dna, patternSearch,
360
seedSearch, &newOccurrences);
361
if (newOccurrences > 0)
362
totalOccurrences += newOccurrences;
364
storeOneMatch(hit_list, num_seq, seqFromDb, seedResults);
369
quicksort_hits(matchIndex, seedResults);
370
fprintf(outfp,"\nNumber of occurrences of pattern in the database is %d\n", totalOccurrences);
371
/*Note: for stand-alone program, score only will be shown*/
372
output_hits(rdpt, TRUE, query, query_seq,
373
lenPatMatch, adjustdbLength, gap_align, is_dna,
374
effectiveOccurrences, seedSearch, seedResults,
375
patternSearch, FALSE, totalOccurrences, eThresh,
376
NULL, 0.0, NULL, matchIndex, NULL, TRUE,
379
if(query_seq != NULL) {
380
MemFree(query_seq->lseq);
384
PGPOutTextMessages(info_vnp, outfp);
385
info_vnp = ValNodeFreeData(info_vnp);
387
seed_free_all(seedResults);
394
BLAST_ScoreBlkDestruct(sbp);
395
GapAlignBlkDelete(gap_align);
396
BLASTOptionDelete(options);
397
rdpt = readdb_destruct(rdpt);
401
MemFree(unfilter_query);
403
MemFree(seedResults);
404
MemFree(patternSearch);