1
/* $Id: blast_returns.c,v 1.12 2004/10/06 15:01:01 dondosha Exp $
2
* ===========================================================================
5
* National Center for Biotechnology Information
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's offical duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government have not placed any restriction on its use or reproduction.
14
* Although all reasonable efforts have been taken to ensure the accuracy
15
* and reliability of the software and data, the NLM and the U.S.
16
* Government do not and cannot warrant the performance or results that
17
* may be obtained by using this software or data. The NLM and the U.S.
18
* Government disclaim all warranties, express or implied, including
19
* warranties of performance, merchantability or fitness for any particular
22
* Please cite the author in any work or product based on this material.
24
* ===========================================================================*/
26
/*****************************************************************************
28
File name: blast_returns.c
30
Author: Ilya Dondoshansky
32
Contents: Manipulating data returned from BLAST other than Seq-aligns
36
******************************************************************************
40
static char const rcsid[] = "$Id: blast_returns.c,v 1.12 2004/10/06 15:01:01 dondosha Exp $";
42
#include <algo/blast/api/blast_returns.h>
43
#include <algo/blast/api/blast_seq.h>
44
#include <algo/blast/core/blast_filter.h>
45
#include <algo/blast/core/blast_util.h>
47
TxDfDbInfo* Blast_GetDbInfo(ReadDBFILE* rdfp)
49
TxDfDbInfo* dbinfo = NULL;
55
dbinfo = calloc(1, sizeof(TxDfDbInfo));
57
dbinfo->name = strdup(readdb_get_filename(rdfp));
59
if (((chptr = readdb_get_title(rdfp)) == NULL) && dbinfo->name)
60
dbinfo->definition = strdup(dbinfo->name);
62
dbinfo->definition = strdup(chptr);
64
dbinfo->date = strdup(readdb_get_date(rdfp));
66
dbinfo->is_protein = readdb_is_prot(rdfp);
68
readdb_get_totals_ex(rdfp, &dbinfo->total_length, &dbinfo->number_seqs, TRUE);
74
Blast_GetParametersBuffer(EBlastProgramType program_number,
75
const Blast_SummaryReturn* sum_returns)
78
char* ret_buffer=NULL;
79
Int2 ret_buffer_length=0;
80
BlastUngappedStats* ungapped_stats = NULL;
81
BlastGappedStats* gapped_stats = NULL;
82
BlastRawCutoffs* raw_cutoffs = NULL;
83
Blast_SearchParams* search_params=NULL;
84
Blast_DatabaseStats* db_stats = NULL;
85
BlastDiagnostics* diagnostics = NULL;
88
if (sum_returns == NULL)
91
search_params = sum_returns->search_params;
92
db_stats = sum_returns->db_stats;
93
diagnostics = sum_returns->diagnostics;
96
ungapped_stats = diagnostics->ungapped_stat;
97
gapped_stats = diagnostics->gapped_stat;
98
raw_cutoffs = diagnostics->cutoffs;
101
if (program_number == eBlastTypeBlastn)
103
sprintf(buffer, "Matrix: blastn matrix:%d %d", search_params->match, search_params->mismatch);
104
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
106
else if (search_params->matrix)
108
sprintf(buffer, "Matrix: %s", search_params->matrix);
109
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
112
if (search_params->gapped_search) {
113
sprintf(buffer, "Gap Penalties: Existence: %ld, Extension: %ld",
114
(long) search_params->gap_open,
115
(long) search_params->gap_extension);
116
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
119
sprintf(buffer, "Number of Sequences: %ld", (long) db_stats->dbnum);
120
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
121
if (ungapped_stats) {
122
sprintf(buffer, "Number of Hits to DB: %s",
123
Nlm_Int8tostr(ungapped_stats->lookup_hits, 1));
124
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
126
sprintf(buffer, "Number of extensions: %ld",
127
(long) ungapped_stats->init_extends);
128
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
129
sprintf(buffer, "Number of successful extensions: %ld",
130
(long) ungapped_stats->good_init_extends);
131
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
135
if (search_params->expect > 0.1) {
136
sprintf(buffer, "Number of sequences better than %4.1f: %ld",
137
search_params->expect,
138
(long) gapped_stats->num_seqs_passed);
140
sprintf(buffer, "Number of sequences better than %3.1e: %ld",
141
search_params->expect,
142
(long) gapped_stats->num_seqs_passed);
144
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
146
if (search_params->gapped_search) {
148
"Number of HSP's better than %4.1f without gapping: %ld",
149
search_params->expect,
150
(long) gapped_stats->seqs_ungapped_passed);
151
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
153
"Number of HSP's gapped: %ld",
154
(long) gapped_stats->extensions);
155
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
157
"Number of HSP's successfully gapped: %ld",
158
(long) gapped_stats->good_extensions);
159
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
161
"Number of extra gapped extensions for HSPs above %4.1f: %ld",
162
search_params->expect,
163
(long) gapped_stats->extra_extensions);
164
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
168
/* Query length makes sense only for single query sequence. */
169
if (db_stats->qlen > 0) {
170
sprintf(buffer, "Length of query: %ld", (long)db_stats->qlen);
171
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
174
sprintf(buffer, "Length of database: %s", Nlm_Int8tostr (db_stats->dblength, 1));
175
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
177
if (db_stats->qlen > 0) {
178
sprintf(buffer, "Length adjustment: %ld", (long) db_stats->hsp_length);
179
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
181
/** FIXME: Should this be different for RPS BLAST? */
182
sprintf(buffer, "Effective length of query: %ld", (long)db_stats->eff_qlen);
183
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
185
sprintf(buffer, "Effective length of database: %s",
186
Nlm_Int8tostr (db_stats->eff_dblength , 1));
187
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
188
sprintf(buffer, "Effective search space: %8.0f", (double) db_stats->eff_searchsp);
189
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
190
sprintf(buffer, "Effective search space used: %8.0f", (double) db_stats->eff_searchsp_used);
191
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
193
sprintf(buffer, "Neighboring words threshold: %ld",
194
(long) search_params->threshold);
195
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
196
sprintf(buffer, "Window for multiple hits: %ld",
197
(long) search_params->window_size);
198
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
201
BLAST_KAParameters* ka_params_gap = sum_returns->ka_params_gap;
202
BLAST_KAParameters* ka_params = sum_returns->ka_params;
204
sprintf(buffer, "X1: %ld (%4.1f bits)",
205
(long)raw_cutoffs->x_drop_ungapped, (raw_cutoffs->x_drop_ungapped)*(ka_params->Lambda)/NCBIMATH_LN2);
206
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
208
if (search_params->gapped_search) {
209
sprintf(buffer, "X2: %ld (%4.1f bits)",
210
(long)raw_cutoffs->x_drop_gap, raw_cutoffs->x_drop_gap*(ka_params_gap->Lambda)/NCBIMATH_LN2);
211
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
212
sprintf(buffer, "X3: %ld (%4.1f bits)",
213
(long)raw_cutoffs->x_drop_gap_final, raw_cutoffs->x_drop_gap_final*(ka_params_gap->Lambda)/NCBIMATH_LN2);
214
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
216
sprintf(buffer, "S1: %ld (%4.1f bits)",
217
(long)raw_cutoffs->gap_trigger,
218
((raw_cutoffs->gap_trigger*(ka_params->Lambda))-(log(ka_params->K)))/NCBIMATH_LN2);
219
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
222
if (search_params->gapped_search)
223
sprintf(buffer, "S2: %ld (%4.1f bits)", (long) raw_cutoffs->cutoff_score,
224
(((raw_cutoffs->cutoff_score)*(ka_params_gap->Lambda))-(log(ka_params_gap->K)))/NCBIMATH_LN2);
226
sprintf(buffer, "S2: %ld (%4.1f bits)", (long) raw_cutoffs->cutoff_score,
227
(((raw_cutoffs->cutoff_score)*(ka_params->Lambda))-(log(ka_params->K)))/NCBIMATH_LN2);
229
add_string_to_buffer(buffer, &ret_buffer, &ret_buffer_length);
235
/** Save the Karlin-Altschul parameters calculated in the BLAST search.
236
* @param sbp Internal scoring block structure [in]
237
* @param context Index into the array of structures containing
238
* Karlin-Altschul parameters [in]
239
* @param sum_returns Returns summary structure [out]
242
Blast_SummaryKAParametersFill(const BlastScoreBlk* sbp, Int4 context,
243
Blast_SummaryReturn* sum_returns)
245
Blast_KarlinBlk* kbp;
251
kbp = sbp->kbp[context];
252
sum_returns->ka_params =
253
(BLAST_KAParameters*) malloc(sizeof(BLAST_KAParameters));
254
sum_returns->ka_params->Lambda = kbp->Lambda;
255
sum_returns->ka_params->K = kbp->K;
256
sum_returns->ka_params->H = kbp->H;
260
kbp = sbp->kbp_gap[context];
261
sum_returns->ka_params_gap =
262
(BLAST_KAParameters*) malloc(sizeof(BLAST_KAParameters));
263
sum_returns->ka_params_gap->Lambda = kbp->Lambda;
264
sum_returns->ka_params_gap->K = kbp->K;
265
sum_returns->ka_params_gap->H = kbp->H;
269
/** Frees Blast_DatabaseStats object
270
* @param db_stats object to be freed [in]
271
* @return NULL pointer
273
static Blast_DatabaseStats*
274
Blast_SummaryDBStatsFree(Blast_DatabaseStats* db_stats)
280
/** Allocated and fills in Blast_DatabaseStats
281
* Either rdfp or subject_slp should be non-NULL, not both!
283
* @param program_number blastn/blastp/etc. [in]
284
* @param eff_len_options pointer to effective length options [in]
285
* @param query_info information about query [in]
286
* @param rdfp pointer to BLAST DB reader [in]
287
* @param subject_slp Seq-loc for target sequence [in]
288
* @param db_stats object to be returned [out]
289
* @return zero on success
292
Blast_SummaryDBStatsFill(EBlastProgramType program_number,
293
const BlastEffectiveLengthsOptions* eff_len_options,
294
const BlastQueryInfo* query_info, ReadDBFILE* rdfp,
295
SeqLoc* subject_slp, Blast_DatabaseStats** db_stats)
297
Int8 total_length=0; /* total length of database. */
298
Int4 num_entries=0; /* number of entries in database. */
299
Int4 num_frames; /* number of frames allowed. */
303
*db_stats = (Blast_DatabaseStats*) calloc(1, sizeof(Blast_DatabaseStats));
304
if (*db_stats == NULL)
307
if (eff_len_options->db_length) {
308
total_length = eff_len_options->db_length;
310
readdb_get_totals_ex(rdfp, &total_length, &num_entries, TRUE);
313
for (slp = subject_slp; slp; slp = slp->next) {
314
total_length += SeqLocLen(slp);
318
if (program_number == eBlastTypeTblastn ||
319
program_number == eBlastTypeRpsTblastn ||
320
program_number == eBlastTypeTblastx)
323
(*db_stats)->dblength = total_length;
326
if (eff_len_options->dbseq_num)
327
num_entries = eff_len_options->dbseq_num;
329
num_entries = ValNodeLen(subject_slp);
331
(*db_stats)->dbnum = num_entries;
333
if (program_number == eBlastTypeBlastx ||
334
program_number == eBlastTypeTblastx)
335
num_frames = NUM_FRAMES;
336
else if (program_number == eBlastTypeBlastn)
342
if (query_info->last_context < num_frames) { /* Only one query here. */
343
Int4 qlen = BLAST_GetQueryLength(query_info, query_info->first_context);
344
(*db_stats)->hsp_length = query_info->length_adjustments[query_info->first_context];
345
/** FIXME: Should this be different for RPS BLAST? */
346
(*db_stats)->qlen = qlen;
347
(*db_stats)->eff_qlen = qlen - ((*db_stats)->hsp_length);
348
(*db_stats)->eff_dblength = total_length - num_entries*((*db_stats)->hsp_length);
349
(*db_stats)->eff_searchsp = query_info->eff_searchsp_array[query_info->first_context];
350
if (eff_len_options && eff_len_options->searchsp_eff)
351
(*db_stats)->eff_searchsp_used = eff_len_options->searchsp_eff;
353
(*db_stats)->eff_searchsp_used = (*db_stats)->eff_searchsp;
359
/** Free Blast_SearchParams structure and underlying data
361
* @param search_params the object to be freed [in]
362
* @return NULL pointer
364
static Blast_SearchParams*
365
Blast_SummarySearchParamsFree(Blast_SearchParams* search_params)
368
if (search_params == NULL)
371
sfree(search_params->matrix);
372
sfree(search_params->entrez_query);
373
sfree(search_params->filter_string);
374
sfree(search_params);
379
/** Allocated and fills some search parameters.
381
* @param program_number identifies blastn/blastp/etc. [in]
382
* @param score_options pointer to scoring options [in]
383
* @param lookup_options pointer to options for lookup table creation [in]
384
* @param hit_options options for saving and evaluating hits [in]
385
* @param query_setup options for filtering etc. [in]
386
* @param word_options options for processing initial hits [in]
387
* @param entrez_query limit search by this query [in]
388
* @param search_params object to be allocated and filled [out]
389
* @return zero on success
392
Blast_SummarySearchParamsFill(EBlastProgramType program_number,
393
const BlastScoringOptions* score_options,
394
const LookupTableOptions* lookup_options,
395
const BlastHitSavingOptions* hit_options,
396
const QuerySetUpOptions* query_setup,
397
const BlastInitialWordOptions* word_options,
398
const char* entrez_query,
399
Blast_SearchParams** search_params)
401
Blast_SearchParams* search_params_lcl;
403
ASSERT(search_params);
404
ASSERT(score_options && lookup_options && hit_options && query_setup);
406
*search_params = search_params_lcl = (Blast_SearchParams*) calloc(1, sizeof(Blast_SearchParams));
407
if (search_params_lcl == NULL)
410
if (program_number == eBlastTypeBlastn)
412
search_params_lcl->match = score_options->reward;
413
search_params_lcl->mismatch = score_options->penalty;
415
else if (score_options->matrix)
417
search_params_lcl->matrix = StringSave(score_options->matrix);
420
if (score_options->gapped_calculation)
422
search_params_lcl->gapped_search = TRUE;
423
search_params_lcl->gap_open = score_options->gap_open;
424
search_params_lcl->gap_extension = score_options->gap_extend;
427
search_params_lcl->gapped_search = FALSE;
429
if (query_setup && query_setup->filter_string)
430
search_params_lcl->filter_string = StringSave(query_setup->filter_string);
432
search_params_lcl->filter_string = StringSave("F");
434
search_params_lcl->expect = hit_options->expect_value;
436
if (lookup_options->phi_pattern)
437
search_params_lcl->pattern = StringSave(lookup_options->phi_pattern);
439
search_params_lcl->threshold = lookup_options->threshold;
441
search_params_lcl->window_size = word_options->window_size;
444
search_params_lcl->entrez_query = StringSave(entrez_query);
451
Blast_SummaryReturnFree(Blast_SummaryReturn* sum_returns)
454
sfree(sum_returns->ka_params);
455
sfree(sum_returns->ka_params_gap);
456
sum_returns->db_stats = Blast_SummaryDBStatsFree(sum_returns->db_stats);
457
sum_returns->search_params = Blast_SummarySearchParamsFree(sum_returns->search_params);
458
sum_returns->diagnostics = Blast_DiagnosticsFree(sum_returns->diagnostics);
464
Int2 Blast_SummaryReturnFill(EBlastProgramType program_number,
465
const BlastScoringOptions* score_options,
466
const BlastScoreBlk* sbp,
467
const LookupTableOptions* lookup_options,
468
const BlastInitialWordOptions* word_options,
469
const BlastExtensionOptions* ext_options,
470
const BlastHitSavingOptions* hit_options,
471
const BlastEffectiveLengthsOptions* eff_len_options,
472
const QuerySetUpOptions* query_setup,
473
const BlastQueryInfo* query_info,
474
ReadDBFILE* rdfp, SeqLoc* subject_slp,
475
BlastDiagnostics** diagnostics,
476
Blast_SummaryReturn** sum_returns_out)
478
Blast_SummaryReturn* sum_returns =
479
(Blast_SummaryReturn*) calloc(1, sizeof(Blast_SummaryReturn));
481
if (sum_returns_out == NULL)
484
if (score_options == NULL || sbp == NULL || lookup_options == NULL ||
485
word_options == NULL || ext_options == NULL || hit_options == NULL ||
486
eff_len_options == NULL || query_info == NULL)
489
if (rdfp == NULL && subject_slp == NULL)
492
Blast_SummaryKAParametersFill(sbp, query_info->first_context, sum_returns);
493
Blast_SummaryDBStatsFill(program_number, eff_len_options, query_info, rdfp,
494
subject_slp, &(sum_returns->db_stats));
496
Blast_SummarySearchParamsFill(program_number, score_options,
497
lookup_options, hit_options, query_setup, word_options,
498
NULL, &(sum_returns->search_params));
501
sum_returns->diagnostics = *diagnostics;
504
*sum_returns_out = sum_returns;