1
/* $Id: blast_engine.c,v 1.159 2004/10/04 17:11:21 dondosha Exp $
2
* ===========================================================================
5
* National Center for Biotechnology Information
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's offical duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government have not placed any restriction on its use or reproduction.
14
* Although all reasonable efforts have been taken to ensure the accuracy
15
* and reliability of the software and data, the NLM and the U.S.
16
* Government do not and cannot warrant the performance or results that
17
* may be obtained by using this software or data. The NLM and the U.S.
18
* Government disclaim all warranties, express or implied, including
19
* warranties of performance, merchantability or fitness for any particular
22
* Please cite the author in any work or product based on this material.
24
* ===========================================================================
26
* Author: Ilya Dondoshansky
30
/** @file blast_engine.c
31
* High level BLAST functions
34
static char const rcsid[] =
35
"$Id: blast_engine.c,v 1.159 2004/10/04 17:11:21 dondosha Exp $";
37
#include <algo/blast/core/blast_engine.h>
38
#include <algo/blast/core/lookup_wrap.h>
39
#include <algo/blast/core/aa_ungapped.h>
40
#include <algo/blast/core/blast_util.h>
41
#include <algo/blast/core/blast_setup.h>
42
#include <algo/blast/core/blast_gapalign.h>
43
#include <algo/blast/core/blast_traceback.h>
44
#include <algo/blast/core/phi_extend.h>
45
#include <algo/blast/core/link_hsps.h>
47
/** Structure to be passed to BLAST_SearchEngineCore, containing pointers
48
to various preallocated structures and arrays. */
49
typedef struct BlastCoreAuxStruct {
51
Blast_ExtendWord* ewp; /**< Structure for keeping track of diagonal
52
information for initial word matches */
53
BlastWordFinderType WordFinder; /**< Word finder function pointer */
54
BlastGetGappedScoreType GetGappedScore; /**< Gapped extension function
56
BlastInitHitList* init_hitlist; /**< Placeholder for HSPs after
58
BlastHSPList* hsp_list; /**< Placeholder for HSPs after gapped
60
Uint4* query_offsets; /**< Placeholder for initial word match query
62
Uint4* subject_offsets; /**< Placeholder for initial word match
64
Uint1* translation_buffer; /**< Placeholder for translated subject
66
Uint1* translation_table; /**< Translation table for forward strand */
67
Uint1* translation_table_rc; /**< Translation table for reverse
71
/** Deallocates all memory in BlastCoreAuxStruct */
72
static BlastCoreAuxStruct*
73
BlastCoreAuxStructFree(BlastCoreAuxStruct* aux_struct)
75
BlastExtendWordFree(aux_struct->ewp);
76
BLAST_InitHitListFree(aux_struct->init_hitlist);
77
Blast_HSPListFree(aux_struct->hsp_list);
78
sfree(aux_struct->query_offsets);
79
sfree(aux_struct->subject_offsets);
85
/** Adjust HSP coordinates for out-of-frame gapped extension.
86
* @param program One of blastx or tblastn [in]
87
* @param init_hitlist List of hits after ungapped extension [in]
88
* @param query_info Query information containing context offsets;
89
* needed for blastx only [in]
90
* @param subject_frame Frame of the subject sequence; tblastn only [in]
91
* @param subject_length Length of the original nucleotide subject sequence;
93
* @param offset Shift in the subject sequence protein coordinates [in]
95
static void TranslateHSPsToDNAPCoord(EBlastProgramType program,
96
BlastInitHitList* init_hitlist, BlastQueryInfo* query_info,
97
Int2 subject_frame, Int4 subject_length, Int4 offset)
99
BlastInitHSP* init_hsp;
100
Int4 index, context, frame;
101
Int4* context_offsets = query_info->context_offsets;
103
for (index = 0; index < init_hitlist->total; ++index) {
104
init_hsp = &init_hitlist->init_hsp_array[index];
106
if (program == eBlastTypeBlastx) {
108
BSearchInt4(init_hsp->q_off, context_offsets,
109
query_info->last_context+1);
113
(init_hsp->q_off - context_offsets[context]) * CODON_LENGTH +
114
context_offsets[context-frame] + frame;
115
init_hsp->ungapped_data->q_start =
116
(init_hsp->ungapped_data->q_start - context_offsets[context])
117
* CODON_LENGTH + context_offsets[context-frame] + frame;
119
init_hsp->s_off += offset;
120
init_hsp->ungapped_data->s_start += offset;
121
if (subject_frame > 0) {
123
(init_hsp->s_off * CODON_LENGTH) + subject_frame - 1;
124
init_hsp->ungapped_data->s_start =
125
(init_hsp->ungapped_data->s_start * CODON_LENGTH) +
128
init_hsp->s_off = (init_hsp->s_off * CODON_LENGTH) +
129
subject_length - subject_frame;
130
init_hsp->ungapped_data->s_start =
131
(init_hsp->ungapped_data->s_start * CODON_LENGTH) +
132
subject_length - subject_frame;
139
/** The core of the BLAST search: comparison between the (concatenated)
140
* query against one subject sequence. Translation of the subject sequence
141
* into 6 frames is done inside, if necessary. If subject sequence is
142
* too long, it can be split into several chunks.
143
* @ param program_number BLAST program type [in]
144
* @param query Query sequence structure [in]
145
* @param query_info_in Query information [in]
146
* @param subject Subject sequence structure [in]
147
* @param lookup Lookup table [in]
148
* @param gap_align Structure for gapped alignment information [in]
149
* @param score_params Scoring parameters [in]
150
* @param word_params Initial word finding and ungapped extension
152
* @param ext_params Gapped extension parameters [in]
153
* @param hit_params Hit saving parameters [in]
154
* @param db_options Database options [in]
155
* @param diagnostics Hit counts and other diagnostics [in] [out]
156
* @param aux_struct Structure containing different auxiliary data and memory
157
* for the preliminary stage of the BLAST search [in]
158
* @param hsp_list_out List of HSPs found for a given subject sequence [in]
161
BLAST_SearchEngineCore(EBlastProgramType program_number, BLAST_SequenceBlk* query,
162
BlastQueryInfo* query_info_in, BLAST_SequenceBlk* subject,
163
LookupTableWrap* lookup, BlastGapAlignStruct* gap_align,
164
BlastScoringParameters* score_params,
165
BlastInitialWordParameters* word_params,
166
BlastExtensionParameters* ext_params,
167
BlastHitSavingParameters* hit_params,
168
const BlastDatabaseOptions* db_options,
169
BlastDiagnostics* diagnostics,
170
BlastCoreAuxStruct* aux_struct,
171
BlastHSPList** hsp_list_out)
173
BlastInitHitList* init_hitlist = aux_struct->init_hitlist;
174
BlastHSPList* hsp_list = aux_struct->hsp_list;
175
Uint1* translation_buffer = NULL;
176
Int4* frame_offsets = NULL;
177
BlastHitSavingOptions* hit_options = hit_params->options;
178
BlastScoringOptions* score_options = score_params->options;
179
BlastHSPList* combined_hsp_list = NULL;
181
Int4 context, first_context, last_context;
182
Int4 orig_length = subject->length;
183
Uint1* orig_sequence = subject->sequence;
186
BlastUngappedStats* ungapped_stats = NULL;
187
BlastGappedStats* gapped_stats = NULL;
188
BlastQueryInfo* query_info = query_info_in;
189
Boolean prelim_traceback =
190
(ext_params->options->ePrelimGapExt == eGreedyWithTracebackExt);
192
const Boolean k_translated_subject = (program_number == eBlastTypeTblastn
193
|| program_number == eBlastTypeTblastx
194
|| program_number == eBlastTypeRpsTblastn);
196
if (k_translated_subject) {
199
if (score_options->is_ooframe) {
200
BLAST_GetAllTranslations(orig_sequence, NCBI2NA_ENCODING,
201
orig_length, db_options->gen_code_string, &translation_buffer,
202
&frame_offsets, &subject->oof_sequence);
203
subject->oof_sequence_allocated = TRUE;
204
} else if (program_number == eBlastTypeRpsTblastn ) {
205
/* For RPS tblastn, subject is actually query, which has already
206
been translated during the setup stage. */
207
translation_buffer = orig_sequence - 1;
208
frame_offsets = query_info_in->context_offsets;
210
BLAST_GetAllTranslations(orig_sequence, NCBI2NA_ENCODING,
211
orig_length, db_options->gen_code_string, &translation_buffer,
212
&frame_offsets, NULL);
214
} else if (program_number == eBlastTypeBlastn) {
222
*hsp_list_out = NULL;
224
if (gap_align->positionBased)
225
matrix = gap_align->sbp->posMatrix;
227
matrix = gap_align->sbp->matrix;
229
hsp_num_max = (hit_options->hsp_num_max ? hit_options->hsp_num_max : INT4_MAX);
232
ungapped_stats = diagnostics->ungapped_stat;
233
gapped_stats = diagnostics->gapped_stat;
236
/* Substitute query info by concatenated database info for RPS BLAST search */
237
if (program_number == eBlastTypeRpsBlast ||
238
program_number == eBlastTypeRpsTblastn) {
239
BlastRPSLookupTable* lut = (BlastRPSLookupTable*) lookup->lut;
240
query_info = (BlastQueryInfo*) calloc(1, sizeof(BlastQueryInfo));
241
query_info->num_queries = lut->num_profiles;
242
query_info->last_context = lut->num_profiles - 1;
243
query_info->context_offsets = lut->rps_seq_offsets;
246
/* Loop over frames of the subject sequence */
247
for (context=first_context; context<=last_context; context++) {
248
Int4 chunk; /* loop variable below. */
249
Int4 num_chunks; /* loop variable below. */
250
Int4 offset = 0; /* Used as offset into subject sequence (if chunked) */
251
Int4 total_subject_length; /* Length of subject sequence used when split. */
253
if (k_translated_subject) {
254
subject->frame = BLAST_ContextToFrame(eBlastTypeBlastx, context);
256
translation_buffer + frame_offsets[context] + 1;
258
frame_offsets[context+1] - frame_offsets[context] - 1;
260
subject->frame = context;
263
/* Split subject sequence into chunks if it is too long */
264
num_chunks = (subject->length - DBSEQ_CHUNK_OVERLAP) /
265
(MAX_DBSEQ_LEN - DBSEQ_CHUNK_OVERLAP) + 1;
266
total_subject_length = subject->length;
268
for (chunk = 0; chunk < num_chunks; ++chunk) {
270
offset += subject->length - DBSEQ_CHUNK_OVERLAP;
271
if (program_number == eBlastTypeBlastn) {
273
(subject->length - DBSEQ_CHUNK_OVERLAP)/COMPRESSION_RATIO;
275
subject->sequence += (subject->length - DBSEQ_CHUNK_OVERLAP);
278
subject->length = MIN(total_subject_length - offset,
281
BlastInitHitListReset(init_hitlist);
283
aux_struct->WordFinder(subject, query, lookup,
284
matrix, word_params, aux_struct->ewp, aux_struct->query_offsets,
285
aux_struct->subject_offsets, GetOffsetArraySize(lookup),
286
init_hitlist, ungapped_stats);
288
if (init_hitlist->total == 0)
291
if (score_options->gapped_calculation) {
292
Int4 prot_length = 0;
293
if (score_options->is_ooframe) {
294
/* Convert query offsets in all HSPs into the mixed-frame
296
TranslateHSPsToDNAPCoord(program_number, init_hitlist,
297
query_info, subject->frame, orig_length, offset);
298
if (k_translated_subject) {
299
prot_length = subject->length;
300
subject->length = 2*orig_length + 1;
303
/** NB: If queries are concatenated, HSP offsets must be adjusted
304
* inside the following function call, so coordinates are
305
* relative to the individual contexts (i.e. queries, strands or
306
* frames). Contexts should also be filled in HSPs when they
309
aux_struct->GetGappedScore(program_number, query, query_info,
310
subject, gap_align, score_params, ext_params, hit_params,
311
init_hitlist, &hsp_list, gapped_stats);
312
if (score_options->is_ooframe && k_translated_subject)
313
subject->length = prot_length;
315
BLAST_GetUngappedHSPList(init_hitlist, query_info, subject,
316
hit_options, &hsp_list);
319
if (hsp_list->hspcnt == 0)
322
/* The subject ordinal id is not yet filled in this HSP list */
323
hsp_list->oid = subject->oid;
325
/* Assign frames in all HSPs. */
326
Blast_HSPListSetFrames(program_number, hsp_list,
327
score_options->is_ooframe);
329
Blast_HSPListAdjustOffsets(hsp_list, offset);
330
/* Allow merging of HSPs either if traceback is already
331
available, or if it is an ungapped search */
332
Blast_HSPListsMerge(hsp_list, &combined_hsp_list, hsp_num_max, offset,
333
(Boolean)(prelim_traceback || !score_options->gapped_calculation));
334
} /* End loop on chunks of subject sequence */
336
if (Blast_HSPListAppend(combined_hsp_list, hsp_list_out, hsp_num_max)) {
341
combined_hsp_list = Blast_HSPListFree(combined_hsp_list);
342
} /* End loop on frames */
344
/* Restore the original contents of the subject block */
345
subject->length = orig_length;
346
subject->sequence = orig_sequence;
351
hsp_list = *hsp_list_out;
353
if (hit_params->link_hsp_params) {
354
status = BLAST_LinkHsps(program_number, hsp_list, query_info,
355
subject->length, gap_align->sbp, hit_params->link_hsp_params,
356
score_options->gapped_calculation);
357
} else if (hit_options->phi_align) {
358
/* These e-values will not be accurate yet, since we don't know
359
the number of pattern occurrencies in the database. That
360
is arbitrarily set to 1 at this time. */
361
Blast_HSPListPHIGetEvalues(hsp_list, gap_align->sbp);
363
/* Calculate e-values for all HSPs. Skip this step
364
for RPS blast, since calculating the E values
365
requires precomputation that has not been done yet */
366
if (program_number != eBlastTypeRpsBlast &&
367
program_number != eBlastTypeRpsTblastn)
368
status = Blast_HSPListGetEvalues(query_info, hsp_list,
369
score_options->gapped_calculation, gap_align->sbp, 0);
372
/* Free the local query info structure when needed. */
373
if (query_info != query_info_in)
376
/* Discard HSPs that don't pass the e-value test. */
377
status = Blast_HSPListReapByEvalue(hsp_list, hit_options);
379
/* If there are no HSPs left, destroy the HSP list too. */
380
if (hsp_list && hsp_list->hspcnt == 0)
381
*hsp_list_out = hsp_list = Blast_HSPListFree(hsp_list);
383
if (gapped_stats && hsp_list && hsp_list->hspcnt > 0) {
384
++gapped_stats->num_seqs_passed;
385
gapped_stats->good_extensions += hsp_list->hspcnt;
388
/* Free translation buffer and frame offsets, except for RPS tblastn,
389
* where they are taken from different structures, and hence shouldn't
392
if (program_number != eBlastTypeRpsTblastn) {
393
if (translation_buffer) {
394
sfree(translation_buffer);
397
sfree(frame_offsets);
405
FillReturnCutoffsInfo(BlastRawCutoffs* return_cutoffs,
406
BlastScoringParameters* score_params,
407
BlastInitialWordParameters* word_params,
408
BlastExtensionParameters* ext_params,
409
BlastHitSavingParameters* hit_params)
411
/* since the cutoff score here will be used for display
412
putposes, strip out any internal scaling of the scores */
414
Int4 scale_factor = (Int4)score_params->scale_factor;
419
return_cutoffs->x_drop_ungapped =
420
word_params->x_dropoff / scale_factor;
421
return_cutoffs->x_drop_gap = ext_params->gap_x_dropoff / scale_factor;
422
return_cutoffs->x_drop_gap_final = ext_params->gap_x_dropoff_final /
424
return_cutoffs->gap_trigger = ext_params->gap_trigger / scale_factor;
425
return_cutoffs->cutoff_score = hit_params->cutoff_score;
430
/** Setup of the auxiliary BLAST structures;
431
* also calculates internally used parameters from options.
432
* @param seq_src Sequence source information, with callbacks to get
433
* sequences, their lengths, etc. [in]
434
* @param lookup_wrap Lookup table, already constructed. [in]
435
* @param word_options options for initial word finding. [in]
436
* @param ext_options options for gapped extension. [in]
437
* @param hit_options options for saving hits. [in]
438
* @param query The query sequence block [in]
439
* @param aux_struct_ptr Placeholder joining various auxiliary memory
443
BLAST_SetUpAuxStructures(const BlastSeqSrc* seq_src,
444
LookupTableWrap* lookup_wrap,
445
const BlastInitialWordOptions* word_options,
446
const BlastExtensionOptions* ext_options,
447
const BlastHitSavingOptions* hit_options,
448
BLAST_SequenceBlk* query, BlastCoreAuxStruct** aux_struct_ptr)
451
BlastCoreAuxStruct* aux_struct;
452
Boolean blastp = (lookup_wrap->lut_type == AA_LOOKUP_TABLE ||
453
lookup_wrap->lut_type == RPS_LOOKUP_TABLE);
454
Boolean mb_lookup = (lookup_wrap->lut_type == MB_LOOKUP_TABLE);
455
Boolean phi_lookup = (lookup_wrap->lut_type == PHI_AA_LOOKUP ||
456
lookup_wrap->lut_type == PHI_NA_LOOKUP);
457
Boolean ag_blast = (word_options->extension_method == eRightAndLeft);
458
Int4 offset_array_size = GetOffsetArraySize(lookup_wrap);
459
Uint4 avg_subj_length;
460
Boolean is_na = mb_lookup || (lookup_wrap->lut_type == NA_LOOKUP_TABLE);
464
*aux_struct_ptr = aux_struct = (BlastCoreAuxStruct*)
465
calloc(1, sizeof(BlastCoreAuxStruct));
467
avg_subj_length = BLASTSeqSrcGetAvgSeqLen(seq_src);
469
if ((status = BlastExtendWordNew(is_na, query->length, word_options,
470
avg_subj_length, &aux_struct->ewp)) != 0)
474
aux_struct->WordFinder = MB_WordFinder;
475
} else if (phi_lookup) {
476
aux_struct->WordFinder = PHIBlastWordFinder;
478
aux_struct->WordFinder = BlastAaWordFinder;
479
} else if (ag_blast) {
480
aux_struct->WordFinder = BlastNaWordFinder_AG;
482
aux_struct->WordFinder = BlastNaWordFinder;
485
aux_struct->query_offsets =
486
(Uint4*) malloc(offset_array_size * sizeof(Uint4));
487
aux_struct->subject_offsets =
488
(Uint4*) malloc(offset_array_size * sizeof(Uint4));
490
aux_struct->init_hitlist = BLAST_InitHitListNew();
491
/* Pick which gapped alignment algorithm to use. */
493
aux_struct->GetGappedScore = PHIGetGappedScore;
494
else if (ext_options->ePrelimGapExt == eDynProgExt)
495
aux_struct->GetGappedScore = BLAST_GetGappedScore;
497
aux_struct->GetGappedScore = BLAST_MbGetGappedScore;
499
aux_struct->hsp_list = Blast_HSPListNew(hit_options->hsp_num_max);
504
BLAST_RPSSearchEngine(EBlastProgramType program_number,
505
BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
506
const BlastSeqSrc* seq_src, BlastScoreBlk* sbp,
507
const BlastScoringOptions* score_options,
508
LookupTableWrap* lookup_wrap,
509
const BlastInitialWordOptions* word_options,
510
const BlastExtensionOptions* ext_options,
511
const BlastHitSavingOptions* hit_options,
512
const BlastEffectiveLengthsOptions* eff_len_options,
513
const PSIBlastOptions* psi_options,
514
const BlastDatabaseOptions* db_options,
515
BlastHSPStream* hsp_stream, BlastDiagnostics* diagnostics,
516
BlastHSPResults** results)
518
BlastCoreAuxStruct* aux_struct = NULL;
519
BlastHSPList* hsp_list = NULL;
520
BlastScoringParameters* score_params = NULL;
521
BlastInitialWordParameters* word_params = NULL;
522
BlastExtensionParameters* ext_params = NULL;
523
BlastHitSavingParameters* hit_params = NULL;
524
BlastEffectiveLengthsParameters* eff_len_params = NULL;
525
BlastGapAlignStruct* gap_align;
529
Uint4 avg_subj_length = 0;
530
BlastRPSLookupTable *lookup = (BlastRPSLookupTable *)lookup_wrap->lut;
531
BLAST_SequenceBlk concat_db;
532
BlastQueryInfo concat_db_info;
533
RPSAuxInfo *rps_info;
534
BlastRawCutoffs* raw_cutoffs = NULL;
536
if (program_number != eBlastTypeRpsBlast &&
537
program_number != eBlastTypeRpsTblastn)
541
BLAST_SetUpAuxStructures(seq_src, lookup_wrap, word_options,
542
ext_options, hit_options, query, &aux_struct)) != 0)
546
BLAST_GapAlignSetUp(program_number, seq_src, score_options,
547
eff_len_options, ext_options, hit_options, query_info, sbp,
548
&score_params, &ext_params, &hit_params, &eff_len_params,
552
BlastInitialWordParametersNew(program_number, word_options,
553
hit_params, ext_params, sbp, query_info,
554
BLASTSeqSrcGetAvgSeqLen(seq_src), &word_params);
555
/* Update the parameters for linking HSPs, if necessary. */
556
BlastLinkHSPParametersUpdate(word_params, ext_params,
557
hit_params, score_options->gapped_calculation);
559
/* modify scoring and gap alignment structures for
560
use with RPS blast. */
562
rps_info = lookup->rps_aux_info;
563
gap_align->positionBased = TRUE;
564
gap_align->sbp->posMatrix = lookup->rps_pssm;
566
/* determine the total number of residues in the db.
567
This figure must also include one trailing NULL for
570
num_db_seqs = BLASTSeqSrcGetNumSeqs(seq_src);
571
dbsize = BLASTSeqSrcGetTotLen(seq_src) + num_db_seqs;
572
if (dbsize > INT4_MAX)
575
/* Concatenate all of the DB sequences together, and pretend
576
this is a large multiplexed sequence. Note that because the
577
scoring is position-specific, the actual sequence data is
580
memset(&concat_db, 0, sizeof(concat_db)); /* fill in SequenceBlk */
581
concat_db.length = (Int4) dbsize;
583
/* Change the table of diagonals that will be used for the
584
search; we need a diag table that can fit the entire
586
avg_subj_length = (Uint4) (dbsize / num_db_seqs);
587
BlastExtendWordFree(aux_struct->ewp);
588
/* First argument in BlastExtendWordNew is false because RPS search is
589
never a blastn search. */
590
BlastExtendWordNew(FALSE, concat_db.length, word_options,
591
avg_subj_length, &aux_struct->ewp);
593
/* Run the search; the input query is what gets scanned
594
and the concatenated DB is the sequence associated with
595
the score matrix. This essentially means that 'query'
596
and 'subject' have opposite conventions for the search.
598
Note that while scores can be calculated for any alignment
599
found, we have not set up any Karlin parameters or effective
600
search space sizes for the concatenated DB. This means that
601
E-values cannot be calculated after hits are found. */
603
BLAST_SearchEngineCore(program_number, &concat_db, query_info,
604
query, lookup_wrap, gap_align, score_params,
605
word_params, ext_params, hit_params, db_options,
606
diagnostics, aux_struct, &hsp_list);
608
/* Fill the cutoff values in the diagnostics structure */
609
if (diagnostics->cutoffs)
610
raw_cutoffs = diagnostics->cutoffs;
612
FillReturnCutoffsInfo(raw_cutoffs, score_params, word_params, ext_params, hit_params);
614
/* Save the resulting list of HSPs. 'query' and 'subject' are
616
if (hsp_list && hsp_list->hspcnt > 0) {
617
/* Save the HSPs into a hit list */
618
BlastHSPStreamWrite(hsp_stream, &hsp_list);
621
/* Do the traceback. After this call, query and
622
subject have reverted to their traditional meanings. */
623
memset(&concat_db_info, 0, sizeof(concat_db_info)); /* fill in QueryInfo */
624
concat_db_info.num_queries = num_db_seqs;
625
concat_db_info.first_context = 0;
626
concat_db_info.last_context = num_db_seqs - 1;
627
concat_db_info.context_offsets = lookup->rps_seq_offsets;
629
BLAST_RPSTraceback(program_number, hsp_stream, &concat_db,
630
&concat_db_info, query, query_info, gap_align,
631
score_params, ext_params, hit_params, db_options,
632
rps_info->karlin_k, results);
634
/* free the internal structures used */
635
/* Do not destruct score block here */
637
gap_align->sbp->posMatrix = NULL;
638
gap_align->positionBased = FALSE;
639
gap_align->sbp = NULL;
640
BLAST_GapAlignStructFree(gap_align);
641
BlastCoreAuxStructFree(aux_struct);
646
sfree(eff_len_params);
651
BLAST_PreliminarySearchEngine(EBlastProgramType program_number,
652
BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
653
const BlastSeqSrc* seq_src, BlastGapAlignStruct* gap_align,
654
BlastScoringParameters* score_params,
655
LookupTableWrap* lookup_wrap,
656
const BlastInitialWordOptions* word_options,
657
BlastExtensionParameters* ext_params,
658
BlastHitSavingParameters* hit_params,
659
BlastEffectiveLengthsParameters* eff_len_params,
660
const PSIBlastOptions* psi_options,
661
const BlastDatabaseOptions* db_options,
662
BlastHSPStream* hsp_stream, BlastDiagnostics* diagnostics)
664
BlastCoreAuxStruct* aux_struct = NULL;
665
BlastHSPList* hsp_list = NULL;
669
BlastRawCutoffs* raw_cutoffs = NULL;
670
Boolean prelim_traceback;
671
const BlastScoringOptions* score_options = score_params->options;
672
const BlastHitSavingOptions* hit_options = hit_params->options;
673
const BlastExtensionOptions* ext_options = ext_params->options;
674
BlastInitialWordParameters* word_params = NULL;
675
Boolean gapped_calculation = score_options->gapped_calculation;
676
BlastScoreBlk* sbp = gap_align->sbp;
677
BlastSeqSrcIterator* itr;
679
BlastInitialWordParametersNew(program_number, word_options,
680
hit_params, ext_params, sbp, query_info,
681
BLASTSeqSrcGetAvgSeqLen(seq_src), &word_params);
682
/* Update the parameters for linking HSPs, if necessary. */
683
BlastLinkHSPParametersUpdate(word_params, ext_params,
684
hit_params, gapped_calculation);
687
BLAST_SetUpAuxStructures(seq_src, lookup_wrap, word_options,
688
ext_options, hit_options, query, &aux_struct)) != 0)
691
prelim_traceback = (ext_options->ePrelimGapExt == eGreedyWithTracebackExt);
693
memset((void*) &seq_arg, 0, sizeof(seq_arg));
695
/* Encoding is set so there are no sentinel bytes, and protein/nucleotide
696
sequences are retieved in ncbistdaa/ncbi2na encodings respectively. */
697
seq_arg.encoding = BLASTP_ENCODING;
699
db_length = BLASTSeqSrcGetTotLen(seq_src);
701
itr = BlastSeqSrcIteratorNew(0);
703
/* iterate over all subject sequences */
704
while ( (seq_arg.oid = BlastSeqSrcIteratorNext(seq_src, itr))
705
!= BLAST_SEQSRC_EOF) {
706
if (seq_arg.oid == BLAST_SEQSRC_ERROR)
708
if (BLASTSeqSrcGetSequence(seq_src, (void*) &seq_arg) < 0)
710
if (db_length == 0) {
711
/* This is not a database search, hence need to recalculate and save
712
the effective search spaces and length adjustments for all
713
queries based on the length of the current single subject
715
if ((status = BLAST_OneSubjectUpdateParameters(program_number,
716
seq_arg.seq->length, score_options, query_info,
717
sbp, ext_params, hit_params, word_params,
718
eff_len_params)) != 0)
722
/* Calculate cutoff scores for linking HSPs. Do this only for ungapped
724
if (hit_params->link_hsp_params && program_number != eBlastTypeBlastn &&
725
!gapped_calculation) {
726
CalculateLinkHSPCutoffs(program_number, query_info, sbp,
727
hit_params->link_hsp_params, ext_params, db_length,
728
seq_arg.seq->length);
732
BLAST_SearchEngineCore(program_number, query, query_info,
733
seq_arg.seq, lookup_wrap, gap_align, score_params, word_params,
734
ext_params, hit_params, db_options, diagnostics, aux_struct,
740
if (hsp_list && hsp_list->hspcnt > 0) {
741
if (program_number == eBlastTypeBlastn) {
742
if (prelim_traceback || !gapped_calculation) {
744
Blast_HSPListReevaluateWithAmbiguities(hsp_list, query,
745
seq_arg.seq, hit_options, query_info, sbp, score_params,
748
/* Check for HSP inclusion */
749
status = Blast_HSPListUniqSort(hsp_list);
750
/* Relink HSPs if sum statistics is used. */
751
if (hit_params->link_hsp_params) {
752
status = BLAST_LinkHsps(program_number, hsp_list, query_info,
753
seq_arg.seq->length, sbp, hit_params->link_hsp_params,
758
if (prelim_traceback || !gapped_calculation) {
759
/* Calculate and fill the bit scores, but only if final scores are
760
already available, i.e. either traceback has already been done,
761
or this is an ungapped search. */
762
Blast_HSPListGetBitScores(hsp_list,
763
gapped_calculation, sbp);
766
/* Sort HSPs with e-values as first priority and scores as
767
* tie-breakers. This will guarantee that HSP lists will be ordered
768
* by best e-value after preliminary stage. The HSPs will then have to
769
* be sorted by score at the beginning of traceback stage.
770
* Note that for RPS BLAST e-values are not available at this stage,
771
* so the HSPs will be sorted by score here.
773
Blast_HSPListSortByEvalue(hsp_list);
774
/* Save the results. */
775
BlastHSPStreamWrite(hsp_stream, &hsp_list);
777
BLASTSeqSrcRetSequence(seq_src, (void*) &seq_arg);
780
BlastSequenceBlkFree(seq_arg.seq);
781
itr = BlastSeqSrcIteratorFree(itr);
783
/* Fill the cutoff values in the diagnostics structure */
784
if (diagnostics && diagnostics->cutoffs)
785
raw_cutoffs = diagnostics->cutoffs;
787
FillReturnCutoffsInfo(raw_cutoffs, score_params, word_params, ext_params, hit_params);
789
if (hit_options->phi_align) {
790
/* Save the product of effective occurrencies of pattern in query and
791
occurrencies of pattern in database */
793
if (diagnostics && diagnostics->ungapped_stat)
794
db_hits = diagnostics->ungapped_stat->lookup_hits;
795
gap_align->sbp->effective_search_sp *= db_hits;
798
word_params = BlastInitialWordParametersFree(word_params);
799
BlastCoreAuxStructFree(aux_struct);
804
BLAST_SearchEngine(EBlastProgramType program_number,
805
BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
806
const BlastSeqSrc* seq_src, BlastScoreBlk* sbp,
807
const BlastScoringOptions* score_options,
808
LookupTableWrap* lookup_wrap,
809
const BlastInitialWordOptions* word_options,
810
const BlastExtensionOptions* ext_options,
811
const BlastHitSavingOptions* hit_options,
812
const BlastEffectiveLengthsOptions* eff_len_options,
813
const PSIBlastOptions* psi_options,
814
const BlastDatabaseOptions* db_options,
815
BlastHSPStream* hsp_stream, BlastDiagnostics* diagnostics,
816
BlastHSPResults** results)
819
BlastScoringParameters* score_params = NULL;
820
BlastExtensionParameters* ext_params = NULL;
821
BlastHitSavingParameters* hit_params = NULL;
822
BlastEffectiveLengthsParameters* eff_len_params = NULL;
823
BlastGapAlignStruct* gap_align;
825
if (program_number == eBlastTypeRpsBlast ||
826
program_number == eBlastTypeRpsTblastn) {
828
BLAST_RPSSearchEngine(program_number, query, query_info,
829
seq_src, sbp, score_options, lookup_wrap,
830
word_options, ext_options, hit_options, eff_len_options,
831
psi_options, db_options, hsp_stream, diagnostics, results);
835
BLAST_GapAlignSetUp(program_number, seq_src, score_options,
836
eff_len_options, ext_options, hit_options, query_info, sbp,
837
&score_params, &ext_params, &hit_params, &eff_len_params,
841
if ((status=BLAST_PreliminarySearchEngine(program_number, query, query_info, seq_src,
842
gap_align, score_params, lookup_wrap, word_options, ext_params,
843
hit_params, eff_len_params, psi_options, db_options, hsp_stream,
847
/* Prohibit any subsequent writing to the HSP stream. */
848
BlastHSPStreamClose(hsp_stream);
851
BLAST_ComputeTraceback(program_number, hsp_stream, query, query_info,
852
seq_src, gap_align, score_params, ext_params, hit_params,
853
eff_len_params, db_options, psi_options, results)) != 0)
856
/* Do not destruct score block here */
857
gap_align->sbp = NULL;
858
BLAST_GapAlignStructFree(gap_align);
860
score_params = BlastScoringParametersFree(score_params);
861
hit_params = BlastHitSavingParametersFree(hit_params);
862
ext_params = BlastExtensionParametersFree(ext_params);
863
eff_len_params = BlastEffectiveLengthsParametersFree(eff_len_params);