33
33
Contents: Utilities for BLAST
37
37
******************************************************************************/
40
40
* $Log: blastutl.c,v $
41
* Revision 6.466 2006/08/10 17:34:38 merezhuk
42
* Fix for reading -z advanced option by StringToInt8; RT # 15187990
44
* Revision 6.465 2006/02/15 18:23:47 madden
45
* Made changes so that CheckStartForGappedAlignment by default
46
* checks ungapped alignments of length 11, rather than length 10.
47
* Made changes to the rules used when the starting point is close to
48
* the edge of the preliminary gapped alignment.
51
* Revision 6.464 2005/12/01 15:10:23 madden
52
* Gave BLASTCheckHSPInclusion external linkage (i.e. removed the static specifier).
54
* Revision 6.463 2005/10/13 15:59:06 camacho
55
* Add code to fix cutoff scores in PSI-BLAST.
57
* Revision 6.462 2005/07/28 14:57:09 coulouri
60
* Revision 6.461 2005/07/27 15:51:54 coulouri
61
* remove unused queue_callback
63
* Revision 6.460 2005/05/02 16:03:14 coulouri
64
* refactor code to set db_chunk_size
66
* Revision 6.459 2005/04/25 14:16:36 coulouri
67
* set db_chunk_size adaptively
69
* Revision 6.458 2005/04/04 20:44:27 camacho
70
* Do not overwrite the effective search space in Pssm2Sequences if specified in the options structure
72
* Revision 6.457 2005/02/07 15:30:08 dondosha
73
* Removed restriction on the value of longest intron option
75
* Revision 6.456 2005/01/24 20:37:37 camacho
76
* Added conditional compilation to structs need for BLAST_CLUSTER_HITS
78
* Revision 6.455 2005/01/18 14:54:13 camacho
79
* Change in tie-breakers for score comparison, suggestion by Mike Gertz
81
* Revision 6.454 2004/12/20 15:22:16 camacho
82
* Calculate kbp_ideal values rather than loading them from pre-computed values
84
* Revision 6.453 2004/12/01 17:24:15 coulouri
85
* do not dereference null pointer
87
* Revision 6.452 2004/11/22 16:10:11 dondosha
88
* Minor fix to make sure that "evalue" score type is always used when hsp is not part of a linked set
90
* Revision 6.451 2004/11/04 15:51:55 bealer
91
* - bl2seq should use dblen as average length if database is not available.
93
* Revision 6.450 2004/11/01 14:07:56 madden
96
* - In query_offset_compare_hsp and query_end_compare_hsp, use the
97
* subject query/offset as a tie-breaker. Without this tie-breaker
98
* CheckGappedAlignmentsForOverlap won't work properly.
100
* - In CheckGappedAlignmentsForOverlap check that hsp_array, rather
101
* than *hsp_array, is not nil.
103
* - In BlastSaveCurrentHsp, rewrote the binary search to use
104
* score_compare_hsps, so that the answers are consistent with the
105
* heap code used in the algo/blast/core code.
107
* - In BlastGappedScoreInternal delete gapped extensions that don't
108
* reach the cutoff score (cutoff_s1).
110
* Revision 6.449 2004/10/25 18:36:17 papadopo
111
* From Michael Gertz: remove unneeded decrement of alignment offsets in BlastNtSaveCurrentHsp
113
* Revision 6.448 2004/10/19 19:42:17 dondosha
114
* Optimized algorithm in BlastPruneSeqAlignByGiList to make it up to 25 times faster; Added new function BlastPruneSeqAlignBySortedGiList
41
116
* Revision 6.447 2004/10/18 13:02:41 madden
42
117
* Changes from Mike Gertz:
43
118
* - In score_compare_hsps, query_offset_compare_hsp and
2940
3014
BLAST_ScorePtr *retval = NULL;
2941
3015
posSearchItems *posSearch = NULL;
2942
Int4 qlen, alphabet_sz, rv, array_sz;
3016
Int4 qlen, alphabet_sz, rv;
2943
3017
Nlm_FloatHi scalingFactor = search->pbp->scalingFactor;
2944
3018
BLAST_ScoreBlkPtr sbp = NULL;
2945
3019
ValNodePtr error_return;
2946
Int4Ptr open, extend; /* gap open and extension costs */
2947
Nlm_FloatHiPtr lambda, K, H; /* Karlin_Altschul score paramters */
2948
3020
Int4 i, gap_open, gap_extend;
2950
3022
if (!search || !compactSearch || !posFreqs)
2998
array_sz = BlastKarlinGetMatrixValues(sbp->name, &open, &extend, &lambda,
3001
for (i = 0; i < array_sz; i++) {
3002
if (open[i] == INT2_MAX && extend[i] == INT2_MAX) {
3003
sbp->kbp_ideal = BlastKarlinBlkCreate();
3004
sbp->kbp_ideal->Lambda = lambda[i];
3005
sbp->kbp_ideal->K = K[i];
3006
sbp->kbp_ideal->H = H[i];
3009
MemFree(open); MemFree(extend);
3010
MemFree(lambda); MemFree(K); MemFree(H);
3012
3070
if (sbp->kbp_ideal == NULL)
3013
3071
sbp->kbp_ideal = BlastKarlinBlkStandardCalcEx(sbp);
3014
3072
compactSearch->lambda = sbp->kbp_gap_std[0]->Lambda;
3407
3465
options->db_length : SeqLocLen(pssm_slp);
3408
3466
Int4 nseqs = (options->dbseq_num != 0) ? options->dbseq_num : 1;
3410
search->searchsp_eff = BLASTCalculateSearchSpace(options, nseqs,
3411
dblen, SeqLocLen(target_seqs[i]));
3468
/* If search space has been specified in the options structure, the it
3469
* must have been set in BLASTSetUpSearchEx, so don't overwrite it */
3470
if ( ! (options->searchsp_eff > 0) ) {
3471
search->searchsp_eff = BLASTCalculateSearchSpace(options, nseqs,
3472
dblen, SeqLocLen(target_seqs[i]));
3412
3474
sa_array[i] = B2SPssmOnTheFlyByLoc(search, target_seqs[i]);
3621
3683
seqalign = BlastTwoSequencesCore(search, subject_slp, subject_seq, subject_length, reverse);
3624
/* This is turned off for right now */
3625
if (options->tweak_parameters || options->smith_waterman) {
3626
Boolean free_subject_info = FALSE;
3628
if (search->subject_info == NULL) {
3629
search->subject_info = BLASTSubjectInfoNew(
3630
SeqIdSetDup(SeqLocId(subject_slp)), NULL, subject_length);
3631
free_subject_info = TRUE;
3634
/* Use composition-based statistics */
3635
seqalign = RedoAlignmentCore(search, options,
3636
search->result_struct->hitlist_count,
3637
options->tweak_parameters, options->smith_waterman);
3638
for (index = 0; index<search->result_struct->hitlist_count; index++)
3639
BLASTResultFreeHsp(search->result_struct->results[index]);
3640
for (index = 0; index<search->result_struct->hitlist_count; index++)
3641
SeqAlignSetFree(search->result_struct->results[index]->seqalign);
3643
if (free_subject_info)
3644
search->subject_info =
3645
BLASTSubjectInfoDestruct(search->subject_info);
3649
3685
if (complement)
3651
3687
seqalign = SeqAlignListReverseStrand(seqalign);
4450
4489
else /* Should never happen */
4454
4494
#define CLUSTER_LENGTH_THRESH 0.1
4455
4495
#define CLUSTER_OVERLAP_THRESH 0.9
4456
4496
#define CLUSTER_SCORE_THRESH 1.6
4498
Nlm_FloatHi s_ComputeAverageLength(const BlastSearchBlk* search)
4500
Nlm_FloatHi retval = 0.0;
4502
if (StringCmp(search->prog_name, "blastn") != 0) {
4503
retval = BLAST_AA_AVGLEN;
4505
retval = BLAST_NT_AVGLEN;
4509
Int4 total_number = 0;
4510
Int8 total_length = 0;
4512
readdb_get_totals(search->rdfp, &total_length, &total_number);
4513
if (total_number > 0)
4514
retval = total_length/total_number;
4515
} else if (search->dblen > 0 && search->dbseq_num == 1) {
4516
retval = search->dblen;
4458
4522
SeqAlignPtr LIBCALL
4459
4523
BioseqBlastEngineCore(BlastSearchBlkPtr search, BLAST_OptionsBlkPtr options,
4460
4524
Int4Ptr *pos_matrix)
4462
BLASTResultHspPtr hsp, hsp1;
4463
BlastResultHspWithIdPtr PNTR hspp;
4464
BLASTResultsStructPtr result_struct;
4465
BLASTResultHitlistPtr result_hitlist;
4467
GapXEditBlockPtr edit_block;
4468
Int4 hitlist_count, hitlist_max, hspcnt, index, index1, index2;
4469
Int4 length, sequence_length;
4470
Int4 q_overlap, total_num_hsp=0;
4471
SeqAlignPtr sap, head, seqalign, seqalign_var;
4472
SeqIdPtr gi_list=NULL, subject_id;
4475
BioseqPtr bsp1, bsp2, PNTR bspp;
4476
BlastSearchBlkPtr search1;
4527
SeqAlignPtr head, seqalign;
4528
#ifdef BLAST_CLUSTER_HITS
4529
BLASTResultHspPtr hsp, hsp1;
4530
BlastResultHspWithIdPtr PNTR hspp;
4531
BLASTResultsStructPtr result_struct;
4532
BLASTResultHitlistPtr result_hitlist;
4533
Int4 hspcnt, index, index1, index2;
4535
BioseqPtr bsp1, bsp2, PNTR bspp;
4536
BlastSearchBlkPtr search1;
4477
4537
BLAST_KarlinBlkPtr kbp;
4478
4538
FloatHi bit_score;
4481
4541
head = seqalign = NULL;
4483
4543
if (search == NULL || search->query_invalid)
4487
4546
/* If pos_matrix is not NULL, then psi-blast iterations are being
4488
4547
performed. The first psi-blast iteration should be with normal
4516
4576
BlastNewFindWords(search, search->required_start, search->required_end, search->pbp->threshold_second, (Uint1) 0);
4517
4577
lookup_position_aux_destruct(search->wfp->lookup);
4518
4578
search->wfp_second = search->wfp_first;
4581
/* recalculate the cutoff scores with the newly calculated
4582
Karlin-Altschul parameters. */
4583
blast_set_parameters(search,
4584
options->dropoff_1st_pass,
4585
options->dropoff_2nd_pass,
4586
s_ComputeAverageLength(search),
4587
search->searchsp_eff,
4588
options->window_size);
4589
/* ... and make sure that the appropriate cutoff is used in the
4591
search->pbp->cutoff_s2_set = TRUE;
4521
4595
/* Starting awake thread if multithreaded. */
7836
7909
score_compare_hsps(VoidPtr v1, VoidPtr v2)
7839
BLAST_HSPPtr h1, h2;
7840
BLAST_HSPPtr PNTR hp1, PNTR hp2;
7842
hp1 = (BLAST_HSPPtr PNTR) v1;
7843
hp2 = (BLAST_HSPPtr PNTR) v2;
7848
return (h2 == NULL) ? 0 : 1;
7849
} else if (h2 == NULL) {
7912
BLAST_HSPPtr hsp1, hsp2; /* the HSPs to be compared */
7913
int result = 0; /* the result of the comparison */
7915
hsp1 = *((BLAST_HSPPtr PNTR) v1);
7916
hsp2 = *((BLAST_HSPPtr PNTR) v2);
7918
/* Null HSPs are "greater" than any non-null ones, so they go to the end
7919
of a sorted list. */
7927
if (0 == (result = BLAST_CMP(hsp2->score, hsp1->score)) &&
7928
0 == (result = BLAST_CMP(hsp1->subject.offset, hsp2->subject.offset)) &&
7929
0 == (result = BLAST_CMP(hsp2->subject.end, hsp1->subject.end)) &&
7930
0 == (result = BLAST_CMP(hsp1->query .offset, hsp2->query .offset))) {
7931
/* if all other test can't distinguish the HSPs, then the final
7932
test is the result */
7933
result = BLAST_CMP(hsp2->query.end, hsp1->query.end);
7852
/* Neither HSP is nil */
7853
else if (h1->score > h2->score)
7855
else if (h1->score < h2->score)
7857
/* Tie-breakers: decreasing subject offsets; decreasing subject
7858
ends, decreasing query offsets, decreasing query ends */
7859
else if (h1->subject.offset > h2->subject.offset)
7861
else if (h1->subject.offset < h2->subject.offset)
7863
else if (h1->subject.end > h2->subject.end)
7865
else if (h1->subject.end < h2->subject.end)
7867
else if (h1->query.offset > h2->query.offset)
7869
else if (h1->query.offset < h2->query.offset)
7871
else if (h1->query.end > h2->query.end)
7873
else if (h1->query.end < h2->query.end)
7938
Function to check that the highest scoring region in an HSP still gives a positive
7939
score. This value was originally calcualted by GetStartForGappedAlignment but it
7940
may have changed due to the introduction of ambiguity characters. Such a change
7941
can lead to 'strange' results from ALIGN.
7997
Check whether the starting point for gapped alignment lies in
7998
region that has positive score. This routine is called after a
7999
preliminary gapped alignment has been computed, but before the
8000
traceback is computed. The score of the region containing the
8001
starting point may have changed due to the introduction of
8002
ambiguity characters, further filtering of the sequences or the
8003
application of composition based statistics.
8005
Usually, we check an ungapped alignment of length 11 about the
8006
starting point: 5 characters to the left and 5 to the right.
8007
However, the actual region checked is occassionally shorter because
8008
we don't check characters before the start, or after the end, of
8009
the preliminarily aligned regions in the query or subject.
7944
CheckStartForGappedAlignment (BlastSearchBlkPtr search, BLAST_HSPPtr hsp, Uint1Ptr query, Uint1Ptr subject, Int4Ptr PNTR matrix)
8012
CheckStartForGappedAlignment (BlastSearchBlkPtr search, BLAST_HSPPtr hsp,
8013
Uint1Ptr query, Uint1Ptr subject,
8014
Int4Ptr PNTR matrix)
7946
Int4 index1, score, start, end, width;
7947
Uint1Ptr query_var, subject_var;
7948
Boolean positionBased =
7949
(search->positionBased && search->sbp->posMatrix);
7951
width = MIN((hsp->query.gapped_start-hsp->query.offset), HSP_MAX_WINDOW/2);
7952
start = hsp->query.gapped_start - width;
7953
end = MIN(hsp->query.end, hsp->query.gapped_start + width);
7954
/* Assures that the start of subject is above zero. */
7955
if ((hsp->subject.gapped_start + start - hsp->query.gapped_start) < 0)
7956
start -= hsp->subject.gapped_start + (start - hsp->query.gapped_start);
7957
query_var = query + start;
7958
subject_var = subject + hsp->subject.gapped_start + (start - hsp->query.gapped_start);
7960
for (index1=start; index1<end; index1++)
7963
score += matrix[*query_var][*subject_var];
7965
score += search->sbp->posMatrix[index1][*subject_var];
7966
query_var++; subject_var++;
8016
Int4 left, right; /* Number of aligned characters to the
8017
left and right of the starting point */
8018
Int4 score; /* Score of the word alignment */
8019
Uint1Ptr subject_var; /* Current character in the subject sequence */
8020
Uint1Ptr subject_right; /* last character to be considered in the subject
8022
Boolean positionBased =
8023
(search->positionBased && search->sbp->posMatrix);
8025
/* Compute the number of characters to the left of the start
8026
to include in the word */
8027
left = -HSP_MAX_WINDOW/2;
8028
if (left < hsp->query.offset - hsp->query.gapped_start) {
8029
left = hsp->query.offset - hsp->query.gapped_start;
8031
if (left < hsp->subject.offset - hsp->subject.gapped_start) {
8032
left = hsp->subject.offset - hsp->subject.gapped_start;
8035
/* Compute the number of characters to right to include in the word,
8036
including the starting point itself. */
8037
right = HSP_MAX_WINDOW/2 + 1;
8038
if (right > hsp->query.end - hsp->query.gapped_start) {
8039
right = hsp->query.end - hsp->query.gapped_start;
8041
if (right > hsp->subject.end - hsp->subject.gapped_start) {
8042
right = hsp->subject.end - hsp->subject.gapped_start;
8045
/* Calculate the score of the word */
8047
subject_var = subject + hsp->subject.gapped_start + left;
8048
subject_right = subject + hsp->subject.gapped_start + right;
8049
if ( !positionBased ) {
8050
Uint1Ptr query_var; /* Current character in the query */
8051
query_var = query + hsp->query.gapped_start + left;
8052
for ( ; subject_var < subject_right; subject_var++, query_var++) {
8053
score += matrix[*query_var][*subject_var];
8056
Int4 query_index; /* Current position in the query */
8057
query_index = hsp->query.gapped_start + left;
8058
for ( ; subject_var < subject_right; subject_var++, query_index++) {
8059
score += search->sbp->posMatrix[query_index][*subject_var];
7974
8071
Gets the ratio used to change an evalue calculated with the subject
7975
8072
sequence length to one with a db length.
8516
8623
hsp->query.length = hsp->query.end - hsp->query.offset;
8517
8624
hsp->subject.length = hsp->subject.end - hsp->subject.offset;
8518
8625
hsp->score = gap_align->score;
8520
/* AM: Changed to support query concatenation */
8521
if( !search->mult_queries )
8522
hsp->evalue = BlastKarlinStoE_simple(hsp->score, search->sbp->kbp_gap[search->first_context], search->searchsp_eff);
8525
query_num = GetQueryNum( search->mult_queries,
8529
hsp->evalue = BlastKarlinStoE_simple(hsp->score,
8530
search->sbp->kbp_gap[search->first_context],
8531
search->mult_queries->SearchSpEff[query_num]);
8626
if( hsp->score >= search->pbp->cutoff_s1 ) {
8627
/* AM: Changed to support query concatenation */
8628
if( !search->mult_queries )
8630
BlastKarlinStoE_simple(hsp->score,
8632
kbp_gap[search->first_context],
8633
search->searchsp_eff);
8635
query_num = GetQueryNum( search->mult_queries,
8640
BlastKarlinStoE_simple(hsp->score,
8642
kbp_gap[search->first_context],
8643
search->mult_queries->
8644
SearchSpEff[query_num]);
8535
/* Fill in the helper structure. */
8536
helper[index].qoffset = hsp->query.offset;
8537
helper[index].qend = hsp->query.end;
8539
helper[index].soffset = hsp->subject.offset;
8540
helper[index].send = hsp->subject.end;
8541
helper[index1].qframe = hsp->query.frame;
8542
helper[index1].sframe = hsp->subject.frame;
8648
/* Fill in the helper structure. */
8649
helper[index].qoffset = hsp->query.offset;
8650
helper[index].qend = hsp->query.end;
8652
/* Score of the gapped extension is below the required
8653
cutoff, delete this hsp */
8654
hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
8546
8658
{ /* Contained within another HSP, delete. */
8547
8659
hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
10333
/* TODO: function is not used in this file */
10335
Compares two HSP's looking for overlap in both the
10336
query and the subject sequence.
10338
if hsp1 should be deleted, then -1 is returned,
10339
if hsp2 should be deleted, then +1 is returned,
10340
if neither should be deleted, then 0 is returned.
10344
CheckHspOverlap (BLAST_HSPPtr PNTR hsp_array, BLAST_HSPPtr hsp2, Int4 hspcnt, Boolean PNTR hsp_deleted)
10348
BLAST_SegPtr query, subject;
10351
if (hsp_array == NULL || hsp2 == NULL)
10355
*hsp_deleted = FALSE;
10356
query = &(hsp2->query);
10357
subject = &(hsp2->subject);
10359
for (index=0; index<hspcnt; index++)
10361
hsp1 = hsp_array[index];
10365
if (SIGN(hsp1->query.frame) != SIGN(query->frame))
10368
if (SIGN(hsp1->subject.frame) != SIGN(subject->frame))
10371
if (hsp1->query.offset > query->offset &&
10372
hsp1->query.end > query->end)
10375
if (hsp1->query.offset < query->offset &&
10376
hsp1->query.end < query->end)
10379
if (hsp1->subject.offset > subject->offset &&
10380
hsp1->subject.end > subject->end)
10383
if (hsp1->subject.offset < subject->offset &&
10384
hsp1->subject.end < subject->end)
10387
if (hsp1->score > hsp2->score)
10389
if (*hsp_deleted == FALSE)
10396
hsp_array[index] = BLAST_HSPFree(hsp_array[index]);
10397
*hsp_deleted = TRUE;
10405
10431
static void OOF_TranslateHspToDNAP(BLAST_HSPPtr hspp, Int4 length)
10407
10433
Int4 from, to, frame;
10514
10539
/* Use a binary search to insert the HSP. */
10518
highscore = hsp_array[0]->score;
10519
lowscore = hsp_array[hspcnt-1]->score;
10527
if (score >= highscore)
10531
else if (score <= lowscore)
10533
new_index = hspcnt;
10538
high_index = hspcnt-1;
10539
new_index = (low_index+high_index)/2;
10540
old_index = new_index;
10542
for (index=0; index<BLAST_SAVE_ITER_MAX; index++)
10544
if (score > hsp_array[new_index]->score)
10546
high_index = new_index;
10550
low_index = new_index;
10552
new_index = (low_index+high_index)/2;
10553
if (new_index == old_index)
10554
{ /* Perform this check as new_index get rounded DOWN a
10556
if (score < hsp_array[new_index]->score)
10562
old_index = new_index;
10541
high_index = hspcnt;
10542
while(low_index < high_index) {
10543
Int4 next_index = (low_index + high_index)/2;
10544
if( score_compare_hsps(&new_hsp, &hsp_array[next_index]) > 0 ) {
10545
low_index = next_index + 1;
10547
high_index = next_index;
10566
10551
if (hspcnt >= hspmax)
10568
if (new_index >= hspcnt)
10569
{ /* this HSP is less significant than others on a full list.*/
10570
new_hsp = BLAST_HSPFree(new_hsp);
10574
{ /* Delete the last HPS on the list. */
10575
hspcnt = --current_hitlist->hspcnt;
10576
hsp_array[hspcnt] = BLAST_HSPFree(hsp_array[hspcnt]);
10553
if (low_index >= hspcnt) {
10554
/* this HSP is less significant than others on a full list.*/
10555
new_hsp = BLAST_HSPFree(new_hsp);
10558
/* Delete the last HPS on the list. */
10559
hspcnt = --current_hitlist->hspcnt;
10560
hsp_array[hspcnt] = BLAST_HSPFree(hsp_array[hspcnt]);
10579
current_hitlist->hspcnt++;
10580
Nlm_MemMove((hsp_array+new_index+1), (hsp_array+new_index), (hspcnt-new_index)*sizeof(hsp_array[0]));
10581
hsp_array[new_index] = new_hsp;
10563
/* Move existing elements out of the way */
10564
Nlm_MemMove(&hsp_array[low_index] + 1, &hsp_array[low_index],
10565
(hspcnt-low_index)*sizeof(hsp_array[0]));
10566
hspcnt = ++current_hitlist->hspcnt;
10568
/* Insert the new HSP */
10569
hsp_array[low_index] = new_hsp;
11014
Remove hits from a SeqAlignPtr that are not from a gi list
11000
/* Remove hits from a SeqAlignPtr that are not from a gi list. The function
11001
* is optimized with an assumption that the incoming gi list is not sorted.
11002
* Since sorting of the gi list may be expensive, the hit gis are found
11003
* and sorted. Then for each gi in the (presumably large) incoming gi list,
11004
* a binary search is performed to check if it is present in the list of hit
11005
* gis. This procedure is linear in the gi list size.
11018
BlastPruneSeqAlignByGiList(SeqAlignPtr seqalign, Int4Ptr gi_list, Int4 gi_list_total)
11008
BlastPruneSeqAlignByGiList(SeqAlignPtr seqalign, Int4Ptr gi_list,
11009
Int4 gi_list_total, Int4 hitlist_size)
11020
11011
SeqAlignPtr head = NULL, last_sap = NULL, next_sap, sap;
11021
11012
SeqIdPtr sip;
11022
11013
BioseqPtr bsp;
11014
Int4 gi = 0, index;
11016
Int4 num_hit_gis, gi_index;
11018
Boolean good_gi = FALSE;
11025
11020
if (!gi_list || gi_list_total <= 0)
11028
HeapSort((VoidPtr)gi_list, gi_list_total, sizeof(Int4), Int4Compare);
11023
/* If the size of the gi list is small, sort it and use a different
11024
routine, which takes a sorted list argument. */
11026
if (hitlist_size >= gi_list_total) {
11027
qsort((void*)gi_list, gi_list_total, sizeof(Int4), Int4Compare);
11028
return BlastPruneSeqAlignBySortedGiList(seqalign, gi_list,
11032
hit_gis = (Int4*) MemNew(hitlist_size*sizeof(Int4));
11036
/* Find all subject gis in the Seq-align chain */
11030
11037
for (sap = seqalign; sap; sap = sap->next) {
11031
11038
sip = SeqAlignId(sap, 1);
11032
11039
if (sip->choice != SEQID_GI) {
11039
11046
if (sip->choice == SEQID_GI) {
11040
gi = sip->data.intvalue;
11041
index = BinarySearchInt4(gi, gi_list, gi_list_total);
11042
if (gi_list[index] == gi) {
11043
next_sap = Malloc(sizeof(SeqAlign));
11044
MemCpy(next_sap, sap, sizeof(SeqAlign));
11046
head = last_sap = next_sap;
11048
last_sap->next = next_sap;
11049
last_sap = last_sap->next;
11055
last_sap->next = NULL;
11047
/* Save this gi if the previous value of gi
11048
is different from the current value. */
11049
if (gi != sip->data.intvalue) {
11050
gi = sip->data.intvalue;
11051
hit_gis[index] = gi;
11056
num_hit_gis = index;
11057
qsort((void*)hit_gis, num_hit_gis, sizeof(Int4),
11059
good_gis = (Boolean*) MemNew(num_hit_gis*sizeof(Boolean));
11061
for (index = 0; index < gi_list_total; ++index) {
11062
gi_index = BinarySearchInt4(gi_list[index], hit_gis, num_hit_gis);
11063
if (hit_gis[gi_index] == gi_list[index])
11064
good_gis[gi_index] = TRUE;
11067
for (sap = seqalign; sap; sap = next_sap) {
11068
next_sap = sap->next;
11069
sip = SeqAlignId(sap, 1);
11070
if (sip->choice != SEQID_GI) {
11071
bsp = BioseqLockById(sip);
11073
sip = SeqIdFindBest(bsp->id, SEQID_GI);
11077
if (sip->choice == SEQID_GI) {
11078
/* Do the following check only if the previous value of gi
11079
is different from the current value. */
11080
if (gi != sip->data.intvalue) {
11081
gi = sip->data.intvalue;
11082
index = BinarySearchInt4(gi, hit_gis, num_hit_gis);
11083
good_gi = good_gis[index];
11089
/* Advance the pointer to the last link in the pruned chain to
11090
the current Seq-align. */
11092
head = last_sap = sap;
11097
/* Link last Seq-align in the pruned chain to the next Seq-align
11098
in the original chain. */
11100
last_sap->next = sap->next;
11102
/* Free this Seq-align, since it's no longer needed. */
11103
sap = SeqAlignFree(sap);
11111
/* Remove hits from a SeqAlignPtr that are not from a sorted gi list.
11112
* No check is made that incoming gi list is sorted. User must make
11113
* sure that it is that way. The pruning is done by a single pass over the
11114
* list of Seq-aligns, in which a binary search is performed for any new
11115
* subject gi to check if it is present in the gi list.
11118
BlastPruneSeqAlignBySortedGiList(SeqAlignPtr seqalign, Int4Ptr gi_list,
11119
Int4 gi_list_total)
11121
SeqAlignPtr head = NULL, last_sap = NULL, next_sap, sap;
11125
Boolean good_gi = FALSE;
11127
if (!gi_list || gi_list_total <= 0)
11130
/* Find all subject gis in the Seq-align chain */
11131
for (sap = seqalign; sap; sap = next_sap) {
11132
next_sap = sap->next;
11133
sip = SeqAlignId(sap, 1);
11134
if (sip->choice != SEQID_GI) {
11135
bsp = BioseqLockById(sip);
11137
sip = SeqIdFindBest(bsp->id, SEQID_GI);
11141
if (sip->choice == SEQID_GI) {
11142
/* Do the following check only if the previous value of gi is
11143
different from the current value. Otherwise the "good_gi"
11144
variable is left with its previous value. */
11145
if (gi != sip->data.intvalue) {
11147
gi = sip->data.intvalue;
11148
index = BinarySearchInt4(gi, gi_list, gi_list_total);
11149
good_gi = (gi_list[index] == gi);
11156
/* Advance the pointer to the last link in the pruned chain to
11157
the current Seq-align. */
11159
head = last_sap = sap;
11164
/* Link last Seq-align in the pruned chain to the next Seq-align
11165
in the original chain. */
11167
last_sap->next = sap->next;
11169
/* Free this Seq-align, since it's no longer needed. */
11170
sap = SeqAlignFree(sap);
11065
11183
BlastPruneSeqAlignByEvalueRange(SeqAlignPtr seqalign, FloatHi expect_low,
11066
11184
FloatHi expect_high)
11068
SeqAlignPtr head = NULL, last_sap = NULL, next_sap, sap;
11186
SeqAlignPtr head = NULL, last_sap = NULL, sap, next_sap;
11069
11187
Int4 score, number;
11070
11188
FloatHi evalue, bit_score;
11071
11189
SeqIdPtr sip = NULL;
11073
for (sap = seqalign; sap; sap = sap->next) {
11191
for (sap = seqalign; sap; sap = next_sap) {
11192
next_sap = sap->next;
11074
11193
GetScoreAndEvalue(sap, &score, &bit_score, &evalue, &number);
11075
11194
if (evalue >= expect_low && evalue <= expect_high) {
11076
next_sap = Malloc(sizeof(SeqAlign));
11077
MemCpy(next_sap, sap, sizeof(SeqAlign));
11195
/* Leave this Seq-align */
11197
head = last_sap = sap;
11079
11202
if (sip && SeqIdComp(TxGetSubjectIdFromSeqAlign(sap), sip)
11080
11203
== SIC_YES) {
12161
12286
return score_set;
12289
/** Configure the database chunk size adaptively.
12290
* Note: Must be called from a single-threaded context
12291
* @param search the The search block to configure [inout]
12292
* @param num_seq The number of sequences in the database [in]
12294
void ConfigureDbChunkSize(BlastSearchBlkPtr search, Int4 num_seq)
12296
/* Emit a tick after how many sequences? */
12297
search->thr_info->db_incr = num_seq / BLAST_NTICKS;
12299
/* Divide the search into chunks */
12300
search->thr_info->db_chunk_size = MAX(num_seq / 100,1);
12302
/* Loadbalance more finely for multithreaded searches. */
12303
if (NlmThreadsAvailable() && search->pbp->process_num > 1)
12304
search->thr_info->db_chunk_size = MAX(num_seq/(100*(search->pbp->process_num