32
32
#include <seqport.h>
33
33
#include <sequtil.h>
34
34
#include <objloc.h>
35
36
#include <algo/blast/api/blast_seq.h>
36
37
#include <algo/blast/core/blast_filter.h>
37
38
#include <algo/blast/core/blast_util.h>
38
39
#include <algo/blast/core/blast_encoding.h>
39
40
#include <algo/blast/core/blast_setup.h> /* For BlastSeqLoc_RestrictToInterval */
41
#include <algo/blast/core/blast_inline.h>
41
43
/** @addtogroup CToolkitAlgoBlast
49
/** Structure used for hash-based comparison of sequence IDs */
50
typedef struct SeqIdHash {
51
SeqId *id; /**< The ID of this entry */
52
Int4 query_index; /**< index of query with this ID */
53
Int4 next_id; /**< Offset of the next hash entry in a chain */
57
BlastSeqlocsHaveDuplicateIDs(SeqLoc* query_seqlocs)
59
Boolean retval = FALSE;
60
const Int4 kNumSeqs = ValNodeLen(query_seqlocs);
61
const Int4 kLog2HashSize = 11;
62
SeqIdHash *id_entries;
70
/* allocate hashtable */
71
hashtable = (Uint4 *)calloc(1 << kLog2HashSize, sizeof(Uint4));
72
id_entries = (SeqIdHash *)malloc((kNumSeqs + 1) * sizeof(SeqIdHash));
74
for (slp = query_seqlocs, curr_id_num = 1; slp; slp = slp->next) {
77
SeqIdPtr id = SeqLocId(slp);
80
/* hash the ID of the next query sequence */
81
SeqIdLabel(id, buffer, sizeof(buffer), OM_LABEL_CONTENT);
82
hashval = readdb_sequence_hash(buffer, strlen(buffer));
83
hashval = hashval >> (32 - kLog2HashSize);
84
if (hashtable[hashval] != 0) {
85
Int4 offset = hashtable[hashval];
86
/* check the chain of ID's in the resulting hash
87
entry for a duplicate */
89
SeqIdHash *curr_entry = id_entries + offset;
90
if (SeqIdMatch(id, curr_entry->id)) {
94
offset = curr_entry->next_id;
98
/* no duplicate found; add the ID to the hashtable */
99
id_entries[curr_id_num].id = id;
100
id_entries[curr_id_num].next_id = hashtable[hashval];
101
hashtable[hashval] = curr_id_num++;
46
110
/** Converts a SeqLocPtr to a BlastSeqLoc, used for formatting.
47
111
* @param slp SeqLocPtr to be converted [in]
48
112
* @param head_loc BlastSeqLoc returned from last call [in]
78
142
BlastMaskLoc* retval = NULL;
79
143
Int4 query_index = 0;
80
144
const unsigned int kNumContexts = BLAST_GetNumberOfContexts(program_number);
81
SeqLocPtr current_query_loc = NULL;
145
const Int4 kLog2HashSize = 11;
146
SeqIdHash *id_entries;
148
Int4 curr_id_num = 1;
149
SeqLocPtr query_slp, mask_slp;
83
151
if (!mask_seqlocs)
86
154
retval = BlastMaskLocNew(kNumSeqs*kNumContexts);
88
for (current_query_loc = query_seqlocs, query_index = 0;
90
current_query_loc = current_query_loc->next, query_index++) {
156
/* create hashtable for query IDs */
157
hashtable = (Uint4 *)calloc(1 << kLog2HashSize, sizeof(Uint4));
158
id_entries = (SeqIdHash *)malloc((kNumSeqs + 1) * sizeof(SeqIdHash));
160
/* add the ID of each query sequence to the hashtable */
161
for (query_slp = query_seqlocs; query_slp; query_slp = query_slp->next) {
163
SeqIdPtr seq_id = SeqLocId(query_slp);
166
SeqIdLabel(seq_id, buffer, sizeof(buffer), OM_LABEL_CONTENT);
167
hashval = readdb_sequence_hash(buffer, strlen(buffer));
168
hashval = hashval >> (32 - kLog2HashSize);
170
id_entries[curr_id_num].id = seq_id;
171
id_entries[curr_id_num].query_index = query_index++;
172
id_entries[curr_id_num].next_id = hashtable[hashval];
173
hashtable[hashval] = curr_id_num++;
176
/* for each mask location, find the query sequence containing
177
that mask and add to the list of filter locations for
178
that query. Note that this assumes IDs for all query
179
sequences are unique */
181
for (mask_slp = mask_seqlocs; mask_slp; mask_slp = mask_slp->next) {
182
SeqLocPtr current_mask = (SeqLocPtr) mask_slp->data.ptrvalue;
187
if (current_mask == NULL)
190
mask_id = SeqLocId(current_mask);
191
SeqIdLabel(mask_id, buffer, sizeof(buffer), OM_LABEL_CONTENT);
192
hashval = readdb_sequence_hash(buffer, strlen(buffer));
193
hashval = hashval >> (32 - kLog2HashSize);
195
/* examine only the query IDs that hash to the same value */
196
if (hashtable[hashval] != 0) {
197
Int4 offset = hashtable[hashval];
198
while (offset != 0) {
200
SeqIdHash *q_entry = id_entries + offset;
202
if (SeqIdMatch(mask_id, q_entry->id)) {
203
Int4 context_idx = kNumContexts * q_entry->query_index;
204
retval->seqloc_array[context_idx] =
205
s_BlastSeqLocFromSeqLoc(current_mask,
206
retval->seqloc_array[context_idx]);
209
offset = q_entry->next_id;
217
/* iterate through the query sequences and compute
218
the complement of the filtering locations for each */
220
for (query_slp = query_seqlocs, query_index = 0;
222
query_slp = query_slp->next, query_index++) {
92
224
const int kCtxIndex = kNumContexts * query_index; /* context index */
93
SeqLocPtr mask_slp = NULL;
95
for (mask_slp = mask_seqlocs; mask_slp; mask_slp = mask_slp->next)
97
SeqLocPtr current_mask = (SeqLocPtr) mask_slp->data.ptrvalue;
98
/* If mask is empty, advance to the next link in the mask chain.
99
If mask Seq-id does not match sequence Seq-id, stay with this mask
100
for the next link in the sequence Seq-loc chain. */
102
SeqIdMatch(SeqLocId(current_mask), SeqLocId(current_query_loc)))
104
retval->seqloc_array[kCtxIndex] =
105
s_BlastSeqLocFromSeqLoc(current_mask,
106
retval->seqloc_array[kCtxIndex]);
110
226
if (retval->seqloc_array[kCtxIndex])
113
229
!Blast_QueryIsTranslated(program_number) &&
114
230
!Blast_ProgramIsPhiBlast(program_number);
115
231
BlastSeqLoc_RestrictToInterval(&retval->seqloc_array[kCtxIndex],
116
SeqLocStart(current_query_loc),
117
SeqLocStop(current_query_loc));
232
SeqLocStart(query_slp),
233
SeqLocStop(query_slp));
119
235
/* N.B.: Unlike in the C++ APIs, this logic is only applied to
120
236
* non-translated nucleotide queries. See comment for
121
237
* BlastMaskLocDNAToProtein */
122
Uint1 strand = SeqLocStrand(current_query_loc);
238
Uint1 strand = SeqLocStrand(query_slp);
123
239
if (strand == Seq_strand_minus) {
124
240
retval->seqloc_array[kCtxIndex+1] =
125
241
retval->seqloc_array[kCtxIndex];
156
272
const Boolean k_translate = Blast_QueryIsTranslated(program_number);
157
273
const Uint1 k_num_frames = BLAST_GetNumberOfContexts(program_number);
274
const Boolean kIsNucl = (program_number == eBlastTypeBlastn);
276
Boolean all_minus = TRUE;
160
278
if (mask_loc == NULL || mask_loc->seqloc_array == NULL)
281
for (slp = query_loc; slp; slp = slp->next)
283
Uint1 strand = SeqLocStrand(slp);
284
if (strand != Seq_strand_minus)
163
291
for (index=0, slp = query_loc; slp; ++index, slp = slp->next)
165
293
const int kCtxIndex = k_num_frames * index; /* context index */
171
299
BlastSeqLoc* loc = NULL;
172
300
SeqLocPtr mask_slp_head = NULL, mask_slp_tail = NULL;
173
for (loc = mask_loc->seqloc_array[tmp_index]; loc; loc = loc->next)
301
if (all_minus || BlastIsReverseStrand(kIsNucl , tmp_index) == FALSE)
175
SeqIntPtr si = SeqIntNew();
176
si->from = loc->ssr->left + slp_from;
177
si->to = loc->ssr->right + slp_from;
178
si->id = SeqIdDup(seqid);
179
/* Append the pointer, but also keep track of the tail of the list
180
* so that appending to the list is a constant operation */
181
mask_slp_tail = ValNodeAddPointer
182
( (mask_slp_tail ? &mask_slp_tail : &mask_slp_head),
303
for (loc = mask_loc->seqloc_array[tmp_index]; loc; loc = loc->next)
305
SeqIntPtr si = SeqIntNew();
306
si->from = loc->ssr->left + slp_from;
307
si->to = loc->ssr->right + slp_from;
308
si->id = SeqIdDup(seqid);
309
/* Append the pointer, but also keep track of the tail of the list
310
* so that appending to the list is a constant operation */
311
mask_slp_tail = ValNodeAddPointer
312
( (mask_slp_tail ? &mask_slp_tail : &mask_slp_head),
186
317
if (mask_slp_head) {
212
343
* @param qinfo Query info structure containing contexts. [in/out]
213
344
* @param index Index of the context to fill. [in]
214
345
* @param length Length of this context. [in]
215
* @param prog Program type of this search. [in]
218
348
s_QueryInfoSetContextInfo(BlastQueryInfo* qinfo,
221
EBlastProgramType prog)
223
qinfo->contexts[index].frame =
224
BLAST_ContextToFrame(prog, index);
226
qinfo->contexts[index].query_index =
227
Blast_GetQueryIndexFromContext(index, prog);
230
353
Uint4 prev_loc = qinfo->contexts[index-1].query_offset;
231
354
Uint4 prev_len = qinfo->contexts[index-1].query_length;
273
if ((query_info = (BlastQueryInfo*) calloc(1, sizeof(BlastQueryInfo)))
400
if ((query_info = BlastQueryInfoNew(program, ValNodeLen(slp))) == NULL)
277
query_info->first_context = 0;
278
query_info->num_queries = ValNodeLen(slp);
279
query_info->last_context = query_info->num_queries*num_frames - 1;
280
total_contexts = query_info->last_context + 1;
282
403
if ((strand = SeqLocStrand(slp)) == Seq_strand_minus) {
284
405
query_info->first_context = 3;
286
407
query_info->first_context = 1;
289
query_info->contexts = calloc(total_contexts, sizeof(BlastContextInfo));
291
410
/* Fill the context offsets */
292
411
for (index = 0; slp; slp = slp->next, index += num_frames) {
317
436
s_QueryInfoSetContextInfo(query_info,
323
441
/* Set the unused trailing contexts if any */
324
442
for (frame = last_frame + 1; frame < num_frames; ++frame) {
325
s_QueryInfoSetContextInfo(query_info, index+frame, 0, program);
443
s_QueryInfoSetContextInfo(query_info, index+frame, 0);
328
446
max_length = MAX(max_length, length);
331
449
if (strand == Seq_strand_plus) {
332
s_QueryInfoSetContextInfo(query_info, index, length, program);
333
s_QueryInfoSetContextInfo(query_info, index+1, 0, program);
450
s_QueryInfoSetContextInfo(query_info, index, length);
451
s_QueryInfoSetContextInfo(query_info, index+1, 0);
334
452
} else if (strand == Seq_strand_minus) {
335
s_QueryInfoSetContextInfo(query_info, index, 0, program);
336
s_QueryInfoSetContextInfo(query_info, index+1, length, program);
453
s_QueryInfoSetContextInfo(query_info, index, 0);
454
s_QueryInfoSetContextInfo(query_info, index+1, length);
338
s_QueryInfoSetContextInfo(query_info, index, length, program);
339
s_QueryInfoSetContextInfo(query_info, index+1, length, program);
456
s_QueryInfoSetContextInfo(query_info, index, length);
457
s_QueryInfoSetContextInfo(query_info, index+1, length);
342
s_QueryInfoSetContextInfo(query_info, index, length, program);
460
s_QueryInfoSetContextInfo(query_info, index, length);