1
/* $Id: blast_seqalign.c,v 1.45 2004/10/06 15:00:44 dondosha Exp $
2
* ===========================================================================
5
* National Center for Biotechnology Information
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's offical duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government have not placed any restriction on its use or reproduction.
14
* Although all reasonable efforts have been taken to ensure the accuracy
15
* and reliability of the software and data, the NLM and the U.S.
16
* Government do not and cannot warrant the performance or results that
17
* may be obtained by using this software or data. The NLM and the U.S.
18
* Government disclaim all warranties, express or implied, including
19
* warranties of performance, merchantability or fitness for any particular
22
* Please cite the author in any work or product based on this material.
24
* ===========================================================================*/
26
/*****************************************************************************
28
File name: blast_seqalign.c
30
Author: Ilya Dondoshansky
32
Contents: Conversion of BLAST results to the SeqAlign form
34
******************************************************************************
38
static char const rcsid[] = "$Id: blast_seqalign.c,v 1.45 2004/10/06 15:00:44 dondosha Exp $";
40
#include <algo/blast/api/blast_seqalign.h>
42
extern Int4 LIBCALL HspArrayPurge PROTO((BlastHSP** hsp_array,
43
Int4 hspcnt, Boolean clear_num));
44
extern SeqIdPtr GetTheSeqAlignID (SeqIdPtr seq_id);
45
extern ScorePtr MakeBlastScore (ScorePtr PNTR old, CharPtr scoretype,
46
Nlm_FloatHi prob, Int4 score);
49
GetScoreSetFromBlastHsp(BlastHSP* hsp)
51
ScorePtr score_set=NULL;
58
MakeBlastScore(&score_set, "score", 0.0, score);
64
MakeBlastScore(&score_set, scoretype, 0.0, score);
68
scoretype = "e_value";
75
MakeBlastScore(&score_set, scoretype, prob, 0);
78
/* Calculate bit score from the raw score */
79
if (hsp->bit_score >= 0.)
80
MakeBlastScore(&score_set, "bit_score", hsp->bit_score, 0);
82
if (hsp->num_ident > 0)
83
MakeBlastScore(&score_set, "num_ident", 0.0, hsp->num_ident);
88
/** Add information on what gis to use to the score set */
89
static Int2 AddGiListToScoreSet(ScorePtr score_set, ValNodePtr gi_list)
93
MakeBlastScore(&score_set, "use_this_gi", 0.0, gi_list->data.intvalue);
94
gi_list = gi_list->next;
100
/*************************************************************************
102
* This function fills in the DenseDiag Information from the variable
103
* hsp. On the first call to this function *old should be
104
* NULL, after that pass in the head of the DenseDiagPtr chain.
105
* The newest DenseDiagPtr is returned.
107
************************************************************************/
110
BLAST_HSPToDenseDiag(DenseDiagPtr* old, BlastHSP* hsp, Boolean reverse,
111
Int4 query_length, Int4 subject_length)
113
DenseDiagPtr ddp, new;
115
new = DenseDiagNew();
117
new->dim = 2; /* Only 2 is supported in spec. */
118
new->len = hsp->query.length;
119
new->starts = (Int4*) calloc(2, sizeof(Int4));
120
new->strands = (Uint1*) calloc(2, sizeof(Uint1));
123
if (hsp->subject.frame >= 0)
125
new->strands[0] = Seq_strand_plus;
126
new->starts[0] = hsp->subject.offset;
130
new->strands[0] = Seq_strand_minus;
131
new->starts[0] = subject_length - hsp->subject.offset - hsp->subject.length;
133
if (hsp->query.frame >= 0)
135
new->strands[1] = Seq_strand_plus;
136
new->starts[1] = hsp->query.offset;
140
new->strands[1] = Seq_strand_minus;
141
new->starts[1] = query_length - hsp->query.offset - hsp->query.length;
146
if (hsp->query.frame >= 0)
148
new->strands[0] = Seq_strand_plus;
149
new->starts[0] = hsp->query.offset;
153
new->strands[0] = Seq_strand_minus;
154
new->starts[0] = query_length - hsp->query.offset - hsp->query.length;
156
if (hsp->subject.frame >= 0)
158
new->strands[1] = Seq_strand_plus;
159
new->starts[1] = hsp->subject.offset;
163
new->strands[1] = Seq_strand_minus;
164
new->starts[1] = subject_length - hsp->subject.offset - hsp->subject.length;
167
new->scores = GetScoreSetFromBlastHsp(hsp);
169
/* Go to the end of the chain, and then attach "new" */
187
/*************************************************************************
189
* This function fills in the StdSeg Information from the variable
190
* hsp. On the first call to this function *old should be
191
* NULL, after that pass in the head of the DenseDiagPtr chain.
192
* The newest StdSeg* is returned.
194
************************************************************************/
196
BLAST_HSPToStdSeg(StdSeg** old, BlastHSP* hsp, Int4 query_length,
197
Int4 subject_length, SeqIdPtr sip, Boolean reverse)
200
SeqIdPtr query_sip, subject_sip;
201
SeqIntPtr seq_int1, seq_int2;
205
/* Duplicate the id and split it up into query and subject parts */
206
query_sip = SeqIdDup(sip);
207
subject_sip = SeqIdDup(sip->next);
209
new->dim = 2; /* Only 2 is supported in spec. */
210
seq_int1 = SeqIntNew();
211
if (hsp->query.frame == 0)
213
seq_int1->from = hsp->query.offset;
214
seq_int1->to = hsp->query.offset + hsp->query.length - 1;
215
seq_int1->strand = Seq_strand_unknown;
217
else if (hsp->query.frame < 0)
219
seq_int1->to = query_length - CODON_LENGTH*hsp->query.offset
221
seq_int1->from = query_length
222
- CODON_LENGTH*(hsp->query.offset+hsp->query.length)
223
+ hsp->query.frame + 1;
224
seq_int1->strand = Seq_strand_minus;
226
else if (hsp->query.frame > 0)
228
seq_int1->from = CODON_LENGTH*(hsp->query.offset) + hsp->query.frame - 1;
229
seq_int1->to = CODON_LENGTH*(hsp->query.offset+hsp->query.length)
230
+ hsp->query.frame - 2;
231
seq_int1->strand = Seq_strand_plus;
233
seq_int1->id = query_sip;
234
seq_int2 = SeqIntNew();
235
if (hsp->subject.frame == 0)
237
seq_int2->from = hsp->subject.offset;
238
seq_int2->to = hsp->subject.offset + hsp->subject.length - 1;
239
seq_int2->strand = Seq_strand_unknown;
241
else if (hsp->subject.frame < 0)
243
seq_int2->from = subject_length -
244
CODON_LENGTH*(hsp->subject.offset + hsp->subject.length)
245
+ hsp->subject.frame + 1;
246
seq_int2->to = subject_length - CODON_LENGTH*(hsp->subject.offset)
247
+ hsp->subject.frame;
248
seq_int2->strand = Seq_strand_minus;
250
else if (hsp->subject.frame > 0)
252
seq_int2->from = CODON_LENGTH*(hsp->subject.offset) +
253
hsp->subject.frame - 1;
254
seq_int2->to = CODON_LENGTH*(hsp->subject.offset + hsp->subject.length)
255
+ hsp->subject.frame - 2;
256
seq_int2->strand = Seq_strand_plus;
258
seq_int2->id = subject_sip;
262
ValNodeAddPointer(&slp, SEQLOC_INT, seq_int2);
263
ValNodeAddPointer(&slp, SEQLOC_INT, seq_int1);
267
ValNodeAddPointer(&slp, SEQLOC_INT, seq_int1);
268
ValNodeAddPointer(&slp, SEQLOC_INT, seq_int2);
272
new->scores = GetScoreSetFromBlastHsp(hsp);
274
/* Go to the end of the chain, and then attach "new" */
292
/************************************************************************
294
* This function assembles all the components of the Seq-align from
295
* a "sparse" BLAST HitList. "sparse" means that the hitlist
296
* may contain no sequence and not even a descriptor. It is only
297
* required to contain the sequence_number that readdb refers to
298
* and scoring/alignment information.
300
* If dbname is non-NULL, then only a general ("gnl") ID is
301
* issued, with the ordinal number of the subject sequence in
304
* Boolean reverse: reverse the query and db order in SeqAlign.
306
************************************************************************/
308
BLAST_UngappedHSPToSeqAlign(EBlastProgramType program_number,
309
BlastHSPList* hsp_list, SeqIdPtr query_id,
310
SeqIdPtr subject_id, Int4 query_length,
311
Int4 subject_length, SeqAlignPtr* seqalign_ptr)
314
DenseDiagPtr ddp_head=NULL, ddp;
316
SeqIdPtr new_sip=NULL;
317
StdSeg* ssp_head=NULL,* ssp;
318
SeqAlignPtr seqalign;
319
Int4 hsp_cnt, index2, hspset_cnt_old;
320
Boolean getdensediag =
321
(program_number == eBlastTypeBlastn ||
322
program_number == eBlastTypeRpsBlast ||
323
program_number == eBlastTypeBlastp);
330
seqalign = SeqAlignNew();
331
seqalign->type = 2; /* alignment is diags */
334
hsp_cnt = hsp_list->hspcnt;
336
for (index2=0; index2<hsp_cnt; index2++) {
337
hsp = hsp_list->hsp_array[index2];
339
sip = GetTheSeqAlignID(query_id);
340
sip->next = SeqIdDup(subject_id);
343
ddp = BLAST_HSPToDenseDiag(&ddp_head, hsp, FALSE, query_length,
347
ssp = BLAST_HSPToStdSeg(&ssp_head, hsp, query_length,
348
subject_length, sip, FALSE);
351
sip = NULL; /* This SeqIdPtr is now on the SeqAlign. */
355
seqalign->segs = ddp_head;
356
seqalign->segtype = 1; /* DenseDiag */
358
seqalign->segs = ssp_head;
359
seqalign->segtype = 3; /* StdSeg */
363
new_sip = SeqIdFree(new_sip);
365
*seqalign_ptr = seqalign;
370
/** Get the current position. */
371
static Int4 get_current_pos(Int4* pos, Int4 length)
375
val = -(*pos + length -1);
382
/** Given gap information in an edit block form, fill the starts, lengths and
385
Boolean GapCollectDataForSeqalign(GapEditBlock* edit_block,
386
GapEditScript* curr_in, Int4 numseg,
390
Int4* start1, Int4* start2)
393
Boolean reverse, translate1, translate2;
395
Int4 begin1, begin2, index, length1, length2;
396
Int4 original_length1, original_length2, i;
397
Int4* length,* start;
398
Uint1 strand1, strand2;
401
reverse = edit_block->reverse;
402
length1 = edit_block->length1;
403
length2 = edit_block->length2;
404
original_length1 = edit_block->original_length1;
405
original_length2 = edit_block->original_length2;
406
translate1 = edit_block->translate1;
407
translate2 = edit_block->translate2;
408
frame1 = edit_block->frame1;
409
frame2 = edit_block->frame2;
412
strand1 = Seq_strand_plus;
414
strand1 = Seq_strand_minus;
416
strand1 = Seq_strand_unknown;
419
strand2 = Seq_strand_plus;
421
strand2 = Seq_strand_minus;
423
strand2 = Seq_strand_unknown;
425
start = (Int4 *) calloc((2*numseg+1), sizeof(Int4));
426
length = (Int4 *) calloc((numseg+1), sizeof(Int4));
427
strands = (Uint1 *) calloc((2*numseg+1), sizeof(Uint1));
430
for (i = 0, curr=curr_in; curr && i < numseg; curr=curr->next, i++) {
431
switch(curr->op_type) {
432
case eGapAlignDecline:
434
if (strand1 != Seq_strand_minus) {
435
if(translate1 == FALSE)
436
begin1 = get_current_pos(start1, curr->num);
438
begin1 = frame1 - 1 + CODON_LENGTH*get_current_pos(start1, curr->num);
440
if(translate1 == FALSE)
441
begin1 = length1 - get_current_pos(start1, curr->num) - curr->num;
443
begin1 = original_length1 - CODON_LENGTH*(get_current_pos(start1, curr->num)+curr->num) + frame1 + 1;
446
if (strand2 != Seq_strand_minus) {
447
if(translate2 == FALSE)
448
begin2 = get_current_pos(start2, curr->num);
450
begin2 = frame2 - 1 + CODON_LENGTH*get_current_pos(start2, curr->num);
452
if(translate2 == FALSE)
453
begin2 = length2 - get_current_pos(start2, curr->num) - curr->num;
455
begin2 = original_length2 - CODON_LENGTH*(get_current_pos(start2, curr->num)+curr->num) + frame2 + 1;
459
strands[2*index] = strand2;
460
strands[2*index+1] = strand1;
461
start[2*index] = begin2;
462
start[2*index+1] = begin1;
464
strands[2*index] = strand1;
465
strands[2*index+1] = strand2;
466
start[2*index] = begin1;
467
start[2*index+1] = begin2;
474
if (strand2 != Seq_strand_minus) {
475
if(translate2 == FALSE)
476
begin2 = get_current_pos(start2, curr->num);
478
begin2 = frame2 - 1 + CODON_LENGTH*get_current_pos(start2, curr->num);
480
if(translate2 == FALSE)
481
begin2 = length2 - get_current_pos(start2, curr->num) - curr->num;
483
begin2 = original_length2 - CODON_LENGTH*(get_current_pos(start2, curr->num)+curr->num) + frame2 + 1;
487
strands[2*index] = strand2;
489
strands[2*index+1] = strands[2*(index-1)+1];
491
strands[2*index+1] = Seq_strand_unknown;
492
start[2*index] = begin2;
493
start[2*index+1] = begin1;
496
strands[2*index] = strands[2*(index-1)];
498
strands[2*index] = Seq_strand_unknown;
499
strands[2*index+1] = strand2;
500
start[2*index] = begin1;
501
start[2*index+1] = begin2;
507
if (strand1 != Seq_strand_minus) {
508
if(translate1 == FALSE)
509
begin1 = get_current_pos(start1, curr->num);
511
begin1 = frame1 - 1 + CODON_LENGTH*get_current_pos(start1, curr->num);
513
if(translate1 == FALSE)
514
begin1 = length1 - get_current_pos(start1, curr->num) - curr->num;
516
begin1 = original_length1 - CODON_LENGTH*(get_current_pos(start1, curr->num)+curr->num) + frame1 + 1;
521
strands[2*index] = strands[2*(index-1)];
523
strands[2*index] = Seq_strand_unknown;
524
strands[2*index+1] = strand1;
525
start[2*index] = begin2;
526
start[2*index+1] = begin1;
528
strands[2*index] = strand1;
530
strands[2*index+1] = strands[2*(index-1)+1];
532
strands[2*index+1] = Seq_strand_unknown;
533
start[2*index] = begin1;
534
start[2*index+1] = begin2;
541
length[index] = curr->num;
550
*length_out = length;
554
*strands_out = strands;
561
static void GapCorrectUASequence(GapEditBlock* edit_block)
563
GapEditScript* curr,* curr_last,* curr_last2;
570
for (curr=edit_block->esp; curr; curr = curr->next) {
572
if(curr->op_type == eGapAlignDecline && last_indel == TRUE) {
573
/* This is invalid condition and regions should be
576
if(curr_last2 != NULL)
577
curr_last2->next = curr;
579
edit_block->esp = curr; /* First element in a list */
581
curr_last->next = curr->next;
582
curr->next = curr_last;
587
if(curr->op_type == eGapAlignIns || curr->op_type == eGapAlignDel) {
589
curr_last2 = curr_last;
597
static SeqAlignPtr GapMakeSeqAlign(SeqIdPtr query_id, SeqIdPtr subject_id,
598
Boolean reverse, Boolean translate1,
599
Boolean translate2, Int4 numseg,
600
Int4* length, Int4* start,
605
StdSeg* sseg,* sseg_head,* sseg_old;
606
SeqLocPtr slp, slp1, slp2;
613
sap->dim =2; /**only two dimention alignment**/
615
/**make the Denseg Object for SeqAlign**/
616
if (translate1 == FALSE && translate2 == FALSE) {
617
sap->segtype = SAS_DENSEG; /** use denseg to store the alignment **/
618
sap->type = SAT_PARTIAL; /**partial for gapped translating search.*/
621
dsp->numseg = numseg;
623
dsp->ids = SeqIdDup(subject_id);
624
dsp->ids->next = SeqIdDup(query_id);
626
dsp->ids = SeqIdDup(query_id);
627
dsp->ids->next = SeqIdDup(subject_id);
630
dsp->strands = strands;
635
sap->type = SAT_PARTIAL; /**partial for gapped translating search. */
636
sap->segtype = SAS_STD; /**use stdseg to store the alignment**/
640
if(reverse) { /* Reverting translation flags */
641
tmp_value = translate1;
642
translate1 = translate2;
643
translate2 = tmp_value;
646
for (index=0; index<numseg; index++) {
649
if (sseg_head == NULL) {
653
sseg->ids = SeqIdDup(subject_id);
654
sseg->ids->next = SeqIdDup(query_id);
656
sseg->ids = SeqIdDup(query_id);
657
sseg->ids->next = SeqIdDup(subject_id);
661
if (start[2*index] != -1) {
662
seq_int1 = SeqIntNew();
663
seq_int1->from = start[2*index];
665
seq_int1->to = start[2*index] + CODON_LENGTH*length[index] - 1;
667
seq_int1->to = start[2*index] + length[index] - 1;
668
seq_int1->strand = strands[2*index];
671
seq_int1->id = SeqIdDup(subject_id);
673
seq_int1->id = SeqIdDup(query_id);
676
ValNodeAddPointer(&slp1, SEQLOC_INT, seq_int1);
679
ValNodeAddPointer(&slp1, SEQLOC_EMPTY, SeqIdDup(subject_id));
681
ValNodeAddPointer(&slp1, SEQLOC_EMPTY, SeqIdDup(query_id));
685
if (start[2*index+1] != -1) {
686
seq_int1 = SeqIntNew();
687
seq_int1->from = start[2*index+1];
689
seq_int1->to = start[2*index+1] + CODON_LENGTH*length[index] - 1;
691
seq_int1->to = start[2*index+1] + length[index] - 1;
692
seq_int1->strand = strands[2*index+1];
695
seq_int1->id = SeqIdDup(query_id);
697
seq_int1->id = SeqIdDup(subject_id);
700
ValNodeAddPointer(&slp2, SEQLOC_INT, seq_int1);
703
ValNodeAddPointer(&slp2, SEQLOC_EMPTY, SeqIdDup(query_id));
705
ValNodeAddPointer(&slp2, SEQLOC_EMPTY, SeqIdDup(subject_id));
723
sseg_old->next = sseg;
726
sap->segs = sseg_head;
737
/** Convert an EditScript chain to a SeqAlign of type DenseSeg.
738
* Used for a non-simple interval (i.e., one without subs. or
741
* The first SeqIdPtr in the chain of subject_id and query_id is duplicated for
745
GapEditBlockToSeqAlign(GapEditBlock* edit_block, SeqIdPtr subject_id, SeqIdPtr query_id)
748
GapEditScript* curr,* curr_last;
749
Int4 numseg, start1, start2;
750
Int4* length,* start;
752
Boolean is_disc_align = FALSE;
753
SeqAlignPtr sap, sap_disc, sap_head, sap_tail;
754
Boolean skip_region = FALSE;
756
is_disc_align = FALSE; numseg = 0;
758
for (curr=edit_block->esp; curr; curr = curr->next) {
760
if(/*edit_block->discontinuous && */curr->op_type == eGapAlignDecline)
761
is_disc_align = TRUE;
764
start1 = edit_block->start1;
765
start2 = edit_block->start2;
767
/* If no eGapAlignDecline regions exists output seqalign will be
768
regular Den-Seg or Std-seg */
769
if(is_disc_align == FALSE) {
770
/* Please note, that edit_block passed only for data like
771
strand, translate, reverse etc. Real data is taken starting
772
from "curr" and taken only "numseg" segments */
774
GapCollectDataForSeqalign(edit_block, edit_block->esp, numseg,
775
&start, &length, &strands, &start1, &start2);
777
/* Result of this function will be either den-seg or Std-seg
778
depending on translation options */
779
sap = GapMakeSeqAlign(query_id, subject_id, edit_block->reverse,
780
edit_block->translate1, edit_block->translate2,
781
numseg, length, start, strands);
784
/* By request of Steven Altschul - we need to have
785
the unaligned part being to the left if it is adjacent to the
786
gap (insertion or deletion) - so this function will do
789
GapCorrectUASequence(edit_block);
791
sap_disc = SeqAlignNew();
793
sap_disc->type = SAT_PARTIAL; /* ordered segments, over part of seq */
794
sap_disc->segtype = SAS_DISC; /* discontinuous alignment */
796
curr_last = edit_block->esp;
797
sap_head = NULL; sap_tail = NULL;
798
while(curr_last != NULL) {
800
for (numseg = 0, curr = curr_last;
801
curr; curr = curr->next, numseg++) {
803
if(curr->op_type == eGapAlignDecline) {
804
if(numseg != 0) { /* End of aligned area */
807
while(curr && curr->op_type == eGapAlignDecline) {
817
GapCollectDataForSeqalign(edit_block, curr_last, numseg,
818
&start, &length, &strands,
822
sap = GapMakeSeqAlign(query_id, subject_id,
824
edit_block->translate1,
825
edit_block->translate2,
826
numseg, length, start, strands);
828
/* Collecting all seqaligns into single linked list */
829
if(sap_tail == NULL) {
830
sap_head = sap_tail = sap;
832
sap_tail->next = sap;
839
sap_disc->segs = sap_head;
846
/** This function is used for Out-Of-Frame traceback conversion
847
* Convert an OOF EditScript chain to a SeqAlign of type StdSeg.
848
* Used for a non-simple interval (i.e., one without subs. or
850
* The first SeqIdPtr in the chain of subject_id and query_id is
851
* duplicated for the SeqAlign.
853
static SeqAlignPtr LIBCALL
854
BLAST_OOFEditBlockToSeqAlign(EBlastProgramType program,
855
GapEditBlock* edit_block, SeqIdPtr query_id, SeqIdPtr subject_id)
857
Boolean reverse = FALSE;
858
GapEditScript* curr,* esp;
861
Int4 original_length1, original_length2;
863
SeqIntPtr seq_int1, seq_int2;
864
SeqIntPtr seq_int1_last = NULL, seq_int2_last = NULL;
865
SeqIdPtr sip, id1, id2;
866
SeqLocPtr slp, slp1, slp2;
867
StdSeg* sseg,* sseg_head,* sseg_old;
868
Uint1 strand1, strand2;
871
if (program == eBlastTypeBlastx) {
873
start1 = edit_block->start2;
874
start2 = edit_block->start1;
875
frame1 = edit_block->frame2;
876
frame2 = edit_block->frame1;
877
original_length1 = edit_block->original_length2;
878
original_length2 = edit_block->original_length1;
882
start1 = edit_block->start1;
883
start2 = edit_block->start2;
884
frame1 = edit_block->frame1;
885
frame2 = edit_block->frame2;
886
original_length1 = edit_block->original_length1;
887
original_length2 = edit_block->original_length2;
893
strand1 = Seq_strand_plus;
895
strand1 = Seq_strand_minus;
897
strand1 = Seq_strand_unknown;
900
strand2 = Seq_strand_plus;
902
strand2 = Seq_strand_minus;
904
strand2 = Seq_strand_unknown;
906
esp = edit_block->esp;
910
sap->dim =2; /**only two dimention alignment**/
912
sap->type =3; /**partial for gapped translating search. */
913
sap->segtype =3; /**use denseg to store the alignment**/
916
esp = edit_block->esp;
920
for (curr=esp; curr; curr=curr->next) {
925
switch (curr->op_type) {
926
case eGapAlignDel: /* deletion of three nucleotides. */
930
seq_int1 = SeqIntNew();
931
seq_int1->from = get_current_pos(&start1, curr->num);
932
seq_int1->to = start1 - 1;
934
if(seq_int1->to >= original_length1)
935
seq_int1->to = original_length1-1;
937
seq_int1->id = SeqIdDup(id1);
938
seq_int1->strand = strand1;
940
ValNodeAddPointer(&slp1, SEQLOC_INT, seq_int1);
942
/* Empty nucleotide piece */
943
ValNodeAddPointer(&slp2, SEQLOC_EMPTY, SeqIdDup(id2));
945
seq_int1_last = seq_int1;
946
/* Keep previous seq_int2_last, in case there is a frame shift
947
immediately after this gap */
951
case eGapAlignIns: /* insertion of three nucleotides. */
953
/* If gap is followed after frameshift - we have to
954
add this element for the alignment to be correct */
956
if(first_shift == TRUE) { /* Second frameshift in a row */
957
/* Protein coordinates */
958
seq_int1 = SeqIntNew();
959
seq_int1->from = get_current_pos(&start1, 1);
960
seq_int1->to = start1 - 1;
962
if(seq_int1->to >= original_length1)
963
seq_int1->to = original_length1-1;
965
seq_int1->id = SeqIdDup(id1);
966
seq_int1->strand = strand1;
968
ValNodeAddPointer(&slp1, SEQLOC_INT, seq_int1);
970
/* Nucleotide scale shifted by op_type */
971
seq_int2 = SeqIntNew();
973
seq_int2->from = get_current_pos(&start2, 3);
974
seq_int2->to = start2 - 1;
976
if(seq_int2->to >= original_length2) {
977
seq_int2->to = original_length2 - 1;
981
/* Transfer to DNA minus strand coordinates */
982
if(strand2 == Seq_strand_minus) {
984
tmp_int = seq_int2->to;
985
seq_int2->to = original_length2 - seq_int2->from - 1;
986
seq_int2->from = original_length2 - tmp_int - 1;
989
seq_int2->id = SeqIdDup(id2);
990
seq_int2->strand = strand2;
992
ValNodeAddPointer(&slp2, SEQLOC_INT, seq_int2);
994
/* seq_int1_last = seq_int1;
995
seq_int2_last = seq_int2; */
997
/* first_shift = FALSE; */
1002
sip = SeqIdDup(id2);
1003
sip->next = SeqIdDup(id1);
1007
sip = SeqIdDup(id1);
1008
sip->next = SeqIdDup(id2);
1014
if (sseg_head == NULL)
1021
sseg_old->next = sseg;
1029
first_shift = FALSE;
1031
/* Protein piece is empty */
1032
ValNodeAddPointer(&slp1, SEQLOC_EMPTY, SeqIdDup(id1));
1034
/* Nucleotide scale shifted by 3, protein gapped */
1035
seq_int2 = SeqIntNew();
1036
seq_int2->from = get_current_pos(&start2, curr->num*3);
1037
seq_int2->to = start2 - 1;
1039
if(seq_int2->to >= original_length2) {
1040
seq_int2->to = original_length2 -1;
1043
/* Transfer to DNA minus strand coordinates */
1044
if(strand2 == Seq_strand_minus) {
1046
tmp_int = seq_int2->to;
1047
seq_int2->to = original_length2 - seq_int2->from - 1;
1048
seq_int2->from = original_length2 - tmp_int - 1;
1051
seq_int2->id = SeqIdDup(id2);
1052
seq_int2->strand = strand2;
1054
ValNodeAddPointer(&slp2, SEQLOC_INT, seq_int2);
1056
seq_int1_last = NULL;
1057
seq_int2_last = seq_int2; /* Will be used to adjust "to" value */
1061
case eGapAlignSub: /* Substitution. */
1063
first_shift = FALSE;
1065
/* Protein coordinates */
1066
seq_int1 = SeqIntNew();
1067
seq_int1->from = get_current_pos(&start1, curr->num);
1068
seq_int1->to = start1 - 1;
1070
if(seq_int1->to >= original_length1)
1071
seq_int1->to = original_length1-1;
1073
seq_int1->id = SeqIdDup(id1);
1074
seq_int1->strand = strand1;
1076
ValNodeAddPointer(&slp1, SEQLOC_INT, seq_int1);
1078
/* Nucleotide scale shifted by op_type */
1079
seq_int2 = SeqIntNew();
1082
get_current_pos(&start2, curr->num*(Uint1)curr->op_type);
1083
seq_int2->to = start2 - 1;
1085
/* Chop off three bases and one residue at a time.
1086
Why does this happen, seems like a bug?
1088
while (seq_int2->to >= original_length2) {
1093
/* Transfer to DNA minus strand coordinates */
1094
if(strand2 == Seq_strand_minus) {
1096
tmp_int = seq_int2->to;
1097
seq_int2->to = original_length2 - seq_int2->from - 1;
1098
seq_int2->from = original_length2 - tmp_int - 1;
1101
seq_int2->id = SeqIdDup(id2);
1102
seq_int2->strand = strand2;
1104
ValNodeAddPointer(&slp2, SEQLOC_INT, seq_int2);
1106
seq_int1_last = seq_int1; /* Will be used to adjust "to" value */
1107
seq_int2_last = seq_int2; /* Will be used to adjust "to" value */
1110
case eGapAlignDel2: /* gap of two nucleotides. */
1111
case eGapAlignDel1: /* Gap of one nucleotide. */
1112
case eGapAlignIns1: /* Insertion of one nucleotide. */
1113
case eGapAlignIns2: /* Insertion of two nucleotides. */
1115
if(first_shift == TRUE) { /* Second frameshift in a row */
1116
/* Protein coordinates */
1117
seq_int1 = SeqIntNew();
1118
seq_int1->from = get_current_pos(&start1, 1);
1119
seq_int1->to = start1 - 1;
1121
if(seq_int1->to >= original_length1)
1122
seq_int1->to = original_length1-1;
1124
seq_int1->id = SeqIdDup(id1);
1125
seq_int1->strand = strand1;
1127
ValNodeAddPointer(&slp1, SEQLOC_INT, seq_int1);
1129
/* Nucleotide scale shifted by op_type */
1130
seq_int2 = SeqIntNew();
1133
get_current_pos(&start2, (Uint1)curr->op_type);
1134
seq_int2->to = start2 - 1;
1136
if(seq_int2->to >= original_length2) {
1137
seq_int2->to = original_length2 -1;
1141
/* Transfer to DNA minus strand coordinates */
1142
if(strand2 == Seq_strand_minus) {
1144
tmp_int = seq_int2->to;
1145
seq_int2->to = original_length2 - seq_int2->from - 1;
1146
seq_int2->from = original_length2 - tmp_int - 1;
1149
seq_int2->id = SeqIdDup(id2);
1150
seq_int2->strand = strand2;
1152
ValNodeAddPointer(&slp2, SEQLOC_INT, seq_int2);
1154
seq_int1_last = seq_int1;
1155
seq_int2_last = seq_int2;
1157
/* first_shift = FALSE; */
1164
/* If this substitution is following simple frameshift
1165
we do not need to start new segment, but may continue
1167
if(seq_int2_last != NULL) {
1168
get_current_pos(&start2, curr->num*((Uint1)curr->op_type-3));
1169
if(strand2 != Seq_strand_minus) {
1170
seq_int2_last->to = start2 - 1;
1172
/* Transfer to DNA minus strand coordinates */
1173
seq_int2_last->from = original_length2 - start2;
1176
/* Adjustment for multiple shifts - theoretically possible,
1177
but very unprobable */
1178
if(seq_int2_last->from > seq_int2_last->to) {
1180
if(strand2 != Seq_strand_minus) {
1181
seq_int2_last->to += 3;
1183
seq_int2_last->from -= 3;
1186
if(seq_int1_last != 0)
1190
} else if ((Uint1)curr->op_type > 3) {
1191
/* Protein piece is empty */
1192
ValNodeAddPointer(&slp1, SEQLOC_EMPTY, SeqIdDup(id1));
1193
/* Simulating insertion of nucleotides */
1194
seq_int2 = SeqIntNew();
1196
get_current_pos(&start2,
1197
curr->num*((Uint1)curr->op_type-3));
1198
seq_int2->to = start2 - 1;
1200
if(seq_int2->to >= original_length2) {
1201
seq_int2->to = original_length2 - 1;
1204
/* Transfer to DNA minus strand coordinates */
1205
if(strand2 == Seq_strand_minus) {
1207
tmp_int = seq_int2->to;
1208
seq_int2->to = original_length2 - seq_int2->from - 1;
1209
seq_int2->from = original_length2 - tmp_int - 1;
1212
seq_int2->id = SeqIdDup(id2);
1213
seq_int2->strand = strand2;
1215
ValNodeAddPointer(&slp2, SEQLOC_INT, seq_int2);
1217
seq_int1_last = NULL;
1218
seq_int2_last = seq_int2; /* Will be used to adjust "to" value */
1221
continue; /* Main loop */
1223
continue; /* Main loop */
1226
continue; /* Main loop */
1233
sip = SeqIdDup(id2);
1234
sip->next = SeqIdDup(id1);
1238
sip = SeqIdDup(id1);
1239
sip->next = SeqIdDup(id2);
1245
if (sseg_head == NULL)
1252
sseg_old->next = sseg;
1256
sap->segs = sseg_head;
1263
BLAST_GapInfoToSeqAlign(EBlastProgramType program_number,
1264
BlastHSPList* hsp_list, SeqIdPtr query_id,
1265
SeqIdPtr subject_id, Int4 query_length,
1266
Boolean is_ooframe, SeqAlignPtr* head_seqalign)
1269
BlastHSP** hsp_array;
1270
SeqAlignPtr last_seqalign = NULL, seqalign = NULL;
1273
*head_seqalign = NULL;
1275
hsp_array = hsp_list->hsp_array;
1277
for (index=0; index<hsp_list->hspcnt; index++) {
1278
hsp_array[index]->gap_info->original_length1 = query_length;
1280
seqalign = BLAST_OOFEditBlockToSeqAlign(program_number,
1281
hsp_array[index]->gap_info,
1282
query_id, subject_id);
1284
/* The following line is needed for negative frames of translated
1286
seqalign = GapEditBlockToSeqAlign(hsp_array[index]->gap_info,
1287
subject_id, query_id);
1290
*head_seqalign = last_seqalign = seqalign;
1292
last_seqalign->next = seqalign;
1293
last_seqalign = last_seqalign->next;
1295
seqalign->score = GetScoreSetFromBlastHsp(hsp_array[index]);
1301
Int2 BLAST_ResultsToSeqAlign(EBlastProgramType program_number,
1302
BlastHSPResults* results, SeqLocPtr query_slp,
1303
ReadDBFILE* rdfp, SeqLoc* subject_slp,
1304
Boolean is_gapped, Boolean is_ooframe,
1305
SeqAlignPtr* head_seqalign)
1307
Int4 query_index, subject_index;
1308
SeqLocPtr slp = query_slp;
1309
SeqIdPtr query_id, subject_id = NULL;
1310
BlastHitList* hit_list;
1311
BlastHSPList* hsp_list;
1312
SeqAlignPtr seqalign = NULL, last_seqalign = NULL;
1313
Int4 subject_length = 0;
1314
SeqLoc** subject_loc_array = NULL;
1316
*head_seqalign = NULL;
1320
if (!rdfp && !subject_slp)
1325
(SeqLoc**) malloc(ValNodeLen(subject_slp)*sizeof(SeqLoc*));
1326
for (slp = subject_slp, subject_index = 0; slp; slp = slp->next, ++subject_index)
1327
subject_loc_array[subject_index] = slp;
1331
for (query_index = 0; slp && query_index < results->num_queries;
1332
++query_index, slp = slp->next) {
1333
hit_list = results->hitlist_array[query_index];
1336
query_id = SeqLocId(slp);
1338
for (subject_index = 0; subject_index < hit_list->hsplist_count;
1340
hsp_list = hit_list->hsplist_array[subject_index];
1344
/* NB: The following call allocates the SeqId structure. */
1345
readdb_get_descriptor(rdfp, hsp_list->oid, &subject_id, NULL);
1346
subject_length = readdb_get_sequence_length(rdfp, hsp_list->oid);
1348
/* NB: The following call does not allocate the SeqId structure,
1349
but returns the existing one. */
1350
subject_id = SeqLocId(subject_loc_array[hsp_list->oid]);
1351
subject_length = SeqLocLen(subject_loc_array[hsp_list->oid]);
1355
BLAST_GapInfoToSeqAlign(program_number, hsp_list, query_id,
1356
subject_id, SeqLocLen(slp), is_ooframe, &seqalign);
1358
BLAST_UngappedHSPToSeqAlign(program_number, hsp_list, query_id,
1359
subject_id, SeqLocLen(slp), subject_length, &seqalign);
1362
/* The subject id must be deallocated only in case of a ReadDB
1365
subject_id = SeqIdSetFree(subject_id);
1368
if (!last_seqalign) {
1369
*head_seqalign = last_seqalign = seqalign;
1371
last_seqalign->next = seqalign;
1373
for ( ; last_seqalign->next; last_seqalign = last_seqalign->next);
1378
sfree(subject_loc_array);
1383
void Blast_AdjustOffsetsInSeqAlign(SeqAlign* head, SeqLoc* query_slp,
1384
SeqLoc* subject_slp)
1386
SeqAlign* seqalign, *next_seqalign = NULL, *last_seqalign;
1387
SeqLoc* qslp, *sslp;
1388
SeqId* query_id, *subject_id;
1393
for (seqalign = head; seqalign; seqalign = next_seqalign) {
1394
query_id = TxGetQueryIdFromSeqAlign(seqalign);
1395
subject_id = TxGetSubjectIdFromSeqAlign(seqalign);
1397
/* Find Seq-locs corresponding to these query and subject. Start looking
1398
from the previously found Seq-locs, because Seq-aligns are sorted
1400
for ( ; qslp; qslp = qslp->next) {
1401
if (SeqIdComp(query_id, SeqLocId(qslp)) == SIC_YES)
1403
/* Changing query: return subject Seq-loc pointer to the beginning
1404
of the list of subject Seq-locs.*/
1407
for ( ; sslp; sslp = sslp->next) {
1408
if (SeqIdComp(subject_id, SeqLocId(sslp)) == SIC_YES)
1412
/* Find the first Seq-align that has either different subject or
1414
last_seqalign = seqalign;
1415
for (next_seqalign = seqalign->next; next_seqalign;
1416
next_seqalign = next_seqalign->next) {
1417
if ((SeqIdComp(subject_id, TxGetSubjectIdFromSeqAlign(next_seqalign))
1419
(SeqIdComp(query_id, TxGetQueryIdFromSeqAlign(next_seqalign))
1421
/* Unlink the Seq-align chain */
1422
last_seqalign->next = NULL;
1425
last_seqalign = next_seqalign;
1428
AdjustOffSetsInSeqAlign(seqalign, qslp, sslp);
1429
/* Reconnect the Seq-align chain */
1430
last_seqalign->next = next_seqalign;