28
28
* Conversion of BLAST results to the SeqAlign form
31
static char const rcsid[] = "$Id: blast_seqalign.c,v 1.52 2005/04/06 23:27:53 dondosha Exp $";
31
static char const rcsid[] = "$Id: blast_seqalign.c,v 1.57 2006/03/21 15:27:58 madden Exp $";
33
33
#include <algo/blast/api/blast_seqalign.h>
35
extern Int4 LIBCALL HspArrayPurge PROTO((BlastHSP** hsp_array,
36
Int4 hspcnt, Boolean clear_num));
37
35
extern SeqIdPtr GetTheSeqAlignID (SeqIdPtr seq_id);
38
36
extern ScorePtr MakeBlastScore (ScorePtr PNTR old, CharPtr scoretype,
39
37
Nlm_FloatHi prob, Int4 score);
45
SBlastSeqalignArrayNew(Int4 size)
47
SBlastSeqalignArray* retval = NULL;
52
retval = (SBlastSeqalignArray*) malloc(sizeof(SBlastSeqalignArray));
55
retval->num_queries = size;
56
retval->array = (SeqAlign**) calloc(size, sizeof(SeqAlign*));
57
if (retval->array == NULL)
67
SBlastSeqalignArrayFree(SBlastSeqalignArray* seqalign_vec)
71
if (seqalign_vec == NULL)
74
for (index=0; index<seqalign_vec->num_queries; index++)
76
SeqAlignSetFree(seqalign_vec->array[index]);
78
sfree(seqalign_vec->array);
46
83
/** Creates a score set corresponding to one HSP.
47
84
* @param hsp HSP structure [in]
48
85
* @return Score set for this HSP.
386
GapCollectDataForSeqalign(BlastHSP* hsp, GapEditScript* curr_in, Int4 numseg,
427
GapCollectDataForSeqalign(BlastHSP* hsp, GapEditScript* esp, Int4 first, Int4 number,
387
428
Int4 query_length, Int4 subject_length,
388
429
Boolean translate1, Boolean translate2,
389
430
Int4** start_out, Int4** length_out,
390
431
Uint1** strands_out, Int4* start1, Int4* start2)
393
433
Int2 frame1, frame2;
394
434
Int4 begin1, begin2, index, length1, length2;
395
435
Int4 original_length1, original_length2, i;
424
464
strand2 = Seq_strand_unknown;
426
start = (Int4 *) calloc((2*numseg+1), sizeof(Int4));
427
length = (Int4 *) calloc((numseg+1), sizeof(Int4));
428
strands = (Uint1 *) calloc((2*numseg+1), sizeof(Uint1));
466
start = (Int4 *) calloc((2*number+1), sizeof(Int4));
467
length = (Int4 *) calloc((number+1), sizeof(Int4));
468
strands = (Uint1 *) calloc((2*number+1), sizeof(Uint1));
431
for (i = 0, curr=curr_in; curr && i < numseg; curr=curr->next, i++) {
432
switch(curr->op_type) {
471
for (i=first; i<number; i++)
473
switch(esp->op_type[i]) {
433
474
case eGapAlignDecline:
434
475
case eGapAlignSub:
435
476
if (strand1 != Seq_strand_minus) {
436
477
if(translate1 == FALSE)
437
begin1 = s_GetCurrentPos(start1, curr->num);
478
begin1 = s_GetCurrentPos(start1, esp->num[i]);
439
begin1 = frame1 - 1 + CODON_LENGTH*s_GetCurrentPos(start1, curr->num);
480
begin1 = frame1 - 1 + CODON_LENGTH*s_GetCurrentPos(start1, esp->num[i]);
441
482
if(translate1 == FALSE)
442
begin1 = length1 - s_GetCurrentPos(start1, curr->num) - curr->num;
483
begin1 = length1 - s_GetCurrentPos(start1, esp->num[i]) - esp->num[i];
444
begin1 = original_length1 - CODON_LENGTH*(s_GetCurrentPos(start1, curr->num)+curr->num) + frame1 + 1;
485
begin1 = original_length1 - CODON_LENGTH*(s_GetCurrentPos(start1, esp->num[i])+esp->num[i]) + frame1 + 1;
447
488
if (strand2 != Seq_strand_minus) {
448
489
if(translate2 == FALSE)
449
begin2 = s_GetCurrentPos(start2, curr->num);
490
begin2 = s_GetCurrentPos(start2, esp->num[i]);
451
begin2 = frame2 - 1 + CODON_LENGTH*s_GetCurrentPos(start2, curr->num);
492
begin2 = frame2 - 1 + CODON_LENGTH*s_GetCurrentPos(start2, esp->num[i]);
453
494
if(translate2 == FALSE)
454
begin2 = length2 - s_GetCurrentPos(start2, curr->num) - curr->num;
495
begin2 = length2 - s_GetCurrentPos(start2, esp->num[i]) - esp->num[i];
456
begin2 = original_length2 - CODON_LENGTH*(s_GetCurrentPos(start2, curr->num)+curr->num) + frame2 + 1;
497
begin2 = original_length2 - CODON_LENGTH*(s_GetCurrentPos(start2, esp->num[i])+esp->num[i]) + frame2 + 1;
459
500
strands[2*index] = strand1;
468
509
if (strand2 != Seq_strand_minus) {
469
510
if(translate2 == FALSE)
470
begin2 = s_GetCurrentPos(start2, curr->num);
511
begin2 = s_GetCurrentPos(start2, esp->num[i]);
472
begin2 = frame2 - 1 + CODON_LENGTH*s_GetCurrentPos(start2, curr->num);
513
begin2 = frame2 - 1 + CODON_LENGTH*s_GetCurrentPos(start2, esp->num[i]);
474
515
if(translate2 == FALSE)
475
begin2 = length2 - s_GetCurrentPos(start2, curr->num) - curr->num;
516
begin2 = length2 - s_GetCurrentPos(start2, esp->num[i]) - esp->num[i];
477
begin2 = original_length2 - CODON_LENGTH*(s_GetCurrentPos(start2, curr->num)+curr->num) + frame2 + 1;
518
begin2 = original_length2 - CODON_LENGTH*(s_GetCurrentPos(start2, esp->num[i])+esp->num[i]) + frame2 + 1;
490
531
case eGapAlignIns:
491
532
if (strand1 != Seq_strand_minus) {
492
533
if(translate1 == FALSE)
493
begin1 = s_GetCurrentPos(start1, curr->num);
534
begin1 = s_GetCurrentPos(start1, esp->num[i]);
495
begin1 = frame1 - 1 + CODON_LENGTH*s_GetCurrentPos(start1, curr->num);
536
begin1 = frame1 - 1 + CODON_LENGTH*s_GetCurrentPos(start1, esp->num[i]);
497
538
if(translate1 == FALSE)
498
begin1 = length1 - s_GetCurrentPos(start1, curr->num) - curr->num;
539
begin1 = length1 - s_GetCurrentPos(start1, esp->num[i]) - esp->num[i];
500
begin1 = original_length1 - CODON_LENGTH*(s_GetCurrentPos(start1, curr->num)+curr->num) + frame1 + 1;
541
begin1 = original_length1 - CODON_LENGTH*(s_GetCurrentPos(start1, esp->num[i])+esp->num[i]) + frame1 + 1;
503
544
strands[2*index] = strand1;
539
580
s_GapCorrectUASequence(BlastHSP* hsp)
541
GapEditScript* curr,* curr_last,* curr_last2;
548
for (curr=hsp->gap_info; curr; curr = curr->next) {
550
if(curr->op_type == eGapAlignDecline && last_indel == TRUE) {
582
GapEditScript* esp = hsp->gap_info;
585
for (index=0; index<esp->size; index++)
587
// if GAPALIGN_DECLINE immediately follows an insertion or deletion
588
if (index > 0 && esp->op_type[index] == eGapAlignDecline &&
589
(esp->op_type[index-1] == eGapAlignIns || esp->op_type[index-1] == eGapAlignDel))
551
591
/* This is invalid condition and regions should be
554
if(curr_last2 != NULL)
555
curr_last2->next = curr;
557
hsp->gap_info = curr; /* First element in a list */
559
curr_last->next = curr->next;
560
curr->next = curr_last;
565
if(curr->op_type == eGapAlignIns || curr->op_type == eGapAlignDel) {
567
curr_last2 = curr_last;
593
int temp_num = esp->num[index];
594
EGapAlignOpType temp_op = esp->op_type[index];
596
esp->num[index] = esp->num[index-1];
597
esp->op_type[index] = esp->op_type[index-1];
598
esp->num[index-1] = temp_num;
599
esp->op_type[index-1] = temp_op;
699
729
Int4 query_length, Int4 subject_length)
702
GapEditScript* curr,* curr_last;
703
Int4 numseg, start1, start2;
704
734
Int4* length,* start;
706
736
Boolean is_disc_align = FALSE;
707
737
SeqAlignPtr sap, sap_disc, sap_head, sap_tail;
708
Boolean skip_region = FALSE;
709
738
Boolean translate1, translate2;
711
is_disc_align = FALSE; numseg = 0;
713
for (curr=hsp->gap_info; curr; curr = curr->next) {
715
if(/*edit_block->discontinuous && */curr->op_type == eGapAlignDecline)
741
is_disc_align = FALSE;
744
for (index=0; index<esp->size; index++)
746
if(esp->op_type[index] == eGapAlignDecline)
716
748
is_disc_align = TRUE;
719
753
start1 = hsp->query.offset;
730
764
strand, translate, reverse etc. Real data is taken starting
731
765
from "curr" and taken only "numseg" segments */
733
GapCollectDataForSeqalign(hsp, hsp->gap_info, numseg, query_length,
767
GapCollectDataForSeqalign(hsp, hsp->gap_info, 0, esp->size, query_length,
734
768
subject_length, translate1, translate2,
735
769
&start, &length, &strands, &start1, &start2);
737
771
/* Result of this function will be either den-seg or Std-seg
738
772
depending on translation options */
739
773
sap = s_GapMakeSeqAlign(query_id, subject_id, translate1, translate2,
740
numseg, length, start, strands);
774
esp->size, length, start, strands);
743
777
/* By request of Steven Altschul - we need to have
752
786
sap_disc->type = SAT_PARTIAL; /* ordered segments, over part of seq */
753
787
sap_disc->segtype = SAS_DISC; /* discontinuous alignment */
755
curr_last = hsp->gap_info;
756
789
sap_head = NULL; sap_tail = NULL;
757
while(curr_last != NULL) {
759
for (numseg = 0, curr = curr_last;
760
curr; curr = curr->next, numseg++) {
790
for (index=0; index<esp->size; index++)
793
Boolean skip_region = FALSE;
796
for (index2=first; index2<esp->size; index2++, numseg++) {
762
if(curr->op_type == eGapAlignDecline) {
798
if(esp->op_type[index2] == eGapAlignDecline) {
763
799
if(numseg != 0) { /* End of aligned area */
766
while(curr && curr->op_type == eGapAlignDecline) {
802
while (index2<esp->size && esp->op_type[index2] == eGapAlignDecline) {
770
806
skip_region = TRUE;
776
GapCollectDataForSeqalign(hsp, curr_last, numseg, query_length,
815
GapCollectDataForSeqalign(hsp, esp, first, numseg, query_length,
777
816
subject_length, translate1, translate2,
778
817
&start, &length, &strands, &start1,
783
821
s_GapMakeSeqAlign(query_id, subject_id, translate1,
784
822
translate2, numseg, length, start,
1276
/** Adjusts the offsets in a Seq-align list produced by a BLAST search
1277
* of multiple queries against multiple subjects. Seq-aligns in a list are
1278
* assumed to be sorted by query, and then by subject, i.e. all Seq-aligns for
1279
* the first query are at the beginning of the list, then all for the second
1280
* query, etc. Within a list for any single query, all Seq-aligns for one
1281
* subject are grouped together. The order of queries in the Seq-align list
1282
* must be the same as in the query Seq-loc list; the order of subjects
1283
* within a Seq-align list for a given query is the same as in the subject
1284
* Seq-loc list. Some or all queries or subjects might be skipped in the
1286
* @param head List of Seq-aligns [in|out]
1287
* @param query_slp List of query locations [in]
1288
* @param subject_slp List of subject locations [in]
1291
s_AdjustOffsetsInSeqAlign(SeqAlign* head, SeqLoc* query_slp,
1292
SeqLoc* subject_slp)
1294
SeqAlign* seqalign, *next_seqalign = NULL, *last_seqalign;
1295
SeqLoc* qslp, *sslp;
1296
SeqId* query_id, *subject_id;
1301
for (seqalign = head; seqalign; seqalign = next_seqalign) {
1302
query_id = TxGetQueryIdFromSeqAlign(seqalign);
1303
subject_id = TxGetSubjectIdFromSeqAlign(seqalign);
1305
/* Find Seq-locs corresponding to these query and subject. Start looking
1306
from the previously found Seq-locs, because Seq-aligns are sorted
1308
for ( ; qslp; qslp = qslp->next) {
1309
if (SeqIdComp(query_id, SeqLocId(qslp)) == SIC_YES)
1311
/* Changing query: return subject Seq-loc pointer to the beginning
1312
of the list of subject Seq-locs.*/
1315
for ( ; sslp; sslp = sslp->next) {
1316
if (SeqIdComp(subject_id, SeqLocId(sslp)) == SIC_YES)
1320
/* Find the first Seq-align that has either different subject or
1322
last_seqalign = seqalign;
1323
for (next_seqalign = seqalign->next; next_seqalign;
1324
next_seqalign = next_seqalign->next) {
1325
if ((SeqIdComp(subject_id, TxGetSubjectIdFromSeqAlign(next_seqalign))
1327
(SeqIdComp(query_id, TxGetQueryIdFromSeqAlign(next_seqalign))
1329
/* Unlink the Seq-align chain */
1330
last_seqalign->next = NULL;
1333
last_seqalign = next_seqalign;
1336
AdjustOffSetsInSeqAlign(seqalign, qslp, sslp);
1337
/* Reconnect the Seq-align chain */
1338
last_seqalign->next = next_seqalign;
1342
1314
Int2 BLAST_ResultsToSeqAlign(EBlastProgramType program_number,
1343
BlastHSPResults* results, SeqLocPtr query_slp,
1315
BlastHSPResults** results_ptr, SeqLocPtr query_slp,
1344
1316
ReadDBFILE* rdfp, SeqLoc* subject_slp,
1345
1317
Boolean is_gapped, Boolean is_ooframe,
1346
SeqAlignPtr* head_seqalign)
1318
SBlastSeqalignArray* *seqalign_arr)
1348
1320
Int4 query_index, subject_index;
1349
1321
SeqLocPtr slp = query_slp;
1350
1322
SeqIdPtr query_id, subject_id = NULL;
1351
BlastHitList* hit_list;
1352
BlastHSPList* hsp_list;
1353
SeqAlignPtr seqalign = NULL, last_seqalign = NULL;
1354
Int4 subject_length = 0;
1355
1323
SeqLoc** subject_loc_array = NULL;
1357
*head_seqalign = NULL;
1324
BlastHSPResults* results = NULL;
1326
if (results_ptr == NULL)
1329
results = *results_ptr;
1331
if (!results || results->num_queries <= 0)
1361
1334
if (!rdfp && !subject_slp)
1337
*seqalign_arr = SBlastSeqalignArrayNew(results->num_queries);
1338
if (*seqalign_arr == NULL)
1365
1343
subject_loc_array =
1366
1344
(SeqLoc**) malloc(ValNodeLen(subject_slp)*sizeof(SeqLoc*));
1371
1349
slp = query_slp;
1372
1350
for (query_index = 0; slp && query_index < results->num_queries;
1373
1351
++query_index, slp = slp->next) {
1374
hit_list = results->hitlist_array[query_index];
1352
SeqAlignPtr head_seqalign = NULL, last_seqalign = NULL;
1353
BlastHitList* hit_list = results->hitlist_array[query_index];
1377
1357
query_id = SeqLocId(slp);
1379
1359
for (subject_index = 0; subject_index < hit_list->hsplist_count;
1380
1360
++subject_index) {
1381
hsp_list = hit_list->hsplist_array[subject_index];
1361
SeqAlignPtr seqalign = NULL;
1362
Int4 subject_length = 0;
1363
BlastHSPList* hsp_list = hit_list->hsplist_array[subject_index];
1385
// Sort HSPs with e-values as first priority and scores as
1386
// tie-breakers, since that is the order we want to see them in
1367
/* Sort HSPs with e-values as first priority and scores as
1368
tie-breakers, since that is the order we want to see them in
1388
1370
Blast_HSPListSortByEvalue(hsp_list);