1
/* $RCSfile: blastclust.c,v $ $Revision: 6.28 $ $Date: 2001/11/16 20:34:17 $
1
static char const rcsid[] = "$Id: blastclust.c,v 6.45 2004/08/02 20:08:49 dondosha Exp $";
3
/* $RCSfile: blastclust.c,v $ $Revision: 6.45 $ $Date: 2004/08/02 20:08:49 $
2
4
* ===========================================================================
4
6
* PUBLIC DOMAIN NOTICE
26
28
* Author: Ilya Dondoshansky
28
30
* File Description:
29
* This is server which does neighboring searches
31
* Program to perform a single-linkage clustering of a set of protein or DNA
32
* sequences. See file README.bcl for description
31
34
* ---------------------------------------------------------------------------
32
35
* $Log: blastclust.c,v $
36
* Revision 6.45 2004/08/02 20:08:49 dondosha
37
* Write and read header integers separately instead of as a structure; changed first number from 1 byte to 4, to prevent padding with garbage
39
* Revision 6.44 2004/06/30 21:03:57 madden
40
* Add include for blfmtutl.h
42
* Revision 6.43 2004/05/03 19:59:48 dondosha
43
* Display version number when run with no arguments
45
* Revision 6.42 2004/02/24 15:18:15 dondosha
46
* Fixed several memory leaks for nucleotide clustering
48
* Revision 6.41 2004/02/18 20:42:59 dondosha
49
* Cleaned up how id strings are retrieved for output
51
* Revision 6.40 2004/02/18 15:18:46 dondosha
52
* Minor fix when freeing title strings
54
* Revision 6.39 2003/07/22 17:59:48 dondosha
55
* Added call to BlastErrorPrint to make warnings more informative
57
* Revision 6.38 2003/06/13 19:59:29 dondosha
58
* Fixed 2 purify errors
60
* Revision 6.37 2003/05/30 17:31:09 coulouri
63
* Revision 6.36 2003/05/13 16:02:42 coulouri
64
* make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
66
* Revision 6.35 2003/05/06 20:21:13 dondosha
69
* Revision 6.34 2003/05/06 20:19:45 dondosha
70
* Changed a confusing error message to a meaningful one
72
* Revision 6.33 2002/10/08 15:32:42 dondosha
73
* Corrected file description comment in the NCBI header
75
* Revision 6.32 2002/09/03 16:01:11 dondosha
76
* Introduced a restriction on the total length of concatenated sequences for nucleotide case
78
* Revision 6.31 2002/05/30 16:20:16 dondosha
79
* 1. Correction in evaluation of hits for DNA case,
80
* 2. Added word size parameter to make DNA clustering more flexible
82
* Revision 6.30 2002/03/14 21:07:11 dondosha
83
* Slight improvement in finding the used id list from a gi list when reclustering
85
* Revision 6.29 2002/02/11 21:32:10 dondosha
86
* Free Bioseqs after use to avoid accumulation
33
88
* Revision 6.28 2001/11/16 20:34:17 dondosha
34
89
* Bug fix: BioseqUnlock was missing; also added ObjMgrFreeCache calls
160
216
} ClusterParameters, PNTR ClusterParametersPtr;
162
typedef struct cluster_log_header
164
Boolean numeric_id_type;
166
} ClusterLogHeader, PNTR ClusterLogHeaderPtr;
168
218
static ClusterParametersPtr global_parameters;
170
220
typedef struct cluster_log_info
352
402
CharPtr PNTR id_list = NULL;
353
403
CharPtr ptr, id_string;
354
404
FloatHi length_coverage, score_coverage;
355
ClusterLogHeader header;
405
Uint4 header_size, numeric_id_type;
356
406
Int4 last_seq = -1;
357
407
CharPtr id = NULL;
358
408
Boolean PNTR used_id_index = NULL;
360
FileRead(&header, sizeof(ClusterLogHeader), 1, infofp);
410
FileRead(&numeric_id_type, sizeof(Int4), 1, infofp);
411
FileRead(&header_size, sizeof(Int4), 1, infofp);
362
if (header.numeric_id_type) {
363
num_queries = header.size;
413
if (numeric_id_type) {
414
num_queries = header_size;
364
415
gi_list = (Int4Ptr) MemNew(num_queries*sizeof(Int4));
365
416
FileRead(gi_list, sizeof(Int4), num_queries, infofp);
367
total_id_len = header.size;
418
total_id_len = header_size;
369
420
id_string = (CharPtr) MemNew(total_id_len+1);
370
421
FileRead(id_string, sizeof(Char), total_id_len, infofp);
407
458
FileRead(used_id_string, sizeof(Char), FILE_BUFFER_SIZE, idfp))) {
408
459
ptr = used_id_string;
410
id_str = (CharPtr) Malloc(10);
411
460
while ((id = StringTokMT(ptr, delimiter_list, &ptr))
413
462
if (ptr==NULL && length==FILE_BUFFER_SIZE) {
417
466
for (i=0; i<num_queries; i++) {
419
sprintf(id_str, "%ld", gi_list[i]);
422
if (!StrCmp(id_str, id))
467
if ((gi_list && atoi(id) == gi_list[i]) ||
468
(!gi_list && !StrCmp(id_list[i], id)))
425
471
if (i < num_queries)
426
472
used_id_index[i] = TRUE;
433
477
while ((num_hits = FileRead(info, sizeof(ClusterLogInfo), INFO_LIST_SIZE,
553
597
if (search->prog_number == blast_type_blastp)
554
598
subject_length = readdb_get_sequence(search->rdfp, search->subject_id, &subject);
555
599
else { /* Mega BLAST saves ncbi4na-encoded sequence */
556
subject = search->subject->sequence_start;
600
subject = search->subject->sequence_start + 1;
557
601
subject_length = search->subject->length;
559
603
hspcnt = current_hitlist->hspcnt;
565
609
id2 = search->subject_id;
567
if (id1 < id2) { /* Must be always true */
568
612
#define BUF_CHUNK_SIZE 1024
569
613
Int4 query_length, q_length, s_length;
570
614
FloatHi length_coverage, bit_score, score_coverage, perc_identity;
596
640
HeapSort(hsp_info_array, hspcnt, sizeof(BlastClustHspInfoPtr),
597
641
hsp_info_id_score_compare);
642
/* Leave only one highest scoring hsp per "query" sequence.
643
Deallocate memory for all others. */
598
644
for (index=1; index<hspcnt; index++) {
599
645
if (hsp_info_array[index]->id < id2 &&
600
646
hsp_info_array[index]->id !=
679
724
perc_identity += MegaBlastGetNumIdentical(query, subject, start[2*i], start[2*i+1], length[i], FALSE);
681
726
perc_identity = perc_identity / align_length * 100;
727
start = MemFree(start);
728
length = MemFree(length);
729
strands = MemFree(strands);
683
731
if (global_parameters->score_threshold < 3.0)
684
732
score_coverage = bit_score / (MAX(q_length, s_length));
773
/* Deallocate memory for the auxiliary array of HSP info
774
structures. Also free all gap information structures,
775
because they are not freed when hit list is cleaned. */
722
776
if (search->prog_number == blast_type_blastn) {
723
777
for (index=0; index<query_count; index++)
724
778
MemFree(hsp_info_array[index]);
725
779
hsp_info_array = MemFree(hsp_info_array);
781
for (index=0; index < current_hitlist->hspcnt; ++index) {
782
hsp = current_hitlist->hsp_array[index];
783
hsp->gap_info = GapXEditBlockDelete(hsp->gap_info);
728
if (global_parameters->logfp) {
787
if (global_parameters->logfp && loginfo) {
729
788
FileWrite(loginfo, sizeof(ClusterLogInfo), query_count,
730
789
global_parameters->logfp);
732
/*fflush(global_parameters->logfp);*/
790
loginfo = MemFree(loginfo);
791
fflush(global_parameters->logfp);
735
fprintf(stderr, "Error: this can't happen!\n");
794
/* This can't happen in normal situation. If it does, the most likely
795
reason is presense of non-unique identifiers in the input file */
796
ErrPostEx(SEV_FATAL, 1, 0, "Blastclust cannot process input files with"
797
" non-unique sequence identifiers\n");
774
839
{ "Enable id parsing in database formatting?", /* 13 */
775
840
"TRUE", NULL, NULL, FALSE, 'e', ARG_BOOLEAN, 0.0, 0, NULL},
776
841
{ "Configuration file with advanced options", /* 14 */
777
NULL, NULL, NULL, TRUE, 'c', ARG_FILE_IN, 0.0, 0, NULL}
842
NULL, NULL, NULL, TRUE, 'c', ARG_FILE_IN, 0.0, 0, NULL},
843
{ "Word size to use (0 for default: proteins 3, nucleotides 32)",/* 15 */
844
"0", NULL, NULL, FALSE, 'W', ARG_INT, 0.0, 0, NULL}
780
848
#define MAX_DB_SIZE 100000
781
849
#define MAX_GI_LENGTH 9
782
850
#define PROGRESS_INTERVAL 1000
783
851
#define MAX_NUM_QUERIES 16383 /* == 1/2 INT2_MAX */
852
#define MAX_TOTAL_LENGTH 5000000
801
870
Int8 total_length;
802
871
Int4Ptr gi_list = NULL, seq_len = NULL;
803
872
CharPtr PNTR id_list = NULL, id_string = NULL;
804
Boolean db_formatted = FALSE, numeric_id_type = TRUE;
873
Boolean db_formatted = FALSE;
874
Boolean numeric_id_type = TRUE;
805
875
Char db_file[BUFFER_SIZE];
806
ClusterLogHeader header;
807
876
Boolean print_progress, finish_incomplete, is_prot, parse_mode;
808
877
FDB_optionsPtr fdb_options;
809
878
Char timestr[24];
880
Int4 total_query_length;
881
CharPtr title = NULL;
882
Char buf[256] = { '\0' };
812
if (! GetArgs ("blastclust", NUMARG, myargs))
884
StringCpy(buf, "blastclust ");
885
StringNCat(buf, BlastGetVersionNumber(), sizeof(buf)-StringLen(buf)-1);
886
if (! GetArgs (buf, NUMARG, myargs))
815
889
UseLocalAsnloadDataAndErrMsg ();
833
907
if (progress_file != NULL &&
834
908
(progressfp = FileOpen(progress_file, "w")) == NULL) {
835
ErrPostEx(SEV_FATAL, 0, 0, "blastclust: Unable to open progress file %s\n",
909
ErrPostEx(SEV_FATAL, 1, 0, "blastclust: Unable to open progress file %s\n",
842
916
if (blast_outputfile != NULL) {
843
917
if ((outfp = FileOpen(blast_outputfile, "w")) == NULL) {
844
ErrPostEx(SEV_FATAL, 0, 0, "blastclust: Unable to open output file %s\n", blast_outputfile);
918
ErrPostEx(SEV_FATAL, 1, 0, "blastclust: Unable to open output file %s\n", blast_outputfile);
851
925
/* Non-empty string means only retrieve neighbors for reclustering */
852
926
if ((infofp = FileOpen(info_file, "rb")) == NULL) {
853
ErrPostEx(SEV_FATAL, 0, 0, "blastclust: Unable to open neighbors file %s for reading\n", info_file);
927
ErrPostEx(SEV_FATAL, 1, 0, "blastclust: Unable to open neighbors file %s for reading\n", info_file);
856
930
if (myargs[11].strvalue) {
857
931
if ((idfp = FileOpen(myargs[11].strvalue, "r")) == NULL) {
858
ErrPostEx(SEV_FATAL, 0, 0, "blastclust: Unable to open id file %s for reading\n", myargs[11].strvalue);
932
ErrPostEx(SEV_FATAL, 1, 0, "blastclust: Unable to open id file %s for reading\n", myargs[11].strvalue);
897
971
blast_database = blast_inputfile;
899
fdb_options = MemNew(sizeof(FDB_options));
900
fdb_options->db_file = StringSave(blast_inputfile);
901
fdb_options->is_protein = (Int4) is_prot;
902
fdb_options->parse_mode = (Int4) parse_mode;
903
fdb_options->base_name = StringSave(blast_database);
904
FastaToBlastDB(fdb_options, blast_database, 0);
973
fdb_options = FDBOptionsNew(blast_inputfile, is_prot, NULL, FALSE,
974
FALSE, FALSE, FALSE, TRUE, parse_mode,
975
blast_database, NULL, 0, 0, FORMATDB_VER,
977
FastaToBlastDB(fdb_options, 0);
906
MemFree(fdb_options->db_file);
907
MemFree(fdb_options->base_name);
908
MemFree(fdb_options);
979
fdb_options = FDBOptionsFree(fdb_options);
911
982
logfile = myargs[6].strvalue;
912
983
if (logfile) { /* Empty string means do not write log information */
913
984
if (finish_incomplete) {
914
985
if ((global_parameters->logfp = FileOpen(logfile, "ab+")) == NULL) {
915
ErrPostEx(SEV_FATAL, 0, 0, "blast: Unable to open log file %s for appending\n", logfile);
986
ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open log file %s for appending\n", logfile);
919
990
if ((global_parameters->logfp = FileOpen(logfile, "wb+")) == NULL) {
920
ErrPostEx(SEV_FATAL, 0, 0, "blast: Unable to open log file %s for writing\n", logfile);
991
ErrPostEx(SEV_FATAL, 1, 0, "blast: Unable to open log file %s for writing\n", logfile);
946
1017
options->number_of_cpus = myargs[1].intvalue;
948
1019
options->is_megablast_search = TRUE;
1020
options->gap_open = options->gap_extend = 0;
949
1021
options->wordsize = 32;
950
options->gap_open = options->gap_extend = 0;
951
1022
options->block_width = 0;
952
1023
options->window_size = 0;
1026
if (myargs[15].intvalue != 0)
1027
options->wordsize = myargs[15].intvalue;
955
1029
if (myargs[14].strvalue) {
956
1030
CharPtr string_options = NULL;
957
1031
Int2 string_options_len = 0;
960
1034
Char opt_line[BUFFER_SIZE+1];
962
1036
if ((config_file = FileOpen(myargs[14].strvalue, "r")) == NULL) {
963
ErrPostEx(SEV_FATAL, 0, 0,
1037
ErrPostEx(SEV_FATAL, 1, 0,
964
1038
"Cannot open advanced options configuration file %s\n",
965
1039
myargs[14].strvalue);
999
1073
root = (Int4Ptr) Malloc(num_queries*sizeof(Int4));
1001
ReadDBBioseqFetchEnable ("blastclust", blast_database, db_is_na, TRUE);
1003
1075
if (is_prot && finish_incomplete) {
1004
1076
first_seq = ReclusterFromFile(global_parameters->logfp, NULL, &gi_list,
1005
1077
&id_list, &seq_len, NULL);
1079
Uint4 header_size = 0;
1080
Uint4 header_numeric_id_type = 0;
1008
1082
for (index=0; index<num_queries; index++)
1009
1083
root[index] = index;
1013
1087
seq_len = (Int4Ptr) MemNew(num_queries*sizeof(Int4));
1015
1089
for (index=0; index<num_queries; index++) {
1016
readdb_get_descriptor(rdfp, index, &sip, NULL);
1090
readdb_get_descriptor(rdfp, index, &sip, &title);
1017
1091
seq_len[index] = readdb_get_sequence_length(rdfp, index);
1019
if (sip->choice == SEQID_LOCAL) {
1020
BioseqPtr bsp = BioseqLockById(sip);
1021
CharPtr title = StringSave(BioseqGetTitle(bsp));
1023
numeric_id_type = FALSE;
1024
id_list[index] = StringTokMT(title, " ", &title);
1027
GetAccessionFromSeqId(bsp->id, &gi_list[index],
1031
} else if (sip->choice != SEQID_GENERAL)
1093
if (sip->choice != SEQID_GENERAL ||
1094
StringCmp(((DbtagPtr)sip->data.ptrvalue)->db, "BL_ORD_ID")) {
1032
1095
numeric_id_type &=
1033
1096
GetAccessionFromSeqId(sip, &gi_list[index],
1034
1097
&id_list[index]);
1036
BioseqPtr bsp = BioseqLockById(sip);
1037
CharPtr title = StringSave(BioseqGetTitle(bsp));
1039
numeric_id_type = FALSE;
1040
id_list[index] = StringTokMT(title, " ", &title);
1099
CharPtr dummy_ptr = NULL;
1100
numeric_id_type = FALSE;
1101
if (title && *title != NULLB) {
1103
StringSave(StringTokMT(title, " ", &dummy_ptr));
1042
numeric_id_type = FALSE;
1043
1105
id_list[index] = (CharPtr) Malloc(BUFFER_SIZE+1);
1044
1106
SeqIdWrite(sip, id_list[index],
1045
1107
PRINTID_FASTA_SHORT, BUFFER_SIZE);
1049
sip = SeqIdSetFree(sip);
1110
title = MemFree(title);
1111
sip = SeqIdSetFree(sip);
1051
header.numeric_id_type = numeric_id_type;
1053
1114
if (numeric_id_type) {
1054
1115
id_list = MemFree(id_list);
1055
header.size = num_queries;
1116
header_size = num_queries;
1117
header_numeric_id_type = 1;
1057
1119
total_id_len = 0;
1058
1120
/* Check if some ids were gis and convert them to strings */
1069
1131
StringCat(id_string, id_list[i]);
1070
1132
StringCat(id_string, " ");
1072
header.size = total_id_len;
1134
header_size = total_id_len;
1075
FileWrite(&header, sizeof(ClusterLogHeader), 1,
1076
global_parameters->logfp);
1137
FileWrite(&header_numeric_id_type, sizeof(Uint4), 1,
1138
global_parameters->logfp);
1139
FileWrite(&header_size, sizeof(Uint4), 1, global_parameters->logfp);
1078
1141
if (numeric_id_type)
1079
1142
FileWrite(gi_list, sizeof(Int4), num_queries,
1080
1143
global_parameters->logfp);
1107
1170
num_left = num_queries;
1108
1171
while (num_left>0) {
1109
1172
num_bsps = MIN(num_left, MAX_NUM_QUERIES);
1110
for (index=0; index<num_bsps; index++)
1173
total_query_length = 0;
1174
for (index=0; index<num_bsps; index++) {
1111
1175
query_bsp_array[index] = readdb_get_bioseq(rdfp, index);
1176
total_query_length += query_bsp_array[index]->length;
1177
if (total_query_length > MAX_TOTAL_LENGTH)
1112
1181
query_bsp_array[num_bsps] = NULL;
1114
1183
head = BioseqMegaBlastEngine(query_bsp_array, blast_program,
1126
1195
/* Set up the search */
1127
1196
options->first_db_seq = index + 1;
1129
search = BLASTSetUpSearchWithReadDbInternal(NULL, query_bsp, blast_program, seq_len[index], blast_database, options, NULL, NULL, NULL, 0, rdfp);
1198
search = BLASTSetUpSearchWithReadDbInternal(NULL, query_bsp,
1199
blast_program, seq_len[index], blast_database,
1200
options, NULL, NULL, NULL, 0, rdfp);
1130
1201
if (search != NULL && !search->query_invalid) {
1131
1202
search->handle_results = PrintNeighbors;
1135
1206
search->thr_info->tick_callback = NULL;
1137
1208
do_the_blast_run(search);
1209
} else if (search) {
1210
BlastErrorPrint(search->error_return);
1139
1212
search = BlastSearchBlkDestruct(search);
1140
1213
query_bsp = BioseqFree(query_bsp);
1143
1215
if (print_progress && (index + 1)%PROGRESS_INTERVAL == 0) {
1144
1216
DayTimeStr(timestr, TRUE, TRUE);
1148
1220
} /* End of loop on queries */
1222
/*ReadDBBioseqFetchDisable();*/
1150
1223
rdfp = readdb_destruct(rdfp);
1151
1224
BlastClusterNeighbours(num_queries, seq_len, id_list, gi_list, NULL);