1
/* $Id: fdselect.c,v 6.3 2000/11/22 21:10:12 shavirin Exp $ */
2
/*****************************************************************************
6
National Center for Biotechnology Information
8
This software/database is a "United States Government Work" under the
9
terms of the United States Copyright Act. It was written as part of
10
the author's official duties as a United States Government employee
11
and thus cannot be copyrighted. This software/database is freely
12
available to the public for use. The National Library of Medicine and
13
the U.S. Government have not placed any restriction on its use or
16
Although all reasonable efforts have been taken to ensure the accuracy
17
and reliability of the software and data, the NLM and the U.S.
18
Government do not and cannot warrant the performance or results that
19
may be obtained by using this software or data. The NLM and the U.S.
20
Government disclaim all warranties, express or implied, including
21
warranties of performance, merchantability or fitness for any
24
Please cite the author in any work or product based on this material.
26
***************************************************************************
30
Author: Sergei B. Shavirin
32
Version Creation Date: 09/13/99
36
File Description: Create few subsets of FASTA database.
39
Revision 6.3 2000/11/22 21:10:12 shavirin
40
Added tax_id parameter into function FDBAddBioseq.
42
Revision 6.2 2000/03/13 18:37:37 madden
43
Added insert_ctrlA Boolean to readdb_get_bioseq_ex
45
Revision 6.1 1999/09/13 16:20:00 shavirin
49
*****************************************************************************/
54
/* Filtering requests are written in regular WWW (URL query-string)
   encoding and are evaluated against the database information file.

   The most general query string looks like:

   taxid=555,666,777&owner=5,6,7&div=AAA,BBB,CCC
*/
63
/* Field names looked up in the WWW-encoded query string by SRReadSRInfo. */
#define TAXID_LABEL "taxid"
64
#define OWNER_LABEL "owner"
65
#define DIV_LABEL "div"
66
#define DBNAME_LABEL "dbname"
68
/* Number of slots in each of the taxid/owner/div lists of SR_Info. */
#define MAX_SR_ELEMENTS 36
69
/* Size of the ndbp[] array of database subsets declared in the entry
   routine; also the upper bound of its configuration-file loop. */
#define MAX_FLT_DATABASES 64
71
/* Selection criteria parsed from one query string (filled by SRReadSRInfo,
   evaluated by CheckSRCondition).
   NOTE(review): other code in this file also uses a `dbname` member
   (srip->dbname); its declaration is not visible in this excerpt. */
typedef struct _SR_Info
73
/* Taxonomy ids; CheckSRCondition compares them with data[2] and stops at
   the first entry that is not > 0. */
Int4 taxid[MAX_SR_ELEMENTS];
74
/* Owner ids; compared with data[3], same termination rule. */
Int4 owner[MAX_SR_ELEMENTS];
75
/* Division codes: SRReadCharData copies at most 3 characters into each
   4-byte, zero-filled slot; an empty string ends the list. */
Char div[MAX_SR_ELEMENTS][4];
77
} SR_Info, PNTR SR_InfoPtr;
86
/* Work item describing one database subset: allocated in GetNdprByString
   (or directly in the entry routine) and handed to NewDBThread.
   NOTE(review): the member declarations are not visible in this excerpt.
   Code in this file uses ->glp (assigned from GiListNew()) and ->dbname
   (assigned from StringSave()). */
typedef struct _NewDBdata
90
} NewDBdata, PNTR NewDBdataPtr;
94
/* Command-line argument table passed to GetArgs("fdselect", ...).
   The rest of the file addresses entries by index, so the order matters:
   [0] -t  [1] -i  [2] -c  [3] -g  [4] -s  [5] -q  [6] -p  [7] -l */
Args flt_args[NUMARG] = {
95
/* [0] -t : title, copied into options->db_title by FDB_CreateCLOptions */
{ "Title for output database file",
96
NULL, NULL, NULL, TRUE, 't', ARG_STRING, 0.0, 0, NULL},
97
/* [1] -i : input database name (see the FLT_Input macro) */
{"Input file for the filtering (this parameter must be set)",
98
NULL, NULL,NULL,FALSE,'i',ARG_FILE_IN, 0.0,0,NULL},
99
/* [2] -c : configuration file, default "fdselect.cfg" */
{"Configuration file for database subsets creation",
100
"fdselect.cfg", NULL,NULL,TRUE,'c',ARG_FILE_IN, 0.0,0,NULL},
101
/* [3] -g : file with a list of gis */
{"Input file with a list of gis",
102
NULL, NULL,NULL,TRUE,'g',ARG_FILE_IN, 0.0,0,NULL},
103
/* [4] -s : sparse index flag, copied into options->sparse_idx */
{"Create sparse indexes in the filtered database",
104
"F", NULL,NULL,TRUE,'s',ARG_BOOLEAN, 0.0,0,NULL},
105
/* [5] -q : a single query string given on the command line */
{"Query string for creating database subset",
106
NULL, NULL,NULL,TRUE,'q',ARG_STRING, 0.0,0,NULL},
107
/* [6] -p : protein flag (see the IS_Protein macro) */
{"Input database is protein",
108
"T", NULL,NULL,TRUE,'p',ARG_BOOLEAN, 0.0,0,NULL},
110
/* [7] -l : log file, default "fdselect.log"; the description line of this
   entry is not visible in this excerpt. */
"fdselect.log", NULL,NULL,TRUE,'l',ARG_FILE_OUT, 0.0,0,NULL},
113
/* Shorthand for the value of the -i argument (input database name). */
#define FLT_Input flt_args[1].strvalue
114
/* Shorthand for the value of the -p flag (input database is protein). */
#define IS_Protein flt_args[6].intvalue
117
/* Releases the strings held by an FDB_options structure: db_title, db_file
   and LogFileName (the three fields FDB_CreateCLOptions fills with
   StringSave).
   NOTE(review): the remainder of the body is not visible in this excerpt,
   including whether `options` itself is freed and whether NULL is
   accepted. */
void FDB_optionsFree(FDB_optionsPtr options)
122
MemFree(options->db_title);
123
MemFree(options->db_file);
124
MemFree(options->LogFileName);
130
/* Splits the comma-separated list in `buffer` into at most MAX_SR_ELEMENTS
   short codes and copies the resulting table to `div_in`.

   buffer  comma-separated list of codes.
   div_in  destination; receives sizeof(Char[MAX_SR_ELEMENTS][4]) bytes.

   NOTE(review): `div_in` is declared CharPtr PNTR but is written as a flat
   Char[MAX_SR_ELEMENTS][4] table (the caller passes srip->div through a
   cast); the parameter type does not describe what is stored.
   NOTE(review): the return statements and any release of `tmp` are not
   visible in this excerpt. */
Boolean SRReadCharData(CharPtr buffer, CharPtr PNTR div_in)
132
CharPtr tmp, ch, ch2;
134
Char div[MAX_SR_ELEMENTS][4];
136
/* Parsing is done on the string returned by StringSave. */
tmp = StringSave(buffer);
137
/* Zero-fill so every slot is NUL-terminated even when StringNCpy copies
   the full 3 characters, and so unused slots read as empty strings. */
MemSet(div, NULLB, sizeof(div));
139
for(ch2 = tmp, j = 0; j < MAX_SR_ELEMENTS; j++) {
141
/* No further comma: ch2 points at the last element of the list. */
if((ch = StringChr(ch2, ',')) == NULL) {
142
StringNCpy(div[j], ch2, 3);
148
/* An element that is followed by a comma; only its first 3 characters
   are kept. */
StringNCpy(div[j], ch2, 3);
152
MemCpy(div_in, div, sizeof(div));
159
/* Parses the comma-separated list in `buffer` into the integer array `id`
   and writes -1 after the last element as an end marker.

   buffer  comma-separated list.
   id      destination array; callers in this file pass arrays of
           MAX_SR_ELEMENTS entries (srip->taxid, srip->owner).

   NOTE(review): the conversion and return statements are not visible in
   this excerpt.
   NOTE(review): `id[j+1]` with j == MAX_SR_ELEMENTS - 1 would address one
   element past such an array; confirm the hidden lines prevent this. */
Boolean SRReadIntData(CharPtr buffer, Int4Ptr id)
161
CharPtr tmp, ch, ch2;
163
/* Parsing is done on the string returned by StringSave. */
tmp = StringSave(buffer);
165
for(ch2 = tmp, j = 0; j < MAX_SR_ELEMENTS; j++) {
167
/* No further comma: ch2 points at the last element of the list. */
if((ch = StringChr(ch2, ',')) == NULL) {
169
id[j+1] = -1; /* End-of-list marker; readers stop at the first value <= 0 */
184
/* Builds an SR_Info from a WWW-encoded query string such as
   "taxid=1,2&owner=3&div=AAA&dbname=x".

   buffer  the query string.
   Returns a newly allocated SR_Info. Callers in this file treat a NULL
   result as failure; the return statements themselves are not visible in
   this excerpt.

   NOTE(review): the Boolean results of SRReadIntData/SRReadCharData are
   ignored, and release of `info_data` is not visible here. */
SR_InfoPtr SRReadSRInfo(CharPtr buffer)
187
WWWInfoDataPtr info_data;
191
/* Build a WWWInfoData by hand around a copy of the query string so the
   WWWGetValueByName lookups below can be used on it. */
info_data = (WWWInfoDataPtr) MemNew(sizeof(WWWInfoData));
192
info_data->query = StringSave(buffer);
193
info_data->entries = WWWGetEntries(&info_data->num_entries,
194
info_data->query, FALSE);
195
info = (VoidPtr) info_data;
197
srip = MemNew(sizeof(SR_Info));
199
/* Each field is optional; a missing field leaves the corresponding part
   of `srip` as MemNew returned it. */
if((chptr = WWWGetValueByName(info, TAXID_LABEL)) != NULL)
200
SRReadIntData(chptr, srip->taxid);
201
if((chptr = WWWGetValueByName(info, OWNER_LABEL)) != NULL)
202
SRReadIntData(chptr, srip->owner);
203
if((chptr = WWWGetValueByName(info, DIV_LABEL)) != NULL)
204
SRReadCharData(chptr, (CharPtr PNTR) srip->div);
205
/* NOTE(review): unbounded copy; the size of srip->dbname is not visible
   in this excerpt. */
if((chptr = WWWGetValueByName(info, DBNAME_LABEL)) != NULL)
206
StringCpy(srip->dbname, chptr);
208
/* A query that supplies no taxid, no owner and no division is an error.
   `**srip->div` is the first character of the first division code. */
209
if(srip->taxid[0] <= 0 && srip->owner[0] <= 0 && **srip->div == NULLB) {
210
ErrPostEx(SEV_ERROR, 0, 0,
211
"No valid conditions exists in query string");
219
/* Allocates an FDB_options structure and fills it from the command-line
   arguments in flt_args.

   is_prot  stored in options->is_protein.
   Returns the new structure (the return statement is not visible in this
   excerpt); NewDBThread treats NULL as failure. The strings stored here
   are the ones released by FDB_optionsFree. */
FDB_optionsPtr FDB_CreateCLOptions(Boolean is_prot)
221
FDB_optionsPtr options;
224
options = MemNew(sizeof(FDB_options));
226
/* -t argument */
options->db_title = StringSave(flt_args[0].strvalue);
228
/* Default output name is "<input>.flt"; NewDBThread replaces it with the
   per-subset name.
   NOTE(review): unbounded sprintf; the declaration (and size) of `buffer`
   is not visible in this excerpt. */
sprintf(buffer, "%s.flt", FLT_Input);
229
options->db_file = StringSave(buffer);
231
/* -l argument */
options->LogFileName = StringSave(flt_args[7].strvalue);
232
options->is_protein = is_prot;
233
options->parse_mode = TRUE;
234
options->isASN = FALSE;
235
options->asnbin = FALSE;
236
options->is_seqentry = FALSE;
237
options->base_name = NULL;
238
options->dump_info = FALSE;
239
/* -s argument */
options->sparse_idx = flt_args[4].intvalue;
244
/* Initial number of entries in a GiList's seq_num array, and the amount by
   which that array grows each time it fills up. */
#define GI_ALLOC_CHUNK 1024
246
/* NOTE(review): almost all of this function is missing from this excerpt.
   The visible tail is a do/while loop that repeats until `sip` becomes
   NULL; what is done per iteration and what is returned cannot be
   determined from here. */
SeqIdPtr MySeqIdFree(SeqIdPtr sip)
253
} while(sip != NULL);
258
/* Functions used in filtering by gi number */
259
/* Allocates a GiList whose seq_num array has room for GI_ALLOC_CHUNK
   entries.
   NOTE(review): `count` is not set in the visible lines; presumably MemNew
   returns zeroed memory - confirm. The return statement is not visible in
   this excerpt. */
GiListPtr GiListNew(void)
263
glp = MemNew(sizeof(GiList));
264
glp->allocated = GI_ALLOC_CHUNK;
265
glp->seq_num = MemNew(sizeof(Int4) * glp->allocated);
270
/* Releases the seq_num array of a GiList.
   NOTE(review): whether `glp` itself is freed, and whether NULL is
   accepted, is in lines not visible in this excerpt. */
void GiListFree(GiListPtr glp)
275
MemFree(glp->seq_num);
280
/* Reads gi numbers from a text file, maps each one to a sequence number in
   the open database, and stores the result in glp->seq_num.

   rdfp      open database handle used for the gi -> sequence lookup.
   glp       list to fill; grown in GI_ALLOC_CHUNK steps as needed.
   filename  text file containing the gi numbers.

   NOTE(review): return statements, the increment of glp->count and the
   closing of `fd` are not visible in this excerpt. */
Boolean ReadGiList(ReadDBFILEPtr rdfp, GiListPtr glp, CharPtr filename)
283
Int4 gi, retvalue, seqnum;
285
if((fd = FileOpen(filename, "r")) == NULL)
288
/* NOTE(review): "%d" stores through an int pointer; this assumes Int4 is
   int on the target platform. */
while((retvalue = fscanf(fd, "%d", &gi)) != EOF) {
289
/* NOTE(review): a return of 0 means the next token is not a number.
   fscanf does not consume it, so `continue` retries the same input and
   the loop cannot make progress on a non-numeric token. */
if(retvalue == 0) continue;
291
/* Array full: extend it by one chunk before storing. */
if(glp->count >= glp->allocated) {
292
glp->allocated += GI_ALLOC_CHUNK;
293
/* NOTE(review): the result is assigned straight back, so the old block
   is lost if Realloc fails. */
glp->seq_num = Realloc(glp->seq_num,
294
sizeof(Int4) * glp->allocated);
297
seqnum = readdb_gi2seq(rdfp, gi);
300
/* Warning for a gi that could not be mapped; the condition guarding it is
   not visible in this excerpt. */
ErrPostEx(SEV_WARNING, 0,0, "Gi %d is not found", gi);
304
glp->seq_num[glp->count] = seqnum;
312
/* Tests one record of the database information file against the selection
   criteria in `srip`: data[2] is compared with the tax_id list, data[3]
   with the owner list, and `div` with the division list. A list whose
   first element is empty (value <= 0, or empty string) is not checked.

   NOTE(review): how the three checks are combined into the result is in
   lines not visible in this excerpt. */
313

314
Boolean CheckSRCondition(SR_InfoPtr srip, Int4Ptr data, CharPtr div)
318
Boolean cond_ok = FALSE;
320
/* checking tax_id */
322
if(srip->taxid[0] > 0) { /* At least one element exists */
324
/* NOTE(review): here and in the two loops below the array element is
   read before `i < MAX_SR_ELEMENTS` is tested, so a completely full list
   is read one element past its end. */
for(i = 0; srip->taxid[i] > 0 && i < MAX_SR_ELEMENTS; i++) {
325
if(data[2] == srip->taxid[i]) {
336
/* checking owner */
if(srip->owner[0] > 0) { /* At least one element exists */
338
for(i = 0; srip->owner[i] > 0 && i < MAX_SR_ELEMENTS; i++) {
339
if(data[3] == srip->owner[i]) {
348
/* checking division */
350
if(*srip->div[0] != NULLB) { /* At least one element exists */
352
for(i = 0; *srip->div[i] != NULLB && i < MAX_SR_ELEMENTS; i++) {
353
/* StringCmp returns 0 on an exact match. */
if(!StringCmp(div, srip->div[i])) {
365
/* Scans the database information file line by line and appends data[0] of
   every record accepted by CheckSRCondition to glp->seq_num.

   rdfp      database handle (not used in the visible lines).
   srip      selection criteria.
   glp       list to fill; grown in GI_ALLOC_CHUNK steps as needed.
   filename  path of the information file.

   NOTE(review): return statements, the increment of glp->count, the
   closing of `fd` and the declarations of `buffer` and `div` are not
   visible in this excerpt. */
Boolean FDGetGiListByQuery(ReadDBFILEPtr rdfp, SR_InfoPtr srip,
366
GiListPtr glp, CharPtr filename)
369
Int4 length, data[7];
373
if((fd = FileOpen(filename, "r")) == NULL) {
374
ErrPostEx(SEV_ERROR, 0,0, "Unable to open input info file");
378
length = sizeof(buffer);
379
while(fgets(buffer, length, fd) != NULL) {
380
/* Record layout: four integers, the division string, three integers.
   NOTE(review): the sscanf result is not checked, so a short or malformed
   line leaves stale or uninitialised values in data[]/div; "%s" has no
   field width, so a long token can overrun `div`. */
sscanf(buffer, "%d %d %d %d %s %d %d %d",
381
&data[0], &data[1], &data[2], &data[3],
382
div, &data[4], &data[5], &data[6]);
384
/* If the line satisfies the condition, its data[0] is added to the list */
385
if(CheckSRCondition(srip, data, div)) {
386
/* Array full: extend it by one chunk before storing. */
if(glp->count >= glp->allocated) {
387
glp->allocated += GI_ALLOC_CHUNK;
388
/* NOTE(review): the result is assigned straight back, so the old block
   is lost if Realloc fails. */
glp->seq_num = Realloc(glp->seq_num,
389
sizeof(Int4) * glp->allocated);
391
glp->seq_num[glp->count] = data[0];
401
/* Thread entry point that writes one filtered database.

   data  a NewDBdataPtr holding the sequence-number list (glp) and the
         output database name (dbname).

   The routine opens its own readdb handle on the input database, creates
   formatdb options whose output name is ndbp->dbname, feeds every listed
   sequence to FDBAddBioseq, and then releases the list, the formatdb
   handle, the readdb handle and the options.

   NOTE(review): return statements and the bodies of the error branches are
   not visible in this excerpt. */
VoidPtr NewDBThread(VoidPtr data)
405
FDB_optionsPtr options;
410
if((ndbp = data) == NULL)
413
/* A separate handle is opened here; the caller's handle is not used. */
if((rdfp = readdb_new (FLT_Input, IS_Protein)) == NULL) {
414
ErrPostEx(SEV_ERROR, 0, 0,
415
"Failure to intialise database %s", FLT_Input);
419
/* ---------------------------------------------- */
420
/* ------- Initializing thread options --------- */
421
/* ---------------------------------------------- */
423
if((options = FDB_CreateCLOptions(IS_Protein)) == NULL)
426
/* Replace the default "<input>.flt" output name with this subset's name. */
MemFree(options->db_file);
427
options->db_file = StringSave(ndbp->dbname);
429
/* ---------------------------------------------- */
430
/* ----- Initializing formatdb structure ------- */
431
/* ---------------------------------------------- */
433
if ((fdbp = FormatDBInit(options)) == NULL)
436
/* ---------------------------------------------- */
437
/* ---------------- Main loop ------------------- */
438
/* ---------------------------------------------- */
440
for(i = 0; i < ndbp->glp->count; i++) {
442
/* -1 entries get special handling in a line not visible here; presumably
   they mark gis that ReadGiList could not map and are skipped - confirm. */
if((seqnum = ndbp->glp->seq_num[i]) == -1)
445
bsp = readdb_get_bioseq_ex(rdfp, seqnum, FALSE, FALSE);
446
/* Third argument is presumably the tax_id parameter mentioned in the
   revision log of this file; 0 is passed for every sequence. */
FDBAddBioseq(fdbp, bsp, 0);
448
/* The Bioseq is released piecewise instead of with BioseqFree. */
SeqIdSetFree(bsp->id);
449
BioseqFreeComponents(bsp);
452
/* BioseqFree(bsp); */
455
GiListFree(ndbp->glp);
457
if(FormatDBClose(fdbp))
460
readdb_destruct(rdfp);
461
FDB_optionsFree(options);
466
/* Builds the work item for one database subset from a query string:
   parses the string with SRReadSRInfo, allocates a NewDBdata with an empty
   GiList, and fills the list from the information file `indname` with
   FDGetGiListByQuery.

   string  WWW-encoded query string.
   rdfp    open database handle, passed on to FDGetGiListByQuery.
   (The rest of the parameter list is not visible in this excerpt; the
   body uses `indname` as the information file name.)

   NOTE(review): return statements and cleanup on the failure branches are
   not visible in this excerpt. */
NewDBdataPtr GetNdprByString(CharPtr string, ReadDBFILEPtr rdfp,
472
if((srip = SRReadSRInfo(string)) == NULL)
475
ndbp = MemNew(sizeof(NewDBdata));
476
ndbp->glp = GiListNew();
478
if(!FDGetGiListByQuery(rdfp, srip, ndbp->glp, indname))
481
/* If the query names a database, that name replaces the default one */
482
if(*srip->dbname != NULLB) {
483
ndbp->dbname = StringSave(srip->dbname);
488
/* Scans `buffer` up to its terminating NUL and tests every character with
   IS_WHITESP. The entry routine calls it on the -q query string before
   parsing.
   NOTE(review): what is done with whitespace characters, and the role of
   the index `i`, are in lines not visible in this excerpt. */
void TrimSpaces(CharPtr buffer)
496
for(chptr = buffer, i = 0; *chptr != NULLB; chptr++) {
497
if(IS_WHITESP(*chptr))
508
/* Body of the program entry routine (its header line is not visible in
   this excerpt). It reads the arguments, opens the log and the input
   database, builds one NewDBdata per requested subset, and starts one
   NewDBThread per subset. The subsets come from one of three sources,
   tested in this order: the -g gi list, the -q query string, or otherwise
   every line of the -c configuration file. */
Int4 i, glp_count, buflen;
509
NewDBdataPtr ndbp[MAX_FLT_DATABASES];
510
Char indname[128], buffer[1024];
514
if ( !GetArgs ("fdselect", NUMARG, flt_args) )
517
if ( !ErrSetLog (flt_args[7].strvalue) ) { /* Logfile */
520
ErrSetOpts (ERR_CONTINUE, ERR_LOG_ON);
523
/* ---------------------------------------------- */
524
/* ------ Initializing readdb structures -------- */
525
/* ---------------------------------------------- */
527
if((rdfp = readdb_new (FLT_Input, IS_Protein)) == NULL) {
528
ErrPostEx(SEV_ERROR, 0, 0,
529
"Failure to intialise database %s", FLT_Input);
533
/* Information file name: "<input>.pdi" for protein input, "<input>.ndi"
   otherwise.
   NOTE(review): unbounded sprintf into a 128-byte buffer; a long -i value
   overruns `indname`. */
sprintf(indname, "%s.%cdi", FLT_Input, IS_Protein? 'p' : 'n');
535
if(flt_args[3].strvalue != NULL) {/* This is list of gis */
537
ndbp[0] = MemNew(sizeof(NewDBdata));
538
ndbp[0]->glp = GiListNew();
541
/* NOTE(review): this branch is selected by flt_args[3] (the -g gi list),
   but the file handed to ReadGiList is flt_args[2] (the -c configuration
   file, default "fdselect.cfg"). This looks like a wrong index -
   confirm. */
if(!ReadGiList(rdfp, ndbp[0]->glp, flt_args[2].strvalue))
544
} else if(flt_args[5].strvalue != NULL) {/* This is string for gis */
546
TrimSpaces(flt_args[5].strvalue);
548
ndbp[0] = GetNdprByString(flt_args[5].strvalue, rdfp, indname);
553
/* Default behaviour is to read and interpret every line of the
554
configuration file */
556
if((fd = FileOpen(flt_args[2].strvalue, "r")) == NULL) {
557
ErrPostEx(SEV_ERROR, 0, 0, "Failure to open config file %s",
558
flt_args[2].strvalue);
561
buflen = sizeof(buffer);
/* At most MAX_FLT_DATABASES subsets are accepted; glp_count is advanced
   in lines not visible in this excerpt. */
562
for(glp_count = 0; fgets(buffer, buflen, fd) != NULL &&
563
glp_count < MAX_FLT_DATABASES;) {
570
ndbp[glp_count] = GetNdprByString(buffer, rdfp, indname);
575
/* This handle is closed before the threads start; each NewDBThread opens
   its own handle on the input database. */
readdb_destruct(rdfp);
577
/* NOTE(review): the assignment of glp_count in the -g and -q branches,
   and any wait for the threads to finish, are not visible in this
   excerpt. */
for(i = 0; i < glp_count; i++) {
578
NlmThreadCreate(NewDBThread, (VoidPtr) ndbp[i]);