2
* ===========================================================================
5
* National Center for Biotechnology Information (NCBI)
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's official duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government do not place any restriction on its use or reproduction.
13
* We would, however, appreciate having the NCBI and the author cited in
14
* any work or product based on this material
16
* Although all reasonable efforts have been taken to ensure the accuracy
17
* and reliability of the software and data, the NLM and the U.S.
18
* Government do not and cannot warrant the performance or results that
19
* may be obtained by using this software or data. The NLM and the U.S.
20
* Government disclaim all warranties, express or implied, including
21
* warranties of performance, merchantability or fitness for any particular
* purpose.
24
* ===========================================================================
26
* File Name: gi2accn.c
28
* Author: Jonathan Kans
30
* Version Creation Date: 4/15/02
37
* --------------------------------------------------------------------------
38
* Date Name Description of modification
39
* ------- ---------- -----------------------------------------------------
42
* ==========================================================================
58
/* Seq-id visitor callback: rewrites a GI Seq-id in place using the Seq-id
 * that GetSeqIdForGI returns for that GI.
 *   sip      - Seq-id being visited; ignored unless its choice is SEQID_GI.
 *   userdata - not referenced in the visible lines.
 * NOTE(review): this listing omits some original lines (declarations,
 * braces, and the body of the GIBBSQ/GIBBMT/GI branch), so the comments
 * below describe only the statements that are shown. */
static void ConvertGiToAccn (SeqIdPtr sip, Pointer userdata)
64
/* nothing to do for a missing id or for anything that is not a GI */
if (sip == NULL) return;
65
if (sip->choice != SEQID_GI) return;
66
gi = sip->data.intvalue;
67
/* look up the replacement Seq-id for this GI */
newsip = GetSeqIdForGI (gi);
68
if (newsip == NULL) return;
69
/* replacement ids of these three kinds get separate handling; the branch
 * body is not visible here - presumably the GI is left as is, TODO confirm */
if (newsip->choice == SEQID_GIBBSQ ||
70
newsip->choice == SEQID_GIBBMT ||
71
newsip->choice == SEQID_GI) {
75
SeqIdStripLocus (newsip);
76
/* move the new id's choice and payload pointer into the visited node ... */
sip->choice = newsip->choice;
77
sip->data.ptrvalue = newsip->data.ptrvalue;
78
/* ... and blank out newsip so it no longer refers to the payload that
 * sip now holds */
newsip->choice = SEQID_NOT_SET;
79
newsip->data.ptrvalue = NULL;
83
/* Applies ConvertGiToAccn to every Seq-id that VisitSeqIdsInSeqAlign
 * reports for sap, passing NULL as the callback's userdata.
 * The parameter list is cut off in this listing. */
static void UpdateAligns (
89
VisitSeqIdsInSeqAlign (sap, NULL, ConvertGiToAccn);
92
/* Reports whether the GI behind sip appears in gilist.
 *   sip    - Seq-id to test; NULL yields FALSE.
 *   gilist - array of GIs to search; the caller in this file sorts it with
 *            HeapSort before passing it down.
 *   count  - not referenced in the visible lines; presumably the number of
 *            entries in gilist - TODO confirm against the omitted lines.
 * Returns TRUE on a match, FALSE on every early-exit path shown below.
 * NOTE(review): the search loop is mostly missing from this listing; the
 * surviving mid/R lines look like a binary search - confirm. */
static Boolean IsSipMrna (SeqIdPtr sip, Int4Ptr gilist, Int4 count)
99
if (sip == NULL) return FALSE;
100
/* an id that BioseqFind can resolve is never reported as mRNA */
bsp = BioseqFind (sip);
101
if (bsp != NULL) return FALSE;
102
/* take the GI straight from the id when it is one, otherwise (on a line
 * not fully shown) ask GetGIForSeqId for it */
if (sip->choice == SEQID_GI) {
103
gi = (Int4) sip->data.intvalue;
105
gi = GetGIForSeqId (sip);
107
/* values below 1 are treated as "no GI available" */
if (gi < 1) return FALSE;
114
if (gilist [mid] < gi) {
121
if (gilist [R] == gi) return TRUE;
126
/* Reports whether any Seq-id referenced by an alignment passes IsSipMrna.
 *   align  - alignment to inspect; NULL yields FALSE.
 *   gilist - GI array forwarded unchanged to IsSipMrna.
 *   count  - forwarded unchanged to IsSipMrna.
 * Returns TRUE as soon as one id matches.
 * NOTE(review): loops over the segment chains, closing braces and the
 * final return are not visible in this listing. */
static Boolean IsMrnaAlignment (SeqAlignPtr align, Int4Ptr gilist, Int4 count)
135
if (align == NULL) return FALSE;
137
/* segtype 1: segs is handled as a DenseDiag, ids hang off ddp->id */
if (align->segtype == 1) {
138
ddp = (DenseDiagPtr) align->segs;
140
for (sip = ddp->id; sip != NULL; sip = sip->next) {
141
if (IsSipMrna (sip, gilist, count)) return TRUE;
144
/* segtype 2: segs is handled as a DenseSeg, ids hang off dsp->ids */
} else if (align->segtype == 2) {
145
dsp = (DenseSegPtr) align->segs;
147
for (sip = dsp->ids; sip != NULL; sip = sip->next) {
148
if (IsSipMrna (sip, gilist, count)) return TRUE;
151
/* segtype 3: segs is handled as a StdSeg; each location in ssp->loc is
 * reduced to its Seq-id with SeqLocId before testing */
} else if (align->segtype == 3) {
152
ssp = (StdSegPtr) align->segs;
154
for (tloc = ssp->loc; tloc != NULL; tloc = tloc->next) {
155
sip = SeqLocId (tloc);
156
if (IsSipMrna (sip, gilist, count)) return TRUE;
163
/* Creates a new SeqAnnot and prepares two user-object descriptors for it:
 * one labelled "Blast Type" with a field "BLASTN - mrna" (integer value 1),
 * and one labelled "Hist Seqalign" with a field "Hist Seqalign" (boolean
 * TRUE).
 * NOTE(review): the statements that attach each ObjectId / UserField to
 * its owner, set the annot type, and return the result are not visible in
 * this listing; allocation results are not checked in the lines shown. */
static SeqAnnotPtr MakeMrnaSeqAnnot (void)
172
annot = SeqAnnotNew ();
175
/* first descriptor: "Blast Type" user object */
adp = ValNodeNew (NULL);
176
adp->choice = Annot_descr_user;
177
uop = UserObjectNew ();
178
adp->data.ptrvalue = uop;
179
oip = ObjectIdNew ();
180
oip->str = StringSave ("Blast Type");
181
ufp = UserFieldNew ();
184
/* label and value of the single field in that user object */
oip = ObjectIdNew ();
185
oip->str = StringSave ("BLASTN - mrna");
188
ufp->data.intvalue = 1;
191
/* second descriptor: "Hist Seqalign" user object */
adp = ValNodeNew (NULL);
192
adp->choice = Annot_descr_user;
193
uop = UserObjectNew ();
194
adp->data.ptrvalue = uop;
195
oip = ObjectIdNew ();
196
oip->str = StringSave ("Hist Seqalign");
197
ufp = UserFieldNew ();
200
/* the field carries the same label as its user object */
oip = ObjectIdNew ();
201
oip->str = StringSave ("Hist Seqalign");
204
ufp->data.boolvalue = TRUE;
205
/* chain whatever annot->desc currently holds behind this descriptor */
adp->next = annot->desc;
211
/* Walks a chain of alignments and unlinks those that IsMrnaAlignment
 * accepts, collecting them under a SeqAnnot built by MakeMrnaSeqAnnot.
 *   sap      - first alignment of the chain.
 *   prevlink - address of the pointer that currently refers to sap; it is
 *              overwritten to bypass an extracted alignment.
 *   gilist   - GI array forwarded to IsMrnaAlignment.
 *   count    - forwarded to IsMrnaAlignment.
 * annot starts as NULL; presumably that is what is returned when nothing
 * matches - the return statement is not visible, TODO confirm.
 * NOTE(review): several lines are omitted (saving the next pointer, the
 * test guarding MakeMrnaSeqAnnot, storing sap into annot->data, the else
 * branch header), so statement order here must not be rearranged. */
static SeqAnnotPtr ExtractBlastMrna (SeqAlignPtr sap, Pointer PNTR prevlink, Int4Ptr gilist, Int4 count)
214
SeqAnnotPtr annot = NULL;
217
while (sap != NULL) {
220
if (IsMrnaAlignment (sap, gilist, count)) {
221
/* bypass sap in the source chain */
*prevlink = sap->next;
225
annot = MakeMrnaSeqAnnot ();
228
/* put sap in front of the alignments already held by annot */
sap->next = annot->data;
233
/* alignment stays in the chain: record its back-link and make its own
 * next field the link that a later removal would patch */
sap->idx.prevlink = prevlink;
234
prevlink = (Pointer PNTR) &(sap->next);
243
/* Seq-id visitor callback used for the counting pass.
 *   sip      - Seq-id being visited; only SEQID_GI ids are considered.
 *   userdata - interpreted as a pointer to an Int4 counter.
 * The statement that updates the counter is not visible in this listing;
 * presumably it increments *countp - TODO confirm. */
static void CountGIsInAligns (SeqIdPtr sip, Pointer userdata)
248
if (sip == NULL || sip->choice != SEQID_GI || userdata == NULL) return;
249
countp = (Int4Ptr) userdata;
253
/* Accumulator handed as userdata to StoreGIsFromAligns.  The member
 * declarations are omitted from this listing; code elsewhere in the file
 * accesses members named uidlist and count. */
typedef struct strgi {
256
} UidData, PNTR UidDataPtr;
258
/* Seq-id visitor callback used for the collecting pass.
 *   sip      - Seq-id being visited; only SEQID_GI ids are stored.
 *   userdata - interpreted as a UidDataPtr.
 * No bounds check is visible: the callback relies on uidlist having room
 * for every GI it will be handed.  The statement that advances udp->count
 * is not shown in this listing - TODO confirm it follows the store. */
static void StoreGIsFromAligns (SeqIdPtr sip, Pointer userdata)
263
if (sip == NULL || sip->choice != SEQID_GI || userdata == NULL) return;
264
udp = (UidDataPtr) userdata;
265
/* write the GI into the slot indexed by the current count */
udp->uidlist [udp->count] = (Int4) sip->data.intvalue;
269
/* Comparison callback passed to HeapSort for arrays of Int4.
 *   a, b - pointers to the two elements being compared.
 * Returns 0 when either pointer is NULL, otherwise the difference
 * (*a - *b): negative, zero or positive as *a is below, equal to or
 * above *b.
 * NOTE(review): the subtraction can overflow when the operands have
 * opposite signs and large magnitudes; harmless if all values are small
 * positive GIs, but an explicit three-way compare would be safer. */
static int LIBCALLBACK SortByInt4 (VoidPtr a, VoidPtr b)
272
if (a == NULL || b == NULL) return 0;
273
return (*(const Int4 *) a) - (*(const Int4 *) b);
276
/* Queries Entrez2 for which of the supplied GIs satisfy
 * "biomol_mrna [PROP]" in the "nucleotide" database, then hands the
 * sorted result to ExtractBlastMrna to pull matching alignments out of
 * sap.
 *   sap     - annotation whose data is treated as a chain of alignments.
 *   uidlist - GIs to send with the query.
 *   count   - number of entries in uidlist as passed to the request.
 * NOTE(review): some declarations and assignments (reply, count, e2id,
 * gilist, i, annot) are on lines omitted from this listing. */
static void FindBlastMrna (SeqAnnotPtr sap, Int4Ptr uidlist, Int4 count)
280
Entrez2BooleanReplyPtr e2br;
281
Entrez2IdListPtr e2id;
282
Entrez2RequestPtr e2rq;
283
Entrez2ReplyPtr e2ry;
288
/* build the boolean request: property term, AND operator, uid list */
e2rq = EntrezCreateBooleanRequest (TRUE, FALSE, "nucleotide", NULL, 0, 0, NULL, 0, 0);
289
EntrezAddToBooleanRequest (e2rq, "biomol_mrna [PROP]", 0, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
290
EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_AND, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
291
EntrezAddToBooleanRequest (e2rq, NULL, 0, NULL, NULL, NULL, 0, count, uidlist, NULL, TRUE, TRUE);
292
/* run the query; the request is released whether or not it succeeded */
e2ry = EntrezSynchronousQuery (e2rq);
293
e2rq = Entrez2RequestFree (e2rq);
294
if (e2ry == NULL) return;
296
/* NOTE(review): this return leaves without any visible release of
 * e2ry - possible leak, confirm */
if (reply == NULL || reply->choice != E2Reply_eval_boolean) return;
297
e2br = EntrezExtractBooleanReply (e2ry);
298
if (e2br == NULL) return;
300
/* NOTE(review): count is presumably reloaded from the reply on an
 * omitted line.  This return and the next one bypass the
 * Entrez2BooleanReplyFree call at the end - possible leak of e2br */
if (count < 1) return;
302
if (e2id == NULL || e2id->num < 1 || e2id->uids == NULL) return;
304
/* one spare slot is allocated beyond the e2id->num entries */
gilist = (Int4Ptr) MemNew (sizeof (Int4) * (e2id->num + 1));
305
if (gilist != NULL) {
307
/* rewind the uid byte store and copy every uid into the array */
BSSeek (e2id->uids, 0, SEEK_SET);
308
for (i = 0; i < e2id->num; i++) {
309
/* NOTE(review): a Uint4 read is stored into an Int4 array element */
gilist [i] = Nlm_BSGetUint4 (e2id->uids);
311
/* sort ascending by SortByInt4 before the list is searched */
HeapSort (gilist, e2id->num, sizeof (Int4), SortByInt4);
313
/* sap->data is both the head of the chain and the first link to patch */
annot = ExtractBlastMrna ((SeqAlignPtr) sap->data, (Pointer PNTR) &(sap->data), gilist, e2id->num);
315
/* the new annotation takes over sap's successor; presumably it is then
 * linked in right after sap and guarded by a NULL test - lines not shown */
annot->next = sap->next;
322
Entrez2BooleanReplyFree (e2br);
325
/* Collects the GI Seq-ids referenced by an annotation and passes them to
 * FindBlastMrna.
 *   sap - annotation to process; ignored when NULL or when its type is
 *         not 2.
 * Works in two visitor passes: CountGIsInAligns sizes the array, then
 * StoreGIsFromAligns fills it.
 * NOTE(review): initialisation of count and ud.count, the body of the
 * de-duplication loop and any release of uidlist are on lines omitted
 * from this listing. */
static void ProcessBlastNR (SeqAnnotPtr sap)
336
if (sap == NULL || sap->type != 2) return;
338
/* pass 1: count GI ids */
VisitSeqIdsInSeqAnnot (sap, (Pointer) &count, CountGIsInAligns);
339
if (count < 1) return;
340
/* room for count entries plus one spare */
uidlist = (Int4Ptr) MemNew (sizeof (Int4) * (count + 1));
341
if (uidlist == NULL) return;
344
/* pass 2: store GI ids into the array */
ud.uidlist = uidlist;
345
VisitSeqIdsInSeqAnnot (sap, (Pointer) &ud, StoreGIsFromAligns);
347
HeapSort (uidlist, count, sizeof (Int4), SortByInt4);
349
/* remove duplicates from the sorted GI list (loop body not shown;
 * presumably it compacts in place using j as the write index) */
352
for (i = 0, j = 0; i < count; i++) {
362
FindBlastMrna (sap, uidlist, count);
367
/* Annotation visitor callback: locates annotations of type 2 that carry
 * a user object labelled "Blast Type" with a field labelled
 * "BLASTN - nr", runs ProcessBlastNR on them, and relabels that field
 * "BLASTN - nr minus mrna".
 *   sap      - annotation being visited.
 *   userdata - not referenced in the visible lines.
 * NOTE(review): the assignments that load oip and ufp from the user
 * object are on lines omitted from this listing. */
static void FindBlastNR (SeqAnnotPtr sap, Pointer userdata)
375
if (sap == NULL || sap->type != 2) return;
376
/* scan every descriptor, keeping only user-object descriptors */
for (adp = sap->desc; adp != NULL; adp = adp->next) {
377
if (adp->choice != Annot_descr_user) continue;
378
for (uop = adp->data.ptrvalue; uop != NULL; uop = uop->next) {
380
/* first check: label of the user object */
if (oip == NULL) continue;
381
if (StringCmp (oip->str, "Blast Type") != 0) continue;
383
if (ufp == NULL) continue;
385
/* second check: label of the field */
if (oip == NULL) continue;
386
if (StringCmp (oip->str, "BLASTN - nr") != 0) continue;
387
ProcessBlastNR (sap);
388
/* swap the old label string for the new one */
oip->str = MemFree (oip->str);
389
oip->str = StringSave ("BLASTN - nr minus mrna");
394
/* Command-line argument table.  The macros below are the row indices
 * used to read each argument back; they follow the order of the rows.
 * The line declaring the array itself is omitted from this listing. */
396
#define i_argInputFile 0
397
#define o_argOutputFile 1
398
#define c_argConvertGIs 2
399
#define x_argExtractMrnas 3
402
/* -i : input file, default "stdin" */
{"Input File", "stdin", NULL, NULL,
403
FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
404
/* -o : output file, default "stdout" */
{"Output File", "stdout", NULL, NULL,
405
FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
406
/* -c : boolean, default "T" */
{"Convert GIs", "T", NULL, NULL,
407
TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL},
408
/* -x : boolean, default "T" */
{"Extract mRNAs", "T", NULL, NULL,
409
TRUE, 'x', ARG_BOOLEAN, 0.0, 0, NULL},
417
/* Body of the program entry point (its header line is not part of this
 * listing).  Visible sequence: configure error reporting, load the ASN.1
 * tables, read the arguments, read a SeqEntry from the input file, run
 * the mRNA-extraction and GI-conversion passes, clean up and write the
 * SeqEntry to the output file.
 * NOTE(review): the conditions guarding most Message calls and the tests
 * on convertgis / extractmrnas are on omitted lines. */
Boolean extractmrnas;
418
CharPtr infile, outfile;
423
/* error reporting setup */
ErrSetFatalLevel (SEV_MAX);
424
ErrClearOptFlags (EO_SHOW_USERSTR);
425
UseLocalAsnloadDataAndErrMsg ();
428
/* finish resolving internal connections in ASN.1 parse tables; each
 * loader failure is reported with a fatal message */
430
if (! AllObjLoad ()) {
431
Message (MSG_FATAL, "AllObjLoad failed");
434
if (! SubmitAsnLoad ()) {
435
Message (MSG_FATAL, "SubmitAsnLoad failed");
438
if (! FeatDefSetLoad ()) {
439
Message (MSG_FATAL, "FeatDefSetLoad failed");
442
if (! SeqCodeSetLoad ()) {
443
Message (MSG_FATAL, "SeqCodeSetLoad failed");
446
if (! GeneticCodeTableLoad ()) {
447
Message (MSG_FATAL, "GeneticCodeTableLoad failed");
451
/* process command line arguments; the element count is derived from
 * the size of the myargs table */
453
if (! GetArgs ("gi2accn", sizeof (myargs) / sizeof (Args), myargs)) {
457
infile = (CharPtr) myargs [i_argInputFile].strvalue;
458
outfile = (CharPtr) myargs [o_argOutputFile].strvalue;
459
convertgis = (Boolean) myargs [c_argConvertGIs].intvalue;
460
extractmrnas = (Boolean) myargs [x_argExtractMrnas].intvalue;
462
/* open the input and read one SeqEntry from it */
aip = AsnIoOpen (infile, "r");
464
Message (MSG_FATAL, "AsnIoOpen input file failed");
468
sep = SeqEntryAsnRead (aip, NULL);
471
Message (MSG_FATAL, "SeqEntryAsnRead failed");
475
/* remote fetching is switched on here and off again after the passes */
PubSeqFetchEnable ();
477
LookupFarSeqIDs (sep, FALSE, FALSE, FALSE, TRUE, FALSE);
479
/* Extract mRNA hits from BLAST against nr (presumably only when
 * extractmrnas is set - the test is not shown) */
482
VisitAnnotsInSep (sep, NULL, FindBlastNR);
483
DeleteMarkedObjects (0, OBJ_SEQENTRY, (Pointer) sep);
486
/* convert gi numbers to accession.version (presumably only when
 * convertgis is set - the test is not shown) */
489
VisitAlignmentsInSep (sep, NULL, UpdateAligns);
492
PubSeqFetchDisable ();
494
BasicSeqEntryCleanup (sep);
496
/* open the output and write the modified SeqEntry */
aip = AsnIoOpen (outfile, "w");
498
Message (MSG_FATAL, "AsnIoOpen output file failed");
502
SeqEntryAsnWrite (sep, aip, NULL);