2
* ===========================================================================
5
* National Center for Biotechnology Information (NCBI)
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's official duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government do not place any restriction on its use or reproduction.
13
* We would, however, appreciate having the NCBI and the author cited in
14
* any work or product based on this material
16
* Although all reasonable efforts have been taken to ensure the accuracy
17
* and reliability of the software and data, the NLM and the U.S.
18
* Government do not and cannot warrant the performance or results that
19
* may be obtained by using this software or data. The NLM and the U.S.
20
* Government disclaim all warranties, express or implied, including
21
* warranties of performance, merchantability or fitness for any particular
24
* ===========================================================================
26
* File Name: src_chk.c
28
* Author: Colleen Bollin
30
* Version Creation Date: 4/12/07
37
* --------------------------------------------------------------------------
38
* Date Name Description of modification
39
* ------- ---------- -----------------------------------------------------
42
* ==========================================================================
55
/* NOTE(review): presumably has to be defined ahead of a generated-code header;
 * that #include is not visible in this excerpt -- confirm. */
#define NLM_GENERATED_CODE_PROTO
60
/* Version string of this tool. */
#define SRC_CHK_APP_VER "1.0"
62
/* Global holding the version string; initialized from SRC_CHK_APP_VER and
 * used when the application name is formatted for argument parsing. */
CharPtr SRC_CHK_APPLICATION = SRC_CHK_APP_VER;
65
/*
 * CollectFieldList
 *
 * Visits every source descriptor (Seq_descr_source) that the sequence
 * manager reports for bsp and appends the source-qualifier field list of
 * each BioSource to one combined list.
 *
 * bsp      Bioseq to inspect; NULL yields an empty result.
 *
 * Returns the head of the combined list, or NULL when nothing was found.
 * No de-duplication is done here; lists from several descriptors are simply
 * linked end to end.
 */
static ValNodePtr CollectFieldList(BioseqPtr bsp)
{
  SeqDescrPtr       sdp;
  SeqMgrDescContext dcontext;
  BioSourcePtr      biop;
  ValNodePtr        list = NULL, vnp;

  if (bsp == NULL) {
    return NULL;
  }

  for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
       sdp != NULL;
       sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_source, &dcontext)) {
    biop = (BioSourcePtr) sdp->data.ptrvalue;
    vnp = GetSourceQualFieldListFromBioSource (biop);
    /* append this descriptor's fields to whatever has been gathered so far */
    ValNodeLink (&list, vnp);
  }

  return list;
}
83
/*
 * PrintHeader
 *
 * Writes the column-header row of the report: two empty leading columns,
 * a "TaxID" column, then one tab-separated label per entry of field_list,
 * followed by a newline.
 *
 * fp          output stream; nothing is written when NULL.
 * field_list  list of field types to label; nothing is written when NULL.
 */
static void PrintHeader (FILE *fp, ValNodePtr field_list)
{
  CharPtr txt;

  if (fp == NULL || field_list == NULL) {
    return;
  }

  /* first field accession, second field GI, third field tax ID */
  fprintf (fp, "\t\tTaxID");
  while (field_list != NULL) {
    txt = SummarizeFieldType (field_list);
    /* never hand a NULL pointer to %s; print an empty label instead */
    fprintf (fp, "\t%s", txt == NULL ? "" : txt);
    /* NOTE(review): assumes SummarizeFieldType returns caller-owned heap text -- confirm */
    txt = MemFree (txt);
    field_list = field_list->next;
  }
  fprintf (fp, "\n");
}
102
/*
 * GetTaxIdFromOrgRef
 *
 * Scans the db cross-references of orp for the first entry whose database
 * name is "taxon" and returns its integer tag.
 *
 * orp   organism reference to inspect; may be NULL.
 *
 * Returns the taxon ID, or -1 when orp is NULL or carries no usable "taxon"
 * cross-reference.
 * NOTE(review): the -1 sentinel and the d->tag->id read are reconstructed
 * from the surrounding code; confirm against the Dbtag/ObjectId definitions.
 */
static Int4 GetTaxIdFromOrgRef (OrgRefPtr orp)
{
  Int4       tax_id = -1;
  ValNodePtr vnp;
  DbtagPtr   d;

  if (orp == NULL) {
    return tax_id;
  }

  for (vnp = orp->db; vnp != NULL; vnp = vnp->next)
  {
    d = (DbtagPtr) vnp->data.ptrvalue;
    /* skip malformed entries instead of dereferencing a NULL Dbtag or tag */
    if (d == NULL || d->tag == NULL)
    {
      continue;
    }
    if (StringCmp(d->db, "taxon") == 0)
    {
      tax_id = d->tag->id;
      break;
    }
  }

  return tax_id;
}
124
/*
 * PrintBioSourceLine
 *
 * Writes the remainder of one report row for a BioSource: a tab and the
 * taxon ID, then a tab-separated value for every field in field_list
 * (empty when the BioSource has no value for that field), then a newline.
 * The caller is expected to have written the leading ID columns already.
 *
 * fp          output stream.
 * biop        BioSource to report.
 * field_list  fields to print, in column order.
 *
 * Nothing is written when any argument is NULL.
 */
static void PrintBioSourceLine (FILE *fp, BioSourcePtr biop, ValNodePtr field_list)
{
  CharPtr txt;

  if (fp == NULL || biop == NULL || field_list == NULL) {
    return;
  }

  /* cast keeps the argument type in step with the %d conversion */
  fprintf (fp, "\t%d", (int) GetTaxIdFromOrgRef(biop->org));

  while (field_list != NULL) {
    txt = GetSourceQualFromBioSource (biop, field_list->data.ptrvalue, NULL);
    fprintf (fp, "\t%s", txt == NULL ? "" : txt);
    /* NOTE(review): assumes the getter returns caller-owned heap text -- confirm */
    txt = MemFree (txt);
    field_list = field_list->next;
  }
  fprintf (fp, "\n");
}
143
/*
 * PrintBioseqLines
 *
 * Writes one report row per source descriptor found on bsp.  Each row
 * starts with two ID columns (accession-style ID, then GI) and is completed
 * by PrintBioSourceLine.
 *
 * ID selection:
 *   - a GenBank ID always wins the first column; EMBL, Swiss-Prot, DDBJ and
 *     PIR IDs are taken only while no ID has been chosen yet;
 *   - a GI ID fills the second column;
 *   - when neither kind is present, the first column falls back to
 *     SeqIdFindBest (bsp->id, SEQID_GENBANK) and the second stays empty.
 *
 * fp          output stream.
 * bsp         sequence whose sources are reported.
 * field_list  fields to print for every source.
 *
 * Nothing is written when any argument is NULL.
 */
static void PrintBioseqLines (FILE *fp, BioseqPtr bsp, ValNodePtr field_list)
{
  SeqDescrPtr       sdp;
  SeqMgrDescContext dcontext;
  Char              id_txt[255], id_txt2[255];
  SeqIdPtr          sip, sip_gi = NULL, sip_gb = NULL;

  if (fp == NULL || bsp == NULL || field_list == NULL) {
    return;
  }

  for (sip = bsp->id; sip != NULL; sip = sip->next) {
    if (sip->choice == SEQID_GENBANK
        || (sip->choice == SEQID_EMBL && sip_gb == NULL)
        || (sip->choice == SEQID_SWISSPROT && sip_gb == NULL)
        || (sip->choice == SEQID_DDBJ && sip_gb == NULL)
        || (sip->choice == SEQID_PIR && sip_gb == NULL)) {
      sip_gb = sip;
    } else if (sip->choice == SEQID_GI) {
      sip_gi = sip;
    }
  }

  /* start from empty columns so a missing ID is never printed uninitialized */
  id_txt[0] = 0;
  id_txt2[0] = 0;

  if (sip_gb == NULL && sip_gi == NULL) {
    SeqIdWrite (SeqIdFindBest (bsp->id, SEQID_GENBANK), id_txt, PRINTID_REPORT, sizeof (id_txt) - 1);
  } else {
    if (sip_gb != NULL) {
      SeqIdWrite (sip_gb, id_txt, PRINTID_REPORT, sizeof (id_txt) - 1);
    }
    if (sip_gi != NULL) {
      SeqIdWrite (sip_gi, id_txt2, PRINTID_REPORT, sizeof (id_txt2) - 1);
    }
  }

  for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
       sdp != NULL;
       sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_source, &dcontext)) {
    fprintf (fp, "%s\t%s", id_txt, id_txt2);
    PrintBioSourceLine (fp, (BioSourcePtr) sdp->data.ptrvalue, field_list);
  }
}
192
/*
 * PrintBioseqErrorLine
 *
 * Writes a row that carries only an identifier.  The text of sip is placed
 * in the second column when it is a GI and in the first column otherwise;
 * the other ID column is left empty and the row ends right after it.
 *
 * fp    output stream.
 * sip   identifier to report.
 *
 * Nothing is written when either argument is NULL.
 */
static void PrintBioseqErrorLine (FILE *fp, SeqIdPtr sip)
{
  Char id_txt[255];

  if (fp == NULL || sip == NULL) {
    return;
  }

  SeqIdWrite (sip, id_txt, PRINTID_REPORT, sizeof (id_txt) - 1);

  if (sip->choice == SEQID_GI) {
    fprintf (fp, "\t%s\n", id_txt);
  } else {
    fprintf (fp, "%s\t\n", id_txt);
  }
}
210
/*
 * IsAllDigits
 *
 * Reports whether str consists solely of decimal digits.
 *
 * str   NUL-terminated text; may be NULL.
 *
 * Returns FALSE when StringHasNoText (str) holds, TRUE when every character
 * up to the terminator is a digit, FALSE otherwise.
 */
static Boolean IsAllDigits (CharPtr str)
{
  CharPtr cp;

  if (StringHasNoText (str)) return FALSE;

  cp = str;
  /* isdigit is only defined for unsigned char values (and EOF), so a plain
     char that may be negative has to be converted first */
  while (*cp != 0 && isdigit ((unsigned char) *cp)) {
    cp++;
  }

  /* stopping on the terminator means no non-digit was met */
  return (*cp == 0) ? TRUE : FALSE;
}
228
/*
 * SmartGuessMakeId
 *
 * Builds a SeqId from free-form identifier text:
 *   - text containing '|' is passed to MakeSeqID unchanged;
 *   - text made only of digits is treated as a GI ("gi|<text>");
 *   - anything else is treated as a GenBank accession ("gb|<text>").
 *
 * str   identifier text; may be NULL or blank.
 *
 * Returns whatever MakeSeqID produces, or NULL when str has no text or the
 * temporary buffer cannot be allocated.  The caller owns the result.
 */
static SeqIdPtr SmartGuessMakeId (CharPtr str)
{
  CharPtr  id_txt;
  CharPtr  prefix;
  SeqIdPtr sip = NULL;

  if (StringHasNoText (str)) {
    return NULL;
  }
  if (StringChr (str, '|') != NULL) {
    return MakeSeqID (str);
  }

  prefix = IsAllDigits (str) ? "gi|" : "gb|";

  /* three prefix characters + the text + the terminating NUL */
  id_txt = (CharPtr) MemNew (sizeof (Char) * (StringLen (str) + 4));
  if (id_txt == NULL) {
    return NULL;
  }
  sprintf (id_txt, "%s%s", prefix, str);
  sip = MakeSeqID (id_txt);
  id_txt = MemFree (id_txt);

  return sip;
}
252
/* Args structure contains command-line arguments */
254
#define i_argInputFile 0
255
#define o_argOutputFile 1
258
{"Input File", NULL, NULL, NULL,
259
TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
260
{"Output File", NULL, NULL, NULL,
261
TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL}
265
/*
 * SortFieldListForSrcChk
 *
 * Puts the field list into report order: the list is first passed through
 * SortUniqueFieldTypeList, then the source-qualifier entry for taxname (if
 * any) is unlinked and re-inserted at the head so it becomes the first
 * column after the fixed ones.
 *
 * field_list   address of the list head; updated in place.  A NULL address
 *              or an empty list is left untouched.
 */
static void SortFieldListForSrcChk (ValNodePtr PNTR field_list)
{
  ValNodePtr vnp, vnp_s, vnp_prev = NULL;

  if (field_list == NULL || *field_list == NULL) return;

  SortUniqueFieldTypeList (field_list);

  /* move taxname to front of list */
  for (vnp = *field_list; vnp != NULL; vnp_prev = vnp, vnp = vnp->next) {
    if (vnp->choice == FieldType_source_qual) {
      vnp_s = vnp->data.ptrvalue;
      if (vnp_s != NULL
          && vnp_s->choice == SourceQualChoice_textqual
          && vnp_s->data.intvalue == Source_qual_taxname) {
        /* only need to move if not already at front of list */
        if (vnp_prev != NULL) {
          vnp_prev->next = vnp->next;
          vnp->next = *field_list;
          *field_list = vnp;
        }
        /* stop once the taxname entry has been handled */
        break;
      }
    }
  }
}
299
/*
 * Body of the program driver.
 * NOTE(review): the function header, several declarations and a number of
 * statements (returns, closing braces, some conditions) are absent from this
 * excerpt, so only the visible statements are described.
 *
 * Visible flow: configure error handling, load the ASN.1 object tables,
 * enable remote sequence fetching, parse the command line, read identifiers
 * from the input file while gathering source-qualifier fields, then write
 * the report to the output file and release the lists.
 */
CharPtr id_file, line;
301
ValNodePtr field_list = NULL;
303
ValNodePtr bsp_list = NULL, vnp;
310
ErrSetFatalLevel (SEV_MAX);
311
ErrClearOptFlags (EO_SHOW_USERSTR);
312
UseLocalAsnloadDataAndErrMsg ();
315
/* finish resolving internal connections in ASN.1 parse tables */
317
if (! AllObjLoad ()) {
318
Message (MSG_FATAL, "AllObjLoad failed");
321
if (! SubmitAsnLoad ()) {
322
Message (MSG_FATAL, "SubmitAsnLoad failed");
325
if (! FeatDefSetLoad ()) {
326
Message (MSG_FATAL, "FeatDefSetLoad failed");
329
if (! SeqCodeSetLoad ()) {
330
Message (MSG_FATAL, "SeqCodeSetLoad failed");
333
if (! GeneticCodeTableLoad ()) {
334
Message (MSG_FATAL, "GeneticCodeTableLoad failed");
338
PubSeqFetchEnable ();
340
/* process command line arguments */
342
/* application name shown by GetArgs: "src_chk " followed by the version string */
sprintf (app, "src_chk %s", SRC_CHK_APPLICATION);
343
if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
347
id_file = (CharPtr) myargs [i_argInputFile].strvalue;
349
rbd.fp = FileOpen (id_file, "r");
350
if (rbd.fp == NULL) {
351
Message (MSG_ERROR, "Unable to open %s", (CharPtr) myargs [i_argInputFile].strvalue);
354
/* first pass: read the input line by line; each non-blank line is turned
   into a SeqId and the matching Bioseq is locked to collect its fields */
rbd.current_data = NULL;
355
line = AbstractReadFunction (&rbd);
356
while (line != NULL && line[0] != EOF) {
357
if (!StringHasNoText (line)) {
359
sip = SmartGuessMakeId (line);
360
bsp = BioseqLockById (sip);
362
/* NOTE(review): the condition guarding this message is not visible here;
   presumably it is the case where the lock returned no Bioseq */
printf ("Unable to download Bioseq for %s\n", line);
364
ValNodeLink (&field_list, CollectFieldList (bsp));
367
/* remember the SeqId; the output loop below walks bsp_list */
ValNodeAddPointer (&bsp_list, 0, sip);
369
/* release the current line before fetching the next one */
line = MemFree (line);
370
line = AbstractReadFunction (&rbd);
375
/* put the collected fields into report order before any output is written */
SortFieldListForSrcChk (&field_list);
377
fp = FileOpen ((CharPtr) myargs [o_argOutputFile].strvalue, "w");
379
Message (MSG_ERROR, "Unable to open %s", (CharPtr) myargs [o_argOutputFile].strvalue);
382
/* second pass: header row, then the rows for every remembered SeqId */
PrintHeader (fp, field_list);
383
for (vnp = bsp_list; vnp != NULL; vnp = vnp->next) {
384
bsp = BioseqLockById (vnp->data.ptrvalue);
386
/* NOTE(review): the branch selecting between the error line and the normal
   lines is not visible here; presumably it tests whether bsp is NULL */
PrintBioseqErrorLine (fp, vnp->data.ptrvalue);
388
PrintBioseqLines (fp, bsp, field_list);
391
/* free the SeqId held by this node and store SeqIdFree's result back */
vnp->data.ptrvalue = SeqIdFree (vnp->data.ptrvalue);
395
/* release the list nodes and the field list */
bsp_list = ValNodeFree (bsp_list);
396
field_list = FieldTypeListFree (field_list);