2
* ===========================================================================
5
* National Center for Biotechnology Information (NCBI)
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's official duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government do not place any restriction on its use or reproduction.
13
* We would, however, appreciate having the NCBI and the author cited in
14
* any work or product based on this material
16
* Although all reasonable efforts have been taken to ensure the accuracy
17
* and reliability of the software and data, the NLM and the U.S.
18
* Government do not and cannot warrant the performance or results that
19
* may be obtained by using this software or data. The NLM and the U.S.
20
* Government disclaim all warranties, express or implied, including
21
* warranties of performance, merchantability or fitness for any particular purpose.
24
* ===========================================================================
26
* File Name: replicon.c
28
* Author: Colleen Bollin
30
* Version Creation Date: Feb. 1, 2012
37
* --------------------------------------------------------------------------
38
* Date Name Description of modification
39
* ------- ---------- -----------------------------------------------------
42
* ==========================================================================
66
#include <util/creaders/alnread.h>
69
#ifdef INTERNAL_NCBI_TBL2ASN
70
#include <accpubseq.h>
72
#define NLM_GENERATED_CODE_PROTO
76
/* Version string of this tool. */
#define REPLICON_APP_VER "1.0"
78
/* Global holding the version string; the driver formats it into the
   program title ("replicon %s") that it hands to GetArgs. */
CharPtr REPLICON_APPLICATION = REPLICON_APP_VER;
82
/*
 * PopulateRepliconIdBuf
 *
 * Writes a textual ID for bsp into buf using SeqIdWrite with the
 * PRINTID_FASTA_LONG format.
 *
 *   bsp      - Bioseq whose ID list (bsp->id) is consulted
 *   buf      - destination character buffer
 *   buf_size - size of buf; SeqIdWrite is given buf_size - 1
 *
 * The visible code tests each ID for SEQID_LOCAL and SEQID_GENERAL, then
 * branches on sip_local, then sip_general, and otherwise asks
 * SeqIdFindBest for the best SEQID_GENBANK match.
 *
 * NOTE(review): the branch bodies and several other lines of this function
 * are not visible in this listing; statements about what they assign are
 * assumptions to be confirmed against the full file.
 */
static void PopulateRepliconIdBuf (BioseqPtr bsp, CharPtr buf, Int4 buf_size)
84
SeqIdPtr sip_local = NULL, sip_general = NULL, sip, sip_next;
87
/* Scan the ID list; the loop ends as soon as sip_local becomes non-NULL
   or the list is exhausted. */
while (sip != NULL && sip_local == NULL) {
88
if (sip->choice == SEQID_LOCAL) {
90
} else if (sip->choice == SEQID_GENERAL) {
96
/* Choose the ID to print: sip_local is tested first, then sip_general. */
if (sip_local != NULL) {
98
} else if (sip_general != NULL) {
101
/* Presumably the fallback when neither a local nor a general ID was
   found - TODO confirm against the hidden else line. */
sip = SeqIdFindBest (bsp->id, SEQID_GENBANK);
104
/* Remember the successor so the link can be restored after printing.
   Presumably a hidden line detaches sip from the rest of the list here so
   that SeqIdWrite emits only this one ID - TODO confirm.
   NOTE(review): sip is dereferenced with no visible NULL check; verify that
   a hidden line guards against SeqIdFindBest returning NULL. */
sip_next = sip->next;
106
SeqIdWrite (sip, buf, PRINTID_FASTA_LONG, buf_size - 1);
107
/* Restore the original successor link. */
sip->next = sip_next;
111
/*
 * Output state handed to MakeTable through its generic data pointer.
 * The member declarations are not visible in this listing. Elsewhere in
 * the file the members "complete" and "incomplete" are assigned from
 * FileOpen and used as fprintf targets, and "chr_list" is used as the
 * head of a ValNode list of formatted strings.
 */
typedef struct tablefiles {
115
} TableFilesData, PNTR TableFilesPtr;
118
/*
 * MakeTable
 *
 * Per-Bioseq callback (passed to VisitBioseqsInSep / VisitBioseqsInSet and
 * also called directly by the driver). For a Bioseq it reports the
 * sequence ID together with the chromosome name, location and type
 * obtained from the BioSource via the GetReplicon* helpers.
 *
 *   bsp  - Bioseq to report on; NULL and ISA_aa Bioseqs are tested for up
 *          front (the body of that test is not visible here)
 *   data - cast to TableFilesPtr; supplies the output files and chr_list
 *
 * Output, as far as the visible lines show:
 *   - MolInfo missing or completeness != 1: "id<TAB>chr_name" goes to
 *     t->incomplete and a "chr<TAB>loc<TAB>type" line is added to
 *     t->chr_list.
 *   - otherwise: "id<TAB>chr<TAB>loc<TAB>type" goes to t->complete when it
 *     is available, else to stdout.
 * Problems are reported with printf on stdout.
 *
 * NOTE(review): braces, else lines and some declarations of this function
 * are not visible in this listing, so the exact nesting of the statements
 * below must be confirmed against the full file.
 */
static void MakeTable (BioseqPtr bsp, Pointer data)
120
SeqMgrDescContext context;
124
/* Initial values are string literals; they are overwritten below by the
   results of the GetReplicon* helpers. */
CharPtr chr_name = "ANONYMOUS";
125
CharPtr loc_str = "UNKNOWN";
126
CharPtr type_str = "UNKNOWN";
129
/* Layout of one chr_list entry: three tab-separated columns plus newline. */
CharPtr col3fmt = "%s\t%s\t%s\n";
133
if (bsp == NULL || ISA_aa(bsp->mol)) {
136
t = (TableFilesPtr) data;
138
/* buf receives the printable sequence ID used in every output line. */
PopulateRepliconIdBuf (bsp, buf, sizeof (buf));
140
/* First source descriptor that applies to this Bioseq. */
sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
141
if (sdp == NULL || (biop = (BioSourcePtr) sdp->data.ptrvalue) == NULL) {
142
printf ("ERROR! No BioSource for %s\n", buf);
144
chr_name = GetRepliconChromosomeName (biop);
145
loc_str = GetRepliconLocation (biop);
146
type_str = GetRepliconType (biop);
147
/* The molinfo descriptor decides between the incomplete and complete
   reporting paths. */
sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &context);
148
if (sdp == NULL || (mip = (MolInfoPtr) sdp->data.ptrvalue) == NULL || mip->completeness != 1) {
149
/* not complete - looking for organelles */
150
if (chr_name != NULL) {
151
if (loc_str == NULL || type_str == NULL) {
152
printf ("ERROR! Unrecognized BioSource.genome value!\n");
154
/* NOTE(review): t is dereferenced here and below without the t != NULL
   test that the complete-sequence branch performs - confirm that data can
   never be NULL on this path. */
fprintf (t->incomplete, "%s\t%s\n", buf, chr_name);
155
/* The combined length is a sufficient buffer size: the six characters of
   the three "%s" specifiers are replaced by the strings themselves, which
   leaves room for the terminating NUL. */
len = StringLen (col3fmt) + StringLen (chr_name) + StringLen (loc_str) + StringLen (type_str);
156
col3 = (CharPtr) MemNew (sizeof (Char) * len);
157
sprintf (col3, col3fmt, chr_name, loc_str, type_str);
158
/* The list now holds the col3 pointer; the driver later sorts the list,
   removes duplicates and prints each entry. */
ValNodeAddPointer (&(t->chr_list), 0, col3);
163
if (chr_name == NULL || loc_str == NULL || type_str == NULL) {
164
printf ("ERROR! Unrecognized BioSource.genome value!\n");
165
} else if (t != NULL && t->complete != NULL) {
166
fprintf (t->complete, "%s\t%s\t%s\t%s\n", buf, chr_name, loc_str, type_str);
168
/* Presumably the fallback when no complete-output file is available -
   TODO confirm against the hidden else line. */
printf ("%s\t%s\t%s\t%s\n", buf, chr_name, loc_str, type_str);
171
/* Release the three strings; MemFree's return value resets each pointer.
   NOTE(review): the variables start out pointing at string literals -
   confirm that these calls are reached only after the GetReplicon*
   assignments above, and that those helpers return heap strings. */
chr_name = MemFree (chr_name);
172
loc_str = MemFree (loc_str);
173
type_str = MemFree (type_str);
176
/* Args structure contains command-line arguments */
/* Index constants used by the driver to look up entries of myargs
   (myargs [c_argCompleteOuputFile].strvalue and so on). Their order has to
   match the order of the table entries below. The opening of this
   enumeration, including i_argInputFile which the driver also uses, is not
   visible in this listing. */
179
c_argCompleteOuputFile,
180
o_argIncompleteOrgFile,
181
s_argIncompleteSeqFile
186
/* Argument table entries, one per command-line option; the declaration
   line of the table is not visible in this listing. Each entry below gives
   a prompt string, a single-character tag ('i', 'c', 'o', 's') and a file
   type (ARG_FILE_IN or ARG_FILE_OUT).
   NOTE(review): the TRUE field is presumably the "optional" flag of the
   toolkit's Args structure, which would explain why the driver re-checks
   every value with StringHasNoText - TODO confirm. */
{"File List File", NULL, NULL, NULL,
187
TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
188
{"Complete Output File", NULL, NULL, NULL,
189
TRUE, 'c', ARG_FILE_OUT, 0.0, 0, NULL},
190
{"Incomplete Org Output File", NULL, NULL, NULL,
191
TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
192
{"Incomplete Seq Output File", NULL, NULL, NULL,
193
TRUE, 's', ARG_FILE_OUT, 0.0, 0, NULL},
200
/*
 * Program driver body. NOTE(review): the function header, several local
 * declarations, closing braces and any return statements are not visible
 * in this listing.
 *
 * Visible flow:
 *   1. configure error reporting and load the ASN.1 / feature / sequence
 *      code / genetic code tables, reporting MSG_FATAL on failure;
 *   2. read the four file-name arguments from myargs and report MSG_FATAL
 *      for any that is empty;
 *   3. open the complete and incomplete-sequence output files;
 *   4. open the input file, read entries from it with AbstractReadFunction,
 *      open each entry as a file and run MakeTable over every object that
 *      ReadAsnFastaOrFlatFile returns from it;
 *   5. close the two output files, then write the sorted, de-duplicated
 *      chr_list entries to the incomplete-organism file.
 */
CharPtr input_file, complete_output_file, incomplete_org_file, incomplete_seq_file;
213
ErrSetFatalLevel (SEV_MAX);
214
ErrSetMessageLevel (SEV_MAX);
215
ErrClearOptFlags (EO_SHOW_USERSTR);
216
UseLocalAsnloadDataAndErrMsg ();
219
/* finish resolving internal connections in ASN.1 parse tables */
221
if (! AllObjLoad ()) {
222
Message (MSG_FATAL, "AllObjLoad failed");
225
if (! SubmitAsnLoad ()) {
226
Message (MSG_FATAL, "SubmitAsnLoad failed");
229
if (! FeatDefSetLoad ()) {
230
Message (MSG_FATAL, "FeatDefSetLoad failed");
233
if (! SeqCodeSetLoad ()) {
234
Message (MSG_FATAL, "SeqCodeSetLoad failed");
237
if (! GeneticCodeTableLoad ()) {
238
Message (MSG_FATAL, "GeneticCodeTableLoad failed");
243
/* process command line arguments */
244
/* Program title shown by GetArgs: tool name followed by the version.
   NOTE(review): the declaration of app is not visible; confirm that it is
   large enough for this sprintf. */
sprintf (app, "replicon %s", REPLICON_APPLICATION);
245
if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
249
input_file = (CharPtr) myargs [i_argInputFile].strvalue;
250
complete_output_file = (CharPtr) myargs [c_argCompleteOuputFile].strvalue;
251
incomplete_org_file = (CharPtr) myargs [o_argIncompleteOrgFile].strvalue;
252
incomplete_seq_file = (CharPtr) myargs [s_argIncompleteSeqFile].strvalue;
254
/* Every one of the four file names has to be non-empty. */
if (StringHasNoText (input_file)) {
255
Message (MSG_FATAL, "Must supply input file.");
258
if (StringHasNoText (complete_output_file)) {
259
Message (MSG_FATAL, "Must supply filename for complete replicons.");
262
if (StringHasNoText (incomplete_org_file)) {
263
Message (MSG_FATAL, "Must supply filename for list of incomplete replicon sources.");
266
if (StringHasNoText (incomplete_seq_file)) {
267
Message (MSG_FATAL, "Must supply filename for list of incomplete replicon sequences.");
272
/* Both per-sequence output files are opened before any input is read;
   MakeTable writes to them through t. */
t.complete = FileOpen (complete_output_file, "w");
273
if (t.complete == NULL) {
274
Message (MSG_FATAL, "Unable to open %s", complete_output_file);
277
t.incomplete = FileOpen (incomplete_seq_file, "w");
278
if (t.incomplete == NULL) {
279
Message (MSG_FATAL, "Unable to open %s", incomplete_seq_file);
284
/* The input file is a list: each entry read from it is opened as a file
   in the loop below. */
fi = FileOpen (input_file, "r");
286
Message (MSG_FATAL, "Unable to open %s", input_file);
291
rbd.current_data = NULL;
293
line = AbstractReadFunction (&rbd);
294
while (line != NULL) {
295
fp = FileOpen (line, "r");
297
Message (MSG_FATAL, "Unable to open %s", line);
300
/* Keep reading objects from the current file until the reader returns
   NULL. */
while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE)) != NULL) {
303
/* Dispatch on the kind of object that was read; presumably selected by
   datatype on hidden lines - TODO confirm. */
VisitBioseqsInSep ((SeqEntryPtr) dataptr, &t, MakeTable);
306
VisitBioseqsInSet ((BioseqSetPtr) dataptr, &t, MakeTable);
309
MakeTable ((BioseqPtr) dataptr, &t);
312
Message (MSG_ERROR, "Unrecognized data type %d", datatype);
315
/* Release the object that was just processed. */
ObjMgrFree (datatype, dataptr);
318
/* Advance to the next entry of the input list. */
line = AbstractReadFunction (&rbd);
322
FileClose (t.complete);
323
FileClose (t.incomplete);
324
/* The incomplete-organism file is written last, from the chr_list entries
   that MakeTable collected. fp is reused for it. */
fp = FileOpen (incomplete_org_file, "w");
326
Message (MSG_FATAL, "Unable to open %s", incomplete_org_file);
329
/* Sort, then drop duplicate entries (their data is released through
   ValNodeFreeData), so each distinct line is printed once. */
t.chr_list = ValNodeSort (t.chr_list, SortVnpByString);
330
ValNodeUnique (&(t.chr_list), SortVnpByString, ValNodeFreeData);
331
for (vnp = t.chr_list; vnp != NULL; vnp = vnp->next) {
332
/* Each entry already ends with a newline (see col3fmt in MakeTable). */
fprintf (fp, "%s", (CharPtr)vnp->data.ptrvalue);