2
* ===========================================================================
5
* National Center for Biotechnology Information (NCBI)
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's official duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government do not place any restriction on its use or reproduction.
13
* We would, however, appreciate having the NCBI and the author cited in
14
* any work or product based on this material
16
* Although all reasonable efforts have been taken to ensure the accuracy
17
* and reliability of the software and data, the NLM and the U.S.
18
* Government do not and cannot warrant the performance or results that
19
* may be obtained by using this software or data. The NLM and the U.S.
20
* Government disclaim all warranties, express or implied, including
21
* warranties of performance, merchantability or fitness for any particular
24
* ===========================================================================
26
* File Name: raw2delt.c
28
* Author: Colleen Bollin
30
* Version Creation Date: 4/12/07
37
* --------------------------------------------------------------------------
38
* Date Name Description of modification
39
* ------- ---------- -----------------------------------------------------
42
* ==========================================================================
56
#define RAW2DELT_APP_VER "1.0"
58
CharPtr RAW2DELT_APPLICATION = RAW2DELT_APP_VER;
60
typedef struct outputstream {
68
} OutputStreamData, PNTR OutputStreamPtr;
70
typedef struct inputstream {
76
} InputStreamData, PNTR InputStreamPtr;
78
typedef struct asnstream {
82
AsnTypePtr atp_bss_se;
83
} AsnStreamData, PNTR AsnStreamPtr;
85
static FILE* OpenOneFile (
92
Char file [FILENAME_MAX], path [PATH_MAX];
101
StringNCpy_0 (path, directory, sizeof (path));
102
sprintf (file, "%s%s", base, suffix);
103
FileBuildPath (path, NULL, file);
105
return FileOpen (path, "r");
108
static AsnIoPtr AsnIoFromInputStream (
114
Char file [FILENAME_MAX], path [PATH_MAX];
117
if (isp == NULL) return NULL;
119
if (isp->is_binary) {
125
if (isp->base == NULL) {
126
aip = AsnIoOpen ("stdin", read_flag);
128
StringNCpy_0 (path, isp->directory, sizeof (path));
129
sprintf (file, "%s%s", isp->base, isp->suffix);
130
FileBuildPath (path, NULL, file);
131
aip = AsnIoOpen (path, read_flag);
137
static AsnIoPtr AsnIoFromOutputStream (OutputStreamPtr osp)
140
Char file [FILENAME_MAX], path [PATH_MAX];
143
if (osp == NULL) return NULL;
144
if (osp->aip == NULL) {
145
write_flag = osp->is_binary ? "wb" : "w";
146
if (StringDoesHaveText (osp->outfile)) {
147
StringNCpy_0 (path, osp->outfile, sizeof (path));
149
if (osp->base == NULL) {
150
aip = AsnIoOpen ("stdout", write_flag);
152
if (osp->outsuffix == NULL) {
155
StringNCpy_0 (path, osp->results_dir, sizeof (path));
156
sprintf (file, "%s%s%s", osp->base, osp->suffix, osp->outsuffix);
157
FileBuildPath (path, NULL, file);
158
aip = AsnIoOpen (path, write_flag);
160
Message (MSG_POSTERR, "Unable to write to %s.", path);
170
static void WriteOneFile (
178
aip = AsnIoFromOutputStream (osp);
180
SeqEntryAsnWrite (sep, aip, NULL);
183
if (aip != osp->aip) {
189
static void CollectBioseqsForConversion (BioseqPtr bsp, Pointer userdata)
191
ValNodePtr PNTR list;
193
if (bsp == NULL || bsp->repr != Seq_repr_raw || ISA_aa (bsp->mol)) return;
194
if (userdata == NULL)
198
list = (ValNodePtr PNTR) userdata;
200
ValNodeAddPointer (list, 0, bsp);
204
static void ProcessSeqEntry (SeqEntryPtr sep, Int4Ptr gap_sizes)
206
ValNodePtr bsp_list = NULL, vnp;
209
if (sep == NULL || gap_sizes == NULL) return;
211
VisitBioseqsInSep (sep, &bsp_list, CollectBioseqsForConversion);
213
for (vnp = bsp_list; vnp != NULL; vnp = vnp->next) {
214
bsp = (BioseqPtr) vnp->data.ptrvalue;
215
ConvertNsToGaps (bsp, gap_sizes);
217
bsp_list = ValNodeFree (bsp_list);
220
static Uint2 ProcessOneAsn (
228
Uint2 datatype, entityID = 0;
231
if (fp == NULL || gap_sizes == NULL) return 0;
233
dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID, TRUE, FALSE, TRUE, FALSE);
234
if (dataptr == NULL) {
235
Message (MSG_POSTERR, "Unable to read data from %s.", path);
239
sep = GetTopSeqEntryForEntityID (entityID);
240
ProcessSeqEntry (sep, gap_sizes);
245
/* return -1 if failure, 0 if success */
246
static Int4 ProcessOneRecord (
257
if (osp == NULL || gap_sizes == NULL) return -1;
258
fp = OpenOneFile (directory, osp->base, osp->suffix);
259
if (fp == NULL) return -1;
261
entityID = ProcessOneAsn (fp, osp->base == NULL ? "input stream" : osp->base, gap_sizes);
265
if (entityID == 0) return -1;
267
/* finish processing */
269
sep = GetTopSeqEntryForEntityID (entityID);
271
WriteOneFile (osp, sep);
274
ObjMgrFreeByEntityID (entityID);
278
static Int4 ProcessStream (InputStreamPtr isp, OutputStreamPtr osp, AsnStreamPtr asp, Int4Ptr gap_sizes)
280
AsnTypePtr atp, atp_srch;
281
AsnIoPtr asn_in, asn_out;
287
if (isp == NULL || osp == NULL || asp == NULL || gap_sizes == NULL) return 1;
289
asn_in = AsnIoFromInputStream (isp);
290
asn_out = AsnIoFromOutputStream (osp);
292
if (isp->is_seqentry) {
294
atp_srch = asp->atp_se;
298
atp_srch = asp->atp_bss_se;
301
while ((atp = AsnReadId(asn_in, asp->amp, atp)) != NULL && rval == 0) {
302
if (atp != atp_srch) {
303
AsnReadVal(asn_in, atp, &av);
304
AsnWrite(asn_out, atp, &av);
305
AsnKillValue(atp, &av);
308
if ((sep = SeqEntryAsnRead(asn_in, atp)) == NULL) {
309
Message (MSG_POSTERR, "SeqEntryAsnRead failure");
312
entityID = ObjMgrRegister (OBJ_SEQENTRY, sep);
313
ProcessSeqEntry (sep, gap_sizes);
315
if (! SeqEntryAsnWrite(sep, asn_out, atp)) {
316
Message (MSG_POSTERR, "SeqEntryAsnWrite failure");
320
ObjMgrFreeByEntityID (entityID);
322
} /* Endwhile, AsnReadId */
325
if (asn_out != osp->aip) {
332
/* return -1 on failure, 0 on success */
333
static Int4 FileRecurse (
342
Char path [PATH_MAX];
345
ValNodePtr head, vnp;
346
CharPtr orig_dir, orig_base;
349
/* get list of all files in source directory */
351
head = DirCatalog (directory);
353
for (vnp = head; vnp != NULL; vnp = vnp->next) {
354
if (vnp->choice == 0) {
355
str = (CharPtr) vnp->data.ptrvalue;
356
if (StringDoesHaveText (str)) {
358
/* does filename have desired substring? */
360
ptr = StringStr (str, osp->suffix);
364
/* make sure detected suffix is really at end of filename */
366
if (StringCmp (ptr, osp->suffix) == 0) {
369
/* process file that has desired suffix (usually .fsa) */
371
orig_dir = isp->directory;
372
isp->directory = directory;
373
orig_base = isp->base;
375
if (isp->is_binary) {
376
rval |= ProcessStream (isp, osp, asp, gap_sizes);
378
rval |= ProcessOneRecord (directory, osp, gap_sizes);
380
isp->directory = orig_dir;
381
isp->base = orig_base;
386
} else if (vnp->choice == 1) {
388
/* recurse into subdirectory */
390
StringNCpy_0 (path, directory, sizeof (path));
391
str = (CharPtr) vnp->data.ptrvalue;
392
FileBuildPath (path, str, NULL);
393
rval |= FileRecurse (path, isp, osp, asp, gap_sizes);
397
/* clean up file list */
399
ValNodeFreeData (head);
403
static Boolean SetUpAsnStreamData (AsnStreamPtr asp)
406
if (asp == NULL) return FALSE;
408
if (! SeqSetAsnLoad()) {
409
Message (MSG_POSTERR, "Unable to load SeqSet parse tree");
412
asp->amp = AsnAllModPtr();
413
if (asp->amp == NULL) {
414
Message (MSG_POSTERR, "Unable to obtain ASN.1 module pointer");
418
/* Get pointers to ASN.1 types that must be dealt with in asn_in */
420
if ( (asp->atp_bss = AsnFind("Bioseq-set")) == NULL) {
421
Message (MSG_POSTERR, "could not find type Bioseq-set");
424
if ( (asp->atp_bss_se = AsnFind("Bioseq-set.seq-set.E")) == NULL) {
425
Message (MSG_POSTERR, "AsnFind failure: Bioseq-set.seq-set.E");
428
if ( (asp->atp_se = AsnFind("Seq-entry")) == NULL) {
429
Message (MSG_POSTERR, "AsnFind failure: Seq-entry");
435
/* Args structure contains command-line arguments */
437
#define p_argInputPath 0
438
#define r_argOutputPath 1
439
#define i_argInputFile 2
440
#define o_argOutputFile 3
441
#define x_argSuffix 4
442
#define s_argOutSuffix 5
443
#define b_argInputBinary 6
444
#define e_argInputSeqEntry 7
445
#define d_argOutputBinary 8
446
#define u_argEqUnknownGap 9
447
#define U_argGTEqUnknownGap 10
448
#define k_argEqUnknownGap 11
449
#define K_argGtEqUnknownGap 12
453
{"Path to Files", NULL, NULL, NULL,
454
TRUE, 'p', ARG_STRING, 0.0, 0, NULL},
455
{"Path for Results", NULL, NULL, NULL,
456
TRUE, 'r', ARG_STRING, 0.0, 0, NULL},
457
{"Single Input File", NULL, NULL, NULL,
458
TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
459
{"Single Output File", NULL, NULL, NULL,
460
TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
461
{"Suffix", ".sqn", NULL, NULL,
462
TRUE, 'x', ARG_STRING, 0.0, 0, NULL},
463
{"Suffix for converted files", "", NULL, NULL,
464
TRUE, 's', ARG_STRING, 0.0, 0, NULL},
465
{"Input is binary", "F", NULL, NULL,
466
TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL},
467
{"Input is Seq-entry", "F", NULL, NULL,
468
TRUE, 'e', ARG_BOOLEAN, 0.0, 0, NULL},
469
{"Output is binary", "F", NULL, NULL,
470
TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL},
471
{"Exact size to convert to unknown length gap", "-1", NULL, NULL,
472
TRUE, 'u', ARG_INT, 0.0, 0, NULL},
473
{"Size greater than/equal to to convert to unknown length gap", "-1", NULL, NULL,
474
TRUE, 'U', ARG_INT, 0.0, 0, NULL},
475
{"Exact size to convert to known length gap", "-1", NULL, NULL,
476
TRUE, 'k', ARG_INT, 0.0, 0, NULL},
477
{"Size greater than/equal to to convert to known length gap", "-1", NULL, NULL,
478
TRUE, 'K', ARG_INT, 0.0, 0, NULL},
487
OutputStreamData osd;
492
Int4 u_eq = 0, u_gteq = -1, k_eq = 0, k_gteq = -1;
496
ErrSetFatalLevel (SEV_MAX);
497
ErrClearOptFlags (EO_SHOW_USERSTR);
498
UseLocalAsnloadDataAndErrMsg ();
501
/* finish resolving internal connections in ASN.1 parse tables */
503
if (! AllObjLoad ()) {
504
Message (MSG_FATAL, "AllObjLoad failed");
507
if (! SubmitAsnLoad ()) {
508
Message (MSG_FATAL, "SubmitAsnLoad failed");
511
if (! FeatDefSetLoad ()) {
512
Message (MSG_FATAL, "FeatDefSetLoad failed");
515
if (! SeqCodeSetLoad ()) {
516
Message (MSG_FATAL, "SeqCodeSetLoad failed");
519
if (! GeneticCodeTableLoad ()) {
520
Message (MSG_FATAL, "GeneticCodeTableLoad failed");
524
SetUpAsnStreamData (&asd);
526
/* initialize OutputStreamData */
527
MemSet (&osd, 0, sizeof (osd));
529
/* initialize InputStreamData */
530
MemSet (&isd, 0, sizeof (isd));
532
/* initialize gap_sizes */
536
/* process command line arguments */
538
sprintf (app, "raw2delt %s", RAW2DELT_APPLICATION);
539
if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
543
u_eq = (Int4) myargs [u_argEqUnknownGap].intvalue;
544
u_gteq = (Int4) myargs [U_argGTEqUnknownGap].intvalue;
545
k_eq = (Int4) myargs [k_argEqUnknownGap].intvalue;
546
k_gteq = (Int4) myargs [K_argGtEqUnknownGap].intvalue;
548
if (u_eq < 1 && u_gteq < 1 && k_eq < 1 && k_gteq < 1) {
549
Message (MSG_FATAL, "Must specify values for at least one of -u, -U, -k, -K");
551
} else if (u_eq > -1 && u_gteq > -1) {
552
Message (MSG_FATAL, "May only specify value for -u or -U, not both");
554
} else if (k_eq > -1 && k_gteq > -1) {
555
Message (MSG_FATAL, "May only specify value for -k or -K, not both");
561
} else if (u_gteq > 0) {
562
gap_sizes[0] = 0 - u_gteq;
567
} else if (k_gteq > 0) {
568
gap_sizes[1] = 0 - k_gteq;
571
if (gap_sizes[0] == gap_sizes[1]) {
572
Message (MSG_FATAL, "Cannot specify the same size for known and unknown length gaps");
576
directory = (CharPtr) myargs [p_argInputPath].strvalue;
577
osd.results_dir = (CharPtr) myargs [r_argOutputPath].strvalue;
578
if (StringHasNoText (osd.results_dir)) {
579
osd.results_dir = NULL;
581
osd.suffix = (CharPtr) myargs [x_argSuffix].strvalue;
582
osd.outsuffix = (CharPtr) myargs [s_argOutSuffix].strvalue;
583
osd.base = (CharPtr) myargs [i_argInputFile].strvalue;
584
osd.outfile = (CharPtr) myargs [o_argOutputFile].strvalue;
585
if (StringHasNoText (osd.outfile)) {
588
osd.is_binary = (Boolean) myargs [d_argOutputBinary].intvalue;
590
/* compare string contents; == on char pointers only tests address identity */
if (StringCmp (osd.base, "stdin") == 0) {
594
/* if we don't have an output directory or an output file, and the user hasn't provided an
595
* output suffix, add a default.
597
if (osd.results_dir == NULL && osd.outfile == NULL && StringHasNoText (osd.outsuffix)) {
598
osd.outsuffix = ".delta";
601
isd.is_binary = (Boolean) myargs [b_argInputBinary].intvalue;
602
isd.is_seqentry = (Boolean) myargs [e_argInputSeqEntry].intvalue;
603
isd.directory = directory;
605
isd.suffix = osd.suffix;
607
if (StringDoesHaveText (osd.outfile)) {
608
/* honor the output-is-binary flag for the single output file, as AsnIoFromOutputStream does */
osd.aip = AsnIoOpen (osd.outfile, osd.is_binary ? "wb" : "w");
609
if (osd.aip == NULL) {
610
Message (MSG_FATAL, "Unable to open output file");
614
if (StringHasNoText (osd.results_dir)) {
615
osd.results_dir = directory;
617
/* if we're putting the results in a separate directory, strip the directory name from the output base */
618
if (!StringHasNoText (osd.results_dir) && !StringHasNoText (osd.base)) {
620
ptr = StringRChr (osd.base, '\\');
622
ptr = StringRChr (osd.base, '/');
631
if (StringHasNoText(directory) && StringHasNoText(osd.base)) {
632
rval = ProcessStream (&isd, &osd, &asd, gap_sizes);
633
} else if (StringDoesHaveText (osd.base)) {
634
ptr = StringRChr (osd.base, '.');
637
StringNCpy_0 (sfx, ptr, sizeof (sfx));
643
rval = ProcessStream (&isd, &osd, &asd, gap_sizes);
645
rval = ProcessOneRecord (directory, &osd, gap_sizes);
649
rval = FileRecurse (directory, &isd, &osd, &asd, gap_sizes);
652
if (osd.aip != NULL) {
653
AsnIoFlush (osd.aip);
654
AsnIoClose (osd.aip);