2
* ===========================================================================
5
* National Center for Biotechnology Information (NCBI)
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's official duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government do not place any restriction on its use or reproduction.
13
* We would, however, appreciate having the NCBI and the author cited in
14
* any work or product based on this material
16
* Although all reasonable efforts have been taken to ensure the accuracy
17
* and reliability of the software and data, the NLM and the U.S.
18
* Government do not and cannot warrant the performance or results that
19
* may be obtained by using this software or data. The NLM and the U.S.
20
* Government disclaim all warranties, express or implied, including
21
* warranties of performance, merchantability or fitness for any particular
24
* ===========================================================================
26
* File Name: asn2fsa.c
28
* Author: Jonathan Kans
30
* Version Creation Date: 3/4/04
37
* --------------------------------------------------------------------------
38
* Date Name Description of modification
39
* ------- ---------- -----------------------------------------------------
42
* ==========================================================================
60
#ifdef INTERNAL_NCBI_ASN2FSA
61
#include <accpubseq.h>
64
#define ASN2FSA_APP_VER "1.2"
66
CharPtr ASN2FSA_APPLICATION = ASN2FSA_APP_VER;
68
static ValNodePtr requested_uid_list = NULL;
69
static TNlmMutex requested_uid_mutex = NULL;
71
static ValNodePtr locked_bsp_list = NULL;
72
static TNlmMutex locked_bsp_mutex = NULL;
74
static void AddUidToQueue (
79
ValNodePtr last = NULL, vnp;
83
if (sip == NULL || sip->choice != SEQID_GI) return;
84
uid = (Int4) sip->data.intvalue;
87
ret = NlmMutexLockEx (&requested_uid_mutex);
89
ErrPostEx (SEV_FATAL, 0, 0, "AddUidToQueue mutex failed [%ld]", (long) ret);
93
/* check against uids already in queue */
96
for (vnp = requested_uid_list; vnp != NULL; vnp = vnp->next) {
98
if ((Int4) vnp->data.intvalue == uid) break;
101
/* add uid to queue */
105
vnp = ValNodeAddInt (&last, 0, uid);
108
requested_uid_list = ValNodeAddInt (NULL, 0, uid);
109
last = requested_uid_list;
113
NlmMutexUnlock (requested_uid_mutex);
116
static Int4 RemoveUidFromQueue (
124
ret = NlmMutexLockEx (&requested_uid_mutex);
126
ErrPostEx (SEV_FATAL, 0, 0, "RemoveUidFromQueue mutex failed [%ld]", (long) ret);
130
/* extract next requested uid from queue */
132
if (requested_uid_list != NULL) {
133
vnp = requested_uid_list;
134
requested_uid_list = vnp->next;
136
uid = (Int4) vnp->data.intvalue;
140
NlmMutexUnlock (requested_uid_mutex);
145
static void QueueFarSegments (SeqLocPtr slp)
153
if (slp == NULL) return;
155
sip = SeqLocId (slp);
157
loc = SeqLocFindNext (slp, NULL);
159
sip = SeqLocId (loc);
162
if (sip == NULL) return;
164
/* if packaged in record, no need to fetch it */
166
if (BioseqFindCore (sip) != NULL) return;
168
/* check against currently locked records */
170
for (vnp = locked_bsp_list; vnp != NULL; vnp = vnp->next) {
171
bsp = (BioseqPtr) vnp->data.ptrvalue;
172
if (bsp == NULL) continue;
173
if (SeqIdIn (sip, bsp->id)) return;
179
static void QueueFarBioseqs (BioseqPtr bsp, Pointer userdata)
183
SeqLocPtr slp = NULL;
186
if (bsp == NULL) return;
188
if (bsp->repr == Seq_repr_seg) {
189
vn.choice = SEQLOC_MIX;
191
vn.data.ptrvalue = bsp->seq_ext;
193
while ((slp = SeqLocFindNext (&vn, slp)) != NULL) {
194
if (slp != NULL && slp->choice != SEQLOC_NULL) {
195
QueueFarSegments (slp);
198
} else if (bsp->repr == Seq_repr_delta) {
199
for (dsp = (DeltaSeqPtr) (bsp->seq_ext); dsp != NULL; dsp = dsp->next) {
200
if (dsp->choice == 1) {
201
slp = (SeqLocPtr) dsp->data.ptrvalue;
202
if (slp != NULL && slp->choice != SEQLOC_NULL) {
203
QueueFarSegments (slp);
210
static void AddBspToList (
218
if (bsp == NULL) return;
220
ret = NlmMutexLockEx (&locked_bsp_mutex);
222
ErrPostEx (SEV_FATAL, 0, 0, "AddBspToList mutex failed [%ld]", (long) ret);
226
vnp = ValNodeAddPointer (&locked_bsp_list, 0, (Pointer) bsp);
228
NlmMutexUnlock (locked_bsp_mutex);
231
static ValNodePtr ExtractBspList (
239
ret = NlmMutexLockEx (&locked_bsp_mutex);
241
ErrPostEx (SEV_FATAL, 0, 0, "ExtractBspList mutex failed [%ld]", (long) ret);
245
vnp = locked_bsp_list;
246
locked_bsp_list = NULL;
248
NlmMutexUnlock (locked_bsp_mutex);
253
typedef struct fastaflags {
254
Boolean master_style;
256
Boolean far_genomic_qual;
257
Boolean qual_gap_is_zero;
274
} FastaFlagData, PNTR FastaFlagPtr;
276
/*
 * DoAsyncLookup - worker routine handed to NlmThreadCreate by
 * ProcessAsyncLookups.
 *
 * arg is cast to a FastaFlagPtr.  The visible body takes uids from
 * RemoveUidFromQueue, places each one in a zeroed stack ValNode marked
 * SEQID_GI, and locks the corresponding Bioseq with BioseqLockById.
 * ReadDBBioseqFetchEnable is called before the lookups and
 * ReadDBBioseqFetchDisable afterwards.
 *
 * Returns a VoidPtr; when arg is NULL the function returns NULL at once.
 */
static VoidPtr DoAsyncLookup (
286
ffp = (FastaFlagPtr) arg;
287
/* function yields a VoidPtr, so the early exit must return a value */
if (ffp == NULL) return NULL;
289
#ifdef INTERNAL_NCBI_ASN2FSA
290
if (ffp->usePUBSEQ) {
295
ReadDBBioseqFetchEnable ("asn2fsa", ffp->blastdbname, TRUE, FALSE);
298
/* vn is a temporary Seq-id node that is reused for every uid */
MemSet ((Pointer) &vn, 0, sizeof (ValNode));
300
uid = RemoveUidFromQueue ();
303
vn.choice = SEQID_GI;
304
vn.data.intvalue = uid;
307
bsp = BioseqLockById (&vn);
312
/* fetch the next queued uid */
uid = RemoveUidFromQueue ();
316
ReadDBBioseqFetchDisable ();
318
#ifdef INTERNAL_NCBI_ASN2FSA
319
if (ffp->usePUBSEQ) {
325
#define NUM_ASYNC_LOOKUP_THREADS 5
327
static void ProcessAsyncLookups (
334
TNlmThread thds [NUM_ASYNC_LOOKUP_THREADS];
336
/* spawn several threads for individual BioseqLockById requests */
338
for (i = 0; i < NUM_ASYNC_LOOKUP_THREADS; i++) {
339
thds [i] = NlmThreadCreate (DoAsyncLookup, (Pointer) ffp);
342
/* wait for all fetching threads to complete */
344
for (i = 0; i < NUM_ASYNC_LOOKUP_THREADS; i++) {
345
NlmThreadJoin (thds [i], &status);
349
static ValNodePtr AsyncLockFarComponents (
356
ValNodePtr bsplist = NULL, sublist, vnp;
359
if (sep == NULL || ffp == NULL) return NULL;
360
oldsep = SeqEntrySetScope (sep);
362
/* add far uids to queue */
364
VisitBioseqsInSep (sep, NULL, QueueFarBioseqs);
366
/* fetching from uid list using several threads */
368
ProcessAsyncLookups (ffp);
370
sublist = ExtractBspList ();
372
/* take list, look for seg or delta, recurse */
374
while (sublist != NULL) {
375
for (vnp = sublist; vnp != NULL; vnp = vnp->next) {
376
bsp = (BioseqPtr) vnp->data.ptrvalue;
377
if (bsp == NULL) continue;
378
QueueFarBioseqs (bsp, NULL);
381
ValNodeLink (&bsplist, sublist);
384
ProcessAsyncLookups (ffp);
386
sublist = ExtractBspList ();
389
SeqEntrySetScope (oldsep);
393
/*
 * DoLockFarComponents - lock the far components of sep, choosing between
 * the threaded fetcher (AsyncLockFarComponents) and the regular
 * LockFarComponents call.
 *
 * The threaded path is taken when NlmThreadsAvailable () reports TRUE and
 * ffp->useThreads is set.  If threads were requested but are not available,
 * a "Threads not available in this executable" message is posted and the
 * regular call is used instead.  The second half of the body brackets the
 * work with GetSecs () calls (start_time / stop_time).
 *
 * Returns the ValNodePtr list produced by whichever locker ran.
 */
static ValNodePtr DoLockFarComponents (
400
time_t start_time, stop_time;
403
if (NlmThreadsAvailable () && ffp->useThreads) {
404
/* AsyncLockFarComponents takes the flags pointer as well as the entry */
return AsyncLockFarComponents (sep, ffp);
407
return LockFarComponents (sep);
410
start_time = GetSecs ();
412
if (NlmThreadsAvailable () && ffp->useThreads) {
413
rsult = AsyncLockFarComponents (sep, ffp);
414
} else if (ffp->useThreads) {
415
Message (MSG_POST, "Threads not available in this executable");
416
rsult = LockFarComponents (sep);
418
rsult = LockFarComponents (sep);
421
stop_time = GetSecs ();
426
static Boolean DeltaLitOnly (
433
if (bsp == NULL || bsp->repr != Seq_repr_delta) return FALSE;
434
for (vnp = (ValNodePtr)(bsp->seq_ext); vnp != NULL; vnp = vnp->next) {
435
if (vnp->choice == 1) return FALSE;
440
static Boolean SegHasParts (
448
if (bsp == NULL || bsp->repr != Seq_repr_seg) return FALSE;
450
if (sep == NULL) return FALSE;
452
if (sep == NULL || (! IS_Bioseq_set (sep))) return FALSE;
453
bssp = (BioseqSetPtr) sep->data.ptrvalue;
454
if (bssp != NULL && bssp->_class == BioseqseqSet_class_parts) return TRUE;
458
static void CacheFarComponents (
468
if (ffp == NULL || ffp->fr == NULL || bsplist == NULL) return;
470
for (vnp = bsplist; vnp != NULL; vnp = vnp->next) {
471
bsp = (BioseqPtr) vnp->data.ptrvalue;
472
if (bsp == NULL) continue;
474
/* cache raw and constructed, near segmented, and delta literal */
478
case Seq_repr_const :
479
if (BioseqFastaStream (bsp, ffp->fr, 0, ffp->linelen, 0, 0, TRUE) < 0) {
484
entityID = ObjMgrGetEntityIDForPointer (bsp);
485
AssignIDsInEntity (entityID, 0, NULL);
486
if (SegHasParts (bsp)) {
487
if (BioseqFastaStream (bsp, ffp->fr, 0, ffp->linelen, 0, 0, TRUE) < 0) {
492
case Seq_repr_delta :
493
if (DeltaLitOnly (bsp)) {
494
if (BioseqFastaStream (bsp, ffp->fr, 0, ffp->linelen, 0, 0, TRUE) < 0) {
505
static void PrintQualProc (
514
fp = (FILE*) userdata;
515
fprintf (fp, "%s", buf);
518
static void PrintQualScores (
526
ffp = (FastaFlagPtr) userdata;
527
if (bsp == NULL || ffp == NULL) return;
528
if (! ISA_na (bsp->mol)) return;
530
if (ffp->far_genomic_qual) {
531
PrintQualityScoresForContig (bsp, ffp->qual_gap_is_zero, ffp->ql);
533
PrintQualityScoresToBuffer (bsp, ffp->qual_gap_is_zero, ffp->ql, PrintQualProc);
537
static void ProcessSingleRecord (
549
Pointer dataptr = NULL;
550
Uint2 datatype, entityID = 0;
551
Char file [FILENAME_MAX], path [PATH_MAX];
552
StreamFlgType flags = 0;
553
FILE *fp, *ofp = NULL;
556
if (ffp == NULL) return;
561
if (suffix == NULL) {
564
StringNCpy_0 (path, directory, sizeof (path));
565
sprintf (file, "%s%s", base, suffix);
566
FileBuildPath (path, NULL, file);
568
if (StringHasNoText (path)) return;
570
if (ffp->type == 1) {
571
fp = FileOpen (path, "r");
573
Message (MSG_POSTERR, "Failed to open '%s'", path);
577
dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, FALSE, FALSE);
581
entityID = ObjMgrRegister (datatype, dataptr);
583
} else if (ffp->type >= 2 && ffp->type <= 5) {
584
aip = AsnIoOpen (path, ffp->binary? "rb" : "r");
586
Message (MSG_POSTERR, "AsnIoOpen failed for input file '%s'", path);
592
dataptr = (Pointer) SeqEntryAsnRead (aip, NULL);
593
datatype = OBJ_SEQENTRY;
596
dataptr = (Pointer) BioseqAsnRead (aip, NULL);
597
datatype = OBJ_BIOSEQ;
600
dataptr = (Pointer) BioseqSetAsnRead (aip, NULL);
601
datatype = OBJ_BIOSEQSET;
604
dataptr = (Pointer) SeqSubmitAsnRead (aip, NULL);
605
datatype = OBJ_SEQSUB;
613
entityID = ObjMgrRegister (datatype, dataptr);
616
Message (MSG_POSTERR, "Input format type '%d' unrecognized", (int) ffp->type);
620
if (entityID < 1 || dataptr == NULL) {
621
Message (MSG_POSTERR, "Data read failed for input file '%s'", path);
625
if (datatype == OBJ_SEQSUB || datatype == OBJ_SEQENTRY ||
626
datatype == OBJ_BIOSEQ || datatype == OBJ_BIOSEQSET) {
628
sep = GetTopSeqEntryForEntityID (entityID);
631
sep = SeqEntryNew ();
633
if (datatype == OBJ_BIOSEQ) {
634
bsp = (BioseqPtr) dataptr;
636
sep->data.ptrvalue = bsp;
637
SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
638
} else if (datatype == OBJ_BIOSEQSET) {
639
bssp = (BioseqSetPtr) dataptr;
641
sep->data.ptrvalue = bssp;
642
SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) bssp, sep);
644
sep = SeqEntryFree (sep);
647
sep = GetTopSeqEntryForEntityID (entityID);
651
if (ffp->expand_gaps) {
652
flags = STREAM_EXPAND_GAPS;
657
bsplist = DoLockFarComponents (sep, ffp);
658
if (bsplist != NULL && ffp->fr != NULL) {
659
CacheFarComponents (ffp, bsplist);
663
if (ffp->nt != NULL) {
664
if (SeqEntryFastaStream (sep, ffp->nt, flags, ffp->linelen, 0, 0,
665
TRUE, FALSE, ffp->master_style) < 0) {
669
if (ffp->aa != NULL) {
670
if (SeqEntryFastaStream (sep, ffp->aa, flags, ffp->linelen, 0, 0,
671
FALSE, TRUE, ffp->master_style) < 0) {
675
if (ffp->ql != NULL) {
676
VisitBioseqsInSep (sep, (Pointer) ffp, PrintQualScores);
679
bsplist = UnlockFarComponents (bsplist);
683
Message (MSG_POSTERR, "Datatype %d not recognized", (int) datatype);
686
ObjMgrFree (datatype, dataptr);
689
/*
 * ProcessMultipleRecord - batch-mode reader for a Bioseq-set release file.
 *
 * The input path is assembled from directory, base and suffix
 * (StringNCpy_0 + sprintf + FileBuildPath).  The ASN.1 types "Bioseq-set",
 * "Bioseq-set.descr" and "Bioseq-set.seq-set.E" are looked up with AsnFind.
 * When ffp->compressed is set, a decompression command is built: first
 * from the NCBI_UNCOMPRESS_BINARY environment variable, otherwise from
 * gzcat or zcat after probing each with system (); the stream is then
 * opened with popen.  Otherwise the file is opened with FileOpen.
 *
 * The AsnReadId loop reads Seq-entries with SeqEntryAsnRead.  For each one
 * the visible code locks far components, optionally caches them to
 * ffp->fr, streams FASTA to ffp->nt / ffp->aa, prints quality scores to
 * ffp->ql, and unlocks the components.  The elapsed time of each record is
 * compared against worsttime so the slowest record can be reported to
 * ffp->logfp together with the record count.
 *
 * Does nothing when ffp is NULL or the assembled path has no text.
 */
static void ProcessMultipleRecord (
699
AsnTypePtr atp, atp_bss, atp_desc, atp_se;
702
Char buf [64], cmmd [256], file [FILENAME_MAX], path [PATH_MAX], longest [64];
703
Char path1 [PATH_MAX], path2 [PATH_MAX], path3 [PATH_MAX];
704
StreamFlgType flags = 0;
707
SeqEntryPtr fsep, sep;
709
time_t starttime, stoptime, worsttime;
713
Boolean usedPopen = FALSE;
716
if (ffp == NULL) return;
721
if (suffix == NULL) {
724
StringNCpy_0 (path, directory, sizeof (path));
725
sprintf (file, "%s%s", base, suffix);
726
FileBuildPath (path, NULL, file);
728
if (StringHasNoText (path)) return;
731
if (ffp->compressed) {
732
Message (MSG_POSTERR, "Can only decompress on-the-fly on UNIX machines");
737
amp = AsnAllModPtr ();
739
Message (MSG_POSTERR, "Unable to load AsnAllModPtr");
743
atp_bss = AsnFind ("Bioseq-set");
744
if (atp_bss == NULL) {
745
Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set");
749
atp_desc = AsnFind ("Bioseq-set.descr");
750
if (atp_desc == NULL) {
751
Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.descr");
755
atp_se = AsnFind ("Bioseq-set.seq-set.E");
756
if (atp_se == NULL) {
757
Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.seq-set.E");
762
if (ffp->compressed) {
763
/* environment variable names use underscores; a hyphenated name cannot be set from a POSIX shell */
gzcatprog = getenv ("NCBI_UNCOMPRESS_BINARY");
764
if (gzcatprog != NULL) {
765
sprintf (cmmd, "%s %s", gzcatprog, path);
767
/* probe for gzcat on the PATH before falling back to zcat */
ret = system ("gzcat -h >/dev/null 2>&1");
769
sprintf (cmmd, "gzcat %s", path);
770
} else if (ret == -1) {
771
Message (MSG_POSTERR, "Unable to fork or exec gzcat in ScanBioseqSetRelease");
774
ret = system ("zcat -h >/dev/null 2>&1");
776
sprintf (cmmd, "zcat %s", path);
777
} else if (ret == -1) {
778
Message (MSG_POSTERR, "Unable to fork or exec zcat in ScanBioseqSetRelease");
781
Message (MSG_POSTERR, "Unable to find zcat or gzcat in ScanBioseqSetRelease - please edit your PATH environment variable");
786
fp = popen (cmmd, /* ffp->binary? "rb" : */ "r");
789
fp = FileOpen (path, ffp->binary? "rb" : "r");
792
fp = FileOpen (path, ffp->binary? "rb" : "r");
795
Message (MSG_POSTERR, "FileOpen failed for input file '%s'", path);
799
aip = AsnIoNew (ffp->binary? ASNIO_BIN_IN : ASNIO_TEXT_IN, fp, NULL, NULL, NULL);
801
Message (MSG_ERROR, "AsnIoNew failed for input file '%s'", path);
806
tfp = FileOpen (path1, "w");
811
tfp = FileOpen (path2, "w");
816
tfp = FileOpen (path3, "w");
822
if (ffp->expand_gaps) {
823
flags = STREAM_EXPAND_GAPS;
829
while ((atp = AsnReadId (aip, amp, atp)) != NULL) {
831
sep = SeqEntryAsnRead (aip, atp);
833
starttime = GetSecs ();
836
/* when logging, record the FASTA-long id of the first Bioseq in the entry */
if (ffp->logfp != NULL) {
837
fsep = FindNthBioseq (sep, 1);
838
if (fsep != NULL && fsep->choice == 1) {
839
bsp = (BioseqPtr) fsep->data.ptrvalue;
841
SeqIdWrite (bsp->id, buf, PRINTID_FASTA_LONG, sizeof (buf));
842
fprintf (ffp->logfp, "%s\n", buf);
850
bsplist = DoLockFarComponents (sep, ffp);
851
if (bsplist != NULL && ffp->fr != NULL) {
852
CacheFarComponents (ffp, bsplist);
856
if (ffp->nt != NULL) {
857
SeqEntryFastaStream (sep, ffp->nt, flags, ffp->linelen, 0, 0, TRUE, FALSE, ffp->master_style);
859
if (ffp->aa != NULL) {
860
SeqEntryFastaStream (sep, ffp->aa, flags, ffp->linelen, 0, 0, FALSE, TRUE, ffp->master_style);
862
if (ffp->ql != NULL) {
863
VisitBioseqsInSep (sep, (Pointer) ffp, PrintQualScores);
866
bsplist = UnlockFarComponents (bsplist);
868
stoptime = GetSecs ();
869
/* remember the slowest record seen so far and its id */
if (stoptime - starttime > worsttime && StringDoesHaveText (buf)) {
870
worsttime = stoptime - starttime;
871
StringCpy (longest, buf);
880
AsnReadVal (aip, atp, NULL);
884
AsnIoFree (aip, FALSE);
896
if (ffp->logfp != NULL && (! StringHasNoText (longest))) {
897
fprintf (ffp->logfp, "Longest processing time %ld seconds on %s\n",
898
(long) worsttime, longest);
899
fprintf (ffp->logfp, "Total number of records %ld\n", (long) numrecords);
903
sprintf (cmmd, "rm %s; rm %s; rm %s", path1, path2, path3);
907
static void ProcessOneRecord (
915
if (ffp == NULL) return;
918
ProcessMultipleRecord (directory, base, suffix, ffp);
920
ProcessSingleRecord (directory, base, suffix, ffp);
924
static void ProcessOneSeqEntry (
932
StreamFlgType flags = 0;
934
if (sep == NULL || ffp == NULL) return;
936
if (ffp->expand_gaps) {
937
flags = STREAM_EXPAND_GAPS;
942
bsplist = DoLockFarComponents (sep, ffp);
943
if (bsplist != NULL && ffp->fr != NULL) {
944
CacheFarComponents (ffp, bsplist);
948
if (ffp->nt != NULL) {
949
if (SeqEntryFastaStream (sep, ffp->nt, flags, ffp->linelen, 0, 0,
950
TRUE, FALSE, ffp->master_style) < 0) {
954
if (ffp->aa != NULL) {
955
if (SeqEntryFastaStream (sep, ffp->aa, flags, ffp->linelen, 0, 0,
956
FALSE, TRUE, ffp->master_style) < 0) {
960
if (ffp->ql != NULL) {
961
VisitBioseqsInSep (sep, (Pointer) ffp, PrintQualScores);
964
bsplist = UnlockFarComponents (bsplist);
967
static void FileRecurse (
976
Char path [PATH_MAX];
978
ValNodePtr head, vnp;
980
/* get list of all files in source directory */
982
head = DirCatalog (directory);
984
for (vnp = head; vnp != NULL; vnp = vnp->next) {
985
if (vnp->choice == 0) {
986
if (StringHasNoText (subdir) || StringStr (directory, subdir) != NULL) {
987
str = (CharPtr) vnp->data.ptrvalue;
988
if (! StringHasNoText (str)) {
990
/* does filename have desired substring? */
992
ptr = StringStr (str, suffix);
996
/* process file that has desired suffix (usually .ent) */
998
ProcessOneRecord (directory, str, suffix, ffp);
1002
} else if (vnp->choice == 1 && dorecurse) {
1004
/* recurse into subdirectory */
1006
StringNCpy_0 (path, directory, sizeof (path));
1007
str = (CharPtr) vnp->data.ptrvalue;
1008
FileBuildPath (path, str, NULL);
1010
FileRecurse (path, str, suffix, dorecurse, ffp);
1014
/* clean up file list */
1016
ValNodeFreeData (head);
1019
static SeqEntryPtr SeqEntryFromAccnOrGi (
1027
SeqEntryPtr sep = NULL;
1032
if (StringHasNoText (accn)) return NULL;
1034
TrimSpacesAroundString (accn);
1039
while (ch != '\0') {
1040
if (! IS_DIGIT (ch)) {
1048
if (sscanf (accn, "%ld", &val) == 1) {
1052
sip = SeqIdFromAccessionDotVersion (accn);
1054
uid = GetGIForSeqId (sip);
1060
sep = PubSeqSynchronousQuery (uid, 0, -1);
1066
/* Args structure contains command-line arguments */
1068
#define p_argInputPath 0
1069
#define i_argInputFile 1
1070
#define o_argNtOutFile 2
1071
#define v_argAaOutFile 3
1072
#define q_argQlOutFile 4
1073
#define x_argSuffix 5
1074
#define u_argRecurse 6
1075
#define m_argMaster 7
1076
#define g_argExpandGaps 8
1077
#define s_argGenomicQual 9
1078
#define z_argZeroQualGap 10
1079
#define a_argType 11
1080
#define b_argBinary 12
1081
#define c_argCompressed 13
1082
#define r_argRemote 14
1083
#define f_argFastaIdx 15
1084
#define d_argBlastDB 16
1085
#define k_argLocalFetch 17
1086
#define l_argLockFar 18
1087
#define h_argFarOutFile 19
1088
#define e_argLineLength 20
1089
#define T_argThreads 21
1090
#define L_argLogFile 22
1091
#define A_argAccession 23
1094
{"Path to ASN.1 Files", NULL, NULL, NULL,
1095
TRUE, 'p', ARG_STRING, 0.0, 0, NULL},
1096
{"Single Input File", "stdin", NULL, NULL,
1097
TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
1098
{"Nucleotide Output File Name", NULL, NULL, NULL,
1099
TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
1100
{"Protein Output File Name", NULL, NULL, NULL,
1101
TRUE, 'v', ARG_FILE_OUT, 0.0, 0, NULL},
1102
{"Quality Score Output File Name", NULL, NULL, NULL,
1103
TRUE, 'q', ARG_FILE_OUT, 0.0, 0, NULL},
1104
{"File Selection Substring", ".ent", NULL, NULL,
1105
TRUE, 'x', ARG_STRING, 0.0, 0, NULL},
1106
{"Recurse", "F", NULL, NULL,
1107
TRUE, 'u', ARG_BOOLEAN, 0.0, 0, NULL},
1108
{"Master Style for Near Segmented Sequences", "F", NULL, NULL,
1109
TRUE, 'm', ARG_BOOLEAN, 0.0, 0, NULL},
1110
{"Expand Delta Gaps into Ns", "F", NULL, NULL,
1111
TRUE, 'g', ARG_BOOLEAN, 0.0, 0, NULL},
1112
{"Far Genomic Contig for Quality Scores", "F", NULL, NULL,
1113
TRUE, 's', ARG_BOOLEAN, 0.0, 0, NULL},
1114
{"Print Quality Score Gap as -1", "F", NULL, NULL,
1115
TRUE, 'z', ARG_BOOLEAN, 0.0, 0, NULL},
1116
{"ASN.1 Type (a Any, e Seq-entry, b Bioseq, s Bioseq-set, m Seq-submit, t Batch Processing)", "a", NULL, NULL,
1117
TRUE, 'a', ARG_STRING, 0.0, 0, NULL},
1118
{"Bioseq-set is Binary", "F", NULL, NULL,
1119
TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL},
1120
{"Bioseq-set is Compressed", "F", NULL, NULL,
1121
TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL},
1122
{"Remote Fetching from ID", "F", NULL, NULL,
1123
TRUE, 'r', ARG_BOOLEAN, 0.0, 0, NULL},
1124
{"Path to Indexed FASTA Data", NULL, NULL, NULL,
1125
TRUE, 'f', ARG_STRING, 0.0, 0, NULL},
1126
{"Path to ReadDB Database", NULL, NULL, NULL,
1127
TRUE, 'd', ARG_STRING, 0.0, 0, NULL},
1128
{"Local Fetching", "F", NULL, NULL,
1129
TRUE, 'k', ARG_BOOLEAN, 0.0, 0, NULL},
1130
{"Lock Components in Advance", "F", NULL, NULL,
1131
TRUE, 'l', ARG_BOOLEAN, 0.0, 0, NULL},
1132
{"Far Component Cache Output File Name", NULL, NULL, NULL,
1133
TRUE, 'h', ARG_FILE_OUT, 0.0, 0, NULL},
1134
{"Line Length", "70", "10", "120",
1135
TRUE, 'e', ARG_INT, 0.0, 0, NULL},
1136
{"Use Threads", "F", NULL, NULL,
1137
TRUE, 'T', ARG_BOOLEAN, 0.0, 0, NULL},
1138
{"Log File", NULL, NULL, NULL,
1139
TRUE, 'L', ARG_FILE_OUT, 0.0, 0, NULL},
1140
{"Accession to Fetch", NULL, NULL, NULL,
1141
TRUE, 'A', ARG_STRING, 0.0, 0, NULL},
1147
Char app [64], sfx [32];
1148
CharPtr accn, base, blastdb, directory, fastaidx, ntout,
1149
aaout, qlout, frout, logfile, ptr, str, suffix;
1150
Boolean batch, binary, blast, compressed, dorecurse,
1151
expandgaps, fargenomicqual, fasta, local, lock,
1152
masterstyle, qualgapzero, remote, usethreads;
1154
Int2 linelen, type = 0;
1155
time_t run_time, start_time, stop_time;
1158
/* standard setup */
1160
ErrSetFatalLevel (SEV_MAX);
1161
ErrClearOptFlags (EO_SHOW_USERSTR);
1162
ErrSetLogfile ("stderr", ELOG_APPEND);
1163
UseLocalAsnloadDataAndErrMsg ();
1166
if (! AllObjLoad ()) {
1167
Message (MSG_FATAL, "AllObjLoad failed");
1170
if (! SubmitAsnLoad ()) {
1171
Message (MSG_FATAL, "SubmitAsnLoad failed");
1174
if (! FeatDefSetLoad ()) {
1175
Message (MSG_FATAL, "FeatDefSetLoad failed");
1178
if (! SeqCodeSetLoad ()) {
1179
Message (MSG_FATAL, "SeqCodeSetLoad failed");
1182
if (! GeneticCodeTableLoad ()) {
1183
Message (MSG_FATAL, "GeneticCodeTableLoad failed");
1187
/* process command line arguments */
1189
sprintf (app, "asn2fsa %s", ASN2FSA_APPLICATION);
1190
if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
1194
/* additional setup modifications */
1196
MemSet ((Pointer) &ffd, 0, sizeof (FastaFlagData));
1198
directory = (CharPtr) myargs [p_argInputPath].strvalue;
1199
suffix = (CharPtr) myargs [x_argSuffix].strvalue;
1200
base = (CharPtr) myargs [i_argInputFile].strvalue;
1201
accn = (CharPtr) myargs [A_argAccession].strvalue;
1202
dorecurse = (Boolean) myargs [u_argRecurse].intvalue;
1203
remote = (Boolean ) myargs [r_argRemote].intvalue;
1204
fastaidx = (CharPtr) myargs [f_argFastaIdx].strvalue;
1205
fasta = (Boolean) StringDoesHaveText (fastaidx);
1206
blastdb = (CharPtr) myargs [d_argBlastDB].strvalue;
1207
blast = (Boolean) StringDoesHaveText (blastdb);
1208
local = (Boolean) myargs [k_argLocalFetch].intvalue;
1209
lock = (Boolean) myargs [l_argLockFar].intvalue;
1210
linelen = (Int2) myargs [e_argLineLength].intvalue;
1211
usethreads = (Boolean) myargs [T_argThreads].intvalue;
1213
expandgaps = (Boolean) myargs [g_argExpandGaps].intvalue;
1214
masterstyle = (Boolean) myargs [m_argMaster].intvalue;
1215
fargenomicqual = (Boolean) myargs [s_argGenomicQual].intvalue;
1216
qualgapzero = (Boolean) myargs [z_argZeroQualGap].intvalue;
1218
binary = (Boolean) myargs [b_argBinary].intvalue;
1219
compressed = (Boolean) myargs [c_argCompressed].intvalue;
1221
str = myargs [a_argType].strvalue;
1222
if (StringICmp (str, "a") == 0) {
1224
} else if (StringICmp (str, "e") == 0) {
1226
} else if (StringICmp (str, "b") == 0) {
1228
} else if (StringICmp (str, "s") == 0) {
1230
} else if (StringICmp (str, "m") == 0) {
1232
} else if (StringICmp (str, "t") == 0) {
1239
if ((binary || compressed) && (! batch)) {
1241
Message (MSG_FATAL, "-b or -c cannot be used without -t or -a");
1246
if (StringHasNoText (directory) && StringHasNoText (base)) {
1247
Message (MSG_FATAL, "Input path or input file must be specified");
1251
ntout = (CharPtr) myargs [o_argNtOutFile].strvalue;
1252
aaout = (CharPtr) myargs [v_argAaOutFile].strvalue;
1253
qlout = (CharPtr) myargs [q_argQlOutFile].strvalue;
1254
frout = (CharPtr) myargs [h_argFarOutFile].strvalue;
1256
logfile = (CharPtr) myargs [L_argLogFile].strvalue;
1258
/* default to stdout for nucleotide output if nothing specified */
1260
if (StringHasNoText (ntout) &&
1261
StringHasNoText (aaout) &&
1262
StringHasNoText (qlout)) {
1266
start_time = GetSecs ();
1268
/* populate parameter structure */
1270
ffd.expand_gaps = expandgaps;
1271
ffd.master_style = masterstyle;
1272
ffd.far_genomic_qual = fargenomicqual;
1273
ffd.qual_gap_is_zero = (Boolean) (! qualgapzero);
1275
ffd.binary = binary;
1276
ffd.compressed = compressed;
1278
ffd.useThreads = usethreads;
1280
ffd.linelen = linelen;
1288
if (! StringHasNoText (ntout)) {
1289
ffd.nt = FileOpen (ntout, "w");
1290
if (ffd.nt == NULL) {
1291
Message (MSG_FATAL, "Unable to open nucleotide output file");
1296
if (! StringHasNoText (aaout)) {
1297
ffd.aa = FileOpen (aaout, "w");
1298
if (ffd.aa == NULL) {
1299
Message (MSG_FATAL, "Unable to open protein output file");
1304
if (! StringHasNoText (qlout)) {
1305
ffd.ql = FileOpen (qlout, "w");
1306
if (ffd.ql == NULL) {
1307
Message (MSG_FATAL, "Unable to open quality score output file");
1312
if (! StringHasNoText (frout)) {
1313
ffd.fr = FileOpen (frout, "w");
1314
if (ffd.fr == NULL) {
1315
Message (MSG_FATAL, "Unable to open far component cache output file");
1321
if (! StringHasNoText (logfile)) {
1322
ffd.logfp = FileOpen (logfile, "w");
1323
if (ffd.logfp == NULL) {
1324
Message (MSG_FATAL, "Unable to open log file");
1329
/* register fetch functions */
1332
#ifdef INTERNAL_NCBI_ASN2FSA
1333
if (! PUBSEQBioseqFetchEnable ("asn2fsa", FALSE)) {
1334
Message (MSG_POSTERR, "PUBSEQBioseqFetchEnable failed");
1337
ffd.usePUBSEQ = TRUE;
1338
ffd.useThreads = FALSE;
1340
PubSeqFetchEnable ();
1345
ptr = StringRChr (blastdb, DIRDELIMCHR);
1349
TransientSetAppParam ("NCBI", "BLAST", "BLASTDB", blastdb);
1350
if (StringDoesHaveText (ptr)) {
1351
ReadDBBioseqFetchEnable ("asn2fsa", ptr, TRUE, FALSE);
1352
ffd.blastdbname = ptr;
1353
ffd.useBLAST = TRUE;
1355
ReadDBBioseqFetchEnable ("asn2fsa", "nr", TRUE, FALSE);
1356
ffd.blastdbname = "nr";
1357
ffd.useBLAST = TRUE;
1360
ReadDBBioseqFetchEnable ("asn2fsa", blastdb, TRUE, FALSE);
1361
ffd.blastdbname = blastdb;
1362
ffd.useBLAST = TRUE;
1367
AltIndexedFastaLibFetchEnable (fastaidx);
1371
LocalSeqFetchInit (FALSE);
1374
/* recurse through all files within source directory or subdirectories */
1376
if (StringDoesHaveText (accn)) {
1379
sep = SeqEntryFromAccnOrGi (accn);
1381
ProcessOneSeqEntry (sep, &ffd);
1386
} else if (StringDoesHaveText (directory)) {
1388
FileRecurse (directory, NULL, suffix, dorecurse, &ffd);
1390
} else if (StringDoesHaveText (base)) {
1392
ptr = StringRChr (base, '.');
1394
StringNCpy_0 (sfx, ptr, sizeof (sfx));
1397
ProcessOneRecord (directory, base, sfx, &ffd);
1400
if (ffd.nt != NULL) {
1403
if (ffd.aa != NULL) {
1406
if (ffd.ql != NULL) {
1409
if (ffd.fr != NULL) {
1411
CreateFastaIndex (frout);
1414
stop_time = GetSecs ();
1415
run_time = stop_time - start_time;
1417
if (ffd.logfp != NULL) {
1418
fprintf (ffd.logfp, "Finished in %ld seconds\n", (long) run_time);
1419
FileClose (ffd.logfp);
1422
/* close fetch functions */
1425
LocalSeqFetchDisable ();
1429
AltIndexedFastaLibFetchDisable ();
1433
ReadDBBioseqFetchDisable ();
1437
#ifdef INTERNAL_NCBI_ASN2FSA
1438
PUBSEQBioseqFetchDisable ();
1440
PubSeqFetchDisable ();