2
* ===========================================================================
5
* National Center for Biotechnology Information (NCBI)
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's official duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government do not place any restriction on its use or reproduction.
13
* We would, however, appreciate having the NCBI and the author cited in
14
* any work or product based on this material
16
* Although all reasonable efforts have been taken to ensure the accuracy
17
* and reliability of the software and data, the NLM and the U.S.
18
* Government do not and cannot warrant the performance or results that
19
* may be obtained by using this software or data. The NLM and the U.S.
20
* Government disclaim all warranties, express or implied, including
21
* warranties of performance, merchantability or fitness for any particular
24
* ===========================================================================
26
* File Name: tbl2asn.c
28
* Author: Jonathan Kans
30
* Version Creation Date: 5/5/00
37
* --------------------------------------------------------------------------
38
* Date Name Description of modification
39
* ------- ---------- -----------------------------------------------------
42
* ==========================================================================
62
/* OpenOneFile composes a path from directory, base and suffix and opens it
   for reading, returning whatever FileOpen returns.
   NOTE(review): the parameter list and braces are not present in this
   excerpt; the names directory, base and suffix are taken from their uses. */
static FILE* OpenOneFile (
69
Char file [FILENAME_MAX], path [PATH_MAX];
77
/* seed path with the directory, bounded by the size of the buffer */
StringNCpy_0 (path, directory, sizeof (path));
78
/* file name is base immediately followed by suffix.
   NOTE(review): sprintf is unbounded - confirm base + suffix always fit
   in FILENAME_MAX. */
sprintf (file, "%s%s", base, suffix);
79
/* presumably appends file to path with the platform separator - confirm */
FileBuildPath (path, NULL, file);
81
return FileOpen (path, "r");
84
/* WriteOneFile writes sep to the ASN.1 file named from results, base and
   suffix.  Both a Seq-submit write and a bare Seq-entry write are visible;
   the condition that selects between them is not present in this excerpt.
   NOTE(review): parameter list, braces and the closing of aip are not
   visible here. */
static void WriteOneFile (
94
Char file [FILENAME_MAX], path [PATH_MAX];
97
/* ssb is a zeroed SeqSubmit that wraps sep for the submission form */
MemSet ((Pointer) &ssb, 0, sizeof (SeqSubmit));
100
ssb.data = (Pointer) sep;
102
StringNCpy_0 (path, results, sizeof (path));
103
/* NOTE(review): unbounded sprintf into a FILENAME_MAX buffer */
sprintf (file, "%s%s", base, suffix);
104
FileBuildPath (path, NULL, file);
106
aip = AsnIoOpen (path, "w");
107
/* nothing is written when the output stream cannot be opened */
if (aip == NULL) return;
110
SeqSubmitAsnWrite (&ssb, aip, NULL);
112
SeqEntryAsnWrite (sep, aip, NULL);
119
/* ValidateOneFile runs ValidateSeqEntry on sep while the error log file is
   set to the path built from results, base and suffix, then restores the
   previous error settings.
   NOTE(review): parameter list and braces are not present in this excerpt. */
static void ValidateOneFile (
127
Char file [FILENAME_MAX], path [PATH_MAX];
131
StringNCpy_0 (path, results, sizeof (path));
132
/* NOTE(review): unbounded sprintf into a FILENAME_MAX buffer */
sprintf (file, "%s%s", base, suffix);
133
FileBuildPath (path, NULL, file);
135
/* turn on user-file logging and point the log at the composed path */
ErrSetOptFlags (EO_LOGTO_USRFILE);
136
ErrSetLogfile (path, ELOG_APPEND | ELOG_NOCREATE);
138
vsp = ValidStructNew ();
140
vsp->useSeqMgrIndexes = TRUE;
141
vsp->suppressContext = TRUE;
142
/* remember the current message level so it can be put back after validation */
oldErrSev = ErrSetMessageLevel (SEV_NONE);
143
ValidateSeqEntry (sep, vsp);
144
ValidStructFree (vsp);
145
ErrSetMessageLevel (oldErrSev);
148
/* undo the log file and option flag set above */
ErrSetLogfile (NULL, ELOG_APPEND | ELOG_NOCREATE);
149
ErrClearOptFlags (EO_LOGTO_USRFILE);
152
/* FlatfileOneFile opens the file named from results, base and suffix for
   writing and passes it to SeqEntryToGnbk with GENBANK_FMT, ENTREZ_MODE and
   NORMAL_STYLE.
   NOTE(review): parameter list, braces and the closing of fp are not
   present in this excerpt. */
static void FlatfileOneFile (
160
Char file [FILENAME_MAX], path [PATH_MAX];
164
StringNCpy_0 (path, results, sizeof (path));
165
/* NOTE(review): unbounded sprintf into a FILENAME_MAX buffer */
sprintf (file, "%s%s", base, suffix);
166
FileBuildPath (path, NULL, file);
168
fp = FileOpen (path, "w");
169
/* no flatfile is produced when the output cannot be opened */
if (fp == NULL) return;
171
/* message level is raised to SEV_MAX for the duration of the call and
   restored afterwards */
oldErrSev = ErrSetMessageLevel (SEV_MAX);
172
SeqEntryToGnbk (sep, NULL, GENBANK_FMT, ENTREZ_MODE, NORMAL_STYLE, 0, fp);
173
ErrSetMessageLevel (oldErrSev);
178
/* for full-length cDNAs, allow automatic annotation of largest internal ORF */
180
/* OrfData is the state passed as userdata to LookForOrfs.  Every array has
   six slots, matching the 6-frame translation used by AnnotateBestOrf. */
typedef struct orfdata {
181
/* curlen/currstart track the open reading frame being extended, bestlen and
   beststart the longest one recorded so far; sublen is a second running
   count used in the non-plus-strand branch of LookForOrfs */
Int4 curlen [6], bestlen [6], currstart [6], beststart [6], sublen [6];
182
/* inorf: currently inside an ORF for that slot; altstart: alternative
   start codons are accepted in addition to ATG */
Boolean inorf [6], altstart;
183
} OrfData, PNTR OrfDataPtr;
185
/* LookForOrfs is the callback handed to TransTableProcessBioseq by
   AnnotateBestOrf.  It updates the OrfData in userdata for slot idx, keeping
   the longest reading frame seen so far in bestlen/beststart.
   NOTE(review): the parameter list, the computation of idx and several
   conditions (apparently the stop-codon tests) are not present in this
   excerpt, so the branch descriptions below cover only the visible lines. */
static void LIBCALLBACK LookForOrfs (
200
odp = (OrfDataPtr) userdata;
201
if (strand == Seq_strand_plus) {
206
if (odp->inorf [idx]) {
208
/* the open frame ends here: record it if it beats the best so far */
odp->inorf [idx] = FALSE;
209
if (odp->curlen [idx] > odp->bestlen [idx]) {
210
odp->bestlen [idx] = odp->curlen [idx];
211
odp->beststart [idx] = odp->currstart [idx];
214
/* otherwise the open frame grows by one */
(odp->curlen [idx])++;
216
/* not in a frame: ATG, or an alternative start when permitted, opens one */
} else if (atgStart || (altStart && odp->altstart)) {
217
odp->inorf [idx] = TRUE;
218
odp->curlen [idx] = 1;
219
odp->currstart [idx] = position - frame;
227
/* other-strand branch: both counters are reset and the start position is
   re-anchored at the current position */
odp->curlen [idx] = 0;
228
odp->sublen [idx] = 0;
229
odp->currstart [idx] = position - frame;
230
} else if (atgStart || (altStart && odp->altstart)) {
231
/* a start codon makes the running count the candidate length, which is
   compared against the best immediately */
(odp->sublen [idx])++;
232
odp->curlen [idx] = odp->sublen [idx];
233
if (odp->curlen [idx] > odp->bestlen [idx]) {
234
odp->bestlen [idx] = odp->curlen [idx];
235
odp->beststart [idx] = odp->currstart [idx];
238
(odp->sublen [idx])++;
243
/* AnnotateBestOrf scans bsp with the 6-frame translation state machine,
   picks the slot with the largest bestlen, creates a CDS feature over it,
   and then fills in protein, comment, evidence and gene information parsed
   from the title tags in stp.

   Returns the CDS feature, or NULL when bsp is NULL, no slot was selected,
   or the CDS / CdRegion could not be created.
   NOTE(review): the parameter list, the initialisation of bestlen, best and
   idx, and the plus/minus selection are not present in this excerpt. */
static SeqFeatPtr AnnotateBestOrf (
252
SeqFeatPtr cds = NULL;
265
if (bsp == NULL) return NULL;
266
/* reset the per-slot state before scanning */
for (i = 0; i < 6; i++) {
267
od.curlen [i] = INT4_MIN;
269
od.currstart [i] = 0;
270
od.beststart [i] = 0;
271
od.sublen [i] = INT4_MIN;
272
od.inorf [i] = FALSE;
274
od.altstart = altstart;
276
/* use simultaneous 6-frame translation finite state machine */
278
tbl = PersistentTransTableByGenCode (genCode);
280
TransTableProcessBioseq (tbl, LookForOrfs, (Pointer) &od, bsp);
282
/* TransTableFree (tbl); - now using persistent tables, free at end */
285
/* choose the slot whose recorded frame is longest */
for (i = 0; i < 6; i++) {
286
if (od.bestlen [i] > best) {
287
best = od.bestlen [i];
291
if (idx == -1) return NULL;
293
/* make feature location on largest ORF */
296
/* plus-strand interval: bestlen is multiplied by 3 to get a nucleotide
   span, with 2 added for the last codon's remaining bases */
MemSet ((Pointer) &sint, 0, sizeof (SeqInt));
297
sint.from = od.beststart [idx] + idx;
298
sint.to = sint.from + (od.bestlen [idx]) * 3 + 2;
299
sint.id = SeqIdFindBest (bsp->id, 0);
300
sint.strand = Seq_strand_plus;
301
vn.choice = SEQLOC_INT;
303
vn.data.ptrvalue = (Pointer) &sint;
306
/* minus-strand interval: same construction with the start shifted back
   by 3 - NOTE(review): reason for the shift is not shown here */
MemSet ((Pointer) &sint, 0, sizeof (SeqInt));
307
sint.from = od.beststart [idx] + idx - 3;
308
sint.to = sint.from + (od.bestlen [idx]) * 3 + 2;
309
sint.id = SeqIdFindBest (bsp->id, 0);
310
sint.strand = Seq_strand_minus;
311
vn.choice = SEQLOC_INT;
313
vn.data.ptrvalue = (Pointer) &sint;
317
/* make CDS feature with unknown product - now check [protein=...] */
319
cds = CreateNewFeatureOnBioseq (bsp, SEQFEAT_CDREGION, &vn);
320
if (cds == NULL) return NULL;
321
crp = CreateNewCdRgn (1, FALSE, genCode);
322
/* NOTE(review): returns NULL here although cds has already been created
   on bsp */
if (crp == NULL) return NULL;
324
cds->data.value.ptrvalue = (Pointer) crp;
327
if (prp == NULL) return cds;
328
/* the protein reference is carried on the CDS as a feature xref */
xref = SeqFeatXrefNew ();
329
if (xref == NULL) return cds;
330
xref->data.choice = SEQFEAT_PROT;
331
xref->data.value.ptrvalue = (Pointer) prp;
332
xref->next = cds->xref;
334
prp = ParseTitleIntoProtRef (stp, prp);
335
/* fall back to the name "unknown" when the title supplied none */
if (prp->name == NULL) {
336
prp->name = ValNodeCopyStr (NULL, 0, "unknown");
339
/* parse CDS comment ("note" goes to biosource) and experimental evidence */
341
str = SqnTagFind (stp, "comment");
342
if (! StringHasNoText (str)) {
343
cds->comment = StringSave (str);
346
str = SqnTagFind (stp, "evidence");
347
if (StringICmp (str, "experimental") == 0) {
351
/* now check [gene=...], make gene feature if locus or synonym present */
354
if (grp == NULL) return cds;
355
grp = ParseTitleIntoGeneRef (stp, grp);
356
if (grp->locus == NULL && grp->syn == NULL) {
360
sfp = CreateNewFeatureOnBioseq (bsp, SEQFEAT_GENE, NULL);
361
if (sfp == NULL) return cds;
362
sfp->data.value.ptrvalue = (Pointer) grp;
367
/* change all feature IDs to entered accession */
369
/* PromoteSeqId is the per-SeqId callback used by CorrectFeatureSeqIds.  It
   overwrites the choice and data of sip in place with those of a duplicate
   of the SeqId passed through userdata, then calls SeqIdStripLocus on sip.

   sip       SeqId to rewrite in place
   userdata  SeqIdPtr whose duplicate supplies the new contents

   NOTE(review): braces and the cleanup of oldid/newid are not present in
   this excerpt. */
static void PromoteSeqId (SeqIdPtr sip, Pointer userdata)
372
SeqIdPtr bestid, newid, oldid;
374
bestid = (SeqIdPtr) userdata;
376
newid = SeqIdDup (bestid);
377
if (newid == NULL) return;
379
/* oldid receives a copy of the node being replaced.
   NOTE(review): in the lines visible here newid is not released when this
   allocation fails - confirm and free it on this path if so */
oldid = ValNodeNew (NULL);
380
if (oldid == NULL) return;
382
MemCopy (oldid, sip, sizeof (ValNode));
385
/* sip keeps its place in its list; only its contents change */
sip->choice = newid->choice;
386
sip->data.ptrvalue = newid->data.ptrvalue;
391
SeqIdStripLocus (sip);
394
/* CorrectFeatureSeqIds applies PromoteSeqId to every SeqId in the feature's
   location, forwarding userdata unchanged.  It is passed to
   VisitFeaturesOnBsp by ProcessOneAnnot.
   NOTE(review): parameter list and braces are not present in this excerpt. */
static void CorrectFeatureSeqIds (
400
VisitSeqIdsInSeqLoc (sfp->location, userdata, PromoteSeqId);
403
/* source information for several common organisms sequenced by genome centers */
405
/* OrgStuff is one row of the commonOrgStuff table.  The fields read
   elsewhere in this file are taxname, common, lineage, division, gcode,
   mgcode and taxID; the member declarations themselves are not present in
   this excerpt.
   NOTE(review): the pointer typedef is spelled OrfStuffPtr, which looks
   like a typo for OrgStuffPtr; renaming it would require updating every
   use. */
typedef struct orgstuff {
413
} OrgStuff, PNTR OrfStuffPtr;
415
/* commonOrgStuff lists source information for organisms looked up by
   AddMissingSourceInfo.  Each visible row gives the taxonomy name, the
   common name (empty string when there is none) and the lineage; the rest
   of each row is not present in this excerpt.  The table ends with a row
   whose taxname is NULL, which is what the lookup loop tests for. */
static OrgStuff commonOrgStuff [] = {
417
"Saccharomyces cerevisiae", "baker's yeast",
418
"Eukaryota; Fungi; Ascomycota; Saccharomycetes; Saccharomycetales; Saccharomycetaceae; Saccharomyces",
422
"Drosophila melanogaster", "fruit fly",
423
"Eukaryota; Metazoa; Arthropoda; Tracheata; Hexapoda; Insecta; Pterygota; Neoptera; Endopterygota; Diptera; Brachycera; Muscomorpha; Ephydroidea; Drosophilidae; Drosophila",
427
"Homo sapiens", "human",
428
"Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Primates; Catarrhini; Hominidae; Homo",
432
"Escherichia coli", "",
433
"Bacteria; Proteobacteria; gamma subdivision; Enterobacteriaceae; Escherichia",
437
"Helicobacter pylori", "",
438
"Bacteria; Proteobacteria; epsilon subdivision; Helicobacter group; Helicobacter",
442
"Arabidopsis thaliana", "thale cress",
443
"Eukaryota; Viridiplantae; Embryophyta; Tracheophyta; Spermatophyta; Magnoliophyta; eudicotyledons; core eudicots; Rosidae; eurosids II; Brassicales; Brassicaceae; Arabidopsis",
447
"Mus musculus", "house mouse",
448
"Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Rodentia; Sciurognathi; Muridae; Murinae; Mus",
452
"Rattus norvegicus", "Norway rat",
453
"Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Rodentia; Sciurognathi; Muridae; Murinae; Rattus",
457
"Danio rerio", "zebrafish",
458
"Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Actinopterygii; Neopterygii; Teleostei; Euteleostei; Ostariophysi; Cypriniformes; Cyprinidae; Rasborinae; Danio",
463
/* NOTE(review): the name line of this row is not present in this excerpt */
"Eukaryota; Viridiplantae; Embryophyta; Tracheophyta; Spermatophyta; Magnoliophyta; Liliopsida; Poales; Poaceae; Zea",
467
"Caenorhabditis elegans", "",
468
"Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea; Rhabditidae; Peloderinae; Caenorhabditis",
472
"Caenorhabditis briggsae", "",
473
"Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea; Rhabditidae; Peloderinae; Caenorhabditis",
477
"Anopheles gambiae", "African malaria mosquito",
478
"Eukaryota; Metazoa; Arthropoda; Tracheata; Hexapoda; Insecta; Pterygota; Neoptera; Endopterygota; Diptera; Nematocera; Culicoidea; Anopheles",
482
"Anopheles gambiae str. PEST", "African malaria mosquito",
483
"Eukaryota; Metazoa; Arthropoda; Tracheata; Hexapoda; Insecta; Pterygota; Neoptera; Endopterygota; Diptera; Nematocera; Culicoidea; Anopheles",
487
"Tetrahymena thermophila", "",
488
"Eukaryota; Alveolata; Ciliophora; Oligohymenophorea; Hymenostomatida; Tetrahymenina; Tetrahymena",
492
/* terminating row */
NULL, NULL, NULL, 0, 0, 0
496
/* HasTaxon reports whether an organism reference already carries a "taxon"
   database cross-reference.

   orp  organism reference to inspect; NULL is accepted

   Returns TRUE when some Dbtag on the orp->db list has a db name that
   StringICmp reports equal to "taxon", FALSE otherwise (including when orp
   is NULL or the list is empty). */
static Boolean HasTaxon (OrgRefPtr orp)

{
  ValNodePtr  db;
  DbtagPtr    dbt;

  /* orp is a pointer, so it is tested against NULL, not the Boolean FALSE */
  if (orp == NULL) return FALSE;
  for (db = orp->db; db != NULL; db = db->next) {
    dbt = (DbtagPtr) db->data.ptrvalue;
    /* skip list nodes with no Dbtag or no database name */
    if (dbt != NULL && dbt->db != NULL &&
        StringICmp (dbt->db, "taxon") == 0) return TRUE;
  }
  return FALSE;
}
511
/* AddMissingSourceInfo looks up the organism of biop in commonOrgStuff by
   taxname and fills in whichever of the common name, genetic codes,
   division, lineage and "taxon" db cross-reference are still empty.  Values
   already present are left alone.

   biop  source descriptor to complete; nothing is done when biop, its
         organism reference (orp) or its org name block (onp) is NULL

   NOTE(review): braces, declarations and the assignments of orp, onp and
   dbt are not present in this excerpt. */
static void AddMissingSourceInfo (BioSourcePtr biop)
522
if (biop == NULL) return;
524
if (orp == NULL) return;
526
if (onp == NULL) return;
528
/* look for entry of organisms in commonOrgStuff table */
530
/* the table is terminated by a row whose taxname is NULL */
for (idx = 0; commonOrgStuff [idx].taxname != NULL; idx++) {
531
osp = &(commonOrgStuff [idx]);
532
if (StringICmp (orp->taxname, osp->taxname) == 0) {
533
/* copy the common name only when the table actually has one */
if (StringHasNoText (orp->common) && (! StringHasNoText (osp->common))) {
534
orp->common = StringSave (osp->common);
536
/* a genetic code of 0 is treated as not set */
if (onp->gcode == 0) {
537
onp->gcode = osp->gcode;
539
if (onp->mgcode == 0) {
540
onp->mgcode = osp->mgcode;
542
if (StringHasNoText (onp->div)) {
543
onp->div = StringSave (osp->division);
545
if (StringHasNoText (onp->lineage)) {
546
onp->lineage = StringSave (osp->lineage);
548
/* add a "taxon" Dbtag carrying the table's taxID when none exists yet */
if (! HasTaxon (orp)) {
549
db = ValNodeNew (NULL);
553
oip = ObjectIdNew ();
555
oip->id = osp->taxID;
556
dbt->db = StringSave ("taxon");
558
db->data.ptrvalue = (Pointer) dbt;
568
/* SqnGetBioseqGivenSeqLoc finds the Bioseq that a location refers to.  It
   looks up the location's SeqId with BioseqFind, retries with the first
   sub-location from SeqLocFindNext, and finally calls GetBioseqGivenSeqLoc
   using the entity ID of the Bioseq found so far.

   slp       location to resolve; NULL yields NULL
   entityID  incoming entity ID; overwritten below from the found Bioseq

   NOTE(review): braces, declarations, the conditions between the steps and
   the final return are not present in this excerpt. */
static BioseqPtr SqnGetBioseqGivenSeqLoc (SeqLocPtr slp, Uint2 entityID)
576
if (slp == NULL) return NULL;
578
/* first attempt: the id of the location as a whole */
sip = SeqLocId (slp);
580
bsp = BioseqFind (sip);
582
/* second attempt: the id of the first component location */
tmp = SeqLocFindNext (slp, NULL);
584
sip = SeqLocId (tmp);
586
bsp = BioseqFind (sip);
588
/* re-derive the entity ID from the Bioseq that was found */
sep = SeqMgrGetSeqEntryForData (bsp);
589
entityID = ObjMgrGetEntityIDForChoice (sep);
590
bsp = GetBioseqGivenSeqLoc (slp, entityID);
598
/* GetBioseqReferencedByAnnot returns the first Bioseq that can be found
   from the contents of an annotation.  The data list is walked as
   features, as alignments or as graphs, and the first successful lookup is
   returned.

   sap       annotation to inspect; NULL yields NULL
   entityID  passed through to SqnGetBioseqGivenSeqLoc

   NOTE(review): braces, declarations, the tests that choose between the
   feature / alignment / graph walks, the list advance statements and the
   final return are not present in this excerpt. */
static BioseqPtr GetBioseqReferencedByAnnot (SeqAnnotPtr sap, Uint2 entityID)
612
if (sap == NULL) return NULL;
615
/* features: resolve each feature location */
feat = (SeqFeatPtr) sap->data;
616
while (feat != NULL) {
617
slp = feat->location;
619
bsp = SqnGetBioseqGivenSeqLoc (slp, entityID);
620
if (bsp != NULL) return bsp;
626
/* alignments: the id list examined depends on the segment type */
align = (SeqAlignPtr) sap->data;
627
while (align != NULL) {
628
/* segtype 1: segs is read as a DenseDiag */
if (align->segtype == 1) {
629
ddp = (DenseDiagPtr) align->segs;
631
for (sip = ddp->id; sip != NULL; sip = sip->next) {
632
bsp = BioseqFind (sip);
633
if (bsp != NULL) return bsp;
636
/* segtype 2: segs is read as a DenseSeg */
} else if (align->segtype == 2) {
637
dsp = (DenseSegPtr) align->segs;
639
for (sip = dsp->ids; sip != NULL; sip = sip->next) {
640
bsp = BioseqFind (sip);
641
if (bsp != NULL) return bsp;
644
/* segtype 3: segs is read as a StdSeg and its locations are searched */
} else if (align->segtype == 3) {
645
ssp = (StdSegPtr) align->segs;
646
if (ssp != NULL && ssp->loc != NULL) {
647
for (tloc = ssp->loc; tloc != NULL; tloc = tloc->next) {
648
bsp = BioseqFind (SeqLocId (tloc));
649
if (bsp != NULL) return bsp;
657
/* graphs: resolve each graph location (assignment of slp not shown) */
graph = (SeqGraphPtr) sap->data;
658
while (graph != NULL) {
661
bsp = SqnGetBioseqGivenSeqLoc (slp, entityID);
662
if (bsp != NULL) return bsp;
673
/* AttachSeqAnnotEntity finds the Bioseq an annotation refers to and
   attaches the annotation to it through AttachDataForProc.  For feature
   tables (sap->type == 1) it first sets empty genetic codes from the
   record and, after a successful attach, promotes xrefs.

   entityID  entity to search; replaced below by the found Bioseq's entity
   sap       annotation to attach; NULL yields NULL

   NOTE(review): braces, declarations, the conditions around the lookups
   and the return statement are not present in this excerpt. */
static BioseqPtr AttachSeqAnnotEntity (Uint2 entityID, SeqAnnotPtr sap)
678
SeqEntryPtr oldscope;
681
SeqFeatPtr sfp = NULL;
683
if (sap == NULL) return NULL;
684
bsp = GetBioseqReferencedByAnnot (sap, entityID);
686
/* retry with the scope cleared, then put the previous scope back */
oldscope = SeqEntrySetScope (NULL);
687
if (oldscope != NULL) {
688
bsp = GetBioseqReferencedByAnnot (sap, entityID);
689
SeqEntrySetScope (oldscope);
693
sep = SeqMgrGetSeqEntryForData (bsp);
694
entityID = ObjMgrGetEntityIDForChoice (sep);
695
/* type 1 annotations hold features */
if (sap->type == 1) {
696
sfp = (SeqFeatPtr) sap->data;
697
sep = GetBestTopParentForData (entityID, bsp);
698
genCode = SeqEntryToGeneticCode (sep, NULL, NULL, 0);
699
SetEmptyGeneticCodes (sap, genCode);
701
/* describe the attach: input is the Bioseq, output is the annotation */
MemSet ((Pointer) &ompc, 0, sizeof (OMProcControl));
702
ompc.input_entityID = entityID;
703
ompc.input_itemID = GetItemIDGivenPointer (entityID, OBJ_BIOSEQ, (Pointer) bsp);
704
ompc.input_itemtype = OBJ_BIOSEQ;
705
ompc.output_itemtype = OBJ_SEQANNOT;
706
ompc.output_data = (Pointer) sap;
707
if (! AttachDataForProc (&ompc, FALSE)) {
708
Message (MSG_POSTERR, "AttachSeqAnnotEntity failed");
709
} else if (sfp != NULL) {
710
PromoteXrefs (sfp, bsp, entityID);
713
/* reported when no matching Bioseq was found - presumably the else arm of
   a bsp != NULL test that is not shown here */
Message (MSG_POSTERR, "Feature table identifiers do not match record");
718
/* TrimBracketsFromString edits str in two passes, as named by the comments
   below: bracketed fields are removed, then runs of whitespace are dealt
   with.  A string with no text is returned unchanged.
   NOTE(review): almost all of the body, including the copy loops and the
   final return, is not present in this excerpt. */
static CharPtr TrimBracketsFromString (CharPtr str)
721
Uchar ch; /* to use 8bit characters in multibyte languages */
725
if (StringHasNoText (str)) return str;
727
/* remove bracketed fields */
736
/* scan forward until the closing bracket or the end of the string */
while (ch != '\0' && ch != ']') {
751
/* remove runs of whitespace characters */
757
if (IS_WHITESP (ch)) {
762
while (IS_WHITESP (ch)) {
778
/* ProcessOneNuc prepares one nucleotide Bioseq.  It parses the bracketed
   tags in the title into source, molinfo, GenBank block and history
   information, optionally annotates the best ORF, and puts the title back
   with the bracketed fields trimmed.

   It is called as ProcessOneNuc (entityID, bsp, src, organism, findorf,
   altstart) elsewhere in this file.
   NOTE(review): the parameter list, braces, declarations and many of the
   guarding conditions are not present in this excerpt. */
static void ProcessOneNuc (
788
BioSourcePtr biop = NULL;
798
SqnTagPtr stp = NULL;
803
if (bsp == NULL) return;
805
sep = GetBestTopParentForData (entityID, bsp);
806
genCode = SeqEntryToGeneticCode (sep, NULL, NULL, 0);
808
/* a generic nucleic acid molecule is recorded as DNA */
if (bsp->mol == Seq_mol_na) {
809
bsp->mol = Seq_mol_dna;
813
/* work on a private copy of the template source */
src = AsnIoMemCopy ((Pointer) src,
814
(AsnReadFunc) BioSourceAsnRead,
815
(AsnWriteFunc) BioSourceAsnWrite);
818
/* detach the title descriptor; it is re-added in trimmed form below */
vnp = ValNodeExtract (&(bsp->descr), Seq_descr_title);
820
ttl = (CharPtr) vnp->data.ptrvalue;
822
stp = SqnTagParse (ttl);
827
/* with parsed tags: source and Bioseq fields come from the title */
biop = ParseTitleIntoBioSource (stp, organism, src);
828
ParseTitleIntoBioseq (stp, bsp);
831
/* without parsed tags: source comes from organism and src alone */
biop = ParseTitleIntoBioSource (NULL, organism, src);
834
SeqDescrAddPointer (&(bsp->descr), Seq_descr_source, (Pointer) biop);
835
AddMissingSourceInfo (biop);
838
/* add a molinfo descriptor only when the Bioseq has none */
if (BioseqGetSeqDescr (bsp, Seq_descr_molinfo, NULL) == NULL) {
842
mip = ParseTitleIntoMolInfo (stp, mip);
844
/* biomol defaults to genomic when the title did not set it */
if (mip->biomol == 0) {
845
mip->biomol = MOLECULE_TYPE_GENOMIC;
847
SeqDescrAddPointer (&(bsp->descr), Seq_descr_molinfo, (Pointer) mip);
851
/* no genetic code from the record: take it from the source */
if (genCode == 0 && biop != NULL) {
856
/* genome values 4 and 5 select the mgcode field instead of gcode */
mito = (Boolean) (biop->genome == 4 || biop->genome == 5);
858
genCode = onp->mgcode;
860
genCode = onp->gcode;
867
/* keep the GenBank block only when it carries extra accessions */
gbp = ParseTitleIntoGenBank (stp, NULL);
868
if (gbp != NULL && gbp->extra_accessions != NULL) {
869
SeqDescrAddPointer (&(bsp->descr), Seq_descr_genbank, (Pointer) gbp);
871
gbp = GBBlockFree (gbp);
874
/* keep the history only when it lists replaced ids */
shp = ParseTitleIntoSeqHist (stp, NULL);
875
if (shp != NULL && shp->replace_ids != NULL) {
876
bsp->hist = SeqHistFree (bsp->hist);
879
shp = SeqHistFree (shp);
884
cds = AnnotateBestOrf (bsp, genCode, altstart, stp);
886
PromoteXrefs (cds, bsp, entityID);
894
/* re-add the title without its bracketed fields, if any text remains */
TrimBracketsFromString (ttl);
895
if (! StringHasNoText (ttl)) {
896
str = StringSave (ttl);
897
SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str);
900
/* the extracted title node is released; str above is a separate copy */
ValNodeFreeData (vnp);
903
/* ProcessOneAnnot attaches one annotation to the Bioseq it refers to,
   optionally replaces that Bioseq's ids with the given accession, and then
   promotes xrefs for every feature table on the Bioseq.

   It is called as ProcessOneAnnot (sap, entityID, accn) elsewhere in this
   file.
   NOTE(review): the parameter list, braces and declarations are not
   present in this excerpt. */
static void ProcessOneAnnot (
916
if (sap == NULL) return;
918
bsp = AttachSeqAnnotEntity (entityID, sap);
919
if (bsp == NULL) return;
921
sep = GetBestTopParentForData (entityID, bsp);
922
genCode = SeqEntryToGeneticCode (sep, NULL, NULL, 0);
924
/* if existing accession, coerce all SeqIds */
926
if (! StringHasNoText (accn)) {
927
sip = SeqIdFromAccession (accn, 0, NULL);
929
/* old ids are dropped; the assignment of the new id is not shown here */
bsp->id = SeqIdSetFree (bsp->id);
931
SeqMgrReplaceInBioseqIndex (bsp);
932
/* rewrite every feature location id to the Bioseq's id */
VisitFeaturesOnBsp (bsp, (Pointer) bsp->id, CorrectFeatureSeqIds);
936
/* for parsed in features or best ORF, promote CDS products to protein bioseq */
938
/* sap is reused here as the iterator over all annotations on bsp */
for (sap = bsp->annot; sap != NULL; sap = sap->next) {
939
if (sap->type == 1) {
940
SetEmptyGeneticCodes (sap, genCode);
941
sfp = (SeqFeatPtr) sap->data;
942
PromoteXrefs (sfp, bsp, entityID);
947
/* ReplaceOnePeptide replaces the sequence of an existing raw protein
   Bioseq with the peptide read into ssp when the two differ.  Afterwards
   it optionally sets the conflict flag on the CDS that produces the
   protein and stretches the best protein feature to the new length.

   It is called as ReplaceOnePeptide (ssp, conflict) elsewhere in this file.
   NOTE(review): the parameter list, braces, declarations, the assignment
   of bs and several NULL guards are not present in this excerpt. */
static void ReplaceOnePeptide (
964
if (ssp == NULL || ssp->numid < 1) return;
966
/* the target Bioseq is found through the first id of the peptide */
sip = MakeSeqID (ssp->id [0]);
967
bsp = BioseqFind (sip);
969
if (bsp == NULL || bsp->repr != Seq_repr_raw) return;
971
/* remove trailing X and * */
974
/* read the last byte, then keep stepping back while it is X or * */
BSSeek (bs, -1, SEEK_END);
975
aa = (Uint1) BSGetByte (bs);
976
while ((aa == 'X' || aa == '*') && ssp->seqlen > 0) {
977
BSSeek (bs, -1, SEEK_END);
979
BSSeek (bs, -1, SEEK_END);
980
aa = (Uint1) BSGetByte (bs);
982
ssp->seqlen = BSLen (bs);
984
/* compare the two sequences as strings */
str1 = BSMerge (ssp->seq, NULL);
985
str2 = BSMerge (bsp->seq_data, NULL);
987
if (StringCmp (str1, str2) != 0) {
989
/* swap sequence byte stores */
992
bsp->seq_data = ssp->seq;
994
bsp->length = BSLen (bsp->seq_data);
995
bsp->seq_data_type = Seq_code_ncbieaa;
997
cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
999
/* NOTE(review): a cds != NULL guard is presumably on a line not shown */
crp = (CdRegionPtr) cds->data.value.ptrvalue;
1001
/* conditionally set CDS conflict flag, suppress validator complaint */
1003
if (crp != NULL && conflict) {
1004
crp->conflict = TRUE;
1008
prt = SeqMgrGetBestProteinFeature (bsp, NULL);
1010
/* an interval location is extended to the last residue of the new
   sequence */
slp = prt->location;
1011
if (slp != NULL && slp->choice == SEQLOC_INT) {
1012
sintp = (SeqIntPtr) slp->data.ptrvalue;
1013
if (sintp != NULL) {
1014
sintp->to = bsp->length - 1;
1024
/* ProcessOneAsn reads one object from fp, locates its nucleotide Bioseq
   and runs ProcessOneNuc on it.

   Returns 0 when fp is NULL or nothing could be read.
   It is called as ProcessOneAsn (fp, src, organism, findorf, altstart)
   elsewhere in this file.
   NOTE(review): the parameter list, braces, the condition that frees the
   entity and the final return are not present in this excerpt. */
static Uint2 ProcessOneAsn (
1033
BioseqPtr bsp = NULL;
1035
Uint2 datatype, entityID;
1038
if (fp == NULL) return 0;
1040
/* entityID is filled in by the reader for the object it returns */
dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID, FALSE, FALSE, TRUE, FALSE);
1041
if (dataptr == NULL) return 0;
1043
sep = GetTopSeqEntryForEntityID (entityID);
1044
bsp = FindNucBioseq (sep);
1046
/* presumably the failure path when no nucleotide Bioseq was found */
ObjMgrFreeByEntityID (entityID);
1050
ProcessOneNuc (entityID, bsp, src, organism, findorf, altstart);
1055
/* ProcessAsnSet reads every object from fp into a new set of class
   BioseqseqSet_class_genbank.  Each Bioseq read is added to the set and
   processed with ProcessOneNuc; the other visible branch frees the object.

   Returns 0 when the set or its top SeqEntry cannot be allocated.
   It is called as ProcessAsnSet (fp, src, organism, findorf, altstart)
   elsewhere in this file.
   NOTE(review): the parameter list, braces, remaining declarations and the
   final return are not present in this excerpt. */
static Uint2 ProcessAsnSet (
1067
Uint2 datatype, entityID;
1068
SeqEntryPtr sep, topsep;
1070
bssp = BioseqSetNew ();
1071
if (bssp == NULL) return 0;
1072
bssp->_class = BioseqseqSet_class_genbank;
1074
/* NOTE(review): in the lines visible here bssp is not released when this
   allocation fails - confirm */
topsep = SeqEntryNew ();
1075
if (topsep == NULL) return 0;
1077
topsep->data.ptrvalue = (Pointer) bssp;
1079
/* the whole set is registered as one entity */
entityID = ObjMgrRegister (OBJ_BIOSEQSET, (Pointer) bssp);
1081
while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE)) != NULL) {
1082
if (datatype == OBJ_BIOSEQ) {
1084
/* move the Bioseq's SeqEntry under the set before processing it */
sep = SeqMgrGetSeqEntryForData (dataptr);
1085
AddSeqEntryToSeqEntry (topsep, sep, FALSE);
1087
bsp = (BioseqPtr) dataptr;
1088
ProcessOneNuc (entityID, bsp, src, organism, findorf, altstart);
1091
/* anything that is not a Bioseq is discarded */
ObjMgrFree (datatype, dataptr);
1095
SeqMgrLinkSeqEntry (topsep, 0, NULL);
1100
/* ProcessOneRecord converts one input record.  It reads the sequence file
   <base><suffix>, applies feature tables from <base>.tbl and peptide
   replacements from <base>.pep, adds template descriptors and a create
   date, writes <base>.sqn, and optionally produces <base>.val and
   <base>.gbf.  The entity is freed at the end.

   It is called as ProcessOneRecord (sbp, pdp, src, directory, results,
   base, suffix, fastaset, accn, organism, sdphead, findorf, altstart,
   conflict, validate, flatfile) elsewhere in this file.
   NOTE(review): most of the parameter list, the braces, declarations, file
   closes and several guarding conditions are not present in this excerpt. */
static void ProcessOneRecord (
1111
SeqDescrPtr sdphead,
1123
Uint2 datatype, entityID;
1131
fp = OpenOneFile (directory, base, suffix);
1132
if (fp == NULL) return;
1134
/* read one or more ASN.1 or FASTA sequence files */
1137
/* set reader and single-record reader; the selecting test (presumably
   fastaset) is not shown here */
entityID = ProcessAsnSet (fp, src, organism, findorf, altstart);
1139
entityID = ProcessOneAsn (fp, src, organism, findorf, altstart);
1143
/* both readers return 0 on failure */
if (entityID == 0) return;
1145
/* read one or more feature tables from .tbl file */
1147
fp = OpenOneFile (directory, base, ".tbl");
1150
while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE)) != NULL) {
1151
if (datatype == OBJ_SEQANNOT) {
1153
sap = (SeqAnnotPtr) dataptr;
1154
ProcessOneAnnot (sap, entityID, accn);
1157
/* objects that are not annotations are discarded */
ObjMgrFree (datatype, dataptr);
1163
/* read one or more replacement protein sequences (FASTA) from .pep file */
1165
fp = OpenOneFile (directory, base, ".pep");
1168
/* indexing needed to find CDS from protein product to set conflict flag */
1170
SeqMgrIndexFeatures (entityID, NULL);
1172
/* the last reader argument is TRUE only here; the results are handled as
   SimpleSeq objects below */
while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, TRUE)) != NULL) {
1173
if (datatype == OBJ_FASTA) {
1175
ssp = (SimpleSeqPtr) dataptr;
1176
ReplaceOnePeptide (ssp, conflict);
1177
SimpleSeqFree (ssp);
1180
ObjMgrFree (datatype, dataptr);
1186
sep = GetTopSeqEntryForEntityID (entityID);
1188
/* append a copy of the template descriptors to the top-level Bioseq or
   Bioseq-set; sdphead itself is left untouched for the next record */
if (sdphead != NULL) {
1189
if (IS_Bioseq (sep)) {
1190
bsp = (BioseqPtr) sep->data.ptrvalue;
1191
ValNodeLink (&(bsp->descr),
1192
AsnIoMemCopy ((Pointer) sdphead,
1193
(AsnReadFunc) SeqDescrAsnRead,
1194
(AsnWriteFunc) SeqDescrAsnWrite));
1195
} else if (IS_Bioseq_set (sep)) {
1196
bssp = (BioseqSetPtr) sep->data.ptrvalue;
1197
ValNodeLink (&(bssp->descr),
1198
AsnIoMemCopy ((Pointer) sdphead,
1199
(AsnReadFunc) SeqDescrAsnRead,
1200
(AsnWriteFunc) SeqDescrAsnWrite));
1205
/* stamp the record with a create-date descriptor (dp is assigned on a
   line not shown here) */
sdp = CreateNewDescriptor (sep, Seq_descr_create_date);
1207
sdp->data.ptrvalue = (Pointer) dp;
1210
SeriousSeqEntryCleanup (sep, NULL, NULL);
1211
WriteOneFile (results, base, ".sqn", sep, sbp);
1213
if (validate || flatfile) {
1216
/* copy in citsub as publication for validator and flatfile */
1218
sdp = CreateNewDescriptor (sep, Seq_descr_pub);
1220
sdp->data.ptrvalue = AsnIoMemCopy ((Pointer) pdp,
1221
(AsnReadFunc) PubdescAsnRead,
1222
(AsnWriteFunc) PubdescAsnWrite);
1225
/* re-index after the edits above, before validating or formatting */
SeqMgrIndexFeatures (entityID, 0);
1228
Message (MSG_POST, "Validating %s\n", base);
1229
ValidateOneFile (results, base, ".val", sep);
1232
Message (MSG_POST, "Flatfile %s\n", base);
1233
/* the flatfile is generated from the nucleotide entry */
sep = FindNucSeqEntry (sep);
1234
FlatfileOneFile (results, base, ".gbf", sep);
1238
ObjMgrFreeByEntityID (entityID);
1241
/* GetFirstBiop is the callback passed to VisitBioSourcesInSep by the entry
   point.  userdata is a pointer to a BioSourcePtr slot; once that slot is
   non-NULL later calls return without doing anything.
   NOTE(review): the parameter list, braces and the statement that stores
   biop into the slot are not present in this excerpt. */
static void GetFirstBiop (
1247
BioSourcePtr PNTR biopp;
1249
biopp = (BioSourcePtr PNTR) userdata;
1250
if (biop == NULL || biopp == NULL) return;
1251
/* a source has already been captured */
if (*biopp != NULL) return;
1255
/* Yes/no prompt shown by the entry point when TemplateOverwriteRisk
   reports that the template file could be overwritten by the output. */
static CharPtr overwriteMsg = "Your template with a .sqn suffix will be overwritten. Do you wish to continue?";
1257
/* TemplateOverwriteRisk reports whether writing results could overwrite
   the template file.  Only templates whose name contains ".sqn" are
   considered.  In single-file mode the template name is compared with the
   single file name; in directory mode the function tests whether a
   matching input file exists in the directory.

   It is called as TemplateOverwriteRisk (tmplate, base, directory, suffix)
   elsewhere in this file; the second argument appears here as single.
   NOTE(review): the parameter list, braces, the truncation of file at ptr
   and the final return are not present in this excerpt. */
static Boolean TemplateOverwriteRisk (
1265
Char file [FILENAME_MAX], path [PATH_MAX];
1269
if (StringStr (tmplate, ".sqn") == NULL) return FALSE;
1270
if (! StringHasNoText (single)) {
1271
StringNCpy_0 (file, tmplate, sizeof (file));
1272
/* locate the "." in the template name and in the single file name */
ptr = StringStr (file, ".");
1276
ptr = StringStr (single, ".");
1278
/* NOTE(review): StringCat is unbounded - confirm the result fits in file */
StringCat (file, ptr);
1280
if (StringCmp (file, single) == 0) return TRUE;
1281
} else if (! StringHasNoText (directory)) {
1282
StringNCpy_0 (path, directory, sizeof (path));
1283
StringNCpy_0 (file, tmplate, sizeof (file));
1284
ptr = StringStr (file, ".");
1288
StringCat (file, suffix);
1289
FileBuildPath (path, NULL, file);
1290
/* a non-empty file at that path counts as a risk */
if (FileLength (path) > 0) return TRUE;
1295
/* Args structure contains command-line arguments */
1297
/* Indexes into the myargs array.  Each name begins with the option letter
   used by the matching myargs entry, and the values follow the order of
   those entries. */
#define p_argInputPath 0
1298
#define r_argOutputPath 1
1299
#define f_argSingleFile 2
1300
#define x_argSuffix 3
1301
#define t_argTemplate 4
1302
#define s_argFastaSet 5
1303
#define a_argAccession 6
1304
#define n_argOrgName 7
1305
#define c_argFindOrf 8
1306
#define m_argAltStart 9
1307
#define k_argConflict 10
1308
#define v_argValidate 11
1309
#define b_argGenBank 12
1312
/* Entries of the myargs table, one per command-line option, in the same
   order as the *_arg* index macros.  Each entry starts with a description;
   the second field is presumably the default value ("F" for the Boolean
   options, ".fsa" for the suffix) and the quoted letter the option flag -
   confirm against the Args definition.
   NOTE(review): the array declaration line and closing brace are not
   present in this excerpt. */
{"Path to files", NULL, NULL, NULL,
1313
TRUE, 'p', ARG_STRING, 0.0, 0, NULL},
1314
{"Path for results", NULL, NULL, NULL,
1315
TRUE, 'r', ARG_STRING, 0.0, 0, NULL},
1316
{"Only this file", NULL, NULL, NULL,
1317
TRUE, 'f', ARG_FILE_IN, 0.0, 0, NULL},
1318
{"Suffix", ".fsa", NULL, NULL,
1319
TRUE, 'x', ARG_STRING, 0.0, 0, NULL},
1320
{"Template file", NULL, NULL, NULL,
1321
TRUE, 't', ARG_FILE_IN, 0.0, 0, NULL},
1322
{"Read Set of FASTAs", "F", NULL, NULL,
1323
TRUE, 's', ARG_BOOLEAN, 0.0, 0, NULL},
1324
{"Accession", NULL, NULL, NULL,
1325
TRUE, 'a', ARG_STRING, 0.0, 0, NULL},
1326
{"Organism name", NULL, NULL, NULL,
1327
TRUE, 'n', ARG_STRING, 0.0, 0, NULL},
1328
{"Annotate longest ORF", "F", NULL, NULL,
1329
TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL},
1330
{"Allow alternative starts", "F", NULL, NULL,
1331
TRUE, 'm', ARG_BOOLEAN, 0.0, 0, NULL},
1332
{"Set conflict on mismatch", "F", NULL, NULL,
1333
TRUE, 'k', ARG_BOOLEAN, 0.0, 0, NULL},
1334
{"Validate", "F", NULL, NULL,
1335
TRUE, 'v', ARG_BOOLEAN, 0.0, 0, NULL},
1336
{"Generate GenBank file", "F", NULL, NULL,
1337
TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL},
1343
/* Body of the program entry point.  It loads the ASN.1 parse tables, reads
   the command-line arguments, optionally reads a template file (Seq-submit,
   Submit-block and/or descriptors), then processes either the single named
   file or every file in the input directory whose name contains the
   suffix, and finally frees the shared objects.
   NOTE(review): the function header, braces, remaining declarations and
   return statements are not present in this excerpt. */
Boolean altstart, conflict, fastaset, findorf, flatfile, validate;
1344
CharPtr base, directory, results, suffix, accn, organism, ptr, tmplate;
1349
ValNodePtr head, vnp;
1351
PubdescPtr pdp = NULL;
1353
SubmitBlockPtr sbp = NULL;
1354
SeqDescrPtr sdphead = NULL;
1357
BioSourcePtr src = NULL;
1358
SeqSubmitPtr ssp = NULL;
1360
/* standard setup */
1362
ErrSetFatalLevel (SEV_MAX);
1363
ErrClearOptFlags (EO_SHOW_USERSTR);
1364
UseLocalAsnloadDataAndErrMsg ();
1367
/* finish resolving internal connections in ASN.1 parse tables */
1369
/* each loader failure is reported with MSG_FATAL */
if (! AllObjLoad ()) {
1370
Message (MSG_FATAL, "AllObjLoad failed");
1373
if (! SubmitAsnLoad ()) {
1374
Message (MSG_FATAL, "SubmitAsnLoad failed");
1377
if (! FeatDefSetLoad ()) {
1378
Message (MSG_FATAL, "FeatDefSetLoad failed");
1381
if (! SeqCodeSetLoad ()) {
1382
Message (MSG_FATAL, "SeqCodeSetLoad failed");
1385
if (! GeneticCodeTableLoad ()) {
1386
Message (MSG_FATAL, "GeneticCodeTableLoad failed");
1390
/* process command line arguments */
1392
if (! GetArgs ("tbl2asn", sizeof (myargs) / sizeof (Args), myargs)) {
1396
directory = (CharPtr) myargs [p_argInputPath].strvalue;
1397
results = (CharPtr) myargs [r_argOutputPath].strvalue;
1398
/* results default to the input directory */
if (StringHasNoText (results)) {
1399
results = directory;
1401
suffix = (CharPtr) myargs [x_argSuffix].strvalue;
1402
base = (CharPtr) myargs [f_argSingleFile].strvalue;
1403
tmplate = (CharPtr) myargs [t_argTemplate].strvalue;
1404
fastaset = (Boolean) myargs [s_argFastaSet].intvalue;
1405
accn = (CharPtr) myargs [a_argAccession].strvalue;
1406
organism = (CharPtr) myargs [n_argOrgName].strvalue;
1407
findorf = (Boolean) myargs [c_argFindOrf].intvalue;
1408
altstart = (Boolean) myargs [m_argAltStart].intvalue;
1409
conflict = (Boolean) myargs [k_argConflict].intvalue;
1410
validate = (Boolean) myargs [v_argValidate].intvalue;
1411
flatfile = (Boolean) myargs [b_argGenBank].intvalue;
1413
/* an accession makes sense only together with a single input file */
if (StringHasNoText (base) && (! StringHasNoText (accn))) {
1414
Message (MSG_FATAL, "Accession can be entered only for a single record");
1418
/* Seq-submit or Submit-block template is optional */
1420
if (! StringHasNoText (tmplate)) {
1421
/* ask before continuing when the output could overwrite the template */
if (TemplateOverwriteRisk (tmplate, base, directory, suffix)) {
1422
if (Message (MSG_YN, overwriteMsg) == ANS_NO) return 0;
1424
fp = FileOpen (tmplate, "r");
1426
/* keep the Seq-submit, the Submit-block and any descriptors; free
   everything else the template contains */
while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE)) != NULL) {
1427
if (datatype == OBJ_SEQSUB) {
1428
ssp = (SeqSubmitPtr) dataptr;
1429
} else if (datatype == OBJ_SUBMIT_BLOCK) {
1430
sbp = (SubmitBlockPtr) dataptr;
1431
} else if (datatype == OBJ_SEQDESC) {
1432
ValNodeLink (&sdphead, (SeqDescrPtr) dataptr);
1434
ObjMgrFree (datatype, dataptr);
1440
/* a Seq-submit was read but no separate Submit-block (body of this test
   is not shown here) */
if (ssp != NULL && sbp == NULL) {
1444
Message (MSG_FATAL, "Unable to read required template file");
1451
/* copy submit block, will free SeqSubmit before processing */
1453
sbp = AsnIoMemCopy ((Pointer) sbp,
1454
(AsnReadFunc) SubmitBlockAsnRead,
1455
(AsnWriteFunc) SubmitBlockAsnWrite);
1457
/* the tool name in the copy is replaced with this program's name */
sbp->tool = MemFree (sbp->tool);
1458
sbp->tool = StringSave ("tbl2asn");
1460
sbp->reldate = DateFree (sbp->reldate);
1463
/* the cit-sub date becomes the current date, and the cit-sub is wrapped
   as a PUB_Sub publication (pb) for later use */
csp->date = DateFree (csp->date);
1464
csp->date = DateCurr ();
1465
MemSet ((Pointer) &pd, 0, sizeof (Pubdesc));
1466
MemSet ((Pointer) &pb, 0, sizeof (ValNode));
1467
pb.choice = PUB_Sub;
1468
pb.data.ptrvalue = (Pointer) csp;
1473
/* datatype 1: the Seq-submit data is read as a SeqEntry */
if (ssp != NULL && ssp->datatype == 1) {
1474
sep = (SeqEntryPtr) ssp->data;
1476
VisitBioSourcesInSep (sep, (Pointer) &src, GetFirstBiop);
1479
/* copy top biosource */
1481
src = AsnIoMemCopy ((Pointer) src,
1482
(AsnReadFunc) BioSourceAsnRead,
1483
(AsnWriteFunc) BioSourceAsnWrite);
1487
/* in case template has colliding ID, free it now */
1489
SeqSubmitFree (ssp);
1493
/* process one or more records */
1495
if (! StringHasNoText (base)) {
1496
/* single-file mode: the suffix is copied from the "." found in base by
   StringStr (the truncation of base is not shown here) */
ptr = StringStr (base, ".");
1498
StringNCpy_0 (sfx, ptr, sizeof (sfx));
1501
ProcessOneRecord (sbp, pdp, src, directory, results, base, sfx, fastaset, accn,
1502
organism, sdphead, findorf, altstart, conflict, validate, flatfile);
1505
/* get list of all files in source directory */
1507
head = DirCatalog (directory);
1509
for (vnp = head; vnp != NULL; vnp = vnp->next) {
1510
/* only entries with choice 0 are considered */
if (vnp->choice == 0) {
1511
base = (CharPtr) vnp->data.ptrvalue;
1512
if (! StringHasNoText (base)) {
1513
ptr = StringStr (base, suffix);
1516
Message (MSG_POST, "Processing %s\n", base);
1517
/* no accession is passed in directory mode */
ProcessOneRecord (sbp, pdp, src, directory, results, base, suffix, fastaset, NULL,
1518
organism, sdphead, findorf, altstart, conflict, validate, flatfile);
1524
/* clean up file list */
1526
ValNodeFreeData (head);
1530
/* release the objects shared across all records */
SubmitBlockFree (sbp);
1533
BioSourceFree (src);
1536
SeqDescrFree (sdphead);
1538
/* frees the persistent translation tables used by AnnotateBestOrf */
TransTableFreeAll ();