37
37
* Date Name Description of modification
39
39
* $Log: lsqfetch.c,v $
40
* Revision 6.11 2001/09/21 20:02:04 kans
41
* allow U and u to be DNA in CheckDnaResidue for the BLAST guys, even though it is amino acid Selenocysteine in most places
43
* Revision 6.10 2001/03/13 16:48:58 kans
44
* fixes to saving path, binary search results
46
* Revision 6.9 2001/03/12 23:19:33 kans
47
* added IndexedFastaLib functions - currently uses genome contig naming conventions
49
* Revision 6.8 2001/01/09 00:12:39 kans
50
* now handles SEQID_GI
52
* Revision 6.7 1999/10/07 16:21:13 kans
53
* removed static AddSeqId and SeqIdDupList which were identical to public sequtil functions
55
* Revision 6.6 1999/07/20 21:18:39 sicotte
56
* add static AddSeqId for linker conflicts
58
* Revision 6.5 1999/07/20 21:16:49 sicotte
59
* add static SeqIdDupList for linker conflicts
61
* Revision 6.4 1999/04/01 22:26:13 sicotte
62
* Make lsqfetch Attempt to Parse the fasta defline, otherwise use the supplied SeqId
64
* Revision 6.3 1999/03/11 23:39:33 kans
65
* sprintf and sscanf casts
67
* Revision 6.2 1998/02/06 17:41:33 zjing
68
* make the function CheckDnaResidue external
70
* Revision 6.0 1997/08/25 18:06:28 madden
71
* Revision changed to 6.0
73
* $Log: lsqfetch.c,v $
40
* Revision 6.28 2004/10/05 19:11:23 kans
41
* separate internal CreateBinaryAsnIndex and CreateTextAsnIndex functions
43
* Revision 6.27 2004/10/05 18:56:48 kans
44
* AsnIndexedLibFetchEnable only works in text mode, backed out SeqEntryAsnRead change - will later implement use of catenated Seq-entry instead of se2bss processed input if text
46
* Revision 6.26 2004/10/05 17:34:16 kans
47
* protect all binary search functions against R out of range
49
* Revision 6.25 2004/10/05 17:25:04 kans
50
* AsnIndexedLibBioseqFetchFunc handles all Seq-id types, passes atp_se to SeqEntryAsnRead
52
* Revision 6.24 2004/10/05 16:21:37 kans
53
* LocalSeqFetchInit checks for INDEXED_TEXT_ASN and INDEXED_BIN_ASN, calls AsnIndexedLibFetchEnable
55
* Revision 6.23 2004/08/04 20:21:04 kans
56
* record alfp->binary at correct place in asn indexed fetch enable function
58
* Revision 6.22 2004/08/03 17:51:49 kans
59
* added AsnIndexedLibFetch enable and disable functions
61
* Revision 6.21 2004/08/02 19:10:14 kans
62
* added CreateAsnIndex for indexing Bioseq-set ftp release files
64
* Revision 6.20 2004/04/13 16:58:32 kans
65
* allow alt index to have gi numbers, also test .fsa if .fa fails
67
* Revision 6.19 2003/11/13 17:18:02 kans
68
* added SearchAltIndex, finished Alt fetch for chimp revision
70
* Revision 6.18 2003/11/12 23:49:11 kans
71
* SortIfpByID needed LIBCALLBACK for PC
73
* Revision 6.17 2003/11/12 23:38:48 kans
74
* changing AltIndexedFastaLibFetchEnable prototype, implementation not yet finished
76
* Revision 6.16 2003/08/27 21:24:05 kans
77
* enable alt indexed fasta looks up previously registered function, changes settings for new path
79
* Revision 6.15 2003/08/27 19:27:43 kans
80
* added AltIndexedFastaLibFetch functions for chimpanzee genome project
82
* Revision 6.14 2002/11/13 23:07:37 johnson
83
* Changed make_lib such that it looks to see if it matches the *whole* seq-id
84
* (defined by the next character being non-alphanumeric).
86
* Revision 6.13 2002/07/19 20:16:33 johnson
87
* bug fix in make_lib -- wasn't properly handling sequences >=1000 residues
89
* Revision 6.12 2002/01/22 19:50:32 kans
90
* IndexedFastaLibBioseqFetchFunc looks for prefix with upper case followed by lower case
74
92
* Revision 6.11 2001/09/21 20:02:04 kans
75
93
* allow U and u to be DNA in CheckDnaResidue for the BLAST guys, even though it is amino acid Selenocysteine in most places
1330
1370
flfp = (FastaLibFetchPtr) ompp->procdata;
1331
1371
if (flfp == NULL) return;
1332
1372
MemFree (flfp->path);
1373
/* MemFree (flfp->fastaname); */
1333
1374
FreeFastaIndex (flfp->currentfip);
1334
1375
MemFree (flfp);
1378
/* chimpanzee genome object manager registerable fetch function */
1380
static CharPtr altfastalibfetchproc = "AltIndexedFastaLibBioseqFetch";
1382
typedef struct idfip {
1385
} IdFip, PNTR IdFipPtr;
1387
typedef struct alibftch {
1392
} AltLibFetchData, PNTR AltLibFetchPtr;
1394
/*
 * Bioseq visitor callback (handed to VisitBioseqsInSep by the alt indexed
 * fetch function).  Locates the first SEQID_LOCAL id on the Bioseq, prints
 * it to text, prefixes it with "gb|", reparses that string with SeqIdParse,
 * releases the old id chain, and asks the sequence manager to re-index the
 * Bioseq.  Bioseqs without a local id are left untouched.
 *
 * userdata is not referenced in the lines visible here.
 * NOTE(review): the statement that stores the reparsed id back into bsp->id
 * is not visible in this excerpt - presumably it sits on an elided line
 * between the SeqIdSetFree and SeqMgrReplaceInBioseqIndex calls; confirm.
 */
static void ChangeLocalToGenbank (BioseqPtr bsp, Pointer userdata)
1397
Char id [41], tmp [41];
1400
/* walk the id chain until a local id is found; the loop body is intentionally empty */
for (sip = bsp->id; sip != NULL && sip->choice != SEQID_LOCAL; sip = sip->next) continue;
1401
if (sip == NULL) return;
1402
SeqIdWrite (sip, id, PRINTID_REPORT, sizeof (id));
1403
/* NOTE(review): id may hold up to 40 characters, so "gb|" + id can need
   44 bytes including the terminator, but tmp is only 41 bytes - possible
   overflow for long local ids; verify and consider a larger tmp or a
   bounded formatter */
sprintf (tmp, "gb|%s", id);
1404
sip = SeqIdParse (tmp);
1405
/* frees the whole original id chain and clears bsp->id */
bsp->id = SeqIdSetFree (bsp->id);
1407
SeqMgrReplaceInBioseqIndex (bsp);
1410
/*
 * Looks up seqid in the sorted id table held in alfp->index and yields the
 * FastaIndexPtr associated with the matching entry.  Returns NULL when alfp
 * or its index is NULL, or when seqid has no text.  Comparisons are
 * case-insensitive (StringICmp), matching the ordering used by SortIfpByID.
 *
 * Only part of the search loop is visible in this excerpt; the revision log
 * describes it as a binary search with R protected against going out of
 * range, which is what the final bounds check implements.
 */
static FastaIndexPtr SearchAltIndex (
1411
AltLibFetchPtr alfp,
1420
if (alfp == NULL || alfp->index == NULL) return NULL;
1422
if (StringHasNoText (seqid)) return NULL;
1425
/* upper bound of the search window starts at the last table entry */
R = alfp->numids - 1;
1428
compare = StringICmp (ifp [mid].seqid, seqid);
1436
/* guard the final probe so ifp [R] is never read outside the table */
if (R >= 0 && R < alfp->numids) {
1437
if (StringICmp (ifp [R].seqid, seqid) == 0) {
1445
/*
 * Object manager fetch callback for the alt indexed FASTA library.
 *
 * data is an OMProcControlPtr whose input_data is the requested SeqIdPtr and
 * whose proc data (ompp->procdata) is the AltLibFetchPtr set up by
 * AltIndexedFastaLibFetchEnable.  Visible flow:
 *   1. reject NULL control block, proc, proc data, or Seq-id;
 *   2. for GenBank or gi ids, print the id and find its FastaIndex with
 *      SearchAltIndex, then its file offset with SearchFastaIndex;
 *   3. derive the data file name from the index file name (".idx" is
 *      located, then ".fa" is appended) and open it; a second attempt swaps
 *      ".fa" for ".fsa";
 *   4. seek to the offset, read one record with ReadAsnFastaOrFlatFile,
 *      convert local ids to GenBank ids on every Bioseq, and return the
 *      Bioseq matching the requested id plus its entity ID.
 * Returns OM_MSG_RET_DONE on success and OM_MSG_RET_ERROR on each visible
 * failure check.
 *
 * NOTE(review): several statements are elided in this excerpt (assignment of
 * ompp, the truncation at tmp, the conditions around the ".fsa" retry, and
 * whatever closes fp) - confirm against the full file.
 */
static Int2 LIBCALLBACK AltIndexedFastaLibBioseqFetchFunc (Pointer data)
1448
AltLibFetchPtr alfp;
1450
Pointer dataptr = NULL;
1451
Uint2 datatype, entityID = 0;
1452
Char file [FILENAME_MAX], path [PATH_MAX], id [41];
1456
OMProcControlPtr ompcp;
1458
SeqEntryPtr sep = NULL;
1462
ompcp = (OMProcControlPtr) data;
1463
if (ompcp == NULL) return OM_MSG_RET_ERROR;
1465
if (ompp == NULL) return OM_MSG_RET_ERROR;
1466
alfp = (AltLibFetchPtr) ompp->procdata;
1467
if (alfp == NULL) return OM_MSG_RET_ERROR;
1468
sip = (SeqIdPtr) ompcp->input_data;
1469
if (sip == NULL) return OM_MSG_RET_ERROR;
1471
/* only GenBank accessions and gi numbers are looked up in the alt index */
if (sip->choice == SEQID_GENBANK || sip->choice == SEQID_GI) {
1473
SeqIdWrite (sip, id, PRINTID_REPORT, sizeof (id));
1474
fip = SearchAltIndex (alfp, id);
1476
offset = SearchFastaIndex (fip, id);
1477
if (offset < 0) return OM_MSG_RET_ERROR;
1478
/* NOTE(review): StringCpy and the StringCat calls below are unbounded
   writes into file [FILENAME_MAX] - verify fip->file cannot exceed it */
StringCpy (file, fip->file);
1479
tmp = StringStr (file, ".idx");
1483
StringCat (file, ".fa");
1484
StringNCpy_0 (path, fip->path, sizeof (path));
1485
FileBuildPath (path, NULL, file);
1486
fp = FileOpen (path, "r");
1488
/* second attempt: same base name with the .fsa extension */
tmp = StringStr (file, ".fa");
1491
StringCat (file, ".fsa");
1492
StringNCpy_0 (path, fip->path, sizeof (path));
1493
FileBuildPath (path, NULL, file);
1494
fp = FileOpen (path, "r");
1497
if (fp == NULL) return OM_MSG_RET_ERROR;
1498
/* position at the record start recorded in the index */
fseek (fp, offset, SEEK_SET);
1499
dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID,
1500
FALSE, FALSE, TRUE, FALSE);
1501
if (dataptr != NULL) {
1502
sep = GetTopSeqEntryForEntityID (entityID);
1508
if (sep == NULL) return OM_MSG_RET_ERROR;
1509
/* rewrite local ids as GenBank ids so the requested id can be matched below */
VisitBioseqsInSep (sep, NULL, ChangeLocalToGenbank);
1510
bsp = BioseqFindInSeqEntry (sip, sep);
1511
ompcp->output_data = (Pointer) bsp;
1512
ompcp->output_entityID = ObjMgrGetEntityIDForChoice (sep);
1513
return OM_MSG_RET_DONE;
1516
/*
 * HeapSort comparison callback for arrays of IdFip records.  Orders two
 * entries by case-insensitive comparison of their seqid strings and returns
 * the StringICmp result; returns 0 (treated as equal) if either element
 * pointer is NULL.
 */
static int LIBCALLBACK SortIfpByID (VoidPtr vp1, VoidPtr vp2)
1519
IdFipPtr ifp1, ifp2;
1521
if (vp1 == NULL || vp2 == NULL) return 0;
1522
ifp1 = (IdFipPtr) vp1;
1523
ifp2 = (IdFipPtr) vp2;
1524
/* redundant with the check above (the casts cannot produce NULL from non-NULL); kept as a defensive guard */
if (ifp1 == NULL || ifp2 == NULL) return 0;
1525
return StringICmp (ifp1->seqid, ifp2->seqid);
1528
/*
 * Enables (or re-targets) the alt indexed FASTA fetch procedure for the
 * directory given by path.
 *
 * Visible flow:
 *   - copy path into a local buffer and trim surrounding spaces;
 *   - look for an already registered proc named altfastalibfetchproc; when
 *     its proc data is reused, the previous path, every FastaIndex in
 *     fiplist, the list itself, and the id table are released first;
 *     otherwise a fresh AltLibFetchData is allocated;
 *   - catalog the directory and, for each entry with choice 0 whose name
 *     contains ".idx", read that index file, append it to fiplist, and add
 *     its line count to numids;
 *   - build one flat IdFip table over all ids, sort it with SortIfpByID for
 *     later lookup by SearchAltIndex, and register the fetch callback with
 *     ObjMgrProcLoad.
 *
 * NOTE(review): the conditions selecting the reuse vs. allocate branches,
 * the NULL check on the result of ReadFastaIndex, the use of is_new, and the
 * return statements are on lines elided from this excerpt - confirm there.
 */
NLM_EXTERN Boolean AltIndexedFastaLibFetchEnable (CharPtr path)
1531
AltLibFetchPtr alfp = NULL;
1532
Char file [FILENAME_MAX];
1537
Boolean is_new = FALSE;
1542
Char str [PATH_MAX];
1546
StringNCpy_0 (str, path, sizeof (str));
1547
TrimSpacesAroundString (str);
1549
ompp = ObjMgrProcFind (omp, 0, altfastalibfetchproc, OMPROC_FETCH);
1551
alfp = (AltLibFetchPtr) ompp->procdata;
1553
/* discard state left over from a previous enable before loading the new path */
alfp->path = MemFree (alfp->path);
1554
for (vnp = alfp->fiplist; vnp != NULL; vnp = vnp->next) {
1555
fip = (FastaIndexPtr) vnp->data.ptrvalue;
1556
FreeFastaIndex (fip);
1558
/* ValNodeFree releases only the nodes; the indexes were freed in the loop above */
alfp->fiplist = ValNodeFree (alfp->fiplist);
1559
alfp->index = MemFree (alfp->index);
1562
alfp = (AltLibFetchPtr) MemNew (sizeof (AltLibFetchData));
1566
alfp->path = StringSave (str);
1567
head = DirCatalog (str);
1568
for (vnp = head; vnp != NULL; vnp = vnp->next) {
1569
if (vnp->choice == 0) {
1570
tmp = (CharPtr) vnp->data.ptrvalue;
1571
/* NOTE(review): substring test - matches ".idx" anywhere in the name, not only as the extension */
if (StringStr (tmp, ".idx") != NULL) {
1572
/* str is reused as scratch space for the full index file path */
/* NOTE(review): unbounded copy into str [PATH_MAX]; safe only because alfp->path was saved from str itself */
StringCpy (str, alfp->path);
1573
sprintf (file, "%s", tmp);
1574
FileBuildPath (str, NULL, file);
1575
fip = ReadFastaIndex (str);
1577
ValNodeAddPointer (&(alfp->fiplist), 0, (Pointer) fip);
1578
numids += fip->numlines;
1583
ValNodeFreeData (head);
1584
/* table is allocated with two entries of slack beyond numids */
ifp = (IdFipPtr) MemNew (sizeof (IdFip) * (numids + 2));
1586
alfp->numids = numids;
1589
for (vnp = alfp->fiplist; vnp != NULL; vnp = vnp->next) {
1590
fip = (FastaIndexPtr) vnp->data.ptrvalue;
1592
/* i runs across the whole table while j restarts for each index file */
for (j = 0; j < fip->numlines; j++, i++) {
1593
/* the table borrows the id strings owned by each FastaIndex; it does not copy them */
ifp [i].seqid = fip->seqids [j];
1598
HeapSort (ifp, (size_t) numids, sizeof (IdFip), SortIfpByID);
1602
ObjMgrProcLoad (OMPROC_FETCH, altfastalibfetchproc, altfastalibfetchproc,
1603
OBJ_SEQID, 0, OBJ_BIOSEQ, 0, (Pointer) alfp,
1604
AltIndexedFastaLibBioseqFetchFunc, PROC_PRIORITY_DEFAULT);
1609
/*
 * Tears down the state held by the alt indexed FASTA fetch procedure.
 * Finds the registered proc by name, asks the object manager to free user
 * data associated with its procid, then releases the saved path, every
 * FastaIndex in fiplist, the list nodes, and the sorted id table.  Does
 * nothing if the proc is not registered or carries no proc data.
 *
 * NOTE(review): no statement freeing the AltLibFetchData structure itself
 * or clearing ompp->procdata is visible in this excerpt - confirm against
 * the elided lines.
 */
NLM_EXTERN void AltIndexedFastaLibFetchDisable (void)
1612
AltLibFetchPtr alfp;
1619
ompp = ObjMgrProcFind (omp, 0, altfastalibfetchproc, OMPROC_FETCH);
1620
if (ompp == NULL) return;
1621
ObjMgrFreeUserData (0, ompp->procid, OMPROC_FETCH, 0);
1622
alfp = (AltLibFetchPtr) ompp->procdata;
1623
if (alfp == NULL) return;
1624
alfp->path = MemFree (alfp->path);
1625
/* free each index first; ValNodeFree below releases only the list nodes */
for (vnp = alfp->fiplist; vnp != NULL; vnp = vnp->next) {
1626
fip = (FastaIndexPtr) vnp->data.ptrvalue;
1627
FreeFastaIndex (fip);
1629
alfp->fiplist = ValNodeFree (alfp->fiplist);
1630
alfp->index = MemFree (alfp->index);
1634
/* common function for creating indexes of fasta library files */
1337
1636
NLM_EXTERN void CreateFastaIndex (
1420
1719
FileClose (ifp);
1721
ValNodeFreeData (head);
1724
/* object manager registerable fetch function for local ASN.1 indexed files */
1726
static CharPtr asnlibfetchproc = "AsnIndexedLibBioseqFetch";
1728
typedef struct asnlibftch {
1734
} AsnLibFetchData, PNTR AsnLibFetchPtr;
1736
/*
 * ASN.1 counterpart of SearchAltIndex: looks up seqid in the sorted id table
 * held in alfp->index and yields the FastaIndexPtr for the matching entry.
 * Returns NULL when alfp or its index is NULL, or when seqid has no text.
 * Comparisons are case-insensitive (StringICmp).
 *
 * Only part of the search loop is visible in this excerpt; the revision log
 * describes these lookups as binary searches protected against R going out
 * of range.
 */
static FastaIndexPtr SearchAsnIndex (
1737
AsnLibFetchPtr alfp,
1746
if (alfp == NULL || alfp->index == NULL) return NULL;
1748
if (StringHasNoText (seqid)) return NULL;
1751
/* upper bound of the search window starts at the last table entry */
R = alfp->numids - 1;
1754
compare = StringICmp (ifp [mid].seqid, seqid);
1762
/* guard the final probe so ifp [R] is never read outside the table */
if (R >= 0 && R < alfp->numids) {
1763
if (StringICmp (ifp [R].seqid, seqid) == 0) {
1771
/*
 * Object manager fetch callback for locally indexed ASN.1 files.
 *
 * data is an OMProcControlPtr whose input_data is the requested SeqIdPtr and
 * whose proc data is the AsnLibFetchPtr set up by AsnIndexedLibFetchEnable.
 * Visible flow:
 *   1. reject NULL control block, proc, proc data, or Seq-id;
 *   2. print the id (no Seq-id type filter is visible here), find its
 *      FastaIndex with SearchAsnIndex and its offset with SearchFastaIndex;
 *   3. derive the data file name from the index file name, trying ".aso"
 *      and then ".asn", opening in "rb" or "r" mode according to
 *      alfp->binary;
 *   4. seek to the offset, read one Seq-entry, and return the Bioseq that
 *      matches the requested id together with its entity ID.
 * Returns OM_MSG_RET_DONE on success and OM_MSG_RET_ERROR on each visible
 * failure check.
 *
 * NOTE(review): the assignment of ompp, the truncation at tmp, the
 * conditions around the ".asn" retry, and whatever closes aip are on lines
 * elided from this excerpt - confirm against the full file.
 */
static Int2 LIBCALLBACK AsnIndexedLibBioseqFetchFunc (Pointer data)
1775
AsnLibFetchPtr alfp;
1777
Char file [FILENAME_MAX], path [PATH_MAX], id [41];
1780
OMProcControlPtr ompcp;
1782
SeqEntryPtr sep = NULL;
1786
ompcp = (OMProcControlPtr) data;
1787
if (ompcp == NULL) return OM_MSG_RET_ERROR;
1789
if (ompp == NULL) return OM_MSG_RET_ERROR;
1790
alfp = (AsnLibFetchPtr) ompp->procdata;
1791
if (alfp == NULL) return OM_MSG_RET_ERROR;
1792
sip = (SeqIdPtr) ompcp->input_data;
1793
if (sip == NULL) return OM_MSG_RET_ERROR;
1795
SeqIdWrite (sip, id, PRINTID_REPORT, sizeof (id));
1796
fip = SearchAsnIndex (alfp, id);
1798
offset = SearchFastaIndex (fip, id);
1799
if (offset < 0) return OM_MSG_RET_ERROR;
1800
/* NOTE(review): StringCpy and the StringCat calls below are unbounded
   writes into file [FILENAME_MAX] - verify fip->file cannot exceed it */
StringCpy (file, fip->file);
1801
tmp = StringStr (file, ".idx");
1805
StringCat (file, ".aso");
1806
StringNCpy_0 (path, fip->path, sizeof (path));
1807
FileBuildPath (path, NULL, file);
1808
aip = AsnIoOpen (path, alfp->binary? "rb" : "r");
1810
/* second attempt: same base name with the .asn extension */
tmp = StringStr (file, ".aso");
1813
StringCat (file, ".asn");
1814
StringNCpy_0 (path, fip->path, sizeof (path));
1815
FileBuildPath (path, NULL, file);
1816
aip = AsnIoOpen (path, alfp->binary? "rb" : "r");
1819
if (aip == NULL) return OM_MSG_RET_ERROR;
1820
/* position at the record start recorded in the index */
AsnIoSeek (aip, offset);
1821
sep = SeqEntryAsnRead (aip, NULL);
1825
if (sep == NULL) return OM_MSG_RET_ERROR;
1826
bsp = BioseqFindInSeqEntry (sip, sep);
1827
ompcp->output_data = (Pointer) bsp;
1828
ompcp->output_entityID = ObjMgrGetEntityIDForChoice (sep);
1829
return OM_MSG_RET_DONE;
1832
/*
 * Enables (or re-targets) the indexed ASN.1 fetch procedure for the
 * directory given by path.  binary is stored in the proc data and later
 * selects the "rb" vs "r" open mode in AsnIndexedLibBioseqFetchFunc.
 *
 * Visible flow:
 *   - copy path into a local buffer and trim surrounding spaces;
 *   - look for an already registered proc named asnlibfetchproc; when its
 *     proc data is reused, the previous path, every FastaIndex in fiplist,
 *     the list itself, and the id table are released first; otherwise a
 *     fresh AsnLibFetchData is allocated;
 *   - catalog the directory and, for each entry with choice 0 whose name
 *     contains ".idx", read that index file, append it to fiplist, and add
 *     its line count to numids;
 *   - build one flat IdFip table over all ids, sort it with SortIfpByID for
 *     later lookup by SearchAsnIndex, and register the fetch callback with
 *     ObjMgrProcLoad.
 *
 * NOTE(review): the conditions selecting the reuse vs. allocate branches,
 * the NULL check on the result of ReadFastaIndex, the use of is_new, and the
 * return statements are on lines elided from this excerpt - confirm there.
 * The revision log states this function only works in text mode.
 */
NLM_EXTERN Boolean AsnIndexedLibFetchEnable (CharPtr path, Boolean binary)
1835
AsnLibFetchPtr alfp = NULL;
1836
Char file [FILENAME_MAX];
1841
Boolean is_new = FALSE;
1846
Char str [PATH_MAX];
1850
StringNCpy_0 (str, path, sizeof (str));
1851
TrimSpacesAroundString (str);
1853
ompp = ObjMgrProcFind (omp, 0, asnlibfetchproc, OMPROC_FETCH);
1855
alfp = (AsnLibFetchPtr) ompp->procdata;
1857
/* discard state left over from a previous enable before loading the new path */
alfp->path = MemFree (alfp->path);
1858
for (vnp = alfp->fiplist; vnp != NULL; vnp = vnp->next) {
1859
fip = (FastaIndexPtr) vnp->data.ptrvalue;
1860
FreeFastaIndex (fip);
1862
/* ValNodeFree releases only the nodes; the indexes were freed in the loop above */
alfp->fiplist = ValNodeFree (alfp->fiplist);
1863
alfp->index = MemFree (alfp->index);
1866
alfp = (AsnLibFetchPtr) MemNew (sizeof (AsnLibFetchData));
1869
alfp->binary = binary;
1873
alfp->path = StringSave (str);
1874
head = DirCatalog (str);
1875
for (vnp = head; vnp != NULL; vnp = vnp->next) {
1876
if (vnp->choice == 0) {
1877
tmp = (CharPtr) vnp->data.ptrvalue;
1878
/* NOTE(review): substring test - matches ".idx" anywhere in the name, not only as the extension */
if (StringStr (tmp, ".idx") != NULL) {
1879
/* str is reused as scratch space for the full index file path */
/* NOTE(review): unbounded copy into str [PATH_MAX]; safe only because alfp->path was saved from str itself */
StringCpy (str, alfp->path);
1880
sprintf (file, "%s", tmp);
1881
FileBuildPath (str, NULL, file);
1882
fip = ReadFastaIndex (str);
1884
ValNodeAddPointer (&(alfp->fiplist), 0, (Pointer) fip);
1885
numids += fip->numlines;
1890
ValNodeFreeData (head);
1891
/* table is allocated with two entries of slack beyond numids */
ifp = (IdFipPtr) MemNew (sizeof (IdFip) * (numids + 2));
1893
alfp->numids = numids;
1896
for (vnp = alfp->fiplist; vnp != NULL; vnp = vnp->next) {
1897
fip = (FastaIndexPtr) vnp->data.ptrvalue;
1899
/* i runs across the whole table while j restarts for each index file */
for (j = 0; j < fip->numlines; j++, i++) {
1900
/* the table borrows the id strings owned by each FastaIndex; it does not copy them */
ifp [i].seqid = fip->seqids [j];
1905
HeapSort (ifp, (size_t) numids, sizeof (IdFip), SortIfpByID);
1909
ObjMgrProcLoad (OMPROC_FETCH, asnlibfetchproc, asnlibfetchproc,
1910
OBJ_SEQID, 0, OBJ_BIOSEQ, 0, (Pointer) alfp,
1911
AsnIndexedLibBioseqFetchFunc, PROC_PRIORITY_DEFAULT);
1916
/*
 * Tears down the state held by the indexed ASN.1 fetch procedure.  Finds
 * the registered proc by name, asks the object manager to free user data
 * associated with its procid, then releases the saved path, every
 * FastaIndex in fiplist, the list nodes, and the sorted id table.  Does
 * nothing if the proc is not registered or carries no proc data.
 *
 * NOTE(review): no statement freeing the AsnLibFetchData structure itself
 * or clearing ompp->procdata is visible in this excerpt - confirm against
 * the elided lines.
 */
NLM_EXTERN void AsnIndexedLibFetchDisable (void)
1919
AsnLibFetchPtr alfp;
1926
ompp = ObjMgrProcFind (omp, 0, asnlibfetchproc, OMPROC_FETCH);
1927
if (ompp == NULL) return;
1928
ObjMgrFreeUserData (0, ompp->procid, OMPROC_FETCH, 0);
1929
alfp = (AsnLibFetchPtr) ompp->procdata;
1930
if (alfp == NULL) return;
1931
alfp->path = MemFree (alfp->path);
1932
/* free each index first; ValNodeFree below releases only the list nodes */
for (vnp = alfp->fiplist; vnp != NULL; vnp = vnp->next) {
1933
fip = (FastaIndexPtr) vnp->data.ptrvalue;
1934
FreeFastaIndex (fip);
1936
alfp->fiplist = ValNodeFree (alfp->fiplist);
1937
alfp->index = MemFree (alfp->index);
1941
/* common function for creating indexes of ASN.1 Bioseq-set ftp release files */
1943
typedef struct asnidxdata {
1948
} AsnIdxData, PNTR AsnIdxPtr;
1950
/*
 * Bioseq visitor callback used while building an ASN.1 index.  userdata is
 * the AsnIdxPtr accumulator; for each Bioseq it chooses an id (a gi is tried
 * first, then SeqIdFindBest with no preference), prints it, and appends the
 * string "<id>\t<offset>" to the accumulator's ValNode list, where offset is
 * the file position currently stored in aip->offset.  Bioseqs whose printed
 * id is blank are skipped.
 *
 * NOTE(review): the condition that triggers the fallback id lookup is on a
 * line elided from this excerpt - presumably "sip == NULL"; confirm.
 */
static void SaveAsnIdxOffset (
1957
Char id [41], tmp [64];
1960
aip = (AsnIdxPtr) userdata;
1961
if (bsp == NULL || aip == NULL) return;
1963
sip = SeqIdFindBest (bsp->id, SEQID_GI);
1965
sip = SeqIdFindBest (bsp->id, 0);
1968
SeqIdWrite (sip, id, PRINTID_REPORT, sizeof (id));
1969
if (! StringHasNoText (id)) {
1971
/* save ID and offset separated by tab character */
1973
sprintf (tmp, "%s\t%ld", id, (long) aip->offset);
1974
/* append after the current tail; last then points at the new node */
aip->last = ValNodeNew (aip->last);
1975
if (aip->head == NULL) {
1976
aip->head = aip->last;
1978
if (aip->last != NULL) {
1979
aip->last->data.ptrvalue = StringSave (tmp);
1984
/*
 * Builds a ".idx" index for a binary ASN.1 Bioseq-set file.
 *
 * The index path is the input file name with its extension replaced by
 * ".idx".  The input is opened in "rb" mode and walked with AsnReadId; each
 * element of type Bioseq-set.seq-set.E is read as a Seq-entry and every
 * Bioseq in it is recorded by SaveAsnIdxOffset against the file offset
 * captured just before that component was read.  The collected
 * "id<TAB>offset" lines are sorted, de-duplicated, and written one per line.
 * Returns silently if file has no text or the input cannot be opened.
 *
 * NOTE(review): the handling of a failed output open, the initial value of
 * atp, the else branch around AsnReadVal, freeing of each Seq-entry, and the
 * closing of aip and ofp are on lines elided from this excerpt - confirm
 * against the full file.
 */
static void CreateBinaryAsnIndex (
1992
AsnTypePtr atp, atp_bss, atp_se;
1995
Char path [PATH_MAX];
2000
if (StringHasNoText (file)) return;
2002
/* replace extension by .idx for index file */
2004
StringNCpy_0 (path, file, sizeof (path));
2005
ptr = StringRChr (path, '.');
2009
StringCat (path, ".idx");
2011
aip = AsnIoOpen (file, "rb");
2012
if (aip == NULL) return;
2014
ofp = FileOpen (path, "w");
2017
/* start with an empty accumulator (no list, offset 0) */
MemSet ((Pointer) &aid, 0, sizeof (AsnIdxData));
2022
amp = AsnAllModPtr ();
2024
atp_bss = AsnFind ("Bioseq-set");
2025
atp_se = AsnFind ("Bioseq-set.seq-set.E");
2029
/* get initial file offset */
2031
aid.offset = AsnIoTell (aip);
2033
/* read next ASN.1 component */
2035
while ((atp = AsnReadId (aip, amp, atp)) != NULL) {
2036
if (atp == atp_se) {
2038
sep = SeqEntryAsnRead (aip, atp);
2039
VisitBioseqsInSep (sep, (Pointer) &aid, SaveAsnIdxOffset);
2043
/* release object manager state accumulated for this entry before reading the next */
ObjMgrReapOne (omp);
2044
ObjMgrFreeCache (0);
2048
/* consume the value of a component that is not a Seq-entry */
AsnReadVal (aip, atp, NULL);
2051
/* get file offset of next ASN.1 component */
2053
aid.offset = AsnIoTell (aip);
2058
aid.head = ValNodeSort (aid.head, SortVnpByString);
2059
aid.head = UniqueValNode (aid.head);
2061
/* write ID and offset index */
2063
for (vnp = aid.head; vnp != NULL; vnp = vnp->next) {
2064
fprintf (ofp, "%s\n", (CharPtr) vnp->data.ptrvalue);
2072
ValNodeFreeData (aid.head);
2075
/*
 * Builds a ".idx" index for a text-mode input file.
 *
 * The index path is the input file name with its extension replaced by
 * ".idx".  The input is opened in "r" mode and read record by record with
 * ReadAsnFastaOrFlatFile; for each record every Bioseq is recorded by
 * SaveAsnIdxOffset against the ftell offset captured just before the record
 * was read, and the loaded entity is then freed.  The collected
 * "id<TAB>offset" lines are sorted, de-duplicated, and written one per line.
 * Returns silently if file has no text or the input cannot be opened.
 *
 * NOTE(review): the handling of a failed output open and the closing of ifp
 * and ofp are on lines elided from this excerpt - confirm against the full
 * file.
 */
static void CreateTextAsnIndex (
2081
Pointer dataptr = NULL;
2082
Uint2 datatype, entityID = 0;
2084
Char path [PATH_MAX];
2089
if (StringHasNoText (file)) return;
2091
/* replace extension by .idx for index file */
2093
StringNCpy_0 (path, file, sizeof (path));
2094
ptr = StringRChr (path, '.');
2098
StringCat (path, ".idx");
2100
ifp = FileOpen (file, "r");
2101
if (ifp == NULL) return;
2103
ofp = FileOpen (path, "w");
2106
/* start with an empty accumulator (no list, offset 0) */
MemSet ((Pointer) &aid, 0, sizeof (AsnIdxData));
2111
/* get initial file offset */
2113
aid.offset = ftell (ifp);
2115
/* read next ASN.1 component */
2117
while ((dataptr = ReadAsnFastaOrFlatFile (ifp, &datatype, &entityID,
2118
FALSE, FALSE, TRUE, FALSE)) != NULL) {
2120
sep = GetTopSeqEntryForEntityID (entityID);
2121
VisitBioseqsInSep (sep, (Pointer) &aid, SaveAsnIdxOffset);
2123
/* the record is only needed for its ids; free it before reading the next one */
ObjMgrFreeByEntityID (entityID);
2125
/* get file offset of next ASN.1 component */
2127
aid.offset = ftell (ifp);
2132
aid.head = ValNodeSort (aid.head, SortVnpByString);
2133
aid.head = UniqueValNode (aid.head);
2135
/* write ID and offset index */
2137
for (vnp = aid.head; vnp != NULL; vnp = vnp->next) {
2138
fprintf (ofp, "%s\n", (CharPtr) vnp->data.ptrvalue);
2146
ValNodeFreeData (aid.head);
2149
/*
 * Public entry point for indexing an ASN.1 release file.  Delegates to
 * CreateBinaryAsnIndex or CreateTextAsnIndex for the given file.
 *
 * NOTE(review): the parameter list and the condition choosing between the
 * two helpers are on lines elided from this excerpt - presumably a binary
 * vs. text flag; confirm against the full file.
 */
NLM_EXTERN void CreateAsnIndex (
2156
CreateBinaryAsnIndex (file);
2158
CreateTextAsnIndex (file);