30
30
* Initial Version Creation Date: 10/18/1999
34
34
* File Description: CDD utility routines
37
37
* --------------------------------------------------------------------------
38
38
* $Log: cddutil.c,v $
39
* Revision 1.92 2004/06/22 14:20:19 camacho
40
* Changed invocation of posFreqsToMatrix to conform with new signature.
42
* Revision 1.91 2003/12/09 22:21:35 bauer
43
* added CddCountResTypes
45
* Revision 1.90 2003/12/09 18:48:05 bauer
46
* commented out deprecated fields in BlastKarlinBlkCopy
48
* Revision 1.89 2003/10/07 21:19:39 bauer
49
* made CddMasterIs3D more general
51
* Revision 1.88 2003/08/25 19:09:47 bauer
52
* added SeqAlignReadFromFile
54
* Revision 1.87 2003/05/22 18:39:14 bauer
55
* list txids in Cdd XML dumper
57
* Revision 1.86 2003/05/21 17:25:21 bauer
58
* optional ObjMgr in CddReadDBGetBioseq
60
* Revision 1.85 2003/05/09 20:38:12 bauer
61
* report invalid intervals in CddRelocateSeqLoc
63
* Revision 1.84 2003/05/08 14:40:09 bauer
64
* bugfix in CddConsensus
66
* Revision 1.83 2003/04/25 17:17:32 thiessen
67
* fix return value type
69
* Revision 1.82 2003/04/25 14:36:20 bauer
70
* impalaScaling now returns boolean value
72
* Revision 1.81 2003/02/06 21:04:27 bauer
73
* fixed bug in reindexing to consensus
75
* Revision 1.80 2002/12/03 14:36:31 bauer
76
* added CddMSLMixedToMSLDenDiag
78
* Revision 1.79 2002/11/22 21:35:23 bauer
79
* added SeqAnnotReadFromFile and preservation of scores in DenseSeg to DenseDiag conversion
81
* Revision 1.78 2002/10/22 14:54:27 bauer
82
* fixed date format in XML dumper
84
* Revision 1.77 2002/10/10 20:38:19 bauer
85
* changes to accomodate new spec items
89
* Revision 1.76 2002/10/02 17:32:21 bauer
90
* avoid merging blocks when reindexing alignments
92
* Revision 1.75 2002/08/17 11:55:07 bauer
95
* Revision 1.74 2002/08/16 19:51:45 bauer
96
* added Ben's CddSrvGetStyle2
98
* Revision 1.73 2002/08/06 12:54:26 bauer
99
* fixes to accomodate COGs
101
* Revision 1.72 2002/07/31 17:15:29 kans
102
* added prototype for BlastDefLineSetFree for Mac compiler
104
* Revision 1.71 2002/07/31 14:58:58 bauer
105
* BLAST DB Sequence Retrieval
107
* Revision 1.70 2002/07/17 18:54:10 bauer
108
* CddFeaturesAreConsistent now returns explicit error messages
110
* Revision 1.69 2002/07/11 14:43:43 bauer
111
* column 21(X) is now always -1 in PSSMs
113
* Revision 1.68 2002/07/10 22:51:10 bauer
114
* changed score for 26th pssm column to match whats done by makemat
116
* Revision 1.67 2002/07/10 22:06:07 bauer
117
* replaced posScaling by impalaScaling
119
* Revision 1.66 2002/07/10 15:34:31 bauer
120
* made SipIsConsensus public
122
* Revision 1.65 2002/07/09 21:12:39 bauer
123
* added CddDenDiagCposComp2KBP to return Karlin-Altschul parameters together with PSSM
125
* Revision 1.64 2002/07/05 21:09:25 bauer
126
* added GetAlignmentSize
128
* Revision 1.63 2002/06/12 15:22:51 bauer
131
* Revision 1.62 2002/05/31 17:54:31 thiessen
132
* fix for read-only string literal (e.g. on Mac)
134
* Revision 1.61 2002/05/24 17:49:01 bauer
135
* Unlock Bioseqs again in SeqAlignInform
137
* Revision 1.60 2002/05/16 22:37:49 bauer
138
* free seqalign in CddExpAlignToSeqAlign if empty
140
* Revision 1.59 2002/05/06 16:59:50 bauer
141
* remove blanks in inferred CD short names
143
* Revision 1.58 2002/04/25 14:30:19 bauer
144
* fixed CddFindMMDBIdInBioseq
146
* Revision 1.57 2002/04/22 16:37:31 bauer
147
* added check for missing structure alignments
149
* Revision 1.56 2002/04/18 21:00:26 bauer
150
* added check CddFeaturesAreConsistent
152
* Revision 1.55 2002/04/12 14:02:43 bauer
153
* added update_date case to CddAssignDescr
155
* Revision 1.54 2002/04/11 14:34:25 bauer
156
* added CddRemoveConsensus
158
* Revision 1.53 2002/03/28 15:55:00 bauer
159
* fixed bug in CddFindBioseqInSeqEntry, thanks to Ben
161
* Revision 1.52 2002/02/20 22:27:28 bauer
162
* utility functions for the CD-Server
164
* Revision 1.51 2002/02/12 23:00:46 bauer
165
* added missing break in CddRelocateSeqLoc
167
* Revision 1.50 2002/02/06 19:35:37 bauer
168
* some more CddGet.. functionality
170
* Revision 1.49 2002/02/05 23:15:40 bauer
171
* added a few CddGet.. utility functions, changes to CddAddDescr allow to extend scrapbook line by line
173
* Revision 1.48 2002/01/28 14:11:29 bauer
174
* add score to entrez neighbor lists
176
* Revision 1.47 2002/01/05 14:49:44 bauer
177
* made SeqAlignDup a local function
179
* Revision 1.46 2002/01/04 19:46:56 bauer
180
* added functions to interconvert PSSM-Ids and accessions
39
182
* Revision 1.45 2001/11/15 15:35:13 kans
40
183
* changed strdup to StringSave for Mac
326
512
ValNodeLink(&(pcdd->description),vnp);
516
/*---------------------------------------------------------------------------*/
517
/*---------------------------------------------------------------------------*/
518
/* remove an item from the CD description. Works for a limited set of types. */
519
/* status and repeats are removed no matter what. For comments, references */
520
/* etc. the content of the description is compared to the input */
521
/*---------------------------------------------------------------------------*/
522
/*---------------------------------------------------------------------------*/
523
Boolean LIBCALL CddKillDescr(CddPtr pcdd, Pointer pThis, Int4 iWhat, Int4 iIval)
525
ValNodePtr vnp, vnpold = NULL, vnpthis = NULL, vnpbefore = NULL, pub;
527
vnp = pcdd->description;
529
if (vnp->choice == iWhat) {
531
case CddDescr_comment:
532
case CddDescr_othername:
533
case CddDescr_category:
534
case CddDescr_source:
535
if (Nlm_StrCmp((CharPtr)pThis,(CharPtr)vnp->data.ptrvalue) == 0) {
542
case CddDescr_status:
543
case CddDescr_curation_status:
544
case CddDescr_old_root:
545
case CddDescr_repeats:
551
case CddDescr_reference:
552
pub = vnp->data.ptrvalue;
553
if ((pThis && pThis == pub) ||
554
(iIval>0 && pub->choice==PUB_PMid && pub->data.intvalue==iIval)) {
569
vnpbefore->next = vnpthis->next;
571
pcdd->description = vnpthis->next;
577
/*---------------------------------------------------------------------------*/
578
/*---------------------------------------------------------------------------*/
579
/* series of functions to extract info from CDs */
580
/*---------------------------------------------------------------------------*/
581
/*---------------------------------------------------------------------------*/
582
Int4 LIBCALL CddGetVersion(CddPtr pcdd)
587
cid = pcdd->id; while (cid) {
588
if (cid->choice == CddId_gid) {
589
pGid = cid->data.ptrvalue;
590
return pGid->version;
597
OrgRefPtr LIBCALL CddGetOrgRef(CddPtr pcdd)
601
pcdsc = pcdd->description;
603
if (pcdsc->choice == CddDescr_tax_source) {
604
return (OrgRefPtr) pcdsc->data.ptrvalue;
611
Int4 LIBCALL CddGetPssmId(CddPtr pcdd)
615
cid = pcdd->id; while (cid) {
616
if (cid->choice == CddId_uid) {
617
return cid->data.intvalue;
624
Int4 LIBCALL CddGetPmIds(CddPtr pcdd, Int4Ptr iPMids)
629
Boolean bRefOpen = FALSE;
631
if (iPMids) iPMids = MemNew(250*sizeof(Int4));
632
pcdsc = pcdd->description;
634
if (pcdsc->choice == CddDescr_reference) {
635
pub = pcdsc->data.ptrvalue;
636
if (pub->choice == PUB_PMid) {
637
iPMids[count] = pub->data.intvalue;
639
if (count >= 250) return count;
648
CharPtr LIBCALL CddGetDescr(CddPtr pcdd)
652
pcdsc = pcdd->description;
654
if (pcdsc->choice == CddDescr_comment) {
655
if (Nlm_StrCmp(pcdsc->data.ptrvalue,"linked to 3D-structure") != 0) {
656
return (CharPtr) pcdsc->data.ptrvalue;
664
DatePtr LIBCALL CddGetCreateDate(CddPtr pcdd)
668
pcdsc = pcdd->description;
670
if (pcdsc->choice == CddDescr_create_date) {
671
return (DatePtr) pcdsc->data.ptrvalue;
678
DatePtr LIBCALL CddGetUpdateDate(CddPtr pcdd)
680
DatePtr pcd = NULL, pcdthis;
683
pcdsc = pcdd->description;
685
if (pcdsc->choice == CddDescr_update_date) {
686
pcdthis = (DatePtr) pcdsc->data.ptrvalue;
690
if (pcdthis->data[1] > pcd->data[1]) {
693
if (pcdthis->data[2] > pcd->data[2]) {
696
if (pcdthis->data[3] > pcd->data[3]) pcd = pcdthis;
706
CharPtr LIBCALL CddGetSource(CddPtr pcdd)
710
pcdsc = pcdd->description;
712
if (pcdsc->choice == CddDescr_source) {
713
return (CharPtr) pcdsc->data.ptrvalue;
720
CharPtr LIBCALL CddGetSourceId(CddPtr pcdd)
726
pcdsc = pcdd->description;
728
if (pcdsc->choice == CddDescr_source_id) {
729
vnp = pcdsc->data.ptrvalue;
730
if (vnp->choice == CddId_gid) {
731
pGid = vnp->data.ptrvalue;
732
return (CharPtr) pGid->accession;
740
Int4 LIBCALL CddGetAlignmentLength(CddPtr pcdd)
746
sap = pcdd->seqannot;
748
salp = (SeqAlignPtr) sap->data;
757
/*---------------------------------------------------------------------------*/
758
/* return number of rows and average number of blocks per row */
759
/*---------------------------------------------------------------------------*/
760
Int4Ptr LIBCALL GetAlignmentSize(SeqAlignPtr salp)
766
SeqAlignPtr salpThis;
771
ddp = salpThis->segs;
776
salpThis = salpThis->next;
778
pRet = MemNew(2*sizeof(Int4));
779
pRet[0] = iLength + 1;
780
pRet[1] = iBlocks / iLength;
784
ValNodePtr LIBCALL CddGetAnnotNames(CddPtr pcdd)
787
ValNodePtr vnp, vnpHead = NULL;
789
aap = pcdd->alignannot;
791
vnp = ValNodeCopyStr(&(vnpHead),0,aap->description);
329
797
/*---------------------------------------------------------------------------*/
330
798
/*---------------------------------------------------------------------------*/
331
799
/* Query the status of a CD */
414
979
/*---------------------------------------------------------------------------*/
415
980
/*---------------------------------------------------------------------------*/
981
/* repeat detection - find repeated subsequences in a CD consensus Sequence */
982
/*---------------------------------------------------------------------------*/
983
/*---------------------------------------------------------------------------*/
984
Boolean LIBCALL CddCheckForRepeats(CddPtr pcdd, Int4 width, Int4 GapI, Int4 GapE,
985
Nlm_FloatHi rthresh, Boolean bOutput)
988
BLAST_ScoreBlkPtr sbp;
990
Int4 len, winner = 1;
991
Int4 **iscore, repct = 0;
992
Int4 **fscore, bst, pnum, plen;
994
Int4 i, j, k, l, m, n, t1, t2, laststart;
997
Boolean done = FALSE;
999
Nlm_FloatHi cfac, wplaus = 0.0;
1000
SeqLocPtr slp, slpHead;
1003
typedef struct repstep {
1014
if (!CddHasConsensus) return FALSE;
1015
bsp = pcdd->trunc_master;
1017
sbp = BLAST_ScoreBlkNew(Seq_code_ncbistdaa,1);
1018
sbp->read_in_matrix = TRUE;
1019
sbp->protein_alphabet = TRUE;
1020
sbp->posMatrix = NULL;
1021
sbp->number_of_contexts = 1;
1022
iStatus = BlastScoreBlkMatFill(sbp,"BLOSUM62");
1024
spp = SeqPortNew(bsp, 0, LAST_RESIDUE, Seq_strand_unknown,Seq_code_ncbistdaa);
1025
buffer = MemNew((len)*sizeof(Uint1));
1026
for (i=0; i<len; i++) buffer[i] = SeqPortGetResidue(spp);
1027
spp = SeqPortFree(spp);
1029
iscore = (Int4 **) MemNew(len * sizeof(Int4 *));
1030
for (i=0;i<len;i++) iscore[i] = (Int4 *) MemNew(len * sizeof(Int4));
1031
fscore = (Int4 **) MemNew(len * sizeof(Int4 *));
1032
for (i=0;i<len;i++) fscore[i] = (Int4 *) MemNew(len * sizeof(Int4));
1034
for (i=0;i<len;i++) {
1036
iscore[i][i] = sbp->matrix[t1][t1];
1037
for (j=i+1;j<len;j++) {
1039
iscore[i][j] = sbp->matrix[t1][t2];
1046
for (i=0;i<len;i++) for (j=i;j<len;j++) fscore[i][j] = iscore[i][j];
1047
for (i=len-2;i>=0;i--) {
1048
for (j=len-2;j>=i;j--) {
1049
bst = fscore[i+1][j+1];
1050
for (k = i+2;k<i+2+width && k<len;k++) bst = MAX(bst,fscore[k][j+1]-GapI-GapE*(k-i-1));
1051
for (l = j+2;l<j+2+width && l<len;l++) bst = MAX(bst,fscore[i+1][l]-GapI-GapE*(l-j-1));
1053
fscore[i][j] += bst;
1058
for (j=laststart;j<len;j++) if (fscore[0][j] >= bst) {
1059
bst = fscore[0][j]; l = j;
1066
/* printf("Next best alignment scores %4d at %4d (of %4d: %5.1f%%)\n",bst,l,len, 100.0*l/len); */
1067
rep[repct].start = l;
1068
rep[repct].score = bst;
1069
rep[repct].npred = 1;
1070
rep[repct].plaus = 0.0;
1071
rep[repct].lpred = len;
1072
rep[repct].lremn = len - rep[repct].start;
1074
rep[repct].lpred = (Int4) (0.5 + ((Nlm_FloatHi) l / (Nlm_FloatHi) repct));
1075
rep[repct].npred = (Int4) (0.5 + ((Nlm_FloatHi) len / (Nlm_FloatHi) rep[repct].lpred));
1087
for (k=i+2;k<i+2+width && k<len;k++) {
1088
if (fscore[k][j+1]-GapI-GapE*(k-i-1) > bst) {
1089
bst = fscore[k][j+1]-GapI-GapE*(k-i-1);
1093
for (l=j+2;l<j+2+width && l<len;l++) {
1094
if (fscore[i+1][l]-GapI-GapE*(l-j-1) > bst) {
1095
bst = fscore[i+1][l]-GapI-GapE*(l-j-1);
1099
iscore[i][j] = -10000;
1100
if (bst < 0) tbdone = TRUE;
1102
if (i >= len-1 || j >= len-1) {
1104
iscore[i][j] = -10000;
1109
for (i=0;i<len;i++) MemFree(fscore[i]);
1111
for (i=0;i<len;i++) MemFree(iscore[i]);
1113
rep[0].plaus = (Nlm_FloatHi) (rep[0].score - rep[1].score) / (Nlm_FloatHi) rep[0].score;
1114
if (rep[0].plaus > rthresh) rep[0].plaus = 1.0;
1115
winner = 1; wplaus = rep[0].plaus;
1117
for (i=1;i<repct;i++) {
1119
plen = (Int4) (((Nlm_FloatHi)len/(Nlm_FloatHi)pnum)+.5);
1122
cfac += (Nlm_FloatHi)(abs(rep[j].start-(j*plen)))/(Nlm_FloatHi)plen;
1124
cfac /= (Nlm_FloatHi) i;
1125
rep[i].plaus = 1.0 - cfac;
1127
if (rep[i].plaus > wplaus) {
1128
wplaus = rep[i].plaus;
1132
for (i=0;i<winner;i++) {
1133
if (rep[i].lpred < 3 || (i>0 && rep[i].score < 17)) {
1139
for (i=0;i<repct;i++) {
1140
printf("%s %3d %4d %8.5f %4d %8.5f %2d %3d %8.5f\n",pcdd->name,
1141
i,rep[i].score,(Nlm_FloatHi)rep[i].score/(Nlm_FloatHi)rep[0].score,
1142
rep[i].start, (Nlm_FloatHi)rep[i].start/(Nlm_FloatHi)len,
1143
rep[i].npred,rep[i].lpred, rep[i].plaus);
1145
printf("%s: predicted %d repeats\n",pcdd->name,winner);
1148
CddKillDescr(pcdd,NULL,CddDescr_repeats,0);
1150
pcdr = CddRepeatNew(); pcdr->avglen = 0;
1151
slp = NULL; slpHead = NULL;
1152
for (i=0;i<winner;i++) {
1154
if (i < (winner-1)) l = rep[i+1].start - 1; else l = len - 1;
1155
pcdr->avglen += l-k+1;
1156
slp = (SeqLocPtr) ValNodeNew(NULL);
1157
slp->choice = SEQLOC_INT;
1158
sintp = SeqIntNew();
1161
sintp->id = SeqIdDup(pcdd->profile_range->id);
1162
slp->data.ptrvalue = sintp;
1163
if (!slpHead) slpHead = slp; else ValNodeLink(&(slpHead),slp);
1165
pcdr->avglen /= winner;
1166
pcdr->count = winner;
1167
pcdr->location = (SeqLocPtr) ValNodeNew(NULL);
1168
pcdr->location->choice = SEQLOC_PACKED_INT;
1169
pcdr->location->data.ptrvalue = slpHead;
1170
CddAssignDescr(pcdd,pcdr,CddDescr_repeats,0);
1173
sbp = BLAST_ScoreBlkDestruct(sbp);
1175
if (winner > 1) return TRUE;
1180
/*---------------------------------------------------------------------------*/
1181
/*---------------------------------------------------------------------------*/
416
1182
/* retrieve the accession of a CD as a character string */
417
1183
/*---------------------------------------------------------------------------*/
418
1184
/*---------------------------------------------------------------------------*/
5606
/*---------------------------------------------------------------------------*/
5607
/* fix character string for export into XML */
5608
/*---------------------------------------------------------------------------*/
5609
static CharPtr CddFixForXML(CharPtr pc)
5614
pcNew = MemNew(sizeof(Char) * (Nlm_StrLen(pc) + 100));
5616
while (pc[i] != '\0') {
5623
} else if (pc[i] == '<') {
5629
} else if (pc[i] == '&') {
5644
pcNew = Nlm_Realloc(pcNew, sizeof(Char) * Nlm_StrLen(pcNew));
5648
/*---------------------------------------------------------------------------*/
5649
/*---------------------------------------------------------------------------*/
5650
/* dump out pubmed links for a CD to be used in Entrez neighboring */
5651
/*---------------------------------------------------------------------------*/
5652
/*---------------------------------------------------------------------------*/
5653
void LIBCALL CddDumpPMLinks(CddPtr pcdd, FILE *FP)
5656
Int4 gi = 0, pmid = 0;
5657
ValNodePtr pub, pdescr;
5659
cid = (CddIdPtr) pcdd->id;
5661
switch (cid->choice) {
5663
gi = cid->data.intvalue;
5671
pdescr = pcdd->description;
5673
switch (pdescr->choice) {
5674
case CddDescr_reference:
5675
pub = pdescr->data.ptrvalue;
5676
if (pub->choice == PUB_PMid) {
5677
pmid = pub->data.intvalue;
5679
fprintf(FP,"%d\t%d\t0\n",gi,pmid);
5686
pdescr = pdescr->next;
5691
/*---------------------------------------------------------------------------*/
5692
/*---------------------------------------------------------------------------*/
5693
/* dump out taxonomy links for a CD to be used in Entrez neighboring */
5694
/*---------------------------------------------------------------------------*/
5695
/*---------------------------------------------------------------------------*/
5696
void LIBCALL CddDumpTaxLinks(CddPtr pcdd, FILE *FP)
5699
Int4 txid = 0, gi = 0;
5702
ValNodePtr pdescr, vnp;
5705
cid = (CddIdPtr) pcdd->id;
5707
switch (cid->choice) {
5709
gi = cid->data.intvalue;
5717
pdescr = pcdd->description;
5719
switch (pdescr->choice) {
5720
case CddDescr_tax_source:
5721
pOrgRef = (OrgRefPtr) pdescr->data.ptrvalue;
5724
dbtp = (DbtagPtr) vnp->data.ptrvalue;
5725
if (dbtp && Nlm_StrCmp(dbtp->db,"taxon") == 0) {
5727
if (oidp && oidp->id >= 0) {
5728
fprintf(FP,"%d\t%d\t0\n",gi,oidp->id);
5737
pdescr = pdescr->next;
5742
/*---------------------------------------------------------------------------*/
5743
/*---------------------------------------------------------------------------*/
5744
/* dump out contents of a CD used for Entrez-indexing in pseudo-XML */
5745
/*---------------------------------------------------------------------------*/
5746
/*---------------------------------------------------------------------------*/
5747
void LIBCALL CddDumpXML(CddPtr pcdd, FILE *FP)
5749
ValNodePtr pdescr, cid;
5750
CharPtr cShortName = NULL, cAbstract = NULL, cPubDate = NULL;
5751
CharPtr cEntrezDate = NULL, cFilter = NULL, cOrganism = NULL;
5752
CharPtr cAccession = NULL, cDatabase = NULL;
5753
Int4 gi = 0, txid = -1;
5754
Boolean bHaveAbstract = FALSE, bAllocDatabase = FALSE;
5762
cEntrezDate = MemNew(11 * sizeof(Char));
5763
cid = (CddIdPtr) pcdd->id;
5765
switch (cid->choice) {
5767
gi = cid->data.intvalue;
5770
pGid = (GlobalIdPtr) cid->data.ptrvalue;
5771
cAccession = pGid->accession;
5778
pdescr = pcdd->description;
5780
switch (pdescr->choice) {
5781
case CddDescr_comment:
5782
if (!bHaveAbstract) {
5783
cAbstract = (CharPtr) pdescr->data.ptrvalue;
5784
bHaveAbstract = TRUE;
5787
case CddDescr_source:
5788
cDatabase = (CharPtr) pdescr->data.ptrvalue;
5789
case CddDescr_tax_source:
5790
pOrgRef = (OrgRefPtr) pdescr->data.ptrvalue;
5791
cOrganism = pOrgRef->taxname;
5794
dbtp = (DbtagPtr) vnp->data.ptrvalue;
5795
if (dbtp && Nlm_StrCmp(dbtp->db,"taxon") == 0) {
5797
if (oidp && oidp->id >= 0) {
5804
case CddDescr_create_date:
5805
case CddDescr_update_date:
5806
cPubDate = MemNew(11 * sizeof(Char));
5807
pDate = (DatePtr) pdescr->data.ptrvalue;
5808
if (pDate->data[2] > 0 && pDate->data[3] > 0)
5809
sprintf(cPubDate,"%d/%d/%d",pDate->data[1]+1900,pDate->data[2],pDate->data[3]);
5810
if (pDate->data[2] > 0 && pDate->data[3] == 0)
5811
sprintf(cPubDate,"%d/%d",pDate->data[1]+1900,pDate->data[2]);
5812
if (pDate->data[2] == 0 && pDate->data[3] == 0)
5813
sprintf(cPubDate,"%d",pDate->data[1]+1900);
5818
pdescr = pdescr->next;
5820
cShortName = (CharPtr) pcdd->name;
5822
if (Nlm_StrNCmp(cAccession,"cd",2) == 0) {
5823
cDatabase = StringSave("Cdd");
5824
bAllocDatabase = TRUE;
5828
cPubDate = MemNew(11 * sizeof(Char));
5829
pDate = (DatePtr) DateCurr();
5830
if (pDate->data[2] > 0 && pDate->data[3] > 0)
5831
sprintf(cPubDate,"%d/%d/%d",pDate->data[1]+1900,pDate->data[2],pDate->data[3]);
5832
if (pDate->data[2] > 0 && pDate->data[3] == 0)
5833
sprintf(cPubDate,"%d/%d",pDate->data[1]+1900,pDate->data[2]);
5834
if (pDate->data[2] == 0 && pDate->data[3] == 0)
5835
sprintf(cPubDate,"%d",pDate->data[1]+1900);
5837
pDate = (DatePtr) DateCurr();
5838
if (pDate->data[2] > 0 && pDate->data[3] > 0)
5839
sprintf(cEntrezDate,"%d/%d/%d",pDate->data[1]+1900,pDate->data[2],pDate->data[3]);
5840
if (pDate->data[2] > 0 && pDate->data[3] == 0)
5841
sprintf(cEntrezDate,"%d/%d",pDate->data[1]+1900,pDate->data[2]);
5842
if (pDate->data[2] == 0 && pDate->data[3] == 0)
5843
sprintf(cEntrezDate,"%d",pDate->data[1]+1900);
5845
cAbstract = CddFixForXML(cAbstract);
5846
cShortName = CddFixForXML(cShortName);
5848
fprintf(FP," <IdxDocument>\n");
5850
fprintf(FP," <IdxUid>%d</IdxUid>\n",gi);
5851
fprintf(FP," <IdxKeyFields>\n");
5853
fprintf(FP," <PubDate>%s</PubDate>\n",cPubDate);
5855
fprintf(FP," <EntrezDate>%s</EntrezDate>\n",cEntrezDate);
5856
fprintf(FP," </IdxKeyFields>\n");
5857
fprintf(FP," <IdxDisplayFields>\n");
5859
fprintf(FP," <Accession>%s</Accession>\n",cAccession);
5861
fprintf(FP," <Title>%s</Title>\n",cShortName);
5863
fprintf(FP," <Abstract>%s</Abstract>\n",cAbstract);
5864
fprintf(FP," </IdxDisplayFields>\n");
5865
fprintf(FP," <IdxSearchFields>\n");
5867
fprintf(FP," <Uid>%d</Uid>\n",gi);
5869
fprintf(FP," <Filter>%s</Filter>\n",cFilter);
5871
fprintf(FP," <Accession>%s</Accession>\n",cAccession);
5873
fprintf(FP," <Database>%s</Database>\n",cDatabase);
5875
fprintf(FP," <Title>%s</Title>\n",cShortName);
5877
fprintf(FP," <Text>%s</Text>\n",cAbstract);
5879
fprintf(FP," <Organism>txid%d</Organism>\n",txid);
5881
fprintf(FP," <Organism>%s</Organism>\n",cOrganism);
5883
fprintf(FP," <PubDate>%s</PubDate>\n",cPubDate);
5885
fprintf(FP," <EntrezDate>%s</EntrezDate>\n",cEntrezDate);
5886
fprintf(FP," </IdxSearchFields>\n");
5887
fprintf(FP," </IdxDocument>\n");
5889
if (bAllocDatabase) MemFree(cDatabase);
5891
MemFree(cEntrezDate);
5894
/*---------------------------------------------------------------------------*/
5895
/*---------------------------------------------------------------------------*/
5896
/* create and link a CddIdxData structure */
5897
/*---------------------------------------------------------------------------*/
5898
/*---------------------------------------------------------------------------*/
5899
CddIdxDataPtr LIBCALL CddIdxDataNew(CharPtr acc, Int4 uid)
5904
cidp = MemNew(sizeof(CddIdxData));
5908
cidp->iPssmId = uid;
5914
CddIdxDataPtr LIBCALL CddIdxDataLink(CddIdxDataPtr PNTR head, CddIdxDataPtr cidp)
5916
CddIdxDataPtr cidpThis;
5920
while (cidpThis->next != NULL) cidpThis = cidpThis->next;
5921
cidpThis->next = cidp;
5929
/*---------------------------------------------------------------------------*/
5930
/*---------------------------------------------------------------------------*/
5931
/* read in the cdd.idx file to return data associating accessions with uids */
5932
/*---------------------------------------------------------------------------*/
5933
/*---------------------------------------------------------------------------*/
5934
CddIdxDataPtr LIBCALL CddReadIdx(CharPtr CDDidx)
5936
CddIdxDataPtr cidp, cidpHead = NULL;
5943
fp = FileOpen(CDDidx,"r");
5944
if (!fp) CddSevError("could not open cdd.idx!");
5947
pcTest = fgets(pcBuf, (size_t)2048, fp);
5948
if (pcTest) if (pcTest[0] != ' ') {
5949
uid = (Int4) atoi(Nlm_StrTok(pcTest," "));
5950
acc = (CharPtr) StringSave(Nlm_StrTok(NULL," "));
5952
acc[Nlm_StrLen(acc) - 1] = '\0';
5953
cidp = CddIdxDataNew(acc,uid);
5954
CddIdxDataLink(&(cidpHead),cidp);
5963
/*---------------------------------------------------------------------------*/
5964
/*---------------------------------------------------------------------------*/
5965
/* retrieve a CDD accession via the PSSMid or vice versa */
5966
/*---------------------------------------------------------------------------*/
5967
/*---------------------------------------------------------------------------*/
5968
void LIBCALL CddAccFromPssmId(Int4 iPssmId, CharPtr cAcc, CharPtr CDDidx)
5972
cidp = CddReadIdx(CDDidx);
5973
if (!cidp) CddSevError("cdd.idx read failed!");
5975
if (iPssmId == cidp->iPssmId) {
5976
Nlm_StrCpy(cAcc, cidp->cCDDid);
5983
void LIBCALL CddPssmIdFromAcc(Int4 *iPssmId, CharPtr cAcc, CharPtr CDDidx)
5987
cidp = CddReadIdx(CDDidx);
5988
if (!cidp) CddSevError("cdd.idx read failed!");
5990
if (Nlm_StrCmp(cAcc, cidp->cCDDid) == 0) {
5991
*iPssmId = cidp->iPssmId;
5998
/*---------------------------------------------------------------------------*/
5999
/*---------------------------------------------------------------------------*/
6000
/* truncate a string right at the position of the first puncutation mark */
6001
/*---------------------------------------------------------------------------*/
6002
/*---------------------------------------------------------------------------*/
6003
void LIBCALL CddTruncStringAtFirstPunct(CharPtr pChar)
6007
for (i=0;i<Nlm_StrLen(pChar);i++) {
6008
if (pChar[i] == ',' ||
6011
pChar[i] = '\0'; break;
6016
void LIBCALL CddFillBlanksInString(CharPtr pChar)
6020
for (i=0;i<Nlm_StrLen(pChar);i++) {
6021
if (pChar[i] == ' ') pChar[i] = '_';
6025
/*---------------------------------------------------------------------------*/
6026
/*---------------------------------------------------------------------------*/
6027
/* Remove a Consensus from a CD */
6028
/*---------------------------------------------------------------------------*/
6029
/*---------------------------------------------------------------------------*/
6030
Boolean LIBCALL CddRemoveConsensus(CddPtr pcdd)
6034
SeqAlignPtr salp, salpnew;
6037
SeqEntryPtr sepThis, sepHead = NULL, sepTail = NULL;
6039
if (!CddHasConsensus(pcdd)) return FALSE;
6040
salp = (SeqAlignPtr) pcdd->seqannot->data;
6041
ddp = (DenseDiagPtr) salp->segs;
6042
sip = (SeqIdPtr) ddp->id->next;
6043
bssp = (BioseqSetPtr) pcdd->sequences->data.ptrvalue;
6044
if (pcdd->alignannot) {
6045
CddTransferAlignAnnot(pcdd->alignannot, sip, salp, bssp);
6047
salpnew = CddReindexSeqAlign(salp,sip,bssp);
6048
pcdd->seqannot->data = salpnew->next;
6049
sepThis = bssp->seq_set;
6051
bsp = (BioseqPtr) sepThis->data.ptrvalue;
6052
if (!SipIsConsensus(bsp->id)) {
6053
if (!sepHead) sepHead = sepThis;
6054
if (sepTail) sepTail->next = sepThis;
6057
sepThis = sepThis->next;
6059
sepTail->next = NULL;
6060
bssp->seq_set = sepHead;
6061
pcdd->sequences->data.ptrvalue = bssp;
6062
pcdd->profile_range = NULL;
6063
pcdd->trunc_master = NULL;
6064
pcdd->posfreq = NULL;
6065
pcdd->scoremat = NULL;
6066
pcdd->distance = NULL;
6070
/*---------------------------------------------------------------------------*/
6071
/*---------------------------------------------------------------------------*/
6072
/* Check if a CD has annotation that is inconsistent with the alignment */
6073
/*---------------------------------------------------------------------------*/
6074
/*---------------------------------------------------------------------------*/
6075
Boolean LIBCALL CddFeaturesAreConsistent(CddPtr pcdd, CharPtr errmsg)
6084
if (!pcdd->alignannot) return TRUE;
6085
bssp = (BioseqSetPtr) pcdd->sequences->data.ptrvalue;
6086
salp = (SeqAlignPtr) pcdd->seqannot->data;
6087
if (!salp) return TRUE;
6088
eap = SeqAlignToCddExpAlign(salp,bssp->seq_set);
6089
aap = pcdd->alignannot;
6091
slp = aap->location;
6092
iWhere = CddSeqLocInExpAlign(slp,eap);
6094
if (iWhere == INT4_MAX) {
6095
sprintf(errmsg,"Feature: %s, illegal feature address",aap->description);
6097
sprintf(errmsg,"Feature: %s, Location: %d",aap->description,iWhere+1);
6103
CddExpAlignFree(eap);
6108
/*---------------------------------------------------------------------------*/
6109
/*---------------------------------------------------------------------------*/
6110
/* find a MMDB-Identifier in a Bioseq */
6111
/*---------------------------------------------------------------------------*/
6112
/*---------------------------------------------------------------------------*/
6113
SeqAnnotPtr LIBCALL CddFindMMDBIdInBioseq(BioseqPtr bsp, Int4 *iMMDBid)
6115
SeqAnnotPtr annot, prevannot = NULL;
6123
if (annot->type == 4) {
6126
if (sip->choice == SEQID_GENERAL) {
6127
dbtp = sip->data.ptrvalue;
6128
if (Nlm_StrCmp(dbtp->db,"mmdb") == 0) {
6130
*iMMDBid = oidp->id;
6137
if (annot->next) prevannot = annot;
6138
annot = annot->next;
6144
/*---------------------------------------------------------------------------*/
6145
/*---------------------------------------------------------------------------*/
6146
/* Check if a CD has 3D superposition information for all aligned chains */
6147
/*---------------------------------------------------------------------------*/
6148
/*---------------------------------------------------------------------------*/
6149
Boolean LIBCALL CddHas3DSuperpos(CddPtr pcdd)
6152
BiostrucAnnotSetPtr basp = NULL;
6155
SeqIdPtr sip1, sip2;
6157
Int4 i, nStruct = 0;
6159
BiostrucFeatureSetPtr bfsp;
6160
BiostrucFeaturePtr bfp;
6161
ValNodePtr location, vnp;
6162
ChemGraphAlignmentPtr cgap;
6165
pMMDBid = MemNew(250 * sizeof(Int4));
6166
basp = pcdd->features;
6167
if (!basp) return FALSE;
6168
bssp = (BioseqSetPtr) pcdd->sequences->data.ptrvalue;
6169
/*---------------------------------------------------------------------------*/
6170
/* create a list of MMDB-Id pairs which need to be checked */
6171
/*---------------------------------------------------------------------------*/
6172
salp = pcdd->seqannot->data;
6176
sip2 = ddp->id->next;
6177
if (sip1->choice == SEQID_PDB) {
6179
bsp = CddFindSip(sip1,bssp->seq_set);
6180
CddFindMMDBIdInBioseq(bsp,&pMMDBid[nStruct]);
6184
if (sip2->choice == SEQID_PDB) {
6185
bsp = CddFindSip(sip2,bssp->seq_set);
6186
CddFindMMDBIdInBioseq(bsp,&pMMDBid[nStruct]);
6191
/*---------------------------------------------------------------------------*/
6192
/* now walk down the biostruc annot set and remove all pairs encountered */
6193
/*---------------------------------------------------------------------------*/
6194
bfsp = basp->features;
6196
bfp = bfsp->features;
6198
location = bfp->Location_location;
6199
if (location->choice == Location_location_alignment) {
6200
cgap = location->data.ptrvalue;
6201
vnp = cgap->biostruc_ids;
6202
thisslave = vnp->next->data.intvalue;
6203
for (i=1;i<nStruct;i++) {
6204
if (thisslave == pMMDBid[i]) {
6205
pMMDBid[i] = 0; break;
6214
/*---------------------------------------------------------------------------*/
6215
/* at this point check if any pair is left (i.e. any mmdb-id except master) */
6216
/*---------------------------------------------------------------------------*/
6217
for (i=1;i<nStruct;i++) {
6219
MemFree(pMMDBid); return(FALSE);
6222
MemFree(pMMDBid); return TRUE;
6225
/*---------------------------------------------------------------------------*/
6226
/*---------------------------------------------------------------------------*/
6227
/* CD has unfinished alignment business */
6228
/*---------------------------------------------------------------------------*/
6229
/*---------------------------------------------------------------------------*/
6230
Boolean LIBCALL CddHasPendingAlignments(CddPtr pcdd)
6232
if (pcdd->pending) return TRUE;
6236
/*---------------------------------------------------------------------------*/
6237
/*---------------------------------------------------------------------------*/
6239
/*---------------------------------------------------------------------------*/
6240
/*---------------------------------------------------------------------------*/
6241
Boolean LIBCALL SeqHasTax(BioseqPtr bsp)
6247
if (descr->choice == Seq_descr_source) {
6250
descr = descr->next;
6255
/*---------------------------------------------------------------------------*/
6256
/*---------------------------------------------------------------------------*/
6257
/* Fetch bioseq using BLAST dB, removing ids redundant to query. */
6258
/* index is the NR index in the dB -- if you don't have it, use -1. */
6259
/*---------------------------------------------------------------------------*/
6260
/*---------------------------------------------------------------------------*/
6261
NLM_EXTERN BlastDefLineSetPtr LIBCALL BlastDefLineSetFree PROTO ((BlastDefLineSetPtr ));
6263
BioseqPtr LIBCALL CddReadDBGetBioseq(SeqIdPtr query, Int4 index,
6266
return CddReadDBGetBioseqEx(query,index,rdfp,TRUE);
6269
BioseqPtr LIBCALL CddReadDBGetBioseqEx(SeqIdPtr query, Int4 index,
6270
ReadDBFILEPtr rdfp, Boolean bUseObjMgr)
6273
BlastDefLinePtr bdp,tbdp;
6274
RDBTaxNamesPtr tnames;
6277
if(query == NULL) return NULL;
6278
if(index < 0 && (index = SeqId2OrdinalId(rdfp, query)) < 0)
6280
if(!(bsp = readdb_get_bioseq_ex(rdfp, index, bUseObjMgr, FALSE))) return NULL;
6281
if(!(tbdp = FDGetDeflineAsnFromBioseq(bsp))) return NULL;
6283
for(bdp=tbdp; bdp; bdp=bdp->next) {
6284
if(SeqIdComp(bdp->seqid,query) == SIC_YES ||
6285
SeqIdComp(bdp->seqid->next,query) == SIC_YES) {
6286
tnames = RDBGetTaxNames(rdfp->taxinfo, bdp->taxid);
6291
SeqIdPrint(query,buf,PRINTID_FASTA_SHORT);
6292
ErrPostEx(SEV_WARNING,0,0,"Bad seqid %s",buf);
6295
bsp->id = SeqIdSetFree(bsp->id);
6296
bsp->id = SeqIdDup(bdp->seqid->next);
6297
bsp->id->next = SeqIdDup(bdp->seqid);
6299
/* Remove concatenated deflines. */
6300
SeqDescrFree(ValNodeExtractList(&bsp->descr, Seq_descr_title));
6301
/* Replace with single defline. */
6302
ValNodeCopyStr(&bsp->descr, Seq_descr_title, bdp->title);
6304
tbdp = BlastDefLineSetFree(tbdp);
6306
/* Remove all user data (tax info, asn1 defline). */
6307
SeqDescrFree(ValNodeExtractList(&bsp->descr, Seq_descr_user));
6309
if(tnames) { /* Add taxid, taxnames. */
6313
bsrp = BioSourceNew();
6314
bsrp->subtype = NULL;
6315
bsrp->org = OrgRefNew();
6316
if(tnames->sci_name)
6317
bsrp->org->taxname = StringSave(tnames->sci_name);
6318
if(tnames->common_name)
6319
bsrp->org->common = StringSave(tnames->common_name);
6321
dtp->db = StringSave("taxon");
6322
dtp->tag = ObjectIdNew();
6323
dtp->tag->id = tnames->tax_id;
6324
bsrp->org->db = NULL;
6325
ValNodeAddPointer(&bsrp->org->db, 0, dtp);
6326
ValNodeAddPointer(&bsp->descr, Seq_descr_source, bsrp);
6328
RDBTaxNamesFree(tnames);
6330
/* if (!SeqHasTax(bsp)) {
6331
printf("Warning: Bioseq without taxonomy!");
6332
if (query->choice == SEQID_GI) {
6333
printf("GI: %d\n",query->data.intvalue);
6334
} else printf("\n");
6339
/*---------------------------------------------------------------------------*/
6340
/*---------------------------------------------------------------------------*/
6341
/* return a common style dictionary for Cn3D 4.0 */
6342
/*---------------------------------------------------------------------------*/
6343
/*---------------------------------------------------------------------------*/
6345
Cn3dStyleDictionaryPtr LIBCALL CddSrvGetStyle2(Int4 *styles[], Int4 nstyles)
6347
Cn3dStyleDictionaryPtr csdp;
6348
Cn3dStyleTableItemPtr cstip,cstipTail,cstipHead=NULL;
6351
if(nstyles < 1) return(NULL);
6352
csdp = Cn3dStyleDictionaryNew();
6353
csdp->global_style = CddSrvGetStyle2_Ex(styles[0]);
6354
for(i=1; i<nstyles; i++) {
6355
cstip = Cn3dStyleTableItemNew();
6357
cstip->style = CddSrvGetStyle2_Ex(styles[i]);
6359
if(!cstipHead) cstipHead = cstipTail = cstip;
6361
cstipTail->next = cstip;
6365
csdp->style_table = cstipHead;
6369
static Cn3dStyleSettingsPtr CddSrvGetStyle2_Ex(Int4 style[])
6371
Cn3dStyleSettingsPtr cssp;
6372
Cn3dBackboneStylePtr cbsp;
6373
Cn3dGeneralStylePtr cgsp;
6374
Cn3dBackboneLabelStylePtr cblsp;
6376
cssp = Cn3dStyleSettingsNew();
6379
cbsp = Cn3dBackboneStyleNew();
6380
cbsp->type = style[0];
6381
cbsp->style = style[1];
6382
cbsp->color_scheme = style[2];
6384
MyCn3dColorInit(style[3],style[4],style[5],
6386
cssp->protein_backbone = cbsp;
6387
cbsp = Cn3dBackboneStyleNew();
6388
cbsp->type = style[8];
6389
cbsp->style = style[9];
6390
cbsp->color_scheme = style[10];
6392
MyCn3dColorInit(style[11],style[12],style[13],
6393
style[14],style[15]);
6394
cssp->nucleotide_backbone = cbsp;
6395
cgsp = Cn3dGeneralStyleNew();
6396
cgsp->is_on = style[16];
6397
cgsp->style = style[17];
6398
cgsp->color_scheme = style[18];
6400
MyCn3dColorInit(style[19],style[20],style[21],
6401
style[22],style[23]);
6402
cssp->protein_sidechains = cgsp;
6403
cgsp = Cn3dGeneralStyleNew();
6404
cgsp->is_on = style[24];
6405
cgsp->style = style[25];
6406
cgsp->color_scheme = style[26];
6408
MyCn3dColorInit(style[27],style[28],style[29],
6409
style[30],style[31]);
6410
cssp->nucleotide_sidechains = cgsp;
6411
cgsp = Cn3dGeneralStyleNew();
6412
cgsp->is_on = style[32];
6413
cgsp->style = style[33];
6414
cgsp->color_scheme = style[34];
6416
MyCn3dColorInit(style[35],style[36],style[37],
6417
style[38],style[39]);
6418
cssp->heterogens = cgsp;
6419
cgsp = Cn3dGeneralStyleNew();
6420
cgsp->is_on = style[40];
6421
cgsp->style = style[41];
6422
cgsp->color_scheme = style[42];
6424
MyCn3dColorInit(style[43],style[44],style[45],
6425
style[46],style[47]);
6426
cssp->solvents = cgsp;
6427
cgsp = Cn3dGeneralStyleNew();
6428
cgsp->is_on = style[48];
6429
cgsp->style = style[49];
6430
cgsp->color_scheme = style[50];
6432
MyCn3dColorInit(style[51],style[52],style[53],
6433
style[54],style[55]);
6434
cssp->connections = cgsp;
6435
cgsp = Cn3dGeneralStyleNew();
6436
cgsp->is_on = style[56];
6437
cgsp->style = style[57];
6438
cgsp->color_scheme = style[58];
6440
MyCn3dColorInit(style[59],style[60],style[61],
6441
style[62],style[63]);
6442
cssp->helix_objects = cgsp;
6443
cgsp = Cn3dGeneralStyleNew();
6444
cgsp->is_on = style[64];
6445
cgsp->style = style[65];
6446
cgsp->color_scheme = style[66];
6448
MyCn3dColorInit(style[67],style[68],style[69],
6449
style[70],style[71]);
6450
cssp->strand_objects = cgsp;
6451
cssp->virtual_disulfides_on = style[72];
6452
cssp->virtual_disulfide_color =
6453
MyCn3dColorInit(style[73],style[74],style[75],
6454
style[76],style[77]);
6455
cssp->hydrogens_on = style[78];
6456
cssp->background_color =
6457
MyCn3dColorInit(style[79],style[80],style[81],
6458
style[82],style[83]);
6459
cssp->scale_factor = style[84];
6460
cssp->space_fill_proportion = style[85];
6461
cssp->ball_radius = style[86];
6462
cssp->stick_radius = style[87];
6463
cssp->tube_radius = style[88];
6464
cssp->tube_worm_radius = style[89];
6465
cssp->helix_radius = style[90];
6466
cssp->strand_width = style[91];
6467
cssp->strand_thickness = style[92];
6468
cblsp = Cn3dBackboneLabelStyleNew();
6469
cblsp->spacing = style[93];
6470
cblsp->type = style[94];
6471
cblsp->number = style[95];
6472
cblsp->termini = style[96];
6473
cblsp->white = style[97];
6474
cssp->protein_labels = cblsp;
6475
cblsp = Cn3dBackboneLabelStyleNew();
6476
cblsp->spacing = style[98];
6477
cblsp->type = style[99];
6478
cblsp->number = style[100];
6479
cblsp->termini = style[101];
6480
cblsp->white = style[102];
6481
cssp->nucleotide_labels =cblsp;
6482
cssp->ion_labels = style[103];
6486
static Cn3dColorPtr MyCn3dColorInit(Int4 scale_factor, Int4 red, Int4 green, Int4 blue,
6491
ccp = Cn3dColorNew();
6492
ccp->scale_factor = scale_factor;