30
30
* Version Creation Date: 4/1/91
34
34
* File Description: Sequence Utilities for objseq and objsset
37
37
* --------------------------------------------------------------------------
38
38
* $Log: sequtil.c,v $
39
* Revision 6.161 2004/10/13 16:46:52 kans
40
* added DA, DB, DC as DDBJ_EST
42
* Revision 6.160 2004/09/21 22:34:36 dondosha
43
* Get the number of linked HSPs for a DenDiag from the first segment, not from the largest linked set
45
* Revision 6.159 2004/09/15 13:02:02 kans
46
* added CW to WHICH_db_accession as NCBI GSS
48
* Revision 6.158 2004/08/20 18:55:27 kans
49
* SeqLocStrand skips NULL or EMPTY components of MIX to avoid giving incorrect unknown result
51
* Revision 6.157 2004/08/17 19:35:44 kans
52
* BSPack uses BSRead instead of BSGetByte for significant speed increase
54
* Revision 6.156 2004/08/06 17:15:50 kans
55
* added CV as NCBI EST
57
* Revision 6.155 2004/08/04 17:15:16 kans
58
* added AccnInUniProt - still need AccnIsSWISSPROT for old style
60
* Revision 6.154 2004/07/21 18:05:00 rsmith
61
* SeqLocStart and SeqLocStop were not handling SEQLOC_PACKED_PNTs properly
63
* Revision 6.153 2004/07/14 22:55:04 dondosha
64
* Add version in GetAccessionVersionFromSeqId only if it is > 0
66
* Revision 6.152 2004/07/14 22:46:08 dondosha
67
* Added GetAccessionVersionFromSeqId function to extract Accession.version from a Seq-id
69
* Revision 6.151 2004/07/14 19:09:19 kans
70
* added CP for ACCN_NCBI_GENOME in WHICH_db_accession
72
* Revision 6.150 2004/06/04 17:31:34 kans
73
* added CN and CO accession prefixes
75
* Revision 6.149 2004/05/27 15:37:31 kans
76
* fixed typo in WHICH_db_accession - new 12-character RefSeq test used wrong comparison for underscore test
78
* Revision 6.148 2004/05/25 20:46:18 kans
79
* WHICH_db_accession handles 12-character RefSeq accessions
81
* Revision 6.147 2004/03/30 20:29:33 kans
82
* in static std_order array within SeqIdBestRank, demoted gibbsq, gibbmt, and patent
84
* Revision 6.146 2004/03/16 22:08:31 kans
85
* added CL to WHICH_db_accession as NCBI EST
87
* Revision 6.145 2004/03/15 18:42:59 coulouri
88
* Handle memory allocation failure gracefully with BSRebuildDNA_4na
90
* Revision 6.144 2004/01/16 16:37:01 kans
91
* added CM as scaffold/CON record
93
* Revision 6.143 2003/12/18 19:35:53 kans
94
* added CQ and CR prefixes
96
* Revision 6.142 2003/12/16 16:03:04 kans
97
* added CL as ncbi gss
99
* Revision 6.141 2003/11/10 16:12:33 kans
100
* added CK as genbank est
102
* Revision 6.140 2003/10/31 20:51:24 kans
103
* added CI and CJ as DDBJ ESTs to WHICH_db_accession
105
* Revision 6.139 2003/10/24 14:36:12 kans
106
* added CH as GenBank CONN to WHICH_db_accession
108
* Revision 6.138 2003/09/09 20:08:18 kans
109
* SeqLocPartialCheck locks bioseq if seqloc_whole and far
111
* Revision 6.137 2003/09/02 15:11:50 kans
112
* WHICH_db_accession takes ZP_ with 8 digits as refseq_prot_predicted
114
* Revision 6.136 2003/08/11 13:45:18 kans
115
* added CG as ncbi gss
117
* Revision 6.135 2003/07/14 20:17:53 kans
118
* added CF as ncbi est to WHICH_db_accession
120
* Revision 6.134 2003/07/02 14:35:21 kans
121
* added CE as ncbi gss
123
* Revision 6.133 2003/05/20 22:15:24 yaschenk
124
* SeqIdSelect loops indefinitely on corrupted memory
126
* Revision 6.132 2003/04/30 16:40:41 kans
127
* added CD as GenBank EST
129
* Revision 6.131 2003/03/25 13:32:22 kans
130
* added CC as ncbi gss accession prefix
132
* Revision 6.130 2003/03/24 19:41:56 kans
133
* added tmsmart_order, use in SeqIdWrite to prevent TMSMART temporary ID from being used
135
* Revision 6.129 2003/02/20 19:05:31 ford
136
* Modified MakeNewProteinSeqIdExMT() to create an ID of maximum length 30 instead of 20.
138
* Revision 6.128 2003/01/21 17:06:57 kans
139
* implement PRINTID_FASTA_ALL SeqIdWrite
141
* Revision 6.127 2003/01/13 18:15:35 kans
142
* added CB as NCBI EST prefix
144
* Revision 6.126 2002/11/05 18:50:31 kans
145
* fixed bug in SeqLocPartialCheck
147
* Revision 6.125 2002/10/19 19:11:21 kans
148
* added CA as GenBank EST
150
* Revision 6.124 2002/10/03 16:18:35 kans
151
* added BZ as NCBI GSS
153
* Revision 6.123 2002/09/20 20:47:51 kans
154
* added BY as DDBJ EST prefix
156
* Revision 6.122 2002/08/28 13:28:54 kans
159
* Revision 6.121 2002/08/26 20:38:26 kans
160
* added BW as ddbj est prefix
162
* Revision 6.120 2002/08/19 15:57:28 kans
163
* BV is NCBI STS prefix
165
* Revision 6.119 2002/08/19 15:54:47 kans
166
* added BU as NCBI EST prefix
168
* Revision 6.118 2002/07/12 18:47:46 kans
169
* WHICH_db_accession was using the result of AccnIsSWISSPROT incorrectly
171
* Revision 6.117 2002/07/08 20:25:45 kans
172
* added BT as FLI_cDNA type
174
* Revision 6.116 2002/06/28 14:48:03 kans
175
* added BS as ddbj genome project prefix
177
* Revision 6.115 2002/06/20 18:38:43 kans
178
* added FAA and GAA, ACCN_NCBI_WGS_PROT, ACCN_EMBL_WGS_PROT, ACCN_DDBJ_WGS_PROT, and ACCN_IS_WGS
180
* Revision 6.114 2002/06/19 17:13:48 kans
181
* added ACCN_PDB, support for PDB in SeqIdFromAccessionDotVersion
183
* Revision 6.113 2002/06/10 18:06:16 kans
184
* SeqLocLen use of smp->seq_len_lookup_func first checks sip for NULL
186
* Revision 6.112 2002/06/10 14:07:12 kans
187
* SeqLocLen on whole tries new smp->seq_len_lookup_func registered function
189
* Revision 6.111 2002/05/29 19:19:53 bazhin
190
* Added support for new EAA-EZZ protein WGS accessions.
192
* Revision 6.110 2002/04/24 17:11:03 kans
193
* added BR as DDBJ TPA accession prefix
195
* Revision 6.109 2002/04/02 18:19:56 kans
196
* SeqLocPartialCheck fixes
198
* Revision 6.108 2002/03/26 18:11:26 kans
199
* WHICH_db_accession WGS assignments - A*** NCBI, B*** DDBJ, C*** EMBL
201
* Revision 6.107 2002/03/12 17:08:32 kans
202
* added BQ as NCBI EST
204
* Revision 6.106 2002/02/14 18:30:27 kans
205
* SeqIdFromAccessionDotVersion defaults version to INT2_MIN
207
* Revision 6.105 2002/01/29 19:28:16 kans
208
* SeqIdParse can parse type|accession.ver with no trailing vertical bars for RefSeq and DNA database types
210
* Revision 6.104 2002/01/22 18:49:15 kans
211
* added ACCN_NCBI_WGS, ACCN_EMBL_WGS, and ACCN_DDBJ_WGS
213
* Revision 6.103 2002/01/17 13:49:32 kans
214
* BP added as DDBJ EST accession prefix
216
* Revision 6.102 2002/01/16 16:59:38 camacho
217
* Changed the type of buflen parameter in SeqIdWrite from Int2 to Uint4
39
219
* Revision 6.101 2001/11/29 14:04:29 kans
40
220
* reverted GetThePointForOffset; trans-splicing is now dealt with in the feature indexing left/right extreme calculation itself
8490
8893
if ((StringICmp(temp,"AAA") >= 0) && (StringICmp(temp,"AZZ") <= 0)) {
8491
8894
retcode = ACCN_NCBI_PROT;
8492
} else if ((StringICmp(temp,"DAA") >= 0) && (StringICmp(temp,"DZZ") <= 0)) {
8493
retcode = ACCN_NCBI_PROT;
8494
if ((StringICmp(temp,"DAA") == 0)) {
8495
retcode = ACCN_NCBI_TPA_PROT;
8895
} else if ((StringICmp(temp,"BAA") >= 0) && (StringICmp(temp,"BZZ") <= 0)) {
8896
retcode = ACCN_DDBJ_PROT;
8497
8897
} else if ((StringICmp(temp,"CAA") >= 0) && (StringICmp(temp,"CZZ") <= 0)) {
8498
8898
retcode = ACCN_EMBL_PROT;
8499
} else if ((StringICmp(temp,"BAA") >= 0) && (StringICmp(temp,"BZZ") <= 0)) {
8500
retcode = ACCN_DDBJ_PROT;
8899
} else if ((StringICmp(temp,"DAA") >= 0) && (StringICmp(temp,"DZZ") <= 0)) {
8900
retcode = ACCN_NCBI_TPA_PROT;
8901
} else if ((StringICmp(temp,"EAA") >= 0) && (StringICmp(temp,"EZZ") <= 0)) {
8902
retcode = ACCN_NCBI_WGS_PROT;
8903
} else if ((StringICmp(temp,"FAA") >= 0) && (StringICmp(temp,"FZZ") <= 0)) {
8904
retcode = ACCN_DDBJ_TPA_PROT;
8905
} else if ((StringICmp(temp,"GAA") >= 0) && (StringICmp(temp,"GZZ") <= 0)) {
8906
retcode = ACCN_DDBJ_WGS_PROT;
8502
8908
retcode = ACCN_IS_PROTEIN;
8516
8921
(StringICmp(temp,"BE") == 0) ||
8517
8922
(StringICmp(temp,"BF") == 0) ||
8518
8923
(StringICmp(temp,"BI") == 0) ||
8519
(StringICmp(temp,"BM") == 0) ) { /* NCBI EST */
8924
(StringICmp(temp,"BM") == 0) ||
8925
(StringICmp(temp,"BQ") == 0) ||
8926
(StringICmp(temp,"BU") == 0) ||
8927
(StringICmp(temp,"CA") == 0) ||
8928
(StringICmp(temp,"CB") == 0) ||
8929
(StringICmp(temp,"CD") == 0) ||
8930
(StringICmp(temp,"CF") == 0) ||
8931
(StringICmp(temp,"CK") == 0) ||
8932
(StringICmp(temp,"CN") == 0) ||
8933
(StringICmp(temp,"CO") == 0) ||
8934
(StringICmp(temp,"CV") == 0) ) { /* NCBI EST */
8520
8935
retcode = ACCN_NCBI_EST;
8936
} else if ((StringICmp(temp,"BV") == 0)) { /* NCBI STS */
8937
retcode = ACCN_NCBI_STS;
8521
8938
} else if ((StringICmp(temp,"AC") == 0)) { /* NCBI HTGS */
8522
8939
retcode = ACCN_NCBI_HTGS;
8523
8940
} else if ((StringICmp(temp,"AF") == 0) ||
8524
8941
(StringICmp(temp,"AY") == 0)) { /* NCBI direct submission */
8525
8942
retcode = ACCN_NCBI_DIRSUB;
8526
} else if ((StringICmp(temp,"AE") == 0)) { /* NCBI genome project data */
8943
} else if ((StringICmp(temp,"AE") == 0) ||
8944
(StringICmp(temp,"CP") == 0)) { /* NCBI genome project data */
8527
8945
retcode = ACCN_NCBI_GENOME;
8528
} else if ((StringICmp(temp,"AH") == 0)) { /* NCBI segmented set header Bioseq */
8946
} else if ((StringICmp(temp,"AH") == 0) ||
8947
(StringICmp(temp,"CH") == 0) || /* NCBI segmented set header Bioseq */
8948
(StringICmp(temp,"CM") == 0)) {
8529
8949
retcode = ACCN_NCBI_SEGSET | ACCN_AMBIGOUS_MOL; /* A few segmented
8530
8950
proteins are AH */
8531
8951
} else if ((StringICmp(temp,"AS") == 0)) { /* NCBI "other" */
8534
8954
retcode = ACCN_NCBI_GSDB;
8535
8955
} else if ((StringICmp(temp,"AQ") == 0) ||
8536
8956
(StringICmp(temp,"AZ") == 0) ||
8537
(StringICmp(temp,"BH") == 0) ) { /* NCBI GSS */
8957
(StringICmp(temp,"BH") == 0) ||
8958
(StringICmp(temp,"BZ") == 0) ||
8959
(StringICmp(temp,"CC") == 0) ||
8960
(StringICmp(temp,"CE") == 0) ||
8961
(StringICmp(temp,"CG") == 0) ||
8962
(StringICmp(temp,"CL") == 0) ||
8963
(StringICmp(temp,"CW") == 0) ) { /* NCBI GSS */
8538
8964
retcode = ACCN_NCBI_GSS;
8539
8965
} else if ((StringICmp(temp,"AR") == 0)) { /* NCBI patent */
8540
8966
retcode = ACCN_NCBI_PATENT;
8541
8967
} else if((StringICmp(temp,"BC")==0)) { /* NCBI long cDNA project : MGC */
8542
8968
retcode = ACCN_NCBI_cDNA;
8969
} else if((StringICmp(temp,"BT")==0)) { /* NCBI FLI_cDNA */
8970
retcode = ACCN_NCBI_cDNA;
8543
8971
} else if((StringICmp(temp,"BK")==0) || /* NCBI third-party annotation */
8544
8972
(StringICmp(temp,"BL") == 0)) {
8545
8973
retcode = ACCN_NCBI_TPA;
8546
8974
} else if ((StringICmp(temp,"BN") == 0)) { /* EMBL third-party annotation */
8547
8975
retcode = ACCN_EMBL_TPA;
8976
} else if ((StringICmp(temp,"BR") == 0)) { /* DDBJ third-party annotation */
8977
retcode = ACCN_DDBJ_TPA;
8548
8978
} else if ((StringICmp(temp,"AJ") == 0) ||
8549
8979
(StringICmp(temp,"AM") == 0)) { /* EMBL direct submission */
8550
8980
retcode = ACCN_EMBL_DIRSUB;
8551
} else if ((StringICmp(temp,"AL") == 0)) { /* EMBL genome project data */
8981
} else if ((StringICmp(temp,"AL") == 0) ||
8982
(StringICmp(temp,"BX") == 0)||
8983
(StringICmp(temp,"CR") == 0)) { /* EMBL genome project data */
8552
8984
retcode = ACCN_EMBL_GENOME;
8553
8985
} else if ((StringICmp(temp,"AN") == 0)) { /* EMBL CON division */
8554
8986
retcode = ACCN_EMBL_CON;
8555
} else if ((StringICmp(temp,"AX") == 0)) { /* EMBL patent division */
8987
} else if ((StringICmp(temp,"AX") == 0) ||
8988
(StringICmp(temp,"CQ") == 0)) { /* EMBL patent division */
8556
8989
retcode = ACCN_EMBL_PATENT;
8557
8990
} else if ((StringICmp(temp,"AT") == 0) ||
8558
8991
(StringICmp(temp,"AU") == 0) ||
8559
8992
(StringICmp(temp,"AV") == 0) ||
8560
8993
(StringICmp(temp,"BB") == 0) ||
8561
(StringICmp(temp,"BJ") == 0)) { /* DDBJ EST's */
8994
(StringICmp(temp,"BJ") == 0) ||
8995
(StringICmp(temp,"BP") == 0) ||
8996
(StringICmp(temp,"BW") == 0) ||
8997
(StringICmp(temp,"BY") == 0) ||
8998
(StringICmp(temp,"CI") == 0) ||
8999
(StringICmp(temp,"CJ") == 0) ||
9000
(StringICmp(temp,"DA") == 0) ||
9001
(StringICmp(temp,"DB") == 0) ||
9002
(StringICmp(temp,"DC") == 0)) { /* DDBJ EST's */
8562
9003
retcode = ACCN_DDBJ_EST;
8563
9004
} else if ((StringICmp(temp,"AB") == 0)) { /* DDBJ direct submission */
8564
9005
retcode = ACCN_DDBJ_DIRSUB;
8565
9006
} else if ((StringICmp(temp,"AG") == 0) ||
8566
(StringICmp(temp,"AP") == 0)) { /* DDBJ genome project data */
9007
(StringICmp(temp,"AP") == 0) ||
9008
(StringICmp(temp,"BS") == 0)) { /* DDBJ genome project data */
8567
9009
retcode = ACCN_DDBJ_GENOME;
8568
9010
} else if ((StringICmp(temp,"AK") == 0)) { /* DDBJ HTGS */
8569
9011
retcode = ACCN_DDBJ_HTGS;
9070
case 11: /* New 11-character accession, two letters +"_"+ 8 digits */
9071
if(!IS_ALPHA(*s) || !IS_ALPHA(*(s+1)))
9077
temp[2] = NULLB; s++;
9079
if ((StringICmp(temp,"ZP") == 0)) {
9080
retcode = ACCN_REFSEQ_PROT_PREDICTED;
9084
if (! IS_DIGIT(*s)) {
9092
if(IS_ALPHA(*s) && IS_ALPHA(*(s+1)) && IS_ALPHA(*(s+2)) && IS_ALPHA(*(s+3))) {
9093
/* whole genome shotgun 12-character accession, four letters + 8 digits */
9099
if ((StringNICmp(temp,"A", 1) == 0)) {
9100
retcode = ACCN_NCBI_WGS;
9101
} else if ((StringNICmp(temp,"B", 1) == 0)) {
9102
retcode = ACCN_DDBJ_WGS;
9103
} else if ((StringNICmp(temp,"C", 1) == 0)) {
9104
retcode = ACCN_EMBL_WGS;
9108
if (! IS_DIGIT(*s)) {
9114
} else if(IS_ALPHA(*s) && IS_ALPHA(*(s+1)) && (*(s+2)=='_')) {
9115
/* New 12-character accession, two letters +"_"+ 9 digits */
9118
temp[2] = NULLB; s++;
9120
if ((StringICmp(temp,"NP") == 0)) {
9121
retcode = ACCN_REFSEQ_PROT;
9122
} else if ((StringICmp(temp,"NM") == 0)) {
9123
retcode = ACCN_REFSEQ_mRNA;
9124
} else if (IS_ALPHA(*temp) && IS_ALPHA(*(temp+1))) {
9125
retcode =ACCN_REFSEQ | ACCN_AMBIGOUS_MOL;
9129
if (! IS_DIGIT(*s)) {
8630
9138
retval = FALSE;
8632
9140
} /* Endswitch, StringLen(s) */
8636
9142
return (retval ? retcode : ACCN_UNKNOWN);