2
* ===========================================================================
5
* National Center for Biotechnology Information
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's official duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government have not placed any restriction on its use or reproduction.
14
* Although all reasonable efforts have been taken to ensure the accuracy
15
* and reliability of the software and data, the NLM and the U.S.
16
* Government do not and cannot warrant the performance or results that
17
* may be obtained by using this software or data. The NLM and the U.S.
18
* Government disclaim all warranties, express or implied, including
19
* warranties of performance, merchantability or fitness for any particular purpose.
22
* Please cite the author in any work or product based on this material.
24
* ===========================================================================
28
* Author: James Ostell
30
* Version Creation Date: 1/1/94
34
* File Description: Sequence editing utilities
37
* --------------------------------------------------------------------------
38
* Date Name Description of modification
39
* ------- ---------- -----------------------------------------------------
42
* Revision 6.241 2001/12/14 20:19:44 kans
43
* added ERR_SEQ_FEAT_MultiIntervalGene
45
* Revision 6.240 2001/12/11 22:05:34 kans
46
* NC drops no protein to warning, suppresses if top IS_Bioseq
48
* Revision 6.239 2001/12/11 20:31:24 kans
49
* relax or suppress some messages for NC as well as NT or GPS
51
* Revision 6.238 2001/12/11 14:08:56 kans
52
* added ERR_SEQ_FEAT_CollidingGeneNames
54
* Revision 6.237 2001/12/10 21:28:17 kans
55
* added ERR_SEQ_FEAT_AbuttingIntervals
57
* Revision 6.236 2001/12/07 15:15:48 kans
58
* do not complain about cds packaging problem if nc_ gps made redundant prior to splitting
60
* Revision 6.235 2001/12/06 20:51:28 kans
61
* check label qualifier for single token not just numbers
63
* Revision 6.234 2001/12/06 18:43:44 kans
64
* do not complain about molinfo.biomol other if XR_ RefSeq
66
* Revision 6.233 2001/12/04 19:29:18 kans
67
* common CheckForIllegalDbxref function handles extended list for GPS or RefSeq
69
* Revision 6.232 2001/11/27 13:31:59 kans
70
* check circular topology for SeqLocOrder problem, last ValidateSeqLoc tests only on Seq_repr_seg
72
* Revision 6.231 2001/11/21 22:02:20 kans
73
* check for bssp class other, gen prod set mRNA feature with local seqid far product
75
* Revision 6.230 2001/11/16 12:41:43 kans
76
* fixes to ValidateSeqLoc
78
* Revision 6.229 2001/10/15 20:48:06 kans
79
* suppress ERR_SEQ_INST_RnaDnaConflict - this is how we indicate an mRNA sequenced from a cDNA
81
* Revision 6.228 2001/10/12 21:20:48 kans
82
* validate orgref dbxref against legal list, suppress farloc warning if NC_ record
84
* Revision 6.227 2001/10/10 20:15:15 kans
85
* added ERR_SEQ_INST_RnaDnaConflict
87
* Revision 6.226 2001/10/09 18:16:43 kans
88
* Molinfo-biomol other is SEV_WARNING
90
* Revision 6.225 2001/10/01 11:30:02 kans
91
* removed unused LockAllBioseqs - replaced by seqmgr function
93
* Revision 6.224 2001/09/27 18:22:32 kans
94
* PUBSTATUS_aheadofprint suppresses some ERR_GENERIC_MissingPubInfo checks
96
* Revision 6.223 2001/09/21 18:38:34 kans
97
* warn if exception text set but exception flag is not
99
* Revision 6.222 2001/09/21 15:26:36 kans
100
* warn on anticodon not equal to 3 bases
102
* Revision 6.221 2001/09/21 14:15:08 kans
103
* corrected logic in tRNA anticodon checks
105
* Revision 6.220 2001/09/19 20:05:19 kans
106
* report obsolete (unconverted) descriptor types
108
* Revision 6.219 2001/09/18 21:42:45 kans
109
* check for inconsistent protein title was stuck in infinite loop - fixed
111
* Revision 6.218 2001/09/18 15:54:13 kans
112
* warn on tRNA with unparsed string extension
114
* Revision 6.217 2001/09/10 21:08:30 kans
115
* run through gnu-indent to clean up code formatting (EN)
117
* Revision 6.216 2001/09/05 14:26:02 bazhin
118
* Fixed a couple of typos while checking CDS's Imp-feats for
119
* translation presence.
121
* Revision 6.215 2001/08/31 21:38:13 kans
122
* test for zero gi, imp cds with translation or without pseudo (EN)
124
* Revision 6.214 2001/08/31 16:12:29 kans
125
* added ERR_SEQ_DESCR_InconsistentProteinTitle
127
* Revision 6.213 2001/08/30 18:37:36 kans
128
* check for unparsed anticodon qualifier
130
* Revision 6.212 2001/08/30 15:48:42 kans
131
* do not check rpt_unit string length if multi_rpt_unit in parentheses
133
* Revision 6.211 2001/08/28 19:35:23 kans
134
* RNA imp feats (not converted because of ifp->loc) now reported as SEV_ERROR, also checking misc_RNA and precursor_RNA
136
* Revision 6.210 2001/08/27 19:04:31 kans
137
* warn if rna imp feat should be converted to rna feature
139
* Revision 6.209 2001/08/24 19:32:49 kans
140
* artificial frameshift suppresses SpliceCheck on mRNA as well as CDS
142
* Revision 6.208 2001/08/24 19:29:13 kans
143
* added artificial frameshift exception text to suppress splice check
145
* Revision 6.207 2001/08/20 15:31:28 kans
146
* check for codon gbqual on coding region, report with WrongQualOnImpFeat, for now
148
* Revision 6.206 2001/08/14 19:43:08 kans
149
* added ERR_GENERIC_UnnecessaryPubEquiv and ERR_SEQ_FEAT_UnnecessaryCitPubEquiv
151
* Revision 6.205 2001/08/06 23:59:45 kans
152
* added third party annotation SeqID support
154
* Revision 6.204 2001/08/02 14:59:18 kans
155
* TrnaCodonWrong uses SEV_WARNING for selenocysteine
157
* Revision 6.203 2001/08/01 16:55:09 kans
158
* if is_nc, feature location on segmented bioseq goes to SEV_WARNING
160
* Revision 6.202 2001/07/19 14:41:04 kans
161
* check for obsolete one-of text in ifp->loc
163
* Revision 6.201 2001/07/18 16:21:53 kans
164
* only allow unpublished or submitted in cgp->cit
166
* Revision 6.200 2001/07/18 16:11:15 kans
167
* check citgen->cit for illegal text, call ValidatePubdesc only once for each pub
169
* Revision 6.199 2001/07/18 15:35:09 kans
170
* check for published journal article missing volume and pages
172
* Revision 6.198 2001/07/16 21:06:40 kans
173
* added ERR_SEQ_INST_BadSecondaryAccn
175
* Revision 6.197 2001/07/13 00:05:35 kans
176
* added CheckSegBspAgainstParts to report ERR_SEQ_INST_PartsOutOfOrder
178
* Revision 6.196 2001/07/10 23:00:32 kans
179
* added ERR_SEQ_FEAT_LocOnSegmentedBioseq
181
* Revision 6.195 2001/07/10 22:41:36 kans
182
* added ERR_SEQ_FEAT_ImpFeatBadLoc
184
* Revision 6.194 2001/07/10 22:00:09 kans
185
* calls SeqLocMixedStrands to check for mixed strand on segmented bioseq
187
* Revision 6.193 2001/07/10 15:25:46 kans
188
* SeqIdNameHasSpace is now SEV_REJECT
190
* Revision 6.192 2001/07/08 21:29:45 kans
191
* added ERR_SEQ_DESCR_BadSubSource and ERR_SEQ_DESCR_BadOrgMod
193
* Revision 6.191 2001/06/25 19:16:41 kans
194
* check crp->conflict, do not check length or splice
196
* Revision 6.190 2001/06/14 21:59:16 kans
197
* suppress overlapping peptide error if exception says alternative processing or alternate processing
199
* Revision 6.189 2001/05/30 20:51:27 kans
200
* for PDB records, suppress NoOrgFound anywhere on record, NoMolInfo, NoProtRefFound, and ShortSeq
202
* Revision 6.188 2001/05/27 22:51:12 kans
203
* added ERR_SEQ_FEAT_PolyAsiteNotPoint
205
* Revision 6.187 2001/05/25 22:26:46 kans
206
* added ERR_SEQ_FEAT_DuplicateInterval, protein EST and other techniques not allowed, segmented protein SeqID is 2 letters + 6 digits
208
* Revision 6.186 2001/04/17 14:08:51 kans
209
* cds mrna range check is warning if cds exception flag set
211
* Revision 6.185 2001/04/16 17:16:33 kans
212
* drop duplicate cds error to warning if gps or nt
214
* Revision 6.184 2001/04/13 21:19:09 kans
215
* SuppressTrailingXMessage if translation ends in * or partial at carboxy end
217
* Revision 6.183 2001/04/13 16:21:22 kans
218
* CdTransCheck translates without removing trailing ambiguity residues - consistent with other current behavior
220
* Revision 6.182 2001/04/07 18:05:56 kans
221
* suppress cds product packaging problem if nt accession
223
* Revision 6.181 2001/04/05 20:17:16 kans
224
* added ERR_SEQ_FEAT_CDSproductPackagingProblem
226
* Revision 6.180 2001/03/26 18:58:17 kans
227
* added INTERVAL_OVERLAP choice for SeqMgrGetOverlappingFeature, if CDS completely within mRNA introns using this test, do not report validator error
229
* Revision 6.179 2001/03/02 01:35:33 kans
230
* trans splicing message turns off both mixed_strand and out_of_order errors, can be combined with other exception messages in string
232
* Revision 6.178 2001/02/28 22:45:11 kans
233
* ValidateSeqLoc handles SEQLOC_NULL so as not to miss mixed strands, ignores mixed strands on primer_bind, ignores interval order for publications
235
* Revision 6.177 2001/02/22 14:58:07 ostell
236
* moved duplicate feature from ERROR to WARN for misc_feat
238
* Revision 6.176 2001/02/14 20:58:18 kans
239
* check for ERR_SEQ_INST_BadSeqIdFormat in GenBank/EMBL/DDBJ records
241
* Revision 6.175 2001/02/13 23:32:24 kans
242
* trans splicing exception turns off mixed strand error
244
* Revision 6.174 2001/02/12 14:13:47 kans
245
* OverlappingGeneIsPseudo test for mRNA and CDS
247
* Revision 6.173 2001/02/09 22:21:39 kans
248
* duplicate feature with different comment or label is still severity error if GPSorNT - manually annotated records still use different strings to reduce severity, since these might really be intended to be separate features
250
* Revision 6.172 2001/02/09 16:11:16 kans
251
* check for GPSorNT to lower duplicate feature severity was accidentally only on CDS features, not all features
253
* Revision 6.171 2001/02/07 20:14:58 kans
254
* ValidateImpFeat GBQUAL_rpt_type test now deals with (DIRECT,TERMINAL) sets of values in parentheses
256
* Revision 6.170 2001/02/03 00:43:31 kans
257
* use SeqIdIn instead of SeqIdForSameBioseq to get appropriate phrap graphs
259
* Revision 6.169 2001/02/01 16:45:09 kans
260
* delta seq does not check for htgs tech if NT_ refseq
262
* Revision 6.168 2001/01/27 00:08:07 kans
263
* added ERR_SEQ_INST_TrailingX
265
* Revision 6.167 2001/01/25 15:44:14 kans
266
* handles multiple entries on a Seq-submit without a bioseq-set wrapper, suppresses some errors for patents
268
* Revision 6.166 2001/01/23 20:27:11 kans
269
* added ERR_SEQ_DESCR_MultipleChromosomes
271
* Revision 6.165 2001/01/16 20:53:37 kans
272
* SeqLocOrder not done for misc_recomb
274
* Revision 6.164 2001/01/09 20:54:01 kans
275
* DeltaOrFarSeg limits deltas to those without nuc-prot sets
277
* Revision 6.163 2001/01/09 00:25:23 kans
278
* DuplicateSegmentReferences now warning if not SEQLOC_WHOLE - unable to easily do more sophisticated test
280
* Revision 6.162 2001/01/03 17:10:43 kans
281
* if DeltaOrFarSeg, drop NoProtein from CDS message to SEV_WARNING
283
* Revision 6.161 2001/01/02 14:02:16 kans
284
* do not complain about SeqLocOrder for heterogen bonds
286
* Revision 6.160 2000/12/29 16:22:16 kans
287
* check for bad gene or mrna overlap is SEV_WARNING for NC_ records, which can be initially built on older records in ID that would no longer pass the current stricter validation
289
* Revision 6.159 2000/12/22 00:30:18 kans
290
* segmented bioseq now checked for bad seqloc order of features on components
292
* Revision 6.158 2000/12/21 18:02:25 kans
293
* exit CheckForCommonCDSProduct if cds == NULL
295
* Revision 6.157 2000/12/19 18:20:21 kans
296
* allele and mutation are obsolete
298
* Revision 6.156 2000/12/04 21:54:38 kans
299
* added NGorNT to suppress NoProtein error
301
* Revision 6.155 2000/11/30 16:43:20 kans
302
* added MrnaTransCheck
304
* Revision 6.154 2000/11/27 20:46:01 kans
305
* drop duplicate CDS error to warning if genomic product set, NT contig, or different frames
307
* Revision 6.153 2000/11/27 20:11:48 kans
308
* calls LockFarComponents and UnlockFarComponents instead of maintaining separate version
310
* Revision 6.152 2000/11/22 16:50:45 kans
311
* duplicate STS now SEV_WARNING
313
* Revision 6.151 2000/11/20 19:55:35 kans
314
* cyanelle is a plastid, as far as the flat file is concerned, so use genetic code 11
316
* Revision 6.150 2000/11/20 19:32:35 kans
317
* warn if nucleotide bioseq in nps within gps does not have mrna feature pointing to it - also added several classes of plastids to use genetic code 11
319
* Revision 6.149 2000/11/14 21:36:58 kans
320
* if biop->genome == GENOME_plastid, use genetic code 11
322
* Revision 6.148 2000/11/14 15:26:05 kans
323
* duplicate feature now SEV_WARNING for variation features - SNPs are not yet merged, and would prevent genome sequences from being loaded into ID
325
* Revision 6.147 2000/11/09 22:58:51 kans
326
* if genomic product set or NT_ contig, splice check severity is relaxed to SEV_WARNING - allows for occasional intron gap in model
328
* Revision 6.146 2000/11/06 21:15:56 kans
329
* CheckForCommonMRNAProduct checks for NULL mRNA before complaining if different than non-NULL sfp
331
* Revision 6.145 2000/11/06 17:39:50 kans
332
* added ERR_SEQ_FEAT_mRNAgeneRange
334
* Revision 6.144 2000/11/01 21:43:31 kans
335
* added DifferentDbxrefs check for FEATDEF_REGION (for same domain from different CDD data sources), relaxed severity for Regions
337
* Revision 6.143 2000/09/27 18:25:20 kans
338
* prelock components of far delta, just like components of far seg already did
340
* Revision 6.142 2000/09/25 00:08:21 kans
341
* mRNA and CDS features can have far RefSeq products if done by genome annotation, regular RefSeq nuc-prot sets still checked
343
* Revision 6.141 2000/09/24 22:22:42 kans
344
* show >350 KB message only if GenBank, EMBL, or DDBJ SeqID in record
346
* Revision 6.140 2000/09/24 00:07:28 kans
347
* if delta seq > 350 kb, but in genomic product set (genome annotation project, for now), do not post error message
349
* Revision 6.139 2000/09/21 18:22:18 kans
350
* ribosomal slippage allows translation check, does not allow splice check
352
* Revision 6.138 2000/09/19 14:54:12 kans
353
* if genomic product set, do not report multiple cds products (still needs work for contig after splitting to suppress far cds product warning) - and do not report missing pub if gps or refseq
355
* Revision 6.137 2000/09/01 23:42:10 kans
356
* validate genomic product set packaging message now includes mRNA->product Seq-loc
358
* Revision 6.136 2000/08/28 23:20:24 kans
359
* added ERR_SEQ_FEAT_MultipleMRNAproducts
361
* Revision 6.135 2000/08/28 23:04:27 kans
362
* added ERR_SEQ_PKG_GenomicProductPackagingProblem
364
* Revision 6.134 2000/08/04 12:52:32 kans
365
* change logic and message for ERR_SEQ_DESCR_BadOrganelle
367
* Revision 6.133 2000/08/02 22:31:46 kans
368
* changed ERR_SEQ_DESCR_BadLocation to ERR_SEQ_DESCR_BadOrganelle
370
* Revision 6.132 2000/08/02 22:27:33 kans
371
* added ERR_SEQ_DESCR_BadLocation
373
* Revision 6.131 2000/07/14 22:46:02 kans
374
* report position of first ACGT base with zero score and first N base with nonzero score
376
* Revision 6.130 2000/07/14 19:47:17 kans
377
* allow Phred Quality along with Phrap Quality
379
* Revision 6.129 2000/07/12 15:02:58 kans
380
* check score against 0 or 100 regardless of min or max, to catch bad bytes if the min or max value were also reported as bad
382
* Revision 6.128 2000/07/06 21:50:21 kans
383
* start lastloc at -1, set gcp->itemID for all messages
385
* Revision 6.127 2000/07/06 21:11:32 kans
386
* added ERR_SEQ_GRAPH_GraphOverlap
388
* Revision 6.126 2000/07/06 16:17:40 kans
389
* once again only counting seqlit with real data for numdsp - introduced bug when separating dsp and sgp counting
391
* Revision 6.125 2000/07/06 16:01:54 kans
392
* expanded phrap graph error messages
394
* Revision 6.124 2000/07/06 15:25:11 kans
395
* check for fa2htgs bug
397
* Revision 6.123 2000/07/05 17:02:12 kans
398
* added spp->gapIsZero, SeqPortSet_do_virtualEx, using ncbi4na with gap of 0 to distinguish quality scores under N versus quality scores under gap
400
* Revision 6.122 2000/07/03 21:22:47 kans
401
* changed some seqgraph error levels, preparing to figure out if nonzero score is below a gap, report as more severe error
403
* Revision 6.121 2000/07/03 17:11:43 kans
404
* gphlen should be either seqlen (seqlit sum) or bsplen, also check every value to be within reported range
406
* Revision 6.120 2000/07/03 16:37:28 kans
407
* multi seqlit deltas can have a single phrap seqgraph, downgrade some errors to warning (for now), may want to allow small positive score for N bases in future
409
* Revision 6.119 2000/06/26 12:58:15 kans
410
* look for Blast Type instead of Hist Seqalign to detect PowerBLAST alignment
412
* Revision 6.118 2000/06/21 21:57:02 kans
413
* fix to graph validation
415
* Revision 6.117 2000/06/21 18:01:39 kans
416
* check residues against quality values, report ACGT with 0 score and N with non-zero score
418
* Revision 6.116 2000/06/21 17:08:23 kans
419
* check each sgp->loc against running seqlit range
421
* Revision 6.115 2000/06/21 00:02:58 kans
422
* added ValidateGraphsOnBioseq, still need to look for runs of 0 that are not opposite runs of N
424
* Revision 6.114 2000/06/20 20:30:53 kans
425
* added ERR_SEQ_ALIGN_BlastAligns
427
* Revision 6.113 2000/06/13 15:14:08 kans
428
* change pre-locking to only lock remote genome components - otherwise got unlocking 0 lockcnt messages
430
* Revision 6.112 2000/06/12 14:55:19 kans
431
* added ERR_SEQ_FEAT_InvalidQualifierValue for /rpt_type and /rpt_unit
433
* Revision 6.111 2000/06/09 19:01:00 kans
434
* added ERR_SEQ_DESCR_BioSourceNeedsFocus
436
* Revision 6.110 2000/05/17 16:12:28 kans
437
* virion is no longer a legal feature
439
* Revision 6.109 2000/05/16 19:06:05 kans
440
* check for out-of-phase processed peptide now ignores partial ends
442
* Revision 6.108 2000/05/12 19:00:44 kans
443
* added ERR_SEQ_FEAT_PeptideFeatOutOfFrame
445
* Revision 6.107 2000/05/12 15:46:36 kans
446
* fixed typo-induced bug in CheckForCommonCDSProduct
448
* Revision 6.106 2000/05/11 16:14:45 kans
449
* MultipleCDSproduct check also aborts if sfp->product is NULL
451
* Revision 6.105 2000/05/11 16:12:13 kans
452
* Do not report ERR_SEQ_FEAT_MultipleCDSproducts if pseudo cds or contained by pseudo gene
454
* Revision 6.104 2000/05/10 18:09:29 kans
455
* added ERR_SEQ_FEAT_FocusOnBioSourceFeature
457
* Revision 6.103 2000/05/04 14:36:58 kans
458
* cleared up warnings found by gcc and clcc, and changed implementation of locking and unlocking remote genome segments
460
* Revision 6.102 2000/05/02 19:36:46 kans
461
* LockOrUnockAllSegments to speed up validation of remote genomes
463
* Revision 6.101 2000/05/02 19:12:06 kans
464
* added ERR_SEQ_FEAT_MultipleCDSproducts
466
* Revision 6.100 2000/03/14 13:33:33 kans
467
* NCBISubValidate sets indexing, adds AppProperty to shut off specific messages to be decided later
469
* Revision 6.99 2000/02/18 21:25:34 kans
470
* added ERR_SEQ_DESCR_SerialInComment and ERR_SEQ_FEAT_SerialInComment
472
* Revision 6.98 2000/02/14 15:00:19 kans
473
* added vsp->farIDsInAlignments for use by alignment validator
475
* Revision 6.97 2000/02/08 19:10:42 kans
476
* delta seq okay for htgs_3
478
* Revision 6.96 2000/01/26 23:14:46 kans
479
* added ERR_SEQ_INST_DuplicateSegmentReferences
481
* Revision 6.95 2000/01/14 21:14:02 kans
482
* added ERR_SEQ_FEAT_OverlappingPeptideFeat
484
* Revision 6.94 2000/01/11 17:01:46 kans
485
* changed Burma to Myanmar
487
* Revision 6.93 2000/01/03 20:18:02 kans
488
* suspicious CDS location message raised to SEV_ERROR if RefSeq NM_ accession
490
* Revision 6.92 1999/12/24 20:05:17 kans
491
* added ERR_SEQ_INST_IdOnMultipleBioseqs - for once scoping made an important test trivially easy
493
* Revision 6.91 1999/12/24 01:21:05 kans
494
* added validateAlignments flag controlling call to ValidateSeqAlignWithinValidator
496
* Revision 6.90 1999/12/23 19:07:24 kans
497
* for CDS, added CheckForBadGeneOverlap and CheckForBadMRNAOverlap
499
* Revision 6.89 1999/12/17 13:07:50 sirotkin
502
* Revision 6.88 1999/12/16 21:57:33 kans
503
* added test for ERR_SEQ_FEAT_BothStrands
505
* Revision 6.87 1999/12/08 02:40:41 kans
506
* added ERR_SEQ_INST_SeqIdNameHasSpace
508
* Revision 6.86 1999/12/06 15:23:34 kans
509
* duplicate features in separate unnamed feature tables were not being detected, now are with slightly different error message
511
* Revision 6.85 1999/11/22 21:56:11 kans
512
* removed Galapagos Islands (part of Ecuador) and removed space in Cote d*Ivoire
514
* Revision 6.84 1999/11/12 16:51:34 kans
515
* cDNA-derived STS will not trigger ConflictingBiomolTech error
517
* Revision 6.83 1999/11/09 19:16:28 kans
518
* added ERR_SEQ_INST_ConflictingBiomolTech
520
* Revision 6.82 1999/11/04 00:14:09 kans
521
* added ERR_SEQ_DESCR_MissingLineage
523
* Revision 6.81 1999/10/28 20:27:29 kans
524
* added ERR_SEQ_INST_MolNuclAcid
526
* Revision 6.80 1999/10/01 20:09:50 kans
527
* fix to feature packaging check that handles segmented bioseqs
529
* Revision 6.79 1999/10/01 19:47:35 kans
530
* support for ERR_SEQ_PKG_FeaturePackagingProblem
532
* Revision 6.78 1999/10/01 14:42:15 kans
533
* changed SEV_FATAL to SEV_REJECT
535
* Revision 6.77 1999/09/27 21:04:08 kans
536
* report ERR_SEQ_DESCR_NoOrgFound if empty taxname and common name
538
* Revision 6.76 1999/09/06 21:36:03 kans
539
* ValidateSeqEntry sets scope
541
* Revision 6.75 1999/08/24 17:44:01 kans
542
* removed Wagad from country list
544
* Revision 6.74 1999/08/24 15:22:17 kans
545
* added Galapagos Islands and Wagad to the country list
547
* Revision 6.73 1999/08/18 20:24:49 kans
548
* self-recursive call of CheckForInconsistentBiosources was not using tmp, but original sep, resulting in stack overflow in complex records
550
* Revision 6.72 1999/08/17 19:46:12 kans
551
* ValidatePopSet posts ERR_SEQ_DESCR_InconsistentBioSources
553
* Revision 6.71 1999/08/03 00:13:02 kans
554
* vsp->suppressContext now causes simplified locations to be written, seqidworst fastashort no locus
556
* Revision 6.70 1999/07/29 15:41:48 kans
557
* changed Serbia and Montenegro to Yugoslavia
559
* Revision 6.69 1999/07/22 22:04:35 kans
560
* added suppressContext flag
562
* Revision 6.68 1999/07/15 22:37:32 kans
563
* ValidateBioSource called once per biosource, not once per bioseq
565
* Revision 6.67 1999/07/15 20:39:22 kans
566
* suppress no pub warning if seq-submit, which has a cit-sub
568
* Revision 6.66 1999/06/24 19:33:24 kans
569
* corrected country list
571
* Revision 6.65 1999/06/22 17:15:49 kans
572
* added ERR_SEQ_DESCR_NoTaxonID
574
* Revision 6.64 1999/06/18 20:57:46 kans
575
* using collab approved country list
577
* Revision 6.63 1999/06/18 20:21:04 kans
578
* implemented ERR_SEQ_DESCR_BadCountryCode, indexed descr callback sets proper itemtype, itemID for click responsiveness
580
* Revision 6.62 1999/06/15 20:04:03 kans
581
* no org or pub anywhere on record now reports context of first bioseq for batch processing
583
* Revision 6.61 1999/06/15 19:45:42 kans
584
* changed SequenceTooLong to SequenceExceeds350kbp
586
* Revision 6.60 1999/06/14 16:14:20 kans
587
* added ERR_SEQ_FEAT_TrnaCodonWrong check
589
* Revision 6.59 1999/06/11 18:31:16 kans
590
* added ERR_SEQ_FEAT_TranslExceptPhase
592
* Revision 6.58 1999/06/09 21:34:29 kans
593
* stop in protein message gives gene and protein name for reading report later
595
* Revision 6.57 1999/05/07 15:31:20 kans
596
* added ERR_SEQ_FEAT_UnnecessaryGeneXref
598
* Revision 6.56 1999/05/05 19:11:41 kans
599
* for no pubs or biosource anywhere, needed to set vsp->gcp for ValidErr/ErrPostItem
601
* Revision 6.55 1999/05/05 13:03:14 kans
602
* no org or pub anywhere after clearing error counts
604
* Revision 6.54 1999/05/03 20:06:35 kans
605
* if no pubs or no biosource, report only once, not once per bioseq
607
* Revision 6.53 1999/03/31 20:57:48 kans
608
* htgs phase 1 and 2 messages also check for phase 0
610
* Revision 6.52 1999/03/04 19:55:49 kans
611
* inconsistent create_date messages now sev_warning
613
* Revision 6.51 1999/02/25 21:53:58 kans
614
* relax duplicate feature severity to warning if label or comment are different, or if FEATDEF_PUB
616
* Revision 6.50 1999/02/16 22:19:02 kans
617
* fixed interval comparison in duplicate feature detection
619
* Revision 6.49 1999/02/02 16:39:10 kans
620
* added ERR_SEQ_FEAT_DuplicateFeat
622
* Revision 6.48 1999/01/05 23:20:50 kans
623
* SpliceCheckEx does not check exon junction if partial
625
* Revision 6.47 1998/12/14 22:27:28 kans
626
* CdTransCheck now deals with termination by polyA
628
* Revision 6.46 1998/12/07 20:00:56 kans
629
* meant to set bcp = NULL, not bsp = NULL, crashed with segmented protein
631
* Revision 6.45 1998/10/26 20:57:45 kans
632
* check gene and prot db fields for IllegalDbXref
634
* Revision 6.44 1998/10/23 15:25:57 kans
635
* added FarLocation warning
637
* Revision 6.43 1998/10/22 16:05:57 kans
638
* removed labeltype parameter from SeqMgrIndexFeatures, changed index parameter/field to Uint2
640
* Revision 6.42 1998/10/21 14:32:11 kans
641
* on invalid feature for bioseq, restore itemid and itemtype to avoid weird(er) click association - need to rewrite valid with new index functions, which will give proper items
643
* Revision 6.41 1998/10/20 20:18:10 kans
644
* mRNA feature is invalid on an mRNA (cDNA) bioseq
646
* Revision 6.40 1998/10/20 18:12:54 kans
647
* invalid for type (e.g., intron on mRNA) now coerces gcp to have feature itemtype, itemID for selection
649
* Revision 6.39 1998/10/15 17:29:18 kans
650
* import feature of mat_, sig_, and transit_peptide now flagged as invalid for type
652
* Revision 6.38 1998/09/22 13:12:01 kans
653
* locationFilter parameter to explore features function
655
* Revision 6.37 1998/09/21 17:29:35 kans
656
* precursor rna can have intron feature
658
* Revision 6.36 1998/09/17 16:38:14 kans
659
* added ERR_SEQ_DESCR_NoMolInfoFound
661
* Revision 6.35 1998/09/01 19:25:27 kans
662
* context parameter in get best protein, get cds/rna given product
664
* Revision 6.34 1998/08/28 22:25:56 kans
665
* keep track of last biomol, tech, completeness in multiple molinfo descriptors
667
* Revision 6.33 1998/08/26 21:07:48 kans
668
* added check for ERR_SEQ_INST_ConflictingIdsOnBioseq
670
* Revision 6.32 1998/08/10 16:05:15 kans
671
* copy some old descriptor checks to Molinfo
673
* Revision 6.31 1998/07/23 14:25:38 kans
674
* intron and CAAT_signal are illegal on mRNA - first checks molinfo, then resorts to Seq_mol_rna as mRNA criterion
676
* Revision 6.30 1998/07/16 16:06:56 kans
677
* use ObjMgrGetEntityIDForChoice instead of ObjMgrGetEntityIDForPointer for SeqEntryPtr
679
* Revision 6.29 1998/07/14 18:10:33 kans
680
* invalid feature for nucleotide now says nucleotide, not protein
682
* Revision 6.28 1998/07/06 18:01:52 kans
683
* added LIBCALLBACK to SeqMgrExplore function callbacks
685
* Revision 6.27 1998/07/02 17:53:43 kans
686
* useSeqMgrIndexes field added to ValidStructPtr, validator can use either old (nested gathers) or new (SeqMgr indexing) method
688
* Revision 6.26 1998/06/24 18:49:15 kans
689
* added missing BioseqContextFree
691
* Revision 6.25 1998/06/22 20:13:21 kans
692
* gencode mismatch reports biosource and cds codes
694
* Revision 6.24 1998/06/12 20:05:53 kans
695
* fixed unix compiler warnings
697
* Revision 6.23 1998/04/16 15:12:15 kans
698
* slight fix to frame > 1 and not at splice site test
700
* Revision 6.22 1998/04/15 21:59:25 kans
701
* added ERR_SEQ_FEAT_IllegalDbXref
703
* Revision 6.21 1998/04/14 20:57:36 kans
704
* check for mixed bioseqs in segset, parts set, and for sets within parts set
706
* Revision 6.20 1998/04/14 19:11:25 kans
707
* improvements to PartialAtSpliceSite and frame > 1 check
709
* Revision 6.19 1998/04/14 18:55:56 kans
710
* cds frame > 1 but not 5prime partial now also checks for PartialAtSpliceSite
712
* Revision 6.18 1998/04/13 18:10:38 kans
713
* warn if CDS frame > 1 but not 5prime partial
715
* Revision 6.17 1998/04/02 15:45:51 kans
716
* MolInfoPtr had not been obtained for Seq_repr_raw for HTGS test on long sequences
718
* Revision 6.16 1998/03/30 17:35:22 kans
719
* check raw bioseq for htgs flags if greater than 350kb
721
* Revision 6.15 1998/03/18 20:41:50 kans
722
* SpliceCheck only on mRNA (not all RNAs) and CDS
724
* Revision 6.14 1998/03/09 17:48:46 kans
725
* OBJ_SEQSUB_CIT now satisfies need for publication
727
* Revision 6.13 1998/02/19 17:21:15 shavirin
728
* Added check for NULL in ValidErr() function
730
* Revision 6.12 1998/02/18 20:34:55 kans
731
* added ERR_GENERIC_MissingPubInfo
733
* Revision 6.11 1998/02/09 20:35:35 kans
734
* calls ERR_SEQ_FEAT_PseudoCdsHasProduct
736
* Revision 6.10 1998/01/30 21:05:54 kans
737
* check for ERR_SEQ_DESCR_MultipleBioSources
739
* Revision 6.9 1998/01/30 20:29:48 kans
740
* added PartialAtSpliceSite check
742
* Revision 6.8 1998/01/13 15:34:50 kans
743
* gbqual_citation satisfied by sfp->cit
745
* Revision 6.7 1998/01/10 00:05:36 kans
746
* added ValidateImpFeat
748
* Revision 6.6 1998/01/06 03:07:57 ostell
749
* in comparison of cdregion genetic code to biosource genetic code, set defaults
750
* to 0 instead of -1 to fix default behavior on building submission.
752
* Revision 6.5 1997/12/18 21:51:43 kans
753
* warn on cds/biosource genetic code conflict, rna type 0
755
* Revision 6.4 1997/11/14 17:10:13 kans
756
* added checks for bioseq length > 350K (based on Cavanaugh request)
758
* Revision 6.3 1997/08/27 20:11:02 kans
759
* order gene should in fact have partial flag set
761
* Revision 6.2 1997/08/27 19:48:32 kans
762
* print feature product seqloc
764
* Revision 6.1 1997/08/27 14:15:51 kans
765
* gene of order should not cause partial error
767
* Revision 6.0 1997/08/25 18:08:25 madden
768
* Revision changed to 6.0
770
* Revision 5.24 1997/08/13 18:52:51 kans
771
* new packaging errors set to SEV_REJECT
773
* Revision 5.23 1997/08/13 15:36:53 kans
774
* added NucProtNotSegSet and SegSetNotParts (Bazhin)
776
* Revision 5.22 1997/07/07 21:28:11 kans
777
* existing bad start codon check was being bypassed, so new one was added
779
* Revision 5.21 1997/07/07 15:00:28 kans
780
* signal or transit peptide do not need names
782
* Revision 5.20 1997/07/02 19:44:09 kans
783
* added check for et al, changed symbol names for empty gene and prot feature
785
* Revision 5.19 1997/06/24 16:39:12 kans
786
* fixed Digital Unix compiler complaint
788
* Revision 5.18 1997/06/19 18:39:51 vakatov
789
* [WIN32,MSVC++] Adopted for the "NCBIOBJ.LIB" DLL'ization
791
* Revision 5.17 1997/05/29 17:25:16 kans
792
* splice check and trans check not done if excpt
794
* Revision 5.16 1997/05/28 19:10:32 kans
795
* added check for empty protref
797
* Revision 5.15 1997/05/20 21:11:38 kans
798
* warnings for delta seq not htgs1 or 2, cds orf with product, gene with no fields, cds exception gbqual without excpt
800
* Revision 5.14 1997/04/24 20:39:20 kans
801
* invalid splice sites are warning level unless app property forces to error
803
* Revision 5.13 1997/03/17 21:43:28 kans
804
* added closing bracket to bioseq length indication
806
* Revision 5.12 1997/02/20 13:50:33 ostell
807
* added length check on segmented sequence back
809
* Revision 5.11 1996/11/22 17:23:20 kans
810
* splice errors on exon imp-feats are now severity warning, since there is
811
* no way of knowing which are the unspliced ends of the first and last exon
813
* Revision 5.10 1996/11/04 16:29:55 kans
814
* app property allows splice check for exon features, and rare GC splice
815
* donor has separate warning message
817
* Revision 5.9 1996/10/16 20:31:16 ostell
818
* added length check for delta sequences
819
* added CdTrnsCheck for exception and pseudo
821
* Revision 5.8 1996/08/21 14:08:26 ostell
822
* removed kludge for big sequences
824
* Revision 5.7 1996/08/19 02:45:49 ostell
825
* added check in BioseqContext for more than 30n bioseqs to control
828
* Revision 5.6 1996/08/06 19:56:03 kans
829
* for SEQLOC_WHOLE, must call SeqIdFindBest on bsp->id
831
* Revision 5.5 1996/08/01 18:58:00 kans
832
* on pseudo cds, suppress CdTransCheck, SpliceCheck
834
* Revision 5.4 1996/06/19 00:35:32 ostell
835
* added check for ragged end of CdRegion
837
* Revision 5.1 1996/06/16 04:16:05 ostell
838
* added support for delta seq
840
* Revision 5.0 1996/05/28 13:23:23 ostell
841
* Set to revision 5.0
843
* Revision 4.19 1996/05/03 18:59:13 kans
844
* up to 5 stops still allows mismatch report, which includes nuc position
846
* Revision 4.18 1996/04/01 16:31:47 ostell
847
* fix to preserve error message count between invocations
849
* Revision 4.17 1996/03/15 20:01:14 ostell
850
* in SpliceCheck, give accession of sequence with bad junction
852
* Revision 4.16 1996/03/08 14:48:02 kans
853
* fixed typos in ValidateSeqEntry scope memset, use as parameter
855
* Revision 4.15 1996/03/06 20:43:59 ostell
856
* added scoping to validation
858
* Revision 4.14 1996/03/05 19:54:29 kans
859
* added biosource to two switch statements
861
* Revision 4.13 1996/03/03 16:59:34 ostell
862
* added SpellCheckPub() to look at more Pub types
864
* Revision 4.12 1996/03/02 03:41:43 ostell
865
* fix to correctly identify splice junctions on minus strand
867
* Revision 4.11 1996/02/26 22:06:37 ostell
868
* finished gatherized version of spell check on descriptors
870
* Revision 4.10 1996/02/19 19:58:05 ostell
871
* added support for Code-break and tRNA.anticodon
873
* Revision 4.9 1996/01/23 23:10:10 kans
874
* implemented onlyspell and justwarnonspell code
876
* Revision 4.8 1995/12/07 01:55:37 ostell
877
* fix to check for NULL on bioseqset parent
879
* Revision 4.7 1995/12/07 01:38:56 ostell
880
* added Splice error flag
882
* Revision 4.6 1995/12/06 22:11:23 ostell
883
* changed wording of SpliceCheck message
885
* Revision 4.5 1995/12/06 06:08:57 ostell
886
* lowered warning levels on partial messages
887
* added SpliceCheck()
889
* Revision 4.4 1995/08/16 18:21:52 epstein
890
* correct declaration of static functions to be consistent with function prototypes
892
* Revision 4.3 1995/08/04 18:41:02 madden
893
* removed "|SpellErr|" SpellCallBack.
895
* Revision 4.2 1995/08/03 12:45:56 madden
896
* Set ValNodePtr in SpellCheckBioseqDescr; added "SpellErr" to ErrPosting.
898
* Revision 4.1 1995/08/02 22:21:50 madden
899
* gatherized the spell functions.
901
* Revision 4.0 1995/07/26 13:49:01 ostell
902
* force revision to 4.0
904
* Revision 1.14 1995/06/03 13:45:47 ostell
905
* changes made in valid to use gather functions and ErrPostItem instead
906
* of previous custom functions
908
* Revision 1.13 1995/05/15 21:46:05 ostell
913
* ==========================================================================
916
/* Module name string for this file; THIS_MODULE below expands to it. */
static char *this_module = "valid";
918
#define THIS_MODULE this_module
920
/* Source file name as supplied by __FILE__; THIS_FILE below expands to it. */
static char *this_file = __FILE__;
922
#define THIS_FILE this_file
927
#include <validerr.h>
928
#include <sqnutils.h>
937
/*****************************************************************************
939
* NOTE: look at all the ValidErr calls with severity=0. Some should be
940
* bumped up later. Look also for string "PARSER"
942
*****************************************************************************/
952
/* File-scope ValidStruct pointer; ValidateSeqEntry assigns its vsp here. */
static ValidStructPtr globalvsp; /* for spell checker */
954
/* Forward declarations of the validation routines referenced below. */
/* printf-style error reporter: severity, two error codes, format + args */
NLM_EXTERN void CDECL ValidErr VPROTO ((ValidStructPtr vsp, int severity, int code1, int code2, const char *fmt, ...));
955
/* gather-context driven validators (called from Valid1GatherProc) */
static void ValidateBioseqInst (GatherContextPtr gcp);
956
static void ValidateBioseqContext (GatherContextPtr gcp);
957
static void ValidateBioseqSet (GatherContextPtr gcp);
958
static void ValidateGraphsOnBioseq (GatherContextPtr gcp);
959
static void SpellCheckSeqDescr (GatherContextPtr gcp);
960
NLM_EXTERN void CdTransCheck (ValidStructPtr vsp, SeqFeatPtr sfp);
961
NLM_EXTERN void MrnaTransCheck (ValidStructPtr vsp, SeqFeatPtr sfp);
962
NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp);
963
NLM_EXTERN void ValidateSeqLoc (ValidStructPtr vsp, SeqLocPtr slp, CharPtr prefix);
964
NLM_EXTERN Boolean PatchBadSequence (BioseqPtr bsp);
965
NLM_EXTERN CharPtr FindIDForEntry (SeqEntryPtr sep, CharPtr buf);
966
NLM_EXTERN void SpellCheckSeqFeat (GatherContextPtr gcp);
967
NLM_EXTERN void SpellCheckString (ValidStructPtr vsp, CharPtr str);
968
NLM_EXTERN void SpliceCheck (ValidStructPtr vsp, SeqFeatPtr sfp);
969
static void SpliceCheckEx (ValidStructPtr vsp, SeqFeatPtr sfp, Boolean checkAll);
970
/* helpers taking both the ValidStruct and the current gather context */
static void ValidateBioSource (ValidStructPtr vsp, GatherContextPtr gcp, BioSourcePtr biop);
971
static void ValidatePubdesc (ValidStructPtr vsp, GatherContextPtr gcp, PubdescPtr pdp);
972
static void ValidateSfpCit (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPtr sfp);
974
/* alignment validator */
975
NLM_EXTERN Boolean ValidateSeqAlignWithinValidator (ValidStructPtr vsp, SeqEntryPtr sep);
977
/*****************************************************************************
979
* Perform Validation Checks on a SeqEntry
981
*****************************************************************************/
983
/*
 * ValidStructClear (vsp)
 *   Zero the whole ValidStruct with MemSet while keeping the caller's
 *   configuration: errbuf, cutoff, patch_seq, the spell-check hooks and
 *   the option flags are copied out first and written back afterwards.
 *   Every other field ends up 0.
 */
NLM_EXTERN void ValidStructClear (ValidStructPtr vsp)
984
{ /* 0 out a ValidStruct */
988
SpellCheckFunc spellfunc;
989
SpellCallBackFunc spellcallback;
991
Boolean justwarnonspell;
992
Boolean useSeqMgrIndexes;
993
Boolean suppressContext;
994
Boolean validateAlignments;
995
Boolean farIDsInAlignments;
1000
/* save the settings that must survive the MemSet */
errbuf = vsp->errbuf;
1001
cutoff = vsp->cutoff;
1002
patch_seq = vsp->patch_seq;
1003
spellfunc = vsp->spellfunc;
1004
spellcallback = vsp->spellcallback;
1005
onlyspell = vsp->onlyspell;
1006
justwarnonspell = vsp->justwarnonspell;
1007
useSeqMgrIndexes = vsp->useSeqMgrIndexes;
1008
suppressContext = vsp->suppressContext;
1009
validateAlignments = vsp->validateAlignments;
1010
farIDsInAlignments = vsp->farIDsInAlignments;
1011
/* wipe the entire structure */
MemSet ((VoidPtr) vsp, 0, sizeof (ValidStruct));
1012
/* restore the saved settings */
vsp->errbuf = errbuf;
1013
vsp->cutoff = cutoff;
1014
vsp->patch_seq = patch_seq;
1015
vsp->spellfunc = spellfunc;
1016
vsp->spellcallback = spellcallback;
1017
vsp->onlyspell = onlyspell;
1018
vsp->justwarnonspell = justwarnonspell;
1019
vsp->useSeqMgrIndexes = useSeqMgrIndexes;
1020
vsp->suppressContext = suppressContext;
1021
vsp->validateAlignments = validateAlignments;
1022
vsp->farIDsInAlignments = farIDsInAlignments;
1026
/*
 * ValidStructNew ()
 *   Allocate storage for one ValidStruct with MemNew.
 */
NLM_EXTERN ValidStructPtr ValidStructNew (void)
1030
vsp = (ValidStructPtr) MemNew (sizeof (ValidStruct));
1034
/*
 * ValidStructFree (vsp)
 *   Release the error buffer owned by vsp, then vsp itself.
 *   Returns whatever MemFree (vsp) returns, cast to ValidStructPtr.
 */
NLM_EXTERN ValidStructPtr ValidStructFree (ValidStructPtr vsp)
1039
/* errbuf is allocated on demand inside ValidErr */
MemFree (vsp->errbuf);
1040
return (ValidStructPtr) MemFree (vsp);
1043
/*****************************************************************************
1047
*****************************************************************************/
1049
/*
 * ChangeSeqIdToBestID (sip)
 *   Rewrite the SeqId node sip in place.  The Bioseq is looked up with
 *   BioseqFindCore (sip), the id selected by SeqIdFindWorst (bsp->id) is
 *   duplicated, the old payload of sip is released with the free routine
 *   matching sip->choice, and the duplicate's choice and data pointer are
 *   then moved into sip.
 *   NOTE(review): the name says "Best" but the id is chosen with
 *   SeqIdFindWorst.
 */
static void ChangeSeqIdToBestID (SeqIdPtr sip)
1057
bsp = BioseqFindCore (sip);
1060
id = SeqIdDup (SeqIdFindWorst (bsp->id));
1063
/* now remove SeqId contents to reuse SeqId valnode */
1064
pnt = sip->data.ptrvalue;
1065
/* pick the free routine that matches the payload type of this choice */
switch (sip->choice) {
1066
case SEQID_LOCAL: /* local */
1067
ObjectIdFree ((ObjectIdPtr) pnt);
1069
case SEQID_GIBBSQ: /* gibbseq */
1070
case SEQID_GIBBMT: /* gibbmt */
1072
case SEQID_GIIM: /* giimid */
1073
GiimFree ((GiimPtr) pnt);
1075
case SEQID_GENBANK: /* genbank */
1076
case SEQID_EMBL: /* embl */
1077
case SEQID_PIR: /* pir */
1078
case SEQID_SWISSPROT: /* swissprot */
1079
case SEQID_OTHER: /* other */
1085
TextSeqIdFree ((TextSeqIdPtr) pnt);
1087
case SEQID_PATENT: /* patent seq id */
1088
PatentSeqIdFree ((PatentSeqIdPtr) pnt);
1090
case SEQID_GENERAL: /* general */
1091
DbtagFree ((DbtagPtr) pnt);
1093
case SEQID_GI: /* gi */
1096
PDBSeqIdFree ((PDBSeqIdPtr) pnt);
1099
/* take over the duplicated id's choice and payload */
sip->choice = id->choice;
1100
sip->data.ptrvalue = id->data.ptrvalue;
1101
SeqIdStripLocus (sip);
1104
/*
 * ChangeSeqLocToBestID (slp)
 *   Walk the SeqLoc chain starting at slp and, depending on slp->choice,
 *   pull the SeqId out of the location payload (whole id, interval,
 *   point, packed point, bond ends) and pass it to ChangeSeqIdToBestID.
 *   Locations that contain nested SeqLocs are handled by recursion.
 */
static void ChangeSeqLocToBestID (SeqLocPtr slp)
1113
while (slp != NULL) {
1114
switch (slp->choice) {
1119
/* payload is the SeqId itself */
sip = (SeqIdPtr) slp->data.ptrvalue;
1120
ChangeSeqIdToBestID (sip);
1123
/* payload is a SeqInt */
sinp = (SeqIntPtr) slp->data.ptrvalue;
1126
ChangeSeqIdToBestID (sip);
1130
/* payload is a SeqPnt */
spp = (SeqPntPtr) slp->data.ptrvalue;
1133
ChangeSeqIdToBestID (sip);
1136
case SEQLOC_PACKED_PNT:
1137
psp = (PackSeqPntPtr) slp->data.ptrvalue;
1140
ChangeSeqIdToBestID (sip);
1143
case SEQLOC_PACKED_INT:
1146
/* payload is a chain of nested SeqLocs: recurse into each */
loc = (SeqLocPtr) slp->data.ptrvalue;
1147
while (loc != NULL) {
1148
ChangeSeqLocToBestID (loc);
1153
/* payload is a SeqBond: handle point a, then point b */
sbp = (SeqBondPtr) slp->data.ptrvalue;
1155
spp = (SeqPntPtr) sbp->a;
1158
ChangeSeqIdToBestID (sip);
1160
spp = (SeqPntPtr) sbp->b;
1163
ChangeSeqIdToBestID (sip);
1176
/*
 * WorstBioseqLabel (bsp, buffer, buflen, content)
 *   Write a label for bsp into buffer, limited by buflen.
 *   - Unless content is OM_LABEL_TYPE, the id chosen by SeqIdFindWorst
 *     (duplicated and passed through SeqIdStripLocus) is printed in
 *     PRINTID_FASTA_SHORT form; for OM_LABEL_CONTENT that text alone is
 *     copied and the LabelCopy result is returned.
 *   - The type part is assembled from the enum names of bsp->repr and
 *     bsp->mol plus " len= <length>".
 *   Returns len - buflen on the remaining paths.
 */
static Int2 WorstBioseqLabel (BioseqPtr bsp, CharPtr buffer, Int2 buflen, Uint1 content)
1183
AsnTypePtr ratp, matp;
1185
/* guard on a missing Bioseq or an empty output buffer */
if ((bsp == NULL) || (buflen < 1))
1191
if (content != OM_LABEL_TYPE) {
1192
sip = SeqIdStripLocus (SeqIdDup (SeqIdFindWorst (bsp->id)));
1193
SeqIdWrite (sip, label, PRINTID_FASTA_SHORT, 39);
1195
if (content == OM_LABEL_CONTENT)
1196
return LabelCopy (buffer, label, buflen);
1198
/* id text is followed by ": " when more label parts are wanted */
diff = LabelCopyExtra (buffer, label, buflen, NULL, ": ");
1203
/* look up the ASN.1 enum types used to print repr and mol by name */
amp = AsnAllModPtr ();
1204
ratp = AsnTypeFind (amp, "Seq-inst.repr");
1205
matp = AsnTypeFind (amp, "Seq-inst.mol");
1209
tmp = StringMove (tmp, AsnEnumTypeStr (ratp, (Int2) (bsp->repr)));
1210
tmp = StringMove (tmp, ", ");
1211
tmp = StringMove (tmp, AsnEnumTypeStr (matp, (Int2) (bsp->mol)));
1212
sprintf (tmp, " len= %ld", (long) (bsp->length));
1213
diff = LabelCopy (buffer, label, buflen);
1217
if (content != OM_LABEL_SUMMARY)
1218
return (len - buflen);
1220
return (len - buflen); /* SUMMARY not done yet */
1224
/*
 * ValidErr (vsp, severity, code1, code2, fmt, ...)
 *   printf-style error reporter for the validator.  The formatted message
 *   is followed by context text (FEATURE / DESCRIPTOR / BIOSEQ /
 *   BIOSEQ-SET labels, feature location and product), the result is
 *   posted with ErrPostItem, and the buffer is reset to an empty string.
 *   vsp->errors[severity] is incremented for every reported message.
 *   When vsp->suppressContext is set, locations are copied and their ids
 *   rewritten with ChangeSeqLocToBestID before printing, and the Bioseq
 *   label comes from WorstBioseqLabel.
 *   NOTE(review): two signatures appear back to back (old-style va_alist
 *   and prototype form); presumably a preprocessor conditional selects
 *   one of them -- confirm.
 */
NLM_EXTERN void CDECL ValidErr (vsp, severity, code1, code2, fmt, va_alist)
1232
NLM_EXTERN void CDECL ValidErr (ValidStructPtr vsp, int severity, int code1, int code2, const char *fmt, ...)
1236
GatherContextPtr gcp;
1241
SeqLocPtr loc = NULL;
1243
/* guard on a missing vsp or a severity below the configured cutoff */
if (vsp == NULL || severity < vsp->cutoff)
1246
/* the message buffer is allocated on first use */
if (vsp->errbuf == NULL) {
1247
vsp->errbuf = MemNew (1024);
1248
if (vsp->errbuf == NULL)
1253
/* NOTE(review): severity indexes errors[] without a visible range check -- confirm callers */
vsp->errors[severity]++;
1258
va_start (args, fmt);
1263
/* NOTE(review): vsprintf is unbounded; presumably tmp points into the 1024-byte errbuf -- confirm messages fit */
vsprintf (tmp, fmt, args);
1264
while (*tmp != '\0') {
1271
/* feature context: label, location, owning Bioseq, product */
if (vsp->sfp != NULL) {
1272
diff = LabelCopy (tmp, " FEATURE: ", buflen);
1276
diff = FeatDefLabel (vsp->sfp, tmp, buflen, OM_LABEL_BOTH);
1280
if (vsp->suppressContext) {
1281
/* work on a copy so the feature's own location is not modified */
loc = AsnIoMemCopy (vsp->sfp->location, (AsnReadFunc) SeqLocAsnRead, (AsnWriteFunc) SeqLocAsnWrite);
1282
ChangeSeqLocToBestID (loc);
1283
ctmp = SeqLocPrint (loc);
1286
ctmp = SeqLocPrint (vsp->sfp->location);
1289
diff = LabelCopyExtra (tmp, ctmp, buflen, " [", "]");
1295
/* the Bioseq label of the feature location is only added with full context */
if (!vsp->suppressContext) {
1296
sip = SeqLocId (vsp->sfp->location);
1298
bsp = BioseqFind (sip);
1300
diff = LabelCopy (tmp, " [", buflen);
1304
diff = BioseqLabel (bsp, tmp, buflen, OM_LABEL_BOTH);
1308
diff = LabelCopy (tmp, "]", buflen);
1314
if (vsp->sfp->product != NULL) {
1315
if (vsp->suppressContext) {
1316
loc = AsnIoMemCopy (vsp->sfp->product, (AsnReadFunc) SeqLocAsnRead, (AsnWriteFunc) SeqLocAsnWrite);
1317
ChangeSeqLocToBestID (loc);
1318
ctmp = SeqLocPrint (loc);
1321
ctmp = SeqLocPrint (vsp->sfp->product);
1324
diff = LabelCopyExtra (tmp, ctmp, buflen, " -> [", "]");
1330
/* descriptor context */
} else if (vsp->descr != NULL) {
1331
diff = LabelCopy (tmp, " DESCRIPTOR: ", buflen);
1335
diff = SeqDescLabel (vsp->descr, tmp, buflen, OM_LABEL_BOTH);
1341
if (vsp->suppressContext)
1345
if (vsp->sfp == NULL) { /* sfp adds its own context */
1346
if (vsp->bsp != NULL) {
1347
diff = LabelCopy (tmp, " BIOSEQ: ", buflen);
1351
if (vsp->suppressContext) {
1352
diff = WorstBioseqLabel (vsp->bsp, tmp, buflen, OM_LABEL_CONTENT);
1354
diff = BioseqLabel (vsp->bsp, tmp, buflen, OM_LABEL_BOTH);
1358
} else if (vsp->bssp != NULL) {
1359
diff = LabelCopy (tmp, " BIOSEQ-SET: ", buflen);
1363
if (vsp->suppressContext) {
1364
diff = BioseqSetLabel (vsp->bssp, tmp, buflen, OM_LABEL_CONTENT);
1366
diff = BioseqSetLabel (vsp->bssp, tmp, buflen, OM_LABEL_BOTH);
1373
/* post the assembled text, then empty the buffer for the next message */
ErrPostItem ((ErrSev) (severity), code1, code2, "%s", vsp->errbuf);
1374
vsp->errbuf[0] = '\0';
1379
/*****************************************************************************
1381
* Valid1GatherProc(gcp)
1382
* top level gather callback
1383
* dispatches to other levels
1385
*****************************************************************************/
1386
/*
 * Gather callback.  Takes the ValidStruct from gcp->userdata, stores gcp
 * in it (ValidErr needs it) and switches on gcp->thistype:
 *   - Bioseq, Bioseq-set, Seq-annot and the Seq-feat validators are
 *     skipped when vsp->onlyspell is set;
 *   - features and descriptors are also passed to the spell checkers;
 *   - BioSource and Pubdesc payloads (feature or descriptor) get their
 *     own validation, and obsolete descriptor types are reported.
 */
static Boolean Valid1GatherProc (GatherContextPtr gcp)
1392
Boolean is_blast_align;
1398
vsp = (ValidStructPtr) (gcp->userdata);
1399
vsp->gcp = gcp; /* needed for ValidErr */
1401
switch (gcp->thistype) {
1403
/* Bioseq: instance, context and graph checks */
if (!vsp->onlyspell) {
1404
ValidateBioseqInst (gcp);
1405
ValidateBioseqContext (gcp);
1406
ValidateGraphsOnBioseq (gcp);
1410
if (!vsp->onlyspell) {
1411
ValidateBioseqSet (gcp);
1415
if (!vsp->onlyspell) {
1416
sap = (SeqAnnotPtr) gcp->thisitem;
1417
/* annots with type 2 that carry a "Blast Type" user object are reported as BLAST alignments */
if (sap != NULL && sap->type == 2) {
1418
is_blast_align = FALSE;
1420
while ((desc = ValNodeFindNext (sap->desc, desc, Annot_descr_user)) != NULL) {
1421
if (desc->data.ptrvalue != NULL) {
1422
oip = ((UserObjectPtr) desc->data.ptrvalue)->type;
1423
if (oip != NULL && StringCmp (oip->str, "Blast Type") == 0) {
1424
is_blast_align = TRUE;
1428
if (is_blast_align) {
1429
ValidErr (vsp, SEV_ERROR, ERR_SEQ_ALIGN_BlastAligns, "Record contains BLAST alignments");
1435
/* Seq-feat: general validation, then payload-specific checks */
if (!vsp->onlyspell) {
1436
ValidateSeqFeat (gcp);
1437
sfp = (SeqFeatPtr) (gcp->thisitem);
1439
if (sfp->data.choice == SEQFEAT_BIOSRC) {
1440
biop = (BioSourcePtr) sfp->data.value.ptrvalue;
1441
ValidateBioSource (vsp, gcp, biop);
1443
if (sfp->data.choice == SEQFEAT_PUB) {
1444
pdp = (PubdescPtr) sfp->data.value.ptrvalue;
1445
ValidatePubdesc (vsp, gcp, pdp);
1447
if (sfp->cit != NULL) {
1448
ValidateSfpCit (vsp, gcp, sfp);
1452
SpellCheckSeqFeat (gcp);
1455
SpellCheckSeqDescr (gcp);
1457
ValidateSeqDescr (gcp);
1459
sdp = (ValNodePtr) (gcp->thisitem);
1461
if (sdp->choice == Seq_descr_source) {
1462
biop = (BioSourcePtr) sdp->data.ptrvalue;
1463
ValidateBioSource (vsp, gcp, biop);
1465
if (sdp->choice == Seq_descr_pub) {
1466
pdp = (PubdescPtr) sdp->data.ptrvalue;
1467
ValidatePubdesc (vsp, gcp, pdp);
1469
/* descriptor types below are reported as obsolete */
if (sdp->choice == Seq_descr_mol_type) {
1470
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "MolType descriptor is obsolete");
1472
if (sdp->choice == Seq_descr_modif) {
1473
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Modif descriptor is obsolete");
1475
if (sdp->choice == Seq_descr_method) {
1476
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Method descriptor is obsolete");
1478
if (sdp->choice == Seq_descr_org) {
1479
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "OrgRef descriptor is obsolete");
1490
/*
 * LookForAnyPubAndOrg (sep, no_pub, no_biosrc)
 *   Scan a Seq-entry (recursing through the members of a Bioseq-set) for
 *   publication and source information, looking both at feature tables
 *   (sap->type == 1: SEQFEAT_PUB / SEQFEAT_BIOSRC) and at descriptors
 *   (Seq_descr_pub / Seq_descr_source).  Results are communicated through
 *   the two Boolean out-parameters; ValidateSeqEntry passes its
 *   suppress_no_pubs / suppress_no_biosrc flags here.
 */
static void LookForAnyPubAndOrg (SeqEntryPtr sep, BoolPtr no_pub, BoolPtr no_biosrc)
1494
SeqAnnotPtr sap = NULL;
1495
ValNodePtr sdp = NULL;
1499
/* all three arguments are required */
if (sep == NULL || no_pub == NULL || no_biosrc == NULL)
1501
if (IS_Bioseq (sep)) {
1502
bsp = (BioseqPtr) sep->data.ptrvalue;
1507
} else if (IS_Bioseq_set (sep)) {
1508
bssp = (BioseqSetPtr) sep->data.ptrvalue;
1511
/* descend into every member of the set */
for (tmp = bssp->seq_set; tmp != NULL; tmp = tmp->next) {
1512
LookForAnyPubAndOrg (tmp, no_pub, no_biosrc);
1518
/* feature tables */
while (sap != NULL) {
1519
if (sap->type == 1) {
1520
sfp = (SeqFeatPtr) sap->data;
1521
while (sfp != NULL) {
1522
if (sfp->data.choice == SEQFEAT_PUB) {
1524
} else if (sfp->data.choice == SEQFEAT_BIOSRC) {
1532
/* descriptors */
while (sdp != NULL) {
1533
if (sdp->choice == Seq_descr_pub) {
1535
} else if (sdp->choice == Seq_descr_source) {
1542
/*
 * CheckFeatPacking (bsp, sfp, num_misplaced_features)
 *   Check where feature sfp, indexed on bsp, is packaged and increment
 *   *num_misplaced_features when the packaging is wrong:
 *   - annot on a Bioseq: that Bioseq must be bsp, or a part whose parent
 *     (SeqMgrGetParentOfPart) is bsp;
 *   - annot on a Bioseq-set: the chain of parent sets of bsp is walked
 *     looking for that set.
 */
static void CheckFeatPacking (BioseqPtr bsp, SeqFeatPtr sfp, Uint4Ptr num_misplaced_features)
1545
BioseqSetPtr bssp, parent;
1548
if (sfp->idx.parenttype == OBJ_SEQANNOT) {
1549
sap = (SeqAnnotPtr) sfp->idx.parentptr;
1552
if (sap->idx.parenttype == OBJ_BIOSEQ) {
1553
/* if feature packaged on bioseq, must be target bioseq */
1554
par = (BioseqPtr) sap->idx.parentptr;
1555
if (par != bsp && SeqMgrGetParentOfPart (par, NULL) != bsp) {
1556
(*num_misplaced_features)++;
1560
if (sap->idx.parenttype == OBJ_BIOSEQSET) {
1561
/* if feature packaged on set, set must contain bioseq */
1562
bssp = (BioseqSetPtr) sap->idx.parentptr;
1565
if (bsp->idx.parenttype == OBJ_BIOSEQSET) {
1566
parent = (BioseqSetPtr) bsp->idx.parentptr;
1567
/* climb through the enclosing sets of bsp */
while (parent != NULL) {
1570
if (parent->idx.parenttype != OBJ_BIOSEQSET)
1572
parent = (BioseqSetPtr) parent->idx.parentptr;
1574
(*num_misplaced_features)++;
1580
/*
 * Bioseq callback (passed to SeqMgrExploreBioseqs by ValidateSeqEntry).
 * bcontext->userdata points at the Uint4 counter of misplaced features;
 * every feature indexed on bsp is run through CheckFeatPacking.
 */
static Boolean LIBCALLBACK CountMisplacedFeatures (BioseqPtr bsp, SeqMgrBioseqContextPtr bcontext)
1582
Uint4Ptr num_misplaced_features;
1584
SeqMgrFeatContext fcontext;
1586
num_misplaced_features = (Uint4Ptr) bcontext->userdata;
1587
/* iterate over all indexed features (no feature type filter) */
sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
1588
while (sfp != NULL) {
1589
CheckFeatPacking (bsp, sfp, num_misplaced_features);
1590
sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
1596
/*
 * IsRefSeq (bsp)
 *   Walk the id chain of bsp looking for an id whose choice is
 *   SEQID_OTHER.
 */
static Boolean IsRefSeq (BioseqPtr bsp)
1602
for (sip = bsp->id; sip != NULL; sip = sip->next) {
1603
if (sip->choice == SEQID_OTHER)
1609
/*
 * ValidateSeqEntry (sep, vsp)
 *   Entry point of the validator.  Visible steps:
 *   - optionally index features (vsp->useSeqMgrIndexes) and count
 *     mispackaged features via CountMisplacedFeatures; otherwise assign
 *     item IDs with AssignIDsInEntity;
 *   - for wrapper set classes (genbank, pir, gibb, gi, swissprot) step
 *     down to the contained entries;
 *   - decide once per record whether "no pubs" / "no biosource" should be
 *     reported (LookForAnyPubAndOrg);
 *   - for each entry: clear vsp, set up gather scope and context, report
 *     record-level problems, run GatherSeqEntry with Valid1GatherProc,
 *     optionally validate alignments, and add vsp->errors[] into the
 *     local running totals;
 *   - finally copy the totals back to vsp->errors[].
 */
NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp)
1615
Boolean do_many = FALSE;
1616
Boolean mult_subs = FALSE;
1617
Boolean first = TRUE;
1619
Boolean suppress_no_pubs = TRUE;
1620
Boolean suppress_no_biosrc = TRUE;
1621
Uint4 num_misplaced_features = 0;
1622
GatherContextPtr gcp = NULL;
1625
BioseqPtr fbsp = NULL;
1632
Boolean isGPS = FALSE;
1633
Boolean isPatent = FALSE;
1634
Boolean isPDB = FALSE;
1636
for (i = 0; i < 6; i++) /* keep errors between clears */
1639
if (vsp->useSeqMgrIndexes) {
1640
entityID = ObjMgrGetEntityIDForChoice (sep);
1642
if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
1643
SeqMgrIndexFeatures (entityID, NULL);
1645
/* count features whose packaging does not match their Bioseq */
SeqMgrExploreBioseqs (entityID, NULL, (Pointer) &num_misplaced_features, CountMisplacedFeatures, TRUE, TRUE, TRUE);
1648
/* if not using indexing, still need feature->idx.subtype now */
1650
entityID = ObjMgrGetEntityIDForChoice (sep);
1651
AssignIDsInEntity (entityID, 0, NULL);
1654
/* Seq-submit can have multiple entries with no Bioseq-set wrapper */
1656
omdp = ObjMgrGetData (entityID);
1657
if (omdp != NULL && omdp->datatype == OBJ_SEQSUB) {
1658
ssp = (SeqSubmitPtr) omdp->dataptr;
1659
if (ssp != NULL && ssp->data != NULL) {
1660
if (sep->next != NULL) {
1667
if (IS_Bioseq_set (sep)) {
1668
bssp = (BioseqSetPtr) (sep->data.ptrvalue);
1669
switch (bssp->_class) {
1670
case BioseqseqSet_class_genbank:
1671
case BioseqseqSet_class_pir:
1672
case BioseqseqSet_class_gibb:
1673
case BioseqseqSet_class_gi:
1674
case BioseqseqSet_class_swissprot:
1675
/* wrapper set: continue with the entries it contains */
sep = bssp->seq_set;
1678
case BioseqseqSet_class_gen_prod_set:
1685
/* if no pubs or biosource, only one message, not one per bioseq */
1688
for (tmp = sep; tmp != NULL; tmp = tmp->next) {
1689
LookForAnyPubAndOrg (tmp, &suppress_no_pubs, &suppress_no_biosrc);
1692
LookForAnyPubAndOrg (sep, &suppress_no_pubs, &suppress_no_biosrc);
1695
globalvsp = vsp; /* for spell checker */
1697
/* main loop over the entries to validate */
while (sep != NULL) {
1698
MemSet (&gs, 0, sizeof (GatherScope));
1699
gs.scope = sep; /* default is to scope to this set */
1701
ValidStructClear (vsp);
1704
MemSet ((Pointer) &gc, 0, sizeof (GatherContext));
1706
gc.entityID = ObjMgrGetEntityIDForChoice (sep);
1708
if (IS_Bioseq (sep)) {
1709
gc.thistype = OBJ_BIOSEQ;
1711
gc.thistype = OBJ_BIOSEQSET;
1713
vsp->gcp = gcp; /* above needed for ValidErr */
1714
vsp->suppress_no_pubs = suppress_no_pubs;
1715
vsp->suppress_no_biosrc = suppress_no_biosrc;
1717
/* build seqmgr feature indices if not already done */
1720
if (vsp->useSeqMgrIndexes) {
1721
entityID = ObjMgrGetEntityIDForChoice (sep);
1723
if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
1724
SeqMgrIndexFeatures (entityID, NULL);
1727
/* lock all remote genome components in advance */
1729
bsplist = LockFarComponents (sep);
1732
fsep = FindNthBioseq (sep, 1);
1734
if (fsep != NULL && IS_Bioseq (fsep)) {
1735
fbsp = (BioseqPtr) fsep->data.ptrvalue;
1736
/* report context as first bioseq */
1740
/* look at the first Bioseq's ids for patent / PDB records */
for (sip = fbsp->id; sip != NULL; sip = sip->next) {
1741
if (sip->choice == SEQID_PATENT) {
1743
} else if (sip->choice == SEQID_PDB) {
1749
/* record-level "no publications" report; not issued for Seq-submit, GPS or RefSeq records */
if (suppress_no_pubs) {
1750
omdp = ObjMgrGetData (gc.entityID);
1751
if (omdp == NULL || omdp->datatype != OBJ_SEQSUB) {
1752
if ((!isGPS) && (!IsRefSeq (fbsp))) {
1753
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_NoPubFound, "No publications anywhere on this entire record.");
1757
/* record-level "no organism" report; not issued for patent or PDB records */
if (suppress_no_biosrc) {
1758
if ((!isPatent) && ((!isPDB))) {
1759
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_NoOrgFound, "No organism name anywhere on this entire record.");
1763
/* plural and singular wording of the packaging message */
if (num_misplaced_features > 1) {
1764
ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_FeaturePackagingProblem, "There are %d mispackaged features in this record.", (int) num_misplaced_features);
1765
} else if (num_misplaced_features == 1) {
1766
ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_FeaturePackagingProblem, "There is %d mispackaged feature in this record.", (int) num_misplaced_features);
1773
/* scope lookups to the top entry while gathering; the old scope is restored below */
topsep = GetTopSeqEntryForEntityID (gc.entityID);
1774
oldsep = SeqEntrySetScope (topsep);
1776
AssignIDsInEntity (gc.entityID, 0, NULL);
1778
GatherSeqEntry (sep, (Pointer) vsp, Valid1GatherProc, &gs);
1780
if (vsp->validateAlignments) {
1782
ValidateSeqAlignWithinValidator (vsp, sep);
1786
SeqEntrySetScope (oldsep);
1788
if (vsp->useSeqMgrIndexes) {
1790
/* unlock all pre-locked remote genome components */
1792
bsplist = UnlockFarComponents (bsplist);
1796
/* accumulate this entry's per-severity counts */
for (i = 0; i < 6; i++)
1797
errors[i] += vsp->errors[i];
1804
/* hand the accumulated totals back to the caller */
for (i = 0; i < 6; i++)
1805
vsp->errors[i] = errors[i];
1811
/*
 * SeqEntryExplore callback (used by ValidateBioseqSet).  data is the
 * ValidStruct; for a Bioseq entry the molecule type (ISA_aa) and the
 * segmented representation (Seq_repr_seg) are examined.
 * NOTE(review): presumably this feeds the nuccnt / protcnt / segcnt
 * counters tested in ValidateBioseqSet -- confirm.
 */
static void ValidateSetContents (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
1816
vsp = (ValidStructPtr) data;
1818
if (IS_Bioseq (sep)) {
1819
bsp = (BioseqPtr) (sep->data.ptrvalue);
1820
if (ISA_aa (bsp->mol))
1824
if (bsp->repr == Seq_repr_seg)
1832
/*
 * GetBioseqSetClass (cl)
 *   Map a Bioseq-set class code to a printable name by comparing cl with
 *   the BioseqseqSet_class_* constants one after another.  The result is
 *   used as the "%s" argument of the packaging error messages (callers
 *   pass bssp1->_class).
 */
static CharPtr GetBioseqSetClass (Uint1 cl)
1834
if (cl == BioseqseqSet_class_nuc_prot)
1835
return ("nuc-prot");
1836
if (cl == BioseqseqSet_class_segset)
1838
if (cl == BioseqseqSet_class_conset)
1840
if (cl == BioseqseqSet_class_parts)
1842
if (cl == BioseqseqSet_class_gibb)
1844
if (cl == BioseqseqSet_class_gi)
1846
if (cl == BioseqseqSet_class_genbank)
1848
if (cl == BioseqseqSet_class_pir)
1850
if (cl == BioseqseqSet_class_pub_set)
1852
if (cl == BioseqseqSet_class_equiv)
1854
if (cl == BioseqseqSet_class_swissprot)
1855
return ("swissprot");
1856
if (cl == BioseqseqSet_class_pdb_entry)
1857
return ("pdb-entry");
1858
if (cl == BioseqseqSet_class_mut_set)
1860
if (cl == BioseqseqSet_class_pop_set)
1862
if (cl == BioseqseqSet_class_phy_set)
1864
if (cl == BioseqseqSet_class_other)
1869
/*
 * IfInGPSmustBeMrnaProduct (vsp, bsp)
 *   When the examined Seq-entry is a Bioseq-set of class gen_prod_set,
 *   warn if no RNA feature has bsp as its product
 *   (SeqMgrGetRNAgivenProduct returns NULL).
 */
static void IfInGPSmustBeMrnaProduct (ValidStructPtr vsp, BioseqPtr bsp)
1874
/* see if in genomic product */
1877
if (sep != NULL && IS_Bioseq_set (sep)) {
1878
bssp = (BioseqSetPtr) sep->data.ptrvalue;
1879
if (bssp != NULL && bssp->_class == BioseqseqSet_class_gen_prod_set) {
1880
if (SeqMgrGetRNAgivenProduct (bsp, NULL) == NULL) {
1881
ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_GenomicProductPackagingProblem, "Nucleotide bioseq should be product of mRNA feature on contig, but is not");
1887
/*
 * ValidateNucProtSet (bssp, vsp)
 *   Checks for a nuc-prot set: each nucleotide Bioseq member is passed to
 *   IfInGPSmustBeMrnaProduct, and a member Bioseq-set whose class is not
 *   segset is reported as ERR_SEQ_PKG_NucProtNotSegSet.
 */
static void ValidateNucProtSet (BioseqSetPtr bssp, ValidStructPtr vsp)
1893
/* only applies to sets of class nuc-prot */
if (bssp->_class != BioseqseqSet_class_nuc_prot)
1896
for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
1897
if (IS_Bioseq (sep)) {
1898
bsp = (BioseqPtr) sep->data.ptrvalue;
1899
if (bsp != NULL && ISA_na (bsp->mol)) {
1900
IfInGPSmustBeMrnaProduct (vsp, bsp);
1904
if (!IS_Bioseq_set (sep))
1907
bssp1 = sep->data.ptrvalue;
1911
/* the only nested set class accepted here is segset */
if (bssp1->_class != BioseqseqSet_class_segset) {
1912
ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_NucProtNotSegSet,
1913
"Nuc-prot Bioseq-set contains wrong Bioseq-set, its class is \"%s\".", GetBioseqSetClass (bssp1->_class));
1919
/*
 * ValidateSegmentedSet (bssp, vsp)
 *   Checks for a segmented set: member Bioseqs must not mix nucleotides
 *   and proteins (ERR_SEQ_PKG_SegSetMixedBioseqs), and a member
 *   Bioseq-set must have class parts (ERR_SEQ_PKG_SegSetNotParts).
 */
static void ValidateSegmentedSet (BioseqSetPtr bssp, ValidStructPtr vsp)
1926
/* only applies to sets of class segset */
if (bssp->_class != BioseqseqSet_class_segset)
1929
for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
1930
if (IS_Bioseq (sep)) {
1931
bsp = (BioseqPtr) sep->data.ptrvalue;
1933
/* mol is the reference molecule type; 0 / Seq_mol_other means none is held yet */
if (mol == 0 || mol == Seq_mol_other) {
1935
} else if (bsp->mol != Seq_mol_other) {
1936
if (ISA_na (bsp->mol) != ISA_na (mol)) {
1937
ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_SegSetMixedBioseqs, "Segmented set contains mixture of nucleotides and proteins");
1943
if (!IS_Bioseq_set (sep))
1946
bssp1 = sep->data.ptrvalue;
1950
if (bssp1->_class != BioseqseqSet_class_parts) {
1951
ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_SegSetNotParts,
1952
"Segmented set contains wrong Bioseq-set, its class is \"%s\".", GetBioseqSetClass (bssp1->_class));
1958
/*
 * ValidatePartsSet (bssp, vsp)
 *   Checks for a parts set in two passes over bssp->seq_set: member
 *   Bioseqs must not mix nucleotides and proteins
 *   (ERR_SEQ_PKG_PartsSetMixedBioseqs), and a nested Bioseq-set is
 *   reported as ERR_SEQ_PKG_PartsSetHasSets.
 */
static void ValidatePartsSet (BioseqSetPtr bssp, ValidStructPtr vsp)
1965
/* only applies to sets of class parts */
if (bssp->_class != BioseqseqSet_class_parts)
1968
/* pass 1: molecule type consistency */
for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
1969
if (IS_Bioseq (sep)) {
1970
bsp = (BioseqPtr) sep->data.ptrvalue;
1972
if (mol == 0 || mol == Seq_mol_other) {
1974
} else if (bsp->mol != Seq_mol_other) {
1975
if (ISA_na (bsp->mol) != ISA_na (mol)) {
1976
ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_PartsSetMixedBioseqs, "Parts set contains mixture of nucleotides and proteins");
1984
/* pass 2: nested sets */
for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
1985
if (IS_Bioseq_set (sep)) {
1986
bssp1 = sep->data.ptrvalue;
1990
ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_PartsSetHasSets,
1991
"Parts set contains unwanted Bioseq-set, its class is \"%s\".", GetBioseqSetClass (bssp1->_class));
1997
/*
 * CheckForInconsistentBiosources (sep, vsp, orpp)
 *   Used by ValidatePopSet.  Recurses through Bioseq-sets; for a Bioseq
 *   the BioSource is taken from the source descriptor
 *   (SeqMgrGetNextDescriptor) or from a BIOSRC feature
 *   (SeqMgrGetNextFeature), and its taxname is compared
 *   (case-insensitively) with that of the first organism.  A mismatch is
 *   reported as ERR_SEQ_DESCR_InconsistentBioSources, with the gather
 *   context temporarily pointed at the offending descriptor or feature.
 *   Callers test the Boolean result to decide whether to stop iterating.
 *   NOTE(review): presumably TRUE means a mismatch was reported and *orpp
 *   carries the first organism between calls -- confirm.
 */
static Boolean CheckForInconsistentBiosources (SeqEntryPtr sep, ValidStructPtr vsp, OrgRefPtr PNTR orpp)
2004
SeqMgrDescContext dcontext;
2005
SeqMgrFeatContext fcontext;
2009
GatherContextPtr gcp;
2010
Uint2 entityID = 0, oldEntityID;
2011
Uint2 itemID = 0, oldItemID;
2012
Uint2 itemtype = 0, oldItemtype;
2014
if (sep == NULL || vsp == NULL || orpp == NULL)
2018
if (IS_Bioseq_set (sep)) {
2019
bssp = (BioseqSetPtr) sep->data.ptrvalue;
2022
/* check each member of the set recursively */
for (tmp = bssp->seq_set; tmp != NULL; tmp = tmp->next) {
2023
if (CheckForInconsistentBiosources (tmp, vsp, orpp))
2029
if (!IS_Bioseq (sep))
2031
bsp = (BioseqPtr) sep->data.ptrvalue;
2036
/* source descriptor; its item identifiers are kept for error context */
sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
2038
biop = (BioSourcePtr) sdp->data.ptrvalue;
2039
entityID = dcontext.entityID;
2040
itemID = dcontext.itemID;
2041
itemtype = OBJ_SEQDESC;
2043
/* BioSource feature; its item identifiers are kept the same way */
sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_BIOSRC, 0, &fcontext);
2045
biop = (BioSourcePtr) sfp->data.value.ptrvalue;
2046
entityID = fcontext.entityID;
2047
itemID = fcontext.itemID;
2048
itemtype = OBJ_SEQFEAT;
2058
if (firstorp == NULL) {
2063
/* both names start with "Influenza virus " and agree through the 17th character */
if (StringNICmp (orp->taxname, "Influenza virus ", 16) == 0 &&
2064
StringNICmp (firstorp->taxname, "Influenza virus ", 16) == 0 && StringNICmp (orp->taxname, firstorp->taxname, 17) == 0) {
2068
if (StringICmp (orp->taxname, firstorp->taxname) == 0)
2071
/* point the gather context at the offending item while reporting */
oldEntityID = gcp->entityID;
2072
oldItemID = gcp->itemID;
2073
oldItemtype = gcp->thistype;
2075
gcp->entityID = entityID;
2076
gcp->itemID = itemID;
2077
gcp->thistype = itemtype;
2079
/* only report the first one that doesn't match */
2081
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InconsistentBioSources, "Population set contains inconsistent organisms.");
2083
/* restore the gather context */
gcp->entityID = oldEntityID;
2084
gcp->itemID = oldItemID;
2085
gcp->thistype = oldItemtype;
2090
/*
 * ValidatePopSet (bssp, vsp)
 *   For a population set, run CheckForInconsistentBiosources over every
 *   member entry, sharing one OrgRefPtr (initially NULL) across the calls.
 */
static void ValidatePopSet (BioseqSetPtr bssp, ValidStructPtr vsp)
2092
OrgRefPtr orp = NULL;
2095
/* only applies to sets of class pop_set */
if (bssp->_class != BioseqseqSet_class_pop_set)
2098
for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
2099
if (CheckForInconsistentBiosources (sep, vsp, &orp))
2104
/*
 * ValidateGenProdSet (bssp, vsp)
 *   For a genomic product set whose first member is a Bioseq: with
 *   feature indexing enabled, visit every mRNA feature on that Bioseq and
 *   look up its product with BioseqFindFromSeqLoc.  A product that is not
 *   packaged in the set is reported with a warning unless its id is
 *   SEQID_OTHER (far RefSeq products are accepted).  The gather context's
 *   itemID / thistype are pointed at the mRNA while reporting and are
 *   restored at the end.
 */
static void ValidateGenProdSet (BioseqSetPtr bssp, ValidStructPtr vsp)
2108
SeqMgrFeatContext fcontext;
2109
GatherContextPtr gcp = NULL;
2112
Uint2 olditemtype = 0;
2113
Uint2 olditemid = 0;
2117
/* only applies to sets of class gen_prod_set */
if (bssp->_class != BioseqseqSet_class_gen_prod_set)
2120
sep = bssp->seq_set;
2121
if (!IS_Bioseq (sep))
2123
bsp = (BioseqPtr) sep->data.ptrvalue;
2130
/* remember the current gather item so it can be restored */
olditemid = gcp->itemID;
2131
olditemtype = gcp->thistype;
2133
if (vsp->useSeqMgrIndexes) {
2134
mrna = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_mRNA, &fcontext);
2135
while (mrna != NULL) {
2136
cdna = BioseqFindFromSeqLoc (mrna->product);
2138
gcp->itemID = mrna->idx.itemID;
2139
gcp->thistype = OBJ_SEQFEAT;
2140
loc = SeqLocPrint (mrna->product);
2142
/* placeholder text for the message when no printed location is available */
loc = StringSave ("?");
2144
sip = SeqLocId (mrna->product);
2145
/* okay to have far RefSeq product */
2146
if (sip == NULL || sip->choice != SEQID_OTHER) {
2147
ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_GenomicProductPackagingProblem, "Product of mRNA feature (%s) not packaged in genomic product set", loc);
2151
mrna = SeqMgrGetNextFeature (bsp, mrna, 0, FEATDEF_mRNA, &fcontext);
2155
gcp->itemID = olditemid;
2156
gcp->thistype = olditemtype;
2159
/*
 * ValidateBioseqSet (gcp)
 *   Validate one Bioseq-set.  Reports non-ASCII input once, explores the
 *   set contents with ValidateSetContents, then dispatches on
 *   bssp->_class to the class-specific checks.  A set for which both
 *   vsp->nuccnt and vsp->protcnt are zero is reported as empty.
 */
static void ValidateBioseqSet (GatherContextPtr gcp)
2165
vsp = (ValidStructPtr) (gcp->userdata);
2166
bssp = (BioseqSetPtr) (gcp->thisitem);
2172
if (vsp->non_ascii_chars) { /* non_ascii chars in AsnRead step */
2173
ValidErr (vsp, SEV_ERROR, ERR_GENERIC_NonAsciiAsn, "Non-ascii chars in input ASN.1 strings");
2174
vsp->non_ascii_chars = FALSE; /* only do once */
2183
SeqEntryExplore (sep, (Pointer) vsp, ValidateSetContents);
2185
switch (bssp->_class) {
2186
case 1: /* nuc-prot */
2187
if (vsp->nuccnt == 0)
2188
ValidErr (vsp, SEV_ERROR, ERR_SEQ_PKG_NucProtProblem, "No nucleotides in nuc-prot set");
2189
if (vsp->protcnt == 0)
2190
ValidErr (vsp, SEV_ERROR, ERR_SEQ_PKG_NucProtProblem, "No proteins in nuc-prot set");
2191
ValidateNucProtSet (bssp, vsp);
2193
case 2: /* seg set */
2194
if (vsp->segcnt == 0)
2195
ValidErr (vsp, SEV_ERROR, ERR_SEQ_PKG_SegSetProblem, "No segmented Bioseq in segset");
2196
ValidateSegmentedSet (bssp, vsp);
2198
case 4: /* parts set */
2199
ValidatePartsSet (bssp, vsp);
2201
case BioseqseqSet_class_pop_set: /* population set */
2202
ValidatePopSet (bssp, vsp);
2204
case BioseqseqSet_class_gen_prod_set: /* genomic product set */
2205
ValidateGenProdSet (bssp, vsp);
2207
case BioseqseqSet_class_other:
2208
ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_GenomicProductPackagingProblem, "Genomic product set class incorrectly set to other");
2211
/* neither nucleotide nor protein Bioseqs were counted */
if (!((vsp->nuccnt) || (vsp->protcnt)))
2212
ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_EmptySet, "No Bioseqs in this set");
2218
/*
 * Bioseq callback: userdata is a BoolPtr (isGEDPtr); the ids of bsp are
 * walked and examined by choice.
 * NOTE(review): presumably sets *isGEDPtr when a GenBank/EMBL/DDBJ style
 * id is found -- confirm against the switch cases.
 */
static void LookForGEDseqID (BioseqPtr bsp, Pointer userdata)
2223
isGEDPtr = (BoolPtr) userdata;
2224
for (sip = bsp->id; sip != NULL; sip = sip->next) {
2225
switch (sip->choice) {
2240
/*
 * SuppressTrailingXMessage (bsp)
 *   Decide whether a message about bsp should be suppressed.  Two things
 *   are examined:
 *   - the translation of the CDS that produces bsp
 *     (ProteinFromCdRegionEx), checking whether it is longer than one
 *     residue and ends in '*';
 *   - the MolInfo descriptor of bsp, checking for completeness values 4
 *     or 5.
 *   NOTE(review): presumably returns TRUE in those cases -- confirm.
 */
static Boolean SuppressTrailingXMessage (BioseqPtr bsp)
2250
cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
2252
bs = ProteinFromCdRegionEx (cds, TRUE, FALSE);
2254
str = BSMerge (bs, NULL);
2258
len = StringLen (str);
2259
/* translation ends with a stop symbol */
if (len > 1 && str[len - 1] == '*') {
2267
sdp = BioseqGetSeqDescr (bsp, Seq_descr_molinfo, NULL);
2269
mip = (MolInfoPtr) sdp->data.ptrvalue;
2271
if (mip->completeness == 4 || mip->completeness == 5)
2278
/*
 * LookForSecondaryConflict (vsp, gcp, accn, extra_acc)
 *   Compare the primary accession accn (case-insensitively) with every
 *   string in the extra_acc list and report
 *   ERR_SEQ_INST_BadSecondaryAccn when the same accession is used as both
 *   primary and secondary.  Empty strings are tested with
 *   StringHasNoText before comparing.
 */
static void LookForSecondaryConflict (ValidStructPtr vsp, GatherContextPtr gcp, CharPtr accn, ValNodePtr extra_acc)
2283
if (vsp == NULL || gcp == NULL)
2285
if (StringHasNoText (accn))
2287
for (vnp = extra_acc; vnp != NULL; vnp = vnp->next) {
2288
str = (CharPtr) vnp->data.ptrvalue;
2289
if (StringHasNoText (str))
2291
if (StringICmp (accn, str) == 0) {
2292
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadSecondaryAccn, "%s used for both primary and secondary accession", accn);
2297
/*
 * CheckSegBspAgainstParts (vsp, gcp, bsp)
 *   For a segmented Bioseq (repr seg, seq_ext_type 1, non-NULL seq_ext)
 *   compare the locations in bsp->seq_ext, in order, with the Bioseqs of
 *   a parts set: each non-NULL location's id must be among the ids of the
 *   corresponding part.  Problems are reported as
 *   ERR_SEQ_INST_PartsOutOfOrder (too few / too many parts, component is
 *   not a Bioseq, or order mismatch).  Tested only when feature indexing
 *   is enabled.
 */
static void CheckSegBspAgainstParts (ValidStructPtr vsp, GatherContextPtr gcp, BioseqPtr bsp)
2305
if (vsp == NULL || gcp == NULL || bsp == NULL)
2307
if (!vsp->useSeqMgrIndexes)
2310
if (bsp->repr != Seq_repr_seg || bsp->seq_ext_type != 1 || bsp->seq_ext == NULL)
2313
sep = bsp->seqentry;
2319
if (!IS_Bioseq_set (sep))
2321
bssp = (BioseqSetPtr) sep->data.ptrvalue;
2324
if (bssp->_class != BioseqseqSet_class_parts)
2327
/* walk seq_ext locations and parts set members side by side */
sep = bssp->seq_set;
2328
for (slp = (ValNodePtr) bsp->seq_ext; slp != NULL; slp = slp->next) {
2329
/* NULL locations are tested separately from the id comparison */
if (slp->choice == SEQLOC_NULL)
2332
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_PartsOutOfOrder, "Parts set does not contain enough Bioseqs");
2335
if (IS_Bioseq (sep)) {
2336
part = (BioseqPtr) sep->data.ptrvalue;
2337
sip = SeqLocId (slp);
2338
if (sip != NULL && part != NULL) {
2339
if (!SeqIdIn (sip, part->id)) {
2340
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_PartsOutOfOrder, "Segmented bioseq seq_ext does not correspond to parts packaging order");
2345
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_PartsOutOfOrder, "Parts set component is not Bioseq");
2351
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_PartsOutOfOrder, "Parts set contains too many Bioseqs");
2355
/*****************************************************************************
2357
* ValidateBioseqInst(gcp)
2358
* Validate one Bioseq Seq-inst
2360
*****************************************************************************/
2361
/*
 * ValidateBioseqInst
 *   Validates the Seq-inst of the Bioseq carried in gcp->thisitem, using the
 *   ValidStructPtr carried in gcp->userdata.  Every problem found is reported
 *   through ValidErr; the visible checks cover Seq-ids, representation class
 *   versus seq_ext/seq_data, molecule type, sequence alphabet and residues,
 *   segmented and delta contents, length limits, and protein titles.
 *
 *   NOTE(review): this listing omits many short lines (braces, break/return
 *   statements, brief assignments and some declarations), so the control
 *   flow between the visible statements cannot be fully determined here.
 */
static void ValidateBioseqInst (GatherContextPtr gcp)
2363
Boolean retval = TRUE;
2364
Int2 i, start_at, num;
2365
Boolean errors[4], check_alphabet;
2366
static char *repr[8] = {
2367
"virtual", "raw", "segmented", "constructed",
2368
"reference", "consensus", "map", "delta"
2371
Int2 residue, x, termination;
2372
Int4 len, divisor = 1, len2;
2374
ValNodePtr vnp, vnp2, idlist;
2375
BioseqContextPtr bcp;
2376
Boolean got_partial, is_invalid;
2377
int seqtype, terminations;
2379
BioseqPtr bsp, bsp2;
2380
SeqIdPtr sip1, sip2;
2381
Char buf1[41], buf2[41];
2383
SeqCodeTablePtr sctp;
2386
SeqMgrDescContext context;
2390
SeqMgrFeatContext genectxt;
2393
SeqMgrFeatContext protctxt;
2396
CharPtr ptr, last, str, title, buf;
2400
Boolean isGenBankEMBLorDDBJ;
2401
Boolean isPatent = FALSE;
2402
Boolean isPDB = FALSE;
2403
Boolean isNT = FALSE;
2405
Int2 numletters, numdigits;
2406
Boolean letterAfterDigit, badIDchars;
2410
SeqMgrDescContext dcontext;
2411
size_t buflen = 1001;
2414
Uint2 olditemtype = 0;
2415
Uint2 olditemid = 0;
2419
/* set up data structures */
2421
vsp = (ValidStructPtr) (gcp->userdata);
2422
bsp = (BioseqPtr) (gcp->thisitem);
2426
vsp->bssp = (BioseqSetPtr) (gcp->parentitem);
2427
vsp->bsp_partial_val = 0;
2429
if (vsp->non_ascii_chars) { /* non_ascii chars in AsnRead step */
2430
ValidErr (vsp, SEV_REJECT, ERR_GENERIC_NonAsciiAsn, "Non-ascii chars in input ASN.1 strings");
2431
vsp->non_ascii_chars = FALSE; /* only do once */
2434
if (bsp->id == NULL) {
2435
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_NoIdOnBioseq, "No ids on a Bioseq");
/* look for a SEQID_OTHER accession starting with "NT_", and compare each id
   with every later id to catch conflicting ids on the same Bioseq */
2439
for (sip1 = bsp->id; sip1 != NULL; sip1 = sip1->next) {
2440
if (sip1->choice == SEQID_OTHER) {
2441
tsip = (TextSeqIdPtr) sip1->data.ptrvalue;
2442
if (tsip != NULL && tsip->accession != NULL) {
2443
if (StringNICmp (tsip->accession, "NT_", 3) == 0) {
2449
for (sip2 = sip1->next; sip2 != NULL; sip2 = sip2->next) {
2450
if (SeqIdComp (sip1, sip2) != SIC_DIFF) {
2451
SeqIdWrite (sip1, buf1, PRINTID_FASTA_SHORT, 40);
2452
SeqIdWrite (sip2, buf2, PRINTID_FASTA_SHORT, 40);
2453
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_ConflictingIdsOnBioseq, "Conflicting ids on a Bioseq: (%s - %s)", buf1, buf2);
/* per-id format checks: accession letter/digit layout, secondary accession
   conflicts, whitespace in Seq-id.name, and zero GI numbers */
2458
for (sip1 = bsp->id; sip1 != NULL; sip1 = sip1->next) {
2459
switch (sip1->choice) {
2466
tsip = (TextSeqIdPtr) sip1->data.ptrvalue;
2467
if (tsip != NULL && tsip->accession != NULL) {
2470
letterAfterDigit = FALSE;
2472
for (ptr = tsip->accession, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
2473
if (IS_UPPER (ch)) {
2475
if (numdigits > 0) {
2476
letterAfterDigit = TRUE;
2478
} else if (IS_DIGIT (ch)) {
2484
if (letterAfterDigit || badIDchars) {
/* the format now carries %s so the offending accession appears in the
   message; previously the argument was passed but never printed */
2485
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadSeqIdFormat, "Bad accession %s", tsip->accession);
/* accepted letter/digit counts: 1+5 or 2+6 on nucleotide, 3+5 on protein,
   2+6 on a segmented protein */
2486
} else if (numletters == 1 && numdigits == 5 && ISA_na (bsp->mol)) {
2487
} else if (numletters == 2 && numdigits == 6 && ISA_na (bsp->mol)) {
2488
} else if (numletters == 3 && numdigits == 5 && ISA_aa (bsp->mol)) {
2489
} else if (numletters == 2 && numdigits == 6 && ISA_aa (bsp->mol) && bsp->repr == Seq_repr_seg) {
/* same fix as above: include the accession in the reported text */
2491
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadSeqIdFormat, "Bad accession %s", tsip->accession);
/* the primary accession is checked against the secondary accessions held
   in the GenBank and EMBL block descriptors */
2493
if (vsp->useSeqMgrIndexes) {
2494
vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_genbank, &context);
2496
gbp = (GBBlockPtr) vnp->data.ptrvalue;
2498
LookForSecondaryConflict (vsp, gcp, tsip->accession, gbp->extra_accessions);
2501
vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_embl, &context);
2503
ebp = (EMBLBlockPtr) vnp->data.ptrvalue;
2505
LookForSecondaryConflict (vsp, gcp, tsip->accession, ebp->extra_acc);
2510
/* and keep going with further test */
2512
tsip = (TextSeqIdPtr) sip1->data.ptrvalue;
2513
if (tsip != NULL && tsip->name != NULL) {
2515
for (ptr = tsip->name, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
2516
if (IS_WHITESP (ch)) {
2521
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqIdNameHasSpace, "Seq-id.name '%s' should be a single word without any spaces", tsip->name);
2532
if (sip1->data.intvalue == 0) {
2533
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_ZeroGiNumber, "Invalid GI number");
/* every id must resolve, through BioseqFindCore, back to this same Bioseq */
2540
for (sip1 = bsp->id; sip1 != NULL; sip1 = sip1->next) {
2541
bsp2 = BioseqFindCore (sip1);
2544
SeqIdWrite (sip1, buf1, PRINTID_FASTA_SHORT, 40);
2545
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_IdOnMultipleBioseqs, "BioseqFind (%s) unable to find itself - possible internal error", buf1);
2547
} else if (bsp2 != bsp) {
2548
SeqIdWrite (sip1, buf1, PRINTID_FASTA_SHORT, 40);
2549
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_IdOnMultipleBioseqs, "SeqID %s is present on multiple Bioseqs in record", buf1);
/* check that presence of seq_ext / seq_data fits the representation class;
   the errors[] flags tested further down select which message is reported */
2553
for (i = 0; i < 4; i++)
2556
switch (bsp->repr) {
2557
case Seq_repr_virtual:
2558
if ((bsp->seq_ext_type) || (bsp->seq_ext != NULL))
2560
if ((bsp->seq_data_type) || (bsp->seq_data != NULL))
2564
if ((bsp->seq_ext_type != 3) || (bsp->seq_ext == NULL))
2566
if ((bsp->seq_data_type) || (bsp->seq_data != NULL))
2570
if ((bsp->seq_ext_type != 2) || (bsp->seq_ext == NULL))
2572
if ((bsp->seq_data_type) || (bsp->seq_data != NULL))
2576
if ((bsp->seq_ext_type != 1) || (bsp->seq_ext == NULL))
2578
if ((bsp->seq_data_type) || (bsp->seq_data != NULL))
2582
case Seq_repr_const:
2583
if ((bsp->seq_ext_type) || (bsp->seq_ext != NULL))
2585
if ((bsp->seq_data_type < 1) || (bsp->seq_data_type > 11)
2586
|| (bsp->seq_data == NULL))
2589
case Seq_repr_delta:
2590
if ((bsp->seq_ext_type != 4) || (bsp->seq_ext == NULL))
2592
if ((bsp->seq_data_type) || (bsp->seq_data != NULL))
2596
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_ReprInvalid, "Invalid Bioseq->repr = %d", (int) (bsp->repr));
/* repr[] has 8 names and is indexed with bsp->repr - 1 in the messages below */
2600
if (errors[0] == TRUE) {
2601
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_ExtNotAllowed, "Bioseq-ext not allowed on %s Bioseq", repr[bsp->repr - 1]);
2605
if (errors[1] == TRUE) {
2606
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_ExtBadOrMissing, "Missing or incorrect Bioseq-ext on %s Bioseq", repr[bsp->repr - 1]);
2610
if (errors[2] == TRUE) {
2611
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_SeqDataNotFound, "Missing Seq-data on %s Bioseq", repr[bsp->repr - 1]);
2615
if (errors[3] == TRUE) {
2616
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_SeqDataNotAllowed, "Seq-data not allowed on %s Bioseq", repr[bsp->repr - 1]);
/* proteins must be linear and single stranded; mol must be set to a
   specific type */
2623
if (ISA_aa (bsp->mol)) {
2624
if (bsp->topology > 1) { /* not linear */
2625
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_CircularProtein, "Non-linear topology set on protein");
2627
if (bsp->strand > 1) {
2628
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_DSProtein, "Protein not single stranded");
2633
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_MolNotSet, "Bioseq.mol is 0");
2634
else if (bsp->mol == Seq_mol_other)
2635
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_MolOther, "Bioseq.mol is type other");
2636
else if (bsp->mol == Seq_mol_na)
2637
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_MolNuclAcid, "Bioseq.mol is type na");
2639
/* check sequence alphabet */
2640
if ((bsp->repr == Seq_repr_raw) || (bsp->repr == Seq_repr_const)) {
2641
if (bsp->fuzz != NULL) {
2642
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_FuzzyLen, "Fuzzy length on %s Bioseq", repr[bsp->repr - 1]);
2645
if (bsp->length < 1) {
2646
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_InvalidLen, "Invalid Bioseq length [%ld]", (long) bsp->length);
2649
seqtype = (int) (bsp->seq_data_type);
2651
case Seq_code_iupacna:
2652
case Seq_code_ncbi2na:
2653
case Seq_code_ncbi4na:
2654
case Seq_code_ncbi8na:
2655
case Seq_code_ncbipna:
2656
if (ISA_aa (bsp->mol)) {
2657
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidAlphabet, "Using a nucleic acid alphabet on a protein sequence");
2661
case Seq_code_iupacaa:
2662
case Seq_code_ncbi8aa:
2663
case Seq_code_ncbieaa:
2664
case Seq_code_ncbipaa:
2665
case Seq_code_ncbistdaa:
2666
if (ISA_na (bsp->mol)) {
2667
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidAlphabet, "Using a protein alphabet on a nucleic acid");
2672
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidAlphabet, "Using illegal sequence alphabet [%d]", (int) bsp->seq_data_type);
/* only the codes grouped with check_alphabet = TRUE below get the
   per-residue scan further down */
2676
check_alphabet = FALSE;
2678
case Seq_code_iupacaa:
2679
case Seq_code_iupacna:
2680
case Seq_code_ncbieaa:
2681
case Seq_code_ncbistdaa:
2682
check_alphabet = TRUE;
2684
case Seq_code_ncbi8na:
2685
case Seq_code_ncbi8aa:
2689
case Seq_code_ncbi4na:
2693
case Seq_code_ncbi2na:
2697
case Seq_code_ncbipna:
2701
case Seq_code_ncbipaa:
/* compare the stored byte count (scaled by divisor in the messages) with
   the declared length */
2710
len2 = BSLen (bsp->seq_data);
2712
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data too short [%ld] for given length [%ld]", (long) (len2 * divisor),
2713
(long) bsp->length);
2715
} else if (len < len2) {
2716
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data is larger [%ld] than given length [%ld]", (long) (len2 * divisor),
2717
(long) bsp->length);
2720
if (check_alphabet) { /* check 1 letter alphabets */
2722
case Seq_code_iupacaa:
2723
case Seq_code_ncbieaa:
2726
case Seq_code_ncbistdaa:
2733
spp = SeqPortNew (bsp, 0, -1, 0, 0);
2735
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqPortFail, "Can't open SeqPort");
/* scan every residue: report invalid ones, and track terminators and X residues */
2742
for (len = 0; len < bsp->length; len++) {
2743
residue = SeqPortGetResidue (spp);
2744
if (!IS_residue (residue)) {
2747
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidResidue, "More than 10 invalid residues. Checking stopped");
2750
PatchBadSequence (bsp);
2753
BSSeek (bsp->seq_data, len, SEEK_SET);
2754
x = BSGetByte (bsp->seq_data);
/* ncbistdaa values are reported numerically, other alphabets as characters */
2755
if (bsp->seq_data_type == Seq_code_ncbistdaa)
2756
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidResidue, "Invalid residue [%d] in position [%ld]", (int) x, (long) (len + 1));
2758
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidResidue, "Invalid residue [%c] in position [%ld]", (char) x, (long) (len + 1));
2760
} else if (residue == termination) {
2762
trailingX = 0; /* suppress if followed by terminator */
2763
} else if (residue == 'X') {
2770
if (trailingX > 0 && SuppressTrailingXMessage (bsp)) {
2771
/* suppress if cds translation ends in '*' or 3' partial */
2772
} else if (trailingX > 1) {
2773
ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_TrailingX, "Sequence ends in %d trailing Xs", (int) trailingX);
2774
} else if (trailingX > 0) {
2775
ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_TrailingX, "Sequence ends in %d trailing X", (int) trailingX);
/* gather gene and protein labels for the stop-codon message below */
2778
cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
2779
grp = SeqMgrGetGeneXref (cds);
2781
if (grp == NULL && cds != NULL) {
2782
gene = SeqMgrGetOverlappingGene (cds->location, &genectxt);
2784
grp = (GeneRefPtr) gene->data.value.ptrvalue;
2787
if (grp != NULL && (!SeqMgrGeneIsSuppressed (grp))) {
2788
if (grp->locus != NULL)
2789
genelbl = (grp->locus);
2790
else if (grp->desc != NULL)
2791
genelbl = (grp->desc);
2792
else if (grp->syn != NULL)
2793
genelbl = (CharPtr) (grp->syn->data.ptrvalue);
2795
prot = SeqMgrGetBestProteinFeature (bsp, &protctxt);
2796
protlbl = protctxt.label;
2797
if (StringHasNoText (genelbl)) {
2800
if (StringHasNoText (protlbl)) {
2803
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_StopInProtein, "[%d] termination symbols in protein sequence (%s - %s)", terminations, genelbl, protlbl);
2809
PatchBadSequence (bsp);
2816
if ((bsp->repr == Seq_repr_seg) || (bsp->repr == Seq_repr_ref)) { /* check segmented sequence */
2817
head.choice = SEQLOC_MIX;
2818
head.data.ptrvalue = bsp->seq_ext;
2820
ValidateSeqLoc (vsp, (SeqLocPtr) & head, "Segmented Bioseq");
2821
/* check the length */
2824
while ((vnp = SeqLocFindNext (&head, vnp)) != NULL) {
2825
len2 = SeqLocLen (vnp);
2829
if (bsp->length > len) {
2830
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data too short [%ld] for given length [%ld]", (long) (len), (long) bsp->length);
2831
} else if (bsp->length < len) {
2832
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data is larger [%ld] than given length [%ld]", (long) (len), (long) bsp->length);
/* collect the printed id of every segment, sort them, and report neighbours
   that compare equal as duplicate references */
2837
while ((vnp = SeqLocFindNext (&head, vnp)) != NULL) {
2838
sip1 = SeqLocId (vnp);
2840
SeqIdWrite (sip1, buf1, PRINTID_FASTA_SHORT, 40);
2841
ValNodeCopyStr (&idlist, vnp->choice, buf1);
2844
if (idlist != NULL) {
2845
idlist = ValNodeSort (idlist, SortVnpByString);
2846
last = (CharPtr) idlist->data.ptrvalue;
2847
lastchoice = (Uint1) idlist->choice;
2849
while (vnp != NULL) {
2850
str = (CharPtr) vnp->data.ptrvalue;
2851
if (StringICmp (last, str) == 0) {
2852
if (vnp->choice == lastchoice && lastchoice == SEQLOC_WHOLE) {
2853
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_DuplicateSegmentReferences, "Segmented sequence has multiple references to %s\n", str);
2855
ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_DuplicateSegmentReferences,
2856
"Segmented sequence has multiple references to %s that are not SEQLOC_WHOLE\n", str);
2859
last = (CharPtr) vnp->data.ptrvalue;
2860
lastchoice = (Uint1) vnp->choice;
2864
ValNodeFreeData (idlist);
/* partialness of the segment locations is compared with GIBB-mod
   descriptors when the Bioseq is a protein */
2867
vsp->bsp_partial_val = SeqLocPartialCheck ((SeqLocPtr) (&head));
2868
if ((vsp->bsp_partial_val) && (ISA_aa (bsp->mol))) {
2871
got_partial = FALSE;
2872
if (vsp->useSeqMgrIndexes) {
2873
vnp = SeqMgrGetNextDescriptor (bsp, vnp, Seq_descr_modif, &context);
2875
bcp = BioseqContextNew (bsp);
2876
vnp = BioseqContextGetSeqDescr (bcp, Seq_descr_modif, vnp, NULL);
2878
while (vnp != NULL) {
2879
for (vnp2 = (ValNodePtr) (vnp->data.ptrvalue); vnp2 != NULL; vnp2 = vnp2->next) {
2880
switch (vnp2->data.intvalue) {
2881
case 10: /* partial */
2884
case 16: /* no-left */
2885
if (!(vsp->bsp_partial_val & SLP_START))
2886
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_PartialInconsistent, "GIBB-mod no-left inconsistent with segmented SeqLoc");
2889
case 17: /* no-right */
2890
if (!(vsp->bsp_partial_val & SLP_STOP))
2891
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_PartialInconsistent, "GIBB-mod no-right inconsistent with segmented SeqLoc");
2897
if (vsp->useSeqMgrIndexes) {
2898
vnp = SeqMgrGetNextDescriptor (bsp, vnp, Seq_descr_modif, &context);
2900
vnp = BioseqContextGetSeqDescr (bcp, Seq_descr_modif, vnp, NULL);
2903
if (!vsp->useSeqMgrIndexes) {
2904
BioseqContextFree (bcp);
2907
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_PartialInconsistent, "Partial segmented sequence without GIBB-mod");
/* delta sequences: walk the seq_ext chain of Seq-locs (choice 1) and
   Seq-literals (choice 2), summing lengths and checking literal residues */
2913
if (bsp->repr == Seq_repr_delta) {
2915
for (vnp = (ValNodePtr) (bsp->seq_ext); vnp != NULL; vnp = vnp->next) {
2916
if (vnp->data.ptrvalue == NULL)
2917
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_SeqDataLenWrong, "NULL pointer in delta seq_ext valnode");
2919
switch (vnp->choice) {
2920
case 1: /* SeqLocPtr */
2921
len2 = SeqLocLen ((SeqLocPtr) (vnp->data.ptrvalue));
2923
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_SeqDataLenWrong, "-1 length on seq-loc of delta seq_ext");
2927
case 2: /* SeqLitPtr */
2928
slitp = (SeqLitPtr) (vnp->data.ptrvalue);
2929
if (slitp->seq_data != NULL) {
2930
sctp = SeqCodeTableFind (slitp->seq_data_type);
2932
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidAlphabet, "Using illegal sequence alphabet [%d] in SeqLitPtr", (int) slitp->seq_data_type);
2933
len += slitp->length;
2937
start_at = (Int2) (sctp->start_at);
2938
num = (Int2) (sctp->num);
2940
switch (slitp->seq_data_type) {
2941
case Seq_code_iupacaa:
2942
case Seq_code_iupacna:
2943
case Seq_code_ncbieaa:
2944
case Seq_code_ncbistdaa:
2945
BSSeek (slitp->seq_data, 0, SEEK_SET);
2946
for (len2 = 1; len2 <= (slitp->length); len2++) {
2948
residue = BSGetByte (slitp->seq_data);
/* a residue is looked up in the code table by its offset from start_at */
2949
i = residue - start_at;
2950
if ((i < 0) || (i >= num))
2952
else if (*(sctp->names[i]) == '\0')
2955
if (slitp->seq_data_type == Seq_code_ncbistdaa)
2956
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidResidue, "Invalid residue [%d] in position [%ld]", (int) residue, (long) (len + len2));
2958
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidResidue, "Invalid residue [%c] in position [%ld]", (char) residue, (long) (len + len2));
2966
len += slitp->length;
2969
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_ExtNotAllowed, "Illegal choice [%d] in delta chain", (int) (vnp->choice));
2974
if (bsp->length > len) {
2975
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data too short [%ld] for given length [%ld]", (long) (len), (long) bsp->length);
2976
} else if (bsp->length < len) {
2977
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data is larger [%ld] than given length [%ld]", (long) (len), (long) bsp->length);
/* a delta sequence is expected to carry an HTGS technique in its MolInfo
   unless the NT_ flag is set */
2980
if (vsp->useSeqMgrIndexes) {
2981
vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &context);
2983
bcp = BioseqContextNew (bsp);
2984
vnp = BioseqContextGetSeqDescr (bcp, Seq_descr_molinfo, NULL, NULL);
2985
BioseqContextFree (bcp);
2988
mip = (MolInfoPtr) vnp->data.ptrvalue;
2990
if ((!isNT) && mip->tech != MI_TECH_htgs_0 && mip->tech != MI_TECH_htgs_1 && mip->tech != MI_TECH_htgs_2 && mip->tech != MI_TECH_htgs_3) {
2991
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadDeltaSeq, "Delta seq technique should not be [%d]", (int) (mip->tech));
/* short-sequence warnings: threshold 3 is used under ISA_aa, threshold 10
   in the other visible test; both are skipped when isPDB is set */
2997
if (ISA_aa (bsp->mol)) {
2998
if ((bsp->length <= 3) && (bsp->length >= 0) && (!isPDB)) {
2999
ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_ShortSeq, "Sequence only %ld residues", (long) (bsp->length));
3003
if ((bsp->length <= 10) && (bsp->length >= 0) && (!isPDB)) {
3004
ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_ShortSeq, "Sequence only %ld residues", (long) (bsp->length));
/* 350 kbp limit: handled separately for delta and raw representations */
3008
if (bsp->length > 350000) {
3009
if (bsp->repr == Seq_repr_delta) {
3010
isGenBankEMBLorDDBJ = FALSE;
3011
/* suppress this for data from genome annotation project */
3012
VisitBioseqsInSep (vsp->sep, (Pointer) &isGenBankEMBLorDDBJ, LookForGEDseqID);
3013
if (mip != NULL && isGenBankEMBLorDDBJ) {
3014
if (mip->tech == MI_TECH_htgs_0 || mip->tech == MI_TECH_htgs_1 || mip->tech == MI_TECH_htgs_2) {
3015
ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_LongHtgsSequence, "Phase 0, 1 or 2 HTGS sequence exceeds 350kbp limit");
3016
} else if (mip->tech == MI_TECH_htgs_3) {
3017
ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SequenceExceeds350kbp, "Phase 3 HTGS sequence exceeds 350kbp limit");
/* total the lengths of the Seq-literals in the delta chain */
3021
for (vnp = (ValNodePtr) (bsp->seq_ext); vnp != NULL; vnp = vnp->next) {
3022
if (vnp->choice == 2) {
3023
slitp = (SeqLitPtr) (vnp->data.ptrvalue);
3024
if (slitp != NULL) {
3025
if (slitp->seq_data != NULL) {
3028
len += slitp->length;
3032
if (len > 350000 && litHasData) {
3033
ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_LongLiteralSequence, "Length of sequence literals exceeds 350kbp limit");
3037
} else if (bsp->repr == Seq_repr_raw) {
3039
if (vsp->useSeqMgrIndexes) {
3040
vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &context);
3042
bcp = BioseqContextNew (bsp);
3043
vnp = BioseqContextGetSeqDescr (bcp, Seq_descr_molinfo, NULL, NULL);
3044
BioseqContextFree (bcp);
3047
mip = (MolInfoPtr) vnp->data.ptrvalue;
3050
if (mip->tech == MI_TECH_htgs_0 || mip->tech == MI_TECH_htgs_1 || mip->tech == MI_TECH_htgs_2) {
3051
ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_LongHtgsSequence, "Phase 0, 1 or 2 HTGS sequence exceeds 350kbp limit");
3052
} else if (mip->tech == MI_TECH_htgs_3) {
3053
ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SequenceExceeds350kbp, "Phase 3 HTGS sequence exceeds 350kbp limit");
3055
ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SequenceExceeds350kbp, "Length of sequence exceeds 350kbp limit");
3058
ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SequenceExceeds350kbp, "Length of sequence exceeds 350kbp limit");
3061
/* Could be a segset header bioseq that is > 350kbp */
3062
/* No-op for now? Or generate a warning? */
3066
if (bsp->repr == Seq_repr_seg) {
3067
CheckSegBspAgainstParts (vsp, gcp, bsp);
/* for an indexed protein inside a nuc-prot set, compare the stored title
   with the one produced by CreateDefLineEx */
3070
if (ISA_aa (bsp->mol) && vsp->useSeqMgrIndexes) {
3071
vnp = BioseqGetSeqDescr (bsp, Seq_descr_title, NULL);
3073
if (bsp->idx.parenttype == OBJ_BIOSEQSET) {
3074
bssp = (BioseqSetPtr) bsp->idx.parentptr;
3075
while (bssp != NULL && bssp->_class != BioseqseqSet_class_nuc_prot) {
3076
if (bssp->idx.parenttype == OBJ_BIOSEQSET) {
3077
bssp = (BioseqSetPtr) bssp->idx.parentptr;
3082
if (bssp != NULL && bssp->_class == BioseqseqSet_class_nuc_prot) {
3083
title = (CharPtr) vnp->data.ptrvalue;
3085
sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
3087
mip = (MolInfoPtr) sdp->data.ptrvalue;
3092
buf = MemNew (sizeof (Char) * (buflen + 1));
3093
MemSet ((Pointer) (&ii), 0, sizeof (ItemInfo));
3094
if (buf != NULL && CreateDefLineEx (&ii, bsp, buf, buflen, tech, NULL, NULL, TRUE)) {
3095
if (StringICmp (buf, title) != 0) {
/* point the gather context at the title descriptor while reporting, then
   restore it */
3096
olditemid = gcp->itemID;
3097
olditemtype = gcp->thistype;
3098
if (vnp->extended != 0) {
3099
ovp = (ObjValNodePtr) vnp;
3100
gcp->itemID = ovp->idx.itemID;
3101
gcp->thistype = OBJ_SEQDESC;
3103
ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_InconsistentProteinTitle, "Instantiated protein title does not match automatically generated title");
3104
gcp->itemID = olditemid;
3105
gcp->thistype = olditemtype;
3117
/*****************************************************************************
3119
* ValidatePubdesc(gcp)
3120
* Check pubdesc for missing information
3122
*****************************************************************************/
3123
/*
 * HasNoText
 *   Walks the characters of str up to the terminating NUL.  Callers in this
 *   file treat a FALSE result as "the string contains text".
 *   NOTE(review): the per-character test and the return statements are not
 *   visible in this listing - confirm which characters count as text.
 */
static Boolean HasNoText (CharPtr str)
3129
while (ch != '\0') {
3140
/*
 * HasNoName
 *   Inspects the author record stored in name->data.ptrvalue; for a person
 *   id with choice 2 it tests the first element of the name array with
 *   HasNoText.
 *   NOTE(review): the assignments of pid and nsp and all return statements
 *   are not visible in this listing.
 */
static Boolean HasNoName (ValNodePtr name)
3147
ap = name->data.ptrvalue;
3151
if (pid->choice == 2) {
3154
if (!HasNoText (nsp->names[0])) {
3165
/*
 * ValidatePubdesc
 *   Walks the publications in pdp->pub and reports missing information
 *   (citation text, title, author names, journal title, volume, pages)
 *   with ERR_GENERIC_MissingPubInfo, and an internal Pub-equiv with
 *   ERR_GENERIC_UnnecessaryPubEquiv.
 *
 *   vsp - validator state passed to ValidErr
 *   gcp - gather context (not used in the visible lines)
 *   pdp - Pubdesc to check; nothing is done when vsp or pdp is NULL
 *
 *   NOTE(review): case labels, braces and several assignments are missing
 *   from this listing, so which pub choice each section handles is not
 *   visible here.
 */
static void ValidatePubdesc (ValidStructPtr vsp, GatherContextPtr gcp, PubdescPtr pdp)
3171
Boolean hasName, hasTitle;
3173
Boolean noVol, noPages;
3178
if (vsp == NULL || pdp == NULL)
3180
for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
3181
switch (vnp->choice) {
3183
cgp = (CitGenPtr) vnp->data.ptrvalue;
3185
if (!StringHasNoText (cgp->cit)) {
/* NOTE(review): "submitted" has 9 characters but only 8 are compared, so
   any text starting with "submitte" matches - confirm this is intended */
3186
if (StringNICmp (cgp->cit, "submitted", 8) == 0 || StringNICmp (cgp->cit, "unpublished", 11) == 0 /* ||
3187
StringNICmp (cgp->cit, "in press", 8) == 0 ||
3188
StringNICmp (cgp->cit, "to be published", 15) == 0 */ ) {
3190
ValidErr (vsp, SEV_ERROR, ERR_GENERIC_MissingPubInfo, "Unpublished citation text invalid");
3196
cap = (CitArtPtr) vnp->data.ptrvalue;
/* at least one title entry must contain text */
3200
for (title = cap->title; title != NULL; title = title->next) {
3201
if (!HasNoText ((CharPtr) title->data.ptrvalue)) {
3206
ValidErr (vsp, SEV_ERROR, ERR_GENERIC_MissingPubInfo, "Publication has no title");
/* author list: choice 1 entries are tested with HasNoName, choice 2 and 3
   entries are plain strings tested with HasNoText */
3210
if (alp->choice == 1) {
3211
for (name = alp->names; name != NULL; name = name->next) {
3212
if (!HasNoName (name)) {
3216
} else if (alp->choice == 2 || alp->choice == 3) {
3217
for (name = alp->names; name != NULL; name = name->next) {
3218
if (!HasNoText ((CharPtr) name->data.ptrvalue)) {
3225
ValidErr (vsp, SEV_ERROR, ERR_GENERIC_MissingPubInfo, "Publication has no author names");
3229
switch (cap->from) {
3231
cjp = (CitJourPtr) cap->fromptr;
3234
for (title = cjp->title; title != NULL; title = title->next) {
3235
if (!HasNoText ((CharPtr) title->data.ptrvalue)) {
3240
ValidErr (vsp, SEV_ERROR, ERR_GENERIC_MissingPubInfo, "Journal title missing");
/* volume and pages are only required when the imprint is not a prepublication
   and is not marked ahead-of-print */
3244
if (imp->prepub == 0 && imp->pubstatus != PUBSTATUS_aheadofprint) {
3245
noVol = StringHasNoText (imp->volume);
3246
noPages = StringHasNoText (imp->pages);
3247
if (noVol && noPages) {
3248
ValidErr (vsp, SEV_ERROR, ERR_GENERIC_MissingPubInfo, "Journal volume and pages missing");
3250
ValidErr (vsp, SEV_ERROR, ERR_GENERIC_MissingPubInfo, "Journal volume missing");
3251
} else if (noPages) {
3252
ValidErr (vsp, SEV_ERROR, ERR_GENERIC_MissingPubInfo, "Journal pages missing");
3263
ValidErr (vsp, SEV_WARNING, ERR_GENERIC_UnnecessaryPubEquiv, "Publication has unexpected internal Pub-equiv");
3271
/*
 * ValidateSfpCit
 *   Scans the publication list attached to a feature citation and warns
 *   (ERR_SEQ_FEAT_UnnecessaryCitPubEquiv) for each entry whose choice is
 *   PUB_Equiv.  Does nothing when vsp, sfp or sfp->cit is NULL.
 *   NOTE(review): the assignment of psp is not visible in this listing.
 */
static void ValidateSfpCit (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPtr sfp)
3276
if (vsp == NULL || sfp == NULL || sfp->cit == NULL)
3281
for (ppr = (ValNodePtr) psp->data.ptrvalue; ppr != NULL; ppr = ppr->next) {
3282
if (ppr->choice == PUB_Equiv) {
3283
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnnecessaryCitPubEquiv, "Citation on feature has unexpected internal Pub-equiv");
3289
/*
 * BioseqValidStr
 *   Per-Bioseq state shared by the feature and descriptor context checks in
 *   this file.  The last_* members remember the previously seen value or
 *   descriptor of each kind so that a later one can be compared against it;
 *   the num_full_length_* members are counters.
 *   NOTE(review): some members (for example got_a_pub, which is assigned
 *   elsewhere in this file) are not visible in this listing.
 */
typedef struct bioseqvalid
3292
Boolean is_aa; /* bioseq is protein? */
3293
Boolean is_mrna; /* molinfo is mrna? */
3294
Boolean is_prerna; /* molinfo is precursor rna? */
3296
int last_na_mol, last_na_mod, last_organelle, last_partialness, last_left_right, last_biomol, last_tech, last_completeness, num_full_length_src_feat, /* number full length src feats */
3297
num_full_length_prot_ref;
/* most recently seen descriptor of each kind */
3298
ValNodePtr last_gb, last_embl, last_prf, last_pir, last_sp, last_pdb, last_create, last_update, last_biosrc, last_orgref;
/* gather context */
3300
GatherContextPtr gcp;
3302
BioseqValidStr , PNTR BioseqValidStrPtr;
3304
/*****************************************************************************
3306
* ValidateSeqFeatContext(gcp)
3307
* Gather callback helper function for validating context on a Bioseq
3309
*****************************************************************************/
3310
/*
 * ValidateSeqFeatCommon
 *   Shared context checks for one feature, called by both
 *   ValidateSeqFeatIndexed and ValidateSeqFeatContext.
 *
 *   sfp         - feature being checked
 *   bvsp        - per-Bioseq state (is_mrna, is_prerna, counters, got_a_pub)
 *   vsp         - validator state passed to ValidErr
 *   left, right - feature extremes; compared with 0 and length - 1 to count
 *                 full-length protein features
 *   featitemid  - when > 0, the gather context is pointed at this feature
 *                 item while reporting and restored at the end
 *   farloc      - TRUE triggers the far-location warning unless is_nc is set
 *   bsp         - Bioseq carrying the feature; may be NULL
 *
 *   NOTE(review): return statements, case labels and several conditions are
 *   missing from this listing, so the returned value is not visible here.
 */
static Boolean ValidateSeqFeatCommon (SeqFeatPtr sfp, BioseqValidStrPtr bvsp, ValidStructPtr vsp,
3311
Int4 left, Int4 right, Uint2 featitemid, Boolean farloc, BioseqPtr bsp)
3313
GatherContextPtr gcp = NULL;
3315
Uint2 olditemtype = 0;
3316
Uint2 olditemid = 0;
3322
Boolean on_seg = FALSE;
3323
Boolean is_nc = FALSE;
3324
ErrSev sev = SEV_ERROR;
/* temporarily retarget the gather context at the feature being reported */
3330
if (featitemid > 0) {
3333
olditemid = gcp->itemID;
3334
olditemtype = gcp->thistype;
3335
gcp->itemID = featitemid;
3336
gcp->thistype = OBJ_SEQFEAT;
/* look for a SEQID_OTHER accession that starts with "NT_" */
3341
for (sip = bsp->id; sip != NULL; sip = sip->next) {
3342
if (sip->choice == SEQID_OTHER) {
3343
tsip = (TextSeqIdPtr) sip->data.ptrvalue;
3344
if (tsip != NULL && tsip->accession != NULL) {
3345
if (StringNICmp (tsip->accession, "NT_", 3) == 0) {
/* count protein features that span the whole Bioseq */
3354
if (sfp->data.choice == SEQFEAT_PROT) {
3355
if ((left == 0) && (right == ((vsp->bsp->length) - 1)))
3356
bvsp->num_full_length_prot_ref++;
3359
switch (sfp->data.choice) {
3360
case SEQFEAT_CDREGION:
3363
case SEQFEAT_TXINIT:
3364
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidForType, "Invalid feature for a protein Bioseq.");
3371
switch (sfp->data.choice) {
3373
case SEQFEAT_PSEC_STR:
3374
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidForType, "Invalid feature for a nucleotide Bioseq.");
/* feature types that do not belong on an mRNA or pre-RNA Bioseq */
3382
if (bvsp->is_mrna) {
3383
switch (sfp->data.choice) {
3385
rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
3386
if (rrp != NULL && rrp->type == 2) {
3387
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidForType, "mRNA feature is invalid on an mRNA (cDNA) Bioseq.");
3391
ifp = (ImpFeatPtr) sfp->data.value.ptrvalue;
3392
if (ifp != NULL && ifp->key != NULL && (!HasNoText (ifp->key))) {
3393
if (StringCmp (ifp->key, "intron") == 0 || StringCmp (ifp->key, "CAAT_signal") == 0) {
3394
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidForType, "Invalid feature for an mRNA Bioseq.");
3401
} else if (bvsp->is_prerna) {
3402
switch (sfp->data.choice) {
3404
ifp = (ImpFeatPtr) sfp->data.value.ptrvalue;
3405
if (ifp != NULL && ifp->key != NULL && (!HasNoText (ifp->key))) {
3406
if (StringCmp (ifp->key, "CAAT_signal") == 0) {
3407
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidForType, "Invalid feature for an pre-RNA Bioseq.");
3416
if (farloc && (! is_nc)) {
3417
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FarLocation, "Feature has 'far' location - accession not packaged in record");
/* a pub feature, or any feature with a citation, marks the Bioseq as
   having a publication */
3420
if ((sfp->data.choice == SEQFEAT_PUB) || (sfp->cit != NULL))
3421
bvsp->got_a_pub = TRUE;
3423
str = (CharPtr) sfp->comment;
3424
if (SerialNumberInString (str)) {
3425
ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_SerialInComment,
3426
"Feature comment may refer to reference by serial number - attach reference specific comments to the reference REMARK instead.");
/* on a segmented Bioseq, test each location piece against the ids of the
   segmented Bioseq itself */
3429
if (bsp != NULL && bsp->repr == Seq_repr_seg) {
3430
slp = SeqLocFindNext (sfp->location, NULL);
3431
while (slp != NULL) {
3432
sip = SeqLocId (slp);
3434
if (SeqIdIn (sip, bsp->id)) {
3438
slp = SeqLocFindNext (sfp->location, slp);
3444
ValidErr (vsp, sev, ERR_SEQ_FEAT_LocOnSegmentedBioseq, "Feature location on segmented bioseq, not on parts");
/* restore the gather context saved at the top */
3449
gcp->itemID = olditemid;
3450
gcp->thistype = olditemtype;
3456
/*
 * CheckMultiIntervalGene
 *   Reports ERR_SEQ_FEAT_MultiIntervalGene for a gene feature whose indexed
 *   context has two or more intervals.  Returns without reporting when sfp,
 *   context or vsp is NULL, when SeqLocId returns NULL for the feature
 *   location, or when context->numivals is below 2.  While reporting, the
 *   gather context is pointed at the feature item and then restored.
 *
 *   NOTE(review): gcp is not part of the visible NULL guard but is
 *   dereferenced below; lines are missing from this listing around those
 *   accesses, so confirm that they are guarded.
 */
static void CheckMultiIntervalGene (SeqFeatPtr sfp, SeqMgrFeatContextPtr context, ValidStructPtr vsp, GatherContextPtr gcp)
3459
Uint2 olditemtype = 0;
3460
Uint2 olditemid = 0;
3462
if (sfp == NULL || context == NULL || vsp == NULL) return;
3463
if (SeqLocId (sfp->location) == NULL) return;
3464
if (context->numivals < 2) return;
/* save the current item, then retarget the context at this feature */
3467
olditemid = gcp->itemID;
3468
olditemtype = gcp->thistype;
3469
gcp->itemID = context->itemID;
3470
gcp->thistype = OBJ_SEQFEAT;
3473
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_MultiIntervalGene, "Gene feature on non-segmented sequence should not have multiple intervals");
/* restore the saved item */
3476
gcp->itemID = olditemid;
3477
gcp->thistype = olditemtype;
3481
/*
 * ValidateSeqFeatIndexed
 *   Feature callback for the indexed path.  The BioseqValidStrPtr is taken
 *   from context->userdata; gene features are first passed to
 *   CheckMultiIntervalGene, then every feature goes to
 *   ValidateSeqFeatCommon with the extremes, item id, far-location flag and
 *   Bioseq taken from the feature context.  Returns whatever
 *   ValidateSeqFeatCommon returns.
 *   NOTE(review): the assignment of vsp is not visible in this listing.
 */
static Boolean LIBCALLBACK ValidateSeqFeatIndexed (SeqFeatPtr sfp, SeqMgrFeatContextPtr context)
3484
BioseqValidStrPtr bvsp;
3486
bvsp = (BioseqValidStrPtr) context->userdata;
3489
if (sfp->data.choice == SEQFEAT_GENE) {
3490
CheckMultiIntervalGene (sfp, context, vsp, vsp->gcp);
3493
return ValidateSeqFeatCommon (sfp, bvsp, vsp, context->left, context->right, context->itemID, context->farloc, context->bsp);
3496
/*
 * ValidateSeqFeatContext
 *   Gather-based counterpart of ValidateSeqFeatIndexed: takes the
 *   BioseqValidStrPtr from gcp->userdata and the feature from gcp->thisitem,
 *   then calls ValidateSeqFeatCommon with the gather extremes, item id 0,
 *   farloc FALSE and no Bioseq.  The result of that call is ignored.
 *   NOTE(review): the assignment of vsp is not visible in this listing.
 */
static void ValidateSeqFeatContext (GatherContextPtr gcp)
3499
BioseqValidStrPtr bvsp;
3502
bvsp = (BioseqValidStrPtr) (gcp->userdata);
3504
sfp = (SeqFeatPtr) (gcp->thisitem);
3506
ValidateSeqFeatCommon (sfp, bvsp, vsp, gcp->extremes.left, gcp->extremes.right, 0, FALSE, NULL);
3509
/*****************************************************************************
3511
* CountryIsValid(name)
3512
* Validates subsource country against official country names
3514
*****************************************************************************/
3516
/*
 * Table of accepted country names, consulted by CountryIsValid.
 * NOTE(review): CountryIsValid probes this table at a computed midpoint with
 * StringICmp, which looks like a binary search - presumably the entries must
 * stay in case-insensitive sorted order; confirm before adding names.  Only
 * part of the table is visible in this listing.
 */
static CharPtr countrycodes[] = {
3525
"Antigua and Barbuda",
3529
"Ashmore and Cartier Islands",
3546
"Bosnia and Herzegovina",
3550
"British Virgin Islands",
3560
"Central African Republic",
3565
"Clipperton Island",
3570
"Coral Sea Islands",
3577
"Democratic Republic of the Congo",
3581
"Dominican Republic",
3585
"Equatorial Guinea",
3590
"Falkland Islands (Islas Malvinas)",
3597
"French Southern and Antarctic Lands",
3617
"Heard Island and McDonald Islands",
3638
"Juan de Nova Island",
3682
"Netherlands Antilles",
3691
"Northern Mariana Islands",
3708
"Republic of the Congo",
3714
"Saint Kitts and Nevis",
3716
"Saint Pierre and Miquelon",
3717
"Saint Vincent and the Grenadines",
3720
"Sao Tome and Principe",
3731
"South Georgia and the South Sandwich Islands",
3750
"Trinidad and Tobago",
3755
"Turks and Caicos Islands",
3759
"United Arab Emirates",
3769
"Wallis and Futuna",
3778
/*
 * CountryIsValid
 *   Copies name into a local buffer, locates the first ':' in the copy, and
 *   compares the result case-insensitively against the countrycodes table.
 *   NOTE(review): what is done at the ':' position, the search loop and the
 *   return statements are not visible in this listing.
 */
static Boolean CountryIsValid (CharPtr name)
3784
if (StringHasNoText (name))
3786
StringNCpy_0 (str, name, sizeof (str));
3787
ptr = StringChr (str, ':');
/* R starts at the number of table entries, i.e. one past the last index */
3793
R = sizeof (countrycodes) / sizeof (countrycodes[0]);
3797
if (StringICmp (countrycodes[mid], str) < 0) {
/* NOTE(review): if the search can finish with R still equal to the entry
   count, this reads one element past the table - confirm against the
   missing loop lines and whether the table has a terminating entry */
3804
if (StringICmp (countrycodes[R], str) == 0) {
3811
/*****************************************************************************
3813
* ValidateSeqDescrContext(gcp)
3814
* Gather callback helper function for validating context on a Bioseq
3816
*****************************************************************************/
3817
/*
 * ValidateBioSource
 *   Checks one BioSource and reports problems through ValidErr: country
 *   names not accepted by CountryIsValid, unknown subsource subtypes,
 *   multiple chromosome qualifiers, missing organism name or lineage,
 *   organelle/lineage mismatches, unknown orgmod subtypes, illegal db_xref
 *   types and, only when the "InternalNcbiSequin" application property is
 *   set, a missing taxon id.
 *
 *   vsp  - validator state passed to ValidErr
 *   gcp  - gather context (not used in the visible lines)
 *   biop - BioSource to check
 *
 *   NOTE(review): the assignments of orp, onp, omp and id, plus braces and
 *   several conditions, are missing from this listing.
 */
static void ValidateBioSource (ValidStructPtr vsp, GatherContextPtr gcp, BioSourcePtr biop)
3819
Boolean chromconf = FALSE;
3820
Int2 chromcount = 0;
3821
SubSourcePtr chromosome = NULL;
3822
CharPtr countryname;
/* subsource qualifiers: country, chromosome, and unknown subtype 0 */
3834
ssp = biop->subtype;
3835
while (ssp != NULL) {
3836
if (ssp->subtype == SUBSRC_country) {
3837
if (!CountryIsValid (ssp->name)) {
3838
countryname = ssp->name;
3839
if (StringHasNoText (countryname)) {
3842
ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadCountryCode, "Bad country name [%s]", countryname);
3844
} else if (ssp->subtype == SUBSRC_chromosome) {
/* a later chromosome qualifier is compared with the one remembered in chromosome */
3846
if (chromosome != NULL) {
3847
if (StringICmp (ssp->name, chromosome->name) != 0) {
3853
} else if (ssp->subtype == 0) {
3854
ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadSubSource, "Unknown subsource subtype %d", (int) (ssp->subtype));
3858
if (chromcount > 1) {
3860
ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleChromosomes, "Multiple conflicting chromosome qualifiers");
3862
ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleChromosomes, "Multiple identical chromosome qualifiers");
3866
if (orp == NULL || (StringHasNoText (orp->taxname) && StringHasNoText (orp->common))) {
3867
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_NoOrgFound, "No organism name has been applied to this Bioseq.");
3872
if (onp == NULL || StringHasNoText (onp->lineage)) {
3873
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_MissingLineage, "No lineage for this BioSource.");
/* organelle type must be consistent with the lineage text */
3875
if (biop->genome == GENOME_kinetoplast) {
3876
if (StringStr (onp->lineage, "Kinetoplastida") == 0) {
3877
ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadOrganelle, "Only Kinetoplastida have kinetoplasts");
3879
} else if (biop->genome == GENOME_nucleomorph) {
/* NOTE(review): "Chlorarchniophyta" looks like a misspelling of the
   chlorarachniophyte group name; if so this substring can never match a
   correctly spelled lineage - confirm against the lineage strings supplied
   by the taxonomy source */
3880
if (StringStr (onp->lineage, "Chlorarchniophyta") == 0 && StringStr (onp->lineage, "Cryptophyta") == 0) {
3881
ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadOrganelle, "Only Chlorarchniophyta and Cryptophyta have nucleomorphs");
/* orgmod subtypes 0 and 1 are reported as unknown */
3887
while (omp != NULL) {
3888
if (omp->subtype == 0 || omp->subtype == 1) {
3889
ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadOrgMod, "Unknown orgmod subtype %d", (int) (omp->subtype));
/* each db_xref database name is looked up in the dbtag table */
3895
for (db = orp->db; db != NULL; db = db->next) {
3897
dbt = (DbtagPtr) db->data.ptrvalue;
3898
if (dbt != NULL && dbt->db != NULL) {
3899
for (i = 0; i < DBNUM; i++) {
3900
if (StringCmp (dbt->db, dbtag [i]) == 0) {
3905
if (id == -1 || id < 4) {
3906
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref, "Illegal db_xref type %s", dbt->db);
/* the taxon id check below runs only when this application property is set */
3911
if (GetAppProperty ("InternalNcbiSequin") == NULL) return;
3913
for (db = orp->db; db != NULL; db = db->next) {
3914
dbt = (DbtagPtr) db->data.ptrvalue;
3916
if (StringICmp (dbt->db, "taxon") == 0)
3920
ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_NoTaxonID, "BioSource is missing taxon ID");
3923
/*
 * IsXr - tests whether a descriptor is attached directly to a Bioseq that
 * carries an "XR_" accession.
 *
 * sdp is treated as an ObjValNode only when its 'extended' flag is non-zero.
 * FALSE is returned early when the descriptor is not extended, when its
 * indexed parent is not a Bioseq, or when the parent pointer is NULL.
 * TRUE is returned for the first SEQID_OTHER id whose accession starts with
 * "XR_" (case-insensitive, first three characters only).
 * NOTE(review): the closing lines of this function are not visible in this
 * view; presumably it returns FALSE when no id matches - confirm.
 */
static Boolean IsXr (ValNodePtr sdp)
3931
if (sdp->extended == 0) return FALSE;
3932
ovp = (ObjValNodePtr) sdp;
3933
if (ovp->idx.parenttype != OBJ_BIOSEQ) return FALSE;
3934
bsp = (BioseqPtr) ovp->idx.parentptr;
3935
if (bsp == NULL) return FALSE;
3936
for (sip = bsp->id; sip != NULL; sip = sip->next) {
3937
/* only SEQID_OTHER ids are inspected; every other id type is skipped */
if (sip->choice != SEQID_OTHER) continue;
3938
tsip = (TextSeqIdPtr) sip->data.ptrvalue;
3939
if (tsip == NULL) continue;
3940
if (StringNICmp (tsip->accession, "XR_", 3) == 0) return TRUE;
3945
static Boolean ValidateSeqDescrCommon (ValNodePtr sdp, BioseqValidStrPtr bvsp, ValidStructPtr vsp, Uint2 descitemid)
3947
ValNodePtr vnp, vnp2;
3948
OrgRefPtr this_org = NULL, that_org = NULL;
3950
Char buf1[20], buf2[20];
3953
Uint2 olditemtype = 0;
3954
Uint2 olditemid = 0;
3956
GatherContextPtr gcp = NULL;
3958
static char *badmod = "Inconsistent GIBB-mod [%d] and [%d]";
3964
if (descitemid > 0) {
3967
olditemid = gcp->itemID;
3968
olditemtype = gcp->thistype;
3969
gcp->itemID = descitemid;
3970
gcp->thistype = OBJ_SEQDESC;
3974
switch (vnp->choice) {
3975
case Seq_descr_mol_type:
3976
tmpval = (int) (vnp->data.intvalue);
3978
case 8: /* peptide */
3980
/* GIBB-mol says peptide on a sequence that is not a protein (message spelling corrected) */
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Nucleic acid with GIBB-mol = peptide");
3982
case 0: /* unknown */
3983
case 255: /* other */
3984
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "GIBB-mol unknown or other used");
3986
default: /* the rest are nucleic acid */
3988
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "GIBB-mol [%d] used on protein", tmpval);
3990
if (bvsp->last_na_mol) {
3991
if (bvsp->last_na_mol != (int) vnp->data.intvalue) {
3992
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Inconsistent GIBB-mol [%d] and [%d]", bvsp->last_na_mol, tmpval);
3995
bvsp->last_na_mol = tmpval;
4000
case Seq_descr_modif:
4001
for (vnp2 = (ValNodePtr) (vnp->data.ptrvalue); vnp2 != NULL; vnp2 = vnp2->next) {
4002
tmpval = (int) (vnp2->data.intvalue);
4006
if (bvsp->is_aa) { /* only temporarily on 0 */
4007
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Nucleic acid GIBB-mod [%d] on protein", tmpval);
4008
} else if (bvsp->last_na_mod) {
4009
if (tmpval != bvsp->last_na_mod) {
4010
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, badmod, bvsp->last_na_mod, tmpval);
4013
bvsp->last_na_mod = tmpval;
4015
case 4: /* mitochondria */
4016
case 5: /* chloroplast */
4017
case 6: /* kinetoplast */
4018
case 7: /* cyanelle */
4019
case 18: /* macronuclear */
4020
/*
 * Organelle modifiers must agree with the organelle modifier already
 * recorded for this Bioseq.  The comparison has to use last_organelle,
 * the same value that is printed in the message; comparing against
 * last_na_mod (maintained by a different case group) reported an
 * inconsistency even when the two organelle modifiers were identical.
 */
if (bvsp->last_organelle) {
4021
if (tmpval != bvsp->last_organelle) {
4022
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, badmod, bvsp->last_organelle, tmpval);
4025
bvsp->last_organelle = tmpval;
4027
case 10: /* partial */
4028
case 11: /* complete */
4029
if (bvsp->last_partialness) {
4030
if (tmpval != bvsp->last_partialness) {
4031
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, badmod, bvsp->last_partialness, tmpval);
4034
bvsp->last_partialness = tmpval;
4035
if ((bvsp->last_left_right) && (tmpval == 11)) {
4036
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, badmod, bvsp->last_left_right, tmpval);
4039
case 16: /* no left */
4040
case 17: /* no right */
4041
if (bvsp->last_partialness == 11) { /* complete */
4042
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, badmod, bvsp->last_partialness, tmpval);
4044
bvsp->last_left_right = tmpval;
4046
case 255: /* other */
4047
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Unknown, "GIBB-mod = other used");
4055
case Seq_descr_method:
4057
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Nucleic acid with protein sequence method");
4060
case Seq_descr_comment:
4061
str = (CharPtr) vnp->data.ptrvalue;
4062
if (SerialNumberInString (str)) {
4063
ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_SerialInComment,
4064
"Comment may refer to reference by serial number - attach reference specific comments to the reference REMARK instead.");
4067
case Seq_descr_genbank:
4068
if (bvsp->last_gb != NULL)
4069
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Multiple GenBank blocks");
4071
bvsp->last_gb = vnp;
4073
case Seq_descr_embl:
4074
if (bvsp->last_embl != NULL)
4075
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Multiple EMBL blocks");
4077
bvsp->last_embl = vnp;
4080
if (bvsp->last_pir != NULL)
4081
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Multiple PIR blocks");
4083
bvsp->last_pir = vnp;
4086
if (bvsp->last_sp != NULL)
4087
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Multiple SWISS-PROT blocks");
4089
bvsp->last_sp = vnp;
4092
if (bvsp->last_pdb != NULL)
4093
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Multiple PDB blocks");
4095
bvsp->last_pdb = vnp;
4098
if (bvsp->last_prf != NULL)
4099
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Multiple PRF blocks");
4101
bvsp->last_prf = vnp;
4103
case Seq_descr_create_date:
4104
if (bvsp->last_create != NULL) {
4105
tmpval = (int) DateMatch ((DatePtr) vnp->data.ptrvalue, (DatePtr) (bvsp->last_create->data.ptrvalue), FALSE);
4107
DatePrint ((DatePtr) (vnp->data.ptrvalue), buf1);
4108
DatePrint ((DatePtr) (bvsp->last_create->data.ptrvalue), buf2);
4109
ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_Inconsistent, "Inconsistent create_dates [%s] and [%s]", buf1, buf2);
4112
bvsp->last_create = vnp;
4113
if (bvsp->last_update != NULL) {
4114
tmpval = (int) DateMatch ((DatePtr) vnp->data.ptrvalue, (DatePtr) (bvsp->last_update->data.ptrvalue), FALSE);
4116
DatePrint ((DatePtr) (vnp->data.ptrvalue), buf1);
4117
DatePrint ((DatePtr) (bvsp->last_update->data.ptrvalue), buf2);
4118
ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_Inconsistent, "Inconsistent create_date [%s] and update_date [%s]", buf1, buf2);
4122
case Seq_descr_update_date:
4123
if (bvsp->last_create != NULL) {
4124
tmpval = (int) DateMatch ((DatePtr) bvsp->last_create->data.ptrvalue, (DatePtr) (vnp->data.ptrvalue), FALSE);
4126
DatePrint ((DatePtr) (bvsp->last_create->data.ptrvalue), buf1);
4127
DatePrint ((DatePtr) (vnp->data.ptrvalue), buf2);
4128
ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_Inconsistent, "Inconsistent create_date [%s] and update_date [%s]", buf1, buf2);
4131
if (bvsp->last_update == NULL)
4132
bvsp->last_update = vnp;
4134
case Seq_descr_source:
4135
biop = (BioSourcePtr) vnp->data.ptrvalue;
4136
/* ValidateBioSource (vsp, gcp, biop); */
4137
this_org = biop->org;
4138
/* fall into Seq_descr_org */
4140
if (this_org == NULL)
4141
this_org = (OrgRefPtr) (vnp->data.ptrvalue);
4142
if (bvsp->last_org != NULL) {
4143
if ((this_org->taxname != NULL) && (bvsp->last_org->taxname != NULL)) {
4144
if (StringCmp (this_org->taxname, bvsp->last_org->taxname)) {
4145
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Inconsistent taxnames [%s] and [%s]", this_org->taxname, bvsp->last_org->taxname);
4149
bvsp->last_org = this_org;
4150
for (vnp2 = vnp->next; vnp2 != NULL; vnp2 = vnp2->next) {
4151
if (vnp2->choice == Seq_descr_source || vnp2->choice == Seq_descr_org) {
4153
if (vnp2->choice == Seq_descr_source) {
4154
that_org = ((BioSourcePtr) (vnp2->data.ptrvalue))->org;
4156
if (that_org == NULL) {
4157
that_org = (OrgRefPtr) (vnp2->data.ptrvalue);
4159
if (that_org != NULL) {
4160
if ((this_org->taxname != NULL) && (that_org->taxname != NULL) && StringCmp (this_org->taxname, that_org->taxname) == 0) {
4161
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_MultipleBioSources, "Undesired multiple source descriptors");
4168
bvsp->got_a_pub = TRUE;
4169
pdp = (PubdescPtr) vnp->data.ptrvalue;
4171
ValidatePubdesc (vsp, pdp);
4174
case Seq_descr_molinfo:
4175
mip = (MolInfoPtr) vnp->data.ptrvalue;
4177
switch (mip->biomol) {
4178
case MOLECULE_TYPE_PEPTIDE: /* peptide */
4180
/* Molinfo-biomol says peptide on a sequence that is not a protein (message spelling corrected) */
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Nucleic acid with Molinfo-biomol = peptide");
4183
case 0: /* unknown */
4184
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Molinfo-biomol unknown used");
4186
case 255: /* other */
4188
ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_InvalidForType, "Molinfo-biomol other used");
4191
default: /* the rest are nucleic acid */
4193
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Molinfo-biomol [%d] used on protein", (int) mip->biomol);
4195
if (bvsp->last_biomol) {
4196
if (bvsp->last_biomol != (int) mip->biomol) {
4197
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Inconsistent Molinfo-biomol [%d] and [%d]", bvsp->last_biomol, (int) mip->biomol);
4200
bvsp->last_biomol = (int) mip->biomol;
4206
switch (mip->tech) {
4207
case MI_TECH_concept_trans:
4208
case MI_TECH_seq_pept:
4210
case MI_TECH_seq_pept_overlap:
4211
case MI_TECH_seq_pept_homol:
4212
case MI_TECH_concept_trans_a:
4213
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Nucleic acid with protein sequence method");
4219
switch (mip->tech) {
4222
case MI_TECH_genemap:
4223
case MI_TECH_physmap:
4224
case MI_TECH_htgs_1:
4225
case MI_TECH_htgs_2:
4226
case MI_TECH_htgs_3:
4227
case MI_TECH_fli_cdna:
4228
case MI_TECH_htgs_0:
4230
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Protein with nucleic acid sequence method");
4236
if (bvsp->last_tech) {
4237
if (bvsp->last_tech != (int) mip->tech) {
4238
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Inconsistent Molinfo-tech [%d] and [%d]", bvsp->last_tech, (int) mip->tech);
4241
bvsp->last_tech = (int) mip->tech;
4243
if (bvsp->last_completeness) {
4244
if (bvsp->last_completeness != (int) mip->completeness) {
4245
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Inconsistent Molinfo-completeness [%d] and [%d]",
4246
bvsp->last_completeness, (int) mip->completeness);
4249
bvsp->last_completeness = (int) mip->completeness;
4259
gcp->itemID = olditemid;
4260
gcp->thistype = olditemtype;
4266
/*
 * ValidateSeqDescrIndexed - descriptor callback for the indexed code path.
 *
 * Recovers the per-Bioseq validation state from context->userdata and
 * forwards the descriptor to ValidateSeqDescrCommon, passing context->itemID
 * as the descriptor item id.  The value returned by ValidateSeqDescrCommon is
 * returned unchanged.
 * NOTE(review): the assignment of 'vsp' is on a line not visible here.
 */
static Boolean LIBCALLBACK ValidateSeqDescrIndexed (ValNodePtr sdp, SeqMgrDescContextPtr context)
4269
BioseqValidStrPtr bvsp;
4271
bvsp = (BioseqValidStrPtr) context->userdata;
4274
return ValidateSeqDescrCommon (sdp, bvsp, vsp, context->itemID);
4277
/*
 * ValidateSeqDescrContext - descriptor validation for the Gather code path.
 *
 * Takes the validation state from gcp->userdata and the descriptor from
 * gcp->thisitem, then calls ValidateSeqDescrCommon with a descriptor item id
 * of 0.  The result of ValidateSeqDescrCommon is ignored.
 * NOTE(review): the assignment of 'vsp' is on a line not visible here.
 */
static void ValidateSeqDescrContext (GatherContextPtr gcp)
4280
BioseqValidStrPtr bvsp;
4283
bvsp = (BioseqValidStrPtr) (gcp->userdata);
4285
sdp = (ValNodePtr) (gcp->thisitem);
4287
ValidateSeqDescrCommon (sdp, bvsp, vsp, 0);
4290
/*****************************************************************************
4292
* ValidateBioseqContextGather(gcp)
4293
* Gather callback for validating context on a Bioseq
4295
*****************************************************************************/
4296
/*
 * ValidateCitSub - checks a submission citation for author names.
 *
 * Walks the author list 'alp' and reports ERR_GENERIC_MissingPubInfo at
 * SEV_ERROR with "Submission citation has no author names".
 * Names are tested with HasNoName when alp->choice is 1, and as plain text
 * with HasNoText when alp->choice is 2 or 3.
 * NOTE(review): how 'alp' is obtained from csp, how hasName is updated, and
 * the condition guarding the ValidErr call are on lines not visible here;
 * presumably the error is raised only when hasName stays FALSE - confirm.
 */
static void ValidateCitSub (ValidStructPtr vsp, CitSubPtr csp)
4300
Boolean hasName = FALSE;
4302
if (vsp == NULL || csp == NULL)
4306
if (alp->choice == 1) {
4307
for (name = alp->names; name != NULL; name = name->next) {
4308
if (!HasNoName (name)) {
4312
} else if (alp->choice == 2 || alp->choice == 3) {
4313
for (name = alp->names; name != NULL; name = name->next) {
4314
if (!HasNoText ((CharPtr) name->data.ptrvalue)) {
4321
ValidErr (vsp, SEV_ERROR, ERR_GENERIC_MissingPubInfo, "Submission citation has no author names");
4325
/*
 * DifferentDbxrefs - compares the first Dbtag of two dbxref lists.
 *
 * The comparison looks at, in order:
 *   - the database names (case-insensitive), and
 *   - the object ids: by numeric id when neither has a string, otherwise by
 *     string (case-insensitive).
 * NULL lists, NULL Dbtags and NULL object ids are tested for explicitly
 * before being dereferenced.
 * NOTE(review): the values returned from each branch, and the assignments of
 * oip1/oip2, are on lines not visible here.  The caller's comment ("do not
 * report if both have dbxrefs and they are different") suggests TRUE means
 * "both present and different" - confirm.
 */
static Boolean DifferentDbxrefs (ValNodePtr dbxref1, ValNodePtr dbxref2)
4327
DbtagPtr dbt1, dbt2;
4328
ObjectIdPtr oip1, oip2;
4330
if (dbxref1 == NULL || dbxref2 == NULL)
4332
dbt1 = (DbtagPtr) dbxref1->data.ptrvalue;
4333
dbt2 = (DbtagPtr) dbxref2->data.ptrvalue;
4334
if (dbt1 == NULL || dbt2 == NULL)
4336
if (StringICmp (dbt1->db, dbt2->db) != 0)
4340
if (oip1 == NULL || oip2 == NULL)
4342
/* neither id has a string form: fall back to the numeric ids */
if (oip1->str == NULL && oip2->str == NULL) {
4343
if (oip1->id != oip2->id)
4346
if (StringICmp (oip1->str, oip2->str) != 0)
4352
/*
 * GPSorNTorNC - context test used to adjust the severity of some messages.
 *
 * Two things are examined:
 *   - whether the top Seq-entry 'sep' is a Bioseq-set whose class is
 *     BioseqseqSet_class_gen_prod_set, and
 *   - whether the Bioseq found from 'location' has a SEQID_OTHER id whose
 *     accession starts with "NT_" or "NC_" (case-insensitive).
 * NOTE(review): the return statements are on lines not visible here;
 * presumably any of these conditions yields TRUE - confirm.  A NULL check on
 * the result of BioseqFindFromSeqLoc is likewise not visible.
 */
static Boolean GPSorNTorNC (SeqEntryPtr sep, SeqLocPtr location)
4359
if (sep != NULL && IS_Bioseq_set (sep)) {
4360
bssp = (BioseqSetPtr) sep->data.ptrvalue;
4361
if (bssp != NULL && bssp->_class == BioseqseqSet_class_gen_prod_set) {
4365
bsp = BioseqFindFromSeqLoc (location);
4367
for (sip = bsp->id; sip != NULL; sip = sip->next) {
4368
if (sip->choice == SEQID_OTHER) {
4369
tsip = (TextSeqIdPtr) sip->data.ptrvalue;
4370
if (tsip != NULL && tsip->accession != NULL) {
4371
if (StringNICmp (tsip->accession, "NT_", 3) == 0) {
4373
} else if (StringNICmp (tsip->accession, "NC_", 3) == 0) {
4383
/*
 * NGorNT - inspects the accession prefix of the Bioseq found from 'location'.
 *
 * Each SEQID_OTHER id with a non-NULL accession is tested, case-insensitively,
 * for the prefixes "NT_", "NG_" and "NC_".  The "NC_" branch is taken only
 * when the optional out-parameter is_nc is non-NULL.
 * NOTE(review): what each branch returns or stores through is_nc is on lines
 * not visible here; presumably *is_nc is initialised in the first 'if' and
 * set for NC_ accessions - confirm.  'sep' is not referenced by any visible
 * line.
 */
static Boolean NGorNT (SeqEntryPtr sep, SeqLocPtr location, BoolPtr is_nc)
4389
if (is_nc != NULL) {
4392
bsp = BioseqFindFromSeqLoc (location);
4394
for (sip = bsp->id; sip != NULL; sip = sip->next) {
4395
if (sip->choice == SEQID_OTHER) {
4396
tsip = (TextSeqIdPtr) sip->data.ptrvalue;
4397
if (tsip != NULL && tsip->accession != NULL) {
4398
if (StringNICmp (tsip->accession, "NT_", 3) == 0) {
4400
} else if (StringNICmp (tsip->accession, "NG_", 3) == 0) {
4402
} else if (StringNICmp (tsip->accession, "NC_", 3) == 0 && is_nc != NULL) {
4412
/*
 * GPSorRefSeq - context test on the enclosing set and the located Bioseq.
 *
 * Examines whether 'sep' is a Bioseq-set of class
 * BioseqseqSet_class_gen_prod_set, and whether the Bioseq found from
 * 'location' carries any id of type SEQID_OTHER.
 * NOTE(review): the return statements are on lines not visible here;
 * presumably either condition yields TRUE - confirm.
 */
static Boolean GPSorRefSeq (SeqEntryPtr sep, SeqLocPtr location)
4418
if (sep != NULL && IS_Bioseq_set (sep)) {
4419
bssp = (BioseqSetPtr) sep->data.ptrvalue;
4420
if (bssp != NULL && bssp->_class == BioseqseqSet_class_gen_prod_set) {
4424
bsp = BioseqFindFromSeqLoc (location);
4426
for (sip = bsp->id; sip != NULL; sip = sip->next) {
4427
if (sip->choice == SEQID_OTHER) {
4435
/*
 * CheckForNucProt - Bioseq-set visitor passed to VisitSetsInSep by
 * DeltaOrFarSeg.  For a set of class BioseqseqSet_class_nuc_prot it treats
 * 'userdata' as a pointer to a Boolean flag.
 * NOTE(review): the store through hasPartsP is on a line not visible here;
 * presumably the flag is set to TRUE - confirm.
 */
static void CheckForNucProt (BioseqSetPtr bssp, Pointer userdata)
4439
if (bssp->_class == BioseqseqSet_class_nuc_prot) {
4440
hasPartsP = (BoolPtr) userdata;
4445
/*
 * CheckForParts - Bioseq-set visitor passed to VisitSetsInSep by
 * DeltaOrFarSeg.  For a set of class BioseqseqSet_class_parts it treats
 * 'userdata' as a pointer to a Boolean flag.
 * NOTE(review): the store through hasPartsP is on a line not visible here;
 * presumably the flag is set to TRUE - confirm.
 */
static void CheckForParts (BioseqSetPtr bssp, Pointer userdata)
4449
if (bssp->_class == BioseqseqSet_class_parts) {
4450
hasPartsP = (BoolPtr) userdata;
4455
/*
 * DeltaOrFarSeg - classifies the Bioseq found from 'location' by its
 * representation.
 *
 * For a delta Bioseq the sets under 'sep' are visited with CheckForNucProt;
 * for a segmented Bioseq they are visited with CheckForParts.  Both visitors
 * receive the address of the local flag hasParts.
 * NOTE(review): how hasParts is turned into the return value is on lines not
 * visible here, as is any NULL check on bsp.
 */
static Boolean DeltaOrFarSeg (SeqEntryPtr sep, SeqLocPtr location)
4458
Boolean hasParts = FALSE;
4460
bsp = BioseqFindFromSeqLoc (location);
4462
if (bsp->repr == Seq_repr_delta) {
4463
VisitSetsInSep (sep, (Pointer) &hasParts, CheckForNucProt);
4467
if (bsp->repr == Seq_repr_seg) {
4468
VisitSetsInSep (sep, (Pointer) &hasParts, CheckForParts);
4476
/*
 * IsNC - looks for an "NC_" accession on the Bioseq found from 'location'.
 *
 * Only SEQID_OTHER ids with a non-NULL accession are tested; the prefix
 * comparison is case-insensitive and covers the first three characters.
 * NOTE(review): the return statements are on lines not visible here;
 * presumably a match returns TRUE and no match returns FALSE - confirm.
 * 'sep' is not referenced by any visible line.
 */
static Boolean IsNC (SeqEntryPtr sep, SeqLocPtr location)
4482
bsp = BioseqFindFromSeqLoc (location);
4484
for (sip = bsp->id; sip != NULL; sip = sip->next) {
4485
if (sip->choice == SEQID_OTHER) {
4486
tsip = (TextSeqIdPtr) sip->data.ptrvalue;
4487
if (tsip != NULL && tsip->accession != NULL) {
4488
if (StringNICmp (tsip->accession, "NC_", 3) == 0) {
4498
/*
 * NotPeptideException - checks two features for an "alternative processing"
 * exception.
 *
 * For each of 'sfp' and 'last' that is non-NULL and has its excpt flag set,
 * except_text is compared case-insensitively with "alternative processing"
 * and "alternate processing".
 * NOTE(review): the return statements are on lines not visible here.  The
 * name and the caller (the overlap error is raised only when this returns
 * TRUE) suggest FALSE is returned when either feature carries such an
 * exception and TRUE otherwise - confirm.
 */
static Boolean NotPeptideException (SeqFeatPtr sfp, SeqFeatPtr last)
4500
if (sfp != NULL && sfp->excpt) {
4501
if (StringICmp (sfp->except_text, "alternative processing") == 0 || StringICmp (sfp->except_text, "alternate processing") == 0)
4504
if (last != NULL && last->excpt) {
4505
if (StringICmp (last->except_text, "alternative processing") == 0 || StringICmp (last->except_text, "alternate processing") == 0)
4511
static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bvsp)
4518
GatherContextPtr gcp;
4520
SeqMgrFeatContext fcontext;
4521
Uint2 featdeftype = 0;
4522
SeqFeatPtr last = NULL;
4524
CharPtr label = NULL;
4525
CharPtr comment = NULL;
4530
Int4Ptr ivals = NULL;
4532
SeqAnnotPtr sap = NULL;
4533
Uint2 olditemtype = 0;
4534
Uint2 olditemid = 0;
4548
vsp->gcp = gcp; /* needed for ValidErr */
4550
SeqMgrExploreFeatures (bsp, (Pointer) bvsp, ValidateSeqFeatIndexed, NULL, NULL, NULL);
4552
overlapPepSev = SEV_WARNING;
4553
if (GetAppProperty ("SpliceValidateAsError") != NULL) {
4554
overlapPepSev = SEV_ERROR;
4558
olditemid = gcp->itemID;
4559
olditemtype = gcp->thistype;
4562
sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
4563
while (sfp != NULL) {
4566
if (fcontext.left == left && fcontext.right == right && fcontext.featdeftype == featdeftype) {
4567
if (fcontext.strand == strand || strand == Seq_strand_unknown || fcontext.strand == Seq_strand_unknown) {
4569
if (fcontext.numivals != numivals || fcontext.ivals == NULL || ivals == NULL) {
4572
for (i = 0, j = 0; i < numivals; i++, j += 2) {
4573
if (fcontext.ivals[j] != ivals[j]) {
4576
if (fcontext.ivals[j + 1] != ivals[j + 1]) {
4581
if (ivalssame && /* StringICmp (fcontext.label, label) == 0 && */
4582
(fcontext.sap == sap || (fcontext.sap->desc == NULL && sap->desc == NULL))) {
4584
gcp->itemID = fcontext.itemID;
4585
gcp->thistype = OBJ_SEQFEAT;
4589
severity = SEV_ERROR;
4590
if (featdeftype == FEATDEF_PUB ||
4591
featdeftype == FEATDEF_REGION || featdeftype == FEATDEF_misc_feature || featdeftype == FEATDEF_STS || featdeftype == FEATDEF_variation) {
4592
severity = SEV_WARNING;
4593
} else if (StringICmp (fcontext.label, label) != 0 || StringICmp (sfp->comment, comment) != 0) {
4594
if (! GPSorNTorNC (vsp->sep, sfp->location)) {
4595
severity = SEV_WARNING;
4598
/* if different CDS frames, lower to warning */
4599
if (sfp->data.choice == SEQFEAT_CDREGION) {
4600
crp = (CdRegionPtr) sfp->data.value.ptrvalue;
4602
if (frame > 1 || crp->frame > 1) {
4603
if (frame != crp->frame) {
4604
severity = SEV_WARNING;
4608
if (GPSorNTorNC (vsp->sep, sfp->location)) {
4609
severity = SEV_WARNING;
4612
if (featdeftype == FEATDEF_REGION && DifferentDbxrefs (last->dbxref, sfp->dbxref)) {
4613
/* do not report if both have dbxrefs and they are different */
4614
} else if (fcontext.sap == sap) {
4615
ValidErr (vsp, severity, ERR_SEQ_FEAT_DuplicateFeat, "Possible duplicate feature");
4617
ValidErr (vsp, severity, ERR_SEQ_FEAT_DuplicateFeat, "Possible duplicate feature (packaged in different feature table)");
4621
gcp->itemID = olditemid;
4622
gcp->thistype = olditemtype;
4627
if (fcontext.featdeftype == FEATDEF_mat_peptide_aa ||
4628
fcontext.featdeftype == FEATDEF_sig_peptide_aa || fcontext.featdeftype == FEATDEF_transit_peptide_aa) {
4629
if (featdeftype == FEATDEF_mat_peptide_aa || featdeftype == FEATDEF_sig_peptide_aa || featdeftype == FEATDEF_transit_peptide_aa) {
4630
if (fcontext.left <= right && NotPeptideException (sfp, last)) {
4632
gcp->itemID = fcontext.itemID;
4633
gcp->thistype = OBJ_SEQFEAT;
4637
ValidErr (vsp, overlapPepSev, ERR_SEQ_FEAT_OverlappingPeptideFeat, "Signal, Transit, or Mature peptide features overlap");
4640
gcp->itemID = olditemid;
4641
gcp->thistype = olditemtype;
4649
left = fcontext.left;
4650
right = fcontext.right;
4651
label = fcontext.label;
4652
comment = sfp->comment;
4653
strand = fcontext.strand;
4654
featdeftype = fcontext.featdeftype;
4655
numivals = fcontext.numivals;
4656
ivals = fcontext.ivals;
4659
if (sfp->data.choice == SEQFEAT_CDREGION) {
4660
crp = (CdRegionPtr) sfp->data.value.ptrvalue;
4666
sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
4670
sfp = SeqMgrGetNextFeatureByLabel (bsp, NULL, SEQFEAT_GENE, 0, &fcontext);
4671
while (sfp != NULL) {
4672
label = fcontext.label;
4673
if (lastLabel != NULL) {
4675
if (StringCmp (lastLabel, label) == 0) {
4676
message = "Colliding names in gene features";
4677
} else if (StringICmp (lastLabel, label) == 0) {
4678
message = "Colliding names (with different capitalization) in gene features";
4680
if (message != NULL) {
4682
gcp->itemID = fcontext.itemID;
4683
gcp->thistype = OBJ_SEQFEAT;
4687
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_CollidingGeneNames, "%s", message);
4690
gcp->itemID = olditemid;
4691
gcp->thistype = olditemtype;
4696
sfp = SeqMgrGetNextFeatureByLabel (bsp, sfp, SEQFEAT_GENE, 0, &fcontext);
4699
SeqMgrExploreDescriptors (bsp, (Pointer) bvsp, ValidateSeqDescrIndexed, NULL);
4701
omdp = ObjMgrGetData (gcp->entityID);
4702
if (omdp != NULL && omdp->datatype == OBJ_SEQSUB) {
4703
ssp = (SeqSubmitPtr) omdp->dataptr;
4707
bvsp->got_a_pub = TRUE;
4709
/* csp = (CitSubPtr) gcp->thisitem; */
4710
ValidateCitSub (vsp, csp);
4718
/*
 * ValidateBioseqContextGather - Gather callback that dispatches each gathered
 * item to the matching context validator.
 *
 * The per-Bioseq state is taken from gcp->userdata, and the current gather
 * context is stored in vsp->gcp so that ValidErr can report against it.
 * Dispatch is on gcp->thistype:
 *   - features go to ValidateSeqFeatContext,
 *   - descriptors go to ValidateSeqDescrContext,
 *   - OBJ_SEQSUB_CIT marks that a publication was seen (got_a_pub) and
 *     validates the submission citation with ValidateCitSub.
 * NOTE(review): the case labels for the first two calls, the assignment of
 * 'vsp' and the return value are on lines not visible here.
 */
static Boolean ValidateBioseqContextGather (GatherContextPtr gcp)
4721
BioseqValidStrPtr bvsp;
4724
bvsp = (BioseqValidStrPtr) (gcp->userdata);
4728
vsp->gcp = gcp; /* needed for ValidErr */
4730
switch (gcp->thistype) {
4732
ValidateSeqFeatContext (gcp);
4735
ValidateSeqDescrContext (gcp);
4737
case OBJ_SEQSUB_CIT:
4738
bvsp->got_a_pub = TRUE;
4739
csp = (CitSubPtr) gcp->thisitem;
4740
ValidateCitSub (vsp, csp);
4748
/*****************************************************************************
4750
* ValidateBioseqContext(gcp)
4751
* Validate one Bioseq for descriptors, features, and context
4752
* This is done as a second Gather, focussed on the Bioseq in
4755
*****************************************************************************/
4756
static void ValidateBioseqContext (GatherContextPtr gcp)
4765
ValNodePtr vnp = NULL;
4766
MolInfoPtr mip = NULL;
4767
SeqMgrDescContext context;
4768
BioseqContextPtr bcp;
4770
PatentSeqIdPtr psip;
4772
Boolean isPDB = FALSE;
4774
vsp = (ValidStructPtr) (gcp->userdata);
4775
bsp = (BioseqPtr) (gcp->thisitem);
4779
vsp->bssp = (BioseqSetPtr) (gcp->parentitem);
4781
MemSet (&gs, 0, sizeof (GatherScope));
4782
fake_whole.choice = SEQLOC_WHOLE;
4783
sip = SeqIdFindBest (bsp->id, 0);
4785
fake_whole.data.ptrvalue = sip;
4787
fake_whole.next = NULL;
4788
gs.target = &fake_whole;
4789
gs.get_feats_location = TRUE;
4790
gs.nointervals = TRUE;
4791
MemSet ((Pointer) (gs.ignore), (int) TRUE, (size_t) (sizeof (Boolean) * OBJ_MAX));
4792
gs.ignore[OBJ_SEQDESC] = FALSE;
4793
gs.ignore[OBJ_SEQFEAT] = FALSE;
4794
gs.ignore[OBJ_SEQANNOT] = FALSE;
4795
gs.ignore[OBJ_SUBMIT_BLOCK] = FALSE;
4796
gs.ignore[OBJ_SEQSUB_CIT] = FALSE;
4798
gs.scope = vsp->sep;
4800
MemSet (&bvs, 0, sizeof (BioseqValidStr));
4803
/* now looking for molinfo on every bioseq (okay on segset) */
4806
if (vsp->useSeqMgrIndexes) {
4807
vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &context);
4809
bcp = BioseqContextNew (bsp);
4810
vnp = BioseqContextGetSeqDescr (bcp, Seq_descr_molinfo, NULL, NULL);
4811
BioseqContextFree (bcp);
4814
mip = (MolInfoPtr) vnp->data.ptrvalue;
4818
bvs.is_mrna = FALSE;
4819
bvs.is_prerna = FALSE;
4820
if (bsp != NULL && ISA_na (bsp->mol)) {
4822
if (mip->biomol == MOLECULE_TYPE_MRNA) {
4824
} else if (mip->biomol == MOLECULE_TYPE_PRE_MRNA) {
4825
bvs.is_prerna = TRUE;
4827
if (mip->biomol >= MOLECULE_TYPE_PRE_MRNA && mip->biomol <= MOLECULE_TYPE_SCRNA && bsp->mol == Seq_mol_dna) {
4828
/* - this is how we indicate an mRNA sequenced from a cDNA, so no error
4829
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_RnaDnaConflict, "MolInfo says RNA, Bioseq says DNA");
4832
} else if (bsp->mol == Seq_mol_rna) {
4833
bvs.is_mrna = TRUE; /* if no molinfo, assume rna is mrna */
4838
if (mip->tech == MI_TECH_sts ||
4839
mip->tech == MI_TECH_survey ||
4840
mip->tech == MI_TECH_htgs_0 || mip->tech == MI_TECH_htgs_1 || mip->tech == MI_TECH_htgs_2 || mip->tech == MI_TECH_htgs_3) {
4841
if (mip->tech == MI_TECH_sts && bsp->mol == Seq_mol_rna && mip->biomol == MOLECULE_TYPE_MRNA) {
4842
/* there are some STS sequences derived from cDNAs, so do not report these */
4843
} else if (mip->biomol != MOLECULE_TYPE_GENOMIC) {
4844
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_ConflictingBiomolTech, "HTGS/STS/GSS sequence should be genomic");
4845
} else if (bsp == NULL || (bsp->mol != Seq_mol_dna && bsp->mol != Seq_mol_na)) {
4846
ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_ConflictingBiomolTech, "HTGS/STS/GSS sequence should not be RNA");
4851
if (ISA_aa (bsp->mol)) {
4853
/* check proteins in nuc-prot set have a CdRegion */
4854
if (vsp->bssp != NULL) {
4855
if (vsp->bssp->_class == 1) { /* in a nuc-prot set */
4856
if (vsp->useSeqMgrIndexes) {
4857
sfp = SeqMgrGetCDSgivenProduct (bsp, NULL);
4859
sfp = SeqEntryGetSeqFeat (vsp->sep, 3, NULL, NULL, 1, bsp);
4861
if (sfp == NULL) /* no CdRegion points to this bsp */
4862
ValidErr (vsp, SEV_ERROR, ERR_SEQ_PKG_NoCdRegionPtr, "No CdRegion in nuc-prot set points to this protein");
4867
if (vsp->useSeqMgrIndexes) {
4869
ValidateBioseqContextIndexed (bsp, &bvs);
4871
GatherSeqEntry (vsp->sep, &bvs, ValidateBioseqContextGather, &gs);
4874
vsp->gcp = gcp; /* reset the gcp pointer changed in previous gather */
4878
if ((!bvs.got_a_pub) && (!vsp->suppress_no_pubs)) {
4881
omdp = ObjMgrGetData (gcp->entityID);
4883
if (omdp == NULL || omdp->datatype != OBJ_SEQSUB) {
4884
if (!IsRefSeq (bsp)) {
4885
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_NoPubFound, "No publications refer to this Bioseq.");
4890
for (sip = bsp->id; sip != NULL; sip = sip->next) {
4891
if (sip->choice == SEQID_PATENT) {
4892
psip = (PatentSeqIdPtr) sip->data.ptrvalue;
4895
if (ipp != NULL && StringICmp (ipp->country, "US") == 0)
4899
} else if (sip->choice == SEQID_PDB) {
4904
if ((!bvs.last_org) && (!vsp->suppress_no_biosrc))
4905
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_NoOrgFound, "No organism name has been applied to this Bioseq.");
4908
if ((bvs.is_aa) && (!bvs.num_full_length_prot_ref) && (!isPDB))
4909
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_NoProtRefFound, "No full length Prot-ref feature applied to this Bioseq");
4911
/* for now only flag missing molinfo in Sequin */
4912
if (mip == NULL && GetAppProperty ("SpliceValidateAsError") != NULL && (!isPDB)) {
4913
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_NoMolInfoFound, "No Mol-info applies to this Bioseq");
4920
/*****************************************************************************
4922
* ValidateSeqFeat(gcp)
4924
*****************************************************************************/
4925
/*
 * EmptyOrNullString - scans 'str' character by character looking for a
 * character strictly greater than ' ' and no greater than '~' (i.e. a
 * visible ASCII character).
 * NOTE(review): the NULL check, the loop advance and the return statements
 * are on lines not visible here; presumably FALSE is returned as soon as a
 * visible character is found and TRUE otherwise - confirm.
 */
static Boolean EmptyOrNullString (CharPtr str)
4932
while (ch != '\0') {
4933
if (ch > ' ' && ch <= '~')
4941
/*
 * CheckPeptideOnCodonBoundary - warns when a peptide feature does not start
 * and stop on codon boundaries of its overlapping CDS.
 *
 * The CDS overlapping sfp->location is looked up and its reading frame
 * (2 or 3) is examined to derive 'adjust'.  The first and last intervals of
 * the feature location are mapped into CDS coordinates with GetOffsetInLoc
 * (start of the first interval, stop of the last), each reduced by 'adjust'.
 * A SEV_WARNING ERR_SEQ_FEAT_PeptideFeatOutOfFrame is raised when mod1 != 0
 * (start), mod2 != 2 (stop), or both; 'key' is used in the message text.
 * NOTE(review): the computation of mod1/mod2, the values given to 'adjust',
 * the use made of partial5/partial3 and the NULL checks on cds/crp are on
 * lines not visible here; presumably mod1/mod2 are pos1 % 3 and pos2 % 3 -
 * confirm.
 */
static void CheckPeptideOnCodonBoundary (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPtr sfp, CharPtr key)
4945
SeqLocPtr first = NULL, last = NULL, slp = NULL;
4946
Boolean partial5, partial3;
4947
Int4 pos1, pos2, adjust = 0, mod1, mod2;
4949
cds = SeqMgrGetOverlappingCDS (sfp->location, NULL);
4952
crp = (CdRegionPtr) cds->data.value.ptrvalue;
4955
if (crp->frame == 2) {
4957
} else if (crp->frame == 3) {
4961
/* walk every interval of the feature location to find the first and last */
while ((slp = SeqLocFindNext (sfp->location, slp)) != NULL) {
4963
if (first == NULL) {
4967
if (first == NULL || last == NULL)
4970
pos1 = GetOffsetInLoc (first, cds->location, SEQLOC_START) - adjust;
4971
pos2 = GetOffsetInLoc (last, cds->location, SEQLOC_STOP) - adjust;
4975
CheckSeqLocForPartial (sfp->location, &partial5, &partial3);
4983
if (mod1 != 0 && mod2 != 2) {
4984
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PeptideFeatOutOfFrame, "Start and stop of %s are out of frame with CDS codons", key);
4985
} else if (mod1 != 0) {
4986
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PeptideFeatOutOfFrame, "Start of %s is out of frame with CDS codons", key);
4987
} else if (mod2 != 2) {
4988
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PeptideFeatOutOfFrame, "Stop of %s is out of frame with CDS codons", key);
4992
/*
 * NULL-terminated list of values accepted for the rpt_type qualifier.
 * ValidateImpFeat compares each value case-insensitively against these
 * entries, stopping at the NULL sentinel.
 */
static CharPtr legal_repeat_types[] = {
4993
"tandem", "inverted", "flanking", "terminal",
4994
"direct", "dispersed", "other", NULL
4997
static void ValidateImpFeat (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPtr sfp, ImpFeatPtr ifp)
5003
SeqMgrFeatContext gcontext;
5009
Boolean multi_rpt_unit;
5010
Boolean no_white_space;
5011
Boolean only_digits;
5021
if (vsp == NULL || gcp == NULL || sfp == NULL || ifp == NULL)
5023
if (StringCmp (ifp->key, "-") == 0) {
5024
key = StringSave ("misc_feature");
5026
key = StringSaveNoNull (ifp->key);
5028
index = GBFeatKeyNameValid (&key, FALSE);
5031
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnknownImpFeatKey, "Unknown feature key %s", key);
5033
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnknownImpFeatKey, "NULL feature key");
5035
} else if (StringICmp (key, "virion") == 0 || StringICmp (key, "mutation") == 0 || StringICmp (key, "allele") == 0) {
5036
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_UnknownImpFeatKey, "Feature key %s is no longer legal", key);
5037
} else if (StringICmp (key, "polyA_site") == 0) {
5038
if (SeqLocStart (sfp->location) != SeqLocStop (sfp->location)) {
5039
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PolyAsiteNotPoint, "PolyA_site should be a single point");
5041
} else if (StringICmp (key, "mat_peptide") == 0 || StringICmp (key, "sig_peptide") == 0 || StringICmp (key, "transit_peptide") == 0) {
5042
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidForType, "Peptide processing feature should be converted to the appropriate protein feature subtype");
5043
CheckPeptideOnCodonBoundary (vsp, gcp, sfp, key);
5044
} else if (StringICmp (key, "mRNA") == 0 ||
5045
StringICmp (key, "tRNA") == 0 ||
5046
StringICmp (key, "rRNA") == 0 ||
5047
StringICmp (key, "snRNA") == 0 ||
5048
StringICmp (key, "scRNA") == 0 || StringICmp (key, "snoRNA") == 0 || StringICmp (key, "misc_RNA") == 0 || StringICmp (key, "precursor_RNA") == 0) {
5049
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidForType,
5050
"RNA feature should be converted to the appropriate RNA feature subtype, location should be converted manually");
5051
} else if (StringICmp (key, "CDS") == 0) {
5052
failed = TRUE; /* impfeat CDS must be pseudo; fail if not */
5056
grp = SeqMgrGetGeneXref (sfp);
5057
if (grp != NULL && grp->pseudo) {
5060
gene = SeqMgrGetOverlappingGene (sfp->location, &gcontext);
5065
grp = (GeneRefPtr) gene->data.value.ptrvalue;
5066
if (grp != NULL && grp->pseudo) {
5073
for (gbqual = sfp->qual; gbqual != NULL; gbqual = gbqual->next) {
5074
if (StringCmp (gbqual->qual, "translation") == 0) {
5075
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_ImpCDShasTranslation, "ImpFeat CDS with /translation found");
5079
ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_ImpCDSnotPseudo, "ImpFeat CDS should be pseudo");
5082
for (gbqual = sfp->qual; gbqual != NULL; gbqual = gbqual->next) {
5083
if (StringCmp (gbqual->qual, "gsdb_id") == 0) {
5086
val = GBQualNameValid (gbqual->qual);
5088
if (gbqual->qual != NULL) {
5089
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnknownImpFeatQual, "Unknown qualifier %s", gbqual->qual);
5091
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnknownImpFeatQual, "NULL qualifier");
5093
} else if (index != -1) {
5095
for (i = 0; i < ParFlat_GBFeat[index].opt_num; i++) {
5096
qual = ParFlat_GBFeat[index].opt_qual[i];
5103
for (i = 0; i < ParFlat_GBFeat[index].mand_num; i++) {
5104
qual = ParFlat_GBFeat[index].mand_qual[i];
5111
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_WrongQualOnImpFeat, "Wrong qualifier %s for feature %s", gbqual->qual, key);
5114
if (gbqual->val != NULL) {
5115
if (val == GBQUAL_rpt_type) {
5117
tmp = StringSave (gbqual->val);
5122
while (!StringHasNoText (str)) {
5123
ptr = StringChr (str, ',');
5125
ptr = StringChr (str, ')');
5132
for (i = 0; legal_repeat_types[i] != NULL; i++) {
5133
if (StringICmp (str, legal_repeat_types[i]) == 0) {
5145
/* rpt_type value matched none of legal_repeat_types (message wording corrected) */
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "%s is not a legal value for qualifier %s", gbqual->val, gbqual->qual);
5147
} else if (val == GBQUAL_rpt_unit) {
5149
multi_rpt_unit = TRUE;
5150
for (ptr = gbqual->val, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
5153
} else if (ch == '(' || ch == ')' || ch == ',' || ch == '.' || IS_DIGIT (ch)) {
5155
multi_rpt_unit = FALSE;
5159
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Illegal value for qualifier %s", gbqual->qual);
5160
} else if ((!multi_rpt_unit) && StringLen (gbqual->val) > 48) {
5161
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Illegal value for qualifier %s", gbqual->qual);
5163
} else if (val == GBQUAL_label) {
5164
no_white_space = TRUE;
5166
for (ptr = gbqual->val, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
5167
if (IS_WHITESP (ch)) {
5168
no_white_space = FALSE;
5170
if (! IS_DIGIT (ch)) {
5171
only_digits = FALSE;
5174
if (only_digits || (! no_white_space)) {
5175
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Illegal value for qualifier %s", gbqual->qual);
5181
if (index != -1 && ParFlat_GBFeat[index].mand_num > 0) {
5182
for (i = 0; i < ParFlat_GBFeat[index].mand_num; i++) {
5184
qual = ParFlat_GBFeat[index].mand_qual[i];
5185
for (gbqual = sfp->qual; gbqual != NULL; gbqual = gbqual->next) {
5186
val = GBQualNameValid (gbqual->qual);
5193
if (qual == GBQUAL_citation && sfp->cit != NULL) {
5198
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MissingQualOnImpFeat, "Missing qualifier %s for feature %s", ParFlat_GBQual_names[qual].name, key);
5202
if (!StringHasNoText (ifp->loc)) {
5203
slp = sfp->location;
5204
if (StringStr (ifp->loc, "one-of") != NULL) {
5205
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_ImpFeatBadLoc, "ImpFeat loc %s has obsolete 'one-of' text for feature %s", ifp->loc, key);
5206
} else if (slp != NULL && slp->choice == SEQLOC_INT) {
5207
sint = (SeqIntPtr) slp->data.ptrvalue;
5208
if (sint != NULL && sint->strand != Seq_strand_minus) {
5209
sprintf (range, "%ld..%ld", (long) (sint->from + 1), (long) (sint->to + 1));
5210
if (StringCmp (ifp->loc, range) != 0) {
5211
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_ImpFeatBadLoc, "ImpFeat loc %s does not equal feature location %s for feature %s", ifp->loc, range, key);
5219
/* PartialAtSpliceSite uses code taken from SpliceCheckEx */
5220
/*
 * PartialAtSpliceSite
 *
 * Looks at the nucleotides adjacent to one end of a location and tests them
 * against fixed bit patterns (see the masks below).
 *
 *   head   - location whose parts are walked with SeqLocFindPart
 *   slpTag - which end to examine; only SLP_NOSTART and SLP_NOSTOP are handled
 *
 * The result is carried in rsult, which starts out FALSE.
 * NOTE(review): several original lines (braces, returns, some declarations)
 * are not visible in this excerpt; comments describe only the visible code.
 */
static Boolean PartialAtSpliceSite (SeqLocPtr head, Uint2 slpTag)
5223
Int2 residue1, residue2;
5224
Boolean rsult = FALSE;
5226
SeqLocPtr slp = NULL, first = NULL, last = NULL;
5227
SeqPortPtr spp = NULL;
5229
Int4 strt, stp, donor, acceptor, len;
5231
/* any other partial tag is outside the scope of this test */
if (slpTag != SLP_NOSTART && slpTag != SLP_NOSTOP)
5233
/* walk the individual parts of the location, tracking first and last */
while ((slp = SeqLocFindPart (head, slp, EQUIV_IS_ONE)) != NULL) {
5234
if (first == NULL) {
5242
/* strand of the first part is compared with the strand of the last part */
strand = SeqLocStrand (first);
5243
if (SeqLocStrand (last) != strand)
5246
if (slpTag == SLP_NOSTART) {
5251
sip = SeqLocId (slp);
5254
acceptor = SeqLocStart (slp);
5255
donor = SeqLocStop (slp);
5256
bsp = BioseqLockById (sip);
5260
/* read the whole Bioseq on the feature strand, in ncbi4na encoding */
spp = SeqPortNew (bsp, 0, -1, strand, Seq_code_ncbi4na);
5265
if (strand != Seq_strand_minus) {
5272
/* positions are mirrored through len when the strand is minus */
stp = len - donor - 1;
5273
strt = len - acceptor - 1;
5276
/* 3' end: inspect the two residues immediately after the stop position */
if (slpTag == SLP_NOSTOP && stp < len - 2) {
5277
SeqPortSeek (spp, (stp + 1), SEEK_SET);
5278
residue1 = SeqPortGetResidue (spp);
5279
residue2 = SeqPortGetResidue (spp);
5280
if (IS_residue (residue1) && IS_residue (residue2)) {
5281
/* presumably ncbi4na bits 1/2/4/8 = A/C/G/T, i.e. a GT donor -- TODO confirm */
if ((residue1 & 4) && (residue2 & 8)) {
5283
/* presumably the GC donor variant -- TODO confirm */
} else if ((residue1 & 4) && (residue2 & 2)) {
5287
/* 5' end: inspect the two residues immediately before the start position */
} else if (slpTag == SLP_NOSTART && strt > 1) {
5288
SeqPortSeek (spp, (strt - 2), SEEK_SET);
5289
residue1 = SeqPortGetResidue (spp);
5290
residue2 = SeqPortGetResidue (spp);
5291
if (IS_residue (residue1) && IS_residue (residue2)) {
5292
/* presumably an AG acceptor -- TODO confirm */
if ((residue1 & 1) && (residue2 & 4)) {
5298
/* release the sequence port */
spp = SeqPortFree (spp);
5302
/*
 * CheckTrnaCodons
 *
 * For each of the six codon slots of a tRNA, looks up the amino acid that the
 * genetic code assigns to that codon and compares it with the amino acid
 * recorded on the tRNA, reporting ERR_SEQ_FEAT_TrnaCodonWrong on a mismatch.
 *
 *   vsp - validator state, passed to ValidErr
 *   gcp - gather context; supplies the entityID used for lookups
 *   sfp - the tRNA feature
 *   trp - the tRNA extension holding codon[], aatype and aa
 *
 * NOTE(review): several original lines are not visible in this excerpt.
 */
static void CheckTrnaCodons (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPtr sfp, tRNAPtr trp)
5307
CharPtr codes = NULL;
5309
GeneticCodePtr gncp;
5312
ErrSev sev = SEV_ERROR;
5313
SeqMapTablePtr smtp;
5317
if (vsp == NULL || gcp == NULL || sfp == NULL || trp == NULL)
5319
for (j = 0; j < 6; j++) {
5320
/* only codon values below 64 are examined */
if (trp->codon[j] < 64) {
5321
/* the genetic code table is located lazily, on the first codon examined */
if (codes == NULL) {
5322
bsp = GetBioseqGivenSeqLoc (sfp->location, gcp->entityID);
5323
sep = GetBestTopParentForData (gcp->entityID, bsp);
5324
code = SeqEntryToGeneticCode (sep, NULL, NULL, 0);
5325
gncp = GeneticCodeFind (code, NULL);
5327
/* genetic code 1 is also looked up; presumably a fallback -- guarding condition not visible */
gncp = GeneticCodeFind (1, NULL);
5331
/* the choice == 3 node of the genetic code supplies the codes string */
for (vnp = (ValNodePtr) gncp->data.ptrvalue; vnp != NULL; vnp = vnp->next) {
5332
if (vnp->choice == 3) {
5333
codes = (CharPtr) vnp->data.ptrvalue;
5339
/* amino acid that the genetic code assigns to this codon */
taa = codes[trp->codon[j]];
5341
if (trp->aatype == 2) {
5345
/* choose the source alphabet for converting trp->aa, based on aatype */
switch (trp->aatype) {
5350
from = Seq_code_iupacaa;
5353
from = Seq_code_ncbieaa;
5356
from = Seq_code_ncbi8aa;
5359
from = Seq_code_ncbistdaa;
5364
/* convert the tRNA amino acid to ncbieaa */
smtp = SeqMapTableFind (Seq_code_ncbieaa, from);
5366
aa = SeqMapTableConvert (smtp, trp->aa);
5369
/* a result of 0 or 255 is not compared */
if (aa > 0 && aa != 255) {
5374
ValidErr (vsp, sev, ERR_SEQ_FEAT_TrnaCodonWrong, "tRNA codon does not match genetic code");
5381
/*
 * GetParentNPS
 *
 * Climbs the parent chain of a Bioseq looking for an enclosing BioseqSet of
 * class nuc_prot.
 *
 *   bsp - Bioseq whose ancestors are searched; its immediate parent must be a
 *         BioseqSet for the search to begin
 *
 * NOTE(review): the return statements are not visible in this excerpt.
 */
static BioseqSetPtr GetParentNPS (BioseqPtr bsp)
5387
if (bsp->idx.parenttype != OBJ_BIOSEQSET)
5389
bssp = (BioseqSetPtr) bsp->idx.parentptr;
5390
/* keep climbing while the current set is not nuc-prot and its parent is again a set */
while (bssp != NULL && bssp->_class != BioseqseqSet_class_nuc_prot && bssp->idx.parenttype == OBJ_BIOSEQSET) {
5391
bssp = (BioseqSetPtr) bssp->idx.parentptr;
5393
/* NOTE(review): the loop can exit with bssp == NULL, yet bssp is dereferenced
   here with no visible NULL test -- confirm whether a guard is needed */
if (bssp->_class == BioseqseqSet_class_nuc_prot)
5398
/*
 * NucAndProtNotInNPS
 *
 * Compares the result of GetParentNPS for a nucleotide Bioseq with the result
 * for a protein Bioseq.
 *
 *   nuc  - nucleotide Bioseq
 *   prot - protein Bioseq
 *
 * NOTE(review): the return statements are not visible in this excerpt.
 */
static Boolean NucAndProtNotInNPS (BioseqPtr nuc, BioseqPtr prot)
5402
if (nuc == NULL || prot == NULL)
5404
bssp = GetParentNPS (nuc);
5407
/* the protein's enclosing set is compared with the nucleotide's */
if (GetParentNPS (prot) != bssp)
5412
/*
 * CheckForCommonCDSProduct
 *
 * Checks the product of a coding region feature: that the product Bioseq can
 * be found, that it is packaged in the same nuc-prot set as the nucleotide,
 * and that no other CDS feature points at the same product.
 *
 *   vsp - validator state; the check relies on vsp->useSeqMgrIndexes
 *   sfp - the CDS feature
 *
 * NOTE(review): several original lines (returns, braces, assignments to sep
 * and the is_* flags) are not visible in this excerpt.
 */
static void CheckForCommonCDSProduct (ValidStructPtr vsp, SeqFeatPtr sfp)
5420
Boolean is_nc = FALSE;
5421
Boolean is_nc_gps = FALSE;
5422
Boolean is_nt = FALSE;
5428
/* features flagged pseudo are tested together with a NULL feature */
if (sfp == NULL || sfp->pseudo)
5430
if (!vsp->useSeqMgrIndexes)
5432
crp = (CdRegionPtr) sfp->data.value.ptrvalue;
5433
if (crp != NULL && crp->orf)
5435
/* consult the overlapping gene unless the gene xref is a suppressing one */
grp = SeqMgrGetGeneXref (sfp);
5436
if (grp == NULL || (!SeqMgrGeneIsSuppressed (grp))) {
5437
gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
5438
if (gene == NULL || gene->pseudo)
5440
grp = (GeneRefPtr) gene->data.value.ptrvalue;
5441
if (grp != NULL && grp->pseudo)
5444
if (sfp->product == NULL)
5446
bsp = BioseqFindFromSeqLoc (sfp->product);
5448
sip = SeqLocId (sfp->product);
5449
/* okay to have far RefSeq product... */
5450
if (sip == NULL || sip->choice != SEQID_OTHER) {
5452
if (sep != NULL && IS_Bioseq_set (sep)) {
5453
bssp = (BioseqSetPtr) sep->data.ptrvalue;
5454
/* but only if genomic product set */
5455
if (bssp != NULL && bssp->_class == BioseqseqSet_class_gen_prod_set)
5458
/* or just a bioseq */
5459
if (sep != NULL && IS_Bioseq (sep))
5461
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MultipleCDSproducts, "Unable to find product Bioseq from CDS feature");
5465
nuc = BioseqFindFromSeqLoc (sfp->location);
5467
/* classify the nucleotide by the NT_ / NC_ prefix of its SEQID_OTHER accession */
for (sip = nuc->id; sip != NULL; sip = sip->next) {
5468
if (sip->choice == SEQID_OTHER) {
5469
tsip = (TextSeqIdPtr) sip->data.ptrvalue;
5470
if (tsip != NULL && tsip->accession != NULL) {
5471
if (StringNICmp (tsip->accession, "NT_", 3) == 0) {
5473
} else if (StringNICmp (tsip->accession, "NC_", 3) == 0) {
5479
/* an NC_ record whose parent is a genomic product set is examined separately */
if (is_nc && nuc->idx.parenttype == OBJ_BIOSEQSET) {
5480
bssp = (BioseqSetPtr) nuc->idx.parentptr;
5481
if (bssp != NULL && bssp->_class == BioseqseqSet_class_gen_prod_set) {
5485
/* the packaging error is not raised when is_nt or is_nc_gps is set */
if (NucAndProtNotInNPS (nuc, bsp) && (! is_nt) && (! is_nc_gps)) {
5486
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_CDSproductPackagingProblem, "Protein product not packaged in nuc-prot set with nucleotide");
5489
/* CDS that the feature index associates with this product Bioseq */
cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
5493
/* if genomic product set, with one cds on contig and one on cdna, do not report */
5495
if (sep != NULL && IS_Bioseq_set (sep)) {
5496
bssp = (BioseqSetPtr) sep->data.ptrvalue;
5497
if (bssp != NULL && bssp->_class == BioseqseqSet_class_gen_prod_set) {
5498
/* feature packaging test will do final contig vs. cdna check */
5499
if (BioseqFindFromSeqLoc (cds->location) != BioseqFindFromSeqLoc (sfp->location))
5504
ValidErr (vsp, SEV_REJECT, ERR_SEQ_FEAT_MultipleCDSproducts, "Same product Bioseq from multiple CDS features");
5508
/*
 * CheckForCommonMRNAProduct
 *
 * Checks the product of an mRNA feature: that a locally identified product is
 * packaged in the record, and that no other mRNA feature points at the same
 * product Bioseq.
 *
 *   vsp - validator state; the check relies on vsp->useSeqMgrIndexes, and
 *         vsp->sep is used as the lookup scope
 *   sfp - the mRNA feature
 *
 * NOTE(review): several original lines (returns, braces, the assignment to
 * sep) are not visible in this excerpt.
 */
static void CheckForCommonMRNAProduct (ValidStructPtr vsp, SeqFeatPtr sfp)
5515
SeqEntryPtr oldscope;
5519
if (sfp == NULL || sfp->pseudo)
5521
if (!vsp->useSeqMgrIndexes)
5523
/* consult the overlapping gene unless the gene xref is a suppressing one */
grp = SeqMgrGetGeneXref (sfp);
5524
if (grp == NULL || (!SeqMgrGeneIsSuppressed (grp))) {
5525
gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
5526
if (gene == NULL || gene->pseudo)
5528
grp = (GeneRefPtr) gene->data.value.ptrvalue;
5529
if (grp != NULL && grp->pseudo)
5532
if (sfp->product == NULL)
5535
/* look the product up with the scope set to vsp->sep, then restore the old scope */
oldscope = SeqEntrySetScope (vsp->sep);
5536
bsp = BioseqFindFromSeqLoc (sfp->product);
5537
SeqEntrySetScope (oldscope);
5539
sip = SeqLocId (sfp->product);
5540
if (sip != NULL && sip->choice == SEQID_LOCAL) {
5542
if (sep != NULL && IS_Bioseq_set (sep)) {
5543
bssp = (BioseqSetPtr) sep->data.ptrvalue;
5545
/* NOTE(review): bssp is dereferenced with no visible NULL test, unlike the
   matching code in CheckForCommonCDSProduct -- confirm */
if (bssp->_class == BioseqseqSet_class_gen_prod_set ||
5546
bssp->_class == BioseqseqSet_class_other) {
5547
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_MissingMRNAproduct,
5548
"Product Bioseq of mRNA feature is not packaged in the record");
5556
/* mRNA that the feature index associates with this product Bioseq */
mrna = SeqMgrGetRNAgivenProduct (bsp, NULL);
5560
ValidErr (vsp, SEV_REJECT, ERR_SEQ_FEAT_MultipleMRNAproducts, "Same product Bioseq from multiple mRNA features");
5564
/*
 * CheckForBadGeneOverlap
 *
 * Reports a CDS or RNA feature that a gene overlaps but does not completely
 * contain.
 *
 *   vsp - validator state; vsp->sep is passed to IsNC
 *   sfp - the CDS or RNA feature being checked
 *
 * NOTE(review): the conditions linking the lookups below are not visible in
 * this excerpt.
 */
static void CheckForBadGeneOverlap (ValidStructPtr vsp, SeqFeatPtr sfp)
5566
SeqMgrFeatContext fcontext;
5569
ErrSev sev = SEV_ERROR;
5573
grp = SeqMgrGetGeneXref (sfp);
5576
/* first lookup: the overlapping gene as defined by SeqMgrGetOverlappingGene */
gene = SeqMgrGetOverlappingGene (sfp->location, &fcontext);
5579
/* second lookup: any gene with a SIMPLE_OVERLAP relationship */
gene = SeqMgrGetOverlappingFeature (sfp->location, FEATDEF_GENE, NULL, 0, NULL, SIMPLE_OVERLAP, &fcontext);
5582
/* presumably adjusts sev for NC records -- the branch body is not visible */
if (IsNC (vsp->sep, sfp->location)) {
5585
if (sfp->data.choice == SEQFEAT_CDREGION) {
5586
ValidErr (vsp, sev, ERR_SEQ_FEAT_CDSgeneRange, "gene overlaps CDS but does not completely contain it");
5587
} else if (sfp->data.choice == SEQFEAT_RNA) {
5588
ValidErr (vsp, sev, ERR_SEQ_FEAT_mRNAgeneRange, "gene overlaps mRNA but does not completely contain it");
5592
/*
 * CheckForBadMRNAOverlap
 *
 * Reports a CDS that an mRNA overlaps or contains without completely
 * containing its intervals.
 *
 *   vsp - validator state; vsp->sep is passed to IsNC
 *   sfp - the CDS feature being checked
 *
 * NOTE(review): the conditions linking the three lookups below are not
 * visible in this excerpt.
 */
static void CheckForBadMRNAOverlap (ValidStructPtr vsp, SeqFeatPtr sfp)
5594
SeqMgrFeatContext fcontext;
5596
ErrSev sev = SEV_ERROR;
5600
/* three mRNA lookups, each with a different overlap criterion */
mrna = SeqMgrGetOverlappingFeature (sfp->location, FEATDEF_mRNA, NULL, 0, NULL, SIMPLE_OVERLAP, &fcontext);
5603
mrna = SeqMgrGetOverlappingFeature (sfp->location, FEATDEF_mRNA, NULL, 0, NULL, CHECK_INTERVALS, &fcontext);
5606
mrna = SeqMgrGetOverlappingFeature (sfp->location, FEATDEF_mRNA, NULL, 0, NULL, INTERVAL_OVERLAP, &fcontext);
5609
/* presumably adjusts sev for NC records -- the branch body is not visible */
if (IsNC (vsp->sep, sfp->location)) {
5615
ValidErr (vsp, sev, ERR_SEQ_FEAT_CDSmRNArange, "mRNA overlaps or contains CDS but does not completely contain intervals");
5618
/*
 * CheckForBothStrands
 *
 * Scans every part of a feature location for Seq_strand_both and reports
 * ERR_SEQ_FEAT_BothStrands.
 *
 *   vsp - validator state, passed to ValidErr
 *   sfp - the mRNA or CDS feature whose location is scanned
 *
 * NOTE(review): the lines that set and test bothstrands are not visible here.
 */
static void CheckForBothStrands (ValidStructPtr vsp, SeqFeatPtr sfp)
5620
Boolean bothstrands = FALSE;
5621
SeqLocPtr location, slp = NULL;
5625
location = sfp->location;
5626
if (location == NULL)
5628
/* visit each component location in turn */
while ((slp = SeqLocFindNext (location, slp)) != NULL) {
5629
if (SeqLocStrand (slp) == Seq_strand_both) {
5634
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_BothStrands, "mRNA or CDS may not be on both strands");
5638
/*
 * OverlappingGeneIsPseudo
 *
 * Fetches the gene xref of a feature and the gene overlapping its location,
 * and takes the GeneRef from that gene feature.
 *
 *   sfp - feature whose gene is examined
 *
 * NOTE(review): the tests on these values and the return statements are not
 * visible in this excerpt; the name suggests the result reports whether the
 * gene is pseudo -- confirm against the full source.
 */
static Boolean OverlappingGeneIsPseudo (SeqFeatPtr sfp)
5645
grp = SeqMgrGetGeneXref (sfp);
5651
gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
5655
/* GeneRef carried by the overlapping gene feature */
grp = (GeneRefPtr) gene->data.value.ptrvalue;
5664
/* db_xref database names that CheckForIllegalDbxref accepts when GPSorRefSeq
   is true for the record; that function scans the array up to a NULL entry.
   NOTE(review): the initializer contents are not visible in this excerpt. */
static CharPtr legalDbXrefOnRefSeq [] = {
5669
/*
 * CheckForIllegalDbxref
 *
 * Walks a list of db_xref entries and warns (ERR_SEQ_FEAT_IllegalDbXref) about
 * any whose database name is not acceptable for this feature.
 *
 *   vsp    - validator state; vsp->sep is passed to GPSorRefSeq
 *   gcp    - gather context (not referenced in the visible lines)
 *   sfp    - feature that owns the dbxref list
 *   dbxref - ValNode list whose data.ptrvalue is used as a Dbtag
 *
 * NOTE(review): the lines that initialise and assign id are not visible here.
 */
static void CheckForIllegalDbxref (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPtr sfp, ValNodePtr dbxref)
5677
for (vnp = dbxref; vnp != NULL; vnp = vnp->next) {
5679
db = vnp->data.ptrvalue;
5680
if (db != NULL && db->db != NULL) {
5681
/* exact, case-sensitive match against the dbtag table */
for (i = 0; i < DBNUM; i++) {
5682
if (StringCmp (db->db, dbtag[i]) == 0) {
5687
/* names missing from dbtag may still be legal on GPS / RefSeq records */
if (id == -1 && GPSorRefSeq (vsp->sep, sfp->location)) {
5688
for (i = 0; legalDbXrefOnRefSeq [i] != NULL; i++) {
5689
/* NOTE(review): this returns from the whole function, so any later
   entries in the dbxref list are not examined -- confirm intent */
if (StringCmp (db->db, legalDbXrefOnRefSeq [i]) == 0) return;
5692
/* unknown name, or one of the first four dbtag entries on a non-CDS feature */
if (id == -1 || (sfp->data.choice != SEQFEAT_CDREGION && id < 4)) {
5693
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref, "Illegal db_xref type %s", db->db);
5699
/*
 * ValidateSeqFeat
 *
 * Gather callback that validates one sequence feature.  The visible code:
 *   - validates the location and product SeqLocs,
 *   - checks consistency of partial flags between product, location and
 *     the feature's partial flag,
 *   - checks db_xrefs,
 *   - runs type-specific checks selected by the feature's data.choice
 *     (gene, CDS, protein, RNA, pub, import feature, BioSource, ...),
 *   - checks that exception text is accompanied by the exception flag,
 *   - flags gene cross-references that duplicate the overlapping gene.
 *
 *   gcp - gather context; gcp->userdata is the ValidStructPtr and
 *         gcp->thisitem is the SeqFeatPtr being validated
 *
 * NOTE(review): many original lines (braces, breaks, the switch header, some
 * declarations and assignments) are not visible in this excerpt; comments
 * describe only the visible code.
 */
NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp)
5702
/* message fragments for partial problems: parterr is indexed by
   product (0) / location (1), parterrs by the partial error bit tested */
static char *parterr[2] = { "PartialProduct", "PartialLocation" };
5703
static char *parterrs[4] = {
5704
"Start does not include first/last residue of sequence",
5705
"Stop does not include first/last residue of sequence",
5706
"Internal partial intervals do not include first/last residue of sequence",
5707
"Improper use of partial (greater than or less than)"
5709
Uint2 partials[2], errtype;
5719
Boolean pseudo, excpt, conflict, codonqual, anticodonqual;
5725
BioseqContextPtr bcp = NULL;
5726
BioSourcePtr biop, dbiop;
5728
OrgRefPtr orp, dorp;
5735
SeqMgrDescContext context;
5738
Boolean redundantgenexref;
5739
SeqMgrFeatContext fcontext;
5740
CharPtr syn1, syn2, label = NULL;
5741
Uint2 oldEntityID, oldItemID;
5743
vsp = (ValidStructPtr) (gcp->userdata);
5744
sfp = (SeqFeatPtr) (gcp->thisitem);
5747
type = (Int2) (sfp->data.choice);
5749
ValidateSeqLoc (vsp, sfp->location, "Location");
5751
ValidateSeqLoc (vsp, sfp->product, "Product");
5753
/* partials[0] describes the product, partials[1] the location */
partials[0] = SeqLocPartialCheck (sfp->product);
5754
partials[1] = SeqLocPartialCheck (sfp->location);
5755
if ((partials[0] != SLP_COMPLETE) || (partials[1] != SLP_COMPLETE) || (sfp->partial)) { /* partialness */
5756
/* a feature on a partial sequence should be partial -- it often isn't */
5757
if ((!sfp->partial) && (partials[1] != SLP_COMPLETE) && (sfp->location->choice == SEQLOC_WHOLE)) {
5758
ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_PartialProblem, "On partial Bioseq, SeqFeat.partial should be TRUE");
5760
/* a partial feature, with complete location, but partial product */
5761
else if ((sfp->partial) && (sfp->product != NULL) && (partials[1] == SLP_COMPLETE) && (sfp->product->choice == SEQLOC_WHOLE)
5762
&& (partials[0] != SLP_COMPLETE)) {
5763
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialProblem, "When SeqFeat.product is a partial Bioseq, SeqFeat.location should also be partial");
5765
/* gene on segmented set is now 'order', should also be partial */
5766
else if (type == SEQFEAT_GENE && sfp->product == NULL && partials[1] == SLP_INTERNAL) {
5767
if (!sfp->partial) {
5768
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialProblem, "Gene of 'order' with otherwise complete location should have partial flag set");
5771
/* inconsistent combination of partial/complete product,location,partial flag */
5772
else if (((partials[0] == SLP_COMPLETE) && (sfp->product != NULL)) || (partials[1] == SLP_COMPLETE) || (!sfp->partial)) {
5773
/* build an "Inconsistent: ..." summary in buf, one field at a time */
tmp = StringMove (buf, "Inconsistent: ");
5774
if (sfp->product != NULL) {
5775
tmp = StringMove (tmp, "Product= ");
5777
tmp = StringMove (tmp, "partial, ");
5779
tmp = StringMove (tmp, "complete, ");
5781
tmp = StringMove (tmp, "Location= ");
5783
tmp = StringMove (tmp, "partial, ");
5785
tmp = StringMove (tmp, "complete, ");
5786
tmp = StringMove (tmp, "Feature.partial= ");
5788
tmp = StringMove (tmp, "TRUE");
5790
tmp = StringMove (tmp, "FALSE");
5791
/* NOTE(review): buf is passed as the format string itself; it is assembled
   from the fixed literals above in the visible code */
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialProblem, buf);
5794
/* may have other error bits set as well */
5795
/* i selects product (0) or location (1); j counts the four partial error
   bits tested, beginning at SLP_NOSTART */
for (i = 0; i < 2; i++) {
5796
errtype = SLP_NOSTART;
5797
for (j = 0; j < 4; j++) {
5798
if (partials[i] & errtype) {
5799
/* location start/stop problems at a consensus splice site are only SEV_INFO */
if (i == 1 && j < 2 && PartialAtSpliceSite (sfp->location, errtype)) {
5800
ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_PartialProblem, "%s: %s (but is at consensus splice site)", parterr[i], parterrs[j]);
5802
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialProblem, "%s: %s", parterr[i], parterrs[j]);
5811
CheckForIllegalDbxref (vsp, gcp, sfp, sfp->dbxref);
5813
/* NOTE(review): this loop repeats the dbtag test made by CheckForIllegalDbxref
   just above; it may be disabled by lines not visible here -- confirm */
for (vnp = sfp->dbxref; vnp != NULL; vnp = vnp->next) {
5815
db = vnp->data.ptrvalue;
5817
for (i = 0; i < DBNUM; i++) {
5818
if (StringCmp (db->db, dbtag[i]) == 0) {
5823
if (id == -1 || (type != SEQFEAT_CDREGION && id < 4)) {
5824
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref, "Illegal db_xref type %s", db->db);
5831
case 1: /* Gene-ref */
5832
grp = (GeneRefPtr) (sfp->data.value.ptrvalue);
5834
/* a gene with no locus, allele, desc, maploc, db or syn carries no data */
if (EmptyOrNullString (grp->locus) &&
5835
EmptyOrNullString (grp->allele) && EmptyOrNullString (grp->desc) && EmptyOrNullString (grp->maploc) && grp->db == NULL && grp->syn == NULL) {
5836
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GeneRefHasNoData, "There is a gene feature where all fields are empty");
5838
CheckForIllegalDbxref (vsp, gcp, sfp, grp->db);
5840
/* NOTE(review): same repeated dbtag test as above, here for grp->db -- confirm */
for (vnp = grp->db; vnp != NULL; vnp = vnp->next) {
5842
db = vnp->data.ptrvalue;
5844
for (i = 0; i < DBNUM; i++) {
5845
if (StringCmp (db->db, dbtag[i]) == 0) {
5850
if (id == -1 || (type != SEQFEAT_CDREGION && id < 4)) {
5851
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref, "Illegal db_xref type %s", db->db);
5858
case 2: /* Org-ref */
5860
case 3: /* Cdregion */
5861
pseudo = sfp->pseudo; /* now also uses new feature pseudo flag */
5865
crp = (CdRegionPtr) (sfp->data.value.ptrvalue);
5867
conflict = crp->conflict;
5870
/* scan GenBank qualifiers for pseudo / exception / codon */
while (gbq != NULL) {
5871
if (StringICmp (gbq->qual, "pseudo") == 0) {
5874
if (StringICmp (gbq->qual, "exception") == 0) {
5877
if (StringICmp (gbq->qual, "codon") == 0) {
5882
if (OverlappingGeneIsPseudo (sfp)) {
5885
/* translation and splice checks are run only when neither pseudo nor conflict is set */
if ((!pseudo) && (!conflict)) {
5886
CdTransCheck (vsp, sfp);
5887
if (sfp->excpt && (StringICmp (sfp->except_text, "ribosomal slippage") == 0 ||
5888
StringICmp (sfp->except_text, "ribosome slippage") == 0 || StringICmp (sfp->except_text, "artificial frameshift") == 0)) {
5889
/* suppress splice check */
5891
SpliceCheck (vsp, sfp);
5894
crp = (CdRegionPtr) (sfp->data.value.ptrvalue);
5896
/* every code-break location must lie within (or equal) the CDS location */
for (cbp = crp->code_break; cbp != NULL; cbp = cbp->next) {
5897
i = SeqLocCompare (cbp->loc, sfp->location);
5898
if ((i != SLC_A_IN_B) && (i != SLC_A_EQ_B))
5899
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_Range, "Code-break location not in coding region");
5901
if (excpt && (!sfp->excpt)) {
5902
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ExceptInconsistent, "Exception flag should be set in coding region");
5904
if (crp->orf && sfp->product != NULL) {
5905
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_OrfCdsHasProduct, "An ORF coding region should not have a product");
5907
if (pseudo && sfp->product != NULL) {
5908
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PsuedoCdsHasProduct, "A pseudo coding region should not have a product");
5911
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_WrongQualOnImpFeat, "Use the proper genetic code, if available, or set transl_excepts on specific codons");
5915
/* compare the genetic code on the CDS with the one implied by the BioSource */
bsp = GetBioseqGivenSeqLoc (sfp->location, gcp->entityID);
5918
/* the source descriptor comes from the SeqMgr index when enabled, else from a BioseqContext */
if (vsp->useSeqMgrIndexes) {
5919
vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
5921
bcp = BioseqContextNew (bsp);
5922
vnp = BioseqContextGetSeqDescr (bcp, Seq_descr_source, NULL, NULL);
5924
if (vnp != NULL && vnp->data.ptrvalue != NULL) {
5925
biop = (BioSourcePtr) vnp->data.ptrvalue;
5927
if (orp != NULL && orp->orgname != NULL) {
5929
/* genome values 4 and 5 use mgcode; presumably the mitochondrial
   code for organelle genomes -- TODO confirm */
if (biop->genome == 4 || biop->genome == 5) {
5930
biopgencode = onp->mgcode;
5931
/* plastid-type genomes are handled in their own branch (body not visible) */
} else if (biop->genome == GENOME_chloroplast ||
5932
biop->genome == GENOME_chromoplast ||
5933
biop->genome == GENOME_plastid ||
5934
biop->genome == GENOME_cyanelle ||
5935
biop->genome == GENOME_apicoplast || biop->genome == GENOME_leucoplast || biop->genome == GENOME_proplastid) {
5938
biopgencode = onp->gcode;
5940
gc = crp->genetic_code;
5942
/* the choice == 2 node supplies the numeric genetic code of the CDS */
for (vnp = gc->data.ptrvalue; vnp != NULL; vnp = vnp->next) {
5943
if (vnp->choice == 2) {
5944
cdsgencode = (Int2) vnp->data.intvalue;
5948
if (biopgencode != cdsgencode) {
5949
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GenCodeMismatch,
5950
"Genetic code conflict between CDS (code %d) and BioSource (code %d)", (int) cdsgencode, (int) biopgencode);
5954
/* the BioseqContext is only created on the non-indexed path, so free it there */
if (!vsp->useSeqMgrIndexes) {
5955
BioseqContextFree (bcp);
5959
CheckForBothStrands (vsp, sfp);
5960
CheckForBadGeneOverlap (vsp, sfp);
5961
CheckForBadMRNAOverlap (vsp, sfp);
5962
CheckForCommonCDSProduct (vsp, sfp);
5964
case 4: /* Prot-ref */
5965
prp = (ProtRefPtr) (sfp->data.value.ptrvalue);
5967
/* the empty-protein test skips processed values 3 and 4 */
if (prp->processed != 3 && prp->processed != 4) {
5969
if ((vnp == NULL || EmptyOrNullString ((CharPtr) vnp->data.ptrvalue)) &&
5970
EmptyOrNullString (prp->desc) && prp->ec == NULL && prp->activity == NULL && prp->db == NULL) {
5971
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ProtRefHasNoData, "There is a protein feature where all fields are empty");
5974
CheckForIllegalDbxref (vsp, gcp, sfp, prp->db);
5976
/* NOTE(review): same repeated dbtag test as above, here for prp->db -- confirm */
for (vnp = prp->db; vnp != NULL; vnp = vnp->next) {
5978
db = vnp->data.ptrvalue;
5980
for (i = 0; i < DBNUM; i++) {
5981
if (StringCmp (db->db, dbtag[i]) == 0) {
5986
if (id == -1 || (type != SEQFEAT_CDREGION && id < 4)) {
5987
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref, "Illegal db_xref type %s", db->db);
5994
case 5: /* RNA-ref */
5995
rrp = (RnaRefPtr) (sfp->data.value.ptrvalue);
5996
if (rrp->type == 2) { /* mRNA */
5997
pseudo = sfp->pseudo;
5998
if (OverlappingGeneIsPseudo (sfp)) {
6002
MrnaTransCheck (vsp, sfp); /* transcription check */
6003
if (sfp->excpt && (StringICmp (sfp->except_text, "artificial frameshift") == 0)) {
6004
/* suppress splice check */
6006
SpliceCheck (vsp, sfp);
6009
CheckForBothStrands (vsp, sfp);
6010
CheckForBadGeneOverlap (vsp, sfp);
6011
CheckForCommonMRNAProduct (vsp, sfp);
6013
if (rrp->ext.choice == 2) { /* tRNA */
6014
trp = (tRNAPtr) (rrp->ext.value.ptrvalue);
6015
/* the anticodon must lie within the tRNA and be exactly 3 bases long */
if (trp->anticodon != NULL) {
6016
i = SeqLocCompare (trp->anticodon, sfp->location);
6017
if ((i != SLC_A_IN_B) && (i != SLC_A_EQ_B)) {
6018
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_Range, "Anticodon location not in tRNA");
6020
if (SeqLocLen (trp->anticodon) != 3) {
6021
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_Range, "Anticodon is not 3 bases in length");
6024
CheckTrnaCodons (vsp, gcp, sfp, trp);
6026
if (rrp->type == 3) { /* tRNA */
6027
anticodonqual = FALSE;
6029
/* an "anticodon" GenBank qualifier still present means it was never parsed */
while (gbq != NULL) {
6030
if (StringICmp (gbq->qual, "anticodon") == 0) {
6031
anticodonqual = TRUE;
6035
if (anticodonqual) {
6036
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Unparsed anticodon qualifier in tRNA");
6039
if (rrp->type == 3 && rrp->ext.choice == 1) { /* tRNA with string extension */
6040
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Unparsed product qualifier in tRNA");
6042
if (rrp->type == 0) {
6043
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_RNAtype0, "RNA type 0 (unknown) not supported");
6047
/* publication feature: validate the Pubdesc */
pdp = (PubdescPtr) sfp->data.value.ptrvalue;
6049
ValidatePubdesc (vsp, pdp);
6054
case 8: /* Imp-feat */
6055
ifp = (ImpFeatPtr) sfp->data.value.ptrvalue;
6056
/* exon splice checking runs only when the "ValidateExons" app property is set */
if (GetAppProperty ("ValidateExons") != NULL) {
6058
if (ifp != NULL && StringICmp (ifp->key, "exon") == 0) {
6059
SpliceCheckEx (vsp, sfp, TRUE);
6063
ValidateImpFeat (vsp, gcp, sfp, ifp);
6066
case 9: /* Region */
6068
case 10: /* Comment */
6074
case 13: /* Rsite-ref */
6076
case 14: /* User-object */
6078
case 15: /* TxInit */
6080
case 16: /* Numbering */
6082
case 17: /* Secondary Structure */
6084
case 18: /* NonStdRes */
6086
case 19: /* Heterogen */
6088
case 20: /* BioSource */
6089
biop = (BioSourcePtr) sfp->data.value.ptrvalue;
6090
if (biop != NULL && biop->is_focus) {
6091
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_FocusOnBioSourceFeature, "Focus must be on BioSource descriptor, not BioSource feature.");
6096
/* compare this BioSource feature with the BioSource descriptor on its Bioseq */
bsp = GetBioseqGivenSeqLoc (sfp->location, gcp->entityID);
6098
vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
6100
dbiop = (BioSourcePtr) vnp->data.ptrvalue;
6101
if (dbiop != NULL) {
6104
if (!StringHasNoText (orp->taxname)) {
6105
if (StringICmp (orp->taxname, dorp->taxname) != 0) {
6106
if (!dbiop->is_focus) {
6107
/* retarget gcp at the descriptor so the error is attributed to it,
   then restore the feature's IDs afterwards */
oldEntityID = gcp->entityID;
6108
oldItemID = gcp->itemID;
6110
gcp->entityID = context.entityID;
6111
gcp->itemID = context.itemID;
6112
gcp->thistype = OBJ_SEQDESC;
6114
ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_BioSourceNeedsFocus,
6115
"BioSource descriptor must have focus when BioSource feature with different taxname is present.");
6117
gcp->entityID = oldEntityID;
6118
gcp->itemID = oldItemID;
6119
gcp->thistype = OBJ_SEQFEAT;
6130
ValidateBioSource (vsp, gcp, biop);
6134
/* error for a feature type with no case of its own (label not visible here) */
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidType, "Invalid SeqFeat type [%d]", (int) (type));
6137
if ((! sfp->excpt) && (! StringHasNoText (sfp->except_text))) {
6138
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ExceptInconsistent, "Exception text is set, but exception flag is not set");
6140
/* for non-gene features, detect a gene xref that merely repeats the overlapping gene */
if (type != SEQFEAT_GENE) {
6141
grp = SeqMgrGetGeneXref (sfp);
6142
if (grp == NULL || SeqMgrGeneIsSuppressed (grp))
6144
sfpx = SeqMgrGetOverlappingGene (sfp->location, &fcontext);
6145
if (sfpx == NULL || sfpx->data.choice != SEQFEAT_GENE)
6147
grpx = (GeneRefPtr) sfpx->data.value.ptrvalue;
6150
redundantgenexref = FALSE;
6151
label = fcontext.label;
6152
/* compare by locus when both have one; otherwise by the first synonym of each */
if ((!StringHasNoText (grp->locus)) && (!StringHasNoText (grpx->locus))) {
6153
if ((StringICmp (grp->locus, grpx->locus) == 0)) {
6154
redundantgenexref = TRUE;
6157
} else if (grp->syn != NULL && grpx->syn != NULL) {
6158
syn1 = (CharPtr) grp->syn->data.ptrvalue;
6159
syn2 = (CharPtr) grpx->syn->data.ptrvalue;
6160
if ((!StringHasNoText (syn1)) && (!StringHasNoText (syn2))) {
6161
if ((StringICmp (syn1, syn2) == 0)) {
6162
redundantgenexref = TRUE;
6167
if (redundantgenexref) {
6168
if (StringHasNoText (label)) {
6171
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnnecessaryGeneXref, "Unnecessary gene cross-reference %s", label);
6177
/*****************************************************************************
6179
* MrnaTransCheck (vsp, sfp)
6181
*****************************************************************************/
6183
/*
 * MrnaTransCheck
 *
 * Compares the sequence under an mRNA feature with the sequence of its
 * product Bioseq, reporting a length difference (ERR_SEQ_FEAT_TranscriptLen)
 * or a count of mismatching bases (ERR_SEQ_FEAT_TranscriptMismatches).
 *
 *   vsp - validator state, passed to ValidErr
 *   sfp - the mRNA feature; its product must be set
 *
 * NOTE(review): several original lines (returns, braces, the setup of
 * ptr1/ptr2 and the freeing of the sequences) are not visible in this excerpt.
 */
NLM_EXTERN void MrnaTransCheck (ValidStructPtr vsp, SeqFeatPtr sfp)
6185
Int4 mismatch, total;
6186
CharPtr mrseq, pdseq;
6199
if (sfp->product == NULL)
6202
sip = SeqLocId (sfp->product);
6206
/* sequence under the mRNA feature's own location */
mrseq = GetSequenceByFeature (sfp);
6210
/* coerced feature on whole product for GetSequenceByFeature */
6212
MemSet ((Pointer) &sf, 0, sizeof (SeqFeat));
6213
MemSet ((Pointer) &vn, 0, sizeof (ValNode));
6215
/* vn becomes a SEQLOC_WHOLE location on the product's SeqId */
vn.choice = SEQLOC_WHOLE;
6216
vn.data.ptrvalue = sip;
6218
pdseq = GetSequenceByFeature (&sf);
6219
if (pdseq != NULL) {
6220
mlen = StringLen (mrseq);
6221
plen = StringLen (pdseq);
6223
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_TranscriptLen, "Transcript length [%ld] does not match product length [%ld]", (long) mlen, (long) plen);
6224
/* lengths are handled above; the strings are first compared ignoring case */
} else if (mlen > 0 && StringICmp (mrseq, pdseq) != 0) {
6229
/* count the positions that differ */
while (total < mlen) {
6230
/* NOTE(review): this per-character test is case-sensitive, while the
   StringICmp test that leads here is not -- confirm this is intended */
if (*ptr1 != *ptr2) {
6237
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_TranscriptMismatches,
6238
"There are %ld mismatches out of %ld bases between the transcript and product sequence", (long) mismatch, (long) total);
6246
/*****************************************************************************
6249
* Treatment of terminal 'X'
6250
* If either the protein or the translation ends in 'X' (usually
6251
* due to partial last codon) it is ignored to minimize conflicts
6252
* between approaches to add the X or not in this case.
6254
*****************************************************************************/
6255
/*
 * MapToNTCoords
 *
 * Builds a point location on a protein, maps it onto the nucleotide through
 * the CDS feature, and renders the mapped location as text.
 *
 *   sfp    - CDS feature used for the protein-to-nucleotide mapping
 *   protID - SeqId of the protein; duplicated into the point location
 *   pos    - protein position; must be >= 0
 *
 * The printed string is held in rsult; CdTransCheck passes the value it gets
 * from this function to MemFree.
 * NOTE(review): the lines that set the point position, free the temporary
 * locations and return are not visible in this excerpt.
 */
static CharPtr MapToNTCoords (SeqFeatPtr sfp, SeqIdPtr protID, Int4 pos)
6263
if (sfp != NULL && protID != NULL && pos >= 0) {
6264
/* wrap a new SeqPnt in a SEQLOC_PNT ValNode */
spntp = SeqPntNew ();
6265
pslp = ValNodeNew (NULL);
6266
pslp->choice = SEQLOC_PNT;
6267
pslp->data.ptrvalue = (Pointer) spntp;
6269
spntp->id = SeqIdDup (protID);
6270
/* map the protein point onto nucleotide coordinates */
nslp = aaLoc_to_dnaLoc (sfp, pslp);
6272
rsult = SeqLocPrint (nslp);
6280
/*
 * Loc_is_RefSeq
 *
 * Finds the Bioseq a location refers to and scans its SEQID_OTHER ids for an
 * accession beginning with "NM_" (compared ignoring case).
 *
 *   location - location whose Bioseq is examined
 *
 * NOTE(review): the return statements and any NULL checks on sip, bsp and
 * tsip are not visible in this excerpt.
 */
static Boolean Loc_is_RefSeq (SeqLocPtr location)
6286
if (location == NULL)
6288
sip = SeqLocId (location);
6291
bsp = BioseqFind (sip);
6294
for (sip = bsp->id; sip != NULL; sip = sip->next) {
6295
if (sip->choice == SEQID_OTHER) {
6296
tsip = (TextSeqIdPtr) sip->data.ptrvalue;
6298
if (StringNICmp (tsip->accession, "NM_", 3) == 0) {
6307
/*
 * CdTransCheck
 *
 * Translates a coding region and compares the translation with the protein
 * product.  The visible code reports: translation failure, transl_except
 * qualifiers that are out of frame or unparsed, suspicious frame / partial
 * combinations, internal stop codons, illegal start codons, a missing protein
 * Bioseq, residue mismatches, length differences, missing or unexpected stop
 * codons, and bases extending past the stop codon.
 *
 *   vsp - validator state, passed to ValidErr; vsp->sep is used for
 *         record-type tests
 *   sfp - the CDS feature
 *
 * NOTE(review): many original lines (braces, returns, some declarations and
 * assignments) are not visible in this excerpt; comments describe only the
 * visible code.
 */
NLM_EXTERN void CdTransCheck (ValidStructPtr vsp, SeqFeatPtr sfp)
6309
ByteStorePtr newprot = NULL;
6310
BioseqPtr prot1seq = NULL, prot2seq = NULL;
6311
Int4 prot1len = 0, prot2len, i, len;
6313
SeqIdPtr protid = NULL;
6314
Int2 residue1, residue2, stop_count = 0, mismatch = 0, ragged = 0;
6315
Boolean got_stop = FALSE;
6316
SeqPortPtr spp = NULL;
6317
Uint2 part_loc = 0, part_prod = 0;
6318
Boolean no_end = FALSE, no_beg = FALSE, show_stop = FALSE, got_dash = FALSE, done;
6320
ValNodePtr vnp, code;
6322
Boolean transl_except = FALSE, prot_ok = TRUE, is_nc = FALSE;
6325
Int4 pos1, pos2, pos;
6334
/* exceptions other than ribosomal/ribosome slippage are tested here */
if (sfp->excpt && StringICmp (sfp->except_text, "ribosomal slippage") != 0 && StringICmp (sfp->except_text, "ribosome slippage") != 0) /* biological exception */
6337
for (gb = sfp->qual; gb != NULL; gb = gb->next) { /* pseudogene */
6338
if (!StringICmp ("pseudo", gb->qual))
6342
crp = (CdRegionPtr) (sfp->data.value.ptrvalue);
6343
if (crp->code_break == NULL) { /* check for unparsed transl_except */
6344
for (gb = sfp->qual; gb != NULL; gb = gb->next) {
6345
if (!StringCmp (gb->qual, "transl_except")) {
6346
transl_except = TRUE;
6352
/* determine the numeric genetic code (gccode) for use in messages */
if (crp->genetic_code != NULL) {
6353
for (vnp = crp->genetic_code->data.ptrvalue; ((vnp != NULL) && (!gccode)); vnp = vnp->next) {
6354
switch (vnp->choice) {
6358
/* a code given by name is looked up, then its id node is read */
code = GeneticCodeFind (0, (CharPtr) (vnp->data.ptrvalue));
6360
for (vnp = code->data.ptrvalue; ((vnp != NULL) && (!gccode)); vnp = vnp->next) {
6361
if (vnp->choice == 2) /* id */
6362
gccode = (int) (vnp->data.intvalue);
6367
gccode = (int) (vnp->data.intvalue);
6376
newprot = ProteinFromCdRegionEx (sfp, TRUE, FALSE); /* include stop codons, do not remove trailing X/B/Z */
6377
if (newprot == NULL) {
6378
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_CdTransFail, "Unable to translate");
6383
part_loc = SeqLocPartialCheck (sfp->location);
6384
part_prod = SeqLocPartialCheck (sfp->product);
6385
if ((part_loc & SLP_STOP) || (part_prod & SLP_STOP))
6387
else { /* complete stop, so check for ragged end */
6389
/* ragged = bases left over after the frame offset and whole codons */
len = SeqLocLen (sfp->location);
6391
len -= (Int4) (crp->frame - 1);
6392
ragged = (Int2) (len % (Int4) (3));
6394
len = SeqLocLen (sfp->location);
6395
cbp = crp->code_break;
6396
while (cbp != NULL) {
6400
while ((tmp = SeqLocFindNext (cbp->loc, tmp)) != NULL) {
6401
pos = GetOffsetInLoc (tmp, sfp->location, SEQLOC_START);
6404
pos = GetOffsetInLoc (tmp, sfp->location, SEQLOC_STOP);
6408
pos = pos2 - pos1; /* codon length */
6409
/* a one- or two-base code break ending on the last base of the CDS */
if (pos >= 0 && pos <= 1 && pos2 == len - 1)
6411
/* allowing a partial codon at the end */
6420
/* check for code break not on a codon */
6421
len = SeqLocLen (sfp->location);
6422
cbp = crp->code_break;
6423
while (cbp != NULL) {
6427
while ((tmp = SeqLocFindNext (cbp->loc, tmp)) != NULL) {
6428
pos = GetOffsetInLoc (tmp, sfp->location, SEQLOC_START);
6431
pos = GetOffsetInLoc (tmp, sfp->location, SEQLOC_STOP);
6435
pos = pos2 - pos1; /* codon length */
6436
/* check for code break not on a codon */
6437
if (pos == 2 || (pos >= 0 && pos <= 1 && pos2 == len - 1)) {
6438
/* pos is reused here; presumably set to the expected phase from the frame -- bodies not visible */
if (crp->frame == 2)
6440
else if (crp->frame == 3)
6444
if ((pos1 % 3) != pos) {
6445
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_TranslExceptPhase, "transl_except qual out of frame.");
6453
/* a frame above 1 needs a 5' partial start, or a start at a consensus splice site */
if (crp->frame > 1) {
6454
if (!(part_loc & SLP_START)) {
6456
if (Loc_is_RefSeq (sfp->location)) {
6459
ValidErr (vsp, sev, ERR_SEQ_FEAT_PartialProblem, "Suspicious CDS location - frame > 1 but not 5' partial");
6460
} else if ((part_loc & SLP_NOSTART) && (!PartialAtSpliceSite (sfp->location, SLP_NOSTART))) {
6462
if (Loc_is_RefSeq (sfp->location)) {
6465
ValidErr (vsp, sev, ERR_SEQ_FEAT_PartialProblem, "Suspicious CDS location - frame > 1 and not at consensus splice site");
6469
if ((part_loc & SLP_START) || (part_prod & SLP_START))
6472
prot2len = BSLen (newprot);
6474
/* scan the translation, testing for a leading '-' and for '*' residues */
BSSeek (newprot, 0, SEEK_SET);
6475
for (i = 0; i < len; i++) {
6476
residue1 = BSGetByte (newprot);
6477
if ((i == 0) && (residue1 == '-'))
6479
if (residue1 == '*') {
6489
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_StartCodon,
6490
"Illegal start codon and %ld internal stops. Probably wrong genetic code [%d]", (long) stop_count, gccode);
6492
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InternalStop, "%ld internal stops. Genetic code [%d]", (long) stop_count, gccode);
6496
} else if (got_dash) {
6497
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_StartCodon, "Illegal start codon used. Wrong genetic code [%d] or protein should be partial", gccode);
6502
/* locate the protein product Bioseq */
protid = SeqLocId (sfp->product);
6503
if (protid != NULL) {
6504
prot1seq = BioseqFind (protid);
6505
if (prot1seq != NULL)
6506
prot1len = prot1seq->length;
6509
if (prot1seq == NULL) {
6511
/* severity of the NoProtein message depends on the record type tested below */
if (! NGorNT (vsp->sep, sfp->location, &is_nc)) {
6513
if (DeltaOrFarSeg (vsp->sep, sfp->location)) {
6519
if (sep != NULL && IS_Bioseq (sep)) {
6523
if (sev != SEV_NONE) {
6524
ValidErr (vsp, sev, ERR_SEQ_FEAT_NoProtein, "No protein Bioseq given");
6533
if ((got_stop) && (len == (prot1len + 1))) { /* ok, got stop */
6537
spp = SeqPortNew (prot1seq, 0, -1, 0, Seq_code_ncbieaa);
6541
/* ignore terminal 'X' from partial last codon if present */
6544
/* look at the last residue of the given protein */
while ((!done) && (prot1len)) {
6545
SeqPortSeek (spp, (prot1len - 1), SEEK_SET);
6546
residue1 = SeqPortGetResidue (spp);
6547
if (residue1 == 'X') /* remove terminal X */
6553
/* look at the last residue of the translation */
while ((!done) && (len)) {
6554
BSSeek (newprot, (len - 1), SEEK_SET);
6555
residue2 = BSGetByte (newprot);
6556
if (residue2 == 'X')
6562
if (len == prot1len) { /* could be identical */
6563
SeqPortSeek (spp, 0, SEEK_SET);
6564
BSSeek (newprot, 0, SEEK_SET);
6565
/* residue1 = translated residue, residue2 = residue of the given protein */
for (i = 0; i < len; i++) {
6566
residue1 = BSGetByte (newprot);
6567
residue2 = SeqPortGetResidue (spp);
6568
if (residue1 != residue2) {
6570
if (residue2 == INVALID_RESIDUE)
6572
/* a summary message is issued when the count reaches 10 */
if (mismatch == 10) {
6573
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_MisMatchAA, "More than 10 mismatches. Genetic code [%d]", gccode);
6575
} else if (i == 0) {
6576
if ((sfp->partial) && (!no_beg) && (!no_end)) /* ok, it's partial */
6577
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_PartialProblem, "Start of location should probably be partial");
6578
else if (residue1 == '-')
6579
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_StartCodon, "Illegal start codon used. Wrong genetic code [%d] or protein should be partial", gccode);
6581
/* add the nucleotide position to the message when it can be mapped */
nuclocstr = MapToNTCoords (sfp, protid, i);
6582
if (nuclocstr != NULL) {
6583
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_MisMatchAA,
6584
"Residue %ld in protein [%c] != translation [%c] at %s", (long) (i + 1), (char) residue2, (char) residue1, nuclocstr);
6586
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_MisMatchAA,
6587
"Residue %ld in protein [%c] != translation [%c]", (long) (i + 1), (char) residue2, (char) residue1);
6589
MemFree (nuclocstr);
6592
nuclocstr = MapToNTCoords (sfp, protid, i);
6593
if (nuclocstr != NULL) {
6594
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_MisMatchAA,
6595
"Residue %ld in protein [%c] != translation [%c] at %s", (long) (i + 1), (char) residue2, (char) residue1, nuclocstr);
6597
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_MisMatchAA,
6598
"Residue %ld in protein [%c] != translation [%c]", (long) (i + 1), (char) residue2, (char) residue1);
6600
MemFree (nuclocstr);
6605
spp = SeqPortFree (spp);
6607
/* NOTE(review): prot1len and len are Int4 but are passed to %ld without the
   (long) cast used by the other messages in this function -- confirm and fix */
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_TransLen, "Given protein length [%ld] does not match translation length [%ld]", prot1len, len);
6610
/* feature is flagged partial and the translation matched */
if ((sfp->partial) && (!mismatch)) {
6611
if ((!no_beg) && (!no_end)) { /* just didn't label */
6613
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_PartialProblem, "End of location should probably be partial");
6615
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_PartialProblem, "This SeqFeat should not be partial");
6624
/* stop codon presence versus the 3' partial state */
if ((!got_stop) && (!no_end)) {
6625
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_NoStop, "Missing stop codon");
6626
} else if ((got_stop) && (no_end)) {
6627
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_PartialProblem, "Got stop codon, but 3'end is labeled partial");
6628
} else if ((got_stop) && (!no_end) && (ragged)) {
6629
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_TransLen, "Coding region extends %d base(s) past stop codon", (int) ragged);
6635
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_TranslExcept, "Unparsed transl_except qual. Skipped");
6638
if (prot2seq != NULL)
6639
BioseqFree (prot2seq);
6646
/*****************************************************************************
6649
* checks for GT/AG rule at splice junctions
6651
*****************************************************************************/
6656
/*
 * SpliceCheckEx: inspects the intervals of sfp->location and reports, via
 * ValidErr, exon boundaries whose flanking bases are not the GT donor /
 * AG acceptor consensus (a GC donor gets its own "rare" warning).
 *   vsp      - validation context handed to ValidErr; vsp->suppressContext
 *              is tested before building the Bioseq label for a message
 *   sfp      - feature whose location is examined
 *   checkAll - takes part in the donor/acceptor conditions together with
 *              lastPartial/firstPartial and selects SEV_WARNING in the
 *              severity chain
 * Residues are read through a SeqPort opened in Seq_code_ncbi4na, so the
 * bit tests below are masks on 4-bit nucleotide codes.
 */
static void SpliceCheckEx (ValidStructPtr vsp, SeqFeatPtr sfp, Boolean checkAll)
6658
SeqLocPtr slp, nxt, head;
6659
Uint1 strand = Seq_strand_unknown;
6660
SeqPortPtr spp = NULL;
6661
SeqIdPtr last_sip = NULL, sip, id;
6663
BioseqPtr bsp = NULL;
6664
Int4 strt, stp, len = 0, donor, acceptor;
6665
Int2 residue1, residue2;
6667
Boolean reportAsError, first, last, firstPartial, lastPartial;
6670
Boolean gpsOrNTorNC = FALSE;
6678
if (GetAppProperty ("NcbiSubutilValidation") != NULL)
6679
return; /* suppress if NCBISubValidate */
6682
if (sfp->excpt) /* biological exception */
6685
head = sfp->location;
6689
reportAsError = FALSE;
6690
/* the "SpliceValidateAsError" application property turns reportAsError on */
if (GetAppProperty ("SpliceValidateAsError") != NULL) {
6691
reportAsError = TRUE;
6696
/* first pass over the location parts: equiv and strand screening */
while ((slp = SeqLocFindPart (head, slp, EQUIV_IS_ONE)) != NULL) {
6698
if (slp->choice == SEQLOC_EQUIV)
6699
return; /* bail on this one */
6701
strand = SeqLocStrand (slp);
6703
if (strand != SeqLocStrand (slp)) /* bail on mixed strand */
6708
if ((!checkAll) && total < 2)
6718
firstPartial = FALSE;
6719
lastPartial = FALSE;
6721
/* genomic product set or NT_ contig always relaxes to SEV_WARNING */
6724
if (sep != NULL && IS_Bioseq_set (sep)) {
6725
bssp = (BioseqSetPtr) sep->data.ptrvalue;
6726
if (bssp != NULL && bssp->_class == BioseqseqSet_class_gen_prod_set) {
6731
/* second pass: visit each part, looking one part ahead to detect the last */
slp = SeqLocFindPart (head, slp, EQUIV_IS_ONE);
6732
while (slp != NULL) {
6733
nxt = SeqLocFindPart (head, slp, EQUIV_IS_ONE);
6734
last = (Boolean) (nxt == NULL);
6735
partialflag = SeqLocPartialCheck (slp);
6736
firstPartial = (Boolean) (first && (partialflag & SLP_START));
6737
lastPartial = (Boolean) (last && (partialflag & SLP_STOP));
6739
sip = SeqLocId (slp);
6743
/* genomic product set or NT_ contig always relaxes to SEV_WARNING */
6744
bsp = BioseqFind (sip);
6746
/* scan the Bioseq ids for a SEQID_OTHER accession starting with NT_ or NC_ */
for (id = bsp->id; id != NULL; id = id->next) {
6747
if (id->choice == SEQID_OTHER) {
6748
tsip = (TextSeqIdPtr) id->data.ptrvalue;
6749
if (tsip != NULL && tsip->accession != NULL) {
6750
if (StringNICmp (tsip->accession, "NT_", 3) == 0) {
6752
} else if (StringNICmp (tsip->accession, "NC_", 3) == 0) {
6760
/* reopen the SeqPort on the first part or whenever the Seq-id changes */
if ((ctr == 1) || (!SeqIdMatch (sip, last_sip))) {
6761
spp = SeqPortFree (spp);
6762
bsp = BioseqLockById (sip);
6766
spp = SeqPortNew (bsp, 0, -1, strand, Seq_code_ncbi4na);
6772
acceptor = SeqLocStart (slp);
6773
donor = SeqLocStop (slp);
6775
if (strand != Seq_strand_minus) {
6782
stp = len - donor - 1; /* orient to reverse complement seqport */
6783
strt = len - acceptor - 1;
6786
if (((checkAll && (!lastPartial)) || ctr < total) && (stp < (len - 2))) { /* check donor on all but last exon and on sequence */
6787
/* read the two residues that immediately follow the exon end */
SeqPortSeek (spp, (stp + 1), SEEK_SET);
6788
residue1 = SeqPortGetResidue (spp);
6789
residue2 = SeqPortGetResidue (spp);
6790
if (IS_residue (residue1) && IS_residue (residue2)) {
6791
if ((!(residue1 & 4)) || /* not G or */
6792
(!(residue2 & 8))) { /* not T */
6793
if ((residue1 & 4) && (residue2 & 2)) { /* GC minor splice site */
6795
if (vsp->suppressContext) {
6796
WorstBioseqLabel (bsp, tbuf, 39, OM_LABEL_CONTENT);
6798
BioseqLabel (bsp, tbuf, 39, OM_LABEL_CONTENT);
6800
ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_NotSpliceConsensus,
6801
"Rare splice donor consensus (GC) found instead of (GT) after exon ending at position %ld of %s", (long) (donor + 1), tbuf);
6804
severity = SEV_WARNING;
6805
} else if (checkAll) {
6806
severity = SEV_WARNING;
6807
} else if (reportAsError) {
6808
severity = SEV_ERROR;
6810
severity = SEV_WARNING;
6813
if (vsp->suppressContext) {
6814
WorstBioseqLabel (bsp, tbuf, 39, OM_LABEL_CONTENT);
6816
BioseqLabel (bsp, tbuf, 39, OM_LABEL_CONTENT);
6818
ValidErr (vsp, severity, ERR_SEQ_FEAT_NotSpliceConsensus,
6819
"Splice donor consensus (GT) not found after exon ending at position %ld of %s", (long) (donor + 1), tbuf);
6825
/* acceptor check: needs at least two bases before the exon start */
if (((checkAll && (!firstPartial)) || ctr != 1) && (strt > 1)) {
6826
/* read the two residues that immediately precede the exon start */
SeqPortSeek (spp, (strt - 2), SEEK_SET);
6827
residue1 = SeqPortGetResidue (spp);
6828
residue2 = SeqPortGetResidue (spp);
6829
if (IS_residue (residue1) && IS_residue (residue2)) {
6830
if ((!(residue1 & 1)) || /* not A or */
6831
(!(residue2 & 4))) { /* not G */
6833
severity = SEV_WARNING;
6834
} else if (checkAll) {
6835
severity = SEV_WARNING;
6836
} else if (reportAsError) {
6837
severity = SEV_ERROR;
6839
severity = SEV_WARNING;
6842
if (vsp->suppressContext) {
6843
WorstBioseqLabel (bsp, tbuf, 39, OM_LABEL_CONTENT);
6845
BioseqLabel (bsp, tbuf, 39, OM_LABEL_CONTENT);
6847
ValidErr (vsp, severity, ERR_SEQ_FEAT_NotSpliceConsensus,
6848
"Splice acceptor consensus (AG) not found before exon starting at position %ld of %s", (long) (acceptor + 1), tbuf);
6860
/* SpliceCheck: exported entry point; forwards vsp and sfp to SpliceCheckEx with checkAll = FALSE. */
NLM_EXTERN void SpliceCheck (ValidStructPtr vsp, SeqFeatPtr sfp)
6862
SpliceCheckEx (vsp, sfp, FALSE);
6865
/*****************************************************************************
6867
* ValidateSeqLoc(vsp, slp, prefix)
6869
*****************************************************************************/
6870
/*
 * ValidateSeqLoc: walks the parts of slp with SeqLocFindNext, runs
 * SeqIntCheck / SeqPntCheck / PackSeqPntCheck on them, and reports through
 * ValidErr: out-of-range parts (SEV_REJECT), duplicate intervals, adjacent
 * intervals, mixed strands and out-of-order intervals.
 *   vsp    - validation context; when vsp->sfp is set, its feature type is
 *            consulted before some of the messages are issued
 *   slp    - location to validate
 *   prefix - text placed at the start of most messages
 * When the Bioseq found for slp is segmented (Seq_repr_seg), the ordering
 * and strand checks are repeated with SeqLocBadSortOrder and
 * SeqLocMixedStrands.
 */
NLM_EXTERN void ValidateSeqLoc (ValidStructPtr vsp, SeqLocPtr slp, CharPtr prefix)
6872
SeqLocPtr tmp, prev;
6873
Boolean retval = TRUE, tmpval, mixed_strand = FALSE, ordered = TRUE, adjacent = FALSE, circular = FALSE;
6875
Uint1 strand2, strand1;
6876
SeqIntPtr sip1, sip2, prevsip;
6879
SeqIdPtr id1 = NULL, id2;
6886
bsp = BioseqFindFromSeqLoc (slp);
6887
/* NOTE(review): topology == 2 presumably means circular (cf. the `circular` flag) - confirm */
if (bsp != NULL && bsp->topology == 2) {
6895
strand1 = Seq_strand_other;
6896
while ((tmp = SeqLocFindNext (slp, tmp)) != NULL) {
6898
switch (tmp->choice) {
6901
sip2 = (SeqIntPtr) (tmp->data.ptrvalue);
6902
strand2 = sip2->strand;
6904
tmpval = SeqIntCheck (sip2);
6905
/* compare this interval with the previous one on the same Bioseq; skipped for circular molecules */
if ((tmpval) && (sip1 != NULL) && (ordered) && (! circular)) {
6906
if (SeqIdForSameBioseq (sip1->id, sip2->id)) {
6907
if (strand2 == Seq_strand_minus) {
6908
if (sip1->to < sip2->to)
6910
if (sip2->to + 1 == sip1->from)
6913
if (sip1->to > sip2->to)
6915
if (sip1->to + 1 == sip2->from)
6920
/* identical strand/from/to on the same Bioseq as the previous interval is a duplicate */
if (prevsip != NULL) {
6921
if (SeqIdForSameBioseq (prevsip->id, sip2->id)) {
6922
if (prevsip->strand == sip2->strand && prevsip->from == sip2->from && prevsip->to == sip2->to) {
6923
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_DuplicateInterval, "Duplicate exons in location");
6930
spp = (SeqPntPtr) (tmp->data.ptrvalue);
6931
strand2 = spp->strand;
6933
tmpval = SeqPntCheck (spp);
6936
case SEQLOC_PACKED_PNT:
6937
pspp = (PackSeqPntPtr) (tmp->data.ptrvalue);
6938
strand2 = pspp->strand;
6940
tmpval = PackSeqPntCheck (pspp);
6946
strand2 = Seq_strand_other;
6953
ctmp = SeqLocPrint (tmp);
6954
ValidErr (vsp, SEV_REJECT, ERR_SEQ_FEAT_Range, "%s: SeqLoc [%s] out of range", prefix, ctmp);
6959
/* strand comparison between consecutive parts on the same Bioseq */
if (tmp->choice != SEQLOC_NULL) {
6960
if ((strand1 != Seq_strand_other) && (strand2 != Seq_strand_other)) {
6961
if (SeqIdForSameBioseq (id1, id2)) {
6962
if (strand1 != strand2)
6963
mixed_strand = TRUE;
6972
if (vsp->sfp != NULL) {
6974
/* Publication intervals ordering does not matter */
6976
if (vsp->sfp->idx.subtype == FEATDEF_PUB) {
6981
/* ignore ordering of heterogen bonds */
6983
if (vsp->sfp->data.choice == SEQFEAT_HET) {
6988
/* misc_recomb intervals SHOULD be in reverse order */
6990
if (vsp->sfp->idx.subtype == FEATDEF_misc_recomb) {
6994
/* primer_bind intervals MAY be on opposite strands */
6996
if (vsp->sfp->idx.subtype == FEATDEF_primer_bind) {
6997
mixed_strand = FALSE;
7003
ctmp = SeqLocPrint (slp);
7004
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_AbuttingIntervals, "%s: Adjacent intervals in SeqLoc [%s]", prefix, ctmp);
7008
if (vsp->sfp != NULL) {
7011
/* trans splicing exception turns off both mixed_strand and out_of_order messages */
7012
if (StringStr (sfp->except_text, "trans splicing") != NULL || StringStr (sfp->except_text, "trans-splicing") != NULL) {
7018
if ((mixed_strand) || (!ordered)) {
7019
ctmp = SeqLocPrint (slp);
7021
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_MixedStrand, "%s: Mixed strands in SeqLoc [%s]", prefix, ctmp);
7024
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_SeqLocOrder, "%s: Intervals out of order in SeqLoc [%s]", prefix, ctmp);
7029
if (vsp->sfp != NULL) {
7031
/* ignore special case features here as well */
7033
if (vsp->sfp->idx.subtype == FEATDEF_PUB ||
7034
vsp->sfp->data.choice == SEQFEAT_HET ||
7035
vsp->sfp->idx.subtype == FEATDEF_misc_recomb ||
7036
vsp->sfp->idx.subtype == FEATDEF_primer_bind)
7040
/* newer check for intervals out of order on segmented bioseq */
7042
if (bsp == NULL || bsp->repr != Seq_repr_seg) return;
7044
if (SeqLocBadSortOrder (bsp, slp)) {
7045
ctmp = SeqLocPrint (slp);
7046
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_SeqLocOrder, "%s: Intervals out of order in SeqLoc [%s]", prefix, ctmp);
7050
/* newer check for mixed strand on segmented bioseq */
7052
if (SeqLocMixedStrands (bsp, slp)) {
7053
ctmp = SeqLocPrint (slp);
7054
ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_MixedStrand, "%s: Mixed strands in SeqLoc [%s]", prefix, ctmp);
7059
/*****************************************************************************
7061
* SeqGraph validation section
7063
*****************************************************************************/
7065
/* GphGetData: state handed to GetGraphsProc through gop->userdata; the callback reads its bsp member and appends to its vnp list. */
typedef struct gphgetdata
7070
GphGetData , PNTR GphGetPtr;
7072
/* GrphItem: one collected Seq-graph entry; code in this file uses its left, right and index members. */
typedef struct grphitem
7079
GrphItem , PNTR GrphItemPtr;
7081
/*
 * GetGraphsProc: gather callback for OBJ_SEQGRAPH items.
 *   gop - gather object; gop->userdata is a GphGetPtr, gop->dataptr the
 *         SeqGraphPtr being visited
 * A graph is collected when its title is "Phrap Quality", "Phred Quality"
 * or "Gap4" (case-insensitive), sgp->flags[2] == 3, and the id of its
 * location is among ggp->bsp->id.  For each such graph a GrphItem is
 * allocated, its left/right offsets within ggp->bsp are recorded, and it is
 * appended to ggp->vnp.
 */
static Boolean GetGraphsProc (GatherObjectPtr gop)
7087
if (gop == NULL || gop->itemtype != OBJ_SEQGRAPH)
7089
ggp = (GphGetPtr) gop->userdata;
7090
sgp = (SeqGraphPtr) gop->dataptr;
7091
if (ggp == NULL || sgp == NULL)
7093
/* only phrap, phred or gap4 titles currently allowed */
7094
if (StringICmp (sgp->title, "Phrap Quality") == 0 || StringICmp (sgp->title, "Phred Quality") == 0 || StringICmp (sgp->title, "Gap4") == 0) {
7095
/* data type must be bytes */
7096
if (sgp->flags[2] == 3) {
7097
if (SeqIdIn (SeqLocId (sgp->loc), ggp->bsp->id)) {
7098
gip = (GrphItemPtr) MemNew (sizeof (GrphItem));
7102
gip->left = GetOffsetInBioseq (sgp->loc, ggp->bsp, SEQLOC_LEFT_END);
7103
gip->right = GetOffsetInBioseq (sgp->loc, ggp->bsp, SEQLOC_RIGHT_END);
7104
ValNodeAddPointer (&(ggp->vnp), 0, (Pointer) gip);
7111
/*
 * SortSeqGraphProc: comparison callback handed to ValNodeSort.
 *   ptr1, ptr2 - each points at a ValNodePtr whose data.ptrvalue is a
 *                GrphItem
 * NULL arguments, NULL ValNodes and NULL GrphItems are screened first.
 * Items are then ordered by their left offset, with the right offset used
 * to break ties between items that share the same left offset.
 */
static int LIBCALLBACK SortSeqGraphProc (VoidPtr ptr1, VoidPtr ptr2)
7113
GrphItemPtr gip1, gip2;
7114
ValNodePtr vnp1, vnp2;
7116
if (ptr1 == NULL || ptr2 == NULL)
7118
/* ValNodeSort passes pointers to the list-node pointers, hence the extra dereference */
vnp1 = *((ValNodePtr PNTR) ptr1);
7119
vnp2 = *((ValNodePtr PNTR) ptr2);
7120
if (vnp1 == NULL || vnp2 == NULL)
7122
gip1 = (GrphItemPtr) vnp1->data.ptrvalue;
7123
gip2 = (GrphItemPtr) vnp2->data.ptrvalue;
7124
if (gip1 == NULL || gip2 == NULL)
7126
if (gip1->left > gip2->left) {
7128
} else if (gip1->left < gip2->left) {
7130
} else if (gip1->right > gip2->right) {
7132
/* tie-break must compare gip1 with gip2; comparing gip2 with itself is always false and left this branch unreachable */
} else if (gip1->right < gip2->right) {
7138
/* gets valnode list of sorted graphs in GrphItem structures */
7140
/*
 * GetSeqGraphsOnBioseq: collects the Seq-graphs of an entity that belong
 * to bsp and returns them as a ValNode list.
 *   entityID - entity searched with GatherObjectsInEntity
 *   bsp      - Bioseq the graphs must be located on
 * Only OBJ_SEQGRAPH objects are visited (objMgrFilter), GetGraphsProc does
 * the collecting, and the resulting list is sorted with ValNodeSort using
 * SortSeqGraphProc.
 */
static ValNodePtr GetSeqGraphsOnBioseq (Uint2 entityID, BioseqPtr bsp)
7145
Boolean objMgrFilter[OBJ_MAX];
7150
MemSet ((Pointer) &objMgrFilter, 0, sizeof (objMgrFilter));
7151
objMgrFilter[OBJ_SEQGRAPH] = TRUE;
7152
GatherObjectsInEntity (entityID, 0, NULL, GetGraphsProc, (Pointer) &ggd, objMgrFilter);
7153
/* walk the collected items with a 1-based counter before the list is sorted */
for (vnp = ggd.vnp, index = 1; vnp != NULL; vnp = vnp->next, index++) {
7154
gip = (GrphItemPtr) vnp->data.ptrvalue;
7159
ggd.vnp = ValNodeSort (ggd.vnp, SortSeqGraphProc);
7163
/*
 * ValidateGraphsOnBioseq: gather callback that validates the quality-score
 * Seq-graphs collected for one Bioseq (bsp->mol is tested with ISA_na).
 *   gcp - gather context; gcp->userdata is the ValidStructPtr and
 *         gcp->thisitem the Bioseq.  gcp->itemID / gcp->thistype are
 *         pointed at individual graphs while reporting and are restored
 *         before the function ends.
 * Reported via ValidErr: graph min/max outside 0..100, numval vs ByteStore
 * length mismatch, out-of-order or overlapping components, graph vs
 * Bioseq / SeqLoc / SeqLit length mismatches, start/stop mismatches on
 * delta sequences, differing component counts, and per-base score
 * anomalies.  The list from GetSeqGraphsOnBioseq is released with
 * ValNodeFreeData at the end.
 */
static void ValidateGraphsOnBioseq (GatherContextPtr gcp)
7168
Int2 ctr, i, val, index;
7169
Int4 curroffset = 0, gphlen = 0, seqlen = 0, slplen,
7170
bslen, min = INT4_MAX, max = INT4_MIN, j, lastloc = -1, NsWithScore, GapsWithScore, ACGTsWithoutScore, valsBelowMin, valsAboveMax, firstN, firstACGT, pos;
7172
Uint2 entityID, olditemid = 0, olditemtype = 0, numdsp = 0, numsgp = 0, firstsgitemid = 0;
7174
ValNodePtr head, vnp;
7175
Boolean outOfOrder = FALSE, fa2htgsBug = FALSE, overlaps = FALSE;
7184
vsp = (ValidStructPtr) gcp->userdata;
7185
bsp = (BioseqPtr) gcp->thisitem;
7186
if (vsp == NULL || bsp == NULL)
7188
if (!ISA_na (bsp->mol))
7194
vsp->bssp = (BioseqSetPtr) gcp->parentitem;
7196
if (SeqMgrGetParentOfPart (bsp, NULL) != NULL)
7199
entityID = ObjMgrGetEntityIDForPointer (bsp);
7200
head = GetSeqGraphsOnBioseq (entityID, bsp);
7204
/* remember the gather context's item so it can be restored after reporting */
olditemid = gcp->itemID;
7205
olditemtype = gcp->thistype;
7206
gcp->thistype = OBJ_SEQGRAPH;
7208
/* pass 1 over the sorted graphs: order, overlap, min/max range and byte-length checks */
for (vnp = head, index = 1; vnp != NULL; vnp = vnp->next, index++) {
7209
gip = (GrphItemPtr) vnp->data.ptrvalue;
7216
gcp->itemID = sgp->idx.itemID;
7217
if (firstsgitemid == 0) {
7218
firstsgitemid = sgp->idx.itemID;
7221
/* a stored index that differs from the sorted position means the components were out of order */
if (gip->index != index) {
7223
if (gip->index == 129 && index == 2) {
7227
/* a graph starting at or before the previous graph's right end overlaps it */
if (gip->left <= lastloc) {
7230
lastloc = gip->right;
7231
min = MIN ((Int4) min, (Int4) sgp->min.intvalue);
7232
max = MAX ((Int4) max, (Int4) sgp->max.intvalue);
7234
if (sgp->min.intvalue < 0 || sgp->min.intvalue > 100) {
7235
ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphMin, "Graph min (%ld) out of range", (long) sgp->min.intvalue);
7238
if (sgp->max.intvalue < 0 || sgp->max.intvalue > 100) {
7239
ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphMax, "Graph max (%ld) out of range", (long) sgp->max.intvalue);
7242
gphlen += sgp->numval;
7243
bs = (ByteStorePtr) sgp->values;
7246
if (sgp->numval != bslen) {
7247
ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphByteLen, "SeqGraph (%ld) and ByteStore (%ld) length mismatch", (long) sgp->numval, (long) bslen);
7252
gcp->itemID = firstsgitemid;
7254
ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphOutOfOrder, "Graph components are out of order - probably caused by old fa2htgs bug");
7256
ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphOutOfOrder, "Graph components are out of order - may be a software bug");
7260
gcp->itemID = firstsgitemid;
7261
ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphOverlap, "Graph components overlap, with multiple scores for a single base");
7264
/* compute the sequence length to compare against the summed graph length */
if (bsp->repr == Seq_repr_raw) {
7265
seqlen = bsp->length;
7266
} else if (bsp->repr == Seq_repr_delta) {
7267
for (dsp = (DeltaSeqPtr) (bsp->seq_ext); dsp != NULL; dsp = dsp->next) {
7268
switch (dsp->choice) {
7270
slocp = (SeqLocPtr) dsp->data.ptrvalue;
7273
if (slocp->choice != SEQLOC_NULL) {
7274
seqlen += SeqLocLen (slocp);
7278
slp = (SeqLitPtr) dsp->data.ptrvalue;
7279
if (slp == NULL || slp->seq_data == NULL)
7281
seqlen += slp->length;
7289
if (seqlen != gphlen && bsp->length != gphlen) {
7290
gcp->itemID = firstsgitemid;
7291
ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphBioseqLen, "SeqGraph (%ld) and Bioseq (%ld) length mismatch", (long) gphlen, (long) seqlen);
7294
/* delta sequence with more than one graph: walk delta components and graphs side by side */
if (bsp->repr == Seq_repr_delta) {
7295
if (head != NULL && head->next != NULL) {
7296
for (dsp = (DeltaSeqPtr) (bsp->seq_ext), vnp = head; dsp != NULL && vnp != NULL; dsp = dsp->next) {
7297
gip = (GrphItemPtr) vnp->data.ptrvalue;
7303
switch (dsp->choice) {
7305
slocp = (SeqLocPtr) dsp->data.ptrvalue;
7306
if (slocp != NULL && slocp->choice != SEQLOC_NULL) {
7307
slplen = SeqLocLen (slocp);
7308
curroffset += slplen;
7309
if (sgp->numval != slplen) {
7310
gcp->itemID = sgp->idx.itemID;
7311
ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphSeqLocLen, "SeqGraph (%ld) and SeqLoc (%ld) length mismatch", (long) sgp->numval, (long) slplen);
7321
slp = (SeqLitPtr) dsp->data.ptrvalue;
7322
if (slp != NULL && slp->seq_data != NULL) {
7323
if (sgp->numval != slp->length) {
7324
gcp->itemID = sgp->idx.itemID;
7325
ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphSeqLitLen, "SeqGraph (%ld) and SeqLit (%ld) length mismatch",
7326
(long) sgp->numval, (long) slp->length);
7329
/* the graph interval must start at the running offset and end at offset + literal length - 1 */
if (slocp != NULL && slocp->choice == SEQLOC_INT) {
7330
sintp = (SeqIntPtr) slocp->data.ptrvalue;
7331
if (sintp != NULL) {
7332
if (sintp->from != curroffset) {
7333
gcp->itemID = sgp->idx.itemID;
7334
ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphStartPhase, "SeqGraph (%ld) and SeqLit (%ld) start do not coincide",
7335
(long) sintp->from, (long) curroffset);
7337
if (sintp->to != slp->length + curroffset - 1) {
7338
gcp->itemID = sgp->idx.itemID;
7339
ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphStopPhase, "SeqGraph (%ld) and SeqLit (%ld) stop do not coincide",
7340
(long) sintp->to, (long) (slp->length + curroffset - 1));
7351
curroffset += slp->length;
7358
/* count delta components and graphs so the two totals can be compared */
for (dsp = (DeltaSeqPtr) (bsp->seq_ext), numdsp = 0; dsp != NULL; dsp = dsp->next) {
7359
switch (dsp->choice) {
7361
slocp = (SeqLocPtr) dsp->data.ptrvalue;
7362
if (slocp != NULL && slocp->choice != SEQLOC_NULL) {
7367
slp = (SeqLitPtr) dsp->data.ptrvalue;
7368
if (slp != NULL && slp->seq_data != NULL) {
7376
for (vnp = head, numsgp = 0; vnp != NULL; vnp = vnp->next, numsgp++)
7378
if (numdsp != numsgp) {
7379
gcp->itemID = firstsgitemid;
7380
ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphDiffNumber, "Different number of SeqGraph (%d) and SeqLit (%d) components", (int) numsgp, (int) numdsp);
7385
/* pass 2: read each graph's bases (ncbi4na) alongside its score bytes */
for (vnp = head; vnp != NULL; vnp = vnp->next) {
7386
gip = (GrphItemPtr) vnp->data.ptrvalue;
7392
spp = SeqPortNewByLoc (sgp->loc, Seq_code_ncbi4na);
7395
slplen = SeqLocLen (sgp->loc);
7396
if (bsp->repr == Seq_repr_delta || bsp->repr == Seq_repr_virtual) {
7397
SeqPortSet_do_virtualEx (spp, TRUE, TRUE); /* sets gapIsZero */
7400
bs = (ByteStorePtr) sgp->values;
7401
BSSeek (bs, 0, SEEK_SET);
7405
ctr = SeqPortRead (spp, bases, sizeof (bases));
7407
residue = (Uint1) bases[i];
7411
ACGTsWithoutScore = 0;
7418
while (residue != SEQPORT_EOF && j < sgp->numval) {
7419
if (IS_residue (residue)) {
7420
val = (Int2) BSGetByte (bs);
7421
if (val < sgp->min.intvalue || val < 0) {
7424
if (val > sgp->max.intvalue || val > 100) {
7439
ACGTsWithoutScore++;
7440
if (firstACGT == -1) {
7460
/* refill the base buffer from the SeqPort */
ctr = SeqPortRead (spp, bases, sizeof (bases));
7463
} else if (ctr < 1) {
7464
bases[0] = SEQPORT_EOF;
7467
residue = (Uint1) bases[i];
7471
/* report the per-graph tallies against this graph's item id */
gcp->itemID = sgp->idx.itemID;
7472
if (ACGTsWithoutScore > 0) {
7473
ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphACGTScore, "%ld ACGT bases have zero score value - first one at position %ld",
7474
(long) ACGTsWithoutScore, (long) firstACGT);
7476
if (NsWithScore > 0) {
7477
ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphNScore, "%ld N bases have positive score value - first one at position %ld",
7478
(long) NsWithScore, (long) firstN);
7480
if (GapsWithScore > 0) {
7481
ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphGapScore, "%ld gap bases have positive score value", (long) GapsWithScore);
7483
if (valsBelowMin > 0) {
7484
ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphBelow, "%ld quality scores have values below the reported minimum", (long) valsBelowMin);
7486
if (valsAboveMax > 0) {
7487
ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphAbove, "%ld quality scores have values above the reported maximum", (long) valsAboveMax);
7493
/* restore the gather context and release the collected graph list */
gcp->itemID = olditemid;
7494
gcp->thistype = olditemtype;
7496
ValNodeFreeData (head);
7499
/*****************************************************************************
7501
* PatchBadSequence(bsp)
7503
*****************************************************************************/
7504
/*
 * PatchBadSequence: rebuilds the sequence data of a raw or const Bioseq.
 *   bsp - Bioseq to patch; other representations are screened by the
 *         repr test below
 * Residues are read through a SeqPort in an IUPAC code chosen from
 * ISA_na (bsp->mol) and written to a new ByteStore, which then replaces
 * bsp->seq_data (the old store is freed) before BioseqRawPack is called.
 * NOTE(review): repchar ('N' / 'X') is presumably substituted for
 * INVALID_RESIDUE, and the Boolean result presumably signals success -
 * confirm both against the full function.
 */
NLM_EXTERN Boolean PatchBadSequence (BioseqPtr bsp)
7506
ByteStorePtr newseq;
7510
Int2 repchar, residue;
7515
if (!((bsp->repr == Seq_repr_raw) || (bsp->repr == Seq_repr_const)))
7518
is_na = ISA_na (bsp->mol);
7520
seqcode = Seq_code_iupacna;
7521
repchar = (Int2) 'N'; /* N */
7523
seqcode = Seq_code_iupacaa;
7524
repchar = (Int2) 'X';
7527
spp = SeqPortNew (bsp, 0, -1, 0, seqcode);
7532
newseq = BSNew (len);
7533
if (newseq == NULL) {
7538
/* copy residue by residue into the new store */
for (i = 0; i < len; i++) {
7539
residue = SeqPortGetResidue (spp);
7540
if (residue == INVALID_RESIDUE) {
7543
BSPutByte (newseq, residue);
7547
/* swap in the rebuilt data and record its encoding */
BSFree (bsp->seq_data);
7548
bsp->seq_data = newseq;
7549
bsp->seq_data_type = seqcode;
7551
BioseqRawPack (bsp);
7556
/*
 * FindABioseq: SeqEntryExplore callback (see FindIDForEntry).
 *   sep  - entry being visited
 *   data - a BioseqPtr PNTR owned by the caller; a non-NULL *data means a
 *          Bioseq was already found
 *   index, indent - not used in the statements visible here
 * When sep is a Bioseq its data pointer is taken as the candidate.
 */
static void FindABioseq (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
7561
bp = (BioseqPtr PNTR) data;
7562
if (*bp != NULL) /* already got one */
7565
if (IS_Bioseq (sep)) {
7566
bsp = (BioseqPtr) (sep->data.ptrvalue);
7572
/*
 * FindIDForEntry: locates a Bioseq inside sep (SeqEntryExplore with
 * FindABioseq) and prints its id into buf in PRINTID_FASTA_LONG form.
 *   sep - Seq-entry to search
 *   buf - caller-supplied output buffer
 * NOTE(review): no buffer length is passed in, so buf must be large enough
 * for the longest FASTA-style id - verify at call sites.
 */
NLM_EXTERN CharPtr FindIDForEntry (SeqEntryPtr sep, CharPtr buf)
7574
BioseqPtr bsp = NULL;
7576
if ((sep == NULL) || (buf == NULL))
7580
SeqEntryExplore (sep, (Pointer) (&bsp), FindABioseq);
7585
SeqIdPrint (bsp->id, buf, PRINTID_FASTA_LONG);
7589
/*
 * TrimSpacesOnEitherSide: works on str only when it is non-NULL and
 * non-empty.
 * NOTE(review): going by the name and the first loop (which advances over
 * characters <= ' '), this presumably strips leading and trailing
 * whitespace in place - confirm against the full function.
 */
static CharPtr TrimSpacesOnEitherSide (CharPtr str)
7595
if (str != NULL && str[0] != '\0') {
7599
/* skip leading characters that are not past the space character */
while (ch != '\0' && ch <= ' ') {
7603
while (ch != '\0') {
7613
while (ch != '\0') {
7616
} else if (dst == NULL) {
7629
/*
 * CopyLetters: fills dest from source, bounded by maxsize, then calls
 * TrimSpacesOnEitherSide.
 *   dest    - output buffer; nothing is done when it is NULL
 *   source  - input string
 *   maxsize - size of dest; nothing is done when it is < 1
 * The copy loop stops while one byte of maxsize remains, presumably to
 * leave room for the terminator - confirm.
 */
static void CopyLetters (CharPtr dest, CharPtr source, size_t maxsize)
7634
if (dest == NULL || maxsize < 1)
7642
while (maxsize > 1 && ch != '\0') {
7652
TrimSpacesOnEitherSide (tmp);
7655
/*
 * LookForEtAl: warns when a publication's author list contains an
 * "et al" entry.
 *   vsp - validation context passed to ValidErr
 *   tmp - publication ValNode; tmp->choice selects which citation type the
 *         author list is taken from (CitArt, CitBook, CitGen, CitSub)
 * Only author lists with choice == 1 are examined.  An entry matches when
 * its last name is "et al" (case-insensitive), or when last == "et",
 * initials == "al" and the first name is empty.  The message differs
 * depending on whether the match is the final entry of the list.
 */
static void LookForEtAl (ValidStructPtr vsp, ValNodePtr tmp)
7658
AuthListPtr authors = NULL;
7670
if (vsp == NULL || tmp == NULL)
7672
switch (tmp->choice) {
7674
cap = (CitArtPtr) (tmp->data.ptrvalue);
7675
authors = cap->authors;
7680
cbp = (CitBookPtr) (tmp->data.ptrvalue);
7681
authors = cbp->authors;
7684
cgp = (CitGenPtr) (tmp->data.ptrvalue);
7685
authors = cgp->authors;
7688
csp = (CitSubPtr) (tmp->data.ptrvalue);
7689
authors = csp->authors;
7694
if (authors == NULL || authors->choice != 1)
7696
for (names = authors->names; names != NULL; names = names->next) {
7697
ap = names->data.ptrvalue;
7700
if (pid != NULL && pid->choice == 2) {
7702
if (nsp != NULL && nsp->names[0] != NULL) {
7703
/* names[0], names[1] and names[4] are used as last name, first name and initials */
CopyLetters (last, nsp->names[0], sizeof (last));
7704
CopyLetters (first, nsp->names[1], sizeof (first));
7705
CopyLetters (initials, nsp->names[4], sizeof (initials));
7706
if ((StringICmp (last, "et al") == 0) || (StringCmp (initials, "al") == 0 && StringCmp (last, "et") == 0 && first[0] == '\0')) {
7707
if (names->next == NULL) {
7708
ValidErr (vsp, SEV_WARNING, ERR_GENERIC_AuthorListHasEtAl, "Author list ends in et al.");
7710
ValidErr (vsp, SEV_WARNING, ERR_GENERIC_AuthorListHasEtAl, "Author list contains et al.");
7719
/*
 * SpellCheckPub: spell-checks the textual parts of one publication.
 *   vsp - validation context forwarded to SpellCheckString
 *   tmp - publication ValNode; tmp->choice selects the citation type
 * For CitArt and CitBook the title list is collected and every entry of
 * kind Cit_title_name is checked; for CitGen the cit and title strings are
 * checked directly.
 */
static void SpellCheckPub (ValidStructPtr vsp, ValNodePtr tmp)
7724
ValNodePtr titles = NULL;
7726
if ((vsp == NULL) || (tmp == NULL))
7729
switch (tmp->choice) {
7731
cap = (CitArtPtr) (tmp->data.ptrvalue);
7732
titles = cap->title;
7737
cbp = (CitBookPtr) (tmp->data.ptrvalue);
7738
titles = cbp->title;
7741
cgp = (CitGenPtr) (tmp->data.ptrvalue);
7742
if (cgp->cit != NULL)
7743
SpellCheckString (vsp, cgp->cit);
7744
if (cgp->title != NULL)
7745
SpellCheckString (vsp, cgp->title);
7751
/* only titles of kind Cit_title_name are spell-checked */
if (titles != NULL) {
7752
for (; titles != NULL; titles = titles->next) {
7753
if (titles->choice == Cit_title_name)
7754
SpellCheckString (vsp, (CharPtr) (titles->data.ptrvalue));
7761
/*
 * SpellCheckSeqDescr: gather callback for a Seq-descr item.
 *   gcp - gather context; gcp->userdata is the ValidStructPtr and
 *         gcp->thisitem the descriptor ValNode
 * Publication descriptors are always scanned with LookForEtAl.  The
 * spell-checking part depends on vsp->spellfunc: title, region and comment
 * descriptors are passed to SpellCheckString, and each publication of a
 * pub descriptor is passed to SpellCheckPub.
 */
static void SpellCheckSeqDescr (GatherContextPtr gcp)
7764
ValNodePtr tmp, vnp;
7767
vsp = (ValidStructPtr) (gcp->userdata);
7771
vnp = (ValNodePtr) (gcp->thisitem);
7778
/* the et-al scan comes before the spellfunc test below */
if (vnp->choice == Seq_descr_pub) {
7779
pdp = (PubdescPtr) (vnp->data.ptrvalue);
7780
for (tmp = pdp->pub; tmp != NULL; tmp = tmp->next) {
7781
LookForEtAl (vsp, tmp);
7785
if (vsp->spellfunc == NULL)
7788
switch (vnp->choice) {
7789
case Seq_descr_title:
7790
case Seq_descr_region:
7791
case Seq_descr_comment:
7792
SpellCheckString (vsp, (CharPtr) (vnp->data.ptrvalue));
7795
pdp = (PubdescPtr) (vnp->data.ptrvalue);
7796
for (tmp = pdp->pub; tmp != NULL; tmp = tmp->next) {
7797
SpellCheckPub (vsp, tmp);
7806
/*
 * SpellCheckSeqFeat: gather callback for a Seq-feat item.
 *   gcp - gather context; gcp->userdata is the ValidStructPtr and
 *         gcp->thisitem the feature
 * Publication features are always scanned with LookForEtAl.  The
 * spell-checking part depends on vsp->spellfunc: the feature comment is
 * checked, then, by sfp->data.choice, Prot-ref names and description,
 * the publications of a pub feature (SpellCheckPub) and the Region string.
 */
NLM_EXTERN void SpellCheckSeqFeat (GatherContextPtr gcp)
7814
vsp = (ValidStructPtr) (gcp->userdata);
7818
sfp = (SeqFeatPtr) (gcp->thisitem);
7825
/* the et-al scan comes before the spellfunc test below */
if (sfp->data.choice == SEQFEAT_PUB) {
7826
pdp = (PubdescPtr) (sfp->data.value.ptrvalue);
7827
for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
7828
LookForEtAl (vsp, vnp);
7832
if (vsp->spellfunc == NULL)
7835
SpellCheckString (vsp, sfp->comment);
7837
switch (sfp->data.choice) {
7838
case 1: /* Gene-ref */
7840
case 2: /* Org-ref */
7842
case 3: /* Cdregion */
7844
case 4: /* Prot-ref */
7845
prp = (ProtRefPtr) (sfp->data.value.ptrvalue);
7846
for (vnp = prp->name; vnp != NULL; vnp = vnp->next)
7847
SpellCheckString (vsp, (CharPtr) (vnp->data.ptrvalue));
7848
SpellCheckString (vsp, prp->desc);
7850
case 5: /* RNA-ref */
7853
pdp = (PubdescPtr) (sfp->data.value.ptrvalue);
7854
for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
7855
SpellCheckPub (vsp, vnp);
7860
case 8: /* Imp-feat */
7862
case 9: /* Region */
7863
SpellCheckString (vsp, (CharPtr) (sfp->data.value.ptrvalue));
7865
case 10: /* Comment */
7871
case 13: /* Rsite-ref */
7873
case 14: /* User-object */
7875
case 15: /* TxInit */
7877
case 16: /* Numbering */
7879
case 17: /* Secondary Structure */
7881
case 18: /* NonStdRes */
7883
case 19: /* Heterogen */
7885
case 20: /* BioSource */
7894
/*
 * SpellCheckString: hands one string to the configured spell checker.
 *   vsp - validation context supplying spellfunc and spellcallback
 *   str - text to check
 * NULL vsp, NULL str and an unset vsp->spellfunc are each tested before
 * the call; the checker is invoked as spellfunc (str, spellcallback).
 */
NLM_EXTERN void SpellCheckString (ValidStructPtr vsp, CharPtr str)
7896
if ((vsp == NULL) || (str == NULL))
7899
if (vsp->spellfunc == NULL)
7902
(*(vsp->spellfunc)) ((char *) str, (vsp->spellcallback));
7907
NLM_EXTERN void SpellCallBack (char *str)
7912
if (globalvsp != NULL && globalvsp->justwarnonspell) {
7915
ValidErr (globalvsp, sev, ERR_GENERIC_Spell, "[ %s ]", (CharPtr) str);