1
/* ===========================================================================
4
* National Center for Biotechnology Information (NCBI)
6
* This software/database is a "United States Government Work" under the
7
* terms of the United States Copyright Act. It was written as part of
8
* the author's official duties as a United States Government employee and
9
* thus cannot be copyrighted. This software/database is freely available
10
* to the public for use. The National Library of Medicine and the U.S.
11
* Government do not place any restriction on its use or reproduction.
12
* We would, however, appreciate having the NCBI and the author cited in
13
* any work or product based on this material.
15
* Although all reasonable efforts have been taken to ensure the accuracy
16
* and reliability of the software and data, the NLM and the U.S.
17
* Government do not and cannot warrant the performance or results that
18
* may be obtained by using this software or data. The NLM and the U.S.
19
* Government disclaim all warranties, express or implied, including
20
* warranties of performance, merchantability or fitness for any particular
23
* ===========================================================================
27
* Author: Sarah Wheelan
29
* Version Creation Date: 5/01
33
* File Description: mrna-to-genomic alignment algorithms and functions
36
* --------------------------------------------------------------------------
38
* Revision 6.11 2001/12/18 18:00:18 wheelan
41
* Revision 6.10 2001/11/20 12:13:28 wheelan
42
* made SPI_GetProteinFrommRNA EXTERN
44
* Revision 6.9 2001/11/05 16:14:53 wheelan
45
* added option to print multiple alignment to a file
47
* Revision 6.8 2001/10/04 12:34:07 wheelan
48
* added bigintron option
50
* Revision 6.7 2001/10/03 14:19:29 wheelan
51
* include new alignment manager
53
* Revision 6.6 2001/09/04 13:46:37 wheelan
54
* made SPI_RemoveInconsistentAlnsFromSet and SPI_flip_sa_list extern
56
* Revision 6.5 2001/08/24 13:44:35 wheelan
57
* changed printaln to Int4
59
* Revision 6.4 2001/08/06 16:49:25 wheelan
60
* changed revcompthresh parameter to 55 from 65
62
* Revision 6.3 2001/07/11 17:57:07 wheelan
63
* added typedefs for multiple alignments
65
* Revision 6.2 2001/07/10 16:44:42 wheelan
66
* added functions to make a multiple alignment
68
* Revision 6.1 2001/05/24 16:27:58 wheelan
72
* ==========================================================================
79
#include <alignmgr2.h>
85
#define NLM_EXTERN NLM_IMPORT
87
#define NLM_EXTERN extern
94
#define SPI_DROPOFF 50
95
#define SPI_GAPOPEN 10
96
#define SPI_GAPEXTEND 3
97
#define SPI_PENALTY -5
99
#define SPI_SPLICETHRESH 0.0001
101
#define SPI_MAXGAP 4 /* maximum gap allowed in SPI_ExtendAlnAlg */
103
#define SPI_REVCOMPTHRESH 55 /* minimum allowed % of splice sites present */
104
/* If model is < minimum, then the reverse */
105
/* complement will be checked. */
106
#define SPI_COVERDIFF 15 /* amount the %coverage is allowed to drop in the */
107
/* reverse complement models */
108
#define SPI_MISMTCHDIFF 10 /* amount the %mismatch is allowed to rise in */
109
/* the reverse complement models */
111
#define SPI_TEENYEXON 6 /* smallest exon to look for */
113
#define SPI_ENDFUZZ 13 /* if the overall alignment misses less than or equal */
114
/* to this amount on the ends of the mRNA, the */
115
/* alignment will be extended. */
117
#define SPI_MINBLASTSIZE 7 /* smallest bit that can go into BlastTwoSequencesByLoc */
119
#define SPI_MINPOLYASIZE 5 /* minimum #A's to call a poly(A)+ tail */
120
#define SPI_MAXPOLYASIZE 200 /* maximum number of nucleotides to bother */
121
/* scanning for a tail */
122
#define SPI_LINKERSIZE 8 /* maximum number of non-A's to allow on end of tail */
124
#define SPI_INTRONSIZE 35000 /* used only to decide whether an mRNA may have fallen */
126
#define SPI_INTRONSIZEXL 120000 /* if spot->bigintron TRUE, use this */
128
#define SPI_BIGINTRON 100000 /* max size of 1st and last introns, if 1st and last exons */
129
/* have to be found by SPI_FindPiece. */
130
#define SPI_BIGINTRONXL 240000 /* if spot->bigintron TRUE, use this */
132
#define SPI_PADDING 0 /* how much each region is padded on each side */
134
#define SPI_NUMSITES 4 /* number of alternative splice sites to consider per exon */
136
#define SPI_BIGOVERLAP 12 /* above this cutoff, the overlap won't be doubled */
137
/* to get the window in which to search for splice sites */
139
#define SPI_EXONMERGESIZE 15 /* exons closer than this to each other will get merged */
141
#define SPI_FLUFF 16 /* amount to search on either side of splice site for interspecies comp. */
143
#define SPI_UNKNOWN 0
144
#define SPI_CONSISTENT 1
145
#define SPI_IMPOSSIBLE 2
149
#define SPI_FUZZ 20 /* amount of overlap/underlap allowed to consider hits consistent*/
154
#define SPI_NEITHER 3
156
#define SPI_REVERSEUNKNOWN 0
157
#define SPI_REVERSE 1
158
#define SPI_NOTREVERSED 2
161
#define SPI_NOTMULT 2
163
#define SPI_LINE 60 /* line length for text alignment output -- must be more than SPI_PSPLICE */
164
#define SPI_PSPLICE 10 /* length of genomic sequence to print before and after each exon */
165
#define SPI_SPACER 12 /* space at the beginning of each printed alignment line */
167
#define SPI_NUMCOLS 8 /* number of columns in the tab-delimited file of position info for draft */
169
/* defines for organisms (determines which splice matrices to use) */
170
#define SPI_VERTEBRATE 1
173
#define SPI_CELEGANS 4
175
/* return codes for progress callback */
177
#define SPI_PROGRESS 2
178
#define SPI_FINISHED 3
180
typedef struct spi_bsinfo {
183
struct spi_bsinfo PNTR next;
184
} SPI_bsinfo, PNTR SPI_bsinfoPtr;
186
typedef struct spi_alninfo {
190
} SPI_AlnInfo, PNTR SPI_AlnInfoPtr;
192
typedef struct spi_ival {
202
struct spi_ival PNTR next;
203
} SPI_Ival, PNTR SPI_IvalPtr;
205
typedef struct spiexonprof {
209
struct spiexonprof PNTR next;
210
} SPI_ExonProf, PNTR SPI_ExonProfPtr;
212
typedef struct spi_mrna {
216
FloatHiPtr exonid; /* percent identity per exon */
217
Int4Ptr exongaps; /* number of gaps in each exon alignment */
218
Uint1Ptr splicedon; /* for each exon, is splice donor site (right) present? */
219
Uint1Ptr spliceacc; /* for each exon, is splice acceptor site (left) present? */
220
Uint1 missingends; /* SPI_LEFT, SPI_RIGHT, SPI_BOTH, or SPI_NEITHER */
221
Int4Ptr mstarts; /* exon starts, in mRNA coordinates */
222
Int4Ptr mstops; /* exon stops, in mRNA coordinates */
223
Int4Ptr gstarts; /* exon starts, in genomic coordinates */
224
Int4Ptr gstops; /* exon stops, in genomic coordinates */
225
SeqAlignPtr PNTR saps; /*indexed alignments for exons */
226
Int4 mRNAcoverage; /* percentage of the mRNA contained in this alignment */
227
FloatHi mismatch; /* percent mismatches in entire alignment */
228
Int4 polyAtail; /* if +, length of polyA tail that doesn't align */
229
/* if negative, length of polyAtail that does align */
230
Boolean fallsoff; /* does this mRNA fall of the end of the contig? */
231
SeqAlignPtr parent; /* parent of exon alignment set */
232
SeqAlignPtr continuous; /* continuous alignment over whole gene */
233
SPI_ExonProfPtr epp; /* positions of mismatches (for printing) */
234
CharPtr protein; /* sequence of the protein translated from the mRNA */
235
Int4 transstart; /* translation start position */
236
Boolean holes; /* are there holes in the mRNA alignment? */
237
struct spi_mrna PNTR next;
238
} SPI_mRNA, PNTR SPI_mRNAPtr;
240
typedef struct spi_utrinfo {
243
} SPI_UTRInfo, PNTR SPI_UTRInfoPtr;
245
typedef struct spi_seq {
248
} SPI_Seq, PNTR SPI_SeqPtr;
250
typedef struct spi_mult {
251
SeqAlignPtr PNTR exons;
253
} SPI_Mult, PNTR SPI_MultPtr;
255
typedef struct spi_reginfo {
265
Int4 polyAtail; /* length of polyA(+) tail that doesn't align */
266
Boolean fallsoff; /* this mRNA may fall off the end of the genomic sequence */
267
SPI_UTRInfo utr; /* if this is a CDS, UTR %ids are here */
269
struct spi_reginfo PNTR next;
270
} SPI_RegionInfo, PNTR SPI_RegionInfoPtr;
272
typedef struct spi_tinyinfo {
275
struct spi_tinyinfo PNTR next;
276
} SPI_TinyInfo, PNTR SPI_TinyInfoPtr;
278
typedef struct spi_pos {
282
} SPI_Pos, PNTR SPI_PosPtr;
284
typedef struct spi_fragmentptr {
287
Int4 fragnum; /* original fragment number, for reference */
289
SPI_PosPtr position_orig; /* the position info (if any) given for this fragment */
290
Int4 position_mrna; /* after alignment to mRNA, where is this fragment */
291
Int4 reverse; /* is this fragment reversed with respect to the mRNA */
292
SeqAlignPtr sap; /* indexed parent of the alignments for this fragment to the mRNA */
293
SPI_mRNAPtr smp; /* info for the alignment to this fragment */
294
Uint1 donor; /* does this (set of) alignment(s) have a donor site to the next frag? */
295
Uint1 acceptor; /* acceptor site ? */
296
struct spi_fragmentptr PNTR next;
297
} SPI_Frag, PNTR SPI_FragPtr;
299
typedef struct spi_fragherd {
300
Int4 polyAtail; /* length of polyAtail on the mRNA */
302
SPI_FragPtr PNTR sfparray;
303
} SPI_FragHerd, PNTR SPI_FragHerdPtr;
305
typedef struct spi_fraginfo {
311
SPI_PosPtr position_orig;
312
} SPI_FragInfo, PNTR SPI_FragInfoPtr;
314
typedef struct spi_mRNAtoherd {
315
BioseqPtr bsp_mrna; /* mrna sequence */
318
Int4Ptr exons; /* array which specifies which pieces go with which exons */
319
Int4Ptr fragments; /* array of fragment numbers, one per piece */
320
Int4Ptr sfpnum; /* array of sfp numbers, one for each piece */
321
Uint1Ptr fallsoff; /* for each piece, does it fall off fragment? */
322
Uint1Ptr strands; /* genomic strand, one per piece */
323
Int4Ptr mstarts; /* mrna starts, one per piece */
324
Int4Ptr mstops; /* mrna stops, one per piece */
325
Int4Ptr gstarts; /* genomic starts, one per piece */
326
Int4Ptr gstops; /* genomic stops, one per piece */
327
Int4Ptr lens; /* length of alignment for each piece */
328
Int4Ptr pmismatch; /* number of mismatches per piece */
329
Int4Ptr pgaps; /* number of gaps in alignment for each piece */
330
FloatHiPtr exonid; /* percent identity per exon */
331
Int4Ptr exongaps; /* number of gaps per exon */
332
Uint1Ptr splicedon; /* for each piece, is splice donor site (right) present? */
333
Uint1Ptr spliceacc; /* for each piece, is splice acceptor site (left) present? */
334
Uint1 missingends; /* SPI_LEFT, SPI_RIGHT, SPI_BOTH, or SPI_NEITHER */
335
SeqAlignPtr PNTR saps; /* indexed alignments, one for each piece (not exon) */
336
FloatHi mRNAcoverage; /* percentage of the mRNA contained in this alignment */
337
Boolean gaps; /* are there internal pieces missing from the mRNA alignment? */
338
FloatHi mismatch; /* percent mismatches in entire alignment */
340
struct spi_mRNAtoherd PNTR next;
341
} SPI_mRNAToHerd, PNTR SPI_mRNAToHerdPtr;
343
typedef struct spi_exonherdinfo {
349
struct spi_exonherdinfo PNTR next;
350
} SPI_ExonHerdInfo, PNTR SPI_ExonHerdInfoPtr;
352
typedef struct spi_splice {
357
} SPI_Splice, PNTR SPI_SplicePtr;
359
typedef struct spi_fragsplice {
360
SPI_SplicePtr splarray;
363
} SPI_FragSpl, PNTR SPI_FragSplPtr;
365
typedef struct spi_progress {
368
} SPI_Progress, PNTR SPI_ProgressPtr;
370
typedef Boolean (LIBCALLBACK *SPI_ProgressCallback)(SPI_ProgressPtr progress);
372
typedef struct spi_options {
373
FloatHi firstpasseval;
375
FloatHi thirdpasseval;
381
Boolean interspecies;
383
SeqAlignPtr PNTR sap_head;
389
SPI_ProgressCallback callback;
390
Int4 from; /* to restrict genomic interval */
392
Boolean makemult; /* make a multiple alignment from numerous returns? */
394
Uint1 strand; /* to restrict the search to one genomic strand */
395
} SPI_Options, PNTR SPI_OptionsPtr;
397
typedef struct spi_n {
404
} SPI_n, PNTR SPI_nPtr;
406
typedef struct spi_block {
410
struct spi_block PNTR next;
411
} SPI_Block, PNTR SPI_BlockPtr;
413
NLM_EXTERN SPI_RegionInfoPtr SPI_AlnSinglemRNAToGen(SPI_bsinfoPtr spig, SPI_bsinfoPtr spim, FILE *ofp, FILE *ofp2, SPI_OptionsPtr spot);
414
NLM_EXTERN SPI_mRNAToHerdPtr SPI_AlnSinglemRNAToPieces(SPI_bsinfoPtr spig_head, SPI_bsinfoPtr spim, FILE *ofp, FILE *ofp2, SPI_OptionsPtr spot);
415
NLM_EXTERN void SPI_MakeMultipleAlignment(SPI_RegionInfoPtr srip_head);
416
NLM_EXTERN void SPI_PrintMultipleAlignment(SPI_RegionInfoPtr srip, Boolean html, BioseqPtr bsp, FILE * ofp);
417
NLM_EXTERN void SPI_RegionListFree (SPI_RegionInfoPtr srip);
419
/*************************************************************************************
421
* SPI_AlnmRNAToGenomic is available to outside programs; just pass in the two
422
* bioseqs and options (to use default options, just pass in NULL, and to use
423
* other options, call SPI_OptionsNew to get an initialized options pointer and
424
* make the desired changes). If options are passed in, they should be freed
425
* using SPI_OptionsFree. SPI_AlignmRNAToGenomic returns a linked list of
426
* SPI_mRNAPtrs, one per gene model (default is to only return one gene model).
427
* Each SPI_mRNAPtr (see above) has arrays specifying the exon boundaries in
428
* genomic and mRNA coordinates as well as information about splice sites,
429
* percent identity, number of gaps, etc. The SPI_mRNAPtr also has one alignment
430
* per exon as well as a single alignment (smp->continuous) that covers the entire
431
* gene, with big gaps in the mRNA for the genomic introns. The SPI_mRNAPtr should
432
* be freed by the calling function, using SPI_mRNAFree.
434
* SPI_AlnmRNAToGenomic should only be used on finished sequence; it can handle
435
* interspecies comparisons but doesn't work on draft sequence.
437
*************************************************************************************/
438
NLM_EXTERN SPI_mRNAPtr SPI_AlignmRNAToGenomic(BioseqPtr bsp_genomic, BioseqPtr bsp_mrna, SPI_OptionsPtr spot);
440
/***************************************************************************
442
* SPI_flip_sa_list takes the head of a list of seqaligns and switches
443
* the first and second row of every alignment (alignments should all have
444
* two rows). Then, the indexes are freed and the alignments are reindexed.
446
***************************************************************************/
447
NLM_EXTERN void SPI_flip_sa_list (SeqAlignPtr sap);
449
/***************************************************************************
451
* SPI_RemoveInconsistentAlnsFromSet is a greedy algorithm that first
452
* sorts the alignments by score, then takes the highest-scoring
453
* alignment and compares it to the next-highest-scoring alignment, which
454
* is deleted if it is contained; on subsequent loops each next-highest-
455
* scoring alignment is compared to the set of alignments that have
456
* been kept. The alignments can be sorted along the first or
457
* second sequence; the alignments will be reversed so that they are
458
* all on the plus strand of the sequence to be examined.
459
* The input alignment must be indexed at least at the LITE level;
460
* conflicting child alignments will be deleted, not hidden, by this
461
* function. This function assumes that all children have the same two
462
* rows. The 'compact' parameter tells the function whether to try to
463
* keep alignments that are more to the left in genomic coordinates, or
466
***************************************************************************/
467
NLM_EXTERN void SPI_RemoveInconsistentAlnsFromSet(SeqAlignPtr sap, Int4 fuzz, Int4 n, Int4 compact);
469
NLM_EXTERN void SPI_bsinfoFreeList (SPI_bsinfoPtr spi);
470
NLM_EXTERN void SPI_mRNAFree (SPI_mRNAPtr smp);
471
NLM_EXTERN SPI_OptionsPtr SPI_OptionsNew(void);
472
NLM_EXTERN void SPI_OptionsFree (SPI_OptionsPtr spot);
473
NLM_EXTERN void SPI_is_donor (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score, Int4 org);
474
NLM_EXTERN void SPI_is_acceptor (Uint1Ptr sequence, Int4 seqlen, FloatHiPtr score, Int4 org);
476
/***************************************************************************
478
* SPI_GetProteinFrommRNA takes an mRNA bioseq and returns a string
479
* which is the best protein translation of the mRNA. First, the function
480
* looks to see whether there are any annotated CDSs, and if so, it uses
481
* the translation of the annotated CDS. If not, the function translates
482
* the mRNA in all 3 reading frames and looks for the frame with the
483
* longest protein, then returns that protein.
485
***************************************************************************/
486
NLM_EXTERN CharPtr SPI_GetProteinFrommRNA(BioseqPtr bsp_mrna, Int4Ptr start);
494
#define NLM_EXTERN NLM_EXPORT