2
* ===========================================================================
5
* National Center for Biotechnology Information
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's official duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government have not placed any restriction on its use or reproduction.
14
* Although all reasonable efforts have been taken to ensure the accuracy
15
* and reliability of the software and data, the NLM and the U.S.
16
* Government do not and cannot warrant the performance or results that
17
* may be obtained by using this software or data. The NLM and the U.S.
18
* Government disclaim all warranties, express or implied, including
19
* warranties of performance, merchantability or fitness for any particular
22
* Please cite the author in any work or product based on this material.
24
* ===========================================================================
26
* File Name: urkbias.c
28
* Author(s): John Kuzio
30
* Version Creation Date: 98-01-01
34
* File Description: codon bias
37
* --------------------------------------------------------------------------
38
* Date Name Description of modification
39
* --------------------------------------------------------------------------
41
* Revision 6.6 1998/09/30 21:46:45 kuzio
42
* no need to reverse bias score
44
* Revision 6.5 1998/09/16 18:03:31 kuzio
48
* ==========================================================================
57
extern Gather_CDSPtr GatherCDSNew (void)
61
if ((gcdsp = (Gather_CDSPtr) MemNew (sizeof (Gather_CDS))) == NULL)
63
gcdsp->LOscore = -1.0;
64
gcdsp->scorecut = 0.5;
69
extern Gather_CDSPtr GatherCDSFree (Gather_CDSPtr gcdsp)
77
MemFree (gcdsp->tableGlobal);
78
MemFree (gcdsp->tableRefine);
80
slp = gcdsp->slpGlobal;
86
id->next = SeqIdSetFree (id->next);
90
slp = gcdsp->slpRefine;
96
id->next = SeqIdSetFree (id->next);
106
id->next = SeqIdSetFree (id->next);
116
id->next = SeqIdSetFree (id->next);
120
slp = gcdsp->slpFound;
126
id->next = SeqIdSetFree (id->next);
131
return (Gather_CDSPtr) MemFree (gcdsp);
134
extern SeqLocPtr SeqLocDup (SeqLocPtr slpold)
136
SeqLocPtr slpnew, slpn, slp;
142
slpnew = (SeqLocPtr) AsnIoMemCopy ((Pointer) slpold,
143
(AsnReadFunc) SeqLocAsnRead,
144
(AsnWriteFunc) SeqLocAsnWrite);
151
id->next = SeqIdSetFree (id->next);
160
extern SeqLocPtr SeqLocDupAll (SeqLocPtr slpold)
167
slpnew = (SeqLocPtr) AsnIoMemCopy ((Pointer) slpold,
168
(AsnReadFunc) SeqLocAsnRead,
169
(AsnWriteFunc) SeqLocAsnWrite);
173
extern SeqLocPtr SeqLocLink (SeqLocPtr PNTR slph, SeqLocPtr slp)
177
if (slph == NULL || *slph == NULL)
183
while (slpnext->next != NULL)
184
slpnext = slpnext->next;
189
extern Boolean SeqLocMatch (SeqLocPtr slplist, SeqLocPtr slpnew)
191
Int4 start1, stop1, start2, stop2;
193
if (slpnew == NULL || slplist == NULL)
198
while (slplist != NULL)
200
start1 = SeqLocStart (slpnew);
201
stop1 = SeqLocStop (slpnew);
202
start2 = SeqLocStart (slplist);
203
stop2 = SeqLocStop (slplist);
205
if (start1 == start2 && stop1 == stop2)
207
slplist = slplist->next;
212
extern void UniqueOrfs (SeqLocPtr PNTR pslpFound)
214
SeqLocPtr slpFound, slpNew, slp;
217
slpFound = *pslpFound;
219
while (slpFound != NULL)
221
if (!SeqLocMatch (slpNew, slpFound))
222
SeqLocLink (&slpNew, SeqLocDup (slpFound));
223
slpFound = slpFound->next;
225
slpFound = *pslpFound;
226
while (slpFound != NULL)
228
slp = slpFound->next;
229
id = SeqLocId (slpFound);
231
id->next = SeqIdSetFree (id->next);
232
SeqLocFree (slpFound);
239
static int LIBCALLBACK SeqLocCompProc (VoidPtr ptr1, VoidPtr ptr2)
241
SeqLocPtr slp1, slp2;
243
if (ptr1 != NULL && ptr2 != NULL)
245
slp1 = *((SeqLocPtr PNTR) ptr1);
246
slp2 = *((SeqLocPtr PNTR) ptr2);
247
if (slp1 != NULL && slp2 != NULL)
249
if (SeqLocStart (slp1) > SeqLocStart (slp2))
253
else if (SeqLocStart (slp1) < SeqLocStart (slp2))
262
extern void SortByLocOrfs (SeqLocPtr PNTR pslpFound)
264
SeqLocPtr slpFound, PNTR slpt;
268
slpFound = *pslpFound;
269
while (slpFound != NULL)
272
slpFound = slpFound->next;
275
slpt = MemNew ((size_t)(num+1) * sizeof (SeqLocPtr));
276
slpFound = *pslpFound;
278
while (slpFound != NULL)
281
slpFound = slpFound->next;
285
HeapSort (slpt, num, sizeof (SeqLocPtr), SeqLocCompProc);
287
for (i = 0; i < num; i++)
290
slpFound->next = slpt[i+1];
293
*pslpFound = slpt[0];
299
extern void RemoveInternalOrfs (SeqLocPtr PNTR slpFound)
301
SeqLocPtr slp1, slp2, slp = NULL;
303
Int4 start1, stop1, start2, stop2;
304
Boolean flagInternal;
309
start1 = SeqLocStart (slp1);
310
stop1 = SeqLocStop (slp1);
311
flagInternal = FALSE;
315
start2 = SeqLocStart (slp2);
316
stop2 = SeqLocStop (slp2);
317
if ((start1 > start2 && start1 < stop2) &&
318
(stop1 > start2 && stop1 < stop2))
326
SeqLocLink (&slp, SeqLocDup (slp1));
333
id = SeqLocId (slp1);
335
id->next = SeqIdSetFree (id->next);
343
static Boolean ScanCodonUsage (GatherContextPtr gcp)
353
if ((gcdsp = (Gather_CDSPtr) gcp->userdata) == NULL)
355
if (gcp->thistype != OBJ_BIOSEQ)
357
if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
360
if (gcdsp->findcount != 0)
366
cutp = NewCodonTable ();
367
slpt = SeqLocDup (slp);
368
AddSeqLocToCodonTable (cutp, bsp, slpt, TRUE);
369
score = Confide (cutp, gcdsp->tableRefine);
370
FreeCodonTable (cutp);
371
if (score < gcdsp->scorecut)
373
SeqLocLink (&(gcdsp->slpHit), SeqLocDup (slp));
382
static Boolean RefineCodonUsage (GatherContextPtr gcp)
392
if ((gcdsp = (Gather_CDSPtr) gcp->userdata) == NULL)
394
if (gcp->thistype != OBJ_BIOSEQ)
396
if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
399
if (gcdsp->tableRefine != NULL)
405
cutp = NewCodonTable ();
406
slpt = SeqLocDup (slp);
407
AddSeqLocToCodonTable (cutp, bsp, slpt, TRUE);
408
score = Confide (cutp, gcdsp->tableGlobal);
409
FreeCodonTable (cutp);
410
if (score < gcdsp->scorecut)
412
gcdsp->refinecount++;
413
if (gcdsp->tableRefine == NULL)
414
gcdsp->tableRefine = NewCodonTable ();
415
AddSeqLocToCodonTable (gcdsp->tableRefine, bsp, slpt, TRUE);
416
SeqLocLink (&(gcdsp->slpRefine), SeqLocDup (slp));
424
static Boolean StandardMean (GatherContextPtr gcp)
434
if ((gcdsp = (Gather_CDSPtr) gcp->userdata) == NULL)
436
if (gcp->thistype != OBJ_BIOSEQ)
438
if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
441
if (gcdsp->stdcount != 0)
447
cutp = NewCodonTable ();
448
slpt = SeqLocDup (slp);
449
AddSeqLocToCodonTable (cutp, bsp, slpt, TRUE);
450
score = Confide (cutp, gcdsp->tableGlobal);
451
FreeCodonTable (cutp);
452
gcdsp->mean += score;
454
if (gcdsp->LOscore == -1)
455
gcdsp->LOscore = score;
456
if (score < gcdsp->LOscore)
457
gcdsp->LOscore = score;
458
if (score > gcdsp->HIscore)
459
gcdsp->HIscore = score;
466
static Boolean StandardDeviation (GatherContextPtr gcp)
476
if ((gcdsp = (Gather_CDSPtr) gcp->userdata) == NULL)
478
if (gcp->thistype != OBJ_BIOSEQ)
480
if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
483
if (gcdsp->stdev != 0.0)
489
cutp = NewCodonTable ();
490
slpt = SeqLocDup (slp);
491
AddSeqLocToCodonTable (cutp, bsp, slpt, TRUE);
492
score = Confide (cutp, gcdsp->tableGlobal);
493
FreeCodonTable (cutp);
494
score -= gcdsp->mean;
496
gcdsp->stdev += score;
503
static Boolean MakeGlobalTable (GatherContextPtr gcp)
511
if ((gcdsp = (Gather_CDSPtr) gcp->userdata) == NULL)
513
if (gcp->thistype != OBJ_BIOSEQ)
515
if ((bsp = (BioseqPtr) (gcp->thisitem)) == NULL)
518
if (gcdsp->tableGlobal != NULL)
521
slp = gcdsp->slpGlobal;
524
gcdsp->globalcount++;
525
if (gcdsp->tableGlobal == NULL)
526
gcdsp->tableGlobal = NewCodonTable ();
527
slpt = SeqLocDup (slp);
528
AddSeqLocToCodonTable (gcdsp->tableGlobal, bsp, slpt, TRUE);
535
static Int4 CullGlobalOrfs (SeqLocPtr PNTR pslpGlobal,
536
SeqLocPtr PNTR pslpRefine)
539
SeqLocPtr slpGlobal, slpRefine, slpNew, slp;
541
slpGlobal = *pslpGlobal;
542
slpRefine = *pslpRefine;
545
while (slpGlobal != NULL)
547
if (!SeqLocMatch (slpRefine, slpGlobal))
550
SeqLocLink (&slpNew, SeqLocDup (slpGlobal));
552
slpGlobal = slpGlobal->next;
554
slpGlobal = *pslpGlobal;
555
while (slpGlobal != NULL)
557
slp = slpGlobal->next;
558
SeqLocFree (slpGlobal);
561
*pslpGlobal = slpNew;
565
extern SeqLocPtr FindSimilarBiasOrfs (SeqEntryPtr sep, FloatHi probcut,
566
Int4 clustmin, Int4 findmin,
568
SeqLocPtr slpPotential)
570
static GatherScope gs;
585
gcdsp = GatherCDSNew ();
587
MemSet ((Pointer) gsp, 0, sizeof (GatherScope));
588
MemSet ((Pointer) gsp->ignore, (int) (TRUE),
589
(size_t) (OBJ_MAX * sizeof (Boolean)));
590
gsp->ignore[OBJ_BIOSEQ] = FALSE;
595
SeqLocLink (&gcdsp->slpGlobal, SeqLocDup (slp));
601
SeqLocLink (&gcdsp->slpAll, SeqLocDup (slp));
605
GatherSeqEntry (sep, (Pointer) gcdsp, MakeGlobalTable, (Pointer) gsp);
607
while (gcdsp->tableGlobal != NULL)
610
gcdsp->refinecount = 0;
611
gcdsp->findcount = 0;
613
gcdsp->HIscore = 0.0;
614
gcdsp->LOscore = -1.0;
616
gcdsp->scorecut = 0.5;
618
GatherSeqEntry (sep, (Pointer) gcdsp, StandardMean, (Pointer) gsp);
619
if (gcdsp->stdcount > 0)
620
gcdsp->mean /= gcdsp->stdcount;
621
GatherSeqEntry (sep, (Pointer) gcdsp, StandardDeviation, (Pointer) gsp);
623
if (gcdsp->stdcount > 1)
624
gcdsp->stdev /= (gcdsp->stdcount - 1);
627
gcdsp->stdev = (FloatHi) sqrt (gcdsp->stdev);
628
gcdsp->scorecut = gcdsp->LOscore + (gcdsp->stdev * probcut);
630
slp = gcdsp->slpRefine;
637
gcdsp->slpRefine = slp;
639
gcdsp->tableRefine = FreeCodonTable (gcdsp->tableRefine);
640
GatherSeqEntry (sep, (Pointer) gcdsp, RefineCodonUsage, (Pointer) gsp);
641
if (gcdsp->tableRefine != NULL)
643
if (gcdsp->refinecount >= clustmin)
645
gcdsp->scorecut *= 1.5; /* increase a bit to see any branch jumps */
646
GatherSeqEntry (sep, (Pointer) gcdsp, ScanCodonUsage, (Pointer) gsp);
648
if (gcdsp->findcount < findmin)
661
SeqLocLink (&gcdsp->slpFound, gcdsp->slpHit);
662
gcdsp->slpHit = NULL;
664
gcount = CullGlobalOrfs (&gcdsp->slpGlobal, &gcdsp->slpRefine);
665
gcdsp->tableGlobal = FreeCodonTable (gcdsp->tableGlobal);
666
if (gcount != gcdsp->globalcount)
668
gcdsp->globalcount = 0;
669
GatherSeqEntry (sep, (Pointer) gcdsp, MakeGlobalTable, (Pointer) gsp);
674
slp = gcdsp->slpGlobal;
681
gcdsp->slpGlobal = slp;
682
gcdsp->tableGlobal = FreeCodonTable (gcdsp->tableGlobal);
685
UniqueOrfs (&gcdsp->slpFound);
686
slp = gcdsp->slpFound;
687
gcdsp->slpFound = NULL;
688
GatherCDSFree (gcdsp);
692
extern FloatHiPtr BiasScoreBioseq (BioseqPtr bsp, Int4Ptr tableGlobal,
693
Int4 tripletwindow, Int4 xframe,
701
Int4 start, stop, xstop, xwindow;
706
if (!ISA_na (bsp->mol))
708
if (bsp->length < tripletwindow)
711
slp = ValNodeNew (NULL);
713
slp->choice = SEQLOC_INT;
714
slp->data.ptrvalue = sint;
716
xwindow = tripletwindow;
717
xstop = (bsp->length + 3 - xframe - xwindow) / 3;
718
score = (FloatHiPtr) MemNew ((size_t) (sizeof (FloatHi) * xstop));
723
stop = start + xwindow - 1;
726
sint->strand = xstrand;
727
cutp = CodonTableFromSeqLoc (bsp, slp);
731
xstop = bsp->length - 3;
732
for (start = stop + 1; start <= xstop; start += 3)
735
sint->to = start + 2;
736
AddSeqLocToCodonTable (cutp, bsp, slp, TRUE);
737
score[iscore++] = Confide (cutp, tableGlobal);
738
sint->from -= xwindow;
740
AddSeqLocToCodonTable (cutp, bsp, slp, FALSE);
743
FreeCodonTable (cutp);
744
slp->data.ptrvalue = (Pointer) SeqIntFree (sint);