26
26
* @file optimize_target_freq.c
28
* Author E. Michael Gertz
30
27
* Routines for finding an optimal set of target frequencies for the
31
28
* purpose of generating a compositionally adjusted score matrix. The
32
29
* function for performing this optimization is named
35
32
* The optimal target frequencies x minimize the Kullback-Liebler
38
35
* sum_k x[k] * ln(x[k]/q[k])
40
37
* from the set q of target frequencies from a standard matrix. They
41
38
* also satisfy the constraints
43
* \sum_{i = 0...alphsize - 1} x[i * alphsize + j] = col_sums[j]
40
* sum_{i = 0...alphsize - 1} x[i * alphsize + j] = col_sums[j]
44
41
* for j = 0...alphsize - 1
46
* \sum_{j = 0...alphsize - 1} x[i * alphsize + j] = row_sums[j]
43
* sum_{j = 0...alphsize - 1} x[i * alphsize + j] = row_sums[j]
47
44
* for i = 1...alphsize - 1
49
46
* where col_sums and row_sums are sets of background frequencies.
82
79
* Agarwala, Aleksandr Morgulis, Alejandro Schaffer and Yi-Kuo Yu
83
80
* (2005) Protein Database Searches Using Compositionally Adjusted
84
81
* Substitution Matrices. FEBS Journal, 272,5101-9.
83
* @author E. Michael Gertz
86
85
#ifndef SKIP_DOXYGEN_PROCESSING
87
86
static char const rcsid[] =
88
"$Id: optimize_target_freq.c,v 1.6 2005/12/01 13:49:43 gertz Exp $";
87
"$Id: optimize_target_freq.c,v 1.8 2005/12/19 15:37:33 gertz Exp $";
89
88
#endif /* SKIP_DOXYGEN_PROCESSING */
91
90
#include <string.h>
105
104
* The symmetric product
107
* A D A\T = \sum_{k = 0}^{n - 1} d_k * A[:][k] * A[:][k]\T,
106
* A D A^T = \sum_{k = 0}^{n - 1} d_k \times A[:][k] \times A[:][k]^T,
109
108
* where n = alphsize * alphsize and A[:][k] is column k of A.
111
110
* @param alphsize the size of the alphabet for this minimization
261
260
* Lagrangian function.
263
262
* @param resids_x the dual residual vector
263
* @param z dual variables (Lagrange multipliers)
264
264
* @param alphsize the alphabet size for this optimization problem
265
265
* @param grads the gradient of the objective function, an
266
266
* possibly the nonlinear relative entropy constraint.
367
367
* factorization of -J D^{-1} J^T, and the sufficient information to
368
368
* backsolve using this factorization are stored.
370
struct ReNewtonSystem {
371
int alphsize; /*< the size of the alphabet */
372
int constrain_rel_entropy; /*< if true, use the relative entropy
370
typedef struct ReNewtonSystem {
371
int alphsize; /**< the size of the alphabet */
372
int constrain_rel_entropy; /**< if true, use the relative entropy
373
373
constraint for this optimization
375
double ** W; /*< A lower-triangular matrix
375
double ** W; /**< A lower-triangular matrix
376
376
representing a factorization of
377
377
the (2,2) block, -J D^{-1} J^T, of
378
378
the condensed linear system */
379
double * Dinv; /*< The diagonal elements of the
379
double * Dinv; /**< The diagonal elements of the
380
380
inverse of the necessarily
381
381
diagonal (1,1) block of the linear
383
double * grad_re; /*< the gradient of the
383
double * grad_re; /**< the gradient of the
384
384
relative-entropy constraint, if
385
385
this constraint is used. */
387
typedef struct ReNewtonSystem ReNewtonSystem;
467
466
* @param constrain_rel_entropy if true, then the relative entropy
468
467
* constraint is used for this optimization
469
* @param workspace an allocated workspace array at least
470
* as large as the number of target frequencies
472
473
FactorReNewtonSystem(ReNewtonSystem * newton_system,
547
548
* @param z on entry, the primal residuals; on exit, the
548
549
* step in the dual variables.
549
550
* @param newton_system the factored matrix for the Newton system.
551
* @param workspace an allocated workspace array at least
552
* as large as the number of target frequencies
552
555
SolveReNewtonSystem(double x[], double z[],
613
616
* @param grads grads[0] is the gradient of the objective function
614
617
* and grad[1] is the gradient of the relative entropy
619
* @param alphsize the alphabet size for this problem
616
620
* @param x the primal variables
617
621
* @param q target frequencies of the standard matrix
618
622
* @param scores scores as computed using the target frequencies
686
* Find an optimal set of target frequencies for the purpose of
687
* generating a compositionally adjusted score matrix.
689
* @param x On exit, the optimal set of target frequencies,
690
* interpreted as a two dimensional array in
691
* row-major order. x need not be initialized on
692
* entry; any initial value will be ignored.
693
* @param alphsize the size of the alphabet for this optimization
695
* @param q a set of target frequencies from a standard
697
* @param row_sums the required row sums for the target frequencies;
698
* the composition of one of the sequences being compared.
699
* @param col_sums the required column sums for the target frequencies;
700
* the composition of the other sequence being compared.
701
* @param constrain_rel_entropy if true, constrain the relative
702
* entropy of the optimal target
703
* frequencies to equal
705
* @param relative_entropy if constrain_rel_entropy is true, then this
706
* is the required relative entropy for the
707
* optimal target frequencies. Otherwise,
708
* this argument is ignored.
709
* @param maxits the maximum number of iterations permitted for the
710
* optimization algorithm; a good value is 2000.
711
* @param tol the solution tolerance; the residuals of the optimization
712
* program must have Euclidean norm <= tol for the
713
* algorithm to terminate.
715
* @returns if an optimal set of target frequencies is
716
* found, then 0, if the iteration failed to
717
* converge, then 1, if there was some error, then -1.
689
/* Documented in optimized_target_freq.h */
720
691
Blast_OptimizeTargetFrequencies(double x[],