1
/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3
/*********************************************************************
4
* Clustal Omega - Multiple sequence alignment
6
* Copyright (C) 2010 University College Dublin
8
* Clustal-Omega is free software; you can redistribute it and/or
9
* modify it under the terms of the GNU General Public License as
10
* published by the Free Software Foundation; either version 2 of the
11
* License, or (at your option) any later version.
13
* This file is part of Clustal-Omega.
15
********************************************************************/
18
* RCS $Id: hhfunc-C.h 256 2011-06-23 13:57:28Z fabian $
22
* Changelog: Michael Remmert made changes to hhalign stand-alone code
23
* FS implemented some of the changes on 2010-11-11 -> MR1
25
* did not incorporate SS prediction PSIpred (yet); functions are:
34
* AddBackgroundFrequencies()
36
* @brief add background frequencies (derived from HMM) to
39
* @param[in,out] ppfFreq,
40
* [in] residue frequencies of sequence/profile,
41
* [out] overlayed with HMM background frequencies
42
* @param[in,out] ppfPseudoF,
43
* [in] residue frequencies+pseudocounts of sequence/profile,
44
* [out] overlayed with HMM background frequencies+pseudocounts
46
* length of sequence/profile (not aligned to HMM)
50
* sequences/profile to be 'softened'
51
* @param[in] pcPrealigned,
52
* sequence aligned to HMM, this is not quite a consensus,
53
* it is identical to 1st sequence but over-writes gap information,
54
* if other sequences in profile have (non-gap) residues
56
* number of sequences pre-aligned (pcPrealigned is 'consensus' of these sequences)
57
* @param[in] pcRepresent
58
* sequence representative of HMM, aligned to pcSeq0
61
AddBackgroundFrequencies(float **ppfFreq, float **ppfPseudoF, float **ppfPseudoTr,
62
int iSeqLen, hmm_light *prHMM,
63
char **ppcSeq, char *pcPrealigned, int iPreCnt, char *pcRepresent)
66
char *pcS = pcPrealigned; /* current residue in pre-aligned sequence */
67
char *pcH = pcRepresent; /* current residue in pre-aligned HMM */
68
int iS = 0; /* position in un-aligned sequence (corresponds to pcS) */
69
int iH = 0; /* position in un-aligned HMM (corresponds to pcH) */
70
int iA; /* residue iterator */
71
int iT; /* transition state iterator */
72
float fFWeight = 0.50 / sqrt((float)(iPreCnt)); /* weight of backgroud frequencies */ /* FIXME: tune value, 0.50 default */
73
//float fFWeight = 0.75;
74
float fFThgiew = UNITY - fFWeight; /* weight of 'true' frequencies */
75
float fGWeight = 0.50 / sqrt((float)(iPreCnt)); /* weight of backgroud transitions */ /* FIXME: tune value, 0.50 default */
76
//float fGWeight = 0.50 /*/ (float)(iPreCnt)*/; /* weight of backgroud transitions */ /* FIXME: tune value, 0.50 default */
77
float fGThgiew = UNITY - fGWeight; /* weight of 'true' transitions */
79
/* zf1SeqTrans[] are the (phenomenological) transition probabilities (+ pseudo-counts)
80
for a single sequence. It is almost certain (0.99) to stay in a match state, and very
81
unlikely (0.05) to go from match to insertion/deletion */
82
float zf1SeqTrans[] = {0.98, 0.01, 0.01, 0.25, 0.75, 0.25, 0.75};
83
float zf1SeqInit[] = {0.98, 0.01, 0.01, 0.99, 0.01, 0.99, 0.01};
84
float zf1SeqDel[] = {0.24, 0.01, 0.75, 0.01, 0.01, 0.01, 0.01};
85
float zf1SeqRevrt[] = {0.98, 0.01, 0.01, 0.01, 0.01, 0.99, 0.01};
87
if ( (NULL == pcPrealigned) || (NULL == pcRepresent) ){
88
/*printf("%s/%s:%d: WARNING HMM=NULL -- didn't think I would get here (carry on, no danger)\n",
89
__FUNCTION__, __FILE__, __LINE__);*/
93
if (NULL == prHMM->p){
94
printf("%s:%s:%d: WARNING -- Background Pseudocounts point to NULL\n"
95
"\tthis is not intended - don't add background but CONTINUE\n",
96
__FUNCTION__, __FILE__, __LINE__);
101
/* FIXME: should be 0 (FS thinks) but -1 gives better results */
102
iH = iS = 0/*-1*//*+1*/;
103
while ( ('\0' != *pcS) && ('\0' != *pcH) ){
105
if ( ('-' != *pcH) && ('-' != *pcS) ){
111
* - HMM had a gap in previous position (now closed)
112
* FIXME: this does not really work */
113
if ((iH > 0) && ('-' == *(pcH-1))){
115
for (iT = 0; iT < STATE_TRANSITIONS; iT++){
116
ppfPseudoTr[iS-1][iT] = log2f(zf1SeqRevrt[iT]);
122
/* do frequencies -- this is not really useful;
123
frequencies are derived from HMM
124
and contain already pseudocounts (PCs),
125
adding frequencies and then PCs will add PCs _twice_
126
results are better than not to add them,
127
but not as good as PCs */
128
for (iA = 0; iA < AMINOACIDS; iA++){
129
fAux = fFThgiew * ppfFreq[iS][iA] + fFWeight * prHMM->f[iH][iA];
130
ppfFreq[iS][iA] = fAux;
133
/* do pseudo-counts */
134
for (iA = 0; iA < AMINOACIDS; iA++){
136
fFThgiew * ppfPseudoF[iS][iA] + fFWeight * prHMM->p[iH][iA];
137
ppfPseudoF[iS][iA] = fAux;
139
#if 0 /* do state transitions */
140
for (iT = 0; iT < STATE_TRANSITIONS; iT++){
142
/* this is a very crude method,
143
which averages the logarithms of the transitions,
144
effectively performing a geometric mean -
145
this presumably violates normalisation.
146
however, results are surprisingly good */
148
fGThgiew * ppfPseudoTr[iS][iT] +
149
fGWeight * prHMM->tr[iH][iT];
150
ppfPseudoTr[iS][iT] = fAux;
151
#else /* crude averaging */
152
/* There are 2 things to consider:
153
(1) one should really blend the probabilities of
154
the transitions, however, by default we have
155
the logarithms thereof.
156
So must exponentiate, blend, and then take log again.
157
This is expensive, and does not seem to lead to better
158
results than blending the logarithms
159
(and violating normalisation)
160
(2) transition probabilities for a single sequence
161
are really easy, there are no insert/delete transitions.
162
However, there is a begin state that is different
164
But again, this does not seem to make a blind bit
169
fGThgiew * zf1SeqTrans[iT] +
170
fGWeight * prHMM->linTr[iH][iT];
171
ppfPseudoTr[iS][iT] = log2f(fAux);
175
fGThgiew * zf1SeqInit[iT] +
176
fGWeight * prHMM->linTr[iH][iT];
177
ppfPseudoTr[iS][iT] = log2f(fAux);
179
#endif /* mixing of linear/log */
180
} /* 0 <= iT < STATE_TRANSITIONS */
181
#endif /* did state transitions */
182
} /* was Match -- neither HMM nor sequence have gap */
184
else if ('-' == *pcH){
185
/* sequence opened up gap in HMM */
187
if ((iH > 0) && ('-' != *(pcH-1)) && (iS > 0)){
188
/* this is the first gap in HMM
189
* FIXME: this does not really work */
190
for (iT = 0; iT < STATE_TRANSITIONS; iT++){
191
ppfPseudoTr[iS-1][iT] = log2f(zf1SeqDel[iT]);
195
/* do nothing, keep single sequence values exactly as they are*/
200
else if ('-' == *pcS){
201
/* here the single sequence has a gap,
202
and the HMM (as a whole) does not. There may be individual gaps
203
in the HMM at this stage. By ignoring this we say that the HMM
204
dominates the overall behaviour - as in the M2M state as well
216
} /* this is the end of AddBackgroundFrequencies() */
223
* @brief Read HMM input file or transfer alignment
224
* and add pseudocounts etc.
226
* @param[in] iRnPtype
227
* type of read/prepare
228
* enum {INTERN_ALN_2_HMM = 0, READ_ALN_2_HMM, READ_HMM_2_HMM};
232
* number of seqs in alignment
233
* @param[in,out] prHMM,
234
* [in] if sequences read/prepared, [out] if HMM from file
235
* @param[in] pcPrealigned,
236
* (single) sequence aligned to background HMM
237
* @param[in] pcRepresent,
238
* sequence representing HMM aligned to individual sequence
239
* param[in] pdExWeights
240
* (external) sequence weights, derived from tree
242
* name of file with HMM info (formerly also alignment)
244
* HMM structure with transition probabilities, residue frequencies etc
246
* FIXME: what is qali?
248
* @return FAILURE on error, OK otherwise
251
ReadAndPrepare(int iRnPtype,
252
char **ppcProf, int iCnt, hmm_light *prHMM,
253
char *pcPrealigned, char *pcRepresent, double *pdExWeights,
254
char* infile, HMM& q, Alignment* qali=NULL) {
256
//#ifndef CLUSTALO_NOFILE
259
/* NOTE: there are different scenarios:
261
(i) ("" != infile) - read HMM from file,
262
transfer frequency/transition/pseudo-count (f/t/p) info into prHMM
264
(ii) ('\0' != ppcProf[0]) - transfer sequence/alignment into q/qali,
265
don't save f/t/p into prHMM,
266
on the contrary, if prior f/t/p available then add it to q/qali,
267
this is only done if (1==iCnt)
269
(iii) ('\0' == ppcProf[0]) - re-cycle old HMM information
270
recreate a HMM from previous data
273
/********************************/
274
/*** (o) Recycle internal HMM ***/
275
/********************************/
276
if ( (INTERN_ALN_2_HMM == iRnPtype) && (iCnt <= 0) ){
278
/* NOTE: here we are writing into a HMM structure/class;
279
memory has been allocated for this in hhalign.cpp;
280
however, as iCnt<=0, there may not be memory for
281
prHMM->n_display sequences/names.
282
But then, there doesn't have to be.
283
At this stage we are just copying one HMM into another HMM,
284
sequences are irrelevant. The only sequence of (marginal)
285
interest is the consensus sequence */
286
/* FIXME: check that prHMM is valid -- how? */
288
const int ciCons = 0;
289
const int ciNoof = ciCons+1;
290
const int ciInvalid = -1;
291
q.n_display = ciNoof; /* only consensus */
294
q.nfirst = ciCons;//prHMM->nfirst;
295
q.nss_dssp = ciInvalid;//prHMM->nss_dssp;
296
q.nsa_dssp = ciInvalid;//prHMM->nsa_dssp;
297
q.nss_pred = ciInvalid;//prHMM->nss_pred;
298
q.nss_conf = ciInvalid;//prHMM->nss_conf;
300
q.N_in = prHMM->N_in;
301
q.N_filtered = prHMM->N_filtered;
302
/* NOTE: I (FS) think that only ever 1 sequence will be transferred here,
303
that is, the consensus sequence. However, we might want to allow
304
(in the future) to transfer more sequences,
305
hence the awkward for() loop */
307
for (int i = prHMM->ncons+0; i < prHMM->ncons+q.n_display; i++){
308
/* NOTE: In the original hhalign code the first position
309
is kept open ('\0'). This makes it difficult to use
310
string functions like strlen/strdup etc.
311
Insert a temporary gap (.) to facilitate string operations */
312
char cHead = prHMM->seq[i][0];
313
prHMM->seq[i][0] = '.';
314
q.seq[i] = strdup(prHMM->seq[i]);
315
prHMM->seq[i][0] = q.seq[i][0] = cHead;
319
char cHead = prHMM->seq[prHMM->ncons][0];
320
prHMM->seq[prHMM->ncons][0] = '.';
321
q.seq[q.ncons] = strdup(prHMM->seq[prHMM->ncons]);
322
prHMM->seq[prHMM->ncons][0] = q.seq[q.ncons][0] = cHead;
325
const int NEFF_SCORE = 10; /* FIXME: Magic Number */
326
for (int i = 0; i < prHMM->L+1; i++){
327
q.Neff_M[i] = prHMM->Neff_M[i];
328
q.Neff_I[i] = prHMM->Neff_I[i];
329
q.Neff_D[i] = prHMM->Neff_D[i];
331
q.Neff_HMM = prHMM->Neff_HMM;
332
/* skip longname,name,file,fam,sfam,fold,cl */
333
q.lamda = prHMM->lamda;
336
HMMshadow rShadow = {0}; /* friend of HMM to access private members */
337
rShadow.copyShadowToHMM(q, *prHMM);
339
/* skip trans_lin,ss_dssp,sa_dssp,ss_pred,ss_conf,Xcons */
340
/* pav already done in copyShadowToHMM */
346
} /* INTERN_ALN_2_HMM && iCnt<=0 */
348
/******************************/
349
/*** (i) Read HMM from file ***/
350
/******************************/
351
char line[LINELEN] = {0}; // input line
353
//if ( (0 != strcmp(infile,"")) /*&& (iCnt > 0)*/ )
354
if ( (READ_HMM_2_HMM == iRnPtype) || (READ_ALN_2_HMM == iRnPtype)) {
356
if (0 == strcmp(infile,"")){
358
"\texpected to re %s from file but no file specified\n"
360
, __FUNCTION__, __FILE__, __LINE__
361
, (READ_HMM_2_HMM==iRnPtype?"HMM":"alignment"));
364
inf = fopen(infile, "r");
365
if (!inf) OpenFileError(infile);
366
Pathname(path,infile);
371
//if (v>=2) printf("Reading HMM / multiple alignment from standard input ...\n(To get a help list instead, quit and type %s -h.)\n",program_name);
375
fgetline(line,LINELEN-1,inf);
378
//if ( (0 != strcmp(infile,"")) && (iCnt > 0) )
379
if ( (READ_HMM_2_HMM == iRnPtype) ){
381
if (0 == strcmp(infile,"")){
382
printf("%s:%s:%d: expected to read HMM from file but no file-name\n",
383
__FUNCTION__, __FILE__, __LINE__);
386
// Is infile a HMMER3 file? /* MR1 */
387
if (!strncmp(line,"HMMER3",6))
390
cout<<"Query file is in HMMER3 format\n";
391
cout<<"WARNING! Use of HMMER3 format as input results in dramatically loss of sensitivity!\n";
394
// Read 'query HMMER file
396
q.ReadHMMer3(inf,path);
398
// Don't add transition pseudocounts to query!!
400
// Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
401
q.PreparePseudocounts();
403
// DON'T ADD amino acid pseudocounts to query: pcm=0! q.p[i][a] = f[i][a]
404
q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
406
/* further down there is a q.Log2LinTransitionProbs
407
but only if (iCnt>0), however, we still need it it here (i think),
408
there is no danger of doing this twice, as trans_lin is checked
409
FIXME (FS, 2011-01-12) */
410
/* further down there is a q.Log2LinTransitionProbs
411
but only if (iCnt>0), however, we still need it it here (i think),
412
there is no danger of doing this twice, as trans_lin is checked
413
FIXME (FS, 2011-01-12) */
414
//if (par.forward >= 1)
416
q.Log2LinTransitionProbs(1.0);
420
// ... or Is infile an old HMMER file?
421
else if (!strncmp(line,"HMMER",5)) {
423
cout<<"Query file is in HMMER format\n";
424
cout<<"WARNING! Use of HMMER format as input results in dramatically loss of sensitivity!\n";
427
// Read 'query HMMER file
428
q.ReadHMMer(inf,path);
429
if (v>=2 && q.Neff_HMM>11.0)
430
fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM);
432
// Don't add transition pseudocounts to query!!
434
// Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
435
q.PreparePseudocounts();
437
// DON'T ADD amino acid pseudocounts to query: pcm=0! q.p[i][a] = f[i][a]
438
q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
440
/* further down there is a q.Log2LinTransitionProbs
441
but only if (iCnt>0), however, we still need it it here (i think),
442
there is no danger of doing this twice, as trans_lin is checked
443
FIXME (FS, 2011-01-12) */
444
//if (par.forward >= 1)
446
q.Log2LinTransitionProbs(1.0);
449
} /* it was a HMMer file */
451
// ... or is it an hhm file?
452
else if (!strncmp(line,"NAME",4) || !strncmp(line,"HH",2)) {
454
if (v>=2) cout<<"Query file is in HHM format\n";
456
// Rewind to beginning of line and read query hhm file
459
if (v>=2 && q.Neff_HMM>11.0)
460
fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM);
462
// Add transition pseudocounts to query -> q.p[i][a]
463
q.AddTransitionPseudocounts();
465
// Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
466
q.PreparePseudocounts();
468
// Add amino acid pseudocounts to query: q.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
469
q.AddAminoAcidPseudocounts();
471
} /* it was a HHM file */
473
fprintf(stderr, "%s:%s:%d: Unknown HMM format in infile\n"
474
"infile=%s, #seq=%d\n"
475
, __FUNCTION__, __FILE__, __LINE__
482
/*** transfer class info to struct */
483
prHMM->n_display = q.n_display;
485
prHMM->seq = (char **)calloc((q.n_display+1), sizeof(char *));
486
/* FIXME valgrind says bytes get lost in the above calloc during
489
for (int i = 0; i < q.n_display; i++){
490
/* NOTE: In the original hhalign code the first position
491
is kept open ('\0'). This makes it difficult to use
492
string functions like strlen/strdup etc.
493
Insert a temporary gap (.) to facilitate string operations */
494
char cHead = q.seq[i][0];
496
prHMM->seq[i] = strdup(q.seq[i]);
497
q.seq[i][0] = prHMM->seq[i][0] = cHead;
499
prHMM->ncons = q.ncons;
500
prHMM->nfirst = q.nfirst;
501
prHMM->nss_dssp = q.nss_dssp;
502
prHMM->nsa_dssp = q.nsa_dssp;
503
prHMM->nss_pred = q.nss_pred;
504
prHMM->nss_conf = q.nss_conf;
506
prHMM->N_in = q.N_in;
507
prHMM->N_filtered = q.N_filtered;
508
const int NEFF_SCORE = 10; /* FIXME: Magic Number */
509
prHMM->Neff_M = (float *)calloc(prHMM->L+1, sizeof(float));
510
prHMM->Neff_I = (float *)calloc(prHMM->L+1, sizeof(float));
511
prHMM->Neff_D = (float *)calloc(prHMM->L+1, sizeof(float));
512
/*for (int i = 0; i < prHMM->L+1; i++){
513
prHMM->Neff_M[i] = prHMM->Neff_I[i] = prHMM->Neff_D[i] = NEFF_SCORE;
515
prHMM->Neff_HMM = q.Neff_HMM;
516
/* skip longname,name,file,fam,sfam,fold,cl */
517
prHMM->lamda = q.lamda;
520
HMMshadow rShadow = {0}; /* friend of HMM to access private members */
521
rShadow.copyHMMtoShadow(q);
523
prHMM->f = (float **)calloc(prHMM->L+1, sizeof(float *));
524
prHMM->g = (float **)calloc(prHMM->L+1, sizeof(float *));
525
prHMM->p = (float **)calloc(prHMM->L+1, sizeof(float *));
526
prHMM->tr = (float **)calloc(prHMM->L+1, sizeof(float *));
527
prHMM->linTr = (float **)calloc(prHMM->L+1, sizeof(float *));
528
for (int i = 0; i < prHMM->L+1; i++){
529
prHMM->f[i] = (float *)calloc(AMINOACIDS, sizeof(float));
530
prHMM->g[i] = (float *)calloc(AMINOACIDS, sizeof(float));
531
prHMM->p[i] = (float *)calloc(AMINOACIDS, sizeof(float));
532
for (int j = 0; j < AMINOACIDS; j++){
533
prHMM->f[i][j] = (float)rShadow.f[i][j];
534
prHMM->g[i][j] = (float)rShadow.g[i][j];
535
prHMM->p[i][j] = (float)rShadow.p[i][j];
537
prHMM->tr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float));
538
prHMM->linTr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float));
539
for (int j = 0; j< STATE_TRANSITIONS; j++){
540
prHMM->tr[i][j] = (float)rShadow.tr[i][j];
541
prHMM->linTr[i][j] = fpow2(rShadow.tr[i][j]);
543
} /*0 <= i < prHMM->L+1 */
544
/* skip trans_lin,ss_dssp,sa_dssp,ss_pred,ss_conf,Xcons */
545
for (int j = 0; j < AMINOACIDS; j++){
546
prHMM->pav[j] = (float)rShadow.pav[j];
550
} /* have read HHM from file */
551
/*else if ( ((NULL != ppcProf) && (iCnt > 0) && ('\0' != ppcProf[0][0])) ||
552
( (0 != strcmp(infile,"") && (0 == iCnt) )) )*/
553
else if ( (INTERN_ALN_2_HMM == iRnPtype) || (READ_ALN_2_HMM == iRnPtype) ) {
555
if ( (INTERN_ALN_2_HMM == iRnPtype) &&
556
( (NULL == ppcProf) || (iCnt <= 0) || ('\0' == ppcProf[0][0]) ) ){
557
printf("%s:%s:%d: was expecting internal alignment but\n"
558
"\tppcProf=%p, #seq=%d, ppcProf[0][0]=%c\n"
559
, __FUNCTION__, __FILE__, __LINE__
560
, ppcProf, iCnt, ppcProf[0][0]);
563
else if ( (READ_ALN_2_HMM == iRnPtype) &&
564
(0 == strcmp(infile,"")) ){
565
printf("%s:%s:%d: was expecting to read alignment from file but no filename\n"
566
, __FUNCTION__, __FILE__, __LINE__);
570
/*******************************/
571
/*** (ii) it is an alignment ***/
572
/*******************************/
573
/* transfer alignment information from clustal character array
574
into pali/q/t classes */
576
* NOTE that emissions in HMMer file format contain pseudo-counts.
577
* HHM file format does not contain emission pseudo-counts.
578
* the structure that stores background HMM information does contain pseudo-counts
582
pali=new Alignment(iCnt); /* FIXME: pass in iCnt to get rid of MAXSEQ */
589
printf("\nError in %s: only HHM files can be calibrated.\n",program_name);
590
printf("Build an HHM file from your alignment with 'hhmake -i %s' and rerun hhsearch with the hhm file\n\n",infile);
594
if (v>=2 && strcmp(infile,"stdin")) cout<<infile<<" is in A2M, A3M or FASTA format\n";
596
/* Read alignment from infile into matrix X[k][l] as ASCII
597
(and supply first line as extra argument) */
599
if (INTERN_ALN_2_HMM == iRnPtype){
600
pali->Transfer(ppcProf, iCnt);
602
//else if (0 == iCnt)
603
else if (READ_ALN_2_HMM == iRnPtype){
604
pali->Read(inf, infile, line);
607
printf("%s:%s:%d: FATAL problem\n"
608
"infile = (%s), #seq = %d\n"
609
, __FUNCTION__, __FILE__, __LINE__
614
/* Convert ASCII to int (0-20), throw out all insert states,
615
record their number in I[k][i]
616
and store marked sequences in name[k] and seq[k] */
617
pali->Compress(infile);
619
/* Sort out the nseqdis most dissimilar sequences for display
620
in the output alignments */
621
pali->FilterForDisplay(par.max_seqid, par.coverage, par.qid,
622
par.qsc,par.nseqdis);
624
// Filter alignment for min score per column with core query profile, defined by coverage_core and qsc_core
625
//if (par.coresc>-10) pali->HomologyFilter(par.coverage_core, par.qsc_core, par.coresc);
627
/* Remove sequences with seq. identity larger than seqid percent
628
(remove the shorter of two) */
629
pali->N_filtered = pali->Filter(par.max_seqid, par.coverage,
630
par.qid, par.qsc, par.Ndiff);
632
/* Calculate pos-specific weights,
633
AA frequencies and transitions -> f[i][a], tr[i][a] */
634
pali->FrequenciesAndTransitions(q);
635
if (v>=2 && q.Neff_HMM>11.0)
636
fprintf(stderr,"WARNING: alignment %s looks too diverse (Neff=%.1f>11). Better check it with an alignment viewer... \n",q.name,q.Neff_HMM);
638
/*printf("%d %d %f %d (N,Nf,Neff,L) %s:%s:%d\n"
639
, q.N_in, q.N_filtered, q.Neff_HMM, q.L, __FUNCTION__, __FILE__, __LINE__);*/
641
// Add transition pseudocounts to query -> p[i][a]
642
q.AddTransitionPseudocounts();
644
/* Generate an amino acid frequency matrix from f[i][a]
645
with full pseudocount admixture (tau=1) -> g[i][a] */
646
q.PreparePseudocounts();
648
/* Add amino acid pseudocounts to query:
649
p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] */
650
q.AddAminoAcidPseudocounts();
652
/* ****** add aligned background pseudocounts ***** */
653
HMMshadow rShadowQ = {0};
654
rShadowQ.copyHMMtoShadow(q);
656
AddBackgroundFrequencies(rShadowQ.f, rShadowQ.p, rShadowQ.tr,
658
q.seq, pcPrealigned, iCnt, pcRepresent);
662
delete(pali); pali = NULL;
665
} /* else if (NULL != ppcProf) // not hmmer/hhalign but alignment */
667
//else if ((prHMM->L > 0) && ('\0' == ppcProf[0][0]))
668
else if (INTERN_HMM_2_HMM == iRnPtype){
670
/******************************************/
671
/*** (iii) re-cycle old HMM information ***/
672
/******************************************/
675
printf("%s:%s:%d: was expecting to copy HMM structure but L=%d\n"
676
, __FUNCTION__, __FILE__, __LINE__, prHMM->L);
679
printf("%s:%s:%d: RE-CYCLING HMM\n", __FUNCTION__, __FILE__, __LINE__);
682
q.n_display = prHMM->n_display;
684
for (int i = 0; i < q.n_display; i++){
685
/* NOTE: In the original hhalign code the first position
686
is kept open ('\0'). This makes it difficult to use
687
string functions like strlen/strdup etc.
688
Insert a temporary gap (.) to facilitate string operations */
689
char cHead = prHMM->seq[i][0];
690
prHMM->seq[i][0] = '.';
691
q.seq[i] = strdup(prHMM->seq[i]);
692
q.seq[i][0] = prHMM->seq[i][0] = cHead;
694
q.nfirst = prHMM->nfirst;
698
char cHead = prHMM->seq[prHMM->nfirst][0];
699
prHMM->seq[prHMM->nfirst][0] = '.';
700
q.seq[0] = strdup(prHMM->seq[prHMM->nfirst]);
701
q.seq[q.n_display-1][0] = prHMM->seq[prHMM->nfirst][0] = cHead;
703
q.ncons = prHMM->ncons;
704
q.nss_dssp = prHMM->nss_dssp;
705
q.nsa_dssp = prHMM->nsa_dssp;
706
q.nss_pred = prHMM->nss_pred;
707
q.nss_conf = prHMM->nss_conf;
709
q.N_in = prHMM->N_in;
710
q.N_filtered = prHMM->N_filtered;
711
#define NEFF_SCORE 10 /* FIXME: Magic Number */
712
/*for (int i; i < prHMM->L+1; i++){
713
q.Neff_M[i] = q.Neff_I[i] = q.Neff_D[i] = NEFF_SCORE;
715
q.Neff_HMM = prHMM->Neff_HMM;
716
/* skip longname,name,file,fam,sfam,fold,cl */
717
q.lamda = prHMM->lamda;
720
HMMshadow rShadow = {0}; /* friend of HMM to access private members */
721
rShadow.copyShadowToHMM(q, *prHMM);
726
if (par.forward>=1) q.Log2LinTransitionProbs(1.0);
735
} /*** end: ReadAndPrepare() ***/
745
FreeHMMstruct(hmm_light *prHMM)
749
if (NULL != prHMM->f){
750
for (i = 0; i < prHMM->L+1; i++){
751
if (NULL != prHMM->f[i]){
752
free(prHMM->f[i]); prHMM->f[i] = NULL;
754
} /* 0 <= i < prHMM->L+1 */
755
free(prHMM->f); prHMM->f = NULL;
757
if (NULL != prHMM->g){
758
for (i = 0; i < prHMM->L+1; i++){
759
if (NULL != prHMM->g[i]){
760
free(prHMM->g[i]); prHMM->g[i] = NULL;
762
} /* 0 <= i < prHMM->L+1 */
763
free(prHMM->g); prHMM->g = NULL;
765
if (NULL != prHMM->p){
766
for (i = 0; i < prHMM->L+1; i++){
767
if (NULL != prHMM->p[i]){
768
free(prHMM->p[i]); prHMM->p[i] = NULL;
770
} /* 0 <= i < prHMM->L+1 */
771
free(prHMM->p); prHMM->p = NULL;
773
if (NULL != prHMM->tr){
774
for (i = 0; i < prHMM->L+1; i++){
775
if (NULL != prHMM->tr[i]){
776
free(prHMM->tr[i]); prHMM->tr[i] = NULL;
778
} /* 0 <= i < prHMM->L+1 */
779
free(prHMM->tr); prHMM->tr = NULL;
781
if (NULL != prHMM->linTr){
782
for (i = 0; i < prHMM->L+1; i++){
783
if (NULL != prHMM->linTr[i]){
784
free(prHMM->linTr[i]); prHMM->linTr[i] = NULL;
786
} /* 0 <= i < prHMM->L+1 */
787
free(prHMM->linTr); prHMM->linTr = NULL;
790
if (NULL != prHMM->Neff_M){
791
free(prHMM->Neff_M); prHMM->Neff_M = NULL;
793
if (NULL != prHMM->Neff_I){
794
free(prHMM->Neff_I); prHMM->Neff_I = NULL;
796
if (NULL != prHMM->Neff_D){
797
free(prHMM->Neff_D); prHMM->Neff_D = NULL;
800
if (NULL != prHMM->seq){
801
for (i = 0; i < prHMM->n_display; i++){
802
if (NULL != prHMM->seq[i]){
803
free(prHMM->seq[i]); prHMM->seq[i] = NULL;
806
free(prHMM->seq); prHMM->seq = NULL;
809
memset(prHMM, 0, sizeof(hmm_light));
811
} /*** end: FreeHMMstruct() ***/
815
* @brief comparisin function for ascending qsort() of doubles
818
CompDblAsc(const void *pv1, const void *pv2){
820
double d1 = *(double *)pv1;
821
double d2 = *(double *)pv2;
823
if (d1 > d2) { return +1; }
824
else if (d1 < d2) { return -1; }
827
} /*** end: CompDblAsc() ***/
833
* @brief convert alignment into HMM (hhmake)
836
* structure with pseudocounts etc
837
* @param[in] pcHMM_input
838
* name of file with HMM
842
AlnToHMM2(hmm_light *rHMM_p,
843
char **ppcSeq, int iN){
847
SetSubstitutionMatrix();
852
const int ciGoodMeasureSeq = 10;
853
int iAuxLen = strlen(ppcSeq[0]);
854
par.maxResLen = iAuxLen;
855
par.maxResLen += ciGoodMeasureSeq;
856
par.maxColCnt = iAuxLen + ciGoodMeasureSeq;
857
par.max_seqid=DEFAULT_FILTER;
858
par.loc=0; par.mact=0;
859
/* some minor parameters, affecting alignment (i think) */
860
par.p = 0.0; /* minimum threshold for inclusion in hit list */
861
par.Z = 100; /* minimum threshold for inclusion in hit list and alignment listing */
862
par.z = 1; /* min number of lines in hit list */
863
par.B = 100; /* max number of alignments */
864
par.b = 1; /* min number of alignments */
865
par.E = 1e6; /* maximum threshold for inclusion in hit list and alignment listing */
866
par.altali=1;par.hitrank=0;par.showcons=1; par.showdssp=1;par.showpred=1;par.nseqdis=iN;par.cons=1;
869
const int ciGoodMeasure = 10;
870
int iLen = strlen(ppcSeq[0]) + ciGoodMeasure;
871
if (iLen > par.maxResLen){
872
par.maxResLen = par.maxColCnt = iLen;
874
HMM rTemp(iN+2, iLen); /* temporary template */
875
Alignment rTempAli(iN+2, iLen); /* temporary alignment */
876
int iParCons = par.cons;
878
/*printf(">>>>>>>>>>> %s:%s:%d: there are %d sequences\n", __FUNCTION__, __FILE__, __LINE__, iN);*/
881
if (OK != ReadAndPrepare(INTERN_ALN_2_HMM,
883
NULL/*prealigned*/, NULL/*representative*/, NULL/*weights*/, //YES/*transfer*/,
884
(char*)("")/*in-file*/, rTemp, &rTempAli)) {
890
/*** transfer class info to struct */
891
rHMM_p->n_display = rTemp.n_display;
892
rHMM_p->sname = NULL;
893
rHMM_p->seq = (char **)calloc((rTemp.n_display+1), sizeof(char *));
895
for (int i = 0; i < rTemp.n_display; i++){
896
/* NOTE: In the original hhalign code the first position
897
is kept open ('\0'). This makes it difficult to use
898
string functions like strlen/strdup etc.
899
Insert a temporary gap (.) to facilitate string operations */
900
char cHead = rTemp.seq[i][0];
901
rTemp.seq[i][0] = '.';
902
rHMM_p->seq[i] = strdup(rTemp.seq[i]);
903
rTemp.seq[i][0] = rHMM_p->seq[i][0] = cHead;
905
rHMM_p->ncons = rTemp.ncons;
906
rHMM_p->nfirst = rTemp.nfirst;
907
if (-1 == rHMM_p->ncons){
908
/* ncons needed later for alignment of
909
representative sequence and copy of profile.
910
ncons not always set (-1 default),
911
this will cause segmentation fault.
912
nfirst (probably) right index -
914
rHMM_p->ncons = rTemp.nfirst;
916
rHMM_p->nss_dssp = rTemp.nss_dssp;
917
rHMM_p->nsa_dssp = rTemp.nsa_dssp;
918
rHMM_p->nss_pred = rTemp.nss_pred;
919
rHMM_p->nss_conf = rTemp.nss_conf;
921
rHMM_p->N_in = rTemp.N_in;
922
rHMM_p->N_filtered = rTemp.N_filtered;
923
#define NEFF_SCORE 10 /* FIXME: Magic Number */ //// get rid of that, FS, 2010-12-22
924
rHMM_p->Neff_M = (float *)calloc(rHMM_p->L+1, sizeof(float));
925
rHMM_p->Neff_I = (float *)calloc(rHMM_p->L+1, sizeof(float));
926
rHMM_p->Neff_D = (float *)calloc(rHMM_p->L+1, sizeof(float));
927
for (int i = 0; i < rHMM_p->L+1; i++){
928
rHMM_p->Neff_M[i] = rHMM_p->Neff_I[i] = rHMM_p->Neff_D[i] = NEFF_SCORE; //// get rid of that, FS, 2010-12-22
930
rHMM_p->Neff_HMM = rTemp.Neff_HMM;
931
/* skip longname,name,file,fam,sfam,fold,cl */
932
rHMM_p->lamda = rTemp.lamda;
933
rHMM_p->mu = rTemp.mu;
935
HMMshadow rShadow = {0}; /* friend of HMM to access private members */
936
rShadow.copyHMMtoShadow(rTemp);
938
rHMM_p->Neff_M = (float *)calloc(rHMM_p->L+1, sizeof(float));
939
rHMM_p->Neff_I = (float *)calloc(rHMM_p->L+1, sizeof(float));
940
rHMM_p->Neff_D = (float *)calloc(rHMM_p->L+1, sizeof(float));
941
rHMM_p->f = (float **)calloc(rHMM_p->L+1, sizeof(float *));
942
rHMM_p->g = (float **)calloc(rHMM_p->L+1, sizeof(float *));
943
rHMM_p->p = (float **)calloc(rHMM_p->L+1, sizeof(float *));
944
rHMM_p->tr = (float **)calloc(rHMM_p->L+1, sizeof(float *));
945
rHMM_p->linTr = (float **)calloc(rHMM_p->L+1, sizeof(float *));
947
for (int i = 0; i < rHMM_p->L+1; i++){
948
rHMM_p->Neff_M[i] = (float)rShadow.Neff_M[i];
949
rHMM_p->Neff_I[i] = (float)rShadow.Neff_I[i];
950
rHMM_p->Neff_D[i] = (float)rShadow.Neff_D[i];
951
rHMM_p->f[i] = (float *)calloc(AMINOACIDS, sizeof(float));
952
rHMM_p->g[i] = (float *)calloc(AMINOACIDS, sizeof(float));
953
rHMM_p->p[i] = (float *)calloc(AMINOACIDS, sizeof(float));
954
for (int j = 0; j < AMINOACIDS; j++){
955
rHMM_p->f[i][j] = (float)rShadow.f[i][j];
956
rHMM_p->g[i][j] = (float)rShadow.g[i][j];
957
rHMM_p->p[i][j] = (float)rShadow.p[i][j];
959
rHMM_p->tr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float));
960
rHMM_p->linTr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float));
961
for (int j = 0; j< STATE_TRANSITIONS; j++){
962
rHMM_p->tr[i][j] = (float)rShadow.tr[i][j];
963
rHMM_p->linTr[i][j] = fpow2(rShadow.tr[i][j]);
965
} /*0 <= i < prHMM->L+1 */
966
/* skip trans_lin,ss_dssp,sa_dssp,ss_pred,ss_conf,Xcons */
967
for (int j = 0; j < AMINOACIDS; j++){
968
rHMM_p->pav[j] = (float)rShadow.pav[j];
976
rTemp.ClobberGlobal();
977
rTempAli.ClobberGlobal();
981
} /*** end of AlnToHMM2() ***/
987
* @brief turn alignment (from file) into HHM (HMM) on file
989
* @param[out] hmm_out
990
* name of file with HMM info corresponding to tmp_aln
992
* name of file with alignment, to be turned into HMM (HHM)
996
HHMake_Wrapper(char *tmp_aln, char *hmm_out)
999
HMM rTemp; /* temporary template */
1000
Alignment rTempAli; /* temporary alignment */
1001
hmm_light *rHMM_p = NULL;
1004
this is a wrapper for a stand-alone program hhmake.
1005
hhmake uses a different set of parameters than hhalign.
1006
However, all parameters are GLOBAL.
1007
So at this point we register the hhalign settings,
1008
reset them to hhmake settings and set them back
1009
at the end of the function
1012
/* save settings from hhalign */
1013
int iParShowcons=par.showcons;
1014
int iParAppend=par.append;
1015
int iParNseqdis=par.nseqdis;
1016
int iParMark=par.mark;
1017
int iParMaxSeqid=par.max_seqid;
1018
int iParQid=par.qid;
1019
double dParQsc=par.qsc;
1020
int iParCoverage=par.coverage;
1021
int iParNdiff=par.Ndiff;
1022
int iParCoverageCore=par.coverage_core;
1023
double dParQscCore=par.qsc_core;
1024
double dParCoresc=par.coresc;
1026
int iParMgaps=par.Mgaps;
1027
int iParMatrix=par.matrix;
1028
int iParPcm=par.pcm;
1029
double dParPcw=par.pcw;
1030
double dParGapb=par.gapb;
1033
/* these are settings suitable for hhmake */
1034
par.showcons=1; // write consensus sequence into hhm file
1035
par.append=0; // overwrite output file
1036
par.nseqdis=10; // maximum number of query or template sequences to be recoreded in HMM and diplayed in output alignments
1037
par.mark=0; // 1: only marked sequences (or first) get displayed; 0: most divergent ones get displayed
1038
par.max_seqid=90; // default for maximum sequence identity threshold
1039
par.qid=0; // default for maximum sequence identity threshold
1040
par.qsc=-20.0f; // default for minimum score per column with query
1041
par.coverage=0; // default for minimum coverage threshold
1042
par.Ndiff=100; // pick Ndiff most different sequences from alignment
1043
par.coverage_core=30; // Minimum coverage for sequences in core alignment
1044
par.qsc_core=0.3f; // Minimum score per column with core alignment (HMM)
1045
par.coresc=-20.0f; // Minimum score per column with core alignment (HMM)
1046
par.M=1; // match state assignment is by A2M/A3M
1047
par.Mgaps=50; // above this percentage of gaps, columns are assigned to insert states
1048
par.matrix=0; // Subst.matrix 0: Gonnet, 1: HSDM, 2: BLOSUM50 3: BLOSUM62
1049
par.pcm=0; // no amino acid and transition pseudocounts added
1050
par.pcw=0; // wc>0 weighs columns according to their intra-clomun similarity
1051
par.gapb=0.0; // default values for transition pseudocounts; 0.0: add no transition pseudocounts!
1052
par.wg=0; // 0: use local sequence weights 1: use local ones
1055
if (OK != ReadAndPrepare(READ_ALN_2_HMM,
1056
NULL, 0, rHMM_p, NULL, NULL, NULL,
1057
tmp_aln, rTemp, &rTempAli)) {
1061
rTemp.WriteToFile(hmm_out);
1065
/* restore settings from hhalign */
1066
par.showcons=iParShowcons;
1067
par.append=iParAppend;
1068
par.nseqdis=iParNseqdis;
1070
par.max_seqid=iParMaxSeqid;
1073
par.coverage=iParCoverage;
1074
par.Ndiff=iParNdiff;
1075
par.coverage_core=iParCoverageCore;
1076
par.qsc_core=dParQscCore;
1077
par.coresc=dParCoresc;
1079
par.Mgaps=iParMgaps;
1080
par.matrix=iParMatrix;
1088
rTemp.ClobberGlobal();
1089
rTempAli.ClobberGlobal();
1097
* @brief read HMM from file, copy (relevant) info into struct
1100
* structure with pseudocounts etc, probably uninitialised on entry
1101
* @param[in] pcHMM_input
1102
* name of file with HMM
1106
readHMMWrapper(hmm_light *rHMM_p,
1109
par.maxResLen = 15002;
1110
HMM rTemp(1000,par.maxResLen); /* temporary template */
1111
Alignment rTempAli; /* temporary alignment */
1112
/* AW changed init from {0} to 0 because it failed to compile with
1113
* gcc 4.3.3 with the following error:
1114
* error: braces around initializer for non-aggregate type
1116
/* FS taken out initialiser alltogether */
1118
/* 0th arg of RnP is the type of RnP, ie,
1119
enum {INTERN_ALN_2_HMM = 0, READ_ALN_2_HMM, READ_HMM_2_HMM};*/
1120
/* 1st arg of ReadAndPrepare() is ppcSeqs, 2nd is #seq */
1121
/* FIXME: at this stage the 3rd arg, rHMM_p, only acts as a dummy.
1122
this is rather silly, as it is the actual struct that will
1123
carry the HMM info at the end.
1124
If we write it already in ReadAndPrepare() then we could
1125
dispense with all this friend-class nonsense */
1126
if (OK != ReadAndPrepare(READ_HMM_2_HMM,
1127
NULL, 0, rHMM_p, NULL, NULL, NULL,
1128
pcHMM_input, rTemp, &rTempAli)) {
1132
/* an important piece of information I want to get out of here
1133
is the consenssus sequence. there are however certain
1134
Pfam HMMs that don't trigger consensus calculation.
1135
at the moment I (FS) don't understand why this is
1136
(or rather why this is not). The proper place to do this
1137
should be inside ReadAndPrepare():ReadHMMer(), but
1138
I have not quite penetrated the logic there.
1139
Therefore I try to catch this condition at this point (here)
1142
if (-1 == rHMM_p->ncons){
1144
rHMM_p->ncons = rHMM_p->nfirst;
1146
for (i = 0; i < rHMM_p->L; i++){
1147
double dPmax = 0.00;
1149
for (iA = 0; iA < AMINOACIDS; iA++){
1150
if (rHMM_p->f[i][iA] > dPmax){
1152
dPmax = rHMM_p->f[i][iA];
1154
} /* (0 <= iA < AMINOACIDS) */
1155
rHMM_p->seq[rHMM_p->ncons][i] = i2aa(iAmax);
1156
} /* (0 <= i < rHMM_p->L) */
1158
} /* ncons not set */
1161
rTemp.ClobberGlobal();
1162
rTempAli.ClobberGlobal();
1166
} /*** end: readHMMWrapper() ***/
1173
/////////////////////////////////////////////////////////////////////////////
1175
* @brief Do precalculations for q and t to prepare comparison
1178
PrepareTemplate(HMM& q, HMM& t, int format)
1180
if (format==0) // HHM format
1182
// Add transition pseudocounts to template
1183
t.AddTransitionPseudocounts();
1185
// Generate an amino acid frequency matrix from t.f[i][a] with max pseudocounts (tau=1) -> t.g[i][a]
1186
t.PreparePseudocounts();
1188
// Add amino acid pseudocounts to template: t.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
1189
t.AddAminoAcidPseudocounts();
1191
else // HHMER format
1193
// Don't add transition pseudocounts to template
1194
// t.AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg, par.gaph, par.gapi, 0.0);
1196
// Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
1197
t.PreparePseudocounts();
1199
// DON'T ADD amino acid pseudocounts to temlate: pcm=0! t.p[i][a] = t.f[i][a]
1200
t.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
1203
// Modify transition probabilities to include SS-dependent penalties
1204
if (par.ssgap) t.UseSecStrucDependentGapPenalties();
1206
if (par.forward>=1) t.Log2LinTransitionProbs(1.0);
1208
// Factor Null model into HMM t
1209
// ATTENTION! t.p[i][a] is divided by pnul[a] (for reasons of efficiency) => do not reuse t.p
1210
t.IncludeNullModelInHMM(q,t); // Can go BEFORE the loop if not dependent on template
1216
/*** end of hhfunc-C.h ***/