~ubuntu-branches/ubuntu/quantal/clustalo/quantal : revision 1

1

/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */

2

3

/*********************************************************************

4

* Clustal Omega - Multiple sequence alignment

5

*

6

7

*

8

* Clustal-Omega is free software; you can redistribute it and/or

9

* modify it under the terms of the GNU General Public License as

10

* published by the Free Software Foundation; either version 2 of the

11

* License, or (at your option) any later version.

12

*

13

* This file is part of Clustal-Omega.

14

*

15

********************************************************************/

16

17

/*

18

* RCS $Id: hhhitlist-C.h 243 2011-05-31 13:49:19Z fabian $

19

*/

20

21

// hhhitlist.C

22

23

#ifndef MAIN

24

#define MAIN

25

#include <iostream> // cin, cout, cerr

26

#include <fstream> // ofstream, ifstream

27

#include <stdio.h> // printf

28

#include <stdlib.h> // exit

29

#include <string> // strcmp, strstr

30

#include <math.h> // sqrt, pow

31

#include <limits.h> // INT_MIN

32

#include <float.h> // FLT_MIN

33

#include <time.h> // clock

34

#include <ctype.h> // islower, isdigit etc

35

using std::ios;

36

using std::ifstream;

37

using std::ofstream;

38

using std::cout;

39

using std::cerr;

40

using std::endl;

41

#include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.

42

#include "list.h" // list data structure

43

#include "hash.h" // hash data structure

44

#include "hhdecl-C.h" // constants, class

45

#include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.

46

#include "hhhmm.h" // class HMM

47

#include "hhalignment.h" // class Alignment

48

#include "hhhit.h"

49

#include "hhhalfalignment.h"

50

#include "hhfullalignment.h"

51

#endif

52

53

54

//////////////////////////////////////////////////////////////////////////////

55

//////////////////////////////////////////////////////////////////////////////

56

//// Methods of class HitList

57

//////////////////////////////////////////////////////////////////////////////

58

//////////////////////////////////////////////////////////////////////////////

59

60

61

62

//////////////////////////////////////////////////////////////////////////////

63

/**

64

* @brief Print summary listing of hits

65

*/

66

void

67

HitList::PrintHitList(HMM& q, char* outfile)

68

{

69

Hit hit;

70

int nhits=0;

71

char str[NAMELEN]="";

72

73

FILE* outf=NULL;

74

if (strcmp(outfile,"stdout"))

75

{

76

outf=fopen(outfile,"w");

77

if (!outf) OpenFileError(outfile);

78

}

79

else

80

outf = stdout;

81

82

83

fprintf(outf,"Query %s\n",q.longname);

84

// fprintf(outf,"Family %s\n",q.fam);

85

fprintf(outf,"Match_columns %i\n",q.L);

86

fprintf(outf,"No_of_seqs %i out of %i\n",q.N_filtered,q.N_in);

87

fprintf(outf,"Neff %-4.1f\n",q.Neff_HMM);

88

fprintf(outf,"Searched_HMMs %i\n",N_searched);

89

90

// Print date stamp

91

time_t* tp=new(time_t);

92

*tp=time(NULL);

93

fprintf(outf,"Date %s",ctime(tp));

94

delete (tp); (tp) = NULL;

95

96

// Print command line

97

fprintf(outf,"Command ");

98

for (int i=0; i<par.argc; i++)

99

if (strlen(par.argv[i])<=par.maxdbstrlen)

100

fprintf(outf,"%s ",par.argv[i]);

101

else

102

fprintf(outf,"<%i characters> ",(int)strlen(par.argv[i]));

103

fprintf(outf,"\n\n");

104

105

#ifdef WINDOWS

106

if (par.trans)

107

fprintf(outf," No Hit Prob E-trans E-value Score SS Cols Query HMM Template HMM\n");

108

else

109

fprintf(outf," No Hit Prob E-value P-value Score SS Cols Query HMM Template HMM\n");

110

#else

111

if (par.trans)

112

fprintf(outf," No Hit Prob E-trans E-value Score SS Cols Query HMM Template HMM\n");

113

else

114

fprintf(outf," No Hit Prob E-value P-value Score SS Cols Query HMM Template HMM\n");

115

#endif

116

117

Reset();

118

while (!End()) // print hit list

119

{

120

hit = ReadNext();

121

if (nhits>=par.Z) break; //max number of lines reached?

122

if (nhits>=par.z && hit.Probab < par.p) break;

123

if (nhits>=par.z && hit.Eval > par.E) continue;

124

// if (hit.matched_cols <=1) continue; // adding this might get to intransparent... analogous statement in PrintAlignments

125

nhits++;

126

sprintf(str,"%3i %-30.30s ",nhits,hit.longname);

127

128

129

#ifdef WINDOWS

130

if (par.trans) // Transitive scoring

131

fprintf(outf,"%-34.34s %5.1f %8.2G %8.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.E1val,hit.Eval,hit.score,hit.score_ss,hit.matched_cols);

132

else // Normal scoring

133

fprintf(outf,"%-34.34s %5.1f %8.2G %8.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.Eval,hit.Pval,hit.score,hit.score_ss,hit.matched_cols);

134

#else

135

if (par.trans) // Transitive scoring

136

fprintf(outf,"%-34.34s %5.1f %7.2G %7.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.E1val,hit.Eval,hit.score,hit.score_ss,hit.matched_cols);

137

else // Normal scoring

138

fprintf(outf,"%-34.34s %5.1f %7.2G %7.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.Eval,hit.Pval,hit.score,hit.score_ss,hit.matched_cols);

139

#endif

140

141

sprintf(str,"%4i-%-4i ",hit.i1,hit.i2);

142

fprintf(outf,"%-10.10s",str);

143

sprintf(str,"%4i-%-4i",hit.j1,hit.j2);

144

fprintf(outf,"%-9.9s(%i)\n",str,hit.L);

145

} //end print hit list

146

fprintf(outf,"\n");

147

if (strcmp(outfile,"stdout")) fclose(outf);

148

}

149

150

151

152

//////////////////////////////////////////////////////////////////////////////

153

/**

154

* @brief Print alignments of query sequences against hit sequences

155

*/

156

int

157

HitList::PrintAlignments(

158

159

160

#ifdef CLUSTALO

161

char **ppcFirstProf, char **ppcSecndProf,

162

#endif

163

HMM& q, char* outfile, char outformat)

164

{

165

Hit hit;

166

FullAlignment qt_ali(par.nseqdis+10); // maximum 10 annotation (pseudo) sequences (ss_dssp, sa_dssp, ss_pred, ss_conf, consens,...)

167

int nhits=0;

168

169

#ifndef CLUSTALO_NOFILE

170

FILE* outf=NULL;

171

if (strcmp(outfile,"stdout"))

172

{

173

if (outformat==0)

174

outf=fopen(outfile,"a"); //append to summary hitlist

175

else

176

outf=fopen(outfile,"w"); //open for writing

177

if (!outf) OpenFileError(outfile);

178

}

179

else

180

outf = stdout;

181

#endif

182

183

Reset();

184

while (!End()) // print hit list

185

{

186

if (nhits>=par.B) break; //max number of lines reached?

187

hit = ReadNext();

188

if (nhits>=par.b && hit.Probab < par.p) break;

189

if (nhits>=par.b && hit.Eval > par.E) continue;

190

// // adding this might get to intransparent...

191

// // analogous statement in PrintHitlist and hhalign.C

192

// if (hit.matched_cols <=1) continue;

193

nhits++;

194

195

// Build double alignment of query against template sequences

196

int iBuildRet = qt_ali.Build(q,hit);

197

if (iBuildRet != OK){ /* FS, r241 -> r243 */

198

fprintf(stderr, "%s:%s:%d: qt_ali.Build failed\n",

199

__FUNCTION__, __FILE__, __LINE__);

200

return FAILURE;

201

}

202

203

#ifndef CLUSTALO

204

// Print out alignment

205

if (outformat==0) // HHR format

206

{

207

fprintf(outf,"No %-3i\n",nhits);

208

qt_ali.PrintHeader(outf,q,hit);

209

qt_ali.PrintHHR(outf,hit);

210

}

211

else if (outformat==1) // FASTA format

212

{

213

fprintf(outf,"# No %-3i\n",nhits);

214

qt_ali.PrintFASTA(outf,hit);

215

}

216

else if(outformat==2) // A2M format

217

{

218

fprintf(outf,"# No %-3i\n",nhits);

219

qt_ali.PrintA2M(outf,hit);

220

}

221

else // A3m format

222

{

223

fprintf(outf,"# No %-3i\n",nhits);

224

qt_ali.PrintA3M(outf,hit);

225

}

226

#else

227

qt_ali.OverWriteSeqs(ppcFirstProf, ppcSecndProf);

228

#endif

229

230

qt_ali.FreeMemory();

231

}

232

#ifndef CLUSTALO_NOFILE

233

if (strcmp(outfile,"stdout")) fclose(outf);

234

#endif

235

236

return OK;

237

238

} /* this is the end of HitList::PrintAlignments() */

239

240

241

242

243

244

////////////////////////////////////////////////////////////////////////////

245

/**

246

* @brief Return the ROC_5 score for optimization

247

* (changed 28.3.08 by Michael & Johannes)

248

*/

249

void

250

HitList::Optimize(HMM& q, char* buffer)

251

{

252

const int NFAM =5; // calculate ROC_5 score

253

const int NSFAM=5; // calculate ROC_5 score

254

int roc=0; // ROC score

255

int fam=0; // number of hits from same family (at current threshold)

256

int not_fam=0; // number of hits not from same family

257

int sfam=0; // number of hits from same suporfamily (at current threshold)

258

int not_sfam=0; // number of hits not from same superfamily

259

Hit hit;

260

261

SortList();

262

Reset();

263

while (!End())

264

{

265

hit = ReadNext();

266

if (!strcmp(hit.fam,q.fam)) fam++; // query and template from same superfamily? => positive

267

else if (not_fam<NFAM) // query and template from different family? => negative

268

{

269

not_fam++;

270

roc += fam;

271

}

272

if (!strcmp(hit.sfam,q.sfam)) sfam++; // query and template from same superfamily? => positive

273

else if (not_sfam<NSFAM) // query and template from different superfamily? => negative

274

{

275

not_sfam++;

276

roc += sfam;

277

}

278

// printf("qfam=%s tfam=%s qsfam=%s tsfam=%s fam=%-2i not_fam=%3i sfam=%-3i not_sfam=%-5i roc=%-3i\n",q.fam,hit.fam,q.sfam,hit.sfam,fam,not_fam,sfam,not_sfam,roc);

279

}

280

281

// Write ROC score to file or stdout

282

FILE* buf=NULL;

283

if (strcmp(par.buffer,"stdout"))

284

{

285

buf=fopen(buffer,"w");

286

if (!buf) OpenFileError(par.buffer);

287

}

288

else

289

buf = stdout;

290

291

fprintf(buf,"%f\n",float(roc)/float(fam*NFAM+sfam*NSFAM)); // must be between 0 and 1

292

if (v>=2) printf("ROC=%f\n",float(roc)/float(fam*NFAM+sfam*NSFAM));

293

fclose(buf);

294

}

295

296

297

298

//////////////////////////////////////////////////////////////////////////////

299

/**

300

* @brief Print score distribution into file score_dist

301

*/

302

void

303

HitList::PrintScoreFile(HMM& q)

304

{

305

int i=0, n;

306

FILE* scoref=NULL;

307

Hit hit;

308

Hash<int> twice(10000); // make sure only one hit per HMM is listed

309

twice.Null(-1);

310

311

if (strcmp(par.scorefile,"stdout"))

312

{

313

scoref=fopen(par.scorefile,"w");

314

if (!scoref)

315

{cerr<<endl<<"WARNING from "<<par.argv[0]<<": could not open \'"<<par.scorefile<<"\'\n"; return;}

316

}

317

else

318

scoref = stdout;

319

Reset();

320

fprintf(scoref,"NAME %s\n",q.longname);

321

fprintf(scoref,"FAM %s\n",q.fam);

322

fprintf(scoref,"FILE %s\n",q.file);

323

fprintf(scoref,"LENG %i\n",q.L);

324

fprintf(scoref,"\n");

325

//fprintf(scoref,"TARGET REL LEN COL LOG-PVA S-TOT MS NALI\n");

326

327

//For hhformat, the PROBAB field has to start at position 41 !!

328

// ----+----1----+----2----+----3----+----4----+----

329

fprintf(scoref,"TARGET FAMILY REL LEN COL LOG-PVA S-AASS PROBAB SCORE\n");

330

// d153l__ 5 185 185 287.82 464.22 100.00

331

// d1qsaa2 3 168 124 145.55 239.22 57.36

332

while (!End())

333

{

334

i++;

335

hit = ReadNext();

336

if (twice[hit.name]==1) continue; // better hit with same HMM has been listed already

337

twice.Add(hit.name,1);

338

//if template and query are from the same superfamily

339

if (!strcmp(hit.name,q.name)) n=5;

340

else if (!strcmp(hit.fam,q.fam)) n=4;

341

else if (!strcmp(hit.sfam,q.sfam)) n=3;

342

else if (!strcmp(hit.fold,q.fold)) n=2;

343

else if (!strcmp(hit.cl,q.cl)) n=1;

344

else n=0;

345

fprintf(scoref,"%-10s %-10s %1i %3i %3i %s %7.2f %6.2f %7.2f\n",hit.name,hit.fam,n,hit.L,hit.matched_cols,sprintg(-1.443*hit.logPval,7),-hit.score_aass,hit.Probab,hit.score);

346

}

347

fclose(scoref);

348

}

349

350

351

/**
 * @brief Logarithm of the HHblast composite P-value
 *
 * Evaluates -s*(1 - corr/2) + (1 - corr)*log(1 + s).
 *
 * @param[in] s     score
 * @param[in] corr  correlation parameter
 * @return natural logarithm of Pvalue_HHblast(s, corr)
 */
inline double
logPvalue_HHblast(double s, double corr)
{
    return -s*(1.0-0.5*corr) + (1.0-corr)*log(1.0+s);
    // Alternative forms kept from the original source:
    // return -s*(1.0-0.5*corr) + log( 1.0+(1.0-corr)*s );
    // return -s*(1.0-0.5*corr) + log( 1.0+(1.0-corr)*(1.0-0.5*corr)*s );
}

358

359

/**
 * @brief HHblast composite P-value
 *
 * Evaluates exp(-s*(1 - corr/2)) * (1 + s)^(1 - corr).
 *
 * @param[in] s     score
 * @param[in] corr  correlation parameter
 * @return P-value for score @p s
 */
inline double
Pvalue_HHblast(double s, double corr)
{
    return exp(-s*(1.0-0.5*corr)) * pow(1.0+s,1.0-corr);
    // Alternative forms kept from the original source:
    // return exp(-s*(1.0-0.5*corr)) * ( 1.0+(1.0-corr)*s );
    // return exp(-s*(1.0-0.5*corr)) * ( 1.0+(1.0-corr)*(1.0-0.5*corr)*s );
}

366

367

/**
 * @brief Log likelihood of a score under the HHblast composite model
 *
 * The arguments are clamped before evaluation:
 *  - s < 0:  s is set to 0 and corr is limited to [1E-5, 0.99999]
 *  - s >= 0: corr is limited to [0, 1]
 *
 * Then -s*(1 - corr/2) - corr*log(1 + s) + log(s*(1 - corr/2) + corr/2)
 * is returned.
 *
 * @param[in] s     score
 * @param[in] corr  correlation parameter
 * @return log likelihood for the clamped (s, corr)
 */
inline double
logLikelihood_HHblast(double s, double corr)
{
    if (s<0.0)
    {
        s=0.0;
        // keep corr strictly inside (0,1) so the log() argument below stays positive
        if (corr<1E-5) corr=1E-5;
        else if (corr>0.99999) corr=0.99999;
    }
    else
    {
        if (corr<0.0) corr=0.0;
        else if (corr>1.0) corr=1.0;
    }
    return -s*(1.0-0.5*corr) - corr*log(1.0+s) + log(s*(1.0-0.5*corr)+0.5*corr);
    // Alternative forms kept from the original source:
    // return -s*(1.0-0.5*corr) + log( s*(1.0-corr)*(1.0-0.5*corr)+0.5*corr );
    // return -s*(1.0-0.5*corr) + log((s*(1.0-corr)*(1.0-0.5*corr)+corr)*(1.0-0.5*corr));
}

376

377

/////////////////////////////////////////////////////////////////////////////////////

378

/**

379

* @brief Evaluate the *negative* log likelihood for the order statistic of the uniform distribution

380

* for the best 10% of hits (vertex v = (corr,offset) )

381

* The k'th order statistic for X~Uniform is p:=X^(k)~Beta(k,n-k+1) = const*p^(k-1)*(1-p)^(n-k)

382

* Needed to fit the correlation and score offset in HHblast

383

*/

384

double

385

HitList::RankOrderFitCorr(double* v)

386

{

387

double sum=0.0;

388

// printf("%8.2G %8.2G %i\n",v[0],v[1],Nprof);

389

int i1 = imin(Nprof,imax(50,int(0.05*Nprof)));

390

for (int i=0; i<i1; i++)

391

{

392

double p = Pvalue_HHblast(score[i]+v[1],v[0]);

393

// sum -= (1.0-double(i)/double(i1)) * weight[i] * ( double(i)*log(p) + (Nprof-i-1.0)*log(1.0-p) );

394

float diff = p-(float(i)+1.0)/(Nprof+1.0);

395

sum += (1.0-double(i)/double(i1)) * weight[i]*diff*diff*(Nprof+1.0)*(Nprof+1.0)*(Nprof+2.0)/(i+10.0)/(Nprof-i);

396

// printf("%-3i Pval=%7.5f Preal=%7.5f diff=%7.5f sum=%7.5f\n",i,p,float(i+1)/(1.0+Nprof),diff,sum);

397

}

398

return sum;

399

}

400

401

/**

402

* @brief Static wrapper-function for calling the nonstatic member function RankOrderFitCorr()

403

* ( see http://www.newty.de/fpt/callback.html#member )

404

*/

405

double

406

HitList::RankOrderFitCorr_static(void* pt2hitlist, double* v)

407

{

408

HitList* mySelf = (HitList*) pt2hitlist; // explicitly cast to a pointer to Hitlist

409

return mySelf->RankOrderFitCorr(v); // call member function

410

}

411

412

/////////////////////////////////////////////////////////////////////////////////////

413

/**

414

* @brief Evaluate the *negative* log likelihood of the data at the vertex v = (corr,offset)

415

* Needed to fit the correlation and score offset in HHblast

416

*/

417

double

418

HitList::LogLikelihoodCorr(double* v)

419

{

420

double sum=0.0;

421

// printf("%8.2G %8.2G %i\n",v[0],v[1],Nprof);

422

for (int i=0; i<Nprof; i++)

423

{

424

sum -= weight[i]*logLikelihood_HHblast(score[i]+v[1],v[0]);

425

// printf("%-3i Pval=%7.5f Preal=%7.5f diff=%7.5f rmsd=%7.5f sum=%7.5f\n",i,Pvalue_HHblast(score[i],v[0]),float(i)/(1.0+Nprof),x,sqrt(sum/sumw),sum);

426

}

427

return sum;

428

}

429

430

/**

431

* @brief Static wrapper-function for calling the nonstatic member function LogLikelihoodCorr()

432

* ( see http://www.newty.de/fpt/callback.html#member )

433

*/

434

double

435

HitList::LogLikelihoodCorr_static(void* pt2hitlist, double* v)

436

{

437

HitList* mySelf = (HitList*) pt2hitlist; // explicitly cast to a pointer to Hitlist

438

return mySelf->LogLikelihoodCorr(v); // call member function

439

}

440

441

/////////////////////////////////////////////////////////////////////////////////////

442

/**

443

* @brief Evaluate the *negative* log likelihood of the data at the vertex v = (lamda,mu)

444

* p(s) = lamda * exp{ -exp[-lamda*(s-mu)] - lamda*(s-mu) } = lamda * exp( -exp(-x) - x)

445

*/

446

double

447

HitList::LogLikelihoodEVD(double* v)

448

{

449

double sum=0.0, sumw=0.0;

450

for (int i=0; i<Nprof; i++)

451

{

452

double x = v[0]*(score[i]-v[1]);

453

sum += weight[i]*(exp(-x)+x);

454

sumw += weight[i];

455

}

456

return sum - sumw*log(v[0]);

457

}

458

459

/**

460

* @brief Static wrapper-function for calling the nonstatic member function LogLikelihoodEVD()

461

* ( see http://www.newty.de/fpt/callback.html#member )

462

*/

463

double

464

HitList::LogLikelihoodEVD_static(void* pt2hitlist, double* v)

465

{

466

HitList* mySelf = (HitList*) pt2hitlist; // explicitly cast to a pointer to Hitlist

467

return mySelf->LogLikelihoodEVD(v); // call member function

468

}

469

470

/////////////////////////////////////////////////////////////////////////////////////

471

/**

472

* @brief Subroutine to FindMin: try new point given by highest point ihigh and fac and replace ihigh if it is lower

473

*/

474

double

475

HitList::TryPoint(const int ndim, double* p, double* y, double* psum, int ihigh, double fac, double (*Func)(void* pt2hitlist, double* v))

476

{

477

// New point p_try = p_c + fac*(p_high-p_c),

478

// where p_c = ( sum_i (p_i) - p_high)/ndim is the center of ndim other points

479

// => p_try = fac1*sum_i(p_i) + fac2*p_high

480

double fac1=(1.-fac)/ndim;

481

double fac2=fac-fac1;

482

double ptry[ndim]; //new point to try out

483

double ytry; //function value of new point

484

int j; //index for the ndim parameters

485

486

for (j=0; j<ndim; j++)

487

ptry[j]=psum[j]*fac1+p[ihigh*ndim+j]*fac2;

488

ytry = (*Func)(this,ptry);

489

if (ytry<=y[ihigh])

490

{

491

// if (v>=4) printf("Trying: %-7.3f %-7.3f %-7.3f -> accept\n",ptry[0],ptry[1],ytry);

492

y[ihigh]=ytry;

493

for (j=0; j<ndim; j++)

494

{

495

psum[j] += ptry[j]-p[ihigh*ndim+j]; //update psum[j]

496

p[ihigh*ndim+j]=ptry[j]; //replace p[ihigh] with ptry

497

} //Note: ihigh is now not highest point anymore!

498

}

499

// else if (v>=4) printf("Trying: %-7.3f %-7.3f %-7.3f -> reject\n",ptry[0],ptry[1],ytry);

500

501

return ytry;

502

}

503

504

505

506

/////////////////////////////////////////////////////////////////////////////////////

507

/**

508

* @brief Find minimum with simplex method of Nelder and Mead (1965)

509

*/

510

float

511

HitList::FindMin(const int ndim, double* p, double* y, double tol, int& nfunc, double (*Func)(void* pt2hitlist, double* v))

512

{

513

const int MAXNFUNC=99; //maximum allowed number of function evaluations

514

int ihigh; //index of highest point on simplex

515

int inext; //index of second highest point on simplex

516

int ilow; //index of lowest point on simplex

517

int i; //index for the ndim+1 points

518

int j; //index for the ndim parameters

519

double rtol; //tolerance: difference of function value between highest and lowest point of simplex

520

double temp; //dummy

521

double ytry; //function value of trial point

522

double psum[ndim]; //psum[j] = j'th coordinate of sum vector (sum over all vertex vectors)

523

524

nfunc=0; //number of function evaluations =0

525

//Calculate sum vector psum[j]

526

for (j=0; j<ndim; j++)

527

{

528

psum[j]=p[j];

529

for (i=1; i<ndim+1; i++)

530

psum[j]+=p[i*ndim+j];

531

}

532

533

// Repeat finding better points in simplex until rtol<tol

534

while(1)

535

{

536

// Find indices for highest, next highest and lowest point

537

ilow=0;

538

if (y[0]>y[1]) {inext=1; ihigh=0;} else {inext=0; ihigh=1;}

539

for (i=0; i<ndim+1; i++)

540

{

541

if (y[i]<=y[ilow]) ilow=i;

542

if (y[i]>y[ihigh]) {inext=ihigh; ihigh=i;}

543

else if (y[i]>y[inext] && i!= ihigh) inext=i;

544

}

545

546

// If tolerance in y is smaller than tol swap lowest point to index 0 and break -> return

547

rtol = 2.*fabs(y[ihigh]-y[ilow]) / (fabs(y[ihigh])+fabs(y[ilow])+1E-10);

548

if (rtol<tol)

549

{

550

temp=y[ilow]; y[ilow]=y[0]; y[0]=temp;

551

for (j=0; j<ndim; j++)

552

{

553

temp=p[ilow*ndim+j]; p[ilow*ndim+j]=p[j]; p[j]=temp;

554

}

555

break;

556

}

557

558

// Max number of function evaluations exceeded?

559

if (nfunc>=MAXNFUNC )

560

{

561

if (v) fprintf(stderr,"\nWARNING: maximum likelihood fit of score distribution did not converge.\n");

562

return 1;

563

}

564

565

nfunc+=2;

566

// Point-reflect highest point on the center of gravity p_c of the other ndim points of the simplex

567

if (v>=3) printf("%3i %-7.3f %-7.3f %-12.8f %-9.3E\n",nfunc,p[ilow*ndim],p[ilow*ndim+1],y[ilow],rtol);

568

// if (v>=2) printf(" %3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]);

569

ytry = TryPoint(ndim,p,y,psum,ihigh,-1.0,Func); //reflect highest point on p_c

570

571

if (ytry<=y[ilow])

572

{

573

ytry = TryPoint(ndim,p,y,psum,ihigh,2.0,Func); //expand: try new point 2x further away from p_c

574

// if (v>=2) printf("Expanded: %3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]);

575

}

576

else if (ytry>=y[inext])

577

{

578

// The new point is worse than the second worst point

579

temp=y[ihigh];

580

ytry=TryPoint(ndim,p,y,psum,ihigh,0.5,Func); //contract simplex by 0.5 along (p_high-p_c

581

// if (v>=2) printf("Compressed:%3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]);

582

if (ytry>=temp)

583

{

584

// Trial point is larger than worst point => contract simplex by 0.5 towards lowest point

585

for (i=0; i<ndim+1; i++)

586

{

587

if (i!=ilow)

588

{

589

for (j=0; j<ndim; j++)

590

p[i*ndim+j]=0.5*(p[i*ndim+j]+p[ilow+j]);

591

y[i] = (*Func)(this,p+i*ndim);

592

// y[i] = (*Func)(p+i*ndim);

593

}

594

}

595

nfunc+=ndim;

596

// if (v>=2) printf("Contracted:%3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]);

597

598

//Calculate psum[j]

599

for (j=0; j<ndim; j++)

600

{

601

psum[j]=p[j];

602

for (i=1; i<ndim+1; i++)

603

psum[j]+=p[i*ndim+j];

604

}

605

}

606

}

607

else nfunc--;

608

}

609

return (float)rtol;

610

}

611

612

613

614

/////////////////////////////////////////////////////////////////////////////////////

615

/**

616

* @brief Do a maximum likelihod fit of the scores with an EV distribution with parameters lamda and mu

617

*/

618

void

619

HitList::MaxLikelihoodEVD(HMM& q, int nbest)

620

{

621

double tol=1E-6; // Maximum relative tolerance when minimizing -log(P)/N (~likelihood)

622

static char first_call=1;

623

static Hash<int> size_fam(MAXPROF/10); // Hash counts number of HMMs in family

624

static Hash<int> size_sfam(MAXPROF/10); // Hash counts number of families in superfamily

625

Hash<int> excluded(50); // Hash containing names of superfamilies to be excluded from fit

626

size_fam.Null(0); // Set int value to return when no data can be retrieved

627

size_sfam.Null(0); // Set int value to return when no data can be retrieved

628

excluded.Null(0); // Set int value to return when no data can be retrieved

629

Hit hit;

630

631

double mu; // EVD[mu,lam](x) = exp(-exp(-(x-mu)/lam)) = P(score<=x)

632

double vertex[2*3]; // three vertices of the simplex in lamda-mu plane

633

double yvertex[3]; // log likelihood values at the three vertices of the simplex

634

int nfunc=0; // number of function calls

635

double sum_weights=0.0;

636

float sum_scores=0.0;

637

float rtol;

638

639

if (first_call==1)

640

{

641

first_call=0;

642

// Count how many HMMs are in each family; set number of multiple hits per template nrep

643

if (v>=4) printf(" count number of profiles in each family and families in each superfamily ...\n");

644

Reset();

645

while (!End())

646

{

647

hit = ReadNext();

648

if (!size_fam.Contains(hit.fam)) (*size_sfam(hit.sfam))++; //Add one to hash element for superfamily

649

(*size_fam(hit.fam))++; //Add one to hash element for family

650

// printf("size(%s)=%i name=%s\n",hit.fam,*size_fam(hit.fam),hit.name)

651

}

652

fams=size_fam.Size();

653

sfams=size_sfam.Size();

654

if (v>=3)

655

printf("%-3i HMMs from %i families and %i superfamilies searched. Found %i hits\n",N_searched,fams,sfams,Size());

656

}

657

658

// Query has SCOP family identifier?

659

if (q.fam && q.fam[0]>='a' && q.fam[0]<='k' && q.fam[1]=='.')

660

{

661

char sfamid[NAMELEN];

662

char* ptr_in_fam=q.fam;

663

while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-')))

664

{

665

char* ptr=strrchr(sfamid,'.');

666

if (ptr) *ptr='\0';

667

excluded.Add(sfamid);

668

// fprintf(stderr,"Exclude SCOP superfamily %s ptr_in_fam='%s'\n",sfamid,ptr_in_fam);

669

}

670

}

671

// Exclude best superfamilies from fit

672

else if (nbest>0)

673

{

674

if (sfams<97+nbest) return;

675

676

// Find the nbest best-scoring superfamilies for exclusion from first ML fit

677

if (v>=4) printf(" find %i best-scoring superfamilies to exclude from first fit ...\n",nbest);

678

hit = Smallest();

679

excluded.Add(hit.sfam);

680

// printf("Exclude in first round: %s %8.2f %s\n",hit.name,hit.score_aass,hit.sfam);

681

while (excluded.Size()<nbest)

682

{

683

Reset();

684

while (!End() && excluded.Contains(ReadNext().sfam)) ;

685

hit=ReadCurrent();

686

while (!End())

687

{

688

if (ReadNext()<hit && !excluded.Contains(ReadCurrent().sfam))

689

hit=ReadCurrent();

690

}

691

excluded.Add(hit.sfam);

692

// printf("Exclude in first round: %s %8.2f %s %i %i\n",hit.name,hit.score_aass,hit.sfam,excluded.Size(),excluded.Contains(hit.sfam));

693

}

694

tol = 0.01/size_sfam.Size(); // tol=1/N would lead to delta(log-likelihood)~1 (where N ~ number of superfamilies) since (1+1/N)^N = e

695

}

696

else

697

{

698

// Find the best-scoring superfamilies from first fit for exclusion from second ML fit

699

if (v>=4) printf(" find best-scoring superfamilies to exclude from second fit ...\n");

700

Reset();

701

while (!End())

702

{

703

hit = ReadNext();

704

if (hit.Eval < 0.05) excluded.Add(hit.sfam); // changed from 0.5 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

705

}

706

tol = 0.001/size_sfam.Size(); // tol=1/N would lead to delta(log-likelihood)~1 (where N ~ number of superfamilies) since (1+1/N)^N = e

707

}

708

709

// Put scores into score[] and weights into weight[]

710

if (v>=3) printf(" generate scores and weights array for ML fitting ...\n");

711

Nprof=0;

712

Reset();

713

while (!End())

714

{

715

hit = ReadNext();

716

if (hit.irep > 1) continue; //Use only best hit per template

717

if (Nprof>=MAXPROF) break;

718

719

char sfamid[NAMELEN];

720

char* ptr_in_fam=hit.fam;

721

while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-')))

722

{

723

char* ptr=strrchr(sfamid,'.');

724

if (ptr) *ptr='\0';

725

if (excluded.Contains(sfamid)) break; //HMM is among superfamilies to be excluded

726

}

727

if (excluded.Contains(sfamid)) {

728

if (v>=3) fprintf(stderr,"Exclude hit %s (family %s contains %s)\n",hit.name,hit.fam,sfamid);

729

continue;

730

}

731

// ScopID(hit.cl,hit.fold,hit.sfam,hit.fam); //Get scop superfamily code for template

732

// if (*hit.sfam=='\0' || excluded.Contains(hit.sfam)) continue; // skip HMM

733

734

score[Nprof] = hit.score;

735

weight[Nprof]=1./size_fam[hit.fam]/size_sfam[hit.sfam];

736

sum_scores +=hit.score*weight[Nprof];

737

sum_weights+=weight[Nprof];

738

739

//DEBUG

740

// if (v>=4) printf("%-10.10s %-12.12s %-3i %-12.12s %-3i %6.4f %6.4f %7.1f\n",hit.name,hit.fam,size_fam[hit.fam],hit.sfam,size_sfam[hit.sfam],1./size_fam[hit.fam]/size_sfam[hit.sfam],sum,hit.score);

741

Nprof++;

742

}

743

//DEBUG

744

if (v>=3)

745

printf("%i hits used for score distribution\n",Nprof);

746

// for (int i=0; i<Nprof; i++) printf("%3i score=%8.3f weight=%7.5f\n",i,score[i],weight[i]);

747

748

// Set simplex vertices and function values

749

mu = sum_scores/sum_weights - 0.584/LAMDA;

750

if (par.loc) // fit only in local mode; in global mode use fixed value LAMDA and mu mean score

751

{

752

double (*Func)(void*, double*);

753

Func = HitList::LogLikelihoodEVD_static;

754

755

if (nbest>0) {vertex[0]=LAMDA; vertex[1]=mu;} /////////////////////////////////////////// DEBUG

756

else {vertex[0]=q.lamda; vertex[1]=mu;}

757

vertex[2]=vertex[0]+0.1; vertex[3]=vertex[1];

758

vertex[4]=vertex[0]; vertex[5]=vertex[1]+0.2;

759

yvertex[0]=Func(this,vertex );

760

yvertex[1]=Func(this,vertex+2);

761

yvertex[2]=Func(this,vertex+4);

762

763

// Find lam and mu that minimize negative log likelihood of data

764

if (v>=3) printf("Fitting to EVD by maximum likelihood...\niter lamda mu -log(P)/N tol\n");

765

rtol = FindMin(2,vertex,yvertex,tol,nfunc,Func);

766

if (v>=3) printf("%3i %-7.3f %-7.2f %-7.3f %-7.1E\n\n",nfunc,vertex[0],vertex[1],yvertex[0]-(1.5772-log(vertex[0])),rtol);

767

// printf("HHsearch lamda=%-6.3f mu=%-6.3f\n",vertex[0],vertex[1]);

768

}

769

else

770

{

771

vertex[0]=LAMDA_GLOB; vertex[1]=mu;

772

}

773

774

// Set lamda and mu of profile

775

q.lamda = vertex[0];

776

q.mu = vertex[1];

777

778

// Set P-values and E-values

779

// CHECK UPDATE FROM score=-logpval to score=-logpval+SSSCORE2NATLOG*score_ss !!!!

780

Reset();

781

while (!End())

782

{

783

hit = ReadNext();

784

785

// Calculate total score in raw score units: P-value = 1- exp(-exp(-lamda*(Saa-mu)))

786

hit.weight=1./size_fam[hit.fam]/size_sfam[hit.sfam]; // needed for transitive scoring

787

hit.logPval = logPvalue(hit.score,vertex);

788

hit.Pval=Pvalue(hit.score,vertex);

789

hit.Eval=exp(hit.logPval+log(N_searched));

790

// hit.score_aass = hit.logPval/0.45-3.0 - hit.score_ss; // median(lamda)~0.45, median(mu)~4.0 in EVDs for scop20.1.63 HMMs

791

hit.score_aass = -q.lamda*(hit.score-q.mu)/0.45-3.0 - fmin(hit.score_ss,fmax(0.0,0.5*hit.score-5.0)); // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs

792

hit.Probab = Probab(hit);

793

if (nbest>0 && par.loc) // correct length correction (not needed for second round of fitting, since lamda very similar)

794

if (par.idummy==0) ////////////////////////////////////////////

795

hit.score += log(q.L*hit.L)*(1/LAMDA-1/vertex[0]);

796

hit.score_sort = hit.score_aass;

797

Overwrite(hit); // copy hit object into current position of hitlist

798

799

if (nbest==0 && par.trans==1) // if in transitive scoring mode (weights file given)

800

TransitiveScoring();

801

else if (nbest==0 && par.trans==2) // if in transitive scoring mode (weights file given)

802

TransitiveScoring2();

803

else if (nbest==0 && par.trans==3) // if in transitive scoring mode (weights file given)

804

TransitiveScoring3();

805

else if (nbest==0 && par.trans==4) // if in transitive scoring mode (weights file given)

806

TransitiveScoring4();

807

}

808

}

809

810

811

/////////////////////////////////////////////////////////////////////////////////////

812

/**

813

* @brief Calculate correlation and score offset for HHblast composite E-values

814

*/

815

void

816

HitList::CalculateHHblastCorrelation(HMM& q)

817

{

818

int nfunc=0; // number of function calls

819

double tol; // Maximum relative tolerance when minimizing -log(P)/N (~likelihood)

820

double vertex[2*3]; // three vertices of the simplex in lamda-mu plane

821

double yvertex[3]; // log likelihood values at the three vertices of the simplex

822

Hit hit;

823

Hash<int> excluded(50); // Hash containing names of superfamilies to be excluded from fit

824

excluded.Null(0); // Set int value to return when no data can be retrieved

825

826

// Set sum of HHsearch and PSI-BLAST score for calculating correlation

827

Reset();

828

while (!End())

829

{

830

hit = ReadNext();

831

hit.score_sort = hit.logPval + blast_logPvals->Show(hit.name); // if template not in hash, return log Pval = 0, i.e. Pvalue = 1!

832

Overwrite(hit); // copy hit object into current position of hitlist

833

}

834

835

// Query has SCOP family identifier?

836

if (q.fam && q.fam[0]>='a' && q.fam[0]<='k' && q.fam[1]=='.')

837

{

838

char sfamid[NAMELEN];

839

char* ptr_in_fam=q.fam;

840

while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-')))

841

{

842

char* ptr=strrchr(sfamid,'.');

843

if (ptr) *ptr='\0';

844

excluded.Add(sfamid);

845

fprintf(stderr,"Exclude SCOP superfamily %s ptr_in_fam='%s'\n",sfamid,ptr_in_fam);

846

}

847

}

848

849

// Resort list by sum of log P-values

850

ResortList(); // use InsertSort to resort list according to sum of minus-log-Pvalues

851

Nprof=0;

852

Reset();

853

ReadNext(); // skip best hit

854

while (!End())

855

{

856

hit = ReadNext();

857

if (hit.irep>=2) continue; // use only best alignments

858

// if (hit.Eval<0.005) {if (v>=3) printf("Fitting HHblast correlation coefficient: skipping %s with Evalue=%9.1g\n",hit.name,hit.Eval); continue;}

859

if (Nprof>=MAXPROF) break;

860

861

char sfamid[NAMELEN];

862

char* ptr_in_fam=hit.fam;

863

while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-')))

864

{

865

char* ptr=strrchr(sfamid,'.');

866

if (ptr) *ptr='\0';

867

if (excluded.Contains(sfamid)) break; //HMM is among superfamilies to be excluded

868

}

869

if (excluded.Contains(sfamid)) {

870

if (v>=1) fprintf(stderr,"Exclude hit %s (family %s contains %s)\n",hit.name,hit.fam,sfamid);

871

continue;

872

}

873

score[Nprof] = -hit.score_sort;

874

weight[Nprof] = 1.0; // = hit.weight;

875

// printf("%3i %-12.12s %7.3f + %7.3f = %7.3f \n",Nprof,hit.name,hit.logPval,blast_logPvals->Show(hit.name),-hit.score_sort); //////////////////////

876

printf("%3i %7.3f %7.3f\n",Nprof,hit.Pval,exp(blast_logPvals->Show(hit.name))); //////////////////////

877

Nprof++;

878

}

879

880

// Fit correlation

881

vertex[0]=0.5; vertex[1]=0.2;

882

vertex[2]=vertex[0]+0.2; vertex[3]=vertex[1];

883

vertex[4]=vertex[0]; vertex[5]=vertex[1]+0.2;

884

885

yvertex[0]=RankOrderFitCorr(vertex );

886

yvertex[1]=RankOrderFitCorr(vertex+2);

887

yvertex[2]=RankOrderFitCorr(vertex+4);

888

// yvertex[0]=LogLikelihoodCorr(vertex );

889

// yvertex[1]=LogLikelihoodCorr(vertex+2);

890

// yvertex[2]=LogLikelihoodCorr(vertex+4);

891

tol = 1e-6;

892

v=3;//////////////////////////////////

893

// Find correlation and offset that minimize mean square deviation of reported composite Pvalues from actual

894

if (v>=2) printf("Fitting correlation coefficient for HHblast...\niter corr offset logLikelihood tol\n");

895

float rtol = FindMin(2,vertex,yvertex,tol,nfunc, HitList::RankOrderFitCorr_static);

896

if (v>=2) printf("%3i %-7.3f %-7.2f %-7.3f %-7.1E\n\n",nfunc,vertex[0],vertex[1],yvertex[0],rtol);

897

if (vertex[0]<0) vertex[0]=0.0;

898

899

// Print correlation and offset for profile

900

printf("HHblast correlation=%-6.3f score offset=%-6.3f\n",vertex[0],vertex[1]);

901

v=2;//////////////////////////////////

902

}

903

904

905

/**
 * @brief Correlation coefficient used for HHblast composite P-values
 *
 * Currently a fixed value; both arguments are ignored. The caller in this
 * file passes the Neff_HMM values of the query and of the hit.
 *
 * @param[in] Nq unused
 * @param[in] Nt unused
 * @return constant 0.5
 */
inline double
corr_HHblast(float Nq, float Nt)
{
  (void)Nq;
  (void)Nt;
  const double correlation = 0.5;
  return correlation;
}

913

914

/**
 * @brief Score offset used for HHblast composite P-values
 *
 * Currently a fixed value; both arguments are ignored. The caller in this
 * file passes the Neff_HMM values of the query and of the hit.
 *
 * @param[in] Nq unused
 * @param[in] Nt unused
 * @return constant 0.0
 */
inline double
offset_HHblast(float Nq, float Nt)
{
  (void)Nq;
  (void)Nt;
  const double score_offset = 0.0;
  return score_offset;
}

922

923

//////////////////////////////////////////////////////////////////////////////

924

/**

925

* @brief Calculate HHblast composite E-values

926

*/

927

void

928

HitList::CalculateHHblastEvalues(HMM& q)

929

{

930

Hit hit;

931

float corr, offset; // correlation coefficient and offset for calculating composite HHblast P-values

932

933

Reset();

934

while (!End())

935

{

936

hit = ReadNext();

937

corr = corr_HHblast(q.Neff_HMM,hit.Neff_HMM);

938

offset = offset_HHblast(q.Neff_HMM,hit.Neff_HMM);

939

hit.score_sort = hit.logPval + blast_logPvals->Show(hit.name);

940

hit.logPval = logPvalue_HHblast(-hit.score_sort+offset,corr); // overwrite logPval from HHsearch with composite logPval from HHblast

941

hit.Pval = Pvalue_HHblast(-hit.score_sort+offset,corr); // overwrite P-value from HHsearch with composite P-value from HHblast

942

hit.Eval = exp(hit.logPval+log(N_searched)); // overwrite E-value from HHsearch with composite E-value from HHblast

943

hit.Probab = Probab(hit);

944

Overwrite(hit); // copy hit object into current position of hitlist

945

}

946

ResortList(); // use InsertSort to resort list according to sum of minus-log-Pvalues

947

}

948

949

950

//////////////////////////////////////////////////////////////////////////////

951

/**

952

* @brief Read file generated by blastpgp (default output) and store P-values in hash

953

*/

954

void

955

HitList::ReadBlastFile(HMM& q)

956

{

957

char line[LINELEN]=""; // input line

958

int Ndb; // number of sequences in database

959

int Ldb=0; // size of database in number of amino acids

960

char* templ;

961

int i;

962

if (!blast_logPvals) { blast_logPvals = new(Hash<float>); blast_logPvals->New(16381,0); }

963

964

FILE* blaf = NULL;

965

if (!strcmp(par.blafile,"stdin")) blaf=stdin;

966

else

967

{

968

blaf = fopen(par.blafile,"rb");

969

if (!blaf) OpenFileError(par.blafile);

970

}

971

972

// Read number of sequences and size of database

973

while (fgetline(line,LINELEN-1,blaf) && !strstr(line,"sequences;"));

974

if (!strstr(line,"sequences;")) FormatError(par.blafile,"No 'Database:' string found.");

975

char* ptr=line;

976

Ndb = strint(ptr);

977

if (Ndb==INT_MIN) FormatError(par.blafile,"No integer for number of sequences in database found.");

978

while ((i=strint(ptr))>INT_MIN) Ldb = 1000*Ldb + i;

979

if (Ldb==0) FormatError(par.blafile,"No integer for size of database found.");

980

printf("\nNumber of sequences in database = %i Size of database = %i\n",Ndb,Ldb);

981

982

// Read all E-values and sequence lengths

983

while (fgetline(line,LINELEN-1,blaf))

984

{

985

if (line[0]=='>')

986

{

987

// Read template name

988

templ = new(char[255]);

989

ptr = line+1;

990

strwrd(templ,ptr);

991

if (!blast_logPvals->Contains(templ)) // store logPval only for best HSP with template

992

{

993

// Read length

994

while (fgetline(line,LINELEN-1,blaf) && !strstr(line,"Length ="));

995

ptr = line+18;

996

int length = strint(ptr);

997

// Read E-value

998

fgetline(line,LINELEN-1,blaf);

999

fgetline(line,LINELEN-1,blaf);

1000

float EvalDB; // E-value[seq-db] = Evalue for comparison Query vs. database, from PSI-BLAST

1001

float EvalQT; // E-value[seq-seq] = Evalue for comparison Query vs. template (seq-seq)

1002

double logPval;

1003

ptr = strstr(line+20,"Expect =");

1004

if (!ptr) FormatError(par.blafile,"No 'Expect =' string found.");

1005

if (sscanf(ptr+8,"%g",&EvalDB)<1)

1006

{

1007

ptr[7]='1';

1008

if (sscanf(ptr+7,"%g",&EvalDB)<1)

1009

FormatError(par.blafile,"No Evalue found after 'Expect ='.");

1010

}

1011

// Calculate P-value[seq-seq] = 1 - exp(-E-value[seq-seq]) = 1 - exp(-Lt/Ldb*E-value[seq-db])

1012

EvalQT = length/double(Ldb)*double(EvalDB);

1013

if (EvalQT>1E-3) logPval = log(1.0-exp(-EvalQT)); else logPval=log(double(EvalQT)+1.0E-99);

1014

blast_logPvals->Add(templ,logPval);

1015

printf("template=%-10.10s length=%-3i EvalDB=%8.2g EvalQT=%8.2g P-value=%8.2g log Pval=%8.2g\n",templ,length,EvalDB,EvalQT,exp(logPval),logPval);

1016

}

1017

else {

1018

delete[] templ; templ = NULL;

1019

}

1020

}

1021

}

1022

fclose(blaf);

1023

}

1024

1025

1026

/////////////////////////////////////////////////////////////////////////////////////

1027

/**
 * @brief Calculate output of hidden neural network units
 *
 * @param[in] weights pointer to the four input weights of this unit, in the
 *                    order Lqnorm, Ltnorm, Nqnorm, Ntnorm
 * @param[in] bias    pointer to the bias of this unit
 * @param[in] Lqnorm  first input value
 * @param[in] Ltnorm  second input value
 * @param[in] Nqnorm  third input value
 * @param[in] Ntnorm  fourth input value
 * @return logistic function of (weighted sum of the inputs + bias)
 */
inline float
calc_hidden_output(const float* weights, const float* bias, float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm)
{
  // Activation of hidden unit = sum of all inputs * weights + bias
  const float activation = Lqnorm*weights[0] + Ltnorm*weights[1] + Nqnorm*weights[2] + Ntnorm*weights[3] + *bias;

  // Squash with the logistic function
  const float squashed = 1.0 / (1.0 + exp(-(activation)));
  return squashed;
}

1039

1040

////////////////////////////////////////////////////////////////////////////////////

1041

/**

1042

* @brief Neural network regressions of lamda for EVD

1043

*/

1044

inline float

1045

lamda_NN(float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm)

1046

{

1047

const int inputs = 4;

1048

const int hidden = 4;

1049

const float biases[] = {-0.73195, -1.43792, -1.18839, -3.01141}; // bias for all hidden units

1050

const float weights[] = { // Weights for the neural networks (column = start unit, row = end unit)

1051

-0.52356, -3.37650, 1.12984, -0.46796,

1052

-4.71361, 0.14166, 1.66807, 0.16383,

1053

-0.94895, -1.24358, -1.20293, 0.95434,

1054

-0.00318, 0.53022, -0.04914, -0.77046,

1055

2.45630, 3.02905, 2.53803, 2.64379

1056

};

1057

float lamda=0.0;

1058

for (int h = 0; h<hidden; h++) {

1059

lamda += calc_hidden_output( weights+inputs*h, biases+h, Lqnorm,Ltnorm,Nqnorm,Ntnorm ) * weights[hidden*inputs+h];

1060

}

1061

return lamda;

1062

}

1063

1064

////////////////////////////////////////////////////////////////////////////////////

1065

/**

1066

* @brief Neural network regressions of mu for EVD

1067

*/

1068

inline float

1069

mu_NN(float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm)

1070

{

1071

const int inputs = 4;

1072

const int hidden = 6;

1073

const float biases[] = {-4.25264, -3.63484, -5.86653, -4.78472, -2.76356, -2.21580}; // bias for all hidden units

1074

const float weights[] = { // Weights for the neural networks (column = start unit, row = end unit)

1075

1.96172, 1.07181, -7.41256, 0.26471,

1076

0.84643, 1.46777, -1.04800, -0.51425,

1077

1.42697, 1.99927, 0.64647, 0.27834,

1078

1.34216, 1.64064, 0.35538, -8.08311,

1079

2.30046, 1.31700, -0.46435, -0.46803,

1080

0.90090, -3.53067, 0.59212, 1.47503,

1081

-1.26036, 1.52812, 1.58413, -1.90409, 0.92803, -0.66871

1082

};

1083

float mu=0.0;

1084

for (int h = 0; h<hidden; h++) {

1085

mu += calc_hidden_output( weights+inputs*h, biases+h, Lqnorm,Ltnorm,Nqnorm,Ntnorm ) * weights[hidden*inputs+h];

1086

}

1087

return 20.0*mu;

1088

}

1089

1090

//////////////////////////////////////////////////////////////////////////////

1091

/**

1092

* @brief Calculate Pvalues as a function of query and template lengths and diversities

1093

*/

1094

void

1095

HitList::CalculatePvalues(HMM& q)

1096

{

1097

Hit hit;

1098

float lamda=0.4, mu=3.0;

1099

const float log1000=log(1000.0);

1100

1101

if (par.idummy!=2)

1102

{

1103

printf("WARNING: idummy should have been ==2 (no length correction)\n");

1104

exit(4);

1105

}

1106

1107

if(N_searched==0) N_searched=1;

1108

if (v>=2)

1109

printf("Calculate Pvalues as a function of query and template lengths and diversities...\n");

1110

Reset();

1111

while (!End())

1112

{

1113

hit = ReadNext();

1114

1115

if (par.loc)

1116

{

1117

lamda = lamda_NN( log(q.L)/log1000, log(hit.L)/log1000, q.Neff_HMM/10.0, hit.Neff_HMM/10.0 );

1118

mu = mu_NN( log(q.L)/log1000, log(hit.L)/log1000, q.Neff_HMM/10.0, hit.Neff_HMM/10.0 );

1119

// if (v>=3 && nhits++<20)

1120

// printf("hit=%-10.10s Lq=%-4i Lt=%-4i Nq=%5.2f Nt=%5.2f => lamda=%-6.3f mu=%-6.3f\n",hit.name,q.L,hit.L,q.Neff_HMM,hit.Neff_HMM,lamda,mu);

1121

}

1122

else

1123

{

1124

printf("WARNING: global calibration not yet implemented!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");

1125

}

1126

hit.logPval = logPvalue(hit.score,lamda,mu);

1127

hit.Pval = Pvalue(hit.score,lamda,mu);

1128

hit.Eval=exp(hit.logPval+log(N_searched));

1129

// hit.score_aass = hit.logPval/LAMDA-3.0 - hit.score_ss; // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs

1130

// P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))

1131

hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-hit.Pval)) )/0.45 - fmin(lamda*hit.score_ss,fmax(0.0,0.2*(hit.score-8.0)))/0.45 - 3.0;

1132

hit.score_sort = hit.score_aass;

1133

hit.Probab = Probab(hit);

1134

Overwrite(hit);

1135

}

1136

SortList();

1137

Reset();

1138

return;

1139

}

1140

1141

//////////////////////////////////////////////////////////////////////////////

1142

/**

1143

* @brief Calculate Pvalues from calibration of 0: query HMM, 1:template HMMs, 2: both

1144

*/

1145

void

1146

HitList::GetPvalsFromCalibration(HMM& q)

1147

{

1148

Hit hit;

1149

char warn=0;

1150

if(N_searched==0) N_searched=1;

1151

if (v>=2)

1152

{

1153

switch (par.calm)

1154

{

1155

case 0:

1156

printf("Using lamda=%-5.3f and mu=%-5.2f from calibrated query HMM %s. \n",q.lamda,q.mu,q.name);

1157

printf("Note that HMMs need to be recalibrated when changing HMM-HMM alignment options.\n");

1158

break;

1159

case 1:

1160

printf("Using score distribution parameters lamda and mu from database HMMs \n");

1161

break;

1162

case 2:

1163

printf("Combining score distribution parameters lamda and mu from query and database HMMs\n");

1164

printf("Note that HMMs need to be recalibrated when changing HMM-HMM alignment options.\n");

1165

break;

1166

}

1167

}

1168

Reset();

1169

while (!End())

1170

{

1171

hit = ReadNext();

1172

if (par.calm==0 || (hit.logPvalt==0) )

1173

{

1174

hit.logPval = logPvalue(hit.score,q.lamda,q.mu);

1175

hit.Pval = Pvalue(hit.score,q.lamda,q.mu);

1176

if (par.calm>0 && warn++<1 && v>=1)

1177

printf("Warning: some template HMM (e.g. %s) are not calibrated. Using query calibration.\n",hit.name);

1178

}

1179

else if (par.calm==1)

1180

{

1181

hit.logPval = hit.logPvalt;

1182

hit.Pval = hit.Pvalt;

1183

}

1184

else if (par.calm==2)

1185

{

1186

hit.logPval = 0.5*( logPvalue(hit.score,q.lamda,q.mu) + hit.logPvalt);

1187

hit.Pval = sqrt( Pvalue(hit.score,q.lamda,q.mu) * hit.Pvalt);

1188

if (v>=5) printf("Score: %7.1f lamda: %7.1f mu: %7.1f P-values: query-calibrated: %8.2G template-calibrated: %8.2G geometric mean: %8.2G\n",hit.score,q.lamda,q.mu,Pvalue(hit.score,q.lamda,q.mu),hit.Pvalt,hit.Pval);

1189

}

1190

1191

hit.Eval=exp(hit.logPval+log(N_searched));

1192

// hit.score_aass = hit.logPval/LAMDA-3.0 - hit.score_ss; // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs

1193

// P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))

1194

hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-hit.Pval)) ) / 0.45-3.0 - fmin(hit.score_ss,fmax(0.0,0.5*hit.score-5.0));

1195

hit.score_sort = hit.score_aass;

1196

hit.Probab = Probab(hit);

1197

Overwrite(hit);

1198

}

1199

SortList();

1200

Reset();

1201

return;

1202

}

1203

1204

1205

1206

1207

1208

1209

1210

1211

1212

//////////////////////////////////////////////////////////////////////////////

1213

//////////////////////////////////////////////////////////////////////////////

1214

//////////////////////////////////////////////////////////////////////////////

1215

// Transitive scoring

1216

//////////////////////////////////////////////////////////////////////////////

1217

//////////////////////////////////////////////////////////////////////////////

1218

//////////////////////////////////////////////////////////////////////////////

1219

1220

1221

1222

1223

1224

1225

1226

/////////////////////////////////////////////////////////////////////////////////////

1227

/**

1228

* @brief Calculate P-values and Probabilities from transitive scoring over whole database

1229

*/

1230

void

1231

HitList::TransitiveScoring()

1232

{

1233

void PrintMatrix(float** V, int N);

1234

void PrintMatrix(double** V, int N);

1235

1236

float** Z; // matrix of intra-db Z-scores Z_kl

1237

float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm)

1238

char** fold; // fold name of HMM k

1239

char** fam; // family of HMM k

1240

float* Prob; // probability of HMM k

1241

float* Zq; // Zq[k] = Z-score between query and database HMM k

1242

float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q

1243

float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k

1244

float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively

1245

int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans

1246

int N; // dimension of weight matrix is NxN

1247

int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively)

1248

int k,l,m,n; // indices for database HMMs

1249

char name[NAMELEN];

1250

Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N}

1251

index.Null(-1); // Set int value to return when no data can be retrieved

1252

Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit

1253

excluded.Null(0); // Set int value to return when no data can be retrieved

1254

Hit hit;

1255

size_t unused; /* disable fread gcc warning */

1256

1257

// Read weights matrix W with index hash and names array

1258

fprintf(stderr,"Reading in weights file\n");

1259

FILE* wfile = fopen(par.wfile,"rb");

1260

if (v>=1 && wfile==NULL)

1261

{

1262

fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched);

1263

perror("fopen");

1264

fprintf(stderr,"Skipping caclulation of transitive P-values\n");

1265

par.trans=0;

1266

return;

1267

}

1268

unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database)

1269

if (v>=1 && N!=N_searched)

1270

{

1271

fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched);

1272

fprintf(stderr,"Skipping caclulation of transitive P-values\n");

1273

par.trans=0;

1274

return;

1275

}

1276

if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N);

1277

// Read names of HMMs (to specify mapping of HMM to matrix indices)

1278

for (k=0; k<N; k++)

1279

{

1280

unused = fread(name,sizeof(char),IDLEN,wfile);

1281

index.Add(name,k);

1282

}

1283

// Read symmetric Z-scores matrix

1284

Z = new(float*[N]);

1285

for (k=0; k<N; k++)

1286

{

1287

Z[k] = new(float[N]);

1288

for (l=0; l<k; l++) Z[k][l] = Z[l][k];

1289

unused = fread(Z[k]+k,sizeof(float),N-k,wfile);

1290

}

1291

// Read symmetric covariance matrix

1292

C = new(float*[N]);

1293

for (k=0; k<N; k++)

1294

{

1295

C[k] = new(float[N]);

1296

for (l=0; l<k; l++) C[k][l] = C[l][k];

1297

unused = fread(C[k]+k,sizeof(float),N-k,wfile);

1298

}

1299

fclose(wfile);

1300

1301

// Allocate memory

1302

Zq = new(float[N]);

1303

Ztq = new(float[N]);

1304

Zrq = new(float[N]);

1305

fold = new(char*[N]);

1306

fam = new(char*[N]);

1307

Prob = new(float[N]);

1308

ll = new(int[N]);

1309

w = new(float[N]);

1310

1311

// Transform P-values to normally distributed Z-scores and store in Zq vector

1312

fprintf(stderr,"Transform P-values to Z-scores\n");

1313

float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL

1314

float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans

1315

printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans);

1316

1317

Reset();

1318

while (!End())

1319

{

1320

hit = ReadNext();

1321

if (hit.irep>1) continue;

1322

k = index.Show(hit.name);

1323

if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);}

1324

if (hit.logPvalt<0)

1325

Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk)

1326

else

1327

Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk

1328

// printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]);

1329

// if (isnan(Zq[k])) {

1330

// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");

1331

// printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]);

1332

// par.trans=0;

1333

// return;

1334

// }

1335

if (Zq[k]>Zmax_neg) excluded.Add(hit.fold);

1336

fold[k] = new(char[IDLEN]);

1337

fam[k] = new(char[IDLEN]);

1338

strcpy(fold[k],hit.fold);

1339

strcpy(fam[k],hit.fam);

1340

weight[k] = hit.weight;

1341

Prob[k] = hit.Probab;

1342

}

1343

1344

if (v>=3)

1345

{

1346

excluded.Reset();

1347

while (!excluded.End())

1348

{

1349

excluded.ReadNext(name);

1350

printf("Excluded fold %s from fitting to Ztq\n",name);

1351

}

1352

}

1353

1354

1355

////////////////////////////////////////////////////////////////

1356

// Calculate transitive score (query->l) Zt[l]

1357

1358

// Construct vector ll of indices l for which Z_lq > Zmin_trans

1359

m = 0;

1360

for (l=0; l<N; l++)

1361

if (Zq[l]>=Zmin_trans) ll[m++]=l;

1362

M = m; // number of indices l for which Z_lq,Z_lk > Zmin_trans

1363

1364

// for (m=0; m<M; m++)

1365

// fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]);

1366

1367

if (M<=1)

1368

for (k=0; k<N; k++) Ztq[k]=0.0;

1369

else

1370

{

1371

// Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans

1372

double** Csub = new(double*[M]);

1373

double** Cinv = new(double*[M]);

1374

for (m=0; m<M; m++)

1375

{

1376

Csub[m] = new(double[M]);

1377

Cinv[m] = new(double[M]);

1378

for (n=0; n<M; n++)

1379

Csub[m][n] = double(C[ll[m]][ll[n]]);

1380

}

1381

1382

if (v>=3)

1383

{

1384

fprintf(stderr,"Covariance matrix\n");

1385

PrintMatrix(Csub,M);

1386

}

1387

1388

// Invert Csub

1389

fprintf(stderr,"Calculate inverse of covariance submatrix\n");

1390

InvertMatrix(Cinv,Csub,M);

1391

1392

if (v>=3)

1393

{

1394

fprintf(stderr,"Inverse covariance matrix\n");

1395

PrintMatrix(Cinv,M);

1396

}

1397

1398

// Calculate weights w[l]

1399

for (m=0; m<M; m++)

1400

{

1401

double sum = 0.0;

1402

for (n=0; n<M; n++)

1403

sum += 1.0 * Cinv[m][n];

1404

w[m] = fmax(sum,0.0);

1405

}

1406

for (l=0; l<M; l++){

1407

delete[](Cinv[l]); (Cinv[l]) = NULL;

1408

}

1409

delete[](Cinv); (Cinv) = NULL;

1410

1411

// Calculate Ztq[k] for all HMMs k

1412

fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n");

1413

float norm = NormalizationFactor(Csub,w,M);

1414

for (k=0; k<N; k++)

1415

{

1416

double sumZ = 0.0;

1417

for (m=0; m<M; m++)

1418

sumZ += w[m] * Z[ll[m]][k];

1419

Ztq[k] = sumZ/norm;

1420

}

1421

1422

for (l=0; l<M; l++){

1423

delete[](Csub[l]); (Csub[l]) = NULL;

1424

}

1425

delete[](Csub); (Csub) = NULL;

1426

}

1427

1428

////////////////////////////////////////////////////////////////

1429

// Calculate reverse transitive score (l->query-) Zrq[l]

1430

1431

fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n");

1432

for (k=0; k<N; k++)

1433

{

1434

// Construct vector ll of indices l for which Z_lk > Zmin_tran

1435

m = 0;

1436

for (l=0; l<N; l++)

1437

if (Z[l][k]+Z[k][l]>=2*Zmin_trans) ll[m++]=l;

1438

int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran

1439

1440

1441

// fprintf(stderr,"\nfam[k]: %s\n",fam[k]);

1442

// for (m=0; m<M; m++)

1443

// printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]);

1444

1445

if (M<=1)

1446

{

1447

Zrq[k] = Zq[k];

1448

}

1449

else

1450

{

1451

// Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans

1452

double** Csub = new(double*[M]);

1453

for (m=0; m<M; m++)

1454

{

1455

Csub[m] = new(double[M]);

1456

for (n=0; n<M; n++)

1457

Csub[m][n] = double(C[ll[m]][ll[n]]);

1458

}

1459

// fprintf(stderr,"Covariance matrix\n");

1460

// PrintMatrix(Csub,M);

1461

1462

if (M==2)

1463

{

1464

for (m=0; m<M; m++) w[m] = 1.0/M;

1465

}

1466

else

1467

{

1468

1469

double** Cinv = new(double*[M]);

1470

for (m=0; m<M; m++) Cinv[m] = new(double[M]);

1471

1472

// Invert Csub

1473

InvertMatrix(Cinv,Csub,M);

1474

1475

// fprintf(stderr,"Inverse covariance matrix\n");

1476

// PrintMatrix(Cinv,M);

1477

1478

// Calculate weights w[l]

1479

for (m=0; m<M; m++)

1480

{

1481

double sum = 0.0;

1482

for (n=0; n<M; n++)

1483

sum += 1.0 * Cinv[m][n];

1484

w[m] = fmax(sum,0.0);

1485

}

1486

1487

// for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]);

1488

1489

for (l=0; l<M; l++){

1490

delete[](Cinv[l]); (Cinv[l]) = NULL;

1491

}

1492

delete[](Cinv); (Cinv) = NULL;

1493

}

1494

1495

// Calculate Zrq[k] and normalize

1496

float norm = NormalizationFactor(Csub,w,M);

1497

double sumZ = 0.0;

1498

for (m=0; m<M; m++)

1499

sumZ += w[m] * Zq[ll[m]];

1500

Zrq[k] = sumZ/norm;

1501

1502

for (l=0; l<M; l++){

1503

delete[](Csub[l]); (Csub[l]) = NULL;

1504

}

1505

delete[](Csub); (Csub) = NULL;

1506

}

1507

1508

// fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]);

1509

}

1510

1511

// Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score

1512

for (k=0; k<N; k++)

1513

{

1514

float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]);

1515

// if (isnan(Zqtot))

1516

// {

1517

// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");

1518

// printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);

1519

// par.trans=0;

1520

// return;

1521

// }

1522

if (v>=2 && Zq[k] + Zqtot > 2*Zmin_trans) {

1523

printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);

1524

}

1525

Ztq[k] = Zqtot;

1526

}

1527

1528

// Calculate mean and standard deviation of Z1q

1529

fprintf(stderr,"Calculate mean and standard deviation of Ztq\n");

1530

double sumw=0.0;

1531

double sumZ=0.0;

1532

double sumZ2=0.0;

1533

for (k=0; k<N; k++)

1534

{

1535

if (excluded.Contains(fold[k])) continue;

1536

sumw += weight[k];

1537

sumZ += weight[k]*Ztq[k];

1538

sumZ2 += weight[k]*Ztq[k]*Ztq[k];

1539

// if (isnan(sumZ))

1540

// {

1541

// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");

1542

// printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]);

1543

// par.trans=0;

1544

// return;

1545

// }

1546

}

1547

float mu = sumZ/sumw;

1548

float sigma = sqrt(sumZ2/sumw-mu*mu);

1549

if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma);

1550

sigma *= 1.01;// correct different fitting of EVD and normal variables

1551

1552

// Normalize Ztq and calculate P1-values

1553

fprintf(stderr,"Normalize Ztq and calculate P1-values\n");

1554

Reset();

1555

while (!End())

1556

{

1557

hit = ReadNext();

1558

hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma);

1559

hit.E1val = N_searched*(hit.logPval<-100.0? 0.0 : exp(hit.logPval));

1560

// P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))

1561

hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss;

1562

hit.Probab = Probab(hit);

1563

hit.score_sort = hit.logPval;

1564

Overwrite(hit); // copy hit object into current position of hitlist

1565

}

1566

1567

for (k=0; k<N; k++){

1568

delete[](Z[k]); (Z[k]) = NULL;

1569

}

1570

for (k=0; k<N; k++){

1571

delete[](C[k]); (C[k]) = NULL;

1572

}

1573

for (k=0; k<N; k++){

1574

delete[](fold[k]); (fold[k]) = NULL;

1575

}

1576

for (k=0; k<N; k++){

1577

delete[](fam[k]); (fam[k]) = NULL;

1578

}

1579

delete[](C); (C) = NULL;

1580

delete[](Z); (Z) = NULL;

1581

delete[](fold); (fold) = NULL;

1582

delete[](fam); (fam) = NULL;

1583

delete[](Prob); (Prob) = NULL;

1584

delete[](ll); (ll) = NULL;

1585

delete[](Zq); (Zq) = NULL;

1586

delete[](Ztq); (Ztq) = NULL;

1587

}

1588

1589

1590

1591

//////////////////////////////////////////////////////////////////////////////

1592

/**

1593

* @brief Calculate P-values and Probabilities from transitive scoring over whole database

1594

*/

1595

void

1596

HitList::TransitiveScoring2()

1597

{

1598

void PrintMatrix(float** V, int N);

1599

void PrintMatrix(double** V, int N);

1600

1601

float** Z; // matrix of intra-db Z-scores Z_kl

1602

float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm)

1603

char** fold; // fold name of HMM k

1604

char** fam; // family of HMM k

1605

float* Prob; // probability of HMM k

1606

float* Zq; // Zq[k] = Z-score between query and database HMM k

1607

float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q

1608

float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k

1609

float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively

1610

int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans

1611

int N; // dimension of weight matrix is NxN

1612

int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively)

1613

int k,l,m,n; // indices for database HMMs

1614

char name[NAMELEN];

1615

Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N}

1616

index.Null(-1); // Set int value to return when no data can be retrieved

1617

Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit

1618

excluded.Null(0); // Set int value to return when no data can be retrieved

1619

Hit hit;

1620

size_t unused; /* disable fread gcc warning */

1621

1622

// Read weights matrix W with index hash and names array

1623

fprintf(stderr,"Reading in weights file\n");

1624

FILE* wfile = fopen(par.wfile,"rb");

1625

if (v>=1 && wfile==NULL)

1626

{

1627

fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched);

1628

perror("fopen");

1629

fprintf(stderr,"Skipping caclulation of transitive P-values\n");

1630

par.trans=0;

1631

return;

1632

}

1633

unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database)

1634

if (v>=1 && N!=N_searched)

1635

{

1636

fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched);

1637

fprintf(stderr,"Skipping caclulation of transitive P-values\n");

1638

par.trans=0;

1639

return;

1640

}

1641

if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N);

1642

// Read names of HMMs (to specify mapping of HMM to matrix indices)

1643

for (k=0; k<N; k++)

1644

{

1645

unused = fread(name,sizeof(char),IDLEN,wfile);

1646

index.Add(name,k);

1647

}

1648

// Read symmetric Z-scores matrix

1649

Z = new(float*[N]);

1650

for (k=0; k<N; k++)

1651

{

1652

Z[k] = new(float[N]);

1653

for (l=0; l<k; l++) Z[k][l] = Z[l][k];

1654

unused = fread(Z[k]+k,sizeof(float),N-k,wfile);

1655

}

1656

// Read symmetric covariance matrix

1657

C = new(float*[N]);

1658

for (k=0; k<N; k++)

1659

{

1660

C[k] = new(float[N]);

1661

for (l=0; l<k; l++) C[k][l] = C[l][k];

1662

unused = fread(C[k]+k,sizeof(float),N-k,wfile);

1663

}

1664

fclose(wfile);

1665

1666

// Allocate memory

1667

Zq = new(float[N]);

1668

Ztq = new(float[N]);

1669

Zrq = new(float[N]);

1670

fold = new(char*[N]);

1671

fam = new(char*[N]);

1672

Prob = new(float[N]);

1673

ll = new(int[N]);

1674

w = new(float[N]);

1675

1676

// Transform P-values to normally distributed Z-scores and store in Zq vector

1677

fprintf(stderr,"Transform P-values to Z-scores\n");

1678

float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL

1679

float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans

1680

printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans);

1681

1682

Reset();

1683

while (!End())

1684

{

1685

hit = ReadNext();

1686

if (hit.irep>1) continue;

1687

k = index.Show(hit.name);

1688

if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);}

1689

if (hit.logPvalt<0)

1690

Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk)

1691

else

1692

Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk

1693

// printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]);

1694

// if (isnan(Zq[k]))

1695

// {

1696

// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");

1697

// printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]);

1698

// par.trans=0;

1699

// return;

1700

// }

1701

if (Zq[k]>Zmax_neg) excluded.Add(hit.fold);

1702

fold[k] = new(char[IDLEN]);

1703

fam[k] = new(char[IDLEN]);

1704

strcpy(fold[k],hit.fold);

1705

strcpy(fam[k],hit.fam);

1706

weight[k] = hit.weight;

1707

Prob[k] = hit.Probab;

1708

}

1709

1710

if (v>=3)

1711

{

1712

excluded.Reset();

1713

while (!excluded.End())

1714

{

1715

excluded.ReadNext(name);

1716

printf("Excluded fold %s from fitting to Ztq\n",name);

1717

}

1718

}

1719

1720

1721

////////////////////////////////////////////////////////////////

1722

// Calculate transitive score (query->l) Zt[l]

1723

1724

// Construct vector ll of indices l for which Z_lq > Zmin_trans

1725

m = 0;

1726

for (l=0; l<N; l++)

1727

if (Zq[l]>=Zmin_trans) ll[m++]=l;

1728

M = m; // number of indices l for which Z_lq,Z_lk > Zmin_trans

1729

1730

// for (m=0; m<M; m++)

1731

// fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]);

1732

1733

if (M<=1)

1734

for (k=0; k<N; k++) Ztq[k]=0.0;

1735

else

1736

{

1737

// Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans

1738

double** Csub = new(double*[M]);

1739

double** Cinv = new(double*[M]);

1740

for (m=0; m<M; m++)

1741

{

1742

Csub[m] = new(double[M]);

1743

Cinv[m] = new(double[M]);

1744

for (n=0; n<M; n++)

1745

Csub[m][n] = double(C[ll[m]][ll[n]]);

1746

}

1747

1748

if (v>=3)

1749

{

1750

fprintf(stderr,"Covariance matrix\n");

1751

PrintMatrix(Csub,M);

1752

}

1753

1754

// // Invert Csub

1755

// fprintf(stderr,"Calculate inverse of covariance submatrix\n");

1756

// InvertMatrix(Cinv,Csub,M);

1757

1758

// if (v>=3)

1759

// {

1760

// fprintf(stderr,"Inverse covariance matrix\n");

1761

// PrintMatrix(Cinv,M);

1762

// }

1763

1764

1765

// Calculate weights w[l]

1766

for (m=0; m<M; m++)

1767

{

1768

double sum = 0.0;

1769

for (n=0; n<M; n++)

1770

sum += 1.0 * Csub[m][n];

1771

printf("w[%4i] = %-8.5f\n",ll[m],1.0/sum);

1772

w[m] = (sum>0? Zq[ll[m]] / sum : 0.0);

1773

}

1774

for (l=0; l<M; l++){

1775

delete[](Cinv[l]); (Cinv[l]) = NULL;

1776

}

1777

delete[](Cinv); (Cinv) = NULL;

1778

1779

// Calculate Ztq[k] for all HMMs k

1780

fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n");

1781

float norm = NormalizationFactor(Csub,w,M);

1782

for (k=0; k<N; k++)

1783

{

1784

double sumZ = 0.0;

1785

for (m=0; m<M; m++)

1786

sumZ += w[m] * Z[ll[m]][k];

1787

Ztq[k] = sumZ/norm;

1788

}

1789

1790

for (l=0; l<M; l++){

1791

delete[](Csub[l]); (Csub[l]) = NULL;

1792

}

1793

delete[](Csub); (Csub) = NULL;

1794

}

1795

1796

////////////////////////////////////////////////////////////////

1797

// Calculate reverse transitive score (l->query-) Zrq[l]

1798

1799

fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n");

1800

for (k=0; k<N; k++)

1801

{

1802

// Construct vector ll of indices l for which Z_lk > Zmin_tran

1803

m = 0;

1804

for (l=0; l<N; l++)

1805

if (Z[l][k]+Z[k][l]>=2*Zmin_trans) ll[m++]=l;

1806

int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran

1807

1808

1809

// fprintf(stderr,"\nfam[k]: %s\n",fam[k]);

1810

// for (m=0; m<M; m++)

1811

// printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]);

1812

1813

if (M<=1)

1814

{

1815

Zrq[k] = Zq[k];

1816

}

1817

else

1818

{

1819

// Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans

1820

double** Csub = new(double*[M]);

1821

for (m=0; m<M; m++)

1822

{

1823

Csub[m] = new(double[M]);

1824

for (n=0; n<M; n++)

1825

Csub[m][n] = double(C[ll[m]][ll[n]]);

1826

}

1827

// fprintf(stderr,"Covariance matrix\n");

1828

// PrintMatrix(Csub,M);

1829

1830

if (M<=2)

1831

{

1832

for (m=0; m<M; m++) w[m] = 1.0/M;

1833

}

1834

else

1835

{

1836

1837

double** Cinv = new(double*[M]);

1838

for (m=0; m<M; m++) Cinv[m] = new(double[M]);

1839

1840

// // Invert Csub

1841

// InvertMatrix(Cinv,Csub,M);

1842

1843

// // fprintf(stderr,"Inverse covariance matrix\n");

1844

// // PrintMatrix(Cinv,M);

1845

1846

// Calculate weights w[l]

1847

for (m=0; m<M; m++)

1848

{

1849

double sum = 0.0;

1850

for (n=0; n<M; n++)

1851

sum += 1.0 * Csub[m][n];

1852

w[m] = (sum>0? Z[ll[m]][k] / sum : 0.0);

1853

}

1854

1855

// for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]);

1856

1857

for (l=0; l<M; l++){

1858

delete[](Cinv[l]); (Cinv[l]) = NULL;

1859

}

1860

delete[](Cinv); (Cinv) = NULL;

1861

}

1862

1863

// Calculate Zrq[k] and normalize

1864

float norm = NormalizationFactor(Csub,w,M);

1865

double sumZ = 0.0;

1866

for (m=0; m<M; m++)

1867

sumZ += w[m] * Zq[ll[m]];

1868

Zrq[k] = sumZ/norm;

1869

1870

for (l=0; l<M; l++){

1871

delete[](Csub[l]); (Csub[l]) = NULL;

1872

}

1873

delete[](Csub); (Csub) = NULL;

1874

}

1875

1876

// fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]);

1877

}

1878

1879

// Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score

1880

for (k=0; k<N; k++)

1881

{

1882

float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]);

1883

// if (isnan(Zqtot))

1884

// {

1885

// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");

1886

// printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);

1887

// par.trans=0;

1888

// return;

1889

// }

1890

if (v>=2 && Zq[k] + Zqtot > 2*Zmin_trans) {

1891

printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);

1892

}

1893

Ztq[k] = Zqtot;

1894

}

1895

1896

// Calculate mean and standard deviation of Z1q

1897

fprintf(stderr,"Calculate mean and standard deviation of Ztq\n");

1898

double sumw=0.0;

1899

double sumZ=0.0;

1900

double sumZ2=0.0;

1901

for (k=0; k<N; k++)

1902

{

1903

if (excluded.Contains(fold[k])) continue;

1904

sumw += weight[k];

1905

sumZ += weight[k]*Ztq[k];

1906

sumZ2 += weight[k]*Ztq[k]*Ztq[k];

1907

// if (isnan(sumZ))

1908

// {

1909

// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");

1910

// printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]);

1911

// par.trans=0;

1912

// return;

1913

// }

1914

}

1915

float mu = sumZ/sumw;

1916

float sigma = sqrt(sumZ2/sumw-mu*mu);

1917

if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma);

1918

sigma *= 1.01;// correct different fitting of EVD and normal variables

1919

1920

// Normalize Ztq and calculate P1-values

1921

fprintf(stderr,"Normalize Ztq and calculate P1-values\n");

1922

Reset();

1923

while (!End())

1924

{

1925

hit = ReadNext();

1926

hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma);

1927

hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval));

1928

// P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))

1929

hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss;

1930

hit.Probab = Probab(hit);

1931

hit.score_sort = hit.logPval;

1932

Overwrite(hit); // copy hit object into current position of hitlist

1933

}

1934

1935

for (k=0; k<N; k++){

1936

delete[](Z[k]); (Z[k]) = NULL;

1937

}

1938

for (k=0; k<N; k++){

1939

delete[](C[k]); (C[k]) = NULL;

1940

}

1941

for (k=0; k<N; k++){

1942

delete[](fold[k]); (fold[k]) = NULL;

1943

}

1944

for (k=0; k<N; k++){

1945

delete[](fam[k]); (fam[k]) = NULL;

1946

}

1947

delete[](C); (C) = NULL;

1948

delete[](Z); (Z) = NULL;

1949

delete[](fold); (fold) = NULL;

1950

delete[](fam); (fam) = NULL;

1951

delete[](Prob); (Prob) = NULL;

1952

delete[](ll); (ll) = NULL;

1953

delete[](Zq); (Zq) = NULL;

1954

delete[](Ztq); (Ztq) = NULL;

1955

}

1956

1957

1958

/////////////////////////////////////////////////////////////////////////////////////

1959

/**

1960

* @brief Calculate P-values and Probabilities from transitive scoring over whole database

1961

* Like TransitiveScoring(),

1962

* but in transitive scoring, Z1_qk = sum_l w_l*Z_lk, use all l:E_ql<=E_qk

1963

* and in reverse scoring, Z1_kr = sum_l w_l*Z_lq, use all l:E_kl<=E_kq

1964

*/

1965

void

1966

HitList::TransitiveScoring3()

1967

{

1968

void PrintMatrix(float** V, int N);

1969

void PrintMatrix(double** V, int N);

1970

1971

float** Z; // matrix of intra-db Z-scores Z_kl

1972

float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm)

1973

char** fold; // fold name of HMM k

1974

char** fam; // family of HMM k

1975

float* Prob; // probability of HMM k

1976

float* Zq; // Zq[k] = Z-score between query and database HMM k

1977

float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q

1978

float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k

1979

float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively

1980

int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans

1981

int N; // dimension of weight matrix is NxN

1982

int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively)

1983

int k,l,m,n; // indices for database HMMs

1984

char name[NAMELEN];

1985

Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N}

1986

index.Null(-1); // Set int value to return when no data can be retrieved

1987

Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit

1988

excluded.Null(0); // Set int value to return when no data can be retrieved

1989

Hit hit;

1990

size_t unused; /* disable fread gcc warning */

1991

1992

// Read weights matrix W with index hash and names array

1993

fprintf(stderr,"Reading in weights file\n");

1994

FILE* wfile = fopen(par.wfile,"rb");

1995

if (v>=1 && wfile==NULL)

1996

{

1997

fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched);

1998

perror("fopen");

1999

fprintf(stderr,"Skipping caclulation of transitive P-values\n");

2000

par.trans=0;

2001

return;

2002

}

2003

unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database)

2004

if (v>=1 && N!=N_searched)

2005

{

2006

fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched);

2007

fprintf(stderr,"Skipping caclulation of transitive P-values\n");

2008

par.trans=0;

2009

return;

2010

}

2011

if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N);

2012

// Read names of HMMs (to specify mapping of HMM to matrix indices)

2013

for (k=0; k<N; k++)

2014

{

2015

unused = fread(name,sizeof(char),IDLEN,wfile);

2016

index.Add(name,k);

2017

}

2018

// Read symmetric Z-scores matrix

2019

Z = new(float*[N]);

2020

for (k=0; k<N; k++)

2021

{

2022

Z[k] = new(float[N]);

2023

for (l=0; l<k; l++) Z[k][l] = Z[l][k];

2024

unused = fread(Z[k]+k,sizeof(float),N-k,wfile);

2025

}

2026

// Read symmetric covariance matrix

2027

C = new(float*[N]);

2028

for (k=0; k<N; k++)

2029

{

2030

C[k] = new(float[N]);

2031

for (l=0; l<k; l++) C[k][l] = C[l][k];

2032

unused = fread(C[k]+k,sizeof(float),N-k,wfile);

2033

}

2034

fclose(wfile);

2035

2036

// Allocate memory

2037

Zq = new(float[N]);

2038

Ztq = new(float[N]);

2039

Zrq = new(float[N]);

2040

fold = new(char*[N]);

2041

fam = new(char*[N]);

2042

Prob = new(float[N]);

2043

ll = new(int[N]);

2044

w = new(float[N]);

2045

2046

// Transform P-values to normally distributed Z-scores and store in Zq vector

2047

fprintf(stderr,"Transform P-values to Z-scores\n");

2048

float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL

2049

float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans

2050

printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans);

2051

2052

Reset();

2053

while (!End())

2054

{

2055

hit = ReadNext();

2056

if (hit.irep>1) continue;

2057

k = index.Show(hit.name);

2058

if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);}

2059

if (hit.logPvalt<0)

2060

Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk)

2061

else

2062

Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk

2063

// printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]);

2064

// if (isnan(Zq[k]))

2065

// {

2066

// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");

2067

// printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]);

2068

// par.trans=0;

2069

// return;

2070

// }

2071

if (Zq[k]>Zmax_neg) excluded.Add(hit.fold);

2072

fold[k] = new(char[IDLEN]);

2073

fam[k] = new(char[IDLEN]);

2074

strcpy(fold[k],hit.fold);

2075

strcpy(fam[k],hit.fam);

2076

weight[k] = hit.weight;

2077

Prob[k] = hit.Probab;

2078

}

2079

2080

if (v>=3)

2081

{

2082

excluded.Reset();

2083

while (!excluded.End())

2084

{

2085

excluded.ReadNext(name);

2086

printf("Excluded fold %s from fitting to Ztq\n",name);

2087

}

2088

}

2089

2090

2091

////////////////////////////////////////////////////////////////

2092

// Calculate transitive score (query->l) Ztq[l]

2093

2094

fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n");

2095

for (k=0; k<N; k++)

2096

{

2097

// Construct vector ll of indices l for which Z_lq OR Z_lk >= max(Z_kq,Zmin_trans)

2098

float Zmink = fmax(Zq[k],Zmin_trans);

2099

for (m=l=0; l<N; l++)

2100

if (Zq[l]>=Zmink) ll[m++]=l;

2101

M = m; // number of indices l for which Z_lq OR Z_lk >= max(Z_kq,Zmin_trans)

2102

2103

// for (m=0; m<M; m++)

2104

// fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]);

2105

2106

if (M<=1)

2107

{

2108

Ztq[k]=Zq[k];

2109

}

2110

else

2111

{

2112

// Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans

2113

double** Csub = new(double*[M]);

2114

double** Cinv = new(double*[M]);

2115

for (m=0; m<M; m++)

2116

{

2117

Csub[m] = new(double[M]);

2118

Cinv[m] = new(double[M]);

2119

for (n=0; n<M; n++)

2120

Csub[m][n] = double(C[ll[m]][ll[n]]);

2121

}

2122

2123

// fprintf(stderr,"Covariance matrix\n");

2124

// PrintMatrix(Csub,M);

2125

2126

// Invert Csub

2127

// fprintf(stderr,"Calculate inverse of covariance submatrix\n");

2128

InvertMatrix(Cinv,Csub,M);

2129

2130

// fprintf(stderr,"Inverse covariance matrix\n");

2131

// PrintMatrix(Cinv,M);

2132

2133

// Calculate weights w[l]

2134

for (m=0; m<M; m++)

2135

{

2136

double sum = 0.0;

2137

for (n=0; n<M; n++)

2138

sum += 1.0 * Cinv[m][n]; // signal ~ sum_l w_l*Z_lq !

2139

w[m] = fmax(sum,0.0);

2140

}

2141

for (l=0; l<M; l++){

2142

delete[](Cinv[l]); (Cinv[l]) = NULL;

2143

}

2144

delete[](Cinv); (Cinv) = NULL;

2145

2146

// Calculate Ztq[k]

2147

float norm = NormalizationFactor(Csub,w,M);

2148

double sumZ = 0.0;

2149

for (m=0; m<M; m++)

2150

sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]);

2151

// sumZ += w[m] * Z[ll[m]][k];

2152

Ztq[k] = sumZ/norm;

2153

2154

for (l=0; l<M; l++){

2155

delete[](Csub[l]); (Csub[l]) = NULL;

2156

}

2157

delete[](Csub); (Csub) = NULL;

2158

}

2159

}

2160

2161

////////////////////////////////////////////////////////////////

2162

// Calculate reverse transitive score (l->query-) Zrq[l]

2163

2164

fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n");

2165

for (k=0; k<N; k++)

2166

{

2167

// Construct vector ll of indices l for which Z_lk > Zmin_tran

2168

float Zmink = fmax(Zq[k],Zmin_trans);

2169

for (m=l=0; l<N; l++)

2170

if (Z[l][k]>=Zmink) ll[m++]=l;

2171

int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran

2172

2173

2174

// fprintf(stderr,"\nfam[k]: %s\n",fam[k]);

2175

// for (m=0; m<M; m++)

2176

// printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]);

2177

2178

if (M<=1)

2179

{

2180

Zrq[k] = Zq[k];

2181

}

2182

else

2183

{

2184

// Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans

2185

double** Csub = new(double*[M]);

2186

for (m=0; m<M; m++)

2187

{

2188

Csub[m] = new(double[M]);

2189

for (n=0; n<M; n++)

2190

Csub[m][n] = double(C[ll[m]][ll[n]]);

2191

}

2192

// fprintf(stderr,"Covariance matrix\n");

2193

// PrintMatrix(Csub,M);

2194

2195

if (M==2)

2196

{

2197

for (m=0; m<M; m++) w[m] = 1.0/M;

2198

}

2199

else

2200

{

2201

2202

double** Cinv = new(double*[M]);

2203

for (m=0; m<M; m++) Cinv[m] = new(double[M]);

2204

2205

// Invert Csub

2206

InvertMatrix(Cinv,Csub,M);

2207

2208

// fprintf(stderr,"Inverse covariance matrix\n");

2209

// PrintMatrix(Cinv,M);

2210

2211

// Calculate weights w[l]

2212

for (m=0; m<M; m++)

2213

{

2214

double sum = 0.0;

2215

for (n=0; n<M; n++)

2216

sum += 1.0 * Cinv[m][n]; // signal ~ sum_l w_l*Z_lq !

2217

w[m] = fmax(sum,0.0);

2218

}

2219

// for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]);

2220

for (l=0; l<M; l++){

2221

delete[](Cinv[l]); (Cinv[l]) = NULL;

2222

}

2223

delete[](Cinv); (Cinv) = NULL;

2224

}

2225

2226

// Calculate Zrq[k] and normalize

2227

float norm = NormalizationFactor(Csub,w,M);

2228

double sumZ = 0.0;

2229

for (m=0; m<M; m++)

2230

sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]);

2231

// sumZ += w[m] * Zq[ll[m]];

2232

Zrq[k] = sumZ/norm;

2233

2234

for (l=0; l<M; l++){

2235

delete[](Csub[l]); (Csub[l]) = NULL;

2236

}

2237

delete[](Csub); (Csub) = NULL;

2238

}

2239

2240

// fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]);

2241

}

2242

2243

// Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score

2244

for (k=0; k<N; k++)

2245

{

2246

2247

float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]);

2248

// if (isnan(Zqtot))

2249

// {

2250

// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");

2251

// printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);

2252

// par.trans=0;

2253

// return;

2254

// }

2255

if (v>=3 && Zqtot > 2*Zmin_trans) {

2256

printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);

2257

}

2258

Ztq[k] = Zqtot;

2259

}

2260

2261

// Calculate mean and standard deviation of Z1q

2262

fprintf(stderr,"Calculate mean and standard deviation of Ztq\n");

2263

double sumw=0.0;

2264

double sumZ=0.0;

2265

double sumZ2=0.0;

2266

for (k=0; k<N; k++)

2267

{

2268

if (excluded.Contains(fold[k])) continue;

2269

sumw += weight[k];

2270

sumZ += weight[k]*Ztq[k];

2271

sumZ2 += weight[k]*Ztq[k]*Ztq[k];

2272

// if (isnan(sumZ))

2273

// {

2274

// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");

2275

// printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]);

2276

// par.trans=0;

2277

// return;

2278

// }

2279

}

2280

float mu = sumZ/sumw;

2281

float sigma = sqrt(sumZ2/sumw-mu*mu);

2282

if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma);

2283

sigma *= 1.01;// correct different fitting of EVD and normal variables

2284

2285

// Normalize Ztq and calculate P1-values

2286

fprintf(stderr,"Normalize Ztq and calculate P1-values\n");

2287

Reset();

2288

while (!End())

2289

{

2290

hit = ReadNext();

2291

hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma);

2292

hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval));

2293

// P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))

2294

hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss;

2295

hit.Probab = Probab(hit);

2296

hit.score_sort = hit.logPval;

2297

Overwrite(hit); // copy hit object into current position of hitlist

2298

}

2299

2300

for (k=0; k<N; k++){

2301

delete[](Z[k]); (Z[k]) = NULL;

2302

}

2303

for (k=0; k<N; k++){

2304

delete[](C[k]); (C[k]) = NULL;

2305

}

2306

for (k=0; k<N; k++){

2307

delete[](fold[k]); (fold[k]) = NULL;

2308

}

2309

for (k=0; k<N; k++){

2310

delete[](fam[k]); (fam[k]) = NULL;

2311

}

2312

delete[](C); (C) = NULL;

2313

delete[](Z); (Z) = NULL;

2314

delete[](fold); (fold) = NULL;

2315

delete[](fam); (fam) = NULL;

2316

delete[](Prob); (Prob) = NULL;

2317

delete[](ll); (ll) = NULL;

2318

delete[](Zq); (Zq) = NULL;

2319

delete[](Ztq); (Ztq) = NULL;

2320

2321

}

2322

2323

2324

/////////////////////////////////////////////////////////////////////////////////////

2325

/**

2326

* @brief Calculate P-values and Probabilities from transitive scoring over whole database

2327

* Best tested scheme. Use fmin(Zq[ll[m]],Z[ll[m]][k])

2328

* and fast approximation for weights (not inverse covariance matrix)

2329

*/

2330

void

2331

HitList::TransitiveScoring4()

2332

{

2333

void PrintMatrix(float** V, int N);

2334

void PrintMatrix(double** V, int N);

2335

2336

float** Z; // matrix of intra-db Z-scores Z_kl

2337

float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm)

2338

char** fold; // fold name of HMM k

2339

char** fam; // family of HMM k

2340

float* Prob; // probability of HMM k

2341

float* Zq; // Zq[k] = Z-score between query and database HMM k

2342

float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q

2343

float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k

2344

float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively

2345

int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans

2346

int N; // dimension of weight matrix is NxN

2347

int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively)

2348

int k,l,m,n; // indices for database HMMs

2349

char name[NAMELEN];

2350

Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N}

2351

index.Null(-1); // Set int value to return when no data can be retrieved

2352

Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit

2353

excluded.Null(0); // Set int value to return when no data can be retrieved

2354

Hit hit;

2355

size_t unused; /* disable fread gcc warning */

2356

2357

// Read weights matrix W with index hash and names array

2358

fprintf(stderr,"Reading in weights file\n");

2359

FILE* wfile = fopen(par.wfile,"rb");

2360

if (v>=1 && wfile==NULL)

2361

{

2362

fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched);

2363

perror("fopen");

2364

fprintf(stderr,"Skipping caclulation of transitive P-values\n");

2365

par.trans=0;

2366

return;

2367

}

2368

unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database)

2369

if (v>=1 && N!=N_searched)

2370

{

2371

fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched);

2372

fprintf(stderr,"Skipping caclulation of transitive P-values\n");

2373

par.trans=0;

2374

return;

2375

}

2376

if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N);

2377

// Read names of HMMs (to specify mapping of HMM to matrix indices)

2378

for (k=0; k<N; k++)

2379

{

2380

unused = fread(name,sizeof(char),IDLEN,wfile);

2381

index.Add(name,k);

2382

}

2383

// Read symmetric Z-scores matrix

2384

Z = new(float*[N]);

2385

for (k=0; k<N; k++)

2386

{

2387

Z[k] = new(float[N]);

2388

for (l=0; l<k; l++) Z[k][l] = Z[l][k];

2389

unused = fread(Z[k]+k,sizeof(float),N-k,wfile);

2390

}

2391

// Read symmetric covariance matrix

2392

C = new(float*[N]);

2393

for (k=0; k<N; k++)

2394

{

2395

C[k] = new(float[N]);

2396

for (l=0; l<k; l++) C[k][l] = C[l][k];

2397

unused = fread(C[k]+k,sizeof(float),N-k,wfile);

2398

}

2399

fclose(wfile);

2400

2401

// Allocate memory

2402

Zq = new(float[N]);

2403

Ztq = new(float[N]);

2404

Zrq = new(float[N]);

2405

fold = new(char*[N]);

2406

fam = new(char*[N]);

2407

Prob = new(float[N]);

2408

ll = new(int[N]);

2409

w = new(float[N]);

2410

2411

// Transform P-values to normally distributed Z-scores and store in Zq vector

2412

fprintf(stderr,"Transform P-values to Z-scores\n");

2413

float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL

2414

float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans

2415

printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans);

2416

2417

Reset();

2418

while (!End())

2419

{

2420

hit = ReadNext();

2421

if (hit.irep>1) continue;

2422

k = index.Show(hit.name);

2423

if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);}

2424

if (hit.logPvalt<0)

2425

Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk)

2426

else

2427

Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk

2428

// printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]);

2429

// if (isnan(Zq[k])) {

2430

// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");

2431

// printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]);

2432

// par.trans=0;

2433

// return;

2434

// }

2435

if (Zq[k]>Zmax_neg) excluded.Add(hit.fold);

2436

fold[k] = new(char[IDLEN]);

2437

fam[k] = new(char[IDLEN]);

2438

strcpy(fold[k],hit.fold);

2439

strcpy(fam[k],hit.fam);

2440

weight[k] = hit.weight;

2441

Prob[k] = hit.Probab;

2442

}

2443

2444

if (v>=3)

2445

{

2446

excluded.Reset();

2447

while (!excluded.End())

2448

{

2449

excluded.ReadNext(name);

2450

printf("Excluded fold %s from fitting to Ztq\n",name);

2451

}

2452

}

2453

2454

////////////////////////////////////////////////////////////////

2455

// Calculate transitive score (query->l) Zt[l]

2456

2457

// Construct vector ll of indices l for which Z_lq > Zmin_trans

2458

m = 0;

2459

for (l=0; l<N; l++)

2460

if (Zq[l]>=Zmin_trans) ll[m++]=l;

2461

M = m; // number of indices l for which Z_lq,Z_lk > Zmin_trans

2462

2463

// for (m=0; m<M; m++)

2464

// fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]);

2465

2466

if (M<=1)

2467

for (k=0; k<N; k++) Ztq[k]=0.0;

2468

else

2469

{

2470

// Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans

2471

double** Csub = new(double*[M]);

2472

for (m=0; m<M; m++)

2473

{

2474

Csub[m] = new(double[M]);

2475

for (n=0; n<M; n++)

2476

Csub[m][n] = double(C[ll[m]][ll[n]]);

2477

}

2478

2479

if (v>=3)

2480

{

2481

fprintf(stderr,"Covariance matrix\n");

2482

PrintMatrix(Csub,M);

2483

}

2484

2485

2486

// Calculate weights w[l]

2487

for (m=0; m<M; m++)

2488

{

2489

double sum = 0.0;

2490

for (n=0; n<M; n++)

2491

sum += fmax(0.0,Csub[m][n]);

2492

printf("w[%4i] = %-8.5f\n",ll[m],1.0/sum);

2493

w[m] = 1.0/sum;

2494

}

2495

2496

// Calculate Ztq[k] for all HMMs k

2497

fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n");

2498

float norm = NormalizationFactor(Csub,w,M);

2499

for (k=0; k<N; k++)

2500

{

2501

double sumZ = 0.0;

2502

for (m=0; m<M; m++)

2503

sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]);

2504

Ztq[k] = sumZ/norm;

2505

}

2506

2507

for (l=0; l<M; l++){

2508

delete[](Csub[l]); (Csub[l]) = NULL;

2509

}

2510

delete[](Csub); (Csub) = NULL;

2511

}

2512

2513

////////////////////////////////////////////////////////////////

2514

// Calculate reverse transitive score (l->query-) Zrq[l]

2515

2516

fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n");

2517

for (k=0; k<N; k++)

2518

{

2519

// Construct vector ll of indices l for which Z_lk > Zmin_tran

2520

m = 0;

2521

for (l=0; l<N; l++)

2522

if (Z[k][l]>=Zmin_trans) ll[m++]=l;

2523

int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran

2524

2525

2526

// fprintf(stderr,"\nfam[k]: %s\n",fam[k]);

2527

// for (m=0; m<M; m++)

2528

// printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]);

2529

2530

if (M<=1)

2531

{

2532

Zrq[k] = Zq[k];

2533

}

2534

else

2535

{

2536

// Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans

2537

double** Csub = new(double*[M]);

2538

for (m=0; m<M; m++)

2539

{

2540

Csub[m] = new(double[M]);

2541

for (n=0; n<M; n++)

2542

Csub[m][n] = double(C[ll[m]][ll[n]]);

2543

}

2544

// fprintf(stderr,"Covariance matrix\n");

2545

// PrintMatrix(Csub,M);

2546

2547

// Calculate weights w[l]

2548

for (m=0; m<M; m++)

2549

{

2550

double sum = 0.0;

2551

for (n=0; n<M; n++)

2552

sum += fmax(0.0,Csub[m][n]);

2553

w[m] = 1.0/sum;

2554

}

2555

2556

// for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]);

2557

2558

2559

// Calculate Zrq[k] and normalize

2560

float norm = NormalizationFactor(Csub,w,M);

2561

double sumZ = 0.0;

2562

for (m=0; m<M; m++)

2563

sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]);

2564

Zrq[k] = sumZ/norm;

2565

2566

for (l=0; l<M; l++){

2567

delete[](Csub[l]); (Csub[l]) = NULL;

2568

}

2569

delete[](Csub); (Csub) = NULL;

2570

}

2571

2572

// fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]);

2573

}

2574

2575

// Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score

2576

for (k=0; k<N; k++)

2577

{

2578

float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]);

2579

// if (isnan(Zqtot))

2580

// {

2581

// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");

2582

// printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);

2583

// par.trans=0;

2584

// return;

2585

// }

2586

if (v>=3 && Zq[k] + Zqtot > 2*Zmin_trans) {

2587

printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);

2588

}

2589

Ztq[k] = Zqtot;

2590

}

2591

2592

// Calculate mean and standard deviation of Z1q

2593

fprintf(stderr,"Calculate mean and standard deviation of Ztq\n");

2594

double sumw=0.0;

2595

double sumZ=0.0;

2596

double sumZ2=0.0;

2597

for (k=0; k<N; k++)

2598

{

2599

if (excluded.Contains(fold[k])) continue;

2600

sumw += weight[k];

2601

sumZ += weight[k]*Ztq[k];

2602

sumZ2 += weight[k]*Ztq[k]*Ztq[k];

2603

// if (isnan(sumZ))

2604

// {

2605

// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");

2606

// printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]);

2607

// par.trans=0;

2608

// return;

2609

// }

2610

}

2611

float mu = sumZ/sumw;

2612

float sigma = sqrt(sumZ2/sumw-mu*mu);

2613

if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma);

2614

sigma *= 1.01;// correct different fitting of EVD and normal variables

2615

2616

// Normalize Ztq and calculate P1-values

2617

fprintf(stderr,"Normalize Ztq and calculate P1-values\n");

2618

Reset();

2619

while (!End())

2620

{

2621

hit = ReadNext();

2622

hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma);

2623

hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval));

2624

// P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))

2625

hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss;

2626

hit.Probab = Probab(hit);

2627

hit.score_sort = hit.logPval;

2628

Overwrite(hit); // copy hit object into current position of hitlist

2629

}

2630

2631

for (k=0; k<N; k++){

2632

delete[](Z[k]); (Z[k]) = NULL;

2633

}

2634

for (k=0; k<N; k++){

2635

delete[](C[k]); (C[k]) = NULL;

2636

}

2637

for (k=0; k<N; k++){

2638

delete[](fold[k]); (fold[k]) = NULL;

2639

}

2640

for (k=0; k<N; k++){

2641

delete[](fam[k]); (fam[k]) = NULL;

2642

}

2643

delete[](C); (C) = NULL;

2644

delete[](Z); (Z) = NULL;

2645

delete[](fold); (fold) = NULL;

2646

delete[](fam); (fam) = NULL;

2647

delete[](Prob); (Prob) = NULL;

2648

delete[](ll); (ll) = NULL;

2649

delete[](Zq); (Zq) = NULL;

2650

delete[](Ztq); (Ztq) = NULL;

2651

}

2652

2653

2654

/////////////////////////////////////////////////////////////////////////////////////

2655

/**

2656

* @brief Score2Z transforms the -log(P-value) score into a Z-score for 0 < S

2657

* Score2Z(S) = sqrt(2)*dierfc(2*e^(-S)), where dierfc is the inverse of the complementary error function

2658

*/

2659

double

2660

HitList::Score2Z(double S)

2661

{

2662

double s, t, u, w, x, y, z;

2663

if (S<=0) return double(-100000);

2664

y = ( S>200 ? 0.0 : 2.0*exp(-S) );

2665

if (y > 1)

2666

{

2667

z = (S<1e-6? 2*S : 2-y);

2668

w = 0.916461398268964 - log(z);

2669

}

2670

else

2671

{

2672

z = y;

2673

w = 0.916461398268964 - (0.69314718056-S);

2674

}

2675

2676

u = sqrt(w);

2677

s = (log(u) + 0.488826640273108) / w;

2678

t = 1 / (u + 0.231729200323405);

2679

2680

x = u * (1 - s * (s * 0.124610454613712 + 0.5)) -

2681

((((-0.0728846765585675 * t + 0.269999308670029) * t +

2682

0.150689047360223) * t + 0.116065025341614) * t +

2683

0.499999303439796) * t;

2684

t = 3.97886080735226 / (x + 3.97886080735226);

2685

u = t - 0.5;

2686

s = (((((((((0.00112648096188977922 * u +

2687

1.05739299623423047e-4) * u - 0.00351287146129100025) * u -

2688

7.71708358954120939e-4) * u + 0.00685649426074558612) * u +

2689

0.00339721910367775861) * u - 0.011274916933250487) * u -

2690

0.0118598117047771104) * u + 0.0142961988697898018) * u +

2691

0.0346494207789099922) * u + 0.00220995927012179067;

2692

s = ((((((((((((s * u - 0.0743424357241784861) * u -

2693

0.105872177941595488) * u + 0.0147297938331485121) * u +

2694

0.316847638520135944) * u + 0.713657635868730364) * u +

2695

1.05375024970847138) * u + 1.21448730779995237) * u +

2696

1.16374581931560831) * u + 0.956464974744799006) * u +

2697

0.686265948274097816) * u + 0.434397492331430115) * u +

2698

0.244044510593190935) * t -

2699

(z==0? 0: z * exp(x * x - 0.120782237635245222));

2700

x += s * (x * s + 1);

2701

if (y > 1) {

2702

x = -x;

2703

}

2704

return double (1.41421356237*x);

2705

}

2706

2707

/////////////////////////////////////////////////////////////////////////////////////////////////////////

2708

/**

2709

* @brief Z2Score transforms the Z-score into a -log(P-value) value

2710

* Z2Score(Z) = log(2) - log( erfc(Z/sqrt(2)) ) , where derfc is the complementary error function

2711

*/

2712

double

2713

HitList::Z2Score(double Z)

2714

{

2715

double t, u, x, y;

2716

x = 0.707106781188*Z;

2717

if (x>10) return 0.69314718056 - (-x*x - log( (1-0.5/x/x)/x/1.772453851) );

2718

t = 3.97886080735226 / (fabs(x) + 3.97886080735226);

2719

u = t - 0.5;

2720

y = (((((((((0.00127109764952614092 * u + 1.19314022838340944e-4) * u -

2721

0.003963850973605135) * u - 8.70779635317295828e-4) * u +

2722

0.00773672528313526668) * u + 0.00383335126264887303) * u -

2723

0.0127223813782122755) * u - 0.0133823644533460069) * u +

2724

0.0161315329733252248) * u + 0.0390976845588484035) * u +

2725

0.00249367200053503304;

2726

y = ((((((((((((y * u - 0.0838864557023001992) * u -

2727

0.119463959964325415) * u + 0.0166207924969367356) * u +

2728

0.357524274449531043) * u + 0.805276408752910567) * u +

2729

1.18902982909273333) * u + 1.37040217682338167) * u +

2730

1.31314653831023098) * u + 1.07925515155856677) * u +

2731

0.774368199119538609) * u + 0.490165080585318424) * u +

2732

0.275374741597376782) * t * (x>10? 0.0 : exp(-x * x));

2733

return 0.69314718056 - log( x < 0 ? 2 - y : y );

2734

}

2735

2736

2737

/////////////////////////////////////////////////////////////////////////////////////////////////////////

2738

/**
 * @brief Write the N x N float matrix V to stderr.
 *
 * Each row starts with a "k=<row>" header line; its entries follow as
 * "<column>:<value>" pairs, with a line break after every tenth entry and
 * at the end of the row. A final empty line terminates the dump.
 *
 * @param V  matrix, accessed as V[row][column]
 * @param N  number of rows and columns
 */
void
PrintMatrix(float** V, int N)
{
    for (int row=0; row<N; ++row)
        {
            fprintf(stderr,"k=%4i \n",row);
            int col = 0;
            while (col<N)
                {
                    fprintf(stderr,"%4i:%6.3f ",col,V[row][col]);
                    ++col;
                    // wrap the output after every 10 entries
                    if (col%10==0) fprintf(stderr,"\n");
                }
            fprintf(stderr,"\n");
        }
    fprintf(stderr,"\n");
}

2757

2758

/////////////////////////////////////////////////////////////////////////////////////////////////////////

2759

/**
 * @brief Write the N x N double matrix V to stderr.
 *
 * Each row starts with a "k=<row>" header line; its entries follow as
 * "<column>:<value>" pairs, with a line break after every tenth entry and
 * at the end of the row. A final empty line terminates the dump.
 *
 * @param V  matrix, accessed as V[row][column]
 * @param N  number of rows and columns
 */
void
PrintMatrix(double** V, int N)
{
    for (int row=0; row<N; ++row)
        {
            fprintf(stderr,"k=%4i \n",row);
            int col = 0;
            while (col<N)
                {
                    fprintf(stderr,"%4i:%6.3f ",col,V[row][col]);
                    ++col;
                    // wrap the output after every 10 entries
                    if (col%10==0) fprintf(stderr,"\n");
                }
            fprintf(stderr,"\n");
        }
    fprintf(stderr,"\n");
}

2778

2779

/////////////////////////////////////////////////////////////////////////////////////////////////////////

2780

/**

2781

* @brief

2782

*/

2783

void

2784

HitList::Normalize(float* Ztq, char** fold, Hash<int>& excluded)

2785

{

2786

double sumw=0.0;

2787

double sumZ=0.0;

2788

double sumZ2=0.0;

2789

for (int k=0; k<N_searched; k++)

2790

{

2791

if (excluded.Contains(fold[k])) continue;

2792

sumw += weight[k];

2793

sumZ += weight[k]*Ztq[k];

2794

sumZ2 += weight[k]*Ztq[k]*Ztq[k];

2795

}

2796

float mu = sumZ/sumw;

2797

float sigma = sqrt(sumZ2/sumw-mu*mu);

2798

printf("Transitive score Ztq: mu=%8.3g sigma=%8.3g\n",mu,sigma);

2799

for (int k=0; k<N_searched; k++) Ztq[k] = (Ztq[k]-mu)/sigma;

2800

return;

2801

}

2802

2803

/////////////////////////////////////////////////////////////////////////////////////////////////////////

2804

/**

2805

* @brief Calculate standard deviation of Z1 = sum_m [ w_m * Z_m ], where Csub_mn = cov(Z_m,Z_n)

2806

*/

2807

float

2808

HitList::NormalizationFactor(double** Csub, float* w, int M)

2809

{

2810

double sum=0.0;

2811

for (int m=0; m<M; m++)

2812

{

2813

double summ=0.0;

2814

for (int n=0; n<M; n++) summ += Csub[m][n]*w[n];

2815

sum += w[m]*summ;

2816

}

2817

return sqrt(sum);

2818

}

2819

2820

/////////////////////////////////////////////////////////////////////////////////////////////////////////

2821

/**

2822

* @brief Calculate inverse of matrix A and store result in B

2823

*/

2824

void

2825

HitList::InvertMatrix(double** B, double** A, int N)

2826

{

2827

if (N==0)

2828

{

2829

printf("Error: InvertMatrix called with matrix of dimension 0\n");

2830

exit(6);

2831

}

2832

if (N==1)

2833

{

2834

B[0][0] = (A[0][0]==0.0? 0 :1.0/A[0][0]);

2835

return;

2836

}

2837

2838

int k,l,m;

2839

double** V = new(double*[N]);

2840

double* s = new(double[N]);

2841

for (k=0; k<N; k++) V[k] = new(double[N]);

2842

2843

// Copy original matrix A into B since B will be overwritten by SVD()

2844

for (k=0; k<N; k++)

2845

for (l=0; l<N; l++)

2846

B[k][l] = A[k][l];

2847

2848

SVD(B, N, s, V); // U replaces B on output; s[] contains singluar values

2849

2850

// Calculate inverse of A: A^-1 = V * diag(1/s) * U^t

2851

double** U = B;

2852

// Calculate V[k][m] -> V[k][m] *diag(1/s)

2853

for (k=0; k<N; k++)

2854

for (m=0; m<N; m++)

2855

if (s[m]!=0.0) V[k][m] /= s[m]; else V[k][m] = 0.0;

2856

// Calculate V[k][l] -> (V * U^t)_kl

2857

for (k=0; k<N; k++)

2858

{

2859

if (v>=4 && k%100==0) printf("%i\n",k);

2860

for (l=0; l<N; l++)

2861

{

2862

s[l] = 0.0; // use s[] as temporary memory to avoid overwriting B[k][] as long as it is needed

2863

for (m=0; m<N; m++)

2864

s[l] += V[k][m]*U[l][m];

2865

}

2866

for (l=0; l<N; l++) V[k][l]=s[l];

2867

}

2868

for (k=0; k<N; k++)

2869

for (l=0; l<N; l++)

2870

B[k][l] = V[k][l];

2871

2872

for (k=0; k<N; k++){

2873

delete[](V[k]); (V[k]) = NULL;

2874

}

2875

delete[](V); (V) = NULL;

2876

return;

2877

}

2878

2879

2880

/////////////////////////////////////////////////////////////////////////////////////////////////////////

2881

/**

2882

* @brief

2883

*/

2884

void

2885

HitList::TransposeMatrix(double** V, int N)

2886

{

2887

int k,l;

2888

for (k=0; k<N; k++) // transpose Z for efficiency of ensuing matrix multiplication

2889

for (l=0; l<k; l++)

2890

{

2891

double buf = V[k][l];

2892

V[k][l] = V[l][k];

2893

V[l][k] = buf;

2894

}

2895

}

2896

2897

/////////////////////////////////////////////////////////////////////////////////////////////////////////

2898

// Helper macros for the SVD code below. Each macro first stores its
// argument(s) in a file-level scratch variable so that an argument expression
// is evaluated only once. Because that storage is shared, the macros must not
// be nested inside themselves and are not safe for concurrent use.

// Scratch storage for SQR.
static double sqrarg;

2899

// SQR(a): a*a, with an explicit shortcut that yields 0.0 for a == 0.0.
#define SQR(a) ((sqrarg=(a)) == 0.0 ? 0.0 : sqrarg*sqrarg)

2900

// Scratch storage for FMAX.
static double maxarg1,maxarg2;

2901

// FMAX(a,b): the larger of the two double values a and b.
#define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ? (maxarg1) : (maxarg2))

2902

// Scratch storage for IMIN.
static int iminarg1,iminarg2;

2903

// IMIN(a,b): the smaller of the two int values a and b.
#define IMIN(a,b) (iminarg1=(a),iminarg2=(b),(iminarg1) < (iminarg2) ? (iminarg1) : (iminarg2))

2904

// SIGN(a,b): |a| if b >= 0.0, otherwise -|a|. Unlike the macros above, the
// arguments are substituted directly and a is named in both branches.
#define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a))

2905

2906

/**
 * @brief This is a version of the Golub and Reinsch algorithm for singular value decomposition for a quadratic
 * (n x n) matrix A. It is sped up by transposing the A and V matrices at various places in the algorithm.
 * On a 400x400 matrix it runs in 1.6 s or 2.3 times faster than the original (n x m) version.
 * On a 4993x4993 matrix it runs in 2h03 or 4.5 times faster than the original (n x m) version.
 *
 * Given a matrix a[0..n-1][0..n-1], this routine computes its singular value decomposition, A = U * W * V^t.
 * The matrix U replaces a on output. The diagonal matrix of singular values W is output as a vector w[0..n-1].
 * The matrix V (not the transpose V^t) is output as V[0..n-1][0..n-1].
 *
 * If a singular value has not converged after 30 iterations, an error message
 * is printed and exit(7) is called. Progress messages are printed depending
 * on the value of the variable v, which is declared outside this function.
 *
 * @param A  n x n input matrix; overwritten with U
 * @param n  dimension of A and V, and length of w
 * @param w  output: the n singular values
 * @param V  output: pre-allocated n x n matrix that receives V
 */

2916

void

2917

HitList::SVD(double **A, int n, double w[], double **V)

2918

{

2919

int m=n; // in general algorithm A is an (m x n) matrix instead of (n x n)

2920

2921

// pythag() is defined further down in this file; declare it for use here
double pythag(double a, double b);

2922

int flag,i,its,j,jj,k,l=1,nm=1;

2923

double anorm,c,f,g,h,s,scale,x,y,z,*rv1;

2924

// rv1: work vector of length n; released at the end of the function
rv1=new(double[n]);

2925

g=scale=anorm=0.0;

2926

2927

// Householder reduction to bidiagonal form.

2928

if (v>=5) printf("\nHouseholder reduction to bidiagonal form\n");

2929

for (i=0;i<n;i++) {

2930

if (v>=4 && i%100==0) printf("i=%i\n",i);

2931

if (v>=4) fprintf(stderr,".");

2932

l=i+1;

2933

rv1[i]=scale*g;

2934

g=s=scale=0.0;

2935

// Column step: works on column i of A, rows i..m-1
if (i < m) {

2936

for (k=i;k<m;k++) scale += fabs(A[k][i]);

2937

if (scale) {

2938

for (k=i;k<m;k++) {

2939

A[k][i] /= scale;

2940

s += A[k][i]*A[k][i];

2941

}

2942

f=A[i][i];

2943

g = -SIGN(sqrt(s),f);

2944

h=f*g-s;

2945

A[i][i]=f-g;

2946

for (j=l;j<n;j++) {

2947

for (s=0.0,k=i;k<m;k++) s += A[k][i]*A[k][j];

2948

f=s/h;

2949

for (k=i;k<m;k++) A[k][j] += f*A[k][i];

2950

}

2951

for (k=i;k<m;k++) A[k][i] *= scale;

2952

}

2953

}

2954

w[i]=scale *g;

2955

g=s=scale=0.0;

2956

// Row step: works on row i of A, columns l..n-1
if (i < m && i != n-1) {

2957

for (k=l;k<n;k++) scale += fabs(A[i][k]);

2958

if (scale) {

2959

for (k=l;k<n;k++) {

2960

A[i][k] /= scale;

2961

s += A[i][k]*A[i][k];

2962

}

2963

f=A[i][l];

2964

g = -SIGN(sqrt(s),f);

2965

h=f*g-s;

2966

A[i][l]=f-g;

2967

for (k=l;k<n;k++) rv1[k]=A[i][k]/h;

2968

for (j=l;j<m;j++) {

2969

for (s=0.0,k=l;k<n;k++) s += A[j][k]*A[i][k];

2970

for (k=l;k<n;k++) A[j][k] += s*rv1[k];

2971

}

2972

for (k=l;k<n;k++) A[i][k] *= scale;

2973

}

2974

}

2975

// anorm: largest value of |w[i]|+|rv1[i]| seen so far; it is the reference
// magnitude for the "negligible" tests in the diagonalization below
anorm=FMAX(anorm,(fabs(w[i])+fabs(rv1[i])));

2976

}

2977

// Accumulation of right-hand transformations.

2978

if (v>=5) printf("\nAccumulation of right-hand transformations\n");

2979

// From here until the final TransposeMatrix(V,n), V is addressed in
// transposed form, i.e. V[i][j] holds element (j,i) of the result.
// NOTE(review): this call swaps elements of V before this function has
// written any of them - confirm that V's input contents are irrelevant.
TransposeMatrix(V,n);

2980

for (i=n-1;i>=0;i--) {

2981

if (v>=4 && i%100==0) printf("i=%i\n",i);

2982

if (v>=4) fprintf(stderr,".");

2983

if (i < n-1) {

2984

if (g) {

2985

// Double division to avoid possible underflow.

2986

for (j=l;j<n;j++)

2987

V[i][j]=(A[i][j]/A[i][l])/g;

2988

for (j=l;j<n;j++) {

2989

for (s=0.0,k=l;k<n;k++) s += A[i][k]*V[j][k];

2990

for (k=l;k<n;k++) V[j][k] += s*V[i][k];

2991

}

2992

}

2993

for (j=l;j<n;j++) V[j][i]=V[i][j]=0.0;

2994

}

2995

V[i][i]=1.0;

2996

g=rv1[i];

2997

l=i;

2998

}

2999

// Accumulation of left-hand transformations.

3000

if (v>=5) printf("\nAccumulation of left-hand transformations\n");

3001

// From here until the final TransposeMatrix(A,n), A is addressed in
// transposed form as well
TransposeMatrix(A,n);

3002

for (i=IMIN(m,n)-1;i>=0;i--) {

3003

if (v>=4 && i%100==0) printf("i=%i\n",i);

3004

if (v>=4) fprintf(stderr,".");

3005

l=i+1;

3006

g=w[i];

3007

for (j=l;j<n;j++) A[j][i]=0.0;

3008

if (g) {

3009

g=1.0/g;

3010

for (j=l;j<n;j++) {

3011

for (s=0.0,k=l;k<m;k++) s += A[i][k]*A[j][k];

3012

f=(s/A[i][i])*g;

3013

for (k=i;k<m;k++) A[j][k] += f*A[i][k];

3014

}

3015

for (j=i;j<m;j++) A[i][j] *= g;

3016

} else for (j=i;j<m;j++) A[i][j]=0.0;

3017

++A[i][i];

3018

}

3019

3020

// Diagonalization of the bidiagonal form: Loop over singular values, and over allowed iterations.

3021

if (v>=5) printf("\nDiagonalization of the bidiagonal form\n");

3022

for (k=n-1;k>=0;k--) {

3023

if (v>=4 && k%100==0) printf("k=%i\n",k);

3024

if (v>=4) fprintf(stderr,".");

3025

for (its=1;its<=30;its++) {

3026

flag=1;

3027

// Test for splitting. With the 0-based indexing used here it is rv1[0]
// that is zero (it is assigned scale*g while both are still 0.0), so the
// first test below ends the loop at l=0 before w[nm] with nm=-1 is read.

3028

for (l=k;l>=0;l--) {

3029

nm=l-1;

3030

// A value counts as negligible if adding it to anorm does not change anorm
if ((double)(fabs(rv1[l])+anorm) == anorm) {

3031

flag=0;

3032

break;

3033

}

3034

if ((double)(fabs(w[nm])+anorm) == anorm) break;

3035

}

3036

if (flag) {

3037

// Cancellation of rv1[l], if l > 0.

3038

c=0.0;

3039

s=1.0;

3040

for (i=l;i<=k;i++) {

3041

f=s*rv1[i];

3042

rv1[i]=c*rv1[i];

3043

if ((double)(fabs(f)+anorm) == anorm) break;

3044

g=w[i];

3045

h=pythag(f,g);

3046

w[i]=h;

3047

h=1.0/h;

3048

c=g*h;

3049

s = -f*h;

3050

for (j=0;j<m;j++) {

3051

y=A[nm][j];

3052

z=A[i][j];

3053

A[nm][j]=y*c+z*s;

3054

A[i][j]=z*c-y*s;

3055

}

3056

}

3057

}

3058

z=w[k];

3059

// Convergence.

3060

if (l == k) {

3061

// Singular value is made nonnegative.

3062

if (z < 0.0) {

3063

w[k] = -z;

3064

for (j=0;j<n;j++) V[k][j] = -V[k][j];

3065

}

3066

break;

3067

}

3068

// Give up: the process is terminated without releasing rv1
if (its == 30) {printf("Error in SVD: no convergence in 30 iterations\n"); exit(7);}

3069

// Shift from bottom 2-by-2 minor.

3070

x=w[l];

3071

nm=k-1;

3072

y=w[nm];

3073

g=rv1[nm];

3074

h=rv1[k];

3075

f=((y-z)*(y+z)+(g-h)*(g+h))/(2.0*h*y);

3076

g=pythag(f,1.0);

3077

f=((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;

3078

// Next QR transformation:

3079

c=s=1.0;

3080

for (j=l;j<=nm;j++) {

3081

i=j+1;

3082

g=rv1[i];

3083

y=w[i];

3084

h=s*g;

3085

g=c*g;

3086

z=pythag(f,h);

3087

rv1[j]=z;

3088

c=f/z;

3089

s=h/z;

3090

f=x*c+g*s;

3091

g = g*c-x*s;

3092

h=y*s;

3093

y *= c;

3094

// Apply the rotation (c,s) to rows j and i of the transposed V
for (jj=0;jj<n;jj++) {

3095

x=V[j][jj];

3096

z=V[i][jj];

3097

V[j][jj]=x*c+z*s;

3098

V[i][jj]=z*c-x*s;

3099

}

3100

z=pythag(f,h);

3101

// Rotation can be arbitrary if z = 0.

3102

w[j]=z;

3103

if (z) {

3104

z=1.0/z;

3105

c=f*z;

3106

s=h*z;

3107

}

3108

f=c*g+s*y;

3109

x=c*y-s*g;

3110

3111

// Apply the rotation (c,s) to rows j and i of the transposed A
for (jj=0;jj<m;jj++) {

3112

y=A[j][jj];

3113

z=A[i][jj];

3114

A[j][jj]=y*c+z*s;

3115

A[i][jj]=z*c-y*s;

3116

}

3117

}

3118

rv1[l]=0.0;

3119

rv1[k]=f;

3120

w[k]=x;

3121

}

3122

}

3123

// Undo the two earlier transpositions so that V and A (= U) are returned
// in their normal orientation
TransposeMatrix(V,n);

3124

TransposeMatrix(A,n);

3125

delete[](rv1); (rv1) = NULL;

3126

}

3127

3128

/**

3129

* @brief Computes (a2 + b2 )^1/2 without destructive underflow or overflow.

3130

*/

3131

double

3132

pythag(double a, double b)

3133

{

3134

double absa,absb;

3135

absa=fabs(a);

3136

absb=fabs(b);

3137

if (absa > absb)

3138

return absa*sqrt(1.0+SQR(absb/absa));

3139

else

3140

return (absb == 0.0 ? 0.0 : absb*sqrt(1.0+SQR(absa/absb)));

3141

}

3142

3143

3144

/* @* HitList::ClobberGlobal(void)

3145

*/

3146

void

3147

HitList::ClobberGlobal(void){

3148

3149

3150

/* @<variables local to HitList::ClobberGlobal@> */

3151

class List<Hit>::ListEl<Hit> *pvIter = head;

3152

3153

/* NOTE: no free/delete-ing of data to be done here

3154

hitlist only holds a shallow copy of hit;

3155

hit is being cleared off properly.

3156

just reset everything to 0/0.0/NULL.

3157

The only important thing to do at this stage

3158

is to attach head and tail and set size = 0

3159

(FS, 2010-02-18)

3160

3161

NOTE: I only ever saw 1 (one) in-between element,

3162

but there may ctually be a real linked list

3163

of more than 1 element (FS, 2010-02-18)

3164

*/

3165

3166

// printf("POINTER:\t%p\t=HEAD\n", head);

3167

while (pvIter->next != tail){

3168

3169

// printf("POINTER:\t%p->\t%p\n", pvIter, pvIter->next);

3170

pvIter = pvIter->next;

3171

3172

#if 1

3173

pvIter->data.longname = pvIter->data.name =

3174

pvIter->data.file = pvIter->data.dbfile = NULL;

3175

pvIter->data.sname = NULL;

3176

pvIter->data.seq = NULL;

3177

pvIter->data.self = 0;

3178

pvIter->data.i = pvIter->data.j = NULL;

3179

pvIter->data.states = NULL;

3180

pvIter->data.S = pvIter->data.S_ss = pvIter->data.P_posterior = NULL;

3181

pvIter->data.Xcons = NULL;

3182

pvIter->data.sum_of_probs = 0.0;

3183

pvIter->data.Neff_HMM = 0.0;

3184

pvIter->data.score_ss = pvIter->data.Pval = pvIter->data.logPval =

3185

pvIter->data.Eval = pvIter->data.Probab = pvIter->data.Pforward = 0.0;

3186

pvIter->data.nss_conf = pvIter->data.nfirst =

3187

pvIter->data.i1 = pvIter->data.i2 = pvIter->data.j1 = pvIter->data.j2 =

3188

pvIter->data.matched_cols = pvIter->data.ssm1 = pvIter->data.ssm2 = 0;

3189

#endif

3190

}

3191

// printf("POINTER:\t\t\t%p=TAIL\n", tail);

3192

3193

3194

head->next = tail;

3195

tail->prev = head;

3196

size = 0;

3197

3198

/* @= */

3199

return;

3200

3201

} /* this is the end of HitList::ClobberGlobal() */

3202

3203

3204

/*

3205

* EOF hhhitlist-C.h

3206

*/