  * @param query the query sequence [in]
  * @param location locations on the query to be indexed in table [in]
  * @param mb_lt the (already allocated) megablast lookup table structure [in|out]
- * @param width number of bytes in megablast word (2 or 3) [in]
  * @param lookup_options specifies the word_size [in]
  * @return zero on success, negative number on failure.
-s_FillContigMBTable(BLAST_SequenceBlk* query, BlastSeqLoc* location,
-        BlastMBLookupTable* mb_lt, Uint1 width,
+s_FillContigMBTable(BLAST_SequenceBlk* query,
+        BlastSeqLoc* location,
+        BlastMBLookupTable* mb_lt,
         const LookupTableOptions* lookup_options)
    BlastSeqLoc* loc;
    const Uint1 kNucMask = 0xfc;
    /* 12-mers (or perhaps 8-mers) are used to build the lookup table
-      and this is what kWordLength specifies. */
-   const Int4 kWordLength = COMPRESSION_RATIO*width;
+      and this is what kLutWordLength specifies. */
+   const Int4 kLutWordLength = mb_lt->lut_word_length;
+   const Int4 kLutMask = mb_lt->hashsize - 1;
    /* The user probably specified a much larger word size (like 28)
       and this is what full_word_size is. */
+   Int4 full_word_size = mb_lt->word_length;
-   PV_ARRAY_TYPE *pv_array=NULL;
+   PV_ARRAY_TYPE *pv_array;
    Int4 pv_array_bts;
-   const Int4 kMask = (1 << (8*width - 2)) - 1;
    /* The calculation of the longest chain can be cpu intensive for
       long queries or sets of queries. So we use a helper_array to
       keep track of this, but compress it by kCompressionFactor so
        Since sequence pointer points to the end of the word, subtract
        word length from the loop boundaries. */
      Int4 from = loc->ssr->left;
-     Int4 to = loc->ssr->right - kWordLength;
+     Int4 to = loc->ssr->right - kLutWordLength;
      Int4 last_offset;
-     /* case of unmasked region >= kWordLength but < full_word_size,
+     /* case of unmasked region >= kLutWordLength but < full_word_size,
         so no hits should be generated. */
      if (full_word_size > (loc->ssr->right - loc->ssr->left + 1))
      seq = query->sequence_start + from;
-     pos = seq + kWordLength;
+     pos = seq + kLutWordLength;
      /* Also add 1 to all indices, because lookup table indices count
-     from -= kWordLength - 2;
+     from -= kLutWordLength - 2;
      last_offset = to + 2;
      for (index = from; index <= last_offset; index++) {
-        if ((val & kNucMask) != 0) { /* ambiguity or gap */
+        /* if an ambiguity is encountered, do not add
+           any words that would contain it */
+        if ((val & kNucMask) != 0) {
-           pos = seq + kWordLength;
+           pos = seq + kLutWordLength;
         /* get next base */
-        ecode = ((ecode & kMask) << 2) + val;
+        ecode = ((ecode << 2) & kLutMask) + val;
+#ifdef LOOKUP_VERBOSE
+        mb_lt->num_words_added++;
         if (mb_lt->hashtable[ecode] == 0) {
+#ifdef LOOKUP_VERBOSE
            mb_lt->num_unique_pos_added++;
            pv_array[ecode>>pv_array_bts] |=
                   (((PV_ARRAY_TYPE) 1)<<(ecode&PV_ARRAY_MASK));
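The fill loop above folds each new base into a rolling 2-bit index: the previous index is shifted left by two bits, masked down to the lookup word width, and the code of the incoming base is added. A minimal standalone sketch of that update, assuming a 12-base lookup word and bases already encoded as 0..3 (Int4/Uint1 replaced by stdint types); this is illustrative only, not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const int32_t lut_word_length = 12;                   /* assumed width */
        const int32_t lut_mask = (1 << (2 * lut_word_length)) - 1;
        const uint8_t bases[] = {0,1,2,3, 0,1,2,3, 0,1,2,3, 1,2};
        int32_t ecode = 0;
        size_t i;

        for (i = 0; i < sizeof(bases); i++) {
            /* shift in the next base; the mask drops the base that just
               fell off the left-hand end of the 12-base window */
            ecode = ((ecode << 2) & lut_mask) + bases[i];
            if (i + 1 >= (size_t)lut_word_length)
                printf("word ending at base %zu -> index %d\n", i, ecode);
        }
        return 0;
    }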
-   if ((mb_lt = (BlastMBLookupTable*) calloc(1, sizeof(BlastMBLookupTable))) == NULL)
+   mb_lt = (BlastMBLookupTable*)calloc(1, sizeof(BlastMBLookupTable));
-   bytes_in_word = (lookup_options->word_size + 1)/ 4;
-   if (bytes_in_word < 3)
-      mb_lt->hashsize = 1 << 16;
-   Int4 table_entries=0;
-   /* determine the approximate number of hashtable entries */
-   for (loc = location; loc; loc = loc->next) {
-      from = loc->ssr->left;
-      to = loc->ssr->right;
-      table_entries += (to - from);
-   /* To fit in the external cache of latter-day micro-
-      processors, the PV array must be compressed. pv_shift
-      below is the power of two that the array size is
-      divided by. The target PV array size is 128 kBytes.
-      If the query is too small or too large, the compression
-      should be higher. Small queries don't reuse the PV array,
-      and large queries saturate it. In either case, cache
-      is better used on other data. */
-   if (lookup_options->word_size == 11 && lookup_options->mb_template_length > 0) {
-      mb_lt->hashsize = 1 << 22;
-      if( table_entries <= kSmallQueryCutoff ||
-          table_entries >= kLargeQueryCutoff )
-      mb_lt->hashsize = 1 << 24;
-      if( table_entries <= kSmallQueryCutoff ||
-          table_entries >= kLargeQueryCutoff )
-   if (lookup_options->mb_template_length)
-      mb_lt->full_byte_scan = lookup_options->full_byte_scan; /* Only applies to discontiguous megablast. */
-      /* The next two are not allowed in discontiguous megablast. */
-      mb_lt->ag_scanning_mode = FALSE;
-      mb_lt->variable_wordsize = FALSE;
-      mb_lt->scan_step = 0;
-      mb_lt->ag_scanning_mode = TRUE; /* FIXME, are there wordsize where this should be FALSE? */
-      mb_lt->scan_step = CalculateBestStride(lookup_options->word_size, lookup_options->variable_wordsize,
-                                             lookup_options->lut_type);
-      mb_lt->variable_wordsize = lookup_options->variable_wordsize;
-   if ((mb_lt->hashtable = (Int4*)
-        calloc(mb_lt->hashsize, sizeof(Int4))) == NULL) {
-      MBLookupTableDestruct(mb_lt);
-   mb_lt->pv_array_bts = PV_ARRAY_BTS + pv_shift;
-   pv_size = mb_lt->hashsize>>(mb_lt->pv_array_bts);
-   /* Size measured in PV_ARRAY_TYPE's */
-   if ((mb_lt->pv_array = calloc(PV_ARRAY_BYTES, pv_size)) == NULL)
-      MBLookupTableDestruct(mb_lt);
-   if (lookup_options->mb_template_length)
-      status = s_FillDiscMBTable(query, location, mb_lt, width, lookup_options);
-      status = s_FillContigMBTable(query, location, mb_lt, width, lookup_options);
+   /* first determine how 'wide' the lookup table is,
+      i.e. how many nucleotide letters are in a single entry */
+   if (lookup_options->mb_template_length > 0) {
+      /* for discontiguous megablast the choice is fixed */
+      if (lookup_options->word_size == 11)
+         mb_lt->lut_word_length = 11;
+         mb_lt->lut_word_length = 12;
+      /* Choose the smallest lookup table that will not
+         be congested; this is a compromise between good
+         cache performance and few table accesses. The
+         number of entries where one table width becomes
+         better than another is probably machine-dependent */
+      ASSERT(lookup_options->word_size >= 10);
+      switch(lookup_options->word_size) {
+         if (approx_table_entries < 18000)
+            mb_lt->lut_word_length = 9;
+            mb_lt->lut_word_length = 10;
+         if (approx_table_entries < 18000)
+            mb_lt->lut_word_length = 9;
+         else if (approx_table_entries < 180000)
+            mb_lt->lut_word_length = 10;
+            mb_lt->lut_word_length = 11;
+         if (approx_table_entries < 18000)
+            mb_lt->lut_word_length = 9;
+         else if (approx_table_entries < 60000)
+            mb_lt->lut_word_length = 10;
+         else if (approx_table_entries < 900000)
+            mb_lt->lut_word_length = 11;
+            mb_lt->lut_word_length = 12;
+         if (approx_table_entries < 300000)
+            mb_lt->lut_word_length = 11;
+            mb_lt->lut_word_length = 12;
+      mb_lt->lut_word_length = MIN(lookup_options->word_size,
+                                   mb_lt->lut_word_length);
+   mb_lt->word_length = lookup_options->word_size;
+   mb_lt->hashsize = iexp(kAlphabetSize, mb_lt->lut_word_length);
+   mb_lt->hashtable = (Int4*)calloc(mb_lt->hashsize, sizeof(Int4));
+   if (mb_lt->hashtable == NULL) {
+      MBLookupTableDestruct(mb_lt);
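The comment above motivates choosing the smallest width that will not be congested: since the backbone is allocated as one Int4 per entry (the calloc above), each extra base of width quadruples the table. A throwaway sketch of the sizes involved, assuming kAlphabetSize is 4 and Int4 is 4 bytes (both assumptions here, not stated in the excerpt):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        int width;
        for (width = 9; width <= 12; width++) {
            int64_t hashsize = (int64_t)1 << (2 * width);   /* 4^width entries */
            printf("lut_word_length %2d -> %8lld entries, %5.1f MB backbone\n",
                   width, (long long)hashsize,
                   (double)(hashsize * 4) / (1024 * 1024));
        }
        return 0;
    }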
+   /* Allocate the PV array. To fit in the external cache of
+      latter-day microprocessors, the PV array cannot have one
+      bit for every lookup table entry. Instead we choose
+      a size that should fit in cache and make a single bit
+      of the PV array handle multiple hashtable entries if
+      If the query is too small or too large, the compression
+      should be higher. Small queries don't reuse the PV array,
+      and large queries saturate it. In either case, cache
+      is better used on something else */
+   if (mb_lt->hashsize <= 8 * kTargetPVSize)
+      pv_size = mb_lt->hashsize >> PV_ARRAY_BTS;
+      pv_size = kTargetPVSize / PV_ARRAY_BYTES;
+   if(approx_table_entries <= kSmallQueryCutoff ||
+      approx_table_entries >= kLargeQueryCutoff) {
+      pv_size = pv_size / 2;
+   mb_lt->pv_array_bts = ilog2(mb_lt->hashsize / pv_size);
+   mb_lt->pv_array = calloc(PV_ARRAY_BYTES, pv_size);
+   if (mb_lt->pv_array == NULL) {
+      MBLookupTableDestruct(mb_lt);
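To make the sizing concrete, here is a hedged worked example of the pv_size / pv_array_bts computation above, assuming a 32-bit PV_ARRAY_TYPE and the 128 kB cache target mentioned in the old comment (both values are assumptions, not taken from this patch), and ignoring the halving applied to very small or very large queries:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        const int64_t target_pv_bytes = 128 * 1024;  /* assumed kTargetPVSize       */
        const int64_t pv_word_bytes   = 4;           /* assumed sizeof(PV_ARRAY_TYPE) */
        const int64_t pv_word_bits    = 8 * pv_word_bytes;
        const int64_t hashsize = (int64_t)1 << (2 * 12);   /* a 4^12-entry table */
        int64_t pv_size, pv_array_bts = 0;

        if (hashsize <= 8 * target_pv_bytes)
            pv_size = hashsize / pv_word_bits;           /* one PV bit per entry */
        else
            pv_size = target_pv_bytes / pv_word_bytes;   /* capped at the target */

        /* pv_array_bts = ilog2(hashsize / pv_size) in the code above */
        while (((int64_t)1 << pv_array_bts) < hashsize / pv_size)
            pv_array_bts++;

        printf("pv_size = %lld words, pv_array_bts = %lld, %lld entries per PV bit\n",
               (long long)pv_size, (long long)pv_array_bts,
               (long long)(hashsize / (pv_size * pv_word_bits)));
        return 0;
    }

For a 4^12-entry table this gives pv_size = 32768 words and pv_array_bts = 9, so roughly 16 hashtable entries share each presence bit.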
+   if (lookup_options->mb_template_length > 0) {
+      /* discontiguous megablast */
+      mb_lt->full_byte_scan = lookup_options->full_byte_scan;
+      status = s_FillDiscMBTable(query, location, mb_lt, lookup_options);
+      /* contiguous megablast */
+      mb_lt->scan_step = mb_lt->word_length - mb_lt->lut_word_length + 1;
+      status = s_FillContigMBTable(query, location, mb_lt, lookup_options);
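The stride chosen above for contiguous megablast, scan_step = word_length - lut_word_length + 1, is the largest stride for which every window of word_length consecutive bases still contains at least one sampled lookup word. A small self-check of that claim, assuming the common defaults word_length = 28 and lut_word_length = 12 (assumed values, for illustration only):

    #include <stdio.h>

    int main(void)
    {
        const int word_length = 28, lut_word_length = 12;        /* assumed sizes */
        const int scan_step = word_length - lut_word_length + 1; /* = 17 */
        int start;

        /* subject offsets 0, scan_step, 2*scan_step, ... are sampled; check every
           possible alignment of a word_length-base match against that grid */
        for (start = 0; start < scan_step; start++) {
            int first_sample = ((start + scan_step - 1) / scan_step) * scan_step;
            int covered = (first_sample + lut_word_length <= start + word_length);
            printf("match at offset %2d: sampled word at %2d -> %s\n",
                   start, first_sample, covered ? "covered" : "MISSED");
        }
        return 0;
    }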
    if (status > 0) {
       MBLookupTableDestruct(mb_lt);
    max_hits -= mb_lt->longest_chain;
    abs_start = subject->sequence;
-   s = abs_start + start_offset/COMPRESSION_RATIO;
-   compressed_scan_step = mb_lt->scan_step / COMPRESSION_RATIO;
-   compressed_wordsize = mb_lt->compressed_wordsize;
-   word_size = compressed_wordsize*COMPRESSION_RATIO;
-   /* NB: s in this function always points to the start of the word!
-   if (mb_lt->scan_step % COMPRESSION_RATIO == 0) {
-      Uint1* s_end = abs_start + subject->length/COMPRESSION_RATIO -
-      for ( ; s <= s_end; s += compressed_scan_step) {
-         BlastNaLookupInitIndex(compressed_wordsize, s, &index);
-         if (NA_PV_TEST(pv_array, index, pv_array_bts)) {
-            if (total_hits >= max_hits)
-            q_off = mb_lt->hashtable[index];
-            s_off = (s - abs_start)*COMPRESSION_RATIO;
-               offset_pairs[total_hits].qs_offsets.q_off = q_off - 1;
-               offset_pairs[total_hits++].qs_offsets.s_off = s_off;
-               q_off = mb_lt->next_pos[q_off];
-      *end_offset = (s - abs_start)*COMPRESSION_RATIO;
-      Int4 last_offset = subject->length - word_size;
-      for (s_off = start_offset; s_off <= last_offset;
-           s_off += mb_lt->scan_step) {
+   lut_word_length = mb_lt->lut_word_length;
+   mask = mb_lt->hashsize - 1;
+   scan_step = mb_lt->scan_step;
+   last_offset = subject->length - lut_word_length;
+   ASSERT(lut_word_length == 9 ||
+          lut_word_length == 10 ||
+          lut_word_length == 11 ||
+          lut_word_length == 12);
+   if (lut_word_length > 9) {
+      /* perform scanning for lookup tables of width 10, 11, and 12.
+         These widths require three bytes of the compressed subject
+         sequence, and possibly a fourth if the word is not known
+         to be aligned on a 4-base boundary */
+      if (scan_step % COMPRESSION_RATIO == 0) {
+         /* for strides that are a multiple of 4, words are
+            always aligned and three bytes of the subject sequence
+            will always hold a complete word (plus possible extra bases
+            that must be shifted away). s_end below points to either
+            the second to last or third to last byte of the subject
+            sequence; all subject sequences must therefore have a
+         Uint1* s_end = abs_start + (subject->length - lut_word_length) /
+         Int4 shift = 2 * (12 - lut_word_length);
+         s = abs_start + start_offset / COMPRESSION_RATIO;
+         scan_step = scan_step / COMPRESSION_RATIO;
+         for ( ; s <= s_end; s += scan_step) {
+            index = s[0] << 16 | s[1] << 8 | s[2];
+            index = index >> shift;
+            if (NA_PV_TEST(pv_array, index, pv_array_bts)) {
+               if (total_hits >= max_hits)
+               q_off = mb_lt->hashtable[index];
+               s_off = (s - abs_start)*COMPRESSION_RATIO;
+                  offset_pairs[total_hits].qs_offsets.q_off = q_off - 1;
+                  offset_pairs[total_hits++].qs_offsets.s_off = s_off;
+                  q_off = mb_lt->next_pos[q_off];
+         *end_offset = (s - abs_start)*COMPRESSION_RATIO;
+         /* when the stride is not a multiple of 4, extra bases
+            may occur both before and after every word read from
+            the subject sequence. The portion of each 16-base
+            region that contains the actual word depends on the
+            offset of the word and the lookup table width, and
+            must be recalculated for each 16-base region */
+         for (s_off = start_offset; s_off <= last_offset; s_off += scan_step) {
+            Int4 shift = 2*(16 - (s_off % COMPRESSION_RATIO + lut_word_length));
+            s = abs_start + (s_off / COMPRESSION_RATIO);
+            index = s[0] << 24 | s[1] << 16 | s[2] << 8 | s[3];
+            index = (index >> shift) & mask;
+            if (NA_PV_TEST(pv_array, index, pv_array_bts)) {
+               if (total_hits >= max_hits)
+               q_off = mb_lt->hashtable[index];
+                  offset_pairs[total_hits].qs_offsets.q_off = q_off - 1;
+                  offset_pairs[total_hits++].qs_offsets.s_off = s_off;
+                  q_off = mb_lt->next_pos[q_off];
+      /* perform scanning for lookup tables of width 9.
+         Here the stride will never be a multiple of 4
+         (this table is only used for very small word sizes) */
+      for (s_off = start_offset; s_off <= last_offset; s_off += scan_step) {
+         Int4 shift = 2*(12 - (s_off % COMPRESSION_RATIO + lut_word_length));
          s = abs_start + (s_off / COMPRESSION_RATIO);
-         bit = 2*(s_off % COMPRESSION_RATIO);
-         /* Compute index for a word made of full bytes */
-         s = BlastNaLookupInitIndex(compressed_wordsize, s, &index);
-         /* Adjust the word index by the base within a byte */
-         adjusted_index = BlastNaLookupAdjustIndex(s, index, mb_lt->mask, bit);
-         if (NA_PV_TEST(pv_array, adjusted_index, pv_array_bts)) {
+         index = s[0] << 16 | s[1] << 8 | s[2];
+         index = (index >> shift) & mask;
+         if (NA_PV_TEST(pv_array, index, pv_array_bts)) {
            if (total_hits >= max_hits)
-           q_off = mb_lt->hashtable[adjusted_index];
+           q_off = mb_lt->hashtable[index];
               offset_pairs[total_hits].qs_offsets.q_off = q_off - 1;
               offset_pairs[total_hits++].qs_offsets.s_off = s_off;
    return total_hits;
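For readers working through the shift arithmetic in the scan loops above: a standalone sketch of how an 11-base index is pulled out of the 2-bit-packed subject at an arbitrary base offset, mirroring the unaligned-stride branch (COMPRESSION_RATIO is assumed to be 4, with the first base stored in the high-order bits of each byte); illustrative only, not part of the patch:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* 32 bases packed 4 per byte, first base in the two most significant bits */
        const uint8_t packed[8] = { 0x1B, 0x72, 0xC4, 0x9E, 0x3A, 0x55, 0xF0, 0x27 };
        const int lut_word_length = 11;                       /* assumed width */
        const uint32_t mask = (1u << (2 * lut_word_length)) - 1;
        int s_off;

        for (s_off = 0; s_off < 8; s_off++) {
            const uint8_t *s = packed + s_off / 4;
            /* same shift as the loop above: drop the bases that follow the word */
            int shift = 2 * (16 - (s_off % 4 + lut_word_length));
            uint32_t index = (uint32_t)s[0] << 24 | (uint32_t)s[1] << 16 |
                             (uint32_t)s[2] << 8  | (uint32_t)s[3];
            index = (index >> shift) & mask;
            printf("s_off %d -> index 0x%06x\n", s_off, (unsigned)index);
        }
        return 0;
    }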
-Int4 MB_ScanSubject(const LookupTableWrap* lookup,
-       const BLAST_SequenceBlk* subject, Int4 start_offset,
-       BlastOffsetPair* NCBI_RESTRICT offset_pairs, Int4 max_hits,
-   Uint1* abs_start,* s_end;
-   BlastMBLookupTable* mb_lt;
-   PV_ARRAY_TYPE *pv_array;
-   Int4 compressed_wordsize;
-   ASSERT(lookup->lut_type == MB_LOOKUP_TABLE);
-   mb_lt = (BlastMBLookupTable*) lookup->lut;
-   pv_array = mb_lt->pv_array;
-   pv_array_bts = mb_lt->pv_array_bts;
-   compressed_wordsize = mb_lt->compressed_wordsize;
-   /* Since the test for number of hits here is done after adding them,
-      subtract the longest chain length from the allowed offset array size.
-   max_hits -= mb_lt->longest_chain;
-   abs_start = subject->sequence;
-   s = abs_start + start_offset/COMPRESSION_RATIO;
-   s_end = abs_start + (*end_offset)/COMPRESSION_RATIO;
-   s = BlastNaLookupInitIndex(mb_lt->compressed_wordsize, s, &index);
-   /* In the following code, s points to the byte right after the end of
-   if (NA_PV_TEST(pv_array, index, pv_array_bts)) {
-      if (total_hits >= max_hits)
-      q_off = mb_lt->hashtable[index];
-      s_off = ((s - abs_start) - compressed_wordsize)*COMPRESSION_RATIO;
-         offset_pairs[total_hits].qs_offsets.q_off = q_off - 1;
-         offset_pairs[total_hits++].qs_offsets.s_off = s_off;
-         q_off = mb_lt->next_pos[q_off];
-   /* Compute the next value of the lookup index
-      (shifting sequence by a full byte) */
-   index = BlastNaLookupComputeIndex(FULL_BYTE_SHIFT, mb_lt->mask,
-      ((s - abs_start) - compressed_wordsize)*COMPRESSION_RATIO;
 Int4 MB_DiscWordScanSubject(const LookupTableWrap* lookup,
        const BLAST_SequenceBlk* subject, Int4 start_offset,
        BlastOffsetPair* NCBI_RESTRICT offset_pairs, Int4 max_hits,