2
* See the file LICENSE for redistribution information.
4
* Copyright (c) 1996-2004
5
* Sleepycat Software. All rights reserved.
8
* Copyright (c) 1990, 1993, 1994, 1995, 1996
9
* Keith Bostic. All rights reserved.
12
* Copyright (c) 1990, 1993
13
* The Regents of the University of California. All rights reserved.
15
* Redistribution and use in source and binary forms, with or without
16
* modification, are permitted provided that the following conditions
18
* 1. Redistributions of source code must retain the above copyright
19
* notice, this list of conditions and the following disclaimer.
20
* 2. Redistributions in binary form must reproduce the above copyright
21
* notice, this list of conditions and the following disclaimer in the
22
* documentation and/or other materials provided with the distribution.
23
* 3. Neither the name of the University nor the names of its contributors
24
* may be used to endorse or promote products derived from this software
25
* without specific prior written permission.
27
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39
* $Id: bt_rsearch.c,v 11.40 2004/07/23 17:21:09 bostic Exp $
42
#include "db_config.h"
44
#ifndef NO_SYSTEM_INCLUDES
45
#include <sys/types.h>
49
#include "dbinc/db_page.h"
50
#include "dbinc/btree.h"
51
#include "dbinc/db_shash.h"
52
#include "dbinc/lock.h"
57
* Search a btree for a record number.
59
* PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
62
/*
 * __bam_rsearch --
 *	Search a btree for a record number and leave the page(s) found on
 *	the cursor stack (BT_STK_ENTER / BT_STK_PUSH below).
 *
 * NOTE(review): this copy of the function is incomplete.  The bare numbers
 * interleaved with the code look like original source line numbers, and the
 * gaps between them show that lines were lost (parameter declarations,
 * braces, case labels, several return/goto/assignment statements).  Restore
 * the file from version control before changing any logic here.
 *
 * Parameters, as far as the surviving lines show:
 *	dbc	cursor; dbc->internal is used as a BTREE_CURSOR.
 *	recnop	record number; written with total + 1 when S_APPEND is set.
 *	flags	tested for S_STACK, S_PARENT, S_WRITE, S_APPEND, S_PAST_EOF.
 *	stop	page level compared against h->level when S_PARENT is set.
 *	exactp	no surviving line touches it -- TODO confirm its contract
 *		against the complete source.
 *
 * Returns (visible paths only): DB_NOTFOUND when the record is past the
 * end and the release of the page and lock succeeded, the failing call
 * result otherwise, and the result of __db_pgfmt for an unexpected page
 * type.  The success return is not visible in this copy.
 */
__bam_rsearch(dbc, recnop, flags, stop, exactp)
75
db_indx_t adjust, deloffset, indx, top;
76
db_lockmode_t lock_mode;
78
db_recno_t recno, t_recno, total;
79
int ret, stack, t_ret;
83
cp = (BTREE_CURSOR *)dbc->internal;
89
* There are several ways we search a btree tree. The flags argument
90
* specifies if we're acquiring read or write locks and if we are
91
* locking pairs of pages. In addition, if we're adding or deleting
92
* an item, we have to lock the entire tree, regardless. See btree.h
95
* If write-locking pages, we need to know whether or not to acquire a
96
* write lock on a page before getting it. This depends on how deep it
97
* is in tree, which we don't know until we acquire the root page. So,
98
* if we need to lock the root page we may have to upgrade it later,
99
* because we won't get the correct lock initially.
101
* Retrieve the root page.
104
/* S_STACK callers start with a write lock, everyone else with a read lock. */
stack = LF_ISSET(S_STACK) ? 1 : 0;
105
lock_mode = stack ? DB_LOCK_WRITE : DB_LOCK_READ;
106
/* Lock page pg first, then fetch it (the assignment of pg is not visible). */
if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
108
if ((ret = __memp_fget(mpf, &pg, 0, &h)) != 0) {
109
/* Did not read it, so we can release the lock */
110
(void)__LPUT(dbc, lock);
115
* Decide if we need to save this page; if we do, write lock it.
116
* We deliberately don't lock-couple on this call. If the tree
117
* is tiny, i.e., one page, and two threads are busily updating
118
* the root page, we're almost guaranteed deadlocks galore, as
119
* each one gets a read lock and then blocks the other's attempt
123
/*
 * NOTE(review): the opening of this condition is missing from this copy.
 * The visible part is true when S_PARENT is set and stop + 1 reaches
 * h->level, or when S_WRITE is set and the page is at LEAFLEVEL.
 */
((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
124
(LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
125
/*
 * Give the page and its lock back, then take the same page again under
 * DB_LOCK_WRITE.  The first failure seen is the one kept in ret.
 */
ret = __memp_fput(mpf, h, 0);
126
if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
130
lock_mode = DB_LOCK_WRITE;
131
if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
133
if ((ret = __memp_fget(mpf, &pg, 0, &h)) != 0) {
134
/* Did not read it, so we can release the lock */
135
(void)__LPUT(dbc, lock);
142
* If appending to the tree, set the record number now -- we have the
145
* Delete only deletes exact matches, read only returns exact matches.
146
* Note, this is different from __bam_search(), which returns non-exact
149
* The record may not exist. We can only return the correct location
150
* for the record immediately after the last record in the tree, so do
154
/* Appending: the target is one past the current total, reported via recnop. */
if (LF_ISSET(S_APPEND)) {
156
*recnop = recno = total + 1;
163
/*
 * Give up unless the caller allows S_PAST_EOF and asked for exactly
 * total + 1.
 */
if (!LF_ISSET(S_PAST_EOF) || recno > total + 1) {
165
* Keep the page locked for serializability.
168
* This leaves the root page locked, which will
169
* eliminate any concurrency. A possible fix
170
* would be to lock the last leaf page instead.
172
ret = __memp_fput(mpf, h, 0);
174
__TLPUT(dbc, lock)) != 0 && ret == 0)
176
/* A clean release reports DB_NOTFOUND; a release failure wins over it. */
return (ret == 0 ? DB_NOTFOUND : ret);
183
* Record numbers in the tree are 0-based, but the recno is
184
* 1-based. All of the calculations below have to take this
193
* There may be logically deleted records on the page.
194
* If there are enough, the record may not exist.
196
/*
 * NOTE(review): the statements guarded by this test are missing.  adjust
 * and deloffset are used by the loop below, so they are presumably set
 * here per page type -- confirm against the complete source.
 */
if (TYPE(h) == P_LBTREE) {
203
/* Walk the page entries in steps of adjust, with t_recno starting at 0. */
for (t_recno = 0, indx = 0;; indx += adjust) {
204
/* Ran off the end of the page. */
if (indx >= NUM_ENT(h)) {
206
if (!LF_ISSET(S_PAST_EOF) ||
207
recno > t_recno + 1) {
208
ret = __memp_fput(mpf, h, 0);
210
if ((t_ret = __TLPUT(dbc,
211
lock)) != 0 && ret == 0)
218
/*
 * Only entries whose item at indx + deloffset is not flagged by B_DISSET
 * take part in the rest of this test; the remainder of the condition is
 * missing from this copy.
 */
if (!B_DISSET(GET_BKEYDATA(dbp, h,
219
indx + deloffset)->type) &&
224
/* Correct from 1-based to 0-based for a page offset. */
225
BT_STK_ENTER(dbp->dbenv,
226
cp, h, indx, lock, lock_mode, ret);
231
/*
 * Btree internal entries: stop at the last entry, or as soon as the
 * running total plus this entry nrecs reaches recno.  indx is advanced
 * before the test, so it ends up one past the entry examined.
 */
for (indx = 0, top = NUM_ENT(h);;) {
232
bi = GET_BINTERNAL(dbp, h, indx);
233
if (++indx == top || total + bi->nrecs >= recno)
242
/* Correct from 1-based to 0-based for a page offset. */
244
/* Here recno itself is the page index, unlike the indx passed elsewhere. */
BT_STK_ENTER(dbp->dbenv,
245
cp, h, recno, lock, lock_mode, ret);
250
/* Same scan as the btree internal case, over recno internal entries. */
for (indx = 0, top = NUM_ENT(h);;) {
251
ri = GET_RINTERNAL(dbp, h, indx);
252
if (++indx == top || total + ri->nrecs >= recno)
259
/* Page type not handled above: report it through __db_pgfmt. */
return (__db_pgfmt(dbp->dbenv, h->pgno));
264
/* Return if this is the lowest page wanted. */
265
if (LF_ISSET(S_PARENT) && stop == h->level) {
266
BT_STK_ENTER(dbp->dbenv,
267
cp, h, indx, lock, lock_mode, ret);
272
/* Not the stopping level: push this page and take the next one for write. */
BT_STK_PUSH(dbp->dbenv,
273
cp, h, indx, lock, lock_mode, ret);
278
lock_mode = DB_LOCK_WRITE;
280
__db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
284
* Decide if we want to return a pointer to the next
285
* page in the stack. If we do, write lock it and
288
/* Note this tests h->level - 1, not h->level as in the first check. */
if ((LF_ISSET(S_PARENT) &&
289
(u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
290
(h->level - 1) == LEAFLEVEL)
293
if ((ret = __memp_fput(mpf, h, 0)) != 0)
298
LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ;
299
/* This is the only __db_lget call here that passes LCK_COUPLE_ALWAYS. */
if ((ret = __db_lget(dbc,
300
LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) {
302
* If we fail, discard the lock we held. This
303
* is OK because this only happens when we are
304
* descending the tree holding read-locks.
306
(void)__LPUT(dbc, lock);
311
if ((ret = __memp_fget(mpf, &pg, 0, &h)) != 0)
316
/*
 * Error exit: put back the current page if one is still held.  The put
 * result is only considered when no earlier error is recorded in ret.
 * Then __bam_stkrel(dbc, 0) is called for the cursor.
 */
err: if (h != NULL && (t_ret = __memp_fput(mpf, h, 0)) != 0 && ret == 0)
320
__bam_stkrel(dbc, 0);
327
* Adjust the tree after adding or deleting a record.
329
* PUBLIC: int __bam_adjust __P((DBC *, int32_t));
332
/*
 * __bam_adjust --
 *	Apply a record-count change to the pages referenced by the cursor
 *	entries from cp->sp through cp->csp.
 *
 * NOTE(review): this copy is incomplete (the interleaved bare numbers look
 * like original source line numbers and have gaps).  Missing here: the
 * parameter and local declarations, the assignment of h for each entry,
 * the right-hand sides of the two nrecs updates, and the error returns.
 *
 * Parameters:
 *	dbc	cursor; dbc->internal is used as a BTREE_CURSOR.
 *	adjust	amount passed to __bam_cadjust_log and RE_NREC_ADJ.
 */
__bam_adjust(dbc, adjust)
346
cp = (BTREE_CURSOR *)dbc->internal;
347
/* Remember the root page number so the root can be treated specially. */
root_pgno = cp->root;
349
/* Update the record counts for the tree. */
350
for (epg = cp->sp; epg <= cp->csp; ++epg) {
352
/* Only internal pages (btree or recno) are adjusted. */
if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) {
353
/*
 * With logging enabled, write a cadjust log record for this page and
 * entry index.  CAD_UPDATEROOT is passed only when the page is the root.
 */
if (DBC_LOGGING(dbc)) {
354
if ((ret = __bam_cadjust_log(dbp, dbc->txn,
355
&LSN(h), 0, PGNO(h), &LSN(h),
356
(u_int32_t)epg->indx, adjust,
357
PGNO(h) == root_pgno ?
358
CAD_UPDATEROOT : 0)) != 0)
361
/* Presumably the not-logging branch; the else line is missing here. */
LSN_NOT_LOGGED(LSN(h));
363
/* Update nrecs on the entry at epg->indx, using the accessor for the page type. */
if (TYPE(h) == P_IBTREE)
364
GET_BINTERNAL(dbp, h, epg->indx)->nrecs +=
367
GET_RINTERNAL(dbp, h, epg->indx)->nrecs +=
370
/* The root page gets an additional adjustment through RE_NREC_ADJ. */
if (PGNO(h) == root_pgno)
371
RE_NREC_ADJ(h, adjust);
373
/* Mark the modified page dirty in the memory pool. */
if ((ret = __memp_fset(mpf, h, DB_MPOOL_DIRTY)) != 0)
382
* Return the number of records in the tree.
384
* PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *));
387
/*
 * __bam_nrecs --
 *	Return the number of records in the tree, working from the root
 *	page of the cursor.
 *
 * NOTE(review): this copy is incomplete (the interleaved bare numbers look
 * like original source line numbers and have gaps).  The statement that
 * stores the count through rep and the early error returns are missing.
 *
 * Parameters:
 *	dbc	cursor; its internal root page number is the page read.
 *	rep	presumably receives the record count -- no surviving line
 *		writes it, confirm against the complete source.
 */
__bam_nrecs(dbc, rep)
401
pgno = dbc->internal->root;
402
/* Take a read lock on the root page before fetching it. */
if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0)
404
if ((ret = __memp_fget(mpf, &pgno, 0, &h)) != 0)
409
/* Release the page, then the lock; the first failure is the one kept. */
ret = __memp_fput(mpf, h, 0);
410
if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
418
* Return the number of records below a page.
420
* PUBLIC: db_recno_t __bam_total __P((DB *, PAGE *));
435
/*
 * NOTE(review): fragment of the body of __bam_total (name taken from the
 * PUBLIC prototype comment just above).  The function header, the
 * declarations, the per-page-type branch labels and the return statement
 * are missing from this copy, so the four loops below are presumably one
 * per page type -- confirm against the complete source.
 */
/* Check for logically deleted records. */
436
/* Entries are visited P_INDX apart; the item tested is at indx + O_INDX. */
for (indx = 0; indx < top; indx += P_INDX)
438
GET_BKEYDATA(dbp, h, indx + O_INDX)->type))
442
/* Check for logically deleted records. */
443
/* Every entry is visited; the item tested is at indx itself. */
for (indx = 0; indx < top; indx += O_INDX)
444
if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
448
/* Sum the nrecs field of every btree internal entry on the page. */
for (indx = 0; indx < top; indx += O_INDX)
449
nrecs += GET_BINTERNAL(dbp, h, indx)->nrecs;
455
/* Sum the nrecs field of every recno internal entry on the page. */
for (indx = 0; indx < top; indx += O_INDX)
456
nrecs += GET_RINTERNAL(dbp, h, indx)->nrecs;