2
* See the file LICENSE for redistribution information.
4
* Copyright (c) 1996-2002
5
* Sleepycat Software. All rights reserved.
8
* Copyright (c) 1990, 1993, 1994, 1995, 1996
9
* Keith Bostic. All rights reserved.
12
* Copyright (c) 1990, 1993
13
* The Regents of the University of California. All rights reserved.
15
* Redistribution and use in source and binary forms, with or without
16
* modification, are permitted provided that the following conditions
18
* 1. Redistributions of source code must retain the above copyright
19
* notice, this list of conditions and the following disclaimer.
20
* 2. Redistributions in binary form must reproduce the above copyright
21
* notice, this list of conditions and the following disclaimer in the
22
* documentation and/or other materials provided with the distribution.
23
* 3. Neither the name of the University nor the names of its contributors
24
* may be used to endorse or promote products derived from this software
25
* without specific prior written permission.
27
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40
#include "db_config.h"
43
static const char revid[] = "$Id$";
46
#ifndef NO_SYSTEM_INCLUDES
47
#include <sys/types.h>
51
#include "dbinc/db_page.h"
52
#include "dbinc/btree.h"
53
#include "dbinc/db_shash.h"
54
#include "dbinc/lock.h"
58
* Search a btree for a record number.
60
* PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
63
__bam_rsearch(dbc, recnop, flags, stop, exactp)
76
db_indx_t adjust, deloffset, indx, top;
77
db_lockmode_t lock_mode;
79
db_recno_t recno, t_recno, total;
84
cp = (BTREE_CURSOR *)dbc->internal;
89
* There are several ways we search a btree. The flags argument
90
* specifies if we're acquiring read or write locks and if we are
91
* locking pairs of pages. In addition, if we're adding or deleting
92
* an item, we have to lock the entire tree, regardless. See btree.h
95
* If write-locking pages, we need to know whether or not to acquire a
96
* write lock on a page before getting it. This depends on how deep it
97
* is in the tree, which we don't know until we acquire the root page. So,
98
* if we need to lock the root page we may have to upgrade it later,
99
* because we won't get the correct lock initially.
101
* Retrieve the root page.
104
stack = LF_ISSET(S_STACK) ? 1 : 0;
105
lock_mode = stack ? DB_LOCK_WRITE : DB_LOCK_READ;
106
if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
108
if ((ret = mpf->get(mpf, &pg, 0, &h)) != 0) {
109
/* Did not read it, so we can release the lock */
110
(void)__LPUT(dbc, lock);
115
* Decide if we need to save this page; if we do, write lock it.
116
* We deliberately don't lock-couple on this call. If the tree
117
* is tiny, i.e., one page, and two threads are busily updating
118
* the root page, we're almost guaranteed deadlocks galore, as
119
* each one gets a read lock and then blocks the other's attempt
123
((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
124
(LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
125
(void)mpf->put(mpf, h, 0);
126
(void)__LPUT(dbc, lock);
127
lock_mode = DB_LOCK_WRITE;
128
if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
130
if ((ret = mpf->get(mpf, &pg, 0, &h)) != 0) {
131
/* Did not read it, so we can release the lock */
132
(void)__LPUT(dbc, lock);
139
* If appending to the tree, set the record number now -- we have the
142
* Delete only deletes exact matches, read only returns exact matches.
143
* Note, this is different from __bam_search(), which returns non-exact
146
* The record may not exist. We can only return the correct location
147
* for the record immediately after the last record in the tree, so do
151
if (LF_ISSET(S_APPEND)) {
153
*recnop = recno = total + 1;
160
if (!LF_ISSET(S_PAST_EOF) || recno > total + 1) {
162
* Keep the page locked for serializability.
165
* This leaves the root page locked, which will
166
* eliminate any concurrency. A possible fix
167
* would be to lock the last leaf page instead.
169
(void)mpf->put(mpf, h, 0);
170
(void)__TLPUT(dbc, lock);
171
return (DB_NOTFOUND);
178
* Record numbers in the tree are 0-based, but the recno is
179
* 1-based. All of the calculations below have to take this
188
* There may be logically deleted records on the page.
189
* If there are enough, the record may not exist.
191
if (TYPE(h) == P_LBTREE) {
198
for (t_recno = 0, indx = 0;; indx += adjust) {
199
if (indx >= NUM_ENT(h)) {
201
if (!LF_ISSET(S_PAST_EOF) ||
202
recno > t_recno + 1) {
207
if (!B_DISSET(GET_BKEYDATA(dbp, h,
208
indx + deloffset)->type) &&
213
/* Correct from 1-based to 0-based for a page offset. */
214
BT_STK_ENTER(dbp->dbenv,
215
cp, h, indx, lock, lock_mode, ret);
220
for (indx = 0, top = NUM_ENT(h);;) {
221
bi = GET_BINTERNAL(dbp, h, indx);
222
if (++indx == top || total + bi->nrecs >= recno)
231
/* Correct from 1-based to 0-based for a page offset. */
233
BT_STK_ENTER(dbp->dbenv,
234
cp, h, recno, lock, lock_mode, ret);
239
for (indx = 0, top = NUM_ENT(h);;) {
240
ri = GET_RINTERNAL(dbp, h, indx);
241
if (++indx == top || total + ri->nrecs >= recno)
248
return (__db_pgfmt(dbp->dbenv, h->pgno));
253
/* Return if this is the lowest page wanted. */
254
if (LF_ISSET(S_PARENT) && stop == h->level) {
255
BT_STK_ENTER(dbp->dbenv,
256
cp, h, indx, lock, lock_mode, ret);
261
BT_STK_PUSH(dbp->dbenv,
262
cp, h, indx, lock, lock_mode, ret);
266
lock_mode = DB_LOCK_WRITE;
268
__db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
272
* Decide if we want to return a pointer to the next
273
* page in the stack. If we do, write lock it and
276
if ((LF_ISSET(S_PARENT) &&
277
(u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
278
(h->level - 1) == LEAFLEVEL)
281
(void)mpf->put(mpf, h, 0);
284
LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ;
285
if ((ret = __db_lget(dbc,
286
LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) {
288
* If we fail, discard the lock we held. This
289
* is OK because this only happens when we are
290
* descending the tree holding read-locks.
297
if ((ret = mpf->get(mpf, &pg, 0, &h)) != 0)
303
__bam_stkrel(dbc, 0);
309
* Adjust the tree after adding or deleting a record.
311
* PUBLIC: int __bam_adjust __P((DBC *, int32_t));
314
__bam_adjust(dbc, adjust)
328
cp = (BTREE_CURSOR *)dbc->internal;
329
root_pgno = cp->root;
331
/* Update the record counts for the tree. */
332
for (epg = cp->sp; epg <= cp->csp; ++epg) {
334
if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) {
335
if (DBC_LOGGING(dbc)) {
336
if ((ret = __bam_cadjust_log(dbp, dbc->txn,
337
&LSN(h), 0, PGNO(h), &LSN(h),
338
(u_int32_t)epg->indx, adjust,
339
PGNO(h) == root_pgno ?
340
CAD_UPDATEROOT : 0)) != 0)
343
LSN_NOT_LOGGED(LSN(h));
345
if (TYPE(h) == P_IBTREE)
346
GET_BINTERNAL(dbp, h, epg->indx)->nrecs +=
349
GET_RINTERNAL(dbp, h, epg->indx)->nrecs +=
352
if (PGNO(h) == root_pgno)
353
RE_NREC_ADJ(h, adjust);
355
if ((ret = mpf->set(mpf, h, DB_MPOOL_DIRTY)) != 0)
364
* Return the number of records in the tree.
366
* PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *));
369
__bam_nrecs(dbc, rep)
383
pgno = dbc->internal->root;
384
if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0)
386
if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0)
391
(void)mpf->put(mpf, h, 0);
392
(void)__TLPUT(dbc, lock);
399
* Return the number of records below a page.
401
* PUBLIC: db_recno_t __bam_total __P((DB *, PAGE *));
416
/* Check for logically deleted records. */
417
for (indx = 0; indx < top; indx += P_INDX)
419
GET_BKEYDATA(dbp, h, indx + O_INDX)->type))
423
/* Check for logically deleted records. */
424
for (indx = 0; indx < top; indx += O_INDX)
425
if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
429
for (indx = 0; indx < top; indx += O_INDX)
430
nrecs += GET_BINTERNAL(dbp, h, indx)->nrecs;
436
for (indx = 0; indx < top; indx += O_INDX)
437
nrecs += GET_RINTERNAL(dbp, h, indx)->nrecs;