2
* See the file LICENSE for redistribution information.
4
* Copyright (c) 1996-2002
5
* Sleepycat Software. All rights reserved.
8
* Copyright (c) 1990, 1993, 1994, 1995, 1996
9
* Keith Bostic. All rights reserved.
12
* Copyright (c) 1990, 1993, 1994, 1995
13
* The Regents of the University of California. All rights reserved.
15
* This code is derived from software contributed to Berkeley by
18
* Redistribution and use in source and binary forms, with or without
19
* modification, are permitted provided that the following conditions
21
* 1. Redistributions of source code must retain the above copyright
22
* notice, this list of conditions and the following disclaimer.
23
* 2. Redistributions in binary form must reproduce the above copyright
24
* notice, this list of conditions and the following disclaimer in the
25
* documentation and/or other materials provided with the distribution.
26
* 3. Neither the name of the University nor the names of its contributors
27
* may be used to endorse or promote products derived from this software
28
* without specific prior written permission.
30
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43
#include "db_config.h"
46
static const char revid[] = "$Id$";
49
#ifndef NO_SYSTEM_INCLUDES
50
#include <sys/types.h>
56
#include "dbinc/db_page.h"
57
#include "dbinc/db_shash.h"
58
#include "dbinc/btree.h"
59
#include "dbinc/lock.h"
63
* Search a btree for a key.
65
* PUBLIC: int __bam_search __P((DBC *, db_pgno_t,
66
* PUBLIC: const DBT *, u_int32_t, int, db_recno_t *, int *));
69
__bam_search(dbc, root_pgno, key, flags, stop, recnop, exactp)
83
db_indx_t base, i, indx, *inp, lim;
84
db_lockmode_t lock_mode;
87
int adjust, cmp, deloffset, ret, stack;
88
int (*func) __P((DB *, const DBT *, const DBT *));
92
cp = (BTREE_CURSOR *)dbc->internal;
99
* There are several ways we search a btree tree. The flags argument
100
* specifies if we're acquiring read or write locks, if we position
101
* to the first or last item in a set of duplicates, if we return
102
* deleted items, and if we are locking pairs of pages. In addition,
103
* if we're modifying record numbers, we have to lock the entire tree
104
* regardless. See btree.h for more details.
106
* If write-locking pages, we need to know whether or not to acquire a
107
* write lock on a page before getting it. This depends on how deep it
108
* is in tree, which we don't know until we acquire the root page. So,
109
* if we need to lock the root page we may have to upgrade it later,
110
* because we won't get the correct lock initially.
112
* Retrieve the root page.
115
pg = root_pgno == PGNO_INVALID ? cp->root : root_pgno;
116
stack = LF_ISSET(S_STACK) && F_ISSET(cp, C_RECNUM);
117
lock_mode = stack ? DB_LOCK_WRITE : DB_LOCK_READ;
118
if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
120
if ((ret = mpf->get(mpf, &pg, 0, &h)) != 0) {
121
/* Did not read it, so we can release the lock */
122
(void)__LPUT(dbc, lock);
127
* Decide if we need to save this page; if we do, write lock it.
128
* We deliberately don't lock-couple on this call. If the tree
129
* is tiny, i.e., one page, and two threads are busily updating
130
* the root page, we're almost guaranteed deadlocks galore, as
131
* each one gets a read lock and then blocks the other's attempt
135
((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
136
(LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
137
(void)mpf->put(mpf, h, 0);
138
(void)__LPUT(dbc, lock);
139
lock_mode = DB_LOCK_WRITE;
140
if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
142
if ((ret = mpf->get(mpf, &pg, 0, &h)) != 0) {
143
/* Did not read it, so we can release the lock */
144
(void)__LPUT(dbc, lock);
147
if (!((LF_ISSET(S_PARENT) &&
148
(u_int8_t)(stop + 1) >= h->level) ||
149
(LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
150
/* Someone else split the root, start over. */
151
(void)mpf->put(mpf, h, 0);
152
(void)__LPUT(dbc, lock);
158
/* Choose a comparison function. */
159
func = F_ISSET(dbc, DBC_OPD) ?
160
(dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) :
166
* Do a binary search on the current page. If we're searching
167
* a Btree leaf page, we have to walk the indices in groups of
168
* two. If we're searching an internal page or a off-page dup
169
* page, they're an index per page item. If we find an exact
170
* match on a leaf page, we're done.
172
adjust = TYPE(h) == P_LBTREE ? P_INDX : O_INDX;
174
lim = NUM_ENT(h) / (db_indx_t)adjust; lim != 0; lim >>= 1) {
175
indx = base + ((lim >> 1) * adjust);
177
__bam_cmp(dbp, key, h, indx, func, &cmp)) != 0)
180
if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP)
185
base = indx + adjust;
191
* No match found. Base is the smallest index greater than
192
* key and may be zero or a last + O_INDX index.
194
* If it's a leaf page, return base as the "found" value.
195
* Delete only deletes exact matches.
197
if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP) {
200
if (LF_ISSET(S_EXACT))
203
if (LF_ISSET(S_STK_ONLY)) {
204
BT_STK_NUM(dbp->dbenv, cp, h, base, ret);
206
(void)mpf->put(mpf, h, 0);
212
* Possibly returning a deleted record -- DB_SET_RANGE,
213
* DB_KEYFIRST and DB_KEYLAST don't require an exact
214
* match, and we don't want to walk multiple pages here
215
* to find an undeleted record. This is handled by the
218
BT_STK_ENTER(dbp->dbenv,
219
cp, h, base, lock, lock_mode, ret);
226
* If it's not a leaf page, record the internal page (which is
227
* a parent page for the key). Decrement the base by 1 if it's
228
* non-zero so that if a split later occurs, the inserted page
229
* will be to the right of the saved page.
231
indx = base > 0 ? base - O_INDX : base;
234
* If we're trying to calculate the record number, sum up
235
* all the record numbers on this page up to the indx point.
237
next: if (recnop != NULL)
238
for (i = 0; i < indx; ++i)
239
recno += GET_BINTERNAL(dbp, h, i)->nrecs;
241
pg = GET_BINTERNAL(dbp, h, indx)->pgno;
243
if (LF_ISSET(S_STK_ONLY)) {
244
if (stop == h->level) {
245
BT_STK_NUM(dbp->dbenv, cp, h, indx, ret);
247
(void)mpf->put(mpf, h, 0);
250
BT_STK_NUMPUSH(dbp->dbenv, cp, h, indx, ret);
251
(void)mpf->put(mpf, h, 0);
252
if ((ret = __db_lget(dbc,
253
LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) {
255
* Discard our lock and return on failure. This
256
* is OK because it only happens when descending
257
* the tree holding read-locks.
263
/* Return if this is the lowest page wanted. */
264
if (LF_ISSET(S_PARENT) && stop == h->level) {
265
BT_STK_ENTER(dbp->dbenv,
266
cp, h, indx, lock, lock_mode, ret);
271
BT_STK_PUSH(dbp->dbenv,
272
cp, h, indx, lock, lock_mode, ret);
276
lock_mode = DB_LOCK_WRITE;
278
__db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
282
* Decide if we want to return a reference to the next
283
* page in the return stack. If so, lock it and never
286
if ((LF_ISSET(S_PARENT) &&
287
(u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
288
(h->level - 1) == LEAFLEVEL)
291
(void)mpf->put(mpf, h, 0);
294
LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ;
295
if ((ret = __db_lget(dbc,
296
LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) {
298
* If we fail, discard the lock we held. This
299
* is OK because this only happens when we are
300
* descending the tree holding read-locks.
306
if ((ret = mpf->get(mpf, &pg, 0, &h)) != 0)
314
* If we're trying to calculate the record number, add in the
315
* offset on this page and correct for the fact that records
316
* in the tree are 0-based.
319
*recnop = recno + (indx / P_INDX) + 1;
322
* If we got here, we know that we have a Btree leaf or off-page
323
* duplicates page. If it's a Btree leaf page, we have to handle
324
* on-page duplicates.
326
* If there are duplicates, go to the first/last one. This is
327
* safe because we know that we're not going to leave the page,
328
* all duplicate sets that are not on overflow pages exist on a
331
if (TYPE(h) == P_LBTREE) {
332
if (LF_ISSET(S_DUPLAST))
333
while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
334
inp[indx] == inp[indx + P_INDX])
338
inp[indx] == inp[indx - P_INDX])
343
* Now check if we are allowed to return deleted items; if not, then
344
* find the next (or previous) non-deleted duplicate entry. (We do
345
* not move from the original found key on the basis of the S_DELNO
348
if (LF_ISSET(S_DELNO)) {
349
deloffset = TYPE(h) == P_LBTREE ? O_INDX : 0;
350
if (LF_ISSET(S_DUPLAST))
351
while (B_DISSET(GET_BKEYDATA(dbp,
352
h, indx + deloffset)->type) && indx > 0 &&
353
inp[indx] == inp[indx - adjust])
356
while (B_DISSET(GET_BKEYDATA(dbp,
357
h, indx + deloffset)->type) &&
358
indx < (db_indx_t)(NUM_ENT(h) - adjust) &&
359
inp[indx] == inp[indx + adjust])
363
* If we weren't able to find a non-deleted duplicate, return
366
if (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type))
370
if (LF_ISSET(S_STK_ONLY)) {
371
BT_STK_NUM(dbp->dbenv, cp, h, indx, ret);
373
(void)mpf->put(mpf, h, 0);
375
BT_STK_ENTER(dbp->dbenv, cp, h, indx, lock, lock_mode, ret);
382
/* Keep the page locked for serializability. */
383
(void)mpf->put(mpf, h, 0);
384
(void)__TLPUT(dbc, lock);
388
__bam_stkrel(dbc, 0);
394
* Release all pages currently held in the stack.
396
* PUBLIC: int __bam_stkrel __P((DBC *, u_int32_t));
399
__bam_stkrel(dbc, flags)
411
cp = (BTREE_CURSOR *)dbc->internal;
414
* Release inner pages first.
416
* The caller must be sure that setting STK_NOLOCK will not effect
417
* either serializability or recoverability.
419
for (ret = 0, epg = cp->sp; epg <= cp->csp; ++epg) {
420
if (epg->page != NULL) {
421
if (LF_ISSET(STK_CLRDBC) && cp->page == epg->page) {
426
mpf->put(mpf, epg->page, 0)) != 0 && ret == 0)
430
* Temporary fix for #3243 -- under certain deadlock
431
* conditions we call here again and re-free the page.
432
* The correct fix is to never release a stack that
433
* doesn't hold items.
437
if (LF_ISSET(STK_NOLOCK))
438
(void)__LPUT(dbc, epg->lock);
440
(void)__TLPUT(dbc, epg->lock);
443
/* Clear the stack, all pages have been released. */
453
* PUBLIC: int __bam_stkgrow __P((DB_ENV *, BTREE_CURSOR *));
456
__bam_stkgrow(dbenv, cp)
464
entries = cp->esp - cp->sp;
466
if ((ret = __os_calloc(dbenv, entries * 2, sizeof(EPG), &p)) != 0)
468
memcpy(p, cp->sp, entries * sizeof(EPG));
469
if (cp->sp != cp->stack)
470
__os_free(dbenv, cp->sp);
472
cp->csp = p + entries;
473
cp->esp = p + entries * 2;