2
* See the file LICENSE for redistribution information.
4
* Copyright (c) 1996-2002
5
* Sleepycat Software. All rights reserved.
11
static const char revid[] = "$Id$";
14
#ifndef NO_SYSTEM_INCLUDES
15
#include <sys/types.h>
21
#include "dbinc/db_shash.h"
22
#include "dbinc/lock.h"
23
#include "dbinc/txn.h"
24
#include "dbinc/rep.h"
26
#define ISSET_MAP(M, N) ((M)[(N) / 32] & (1 << (N) % 32))
28
#define CLEAR_MAP(M, N) { \
30
for (__i = 0; __i < (N); __i++) \
34
#define SET_MAP(M, B) ((M)[(B) / 32] |= (1 << ((B) % 32)))
35
#define CLR_MAP(M, B) ((M)[(B) / 32] &= ~(1 << ((B) % 32)))
37
#define OR_MAP(D, S, N) { \
39
for (__i = 0; __i < (N); __i++) \
42
#define BAD_KILLID 0xffffffff
50
u_int32_t last_locker_id;
54
static int __dd_abort __P((DB_ENV *, locker_info *));
55
static int __dd_build __P((DB_ENV *,
56
u_int32_t, u_int32_t **, u_int32_t *, u_int32_t *, locker_info **));
57
static int __dd_find __P((DB_ENV *,
58
u_int32_t *, locker_info *, u_int32_t, u_int32_t, u_int32_t ***));
59
static int __dd_isolder __P((u_int32_t, u_int32_t, u_int32_t, u_int32_t));
60
static int __dd_verify __P((locker_info *, u_int32_t *, u_int32_t *,
61
u_int32_t *, u_int32_t, u_int32_t, u_int32_t));
64
static void __dd_debug
65
__P((DB_ENV *, locker_info *, u_int32_t *, u_int32_t, u_int32_t));
71
* PUBLIC: int __lock_detect __P((DB_ENV *, u_int32_t, u_int32_t, int *));
74
__lock_detect(dbenv, flags, atype, abortp)
76
u_int32_t flags, atype;
79
DB_LOCKREGION *region;
83
u_int32_t *bitmap, *copymap, **deadp, **free_me, *tmpmap;
84
u_int32_t i, keeper, killid, limit, nalloc, nlockers;
85
u_int32_t lock_max, txn_max;
89
ENV_REQUIRES_CONFIG(dbenv,
90
dbenv->lk_handle, "DB_ENV->lock_detect", DB_INIT_LOCK);
92
/* Validate arguments. */
93
if ((ret = __db_fchk(dbenv, "DB_ENV->lock_detect", flags, 0)) != 0)
98
case DB_LOCK_MAXLOCKS:
99
case DB_LOCK_MINLOCKS:
100
case DB_LOCK_MINWRITE:
103
case DB_LOCK_YOUNGEST:
107
"DB_ENV->lock_detect: unknown deadlock detection mode specified");
112
* If this environment is a replication client, then we must use the
113
* MINWRITE detection discipline.
115
if (__rep_is_client(dbenv))
116
atype = DB_LOCK_MINWRITE;
120
lt = dbenv->lk_handle;
124
/* Check if a detector run is necessary. */
125
LOCKREGION(dbenv, lt);
127
/* Make a pass only if auto-detect would run. */
128
region = lt->reginfo.primary;
130
if (region->need_dd == 0) {
131
UNLOCKREGION(dbenv, lt);
135
/* Reset need_dd, so we know we've run the detector. */
138
/* Build the waits-for bitmap. */
139
ret = __dd_build(dbenv, atype, &bitmap, &nlockers, &nalloc, &idmap);
140
lock_max = region->stat.st_cur_maxid;
141
UNLOCKREGION(dbenv, lt);
144
* We need the cur_maxid from the txn region as well. In order
145
* to avoid tricky synchronization between the lock and txn
146
* regions, we simply unlock the lock region and then lock the
147
* txn region. This introduces a small window during which the
148
* transaction system could then wrap. We're willing to return
149
* the wrong answer for "oldest" or "youngest" in those rare
152
tmgr = dbenv->tx_handle;
154
R_LOCK(dbenv, &tmgr->reginfo);
155
txn_max = ((DB_TXNREGION *)tmgr->reginfo.primary)->cur_maxid;
156
R_UNLOCK(dbenv, &tmgr->reginfo);
158
txn_max = TXN_MAXIMUM;
159
if (ret != 0 || atype == DB_LOCK_EXPIRE)
165
if (FLD_ISSET(dbenv->verbose, DB_VERB_WAITSFOR))
166
__dd_debug(dbenv, idmap, bitmap, nlockers, nalloc);
168
/* Now duplicate the bitmaps so we can verify deadlock participants. */
169
if ((ret = __os_calloc(dbenv, (size_t)nlockers,
170
sizeof(u_int32_t) * nalloc, ©map)) != 0)
172
memcpy(copymap, bitmap, nlockers * sizeof(u_int32_t) * nalloc);
174
if ((ret = __os_calloc(dbenv, sizeof(u_int32_t), nalloc, &tmpmap)) != 0)
177
/* Find a deadlock. */
179
__dd_find(dbenv, bitmap, idmap, nlockers, nalloc, &deadp)) != 0)
184
for (; *deadp != NULL; deadp++) {
187
killid = (u_int32_t)((*deadp - bitmap) / nalloc);
191
if (atype == DB_LOCK_DEFAULT || atype == DB_LOCK_RANDOM)
194
* It's conceivable that under XA, the locker could
197
if (killid == BAD_KILLID)
201
* Start with the id that we know is deadlocked
202
* and then examine all other set bits and see
203
* if any are a better candidate for abortion
204
* and that they are genuinely part of the
205
* deadlock. The definition of "best":
206
* OLDEST: smallest id
207
* YOUNGEST: largest id
208
* MAXLOCKS: maximum count
209
* MINLOCKS: minimum count
210
* MINWRITE: minimum count
213
for (i = (killid + 1) % nlockers;
215
i = (i + 1) % nlockers) {
216
if (!ISSET_MAP(*deadp, i))
220
if (__dd_isolder(idmap[killid].id,
221
idmap[i].id, lock_max, txn_max))
225
case DB_LOCK_YOUNGEST:
226
if (__dd_isolder(idmap[i].id,
227
idmap[killid].id, lock_max, txn_max))
231
case DB_LOCK_MAXLOCKS:
232
if (idmap[i].count < idmap[killid].count)
236
case DB_LOCK_MINLOCKS:
237
case DB_LOCK_MINWRITE:
238
if (idmap[i].count > idmap[killid].count)
247
if (__dd_verify(idmap, *deadp,
248
tmpmap, copymap, nlockers, nalloc, i))
252
dokill: if (killid == BAD_KILLID)
256
* There are cases in which our general algorithm will
257
* fail. Returning 1 from verify indicates that the
258
* particular locker is not only involved in a deadlock,
259
* but that killing him will allow others to make forward
260
* progress. Unfortunately, there are cases where we need
261
* to abort someone, but killing them will not necessarily
262
* ensure forward progress (imagine N readers all trying to
263
* acquire a write lock). In such a scenario, we'll have
264
* gotten all the way through the loop, we will have found
265
* someone to keep (keeper will be valid), but killid will
266
* still be the initial deadlocker. In this case, if the
267
* initial killid satisfies __dd_verify, kill it, else abort
268
* keeper and indicate that we need to run deadlock detection
272
if (keeper != BAD_KILLID && killid == limit &&
273
__dd_verify(idmap, *deadp,
274
tmpmap, copymap, nlockers, nalloc, killid) == 0) {
275
LOCKREGION(dbenv, lt);
277
UNLOCKREGION(dbenv, lt);
281
/* Kill the locker with lockid idmap[killid]. */
282
if ((ret = __dd_abort(dbenv, &idmap[killid])) != 0) {
284
* It's possible that the lock was already aborted;
285
* this isn't necessarily a problem, so do not treat
288
if (ret == DB_ALREADY_ABORTED)
292
"warning: unable to abort locker %lx",
293
(u_long)idmap[killid].id);
294
} else if (FLD_ISSET(dbenv->verbose, DB_VERB_DEADLOCK))
296
"Aborting locker %lx", (u_long)idmap[killid].id);
298
__os_free(dbenv, tmpmap);
299
err1: __os_free(dbenv, copymap);
301
err: if (free_me != NULL)
302
__os_free(dbenv, free_me);
303
__os_free(dbenv, bitmap);
304
__os_free(dbenv, idmap);
310
* ========================================================================
314
# define DD_INVALID_ID ((u_int32_t) -1)
317
__dd_build(dbenv, atype, bmp, nlockers, allocp, idmap)
319
u_int32_t atype, **bmp, *nlockers, *allocp;
322
struct __db_lock *lp;
323
DB_LOCKER *lip, *lockerp, *child;
325
DB_LOCKREGION *region;
327
locker_info *id_array;
329
u_int32_t *bitmap, count, dd, *entryp, id, ndx, nentries, *tmpmap;
331
int expire_only, is_first, need_timeout, ret;
333
lt = dbenv->lk_handle;
334
region = lt->reginfo.primary;
335
LOCK_SET_TIME_INVALID(&now);
337
expire_only = atype == DB_LOCK_EXPIRE;
340
* While we always check for expired timeouts, if we are called
341
* with DB_LOCK_EXPIRE, then we are only checking for timeouts
342
* (i.e., not doing deadlock detection at all). If we aren't
343
* doing real deadlock detection, then we can skip a significant,
344
* amount of the processing. In particular we do not build
345
* the conflict array and our caller needs to expect this.
354
* We'll check how many lockers there are, add a few more in for
355
* good measure and then allocate all the structures. Then we'll
356
* verify that we have enough room when we go back in and get the
357
* mutex the second time.
359
retry: count = region->stat.st_nlockers;
366
if (FLD_ISSET(dbenv->verbose, DB_VERB_DEADLOCK))
367
__db_err(dbenv, "%lu lockers", (u_long)count);
370
nentries = ALIGN(count, 32) / 32;
373
* Allocate enough space for a count by count bitmap matrix.
376
* We can probably save the malloc's between iterations just
377
* reallocing if necessary because count grew by too much.
379
if ((ret = __os_calloc(dbenv, (size_t)count,
380
sizeof(u_int32_t) * nentries, &bitmap)) != 0)
383
if ((ret = __os_calloc(dbenv,
384
sizeof(u_int32_t), nentries, &tmpmap)) != 0) {
385
__os_free(dbenv, bitmap);
389
if ((ret = __os_calloc(dbenv,
390
(size_t)count, sizeof(locker_info), &id_array)) != 0) {
391
__os_free(dbenv, bitmap);
392
__os_free(dbenv, tmpmap);
397
* Now go back in and actually fill in the matrix.
399
if (region->stat.st_nlockers > count) {
400
__os_free(dbenv, bitmap);
401
__os_free(dbenv, tmpmap);
402
__os_free(dbenv, id_array);
407
* First we go through and assign each locker a deadlock detector id.
409
for (id = 0, lip = SH_TAILQ_FIRST(®ion->lockers, __db_locker);
411
lip = SH_TAILQ_NEXT(lip, ulinks, __db_locker)) {
412
if (F_ISSET(lip, DB_LOCKER_INABORT))
414
if (lip->master_locker == INVALID_ROFF) {
416
id_array[lip->dd_id].id = lip->id;
417
if (atype == DB_LOCK_MINLOCKS ||
418
atype == DB_LOCK_MAXLOCKS)
419
id_array[lip->dd_id].count = lip->nlocks;
420
if (atype == DB_LOCK_MINWRITE)
421
id_array[lip->dd_id].count = lip->nwrites;
423
lip->dd_id = DD_INVALID_ID;
428
* We only need consider objects that have waiters, so we use
429
* the list of objects with waiters (dd_objs) instead of traversing
430
* the entire hash table. For each object, we traverse the waiters
431
* list and add an entry in the waitsfor matrix for each waiter/holder
435
for (op = SH_TAILQ_FIRST(®ion->dd_objs, __db_lockobj);
436
op != NULL; op = SH_TAILQ_NEXT(op, dd_links, __db_lockobj)) {
439
CLEAR_MAP(tmpmap, nentries);
442
* First we go through and create a bit map that
443
* represents all the holders of this object.
445
for (lp = SH_TAILQ_FIRST(&op->holders, __db_lock);
447
lp = SH_TAILQ_NEXT(lp, links, __db_lock)) {
448
LOCKER_LOCK(lt, region, lp->holder, ndx);
449
if ((ret = __lock_getlocker(lt,
450
lp->holder, ndx, 0, &lockerp)) != 0)
452
if (F_ISSET(lockerp, DB_LOCKER_INABORT))
455
if (lockerp->dd_id == DD_INVALID_ID) {
456
dd = ((DB_LOCKER *)R_ADDR(<->reginfo,
457
lockerp->master_locker))->dd_id;
459
if (atype == DB_LOCK_MINLOCKS ||
460
atype == DB_LOCK_MAXLOCKS)
461
id_array[dd].count += lockerp->nlocks;
462
if (atype == DB_LOCK_MINWRITE)
463
id_array[dd].count += lockerp->nwrites;
467
id_array[dd].valid = 1;
470
* If the holder has already been aborted, then
471
* we should ignore it for now.
473
if (lp->status == DB_LSTAT_HELD)
478
* Next, for each waiter, we set its row in the matrix
479
* equal to the map of holders we set up above.
483
lp = SH_TAILQ_FIRST(&op->waiters, __db_lock);
486
lp = SH_TAILQ_NEXT(lp, links, __db_lock)) {
487
LOCKER_LOCK(lt, region, lp->holder, ndx);
488
if ((ret = __lock_getlocker(lt,
489
lp->holder, ndx, 0, &lockerp)) != 0)
491
if (lp->status == DB_LSTAT_WAITING) {
492
if (__lock_expired(dbenv,
493
&now, &lockerp->lk_expire)) {
494
lp->status = DB_LSTAT_EXPIRED;
495
MUTEX_UNLOCK(dbenv, &lp->mutex);
499
LOCK_TIME_ISVALID(&lockerp->lk_expire);
505
if (lockerp->dd_id == DD_INVALID_ID) {
506
dd = ((DB_LOCKER *)R_ADDR(<->reginfo,
507
lockerp->master_locker))->dd_id;
509
if (atype == DB_LOCK_MINLOCKS ||
510
atype == DB_LOCK_MAXLOCKS)
511
id_array[dd].count += lockerp->nlocks;
512
if (atype == DB_LOCK_MINWRITE)
513
id_array[dd].count += lockerp->nwrites;
516
id_array[dd].valid = 1;
519
* If the transaction is pending abortion, then
520
* ignore it on this iteration.
522
if (lp->status != DB_LSTAT_WAITING)
525
entryp = bitmap + (nentries * dd);
526
OR_MAP(entryp, tmpmap, nentries);
528
* If this is the first waiter on the queue,
529
* then we remove the waitsfor relationship
530
* with oneself. However, if it's anywhere
531
* else on the queue, then we have to keep
532
* it and we have an automatic deadlock.
535
if (ISSET_MAP(entryp, dd))
536
id_array[dd].self_wait = 1;
543
region->need_dd = need_timeout;
547
/* Now for each locker; record its last lock. */
548
for (id = 0; id < count; id++) {
549
if (!id_array[id].valid)
551
LOCKER_LOCK(lt, region, id_array[id].id, ndx);
552
if ((ret = __lock_getlocker(lt,
553
id_array[id].id, ndx, 0, &lockerp)) != 0) {
555
"No locks for locker %lu", (u_long)id_array[id].id);
560
* If this is a master transaction, try to
561
* find one of its children's locks first,
562
* as they are probably more recent.
564
child = SH_LIST_FIRST(&lockerp->child_locker, __db_locker);
567
lp = SH_LIST_FIRST(&child->heldby, __db_lock);
569
lp->status == DB_LSTAT_WAITING) {
570
id_array[id].last_locker_id = child->id;
573
child = SH_LIST_NEXT(
574
child, child_link, __db_locker);
575
} while (child != NULL);
577
lp = SH_LIST_FIRST(&lockerp->heldby, __db_lock);
579
id_array[id].last_locker_id = lockerp->id;
580
get_lock: id_array[id].last_lock = R_OFFSET(<->reginfo, lp);
581
lo = (DB_LOCKOBJ *)((u_int8_t *)lp + lp->obj);
582
pptr = SH_DBT_PTR(&lo->lockobj);
583
if (lo->lockobj.size >= sizeof(db_pgno_t))
584
memcpy(&id_array[id].pgno,
585
pptr, sizeof(db_pgno_t));
587
id_array[id].pgno = 0;
592
* Pass complete, reset the deadlock detector bit,
593
* unless we have pending timeouts.
595
region->need_dd = need_timeout;
598
* Now we can release everything except the bitmap matrix that we
605
__os_free(dbenv, tmpmap);
610
__dd_find(dbenv, bmp, idmap, nlockers, nalloc, deadp)
612
u_int32_t *bmp, nlockers, nalloc;
616
u_int32_t i, j, k, *mymap, *tmpmap;
618
int ndead, ndeadalloc, ret;
620
#undef INITIAL_DEAD_ALLOC
621
#define INITIAL_DEAD_ALLOC 8
623
ndeadalloc = INITIAL_DEAD_ALLOC;
625
if ((ret = __os_malloc(dbenv,
626
ndeadalloc * sizeof(u_int32_t *), &retp)) != 0)
630
* For each locker, OR in the bits from the lockers on which that
633
for (mymap = bmp, i = 0; i < nlockers; i++, mymap += nalloc) {
636
for (j = 0; j < nlockers; j++) {
637
if (!ISSET_MAP(mymap, j))
640
/* Find the map for this bit. */
641
tmpmap = bmp + (nalloc * j);
642
OR_MAP(mymap, tmpmap, nalloc);
643
if (!ISSET_MAP(mymap, i))
646
/* Make sure we leave room for NULL. */
647
if (ndead + 2 >= ndeadalloc) {
650
* If the alloc fails, then simply return the
651
* deadlocks that we already have.
653
if (__os_realloc(dbenv,
654
ndeadalloc * sizeof(u_int32_t),
661
retp[ndead++] = mymap;
663
/* Mark all participants in this deadlock invalid. */
664
for (k = 0; k < nlockers; k++)
665
if (ISSET_MAP(mymap, k))
676
__dd_abort(dbenv, info)
680
struct __db_lock *lockp;
683
DB_LOCKREGION *region;
688
lt = dbenv->lk_handle;
689
region = lt->reginfo.primary;
691
LOCKREGION(dbenv, lt);
693
/* Find the locker's last lock. */
694
LOCKER_LOCK(lt, region, info->last_locker_id, ndx);
695
if ((ret = __lock_getlocker(lt,
696
info->last_locker_id, ndx, 0, &lockerp)) != 0 || lockerp == NULL) {
698
ret = DB_ALREADY_ABORTED;
702
/* It's possible that this locker was already aborted. */
703
if ((lockp = SH_LIST_FIRST(&lockerp->heldby, __db_lock)) == NULL) {
704
ret = DB_ALREADY_ABORTED;
707
if (R_OFFSET(<->reginfo, lockp) != info->last_lock ||
708
lockp->status != DB_LSTAT_WAITING) {
709
ret = DB_ALREADY_ABORTED;
713
sh_obj = (DB_LOCKOBJ *)((u_int8_t *)lockp + lockp->obj);
714
SH_LIST_REMOVE(lockp, locker_links, __db_lock);
716
/* Abort lock, take it off list, and wake up this lock. */
717
SHOBJECT_LOCK(lt, region, sh_obj, ndx);
718
lockp->status = DB_LSTAT_ABORTED;
719
SH_TAILQ_REMOVE(&sh_obj->waiters, lockp, links, __db_lock);
722
* Either the waiters list is now empty, in which case we remove
723
* it from dd_objs, or it is not empty, in which case we need to
726
if (SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock) == NULL)
727
SH_TAILQ_REMOVE(®ion->dd_objs,
728
sh_obj, dd_links, __db_lockobj);
730
ret = __lock_promote(lt, sh_obj, 0);
731
MUTEX_UNLOCK(dbenv, &lockp->mutex);
733
region->stat.st_ndeadlocks++;
734
UNLOCKREGION(dbenv, lt);
738
out: UNLOCKREGION(dbenv, lt);
744
__dd_debug(dbenv, idmap, bitmap, nlockers, nalloc)
747
u_int32_t *bitmap, nlockers, nalloc;
749
u_int32_t i, j, *mymap;
752
__db_err(dbenv, "Waitsfor array\nWaiter:\tWaiting on:");
754
/* Allocate space to print 10 bytes per item waited on. */
756
#define MSGBUF_LEN ((nlockers + 1) * 10 + 64)
757
if (__os_malloc(dbenv, MSGBUF_LEN, &msgbuf) != 0)
760
for (mymap = bitmap, i = 0; i < nlockers; i++, mymap += nalloc) {
763
sprintf(msgbuf, /* Waiter. */
764
"%lx/%lu:\t", (u_long)idmap[i].id, (u_long)idmap[i].pgno);
765
for (j = 0; j < nlockers; j++)
766
if (ISSET_MAP(mymap, j))
767
sprintf(msgbuf, "%s %lx", msgbuf,
768
(u_long)idmap[j].id);
769
(void)sprintf(msgbuf,
770
"%s %lu", msgbuf, (u_long)idmap[i].last_lock);
771
__db_err(dbenv, msgbuf);
774
__os_free(dbenv, msgbuf);
779
* Given a bitmap that contains a deadlock, verify that the bit
780
* specified in the which parameter indicates a transaction that
781
* is actually deadlocked. Return 1 if really deadlocked, 0 otherwise.
782
* deadmap is the array that identified the deadlock.
783
* tmpmap is a copy of the initial bitmaps from the dd_build phase
784
* origmap is a temporary bit map into which we can OR things
785
* nlockers is the number of actual lockers under consideration
786
* nalloc is the number of words allocated for the bitmap
787
* which is the locker in question
790
__dd_verify(idmap, deadmap, tmpmap, origmap, nlockers, nalloc, which)
792
u_int32_t *deadmap, *tmpmap, *origmap;
793
u_int32_t nlockers, nalloc, which;
799
memset(tmpmap, 0, sizeof(u_int32_t) * nalloc);
802
* In order for "which" to be actively involved in
803
* the deadlock, removing him from the evaluation
804
* must remove the deadlock. So, we OR together everyone
805
* except which; if all the participants still have their
806
* bits set, then the deadlock persists and which does
807
* not participate. If the deadlock does not persist
808
* then "which" does participate.
811
for (j = 0; j < nlockers; j++) {
812
if (!ISSET_MAP(deadmap, j) || j == which)
815
/* Find the map for this bit. */
816
tmap = origmap + (nalloc * j);
819
* We special case the first waiter who is also a holder, so
820
* we don't automatically call that a deadlock. However, if
821
* it really is a deadlock, we need the bit set now so that
822
* we treat the first waiter like other waiters.
824
if (idmap[j].self_wait)
826
OR_MAP(tmpmap, tmap, nalloc);
834
* Now check the resulting map and see whether
835
* all participants still have their bit set.
837
for (j = 0; j < nlockers; j++) {
838
if (!ISSET_MAP(deadmap, j) || j == which)
840
if (!ISSET_MAP(tmpmap, j))
849
* Figure out the relative age of two lockers. We make all lockers
850
* older than all transactions, because that's how it's worked
851
* historically (because lockers are lower ids).
854
__dd_isolder(a, b, lock_max, txn_max)
856
u_int32_t lock_max, txn_max;
860
/* Check for comparing lock-id and txnid. */
861
if (a <= DB_LOCK_MAXID && b > DB_LOCK_MAXID)
863
if (b <= DB_LOCK_MAXID && a > DB_LOCK_MAXID)
866
/* In the same space; figure out which one. */
868
if (a <= DB_LOCK_MAXID)
872
* We can't get a 100% correct ordering, because we don't know
873
* where the current interval started and if there were older
874
* lockers outside the interval. We do the best we can.
878
* Check for a wrapped case with ids above max.
880
if (a > max && b < max)
882
if (b > max && a < max)