1
/* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
3
This program is free software; you can redistribute it and/or modify
4
it under the terms of the GNU General Public License as published by
5
the Free Software Foundation; version 2 of the License.
7
This program is distributed in the hope that it will be useful,
8
but WITHOUT ANY WARRANTY; without even the implied warranty of
9
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
GNU General Public License for more details.
12
You should have received a copy of the GNU General Public License
13
along with this program; if not, write to the Free Software
14
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
19
"waiting threads" subsystem - a unified interface for threads to wait
20
on each other, with built-in deadlock detection.
24
a thread - is represented by a WT_THD structure. One physical thread
25
can have only one WT_THD descriptor at any given moment.
27
a resource - a thread does not wait for other threads directly,
28
instead it waits for a "resource", which is "owned" by other threads.
29
It waits, exactly, for all "owners" to "release" a resource.
30
It does not have to correspond to a physical resource. For example, it
31
may be convenient in certain cases to force resource == thread.
32
A resource is represented by a WT_RESOURCE structure.
34
a resource identifier - a pair of {resource type, value}. A value is
35
an ulonglong number. Represented by a WT_RESOURCE_ID structure.
37
a resource type - a pointer to a statically defined instance of
38
WT_RESOURCE_TYPE structure. This structure contains a pointer to
39
a function that knows how to compare values of this resource type.
40
In the simple case it could be wt_resource_id_memcmp().
42
a wait-for graph - a graph, that represenst "wait-for" relationships.
43
It has two types of nodes - threads and resources. There are directed
44
edges from a thread to a resource it is waiting for (WT_THD::waiting_for),
45
from a thread to resources that it "owns" (WT_THD::my_resources),
46
and from a resource to threads that "own" it (WT_RESOURCE::owners)
51
For flawless deadlock detection wait-for graph must be complete.
52
It means that when a thread starts waiting it needs to know *all* its
53
blockers, and call wt_thd_will_wait_for() for every one of them.
54
Otherwise two phenomena should be expected:
58
thread A needs to get a lock, and is blocked by a thread B.
60
Just before the timeout thread B releases the lock.
61
thread A is ready to grab the lock but discovers that it is also
62
blocked by a thread C.
63
It waits and times out.
65
As a result thread A has waited two timeout intervals, instead of one.
67
2. Unreliable cycle detection:
69
Thread A waits for threads B and C
71
Thread D wants to start waiting for A
73
one can see immediately that thread D creates a cycle, and thus
74
a deadlock is detected.
76
But if thread A would only wait for B, and start waiting for C
77
when B would unlock, thread D would be allowed to wait, a deadlock
78
would be only detected when B unlocks or somebody times out.
80
These two phenomena don't affect a correctness, and strictly speaking,
81
the caller is not required to call wt_thd_will_wait_for() for *all*
82
blockers - it may optimize wt_thd_will_wait_for() calls. But they
83
may be perceived as bugs by users, it must be understood that such
84
an optimization comes with its price.
89
First, the wt* subsystem must be initialized by calling
90
wt_init(). In the server you don't need to do it, it's done
93
Similarly, wt_end() frees wt* structures, should be called
94
at the end, but in the server mysqld.cc takes care of that.
96
Every WT_THD should be initialized with wt_thd_lazy_init().
97
After that they can be used in other wt_thd_* calls.
98
Before discarding, WT_THD should be free'd with
99
wt_thd_destroy(). In the server both are handled in sql_class.cc,
100
it's an error to try to do it manually.
102
To use the deadlock detection one needs to use this thread's WT_THD,
103
call wt_thd_will_wait_for() for every thread it needs to wait on,
104
then call wt_thd_cond_timedwait(). When thread releases a resource
105
it should call wt_thd_release() (or wt_thd_release_all()) - it will
106
notify (send a signal) threads waiting in wt_thd_cond_timedwait(),
109
Just like with pthread's cond_wait, there could be spurious
110
wake-ups from wt_thd_cond_timedwait(). A caller is expected to
111
handle that (that is, to re-check the blocking criteria).
113
wt_thd_will_wait_for() and wt_thd_cond_timedwait() return either
114
WT_OK or WT_DEADLOCK. Additionally wt_thd_cond_timedwait() can return
115
WT_TIMEOUT. Out of memory and other fatal errors are reported as
116
WT_DEADLOCK - and a transaction must be aborted just the same.
120
There are four config variables. Two deadlock search depths - short and
121
long - and two timeouts. Deadlock search is performed with the short
122
depth on every wt_thd_will_wait_for() call. wt_thd_cond_timedwait()
123
waits with a short timeout, performs a deadlock search with the long
124
depth, and waits with a long timeout. As most deadlock cycles are supposed
125
to be short, most deadlocks will be detected at once, and waits will
128
These config variables are thread-local. Different threads may have
129
different search depth and timeout values.
131
Also, deadlock detector supports different killing strategies, the victim
132
in a deadlock cycle is selected based on the "weight". See "weight"
133
description in waiting_threads.h for details. It's up to the caller to
134
set weights accordingly.
138
We calculate the number of successfull waits (WT_OK returned from
139
wt_thd_cond_timedwait()), a number of timeouts, a deadlock cycle
140
length distribution - number of deadlocks with every length from
141
1 to WT_CYCLE_STATS, and a wait time distribution - number
142
of waits with a time from 1 us to 1 min in WT_WAIT_STATS
143
intervals on a log e scale.
145
Sample usage as was done in the Maria engine
146
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
147
- in class THD, THD::transaction had a WT_THD object; there were session
148
variables to set the short/long depth/timeout.
149
- when mysqld started, wt_init() was called; when it ended, wt_end() was
151
- in THD's constructor,
152
wt_thd_lazy_init(&transaction.wt, &variables.wt_deadlock_search_depth_short,
153
&variables.wt_timeout_short,
154
&variables.wt_deadlock_search_depth_long,
155
&variables.wt_timeout_long);
157
wt_thd_destroy(&transaction.wt);
158
- this was sufficient to make the deadlock-detector available to the Maria
159
engine (which can grab THD); the engine used it this way:
160
- when it wrote a row, and hit a duplicate key, it would find who wrote this
161
key, the "blocker" transaction. If "blocker" had committed, duplicate key
162
error would be sent. Otherwise, we would wait for it, in the following code
163
snippet (originally from storage/maria/ma_write.c). After the blocker is
164
gone, we would retry the write:
166
while (keyinfo->ck_insert(info,
167
(*keyinfo->make_key)(info, &int_key, i, buff, record,
168
filepos, info->trn->trid)))
170
/ * we got a write error * /
171
if error is not "duplicate key" then return error;
172
info->dup_key_trid has the culprit:
173
if the culprit is ourselves then return error;
175
blocker= trnman_trid_to_trn(info->trn, info->dup_key_trid);
177
if blocker TRN was not found, it means that the conflicting
178
transaction was committed long time ago. It could not be
179
aborted, as it would have to wait on the key tree lock
180
to remove the conflicting key it has inserted.
182
if (!blocker || blocker->commit_trid != ~(TrID)0)
185
pthread_mutex_unlock(& blocker->state_lock);
186
rw_unlock(&keyinfo->root_lock);
189
/ * release root_lock to let blocker finish its work * /
190
rw_unlock(&keyinfo->root_lock);
192
/ * running. now we wait * /
195
const char *old_proc_info;
197
rc.type= &ma_rc_dup_unique;
198
rc.value= (intptr)blocker;
199
res= wt_thd_will_wait_for(info->trn->wt, blocker->wt, & rc);
202
pthread_mutex_unlock(& blocker->state_lock);
203
my_errno= HA_ERR_LOCK_DEADLOCK;
206
old_proc_info= proc_info_hook(0,
207
"waiting for a resource",
208
__func__, __FILE__, __LINE__);
209
res= wt_thd_cond_timedwait(info->trn->wt, & blocker->state_lock);
210
proc_info_hook(0, old_proc_info, __func__, __FILE__, __LINE__);
212
pthread_mutex_unlock(& blocker->state_lock);
215
my_errno= res == WT_TIMEOUT ? HA_ERR_LOCK_WAIT_TIMEOUT
216
: HA_ERR_LOCK_DEADLOCK;
219
/ * if we come here, blocker has rolled back or committed,
220
so is gone, we can retry the ck_insert() * /
222
rw_wrlock(&keyinfo->root_lock);
223
#ifndef MARIA_CANNOT_ROLLBACK
227
- ma_rc_dup_unique was:
228
/ * a WT_RESOURCE_TYPE for transactions waiting on a unique key conflict * /
229
WT_RESOURCE_TYPE ma_rc_dup_unique={ wt_resource_id_memcmp, 0};
230
- When a Maria transaction would commit or rollback it would call:
231
/ * Wake up threads waiting for this transaction * /
232
static void wt_thd_release_self(TRN *trn)
237
rc.type= &ma_rc_dup_unique;
238
rc.value= (intptr)trn;
239
wt_thd_release(trn->wt, & rc);
246
unittest/mysys/waiting_threads-t.c, currently disabled.
250
Note that if your lock system satisfy the following condition:
252
there exist four lock levels A, B, C, D, such as
253
A is compatible with B
254
A is not compatible with C
255
D is not compatible with B
257
(example A=IX, B=IS, C=S, D=X)
259
you need to include lock level in the resource identifier - a
260
thread waiting for lock of the type A on resource R and another
261
thread waiting for lock of the type B on resource R should wait on
262
different WT_RESOURCE structures, on different {lock, resource}
263
pairs. Otherwise the following is possible:
265
thread1> take S-lock on R
266
thread2> take IS-lock on R
267
thread3> wants X-lock on R, starts waiting for threads 1 and 2 on R.
268
thread3 is killed (or timeout or whatever)
269
WT_RESOURCE structure for R is still in the hash, as it has two owners
270
thread4> wants an IX-lock on R
271
WT_RESOURCE for R is found in the hash, thread4 starts waiting on it.
272
!! now thread4 is waiting for both thread1 and thread2
273
!! while, in fact, IX-lock and IS-lock are compatible and
274
!! thread4 should not wait for thread2.
277
#include <waiting_threads.h>
278
#include <m_string.h>
280
/* status variables */
283
preset table of wait intervals
285
ulonglong wt_wait_table[WT_WAIT_STATS];
287
wait time distribution (log e scale)
289
uint32 wt_wait_stats[WT_WAIT_STATS+1];
291
distribution of cycle lengths
292
first column tells whether this was during short or long detection
294
uint32 wt_cycle_stats[2][WT_CYCLE_STATS+1];
295
uint32 wt_success_stats;
297
static my_atomic_rwlock_t cycle_stats_lock, wait_stats_lock, success_stats_lock;
299
#ifdef SAFE_STATISTICS
300
#define incr(VAR, LOCK) \
302
my_atomic_rwlock_wrlock(&(LOCK)); \
303
my_atomic_add32(&(VAR), 1); \
304
my_atomic_rwlock_wrunlock(&(LOCK)); \
307
#define incr(VAR,LOCK) do { (VAR)++; } while(0)
310
static void increment_success_stats()
312
incr(wt_success_stats, success_stats_lock);
315
static void increment_cycle_stats(uint depth, uint slot)
317
if (depth >= WT_CYCLE_STATS)
318
depth= WT_CYCLE_STATS;
319
incr(wt_cycle_stats[slot][depth], cycle_stats_lock);
322
static void increment_wait_stats(ulonglong waited,int ret)
325
if ((ret) == ETIMEDOUT)
328
for (i= 0; i < WT_WAIT_STATS && waited/10 > wt_wait_table[i]; i++) ;
329
incr(wt_wait_stats[i], wait_stats_lock);
333
'lock' protects 'owners', 'state', and 'waiter_count'
336
a resource is picked up from a hash in a lock-free manner
337
it's returned pinned, so it cannot be freed at once
338
but it may be freed right after the pin is removed
339
to free a resource it should
343
two ways to access a resource:
345
- it's returned pinned.
346
a) take a lock in exclusive mode
347
b) check the state, it should be ACTIVE to be usable
349
2. by a direct reference
350
- could only used if a resource cannot be freed
351
e.g. accessing a resource by thd->waiting_for is safe,
352
a resource cannot be freed as there's a thread waiting for it
354
struct st_wt_resource {
357
enum { ACTIVE, FREE } state;
359
mysql_mutex_t *cond_mutex; /* a mutex for the 'cond' below */
362
before the 'lock' all elements are mutable, after (and including) -
363
immutable in the sense that lf_hash_insert() won't memcpy() over them.
366
#ifdef WT_RWLOCKS_USE_MUTEXES
368
we need a special rwlock-like 'lock' to allow readers bypass
369
waiting writers, otherwise readers can deadlock. For example:
371
A waits on resource x, owned by B, B waits on resource y, owned
372
by A, we have a cycle (A->x->B->y->A)
373
Both A and B start deadlock detection:
376
A goes deeper B goes deeper
379
with mutexes it would deadlock. With rwlocks it won't, as long
380
as both A and B are taking read locks (and they do).
381
But other threads may take write locks. Assume there's
382
C who wants to start waiting on x, and D who wants to start
385
A read-locks x B read-locks y
386
A goes deeper B goes deeper
387
=> C write-locks x (to add a new edge) D write-locks y
388
.. C is blocked D is blocked
389
A read-locks y B read-locks x
391
Now, if a read lock can bypass a pending wrote lock request, we're fine.
392
If it can not, we have a deadlock.
394
writer starvation is technically possible, but unlikely, because
395
the contention is expected to be low.
401
uint pending_writers: 15;
402
uint write_locked: 1;
407
mysql_cond_t cond; /* the corresponding mutex is provided by the caller */
408
DYNAMIC_ARRAY owners;
411
#ifdef WT_RWLOCKS_USE_MUTEXES
412
static void rc_rwlock_init(WT_RESOURCE *rc)
414
mysql_cond_init(0, &rc->lock.cond, 0);
415
mysql_mutex_init(0, &rc->lock.mutex, MY_MUTEX_INIT_FAST);
417
static void rc_rwlock_destroy(WT_RESOURCE *rc)
419
DBUG_ASSERT(rc->lock.write_locked == 0);
420
DBUG_ASSERT(rc->lock.readers == 0);
421
mysql_cond_destroy(&rc->lock.cond);
422
mysql_mutex_destroy(&rc->lock.mutex);
424
static void rc_rdlock(WT_RESOURCE *rc)
426
DBUG_PRINT("wt", ("TRYLOCK resid=%ld for READ", (ulong)rc->id.value));
427
mysql_mutex_lock(&rc->lock.mutex);
428
while (rc->lock.write_locked)
429
mysql_cond_wait(&rc->lock.cond, &rc->lock.mutex);
431
mysql_mutex_unlock(&rc->lock.mutex);
432
DBUG_PRINT("wt", ("LOCK resid=%ld for READ", (ulong)rc->id.value));
434
static void rc_wrlock(WT_RESOURCE *rc)
436
DBUG_PRINT("wt", ("TRYLOCK resid=%ld for WRITE", (ulong)rc->id.value));
437
mysql_mutex_lock(&rc->lock.mutex);
438
while (rc->lock.write_locked || rc->lock.readers)
439
mysql_cond_wait(&rc->lock.cond, &rc->lock.mutex);
440
rc->lock.write_locked= 1;
441
mysql_mutex_unlock(&rc->lock.mutex);
442
DBUG_PRINT("wt", ("LOCK resid=%ld for WRITE", (ulong)rc->id.value));
444
static void rc_unlock(WT_RESOURCE *rc)
446
DBUG_PRINT("wt", ("UNLOCK resid=%ld", (ulong)rc->id.value));
447
mysql_mutex_lock(&rc->lock.mutex);
448
if (rc->lock.write_locked)
450
rc->lock.write_locked= 0;
451
mysql_cond_broadcast(&rc->lock.cond);
453
else if (--rc->lock.readers == 0)
454
mysql_cond_broadcast(&rc->lock.cond);
455
mysql_mutex_unlock(&rc->lock.mutex);
458
static void rc_rwlock_init(WT_RESOURCE *rc)
460
my_rwlock_init(&rc->lock, 0);
462
static void rc_rwlock_destroy(WT_RESOURCE *rc)
464
rwlock_destroy(&rc->lock);
466
static void rc_rdlock(WT_RESOURCE *rc)
468
DBUG_PRINT("wt", ("TRYLOCK resid=%ld for READ", (ulong)rc->id.value));
469
rw_rdlock(&rc->lock);
470
DBUG_PRINT("wt", ("LOCK resid=%ld for READ", (ulong)rc->id.value));
472
static void rc_wrlock(WT_RESOURCE *rc)
474
DBUG_PRINT("wt", ("TRYLOCK resid=%ld for WRITE", (ulong)rc->id.value));
475
rw_wrlock(&rc->lock);
476
DBUG_PRINT("wt", ("LOCK resid=%ld for WRITE", (ulong)rc->id.value));
478
static void rc_unlock(WT_RESOURCE *rc)
480
DBUG_PRINT("wt", ("UNLOCK resid=%ld", (ulong)rc->id.value));
481
rw_unlock(&rc->lock);
486
All resources are stored in a lock-free hash. Different threads
487
may add new resources and perform deadlock detection concurrently.
489
static LF_HASH reshash;
492
WT_RESOURCE constructor
494
It's called from lf_hash and takes a pointer to an LF_SLIST instance.
495
WT_RESOURCE is located at arg+sizeof(LF_SLIST)
497
static void wt_resource_init(uchar *arg)
499
WT_RESOURCE *rc= (WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
500
DBUG_ENTER("wt_resource_init");
502
memset(rc, 0, sizeof(*rc));
504
mysql_cond_init(0, &rc->cond, 0);
505
my_init_dynamic_array(&rc->owners, sizeof(WT_THD *), 0, 5);
510
WT_RESOURCE destructor
512
It's called from lf_hash and takes a pointer to an LF_SLIST instance.
513
WT_RESOURCE is located at arg+sizeof(LF_SLIST)
515
static void wt_resource_destroy(uchar *arg)
517
WT_RESOURCE *rc= (WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
518
DBUG_ENTER("wt_resource_destroy");
520
DBUG_ASSERT(rc->owners.elements == 0);
521
rc_rwlock_destroy(rc);
522
mysql_cond_destroy(&rc->cond);
523
delete_dynamic(&rc->owners);
529
DBUG_ENTER("wt_init");
530
DBUG_ASSERT(reshash.alloc.constructor != wt_resource_init);
532
lf_hash_init(&reshash, sizeof(WT_RESOURCE), LF_HASH_UNIQUE, 0,
533
sizeof_WT_RESOURCE_ID, 0, 0);
534
reshash.alloc.constructor= wt_resource_init;
535
reshash.alloc.destructor= wt_resource_destroy;
537
Note a trick: we initialize the hash with the real element size,
538
but fix it later to a shortened element size. This way
539
the allocator will allocate elements correctly, but
540
lf_hash_insert() will only overwrite part of the element with memcpy().
541
lock, condition, and dynamic array will be intact.
543
reshash.element_size= offsetof(WT_RESOURCE, lock);
544
memset(wt_wait_stats, 0, sizeof(wt_wait_stats));
545
memset(wt_cycle_stats, 0, sizeof(wt_cycle_stats));
547
{ /* initialize wt_wait_table[]. from 1 us to 1 min, log e scale */
549
double from= log(1); /* 1 us */
550
double to= log(60e6); /* 1 min */
551
for (i= 0; i < WT_WAIT_STATS; i++)
553
wt_wait_table[i]= (ulonglong)exp((to-from)/(WT_WAIT_STATS-1)*i+from);
554
DBUG_ASSERT(i == 0 || wt_wait_table[i-1] != wt_wait_table[i]);
557
my_atomic_rwlock_init(&cycle_stats_lock);
558
my_atomic_rwlock_init(&success_stats_lock);
559
my_atomic_rwlock_init(&wait_stats_lock);
565
DBUG_ENTER("wt_end");
567
DBUG_ASSERT(reshash.count == 0);
568
lf_hash_destroy(&reshash);
569
my_atomic_rwlock_destroy(&cycle_stats_lock);
570
my_atomic_rwlock_destroy(&success_stats_lock);
571
my_atomic_rwlock_destroy(&wait_stats_lock);
576
Lazy WT_THD initialization
578
Cheap initialization of WT_THD. Only initialize fields that don't require
579
memory allocations - basically, it only does assignments. The rest of the
580
WT_THD structure will be initialized on demand, on the first use.
581
This allows one to initialize lazily all WT_THD structures, even if some
582
(or even most) of them will never be used for deadlock detection.
584
@param ds a pointer to deadlock search depth short value
585
@param ts a pointer to deadlock timeout short value
586
@param dl a pointer to deadlock search depth long value
587
@param tl a pointer to deadlock timeout long value
589
@note these are pointers to values, and WT_THD stores them as pointers.
590
It allows one later to change search depths and timeouts for existing
591
threads. It also means that the pointers must stay valid for the lifetime
594
void wt_thd_lazy_init(WT_THD *thd, const ulong *ds, const ulong *ts,
595
const ulong *dl, const ulong *tl)
597
DBUG_ENTER("wt_thd_lazy_init");
600
thd->deadlock_search_depth_short= ds;
601
thd->timeout_short= ts;
602
thd->deadlock_search_depth_long= dl;
603
thd->timeout_long= tl;
604
/* dynamic array is also initialized lazily - without memory allocations */
605
my_init_dynamic_array(&thd->my_resources, sizeof(WT_RESOURCE *), 0, 5);
607
thd->name= my_thread_name();
613
Finalize WT_THD initialization
615
After lazy WT_THD initialization, parts of the structure are still
616
uninitialized. This function completes the initialization, allocating
617
memory, if necessary. It's called automatically on demand, when WT_THD
620
static int fix_thd_pins(WT_THD *thd)
622
if (unlikely(thd->pins == 0))
624
thd->pins= lf_hash_get_pins(&reshash);
626
thd->name= my_thread_name();
629
return thd->pins == 0;
632
void wt_thd_destroy(WT_THD *thd)
634
DBUG_ENTER("wt_thd_destroy");
636
DBUG_ASSERT(thd->my_resources.elements == 0);
637
DBUG_ASSERT(thd->waiting_for == 0);
640
lf_hash_put_pins(thd->pins);
642
delete_dynamic(&thd->my_resources);
646
Trivial resource id comparison function - bytewise memcmp.
648
It can be used in WT_RESOURCE_TYPE structures where bytewise
649
comparison of values is sufficient.
651
my_bool wt_resource_id_memcmp(const void *a, const void *b)
653
/* we use the fact that there's no padding in the middle of WT_RESOURCE_ID */
654
compile_time_assert(offsetof(WT_RESOURCE_ID, type) == sizeof(ulonglong));
655
return memcmp(a, b, sizeof_WT_RESOURCE_ID);
659
arguments for the recursive deadlock_search function
661
struct deadlock_arg {
662
WT_THD * const thd; /**< starting point of a search */
663
uint const max_depth; /**< search depth limit */
664
WT_THD *victim; /**< a thread to be killed to resolve a deadlock */
665
WT_RESOURCE *last_locked_rc; /**< see comment at the end of deadlock_search() */
669
helper function to change the victim, according to the weight
671
static void change_victim(WT_THD* found, struct deadlock_arg *arg)
673
if (found->weight < arg->victim->weight)
675
if (arg->victim != arg->thd)
677
rc_unlock(arg->victim->waiting_for); /* release the previous victim */
678
DBUG_ASSERT(arg->last_locked_rc == found->waiting_for);
681
arg->last_locked_rc= 0;
686
recursive loop detection in a wait-for graph with a limited search depth
688
static int deadlock_search(struct deadlock_arg *arg, WT_THD *blocker,
691
WT_RESOURCE *rc, *volatile *shared_ptr= &blocker->waiting_for;
695
DBUG_ENTER("deadlock_search");
696
DBUG_PRINT("wt", ("enter: thd=%s, blocker=%s, depth=%u",
697
arg->thd->name, blocker->name, depth));
701
arg->last_locked_rc= 0;
703
if (depth > arg->max_depth)
705
DBUG_PRINT("wt", ("exit: WT_DEPTH_EXCEEDED (early)"));
706
DBUG_RETURN(WT_DEPTH_EXCEEDED);
711
safe dereference as explained in lf_alloc-pin.c
712
(in short: protects against lf_alloc_free() in lf_hash_delete())
717
lf_pin(arg->thd->pins, 0, rc);
718
} while (rc != *shared_ptr && LF_BACKOFF);
722
DBUG_PRINT("wt", ("exit: OK (early)"));
727
if (rc->state != ACTIVE || *shared_ptr != rc)
729
/* blocker is not waiting on this resource anymore */
731
lf_unpin(arg->thd->pins, 0);
734
/* as the state is locked, we can unpin now */
735
lf_unpin(arg->thd->pins, 0);
738
Below is not a pure depth-first search. It's a depth-first with a
739
slightest hint of breadth-first. Depth-first is:
742
foreach current in element->nodes[] do:
743
if current == X return error;
749
foreach current in element->nodes[] do:
750
if current == X return error;
751
foreach current in element->nodes[] do:
754
preferring shorter deadlocks over longer ones.
756
for (i= 0; i < rc->owners.elements; i++)
758
cursor= *dynamic_element(&rc->owners, i, WT_THD**);
760
We're only looking for (and detecting) cycles that include 'arg->thd'.
761
That is, only deadlocks that *we* have created. For example,
763
(thd waits for A, A waits for B, while B is waiting for thd).
764
While walking the graph we can encounter other cicles, e.g.
766
This will not be detected. Instead we will walk it in circles until
767
the search depth limit is reached (the latter guarantees that an
768
infinite loop is impossible). We expect the thread that has created
769
the cycle (one of A, B, and C) to detect its deadlock.
771
if (cursor == arg->thd)
774
increment_cycle_stats(depth, arg->max_depth ==
775
*arg->thd->deadlock_search_depth_long);
780
for (i= 0; i < rc->owners.elements; i++)
782
cursor= *dynamic_element(&rc->owners, i, WT_THD**);
783
switch (deadlock_search(arg, cursor, depth+1)) {
786
case WT_DEPTH_EXCEEDED:
787
ret= WT_DEPTH_EXCEEDED;
791
change_victim(cursor, arg); /* also sets arg->last_locked_rc to 0 */
792
i= rc->owners.elements; /* jump out of the loop */
797
if (arg->last_locked_rc)
798
rc_unlock(arg->last_locked_rc);
802
Note that 'rc' is locked in this function, but it's never unlocked here.
803
Instead it's saved in arg->last_locked_rc and the *caller* is
804
expected to unlock it. It's done to support different killing
805
strategies. This is how it works:
810
deadlock_search() function starts from thd, locks it (in fact it locks not
811
a thd, but a resource it is waiting on, but below, for simplicity, I'll
812
talk about "locking a thd"). Then it goes down recursively, locks A, and so
813
on. Goes down recursively, locks B. Goes down recursively, locks C.
814
Notices that C is waiting on thd. Deadlock detected. Sets arg->victim=thd.
815
Returns from the last deadlock_search() call. C stays locked!
816
Now it checks whether C is a more appropriate victim than 'thd'.
817
If yes - arg->victim=C, otherwise C is unlocked. Returns. B stays locked.
818
Now it checks whether B is a more appropriate victim than arg->victim.
819
If yes - old arg->victim is unlocked and arg->victim=B,
820
otherwise B is unlocked. Return.
823
In short, a resource is locked in a frame. But it's not unlocked in the
824
same frame, it's unlocked by the caller, and only after the caller checks
825
that it doesn't need to use current WT_THD as a victim. If it does - the
826
lock is kept and the old victim's resource is unlocked. When the recursion
827
is unrolled and we are back to deadlock() function, there are only two
828
locks left - on thd and on the victim.
830
arg->last_locked_rc= rc;
831
DBUG_PRINT("wt", ("exit: %s",
832
ret == WT_DEPTH_EXCEEDED ? "WT_DEPTH_EXCEEDED" :
833
ret ? "WT_DEADLOCK" : "OK"));
838
Deadlock detection in a wait-for graph
840
A wrapper for recursive deadlock_search() - prepares deadlock_arg structure,
841
invokes deadlock_search(), increments statistics, notifies the victim.
843
@param thd thread that is going to wait. Deadlock is detected
844
if, while walking the graph, we reach a thread that
846
@param blocker starting point of a search. In wt_thd_cond_timedwait()
847
it's thd, in wt_thd_will_wait_for() it's a thread that
848
thd is going to wait for
849
@param depth starting search depth. In general it's the number of
850
edges in the wait-for graph between thd and the
851
blocker. Practically only two values are used (and
852
supported) - when thd == blocker it's 0, when thd
853
waits directly for blocker, it's 1
854
@param max_depth search depth limit
856
static int deadlock(WT_THD *thd, WT_THD *blocker, uint depth,
859
struct deadlock_arg arg= {thd, max_depth, 0, 0};
861
DBUG_ENTER("deadlock");
862
DBUG_ASSERT(depth < 2);
863
ret= deadlock_search(&arg, blocker, depth);
864
if (ret == WT_DEPTH_EXCEEDED)
866
increment_cycle_stats(WT_CYCLE_STATS, max_depth ==
867
*thd->deadlock_search_depth_long);
871
if we started with depth==1, blocker was never considered for a victim
872
in deadlock_search(). Do it here.
874
if (ret == WT_DEADLOCK && depth)
875
change_victim(blocker, &arg);
876
if (arg.last_locked_rc)
879
Special return code if there's nobody to wait for.
881
depth == 0 means that we start the search from thd (thd == blocker).
882
ret == WT_OK means that no cycle was found and
883
arg.last_locked_rc == thd->waiting_for.
884
and arg.last_locked_rc->owners.elements == 0 means that
885
(applying the rule above) thd->waiting_for->owners.elements == 0,
886
and thd doesn't have anybody to wait for.
888
if (depth == 0 && ret == WT_OK && arg.last_locked_rc->owners.elements == 0)
890
DBUG_ASSERT(thd == blocker);
891
DBUG_ASSERT(arg.last_locked_rc == thd->waiting_for);
894
rc_unlock(arg.last_locked_rc);
896
/* notify the victim, if appropriate */
897
if (ret == WT_DEADLOCK && arg.victim != thd)
899
DBUG_PRINT("wt", ("killing %s", arg.victim->name));
900
arg.victim->killed= 1;
901
mysql_cond_broadcast(&arg.victim->waiting_for->cond);
902
rc_unlock(arg.victim->waiting_for);
910
Delete an element from reshash if it has no waiters or owners
912
rc->lock must be locked by the caller and it's unlocked on return.
914
static int unlock_lock_and_free_resource(WT_THD *thd, WT_RESOURCE *rc)
918
DBUG_ENTER("unlock_lock_and_free_resource");
920
DBUG_ASSERT(rc->state == ACTIVE);
922
if (rc->owners.elements || rc->waiter_count)
924
DBUG_PRINT("wt", ("nothing to do, %u owners, %u waiters",
925
rc->owners.elements, rc->waiter_count));
930
if (fix_thd_pins(thd))
936
/* XXX if (rc->id.type->make_key) key= rc->id.type->make_key(&rc->id, &keylen); else */
939
keylen= sizeof_WT_RESOURCE_ID;
943
To free the element correctly we need to:
944
1. take its lock (already done).
945
2. set the state to FREE
947
4. remove from the hash
951
DBUG_RETURN(lf_hash_delete(&reshash, thd->pins, key, keylen) == -1);
956
register the fact that thd is not waiting anymore
958
decrease waiter_count, clear waiting_for, free the resource if appropriate.
959
thd->waiting_for must be locked!
961
static int stop_waiting_locked(WT_THD *thd)
964
WT_RESOURCE *rc= thd->waiting_for;
965
DBUG_ENTER("stop_waiting_locked");
967
DBUG_ASSERT(rc->waiter_count);
968
DBUG_ASSERT(rc->state == ACTIVE);
971
ret= unlock_lock_and_free_resource(thd, rc);
972
DBUG_RETURN((thd->killed || ret) ? WT_DEADLOCK : WT_OK);
976
register the fact that thd is not waiting anymore
978
locks thd->waiting_for and calls stop_waiting_locked().
980
static int stop_waiting(WT_THD *thd)
983
WT_RESOURCE *rc= thd->waiting_for;
984
DBUG_ENTER("stop_waiting");
989
nobody's trying to free the resource now,
990
as its waiter_count is guaranteed to be non-zero
993
ret= stop_waiting_locked(thd);
998
notify the system that a thread needs to wait for another thread
1000
called by a *waiter* to declare that it (thd) will wait for another
1001
thread (blocker) on a specific resource (resid).
1002
can be called many times, if many blockers own a blocking resource.
1003
but must always be called with the same resource id - a thread cannot
1004
wait for more than one resource at a time.
1006
@return WT_OK or WT_DEADLOCK
1008
As a new edge is added to the wait-for graph, a deadlock detection is
1009
performed for this new edge.
1011
int wt_thd_will_wait_for(WT_THD *thd, WT_THD *blocker,
1012
const WT_RESOURCE_ID *resid)
1016
DBUG_ENTER("wt_thd_will_wait_for");
1020
DBUG_PRINT("wt", ("enter: thd=%s, blocker=%s, resid=%lu",
1021
thd->name, blocker->name, (ulong)resid->value));
1023
if (fix_thd_pins(thd))
1024
DBUG_RETURN(WT_DEADLOCK);
1026
if (thd->waiting_for == 0)
1030
/* XXX if (restype->make_key) key= restype->make_key(resid, &keylen); else */
1033
keylen= sizeof_WT_RESOURCE_ID;
1036
DBUG_PRINT("wt", ("first blocker"));
1039
while ((rc= lf_hash_search(&reshash, thd->pins, key, keylen)) == 0)
1043
DBUG_PRINT("wt", ("failed to find rc in hash, inserting"));
1044
memset(&tmp, 0, sizeof(tmp));
1048
if (lf_hash_insert(&reshash, thd->pins, &tmp) == -1) /* if OOM */
1049
DBUG_RETURN(WT_DEADLOCK);
1051
Two cases: either lf_hash_insert() failed - because another thread
1052
has just inserted a resource with the same id - and we need to retry.
1053
Or lf_hash_insert() succeeded, and then we need to repeat
1054
lf_hash_search() to find a real address of the newly inserted element.
1055
That is, we don't care what lf_hash_insert() has returned.
1056
And we need to repeat the loop anyway.
1059
if (rc == MY_ERRPTR)
1060
DBUG_RETURN(WT_DEADLOCK);
1062
DBUG_PRINT("wt", ("found in hash rc=%p", rc));
1065
if (rc->state != ACTIVE)
1067
DBUG_PRINT("wt", ("but it's not active, retrying"));
1068
/* Somebody has freed the element while we weren't looking */
1070
lf_hash_search_unpin(thd->pins);
1074
lf_hash_search_unpin(thd->pins); /* the element cannot go away anymore */
1075
thd->waiting_for= rc;
1081
DBUG_ASSERT(thd->waiting_for->id.type == resid->type);
1082
DBUG_ASSERT(resid->type->compare(&thd->waiting_for->id, resid) == 0);
1083
DBUG_PRINT("wt", ("adding another blocker"));
1086
we can safely access the resource here, it's in the hash as it has
1087
non-zero waiter_count
1089
rc= thd->waiting_for;
1091
DBUG_ASSERT(rc->waiter_count);
1092
DBUG_ASSERT(rc->state == ACTIVE);
1096
stop_waiting_locked(thd);
1097
DBUG_RETURN(WT_DEADLOCK);
1101
Another thread could be waiting on this resource for this very 'blocker'.
1102
In this case we should not add it to the list for the second time.
1104
for (i= 0; i < rc->owners.elements; i++)
1105
if (*dynamic_element(&rc->owners, i, WT_THD**) == blocker)
1107
if (i >= rc->owners.elements)
1109
if (push_dynamic(&blocker->my_resources, (void*)&rc))
1111
stop_waiting_locked(thd);
1112
DBUG_RETURN(WT_DEADLOCK); /* deadlock and OOM use the same error code */
1114
if (push_dynamic(&rc->owners, (void*)&blocker))
1116
pop_dynamic(&blocker->my_resources);
1117
stop_waiting_locked(thd);
1118
DBUG_RETURN(WT_DEADLOCK);
1123
if (deadlock(thd, blocker, 1, *thd->deadlock_search_depth_short) != WT_OK)
1126
DBUG_RETURN(WT_DEADLOCK);
1132
called by a *waiter* (thd) to start waiting
1134
It's supposed to be a drop-in replacement for
1135
pthread_cond_timedwait(), and it takes mutex as an argument.
1137
@return one of WT_TIMEOUT, WT_DEADLOCK, WT_OK
1139
int wt_thd_cond_timedwait(WT_THD *thd, mysql_mutex_t *mutex)
1141
int ret= WT_TIMEOUT;
1142
struct timespec timeout;
1143
ulonglong before, after, starttime;
1144
WT_RESOURCE *rc= thd->waiting_for;
1145
DBUG_ENTER("wt_thd_cond_timedwait");
1146
DBUG_PRINT("wt", ("enter: thd=%s, rc=%p", thd->name, rc));
1150
DBUG_ASSERT(rc->cond_mutex == mutex);
1152
rc->cond_mutex= mutex;
1153
mysql_mutex_assert_owner(mutex);
1156
before= starttime= my_getsystime();
1160
only for the sake of Windows we distinguish between
1161
'before' and 'starttime':
1163
my_getsystime() returns high-resolution value, that cannot be used for
1164
waiting (it doesn't follow system clock changes), but is good for time
1167
GetSystemTimeAsFileTime() follows system clock, but is low-resolution
1168
and will result in lousy intervals.
1170
GetSystemTimeAsFileTime((PFILETIME)&starttime);
1174
if (rc->owners.elements == 0)
1178
set_timespec_time_nsec(timeout, starttime, (*thd->timeout_short)*1000ULL);
1179
if (ret == WT_TIMEOUT && !thd->killed)
1180
ret= mysql_cond_timedwait(&rc->cond, mutex, &timeout);
1181
if (ret == WT_TIMEOUT && !thd->killed)
1183
int r= deadlock(thd, thd, 0, *thd->deadlock_search_depth_long);
1184
if (r == WT_FREE_TO_GO)
1186
else if (r != WT_OK)
1188
else if (*thd->timeout_long > *thd->timeout_short)
1190
set_timespec_time_nsec(timeout, starttime, (*thd->timeout_long)*1000ULL);
1192
ret= mysql_cond_timedwait(&rc->cond, mutex, &timeout);
1195
after= my_getsystime();
1196
if (stop_waiting(thd) == WT_DEADLOCK) /* if we're killed */
1198
increment_wait_stats(after-before, ret);
1200
increment_success_stats();
1205
called by a *blocker* when it releases a resource
1207
it's conceptually similar to pthread_cond_broadcast, and must be done
1208
under the same mutex as wt_thd_cond_timedwait().
1210
@param resid a resource to release. 0 to release all resources
1213
void wt_thd_release(WT_THD *thd, const WT_RESOURCE_ID *resid)
1216
DBUG_ENTER("wt_thd_release");
1218
for (i= 0; i < thd->my_resources.elements; i++)
1220
WT_RESOURCE *rc= *dynamic_element(&thd->my_resources, i, WT_RESOURCE**);
1221
if (!resid || (resid->type->compare(&rc->id, resid) == 0))
1227
nobody's trying to free the resource now,
1228
as its owners[] array is not empty (at least thd must be there)
1230
DBUG_ASSERT(rc->state == ACTIVE);
1231
for (j= 0; j < rc->owners.elements; j++)
1232
if (*dynamic_element(&rc->owners, j, WT_THD**) == thd)
1234
DBUG_ASSERT(j < rc->owners.elements);
1235
delete_dynamic_element(&rc->owners, j);
1236
if (rc->owners.elements == 0)
1238
mysql_cond_broadcast(&rc->cond);
1241
mysql_mutex_assert_owner(rc->cond_mutex);
1244
unlock_lock_and_free_resource(thd, rc);
1247
delete_dynamic_element(&thd->my_resources, i);
1253
reset_dynamic(&thd->my_resources);