1
/*****************************************************************************
3
Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
4
Copyright (c) 2008, Google Inc.
6
Portions of this file contain modifications contributed and copyrighted by
7
Google, Inc. Those modifications are gratefully acknowledged and are described
8
briefly in the InnoDB documentation. The contributions by Google are
9
incorporated with their permission, and subject to the conditions contained in
10
the file COPYING.Google.
12
This program is free software; you can redistribute it and/or modify it under
13
the terms of the GNU General Public License as published by the Free Software
14
Foundation; version 2 of the License.
16
This program is distributed in the hope that it will be useful, but WITHOUT
17
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
20
You should have received a copy of the GNU General Public License along with
21
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
22
Place, Suite 330, Boston, MA 02111-1307 USA
24
*****************************************************************************/
26
/******************************************************
27
The wait array used in synchronization primitives
29
Created 9/5/1995 Heikki Tuuri
30
*******************************************************/
34
#include "sync0arr.ic"
37
#include "sync0sync.h"
47
The wait array consists of cells each of which has an
48
an operating system event object created for it. The threads
49
waiting for a mutex, for example, can reserve a cell
50
in the array and suspend themselves to wait for the event
51
to become signaled. When using the wait array, remember to make
52
sure that some thread holding the synchronization object
53
will eventually know that there is a waiter in the array and
54
signal the object, to prevent infinite wait.
55
Why we chose to implement a wait array? First, to make
56
mutexes fast, we had to code our own implementation of them,
57
which only in usually uncommon cases resorts to using
58
slow operating system primitives. Then we had the choice of
59
assigning a unique OS event for each mutex, which would
60
be simpler, or using a global wait array. In some operating systems,
61
the global wait array solution is more efficient and flexible,
62
because we can do with a very small number of OS events,
63
say 200. In NT 3.51, allocating events seems to be a quadratic
64
algorithm, because 10 000 events are created fast, but
65
100 000 events takes a couple of minutes to create.
67
As of 5.0.30 the above mentioned design is changed. Since now
68
OS can handle millions of wait events efficiently, we no longer
69
have this concept of each cell of wait array having one event.
70
Instead, now the event that a thread wants to wait on is embedded
71
in the wait object (mutex or rw_lock). We still keep the global
72
wait array for the sake of diagnostics and also to avoid infinite
73
wait The error_monitor thread scans the global wait array to signal
74
any waiting threads who have missed the signal. */
76
/* A cell where an individual thread may wait suspended
77
until a resource is released. The suspending is implemented
78
using an operating system event semaphore. */
79
struct sync_cell_struct {
80
void* wait_object; /* pointer to the object the
81
thread is waiting for; if NULL
82
the cell is free for use */
83
mutex_t* old_wait_mutex; /* the latest wait mutex in cell */
84
rw_lock_t* old_wait_rw_lock;/* the latest wait rw-lock in cell */
85
ulint request_type; /* lock type requested on the
87
const char* file; /* in debug version file where
89
ulint line; /* in debug version line where
91
os_thread_id_t thread; /* thread id of this waiting
93
ibool waiting; /* TRUE if the thread has already
94
called sync_array_event_wait
96
ib_int64_t signal_count; /* We capture the signal_count
97
of the wait_object when we
98
reset the event. This value is
99
then passed on to os_event_wait
100
and we wait only if the event
101
has not been signalled in the
102
period between the reset and
104
time_t reservation_time;/* time when the thread reserved
108
/* NOTE: It is allowed for a thread to wait
109
for an event allocated for the array without owning the
110
protecting mutex (depending on the case: OS or database mutex), but
111
all changes (set or reset) to the state of the event must be made
112
while owning the mutex. */
113
struct sync_array_struct {
114
ulint n_reserved; /* number of currently reserved
115
cells in the wait array */
116
ulint n_cells; /* number of cells in the
118
sync_cell_t* array; /* pointer to wait array */
119
ulint protection; /* this flag tells which
120
mutex protects the data */
121
mutex_t mutex; /* possible database mutex
122
protecting this data structure */
123
os_mutex_t os_mutex; /* Possible operating system mutex
124
protecting the data structure.
125
As this data structure is used in
126
constructing the database mutex,
127
to prevent infinite recursion
128
in implementation, we fall back to
130
ulint sg_count; /* count of how many times an
131
object has been signalled */
132
ulint res_count; /* count of cell reservations
133
since creation of the array */
136
#ifdef UNIV_SYNC_DEBUG
137
/**********************************************************************
138
This function is called only in the debug version. Detects a deadlock
139
of one or more threads because of waits of semaphores. */
142
sync_array_detect_deadlock(
143
/*=======================*/
144
/* out: TRUE if deadlock detected */
145
sync_array_t* arr, /* in: wait array; NOTE! the caller must
146
own the mutex to array */
147
sync_cell_t* start, /* in: cell where recursive search started */
148
sync_cell_t* cell, /* in: cell to search */
149
ulint depth); /* in: recursion depth */
150
#endif /* UNIV_SYNC_DEBUG */
152
/*********************************************************************
153
Gets the nth cell in array. */
156
sync_array_get_nth_cell(
157
/*====================*/
159
sync_array_t* arr, /* in: sync array */
160
ulint n) /* in: index */
163
ut_a(n < arr->n_cells);
165
return(arr->array + n);
168
/**********************************************************************
169
Reserves the mutex semaphore protecting a sync array. */
174
sync_array_t* arr) /* in: sync wait array */
178
protection = arr->protection;
180
if (protection == SYNC_ARRAY_OS_MUTEX) {
181
os_mutex_enter(arr->os_mutex);
182
} else if (protection == SYNC_ARRAY_MUTEX) {
183
mutex_enter(&(arr->mutex));
189
/**********************************************************************
190
Releases the mutex semaphore protecting a sync array. */
195
sync_array_t* arr) /* in: sync wait array */
199
protection = arr->protection;
201
if (protection == SYNC_ARRAY_OS_MUTEX) {
202
os_mutex_exit(arr->os_mutex);
203
} else if (protection == SYNC_ARRAY_MUTEX) {
204
mutex_exit(&(arr->mutex));
210
/***********************************************************************
211
Creates a synchronization wait array. It is protected by a mutex
212
which is automatically reserved when the functions operating on it
218
/* out, own: created wait array */
219
ulint n_cells, /* in: number of cells in the array
221
ulint protection) /* in: either SYNC_ARRAY_OS_MUTEX or
222
SYNC_ARRAY_MUTEX: determines the type
223
of mutex protecting the data structure */
230
/* Allocate memory for the data structures */
231
arr = ut_malloc(sizeof(sync_array_t));
232
memset(arr, 0x0, sizeof(*arr));
234
sz = sizeof(sync_cell_t) * n_cells;
235
arr->array = ut_malloc(sz);
236
memset(arr->array, 0x0, sz);
238
arr->n_cells = n_cells;
239
arr->protection = protection;
241
/* Then create the mutex to protect the wait array complex */
242
if (protection == SYNC_ARRAY_OS_MUTEX) {
243
arr->os_mutex = os_mutex_create(NULL);
244
} else if (protection == SYNC_ARRAY_MUTEX) {
245
mutex_create(&arr->mutex, SYNC_NO_ORDER_CHECK);
253
/**********************************************************************
254
Frees the resources in a wait array. */
259
sync_array_t* arr) /* in, own: sync wait array */
263
ut_a(arr->n_reserved == 0);
265
sync_array_validate(arr);
267
protection = arr->protection;
269
/* Release the mutex protecting the wait array complex */
271
if (protection == SYNC_ARRAY_OS_MUTEX) {
272
os_mutex_free(arr->os_mutex);
273
} else if (protection == SYNC_ARRAY_MUTEX) {
274
mutex_free(&(arr->mutex));
283
/************************************************************************
284
Validates the integrity of the wait array. Checks
285
that the number of reserved cells equals the count variable. */
290
sync_array_t* arr) /* in: sync wait array */
296
sync_array_enter(arr);
298
for (i = 0; i < arr->n_cells; i++) {
299
cell = sync_array_get_nth_cell(arr, i);
300
if (cell->wait_object != NULL) {
305
ut_a(count == arr->n_reserved);
307
sync_array_exit(arr);
310
/***********************************************************************
311
Returns the event that the thread owning the cell waits for. */
316
sync_cell_t* cell) /* in: non-empty sync array cell */
318
ulint type = cell->request_type;
320
if (type == SYNC_MUTEX) {
321
return(((mutex_t *) cell->wait_object)->event);
322
} else if (type == RW_LOCK_WAIT_EX) {
323
return(((rw_lock_t *) cell->wait_object)->wait_ex_event);
324
} else { /* RW_LOCK_SHARED and RW_LOCK_EX wait on the same event */
325
return(((rw_lock_t *) cell->wait_object)->event);
329
/**********************************************************************
330
Reserves a wait array cell for waiting for an object.
331
The event of the cell is reset to nonsignalled state. */
334
sync_array_reserve_cell(
335
/*====================*/
336
sync_array_t* arr, /* in: wait array */
337
void* object, /* in: pointer to the object to wait for */
338
ulint type, /* in: lock request type */
339
const char* file, /* in: file where requested */
340
ulint line, /* in: line where requested */
341
ulint* index) /* out: index of the reserved cell */
350
sync_array_enter(arr);
354
/* Reserve a new cell. */
355
for (i = 0; i < arr->n_cells; i++) {
356
cell = sync_array_get_nth_cell(arr, i);
358
if (cell->wait_object == NULL) {
360
cell->waiting = FALSE;
361
cell->wait_object = object;
363
if (type == SYNC_MUTEX) {
364
cell->old_wait_mutex = object;
366
cell->old_wait_rw_lock = object;
369
cell->request_type = type;
378
sync_array_exit(arr);
380
/* Make sure the event is reset and also store
381
the value of signal_count at which the event
383
event = sync_cell_get_event(cell);
384
cell->signal_count = os_event_reset(event);
386
cell->reservation_time = time(NULL);
388
cell->thread = os_thread_get_curr_id();
394
ut_error; /* No free cell found */
399
/**********************************************************************
400
This function should be called when a thread starts to wait on
401
a wait array cell. In the debug version this function checks
402
if the wait for a semaphore will result in a deadlock, in which
403
case prints info and asserts. */
406
sync_array_wait_event(
407
/*==================*/
408
sync_array_t* arr, /* in: wait array */
409
ulint index) /* in: index of the reserved cell */
416
sync_array_enter(arr);
418
cell = sync_array_get_nth_cell(arr, index);
420
ut_a(cell->wait_object);
421
ut_a(!cell->waiting);
422
ut_ad(os_thread_get_curr_id() == cell->thread);
424
event = sync_cell_get_event(cell);
425
cell->waiting = TRUE;
427
#ifdef UNIV_SYNC_DEBUG
429
/* We use simple enter to the mutex below, because if
430
we cannot acquire it at once, mutex_enter would call
431
recursively sync_array routines, leading to trouble.
432
rw_lock_debug_mutex freezes the debug lists. */
434
rw_lock_debug_mutex_enter();
436
if (TRUE == sync_array_detect_deadlock(arr, cell, cell, 0)) {
438
fputs("########################################\n", stderr);
442
rw_lock_debug_mutex_exit();
444
sync_array_exit(arr);
446
os_event_wait_low(event, cell->signal_count);
448
sync_array_free_cell(arr, index);
451
/**********************************************************************
452
Reports info of a wait array cell. */
455
sync_array_cell_print(
456
/*==================*/
457
FILE* file, /* in: file where to print */
458
sync_cell_t* cell) /* in: sync cell */
465
type = cell->request_type;
468
"--Thread %lu has waited at %s line %lu"
469
" for %.2f seconds the semaphore:\n",
470
(ulong) os_thread_pf(cell->thread), cell->file,
472
difftime(time(NULL), cell->reservation_time));
474
if (type == SYNC_MUTEX) {
475
/* We use old_wait_mutex in case the cell has already
476
been freed meanwhile */
477
mutex = cell->old_wait_mutex;
480
"Mutex at %p created file %s line %lu, lock var %lu\n"
481
#ifdef UNIV_SYNC_DEBUG
482
"Last time reserved in file %s line %lu, "
483
#endif /* UNIV_SYNC_DEBUG */
484
"waiters flag %lu\n",
485
(void*) mutex, mutex->cfile_name, (ulong) mutex->cline,
486
(ulong) mutex->lock_word,
487
#ifdef UNIV_SYNC_DEBUG
488
mutex->file_name, (ulong) mutex->line,
489
#endif /* UNIV_SYNC_DEBUG */
490
(ulong) mutex->waiters);
492
} else if (type == RW_LOCK_EX
493
|| type == RW_LOCK_WAIT_EX
494
|| type == RW_LOCK_SHARED) {
496
fputs(type == RW_LOCK_EX ? "X-lock on" : "S-lock on", file);
498
rwlock = cell->old_wait_rw_lock;
501
" RW-latch at %p created in file %s line %lu\n",
502
(void*) rwlock, rwlock->cfile_name,
503
(ulong) rwlock->cline);
504
writer = rw_lock_get_writer(rwlock);
505
if (writer != RW_LOCK_NOT_LOCKED) {
507
"a writer (thread id %lu) has"
508
" reserved it in mode %s",
509
(ulong) os_thread_pf(rwlock->writer_thread),
512
: " wait exclusive\n");
516
"number of readers %lu, waiters flag %lu, "
518
"Last time read locked in file %s line %lu\n"
519
"Last time write locked in file %s line %lu\n",
520
(ulong) rw_lock_get_reader_count(rwlock),
521
(ulong) rwlock->waiters,
523
rwlock->last_s_file_name,
524
(ulong) rwlock->last_s_line,
525
rwlock->last_x_file_name,
526
(ulong) rwlock->last_x_line);
531
if (!cell->waiting) {
532
fputs("wait has ended\n", file);
536
#ifdef UNIV_SYNC_DEBUG
537
/**********************************************************************
538
Looks for a cell with the given thread id. */
541
sync_array_find_thread(
542
/*===================*/
543
/* out: pointer to cell or NULL
545
sync_array_t* arr, /* in: wait array */
546
os_thread_id_t thread) /* in: thread id */
551
for (i = 0; i < arr->n_cells; i++) {
553
cell = sync_array_get_nth_cell(arr, i);
555
if (cell->wait_object != NULL
556
&& os_thread_eq(cell->thread, thread)) {
558
return(cell); /* Found */
562
return(NULL); /* Not found */
565
/**********************************************************************
566
Recursion step for deadlock detection. */
569
sync_array_deadlock_step(
570
/*=====================*/
571
/* out: TRUE if deadlock detected */
572
sync_array_t* arr, /* in: wait array; NOTE! the caller must
573
own the mutex to array */
574
sync_cell_t* start, /* in: cell where recursive search
576
os_thread_id_t thread, /* in: thread to look at */
577
ulint pass, /* in: pass value */
578
ulint depth) /* in: recursion depth */
586
/* If pass != 0, then we do not know which threads are
587
responsible of releasing the lock, and no deadlock can
593
new = sync_array_find_thread(arr, thread);
596
/* Stop running of other threads */
598
ut_dbg_stop_threads = TRUE;
601
fputs("########################################\n"
602
"DEADLOCK of threads detected!\n", stderr);
607
ret = sync_array_detect_deadlock(arr, start, new, depth);
616
/**********************************************************************
617
This function is called only in the debug version. Detects a deadlock
618
of one or more threads because of waits of semaphores. */
621
sync_array_detect_deadlock(
622
/*=======================*/
623
/* out: TRUE if deadlock detected */
624
sync_array_t* arr, /* in: wait array; NOTE! the caller must
625
own the mutex to array */
626
sync_cell_t* start, /* in: cell where recursive search started */
627
sync_cell_t* cell, /* in: cell to search */
628
ulint depth) /* in: recursion depth */
632
os_thread_id_t thread;
634
rw_lock_debug_t*debug;
639
ut_ad(cell->wait_object);
640
ut_ad(os_thread_get_curr_id() == start->thread);
645
if (!cell->waiting) {
647
return(FALSE); /* No deadlock here */
650
if (cell->request_type == SYNC_MUTEX) {
652
mutex = cell->wait_object;
654
if (mutex_get_lock_word(mutex) != 0) {
656
thread = mutex->thread_id;
658
/* Note that mutex->thread_id above may be
659
also OS_THREAD_ID_UNDEFINED, because the
660
thread which held the mutex maybe has not
661
yet updated the value, or it has already
662
released the mutex: in this case no deadlock
663
can occur, as the wait array cannot contain
664
a thread with ID_UNDEFINED value. */
666
ret = sync_array_deadlock_step(arr, start, thread, 0,
670
"Mutex %p owned by thread %lu file %s line %lu\n",
671
mutex, (ulong) os_thread_pf(mutex->thread_id),
672
mutex->file_name, (ulong) mutex->line);
673
sync_array_cell_print(stderr, cell);
679
return(FALSE); /* No deadlock */
681
} else if (cell->request_type == RW_LOCK_EX
682
|| cell->request_type == RW_LOCK_WAIT_EX) {
684
lock = cell->wait_object;
686
debug = UT_LIST_GET_FIRST(lock->debug_list);
688
while (debug != NULL) {
690
thread = debug->thread_id;
692
if (((debug->lock_type == RW_LOCK_EX)
693
&& !os_thread_eq(thread, cell->thread))
694
|| ((debug->lock_type == RW_LOCK_WAIT_EX)
695
&& !os_thread_eq(thread, cell->thread))
696
|| (debug->lock_type == RW_LOCK_SHARED)) {
698
/* The (wait) x-lock request can block
699
infinitely only if someone (can be also cell
700
thread) is holding s-lock, or someone
701
(cannot be cell thread) (wait) x-lock, and
702
he is blocked by start thread */
704
ret = sync_array_deadlock_step(
705
arr, start, thread, debug->pass,
709
fprintf(stderr, "rw-lock %p ",
711
sync_array_cell_print(stderr, cell);
712
rw_lock_debug_print(debug);
717
debug = UT_LIST_GET_NEXT(list, debug);
722
} else if (cell->request_type == RW_LOCK_SHARED) {
724
lock = cell->wait_object;
725
debug = UT_LIST_GET_FIRST(lock->debug_list);
727
while (debug != NULL) {
729
thread = debug->thread_id;
731
if ((debug->lock_type == RW_LOCK_EX)
732
|| (debug->lock_type == RW_LOCK_WAIT_EX)) {
734
/* The s-lock request can block infinitely
735
only if someone (can also be cell thread) is
736
holding (wait) x-lock, and he is blocked by
739
ret = sync_array_deadlock_step(
740
arr, start, thread, debug->pass,
747
debug = UT_LIST_GET_NEXT(list, debug);
756
return(TRUE); /* Execution never reaches this line: for compiler
759
#endif /* UNIV_SYNC_DEBUG */
761
/**********************************************************************
762
Determines if we can wake up the thread waiting for a sempahore. */
765
sync_arr_cell_can_wake_up(
766
/*======================*/
767
sync_cell_t* cell) /* in: cell to search */
772
if (cell->request_type == SYNC_MUTEX) {
774
mutex = cell->wait_object;
776
if (mutex_get_lock_word(mutex) == 0) {
781
} else if (cell->request_type == RW_LOCK_EX) {
783
lock = cell->wait_object;
785
if (lock->lock_word > 0) {
786
/* Either unlocked or only read locked. */
791
} else if (cell->request_type == RW_LOCK_WAIT_EX) {
793
lock = cell->wait_object;
795
/* lock_word == 0 means all readers have left */
796
if (lock->lock_word == 0) {
800
} else if (cell->request_type == RW_LOCK_SHARED) {
801
lock = cell->wait_object;
803
/* lock_word > 0 means no writer or reserved writer */
804
if (lock->lock_word > 0) {
813
/**********************************************************************
814
Frees the cell. NOTE! sync_array_wait_event frees the cell
818
sync_array_free_cell(
819
/*=================*/
820
sync_array_t* arr, /* in: wait array */
821
ulint index) /* in: index of the cell in array */
825
sync_array_enter(arr);
827
cell = sync_array_get_nth_cell(arr, index);
829
ut_a(cell->wait_object != NULL);
831
cell->waiting = FALSE;
832
cell->wait_object = NULL;
833
cell->signal_count = 0;
835
ut_a(arr->n_reserved > 0);
838
sync_array_exit(arr);
841
/**************************************************************************
842
Increments the signalled count. */
845
sync_array_object_signalled(
846
/*========================*/
847
sync_array_t* arr) /* in: wait array */
849
#ifdef HAVE_ATOMIC_BUILTINS
850
(void) os_atomic_increment_ulint(&arr->sg_count, 1);
852
sync_array_enter(arr);
856
sync_array_exit(arr);
860
/**************************************************************************
861
If the wakeup algorithm does not work perfectly at semaphore relases,
862
this function will do the waking (see the comment in mutex_exit). This
863
function should be called about every 1 second in the server.
865
Note that there's a race condition between this thread and mutex_exit
866
changing the lock_word and calling signal_object, so sometimes this finds
867
threads to wake up even when nothing has gone wrong. */
870
sync_arr_wake_threads_if_sema_free(void)
871
/*====================================*/
873
sync_array_t* arr = sync_primary_wait_array;
879
sync_array_enter(arr);
884
while (count < arr->n_reserved) {
886
cell = sync_array_get_nth_cell(arr, i);
889
if (cell->wait_object == NULL) {
894
if (sync_arr_cell_can_wake_up(cell)) {
896
event = sync_cell_get_event(cell);
903
sync_array_exit(arr);
906
/**************************************************************************
907
Prints warnings of long semaphore waits to stderr. */
910
sync_array_print_long_waits(void)
911
/*=============================*/
912
/* out: TRUE if fatal semaphore wait threshold
917
ibool noticed = FALSE;
919
ulint fatal_timeout = srv_fatal_semaphore_wait_threshold;
922
for (i = 0; i < sync_primary_wait_array->n_cells; i++) {
924
cell = sync_array_get_nth_cell(sync_primary_wait_array, i);
926
if (cell->wait_object != NULL && cell->waiting
927
&& difftime(time(NULL), cell->reservation_time) > 240) {
928
fputs("InnoDB: Warning: a long semaphore wait:\n",
930
sync_array_cell_print(stderr, cell);
934
if (cell->wait_object != NULL && cell->waiting
935
&& difftime(time(NULL), cell->reservation_time)
943
"InnoDB: ###### Starts InnoDB Monitor"
944
" for 30 secs to print diagnostic info:\n");
945
old_val = srv_print_innodb_monitor;
947
/* If some crucial semaphore is reserved, then also the InnoDB
948
Monitor can hang, and we do not get diagnostics. Since in
949
many cases an InnoDB hang is caused by a pwrite() or a pread()
950
call hanging inside the operating system, let us print right
951
now the values of pending calls of these. */
954
"InnoDB: Pending preads %lu, pwrites %lu\n",
955
(ulong)os_file_n_pending_preads,
956
(ulong)os_file_n_pending_pwrites);
958
srv_print_innodb_monitor = TRUE;
959
os_event_set(srv_lock_timeout_thread_event);
961
os_thread_sleep(30000000);
963
srv_print_innodb_monitor = old_val;
965
"InnoDB: ###### Diagnostic info printed"
966
" to the standard error stream\n");
972
/**************************************************************************
973
Prints info of the wait array. */
976
sync_array_output_info(
977
/*===================*/
978
FILE* file, /* in: file where to print */
979
sync_array_t* arr) /* in: wait array; NOTE! caller must own the
987
"OS WAIT ARRAY INFO: reservation count %ld, signal count %ld\n",
988
(long) arr->res_count, (long) arr->sg_count);
992
while (count < arr->n_reserved) {
994
cell = sync_array_get_nth_cell(arr, i);
996
if (cell->wait_object != NULL) {
998
sync_array_cell_print(file, cell);
1005
/**************************************************************************
1006
Prints info of the wait array. */
1009
sync_array_print_info(
1010
/*==================*/
1011
FILE* file, /* in: file where to print */
1012
sync_array_t* arr) /* in: wait array */
1014
sync_array_enter(arr);
1016
sync_array_output_info(file, arr);
1018
sync_array_exit(arr);