1
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
4
COPYING CONDITIONS NOTICE:
6
This program is free software; you can redistribute it and/or modify
7
it under the terms of version 2 of the GNU General Public License as
8
published by the Free Software Foundation, and provided that the
9
following conditions are met:
11
* Redistributions of source code must retain this COPYING
12
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
13
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
14
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
17
* Redistributions in binary form must reproduce this COPYING
18
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
19
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
20
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
21
GRANT (below) in the documentation and/or other materials
22
provided with the distribution.
24
You should have received a copy of the GNU General Public License
25
along with this program; if not, write to the Free Software
26
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
31
TokuDB, Tokutek Fractal Tree Indexing Library.
32
Copyright (C) 2007-2013 Tokutek, Inc.
36
This program is distributed in the hope that it will be useful, but
37
WITHOUT ANY WARRANTY; without even the implied warranty of
38
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
39
General Public License for more details.
41
UNIVERSITY PATENT NOTICE:
43
The technology is licensed by the Massachusetts Institute of
44
Technology, Rutgers State University of New Jersey, and the Research
45
Foundation of State University of New York at Stony Brook under
46
United States of America Serial No. 11/760379 and to the patents
47
and/or patent applications resulting from it.
49
PATENT MARKING NOTICE:
51
This software is covered by US Patent No. 8,185,551.
52
This software is covered by US Patent No. 8,489,638.
56
"THIS IMPLEMENTATION" means the copyrightable works distributed by
57
Tokutek as part of the Fractal Tree project.
59
"PATENT CLAIMS" means the claims of patents that are owned or
60
licensable by Tokutek, both currently or in the future; and that in
61
the absence of this license would be infringed by THIS
62
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
64
"PATENT CHALLENGE" shall mean a challenge to the validity,
65
patentability, enforceability and/or non-infringement of any of the
66
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
68
Tokutek hereby grants to you, for the term and geographical scope of
69
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
70
irrevocable (except as stated in this section) patent license to
71
make, have made, use, offer to sell, sell, import, transfer, and
72
otherwise run, modify, and propagate the contents of THIS
73
IMPLEMENTATION, where such license applies only to the PATENT
74
CLAIMS. This grant does not include claims that would be infringed
75
only as a consequence of further modifications of THIS
76
IMPLEMENTATION. If you or your agent or licensee institute or order
77
or agree to the institution of patent litigation against any entity
78
(including a cross-claim or counterclaim in a lawsuit) alleging that
79
THIS IMPLEMENTATION constitutes direct or contributory patent
80
infringement, or inducement of patent infringement, then any rights
81
granted to you under this License shall terminate as of the date
82
such litigation is filed. If you or your agent or exclusive
83
licensee institute or order or agree to the institution of a PATENT
84
CHALLENGE, then Tokutek may terminate any rights granted to you
88
#ident "Copyright (c) 2009-2013 Tokutek Inc. All rights reserved."
89
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
93
* The purpose of this file is to implement the high-level logic for
94
* taking a checkpoint.
96
* There are three locks used for taking a checkpoint. They are listed below.
98
* NOTE: The reader-writer locks may be held by either multiple clients
99
* or the checkpoint function. (The checkpoint function has the role
100
* of the writer, the clients have the reader roles.)
102
* - multi_operation_lock
103
* This is a new reader-writer lock.
104
* This lock is held by the checkpoint function only for as long as is required to
105
* set all the "pending" bits and to create the checkpoint-in-progress versions
106
* of the header and translation table (btt).
107
* The following operations must take the multi_operation_lock:
108
* - any set of operations that must be atomic with respect to begin checkpoint
110
* - checkpoint_safe_lock
111
* This is a new reader-writer lock.
112
* This lock is held for the entire duration of the checkpoint.
113
* It is used to prevent more than one checkpoint from happening at a time
114
* (the checkpoint function is non-re-entrant), and to prevent certain operations
115
* that should not happen during a checkpoint.
116
* The following operations must take the checkpoint_safe lock:
117
* - delete a dictionary
118
* - rename a dictionary
119
* The application can use this lock to disable checkpointing during other sensitive
120
* operations, such as making a backup copy of the database.
122
* Once the "pending" bits are set and the snapshots are taken of the header and btt,
123
* most normal database operations are permitted to resume.
129
#include <toku_portability.h>
133
#include "cachetable.h"
134
#include "log-internal.h"
136
#include "checkpoint.h"
137
#include <portability/toku_atomic.h>
138
#include <util/status.h>
139
#include <util/frwlock.h>
141
///////////////////////////////////////////////////////////////////////////////////
144
// Status is intended for display to humans to help understand system behavior.
145
// It does not need to be perfectly thread-safe.
147
// File-local checkpoint status table. It is read and written through the
// STATUS_VALUE() macro and excluded from Helgrind checking by
// toku_checkpoint_helgrind_ignore().
static CHECKPOINT_STATUS_S cp_status;
149
// Forwards one status-row description to TOKUDB_STATUS_INIT for cp_status.
// The legend must be a string literal: it is concatenated onto "checkpoint: ".
#define STATUS_INIT(keyname, c, type, legend, include) TOKUDB_STATUS_INIT(cp_status, keyname, c, type, "checkpoint: " legend, include)
153
// Describes every checkpoint status row through STATUS_INIT and then marks
// cp_status as initialized.
// Note, this function initializes the keyname, type, and legend fields.
// NOTE(review): the enclosing function header and closing brace are not
// visible in this view; the bare numeric lines look like residue of an
// embedded line index.
154
// Value fields are initialized to zero by compiler (cp_status has static storage).
156
// Rows that pass nullptr as the second argument are flagged TOKU_ENGINE_STATUS
// only; rows that pass a CHECKPOINT_* name are flagged
// TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS.
STATUS_INIT(CP_PERIOD, CHECKPOINT_PERIOD, UINT64, "period", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
157
STATUS_INIT(CP_FOOTPRINT, nullptr, UINT64, "footprint", TOKU_ENGINE_STATUS);
158
STATUS_INIT(CP_TIME_LAST_CHECKPOINT_BEGIN, CHECKPOINT_LAST_BEGAN, UNIXTIME, "last checkpoint began ", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
159
STATUS_INIT(CP_TIME_LAST_CHECKPOINT_BEGIN_COMPLETE, CHECKPOINT_LAST_COMPLETE_BEGAN, UNIXTIME, "last complete checkpoint began ", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
160
STATUS_INIT(CP_TIME_LAST_CHECKPOINT_END, CHECKPOINT_LAST_COMPLETE_ENDED, UNIXTIME, "last complete checkpoint ended", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
161
STATUS_INIT(CP_TIME_CHECKPOINT_DURATION, CHECKPOINT_DURATION, UINT64, "time spent during checkpoint (begin and end phases)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
162
STATUS_INIT(CP_TIME_CHECKPOINT_DURATION_LAST, CHECKPOINT_DURATION_LAST, UINT64, "time spent during last checkpoint (begin and end phases)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
163
STATUS_INIT(CP_LAST_LSN, nullptr, UINT64, "last complete checkpoint LSN", TOKU_ENGINE_STATUS);
164
STATUS_INIT(CP_CHECKPOINT_COUNT, CHECKPOINT_TAKEN, UINT64, "checkpoints taken ", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
165
STATUS_INIT(CP_CHECKPOINT_COUNT_FAIL, CHECKPOINT_FAILED, UINT64, "checkpoints failed", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
166
STATUS_INIT(CP_WAITERS_NOW, nullptr, UINT64, "waiters now", TOKU_ENGINE_STATUS);
167
STATUS_INIT(CP_WAITERS_MAX, nullptr, UINT64, "waiters max", TOKU_ENGINE_STATUS);
168
STATUS_INIT(CP_CLIENT_WAIT_ON_MO, nullptr, UINT64, "non-checkpoint client wait on mo lock", TOKU_ENGINE_STATUS);
169
STATUS_INIT(CP_CLIENT_WAIT_ON_CS, nullptr, UINT64, "non-checkpoint client wait on cs lock", TOKU_ENGINE_STATUS);
171
STATUS_INIT(CP_BEGIN_TIME, CHECKPOINT_BEGIN_TIME, UINT64, "checkpoint begin time", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
172
STATUS_INIT(CP_LONG_BEGIN_COUNT, CHECKPOINT_LONG_BEGIN_COUNT, UINT64, "long checkpoint begin count", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
173
STATUS_INIT(CP_LONG_BEGIN_TIME, CHECKPOINT_LONG_BEGIN_TIME, UINT64, "long checkpoint begin time", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
175
// toku_checkpoint_get_status() tests this flag.
cp_status.initialized = true;
179
// Shorthand for the numeric value slot of checkpoint status row `row`;
// the expansion is an lvalue, so it can be assigned to or incremented.
#define STATUS_VALUE(row) cp_status.status[row].value.num
182
// Status accessor for the checkpoint subsystem.
//   ct    - cachetable whose checkpoint period is queried.
//   statp - status output parameter; its use is not visible in this view.
// NOTE(review): the return type, the statement controlled by the `if`, and
// the end of the function are not visible here (the embedded numbering skips
// 184 and stops at 185). Confirm against the full file.
toku_checkpoint_get_status(CACHETABLE ct, CHECKPOINT_STATUS statp) {
183
// cp_status.initialized is set to true once the status rows have been described.
if (!cp_status.initialized)
185
// Refresh the period row from the cachetable on each call.
// NOTE(review): presumably unconditional rather than the body of the `if`
// above — confirm.
STATUS_VALUE(CP_PERIOD) = toku_get_checkpoint_period_unlocked(ct);
191
// Copied from logger->last_completed_checkpoint_lsn near the end of
// toku_checkpoint(), then used to trim the log and to fill the CP_LAST_LSN row.
static LSN last_completed_checkpoint_lsn;
193
// Mutex handed to checkpoint_safe_lock.init(); it is held around every
// lock/unlock call made on checkpoint_safe_lock in this file.
static toku_mutex_t checkpoint_safe_mutex;
194
// Write-locked by the checkpoint for the whole of toku_checkpoint();
// read-locked by toku_checkpoint_safe_client_lock().
static toku::frwlock checkpoint_safe_lock;
195
// Write-locked by the checkpoint around the begin phase; read-locked by
// toku_multi_operation_client_lock().
static toku_pthread_rwlock_t multi_operation_lock;
196
// Write-locked by the checkpoint before multi_operation_lock; read-locked by
// toku_low_priority_multi_operation_client_lock().
static toku_pthread_rwlock_t low_priority_multi_operation_lock;
198
static bool initialized = false; // sanity check
199
// locked_mo and locked_cs are excluded from Helgrind checking by
// toku_checkpoint_helgrind_ignore().
static volatile bool locked_mo = false; // true when the multi_operation write lock is held (by checkpoint)
200
static volatile bool locked_cs = false; // true when the checkpoint_safe write lock is held (by checkpoint)
201
// A begin phase whose measured duration is at least this value is counted in
// the CP_LONG_BEGIN_* rows. It is compared against a difference of
// toku_current_time_microsec() readings, so the unit is presumably
// microseconds (1000000 = one second) — confirm.
static volatile uint64_t toku_checkpoint_long_threshold = 1000000;
203
// Note following static functions are called from checkpoint internal logic only,
204
// and use the "writer" calls for locking and unlocking.
207
// Initializes multi_operation_lock and low_priority_multi_operation_lock from
// one shared, temporary rwlock attribute object.
// NOTE(review): the return type, the #else/#endif lines and the closing brace
// are not visible in this view.
multi_operation_lock_init(void) {
208
pthread_rwlockattr_t attr;
209
pthread_rwlockattr_init(&attr);
210
// Where the platform offers it, ask for a writer-preferring, non-recursive
// rwlock; the writer of these locks is the checkpoint.
#if defined(HAVE_PTHREAD_RWLOCKATTR_SETKIND_NP)
211
pthread_rwlockattr_setkind_np(&attr, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
213
// TODO: need to figure out how to make writer-preferential rwlocks
216
// Both locks are created with the same attributes.
toku_pthread_rwlock_init(&multi_operation_lock, &attr);
217
toku_pthread_rwlock_init(&low_priority_multi_operation_lock, &attr);
218
// The attribute object is only needed during initialization.
pthread_rwlockattr_destroy(&attr);
223
// Destroys the two rwlocks created by multi_operation_lock_init(), in the
// same order they were initialized.
multi_operation_lock_destroy(void) {
224
toku_pthread_rwlock_destroy(&multi_operation_lock);
225
toku_pthread_rwlock_destroy(&low_priority_multi_operation_lock);
229
// Checkpoint-side (writer) acquisition of the multi-operation locks.
// The low-priority lock is taken first, then the regular one.
// NOTE(review): the embedded numbering jumps from 231 to 236 after the last
// visible statement; confirm nothing else belongs in this body.
multi_operation_checkpoint_lock(void) {
230
toku_pthread_rwlock_wrlock(&low_priority_multi_operation_lock);
231
toku_pthread_rwlock_wrlock(&multi_operation_lock);
236
// Checkpoint-side (writer) release of the multi-operation locks, in the
// reverse of the order used by multi_operation_checkpoint_lock().
// NOTE(review): the embedded numbering skips 237 right after the header;
// confirm no statement is missing there.
multi_operation_checkpoint_unlock(void) {
238
toku_pthread_rwlock_wrunlock(&multi_operation_lock);
239
toku_pthread_rwlock_wrunlock(&low_priority_multi_operation_lock);
243
// Sets up the checkpoint_safe lock: initializes its mutex, then initializes
// the frwlock with that mutex.
// NOTE(review): the embedded numbering jumps from 245 to 250 after the last
// visible statement; confirm nothing else belongs in this body.
checkpoint_safe_lock_init(void) {
244
toku_mutex_init(&checkpoint_safe_mutex, NULL);
245
checkpoint_safe_lock.init(&checkpoint_safe_mutex);
250
// Tears down the checkpoint_safe lock in the reverse order of
// checkpoint_safe_lock_init(): the frwlock first, then the mutex it was
// initialized with.
checkpoint_safe_lock_destroy(void) {
251
checkpoint_safe_lock.deinit();
252
toku_mutex_destroy(&checkpoint_safe_mutex);
256
// Checkpoint-side (writer) acquisition of checkpoint_safe_lock.
// The frwlock call is made while holding checkpoint_safe_mutex, the mutex the
// lock was initialized with.
checkpoint_safe_checkpoint_lock(void) {
257
toku_mutex_lock(&checkpoint_safe_mutex);
258
// NOTE(review): the meaning of the `false` argument is defined by
// toku::frwlock::write_lock — confirm in util/frwlock.h.
checkpoint_safe_lock.write_lock(false);
259
toku_mutex_unlock(&checkpoint_safe_mutex);
264
// Checkpoint-side (writer) release of checkpoint_safe_lock, performed while
// holding checkpoint_safe_mutex.
// NOTE(review): the embedded numbering skips 265 right after the header;
// confirm no statement is missing there.
checkpoint_safe_checkpoint_unlock(void) {
266
toku_mutex_lock(&checkpoint_safe_mutex);
267
checkpoint_safe_lock.write_unlock();
268
toku_mutex_unlock(&checkpoint_safe_mutex);
271
// toku_xxx_client_(un)lock() functions are only called from client code,
272
// never from checkpoint code, and use the "reader" interface to the lock functions.
275
// Client-side (reader) acquisition of multi_operation_lock.
// NOTE(review): the embedded numbering skips 276 just before the counter
// update, so a line (possibly a condition guarding it) may be missing.
toku_multi_operation_client_lock(void) {
277
// Bump the "non-checkpoint client wait on mo lock" status counter.
(void) toku_sync_fetch_and_add(&STATUS_VALUE(CP_CLIENT_WAIT_ON_MO), 1);
278
toku_pthread_rwlock_rdlock(&multi_operation_lock);
282
// Client-side (reader) release of multi_operation_lock; pairs with
// toku_multi_operation_client_lock().
toku_multi_operation_client_unlock(void) {
283
toku_pthread_rwlock_rdunlock(&multi_operation_lock);
286
// Client-side (reader) acquisition of low_priority_multi_operation_lock.
// The checkpoint write-locks this lock before multi_operation_lock.
void toku_low_priority_multi_operation_client_lock(void) {
287
toku_pthread_rwlock_rdlock(&low_priority_multi_operation_lock);
290
// Client-side (reader) release of low_priority_multi_operation_lock; pairs
// with toku_low_priority_multi_operation_client_lock().
void toku_low_priority_multi_operation_client_unlock(void) {
291
toku_pthread_rwlock_rdunlock(&low_priority_multi_operation_lock);
295
// Client-side (reader) acquisition of checkpoint_safe_lock, followed by the
// client side of the multi-operation lock.
// NOTE(review): the embedded numbering skips 296 just before the counter
// update, so a line (possibly a condition guarding it) may be missing.
toku_checkpoint_safe_client_lock(void) {
297
// Bump the "non-checkpoint client wait on cs lock" status counter.
(void) toku_sync_fetch_and_add(&STATUS_VALUE(CP_CLIENT_WAIT_ON_CS), 1);
298
// frwlock calls are made while holding the mutex the lock was initialized with.
toku_mutex_lock(&checkpoint_safe_mutex);
299
checkpoint_safe_lock.read_lock();
300
toku_mutex_unlock(&checkpoint_safe_mutex);
301
// Holders of the checkpoint_safe read lock also hold the multi-operation read lock.
toku_multi_operation_client_lock();
305
// Releases what toku_checkpoint_safe_client_lock() acquired: first the
// checkpoint_safe read lock (under its mutex), then the multi-operation read
// lock. This is the same order as acquisition, not the reverse.
toku_checkpoint_safe_client_unlock(void) {
306
toku_mutex_lock(&checkpoint_safe_mutex);
307
checkpoint_safe_lock.read_unlock();
308
toku_mutex_unlock(&checkpoint_safe_mutex);
309
toku_multi_operation_client_unlock();
312
// Initialize the checkpoint mechanism, must be called before any client operations.
314
// Creates the locks used by the checkpoint mechanism: the two multi-operation
// rwlocks, then the checkpoint_safe lock and its mutex.
// NOTE(review): the embedded numbering jumps from 316 to 321 after the last
// visible statement; confirm nothing else belongs in this body.
toku_checkpoint_init(void) {
315
multi_operation_lock_init();
316
checkpoint_safe_lock_init();
321
// Destroys the locks created by toku_checkpoint_init(), in the same order
// they were created.
// NOTE(review): the embedded numbering jumps from 323 to 327 after the last
// visible statement; confirm nothing else belongs in this body.
toku_checkpoint_destroy(void) {
322
multi_operation_lock_destroy();
323
checkpoint_safe_lock_destroy();
327
// Records how far toku_checkpoint() has progressed in the CP_FOOTPRINT status
// row. Expands to an assignment of footprint_offset + (x), so it may only be
// used where a local named footprint_offset is in scope.
// The argument and the whole expansion are parenthesized so that an expression
// argument (e.g. `c ? 10 : 20`) is added as a unit instead of being re-grouped
// with `footprint_offset +`.
#define SET_CHECKPOINT_FOOTPRINT(x) (STATUS_VALUE(CP_FOOTPRINT) = footprint_offset + (x))
330
// Take a checkpoint of all currently open dictionaries
332
// Runs one checkpoint and updates the checkpoint status rows.
//   cp, logger          - passed to toku_cachetable_begin_checkpoint() and
//                         toku_cachetable_end_checkpoint(); logger also supplies
//                         the last completed checkpoint LSN.
//   callback_f, extra   - invoked as callback_f(extra) between the begin and end
//                         phases, with the checkpoint_safe lock still held.
//   callback2_f, extra2 - forwarded to toku_cachetable_end_checkpoint().
//   caller_id           - scaled by 1000 to form the base of the CP_FOOTPRINT value.
// NOTE(review): the return type and closing braces are not visible, and the
// embedded numbering skips several values inside the body (337-339, 343, 351,
// 357, 360, 362, 364, 366, 368, 372-373, 383, 387). Some of those may be
// dropped statements (for example around the callback and the `if` bodies);
// confirm against the full file before changing any code here.
toku_checkpoint(CHECKPOINTER cp, TOKULOGGER logger,
333
void (*callback_f)(void*), void * extra,
334
void (*callback2_f)(void*), void * extra2,
335
checkpoint_caller_t caller_id) {
336
// CP_FOOTPRINT = caller_id * 1000 + stage, so the row identifies both the
// caller and the last stage reached.
int footprint_offset = (int) caller_id * 1000;
340
// Count this thread as a waiter only while it waits for the checkpoint_safe
// write lock.
(void) toku_sync_fetch_and_add(&STATUS_VALUE(CP_WAITERS_NOW), 1);
341
checkpoint_safe_checkpoint_lock();
342
(void) toku_sync_fetch_and_sub(&STATUS_VALUE(CP_WAITERS_NOW), 1);
344
// The maximum is sampled after this thread has removed itself from the count.
if (STATUS_VALUE(CP_WAITERS_NOW) > STATUS_VALUE(CP_WAITERS_MAX))
345
STATUS_VALUE(CP_WAITERS_MAX) = STATUS_VALUE(CP_WAITERS_NOW); // threadsafe, within checkpoint_safe lock
347
SET_CHECKPOINT_FOOTPRINT(10);
348
// Begin phase: hold both multi-operation write locks and the ft open/close lock.
multi_operation_checkpoint_lock();
349
SET_CHECKPOINT_FOOTPRINT(20);
350
toku_ft_open_close_lock();
352
SET_CHECKPOINT_FOOTPRINT(30);
353
// Wall-clock start of the checkpoint (time() resolution).
STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN) = time(NULL);
354
// Only the begin_checkpoint call is timed with the finer-grained clock.
uint64_t t_checkpoint_begin_start = toku_current_time_microsec();
355
toku_cachetable_begin_checkpoint(cp, logger);
356
uint64_t t_checkpoint_begin_end = toku_current_time_microsec();
358
// End of the begin phase: release in the reverse order of acquisition.
toku_ft_open_close_unlock();
359
multi_operation_checkpoint_unlock();
361
SET_CHECKPOINT_FOOTPRINT(40);
363
callback_f(extra); // callback is called with checkpoint_safe_lock still held
365
toku_cachetable_end_checkpoint(cp, logger, callback2_f, extra2);
367
SET_CHECKPOINT_FOOTPRINT(50);
369
// Remember the LSN of the checkpoint just completed, trim the log with it,
// and publish it in the status table.
last_completed_checkpoint_lsn = logger->last_completed_checkpoint_lsn;
370
toku_logger_maybe_trim_log(logger, last_completed_checkpoint_lsn);
371
STATUS_VALUE(CP_LAST_LSN) = last_completed_checkpoint_lsn.lsn;
374
SET_CHECKPOINT_FOOTPRINT(60);
375
STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_END) = time(NULL);
376
// The begin time is copied to the "last complete checkpoint began" row only
// now that the checkpoint has finished.
STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN_COMPLETE) = STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN);
377
STATUS_VALUE(CP_CHECKPOINT_COUNT)++;
378
// Begin-phase duration, in the units of toku_current_time_microsec().
uint64_t duration = t_checkpoint_begin_end - t_checkpoint_begin_start;
379
STATUS_VALUE(CP_BEGIN_TIME) += duration;
380
// Begin phases at or above the threshold are additionally tallied as "long".
if (duration >= toku_checkpoint_long_threshold) {
381
STATUS_VALUE(CP_LONG_BEGIN_TIME) += duration;
382
STATUS_VALUE(CP_LONG_BEGIN_COUNT) += 1;
384
// Whole-checkpoint duration from the two time() samples. The (uint64_t)
// cast binds to the first operand only, not to the difference.
STATUS_VALUE(CP_TIME_CHECKPOINT_DURATION) += (uint64_t) ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_END)) - ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN));
385
STATUS_VALUE(CP_TIME_CHECKPOINT_DURATION_LAST) = (uint64_t) ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_END)) - ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN));
386
// A footprint of 0 means no checkpoint is in progress.
STATUS_VALUE(CP_FOOTPRINT) = 0;
388
checkpoint_safe_checkpoint_unlock();
392
#include <toku_race_tools.h>
393
// Declared with the constructor attribute so that it is run automatically
// during program/library initialization rather than being called explicitly.
void __attribute__((__constructor__)) toku_checkpoint_helgrind_ignore(void);
395
// Excludes the status table and the two "lock held" flags from race checking.
// NOTE(review): TOKU_VALGRIND_HG_DISABLE_CHECKING presumably wraps Helgrind's
// VALGRIND_HG_DISABLE_CHECKING — confirm in toku_race_tools.h.
toku_checkpoint_helgrind_ignore(void) {
396
TOKU_VALGRIND_HG_DISABLE_CHECKING(&cp_status, sizeof cp_status);
397
TOKU_VALGRIND_HG_DISABLE_CHECKING(&locked_mo, sizeof locked_mo);
398
TOKU_VALGRIND_HG_DISABLE_CHECKING(&locked_cs, sizeof locked_cs);
401
// The footprint macro depends on a local named footprint_offset, so it is
// removed once it is no longer needed.
#undef SET_CHECKPOINT_FOOTPRINT