1
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
4
COPYING CONDITIONS NOTICE:
6
This program is free software; you can redistribute it and/or modify
7
it under the terms of version 2 of the GNU General Public License as
8
published by the Free Software Foundation, and provided that the
9
following conditions are met:
11
* Redistributions of source code must retain this COPYING
12
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
13
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
14
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
17
* Redistributions in binary form must reproduce this COPYING
18
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
19
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
20
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
21
GRANT (below) in the documentation and/or other materials
22
provided with the distribution.
24
You should have received a copy of the GNU General Public License
25
along with this program; if not, write to the Free Software
26
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
31
TokuDB, Tokutek Fractal Tree Indexing Library.
32
Copyright (C) 2007-2013 Tokutek, Inc.
36
This program is distributed in the hope that it will be useful, but
37
WITHOUT ANY WARRANTY; without even the implied warranty of
38
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
39
General Public License for more details.
41
UNIVERSITY PATENT NOTICE:
43
The technology is licensed by the Massachusetts Institute of
44
Technology, Rutgers State University of New Jersey, and the Research
45
Foundation of State University of New York at Stony Brook under
46
United States of America Serial No. 11/760379 and to the patents
47
and/or patent applications resulting from it.
49
PATENT MARKING NOTICE:
51
This software is covered by US Patent No. 8,185,551.
52
This software is covered by US Patent No. 8,489,638.
56
"THIS IMPLEMENTATION" means the copyrightable works distributed by
57
Tokutek as part of the Fractal Tree project.
59
"PATENT CLAIMS" means the claims of patents that are owned or
60
licensable by Tokutek, both currently or in the future; and that in
61
the absence of this license would be infringed by THIS
62
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
64
"PATENT CHALLENGE" shall mean a challenge to the validity,
65
patentability, enforceability and/or non-infringement of any of the
66
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
68
Tokutek hereby grants to you, for the term and geographical scope of
69
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
70
irrevocable (except as stated in this section) patent license to
71
make, have made, use, offer to sell, sell, import, transfer, and
72
otherwise run, modify, and propagate the contents of THIS
73
IMPLEMENTATION, where such license applies only to the PATENT
74
CLAIMS. This grant does not include claims that would be infringed
75
only as a consequence of further modifications of THIS
76
IMPLEMENTATION. If you or your agent or licensee institute or order
77
or agree to the institution of patent litigation against any entity
78
(including a cross-claim or counterclaim in a lawsuit) alleging that
79
THIS IMPLEMENTATION constitutes direct or contributory patent
80
infringement, or inducement of patent infringement, then any rights
81
granted to you under this License shall terminate as of the date
82
such litigation is filed. If you or your agent or exclusive
83
licensee institute or order or agree to the institution of a PATENT
84
CHALLENGE, then Tokutek may terminate any rights granted to you
88
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
89
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
92
#include <toku_race_tools.h>
95
#include <ft/txn_manager.h>
96
#include <ft/log_header.h>
97
#include <ft/checkpoint.h>
99
#include <portability/toku_atomic.h>
101
#include "ydb-internal.h"
103
#include "ydb_row_lock.h"
105
// Returns the 64-bit id of txn, as reported by toku_txn_get_root_id for the
// TOKUTXN stored in the internal part of the DB_TXN.
static uint64_t toku_txn_id64(DB_TXN * txn) {
106
// Environment panic check on the env that owns txn (presumably returns
// early when the env has panicked -- confirm against the macro).
HANDLE_PANICKED_ENV(txn->mgrp);
107
return toku_txn_get_root_id(db_txn_struct_i(txn)->tokutxn);
110
// Releases the key ranges recorded in the lt_map of txn by passing each
// entry to toku_db_release_lt_key_ranges. The walk over the map is done
// with txn_mutex held.
static void toku_txn_release_locks(DB_TXN *txn) {
111
// Prevent access to the locktree map while releasing.
112
// It is possible for lock escalation to attempt to
113
// modify this data structure while the txn commits.
114
toku_mutex_lock(&db_txn_struct_i(txn)->txn_mutex);
116
// The entry count is read once, then entries are visited by index.
size_t num_ranges = db_txn_struct_i(txn)->lt_map.size();
117
for (size_t i = 0; i < num_ranges; i++) {
118
txn_lt_key_ranges ranges;
119
// Fetch entry i into ranges; r receives the status of the fetch.
int r = db_txn_struct_i(txn)->lt_map.fetch(i, &ranges);
121
toku_db_release_lt_key_ranges(txn, &ranges);
124
toku_mutex_unlock(&db_txn_struct_i(txn)->txn_mutex);
127
// Tears down the per-transaction state held in the internal part of txn:
// the lt_map, the underlying TOKUTXN, and the txn_mutex, in that order.
static void toku_txn_destroy(DB_TXN *txn) {
128
db_txn_struct_i(txn)->lt_map.destroy();
129
toku_txn_destroy_txn(db_txn_struct_i(txn)->tokutxn);
130
toku_mutex_destroy(&db_txn_struct_i(txn)->txn_mutex);
134
// Commits txn, first committing any live child transaction.
//
// Parameters:
//   txn             - transaction to finish; its wrapper is destroyed at the end.
//   flags           - DB_TXN_SYNC and DB_TXN_NOSYNC are interpreted and cleared here.
//   poll/poll_extra - progress callback and its argument, forwarded to the
//                     underlying abort/commit call.
//   release_mo_lock - when true, the multi operation client lock is released
//                     after the txn is completed and its row locks are released.
//   low_priority    - presumably selects the low priority unlock variant
//                     (the selecting condition is not visible here -- confirm).
static int toku_txn_commit(DB_TXN * txn, uint32_t flags,
135
TXN_PROGRESS_POLL_FUNCTION poll, void *poll_extra,
136
bool release_mo_lock, bool low_priority) {
137
HANDLE_PANICKED_ENV(txn->mgrp);
138
//Recursively kill off children
139
if (db_txn_struct_i(txn)->child) {
140
//commit of child sets the child pointer to NULL
141
// The child is committed with release_mo_lock=false and low_priority=false,
// so the recursive call does not release the multi operation lock.
int r_child = toku_txn_commit(db_txn_struct_i(txn)->child, flags, NULL, NULL, false, false);
142
if (r_child !=0 && !toku_env_is_panicked(txn->mgrp)) {
143
env_panic(txn->mgrp, r_child, "Recursive child commit failed during parent commit.\n");
145
//In a panicked env, the child may not be removed from the list.
146
HANDLE_PANICKED_ENV(txn->mgrp);
148
assert(!db_txn_struct_i(txn)->child);
151
// Unlink this txn from its parent: the parent must currently point at it.
assert(db_txn_struct_i(txn->parent)->child == txn);
152
db_txn_struct_i(txn->parent)->child=NULL;
154
// DB_TXN_SYNC requests an fsync on commit; the flag is consumed here.
if (flags & DB_TXN_SYNC) {
155
toku_txn_force_fsync_on_commit(db_txn_struct_i(txn)->tokutxn);
156
flags &= ~DB_TXN_SYNC;
158
// nosync is set when DB_TXN_NOSYNC is present either in the flags passed
// to this call or in the flags stored on the transaction.
int nosync = (flags & DB_TXN_NOSYNC)!=0 || (db_txn_struct_i(txn)->flags&DB_TXN_NOSYNC);
159
flags &= ~DB_TXN_NOSYNC;
164
// NOTE(review): both an abort and a commit call appear below; the condition
// choosing between them is not visible here -- confirm against the full file.
r = toku_txn_abort_txn(db_txn_struct_i(txn)->tokutxn, poll, poll_extra);
167
r = toku_txn_commit_txn(db_txn_struct_i(txn)->tokutxn, nosync,
170
// A failure escalates to an environment panic unless one is already active.
if (r!=0 && !toku_env_is_panicked(txn->mgrp)) {
171
env_panic(txn->mgrp, r, "Error during commit.\n");
173
//If panicked, we're done.
174
HANDLE_PANICKED_ENV(txn->mgrp);
177
TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn;
178
TOKULOGGER logger = txn->mgrp->i->logger;
181
// Capture the fsync decision and LSN before the txn is completed; they are
// used by toku_txn_maybe_fsync_log further down.
toku_txn_get_fsync_info(ttxn, &do_fsync, &do_fsync_lsn);
182
// remove the txn from the list of live transactions, and then
183
// release the lock tree locks. MVCC requires that toku_txn_complete_txn
184
// get called first, otherwise we have bugs, such as #4145 and #4153
185
toku_txn_complete_txn(ttxn);
186
toku_txn_release_locks(txn);
187
// this lock must be released after toku_txn_complete_txn and toku_txn_release_locks because
188
// this lock must be held until the references to the open FTs are released.
189
// begin checkpoint logs these associations, so we must protect
190
// the changing of these associations from checkpointing
191
if (release_mo_lock) {
193
toku_low_priority_multi_operation_client_unlock();
195
toku_multi_operation_client_unlock();
198
// The log fsync is issued after the unlock calls above.
toku_txn_maybe_fsync_log(logger, do_fsync_lsn, do_fsync);
204
toku_txn_destroy(txn);
208
// 32-bit id accessor installed as txn->id. The only return visible here
// yields the constant (uint32_t) -1 rather than a value derived from txn;
// toku_txn_id64 is the accessor that reads an id from the transaction.
static uint32_t toku_txn_id(DB_TXN * txn) {
209
HANDLE_PANICKED_ENV(txn->mgrp);
211
return (uint32_t) -1;
214
// Aborts txn. Any live child is finished first (by committing it with
// DB_TXN_NOSYNC), then the underlying TOKUTXN is aborted, completed, its
// row locks are released and the wrapper state is destroyed.
//
// Parameters:
//   txn             - transaction to abort.
//   poll/poll_extra - progress callback and its argument, forwarded to
//                     toku_txn_abort_txn.
static int toku_txn_abort(DB_TXN * txn,
215
TXN_PROGRESS_POLL_FUNCTION poll, void *poll_extra) {
216
HANDLE_PANICKED_ENV(txn->mgrp);
217
//Recursively kill off children (abort or commit are both correct, commit is cheaper)
218
if (db_txn_struct_i(txn)->child) {
219
//commit of child sets the child pointer to NULL
220
// The child commit passes release_mo_lock=false and low_priority=false.
int r_child = toku_txn_commit(db_txn_struct_i(txn)->child, DB_TXN_NOSYNC, NULL, NULL, false, false);
221
if (r_child !=0 && !toku_env_is_panicked(txn->mgrp)) {
222
env_panic(txn->mgrp, r_child, "Recursive child commit failed during parent abort.\n");
224
//In a panicked env, the child may not be removed from the list.
225
HANDLE_PANICKED_ENV(txn->mgrp);
227
assert(!db_txn_struct_i(txn)->child);
230
// Unlink this txn from its parent: the parent must currently point at it.
assert(db_txn_struct_i(txn->parent)->child == txn);
231
db_txn_struct_i(txn->parent)->child=NULL;
234
int r = toku_txn_abort_txn(db_txn_struct_i(txn)->tokutxn, poll, poll_extra);
235
// A failed abort escalates to an environment panic unless one is already active.
if (r!=0 && !toku_env_is_panicked(txn->mgrp)) {
236
env_panic(txn->mgrp, r, "Error during abort.\n");
238
HANDLE_PANICKED_ENV(txn->mgrp);
240
// Same ordering as in commit: complete the txn, then release row locks,
// then destroy the wrapper state.
toku_txn_complete_txn(db_txn_struct_i(txn)->tokutxn);
241
toku_txn_release_locks(txn);
242
toku_txn_destroy(txn);
246
// Prepares txn under the XA id xid.
// Visible sequence: take the multi operation client lock when the txn is not
// read only, commit any live child, call toku_txn_prepare_txn, capture the
// fsync info, release the multi operation lock, then possibly fsync the log.
static int toku_txn_xa_prepare (DB_TXN *txn, TOKU_XA_XID *xid) {
253
// NOTE(review): the condition guarding this no-op path is not visible here.
r = 0; // make this a NO-OP, MySQL calls this
256
HANDLE_PANICKED_ENV(txn->mgrp);
257
// Take the mo lock as soon as a non-readonly txn is found
259
holds_mo_lock = false;
260
if (!toku_txn_is_read_only(db_txn_struct_i(txn)->tokutxn)) {
261
// A readonly transaction does no logging, and therefore does not
263
toku_multi_operation_client_lock();
264
holds_mo_lock = true;
266
//Recursively commit any children.
267
if (db_txn_struct_i(txn)->child) {
268
//commit of child sets the child pointer to NULL
270
// toku_txn_commit will take the mo_lock if not held and a non-readonly txn is found.
271
// The child commit passes flags=0, release_mo_lock=false, low_priority=false.
int r_child = toku_txn_commit(db_txn_struct_i(txn)->child, 0, NULL, NULL, false, false);
272
if (r_child !=0 && !toku_env_is_panicked(txn->mgrp)) {
273
env_panic(txn->mgrp, r_child, "Recursive child commit failed during parent commit.\n");
275
//In a panicked env, the child may not be removed from the list.
276
HANDLE_PANICKED_ENV(txn->mgrp);
278
assert(!db_txn_struct_i(txn)->child);
280
ttxn = db_txn_struct_i(txn)->tokutxn;
281
toku_txn_prepare_txn(ttxn, xid);
283
logger = txn->mgrp->i->logger;
286
// Capture the fsync decision and LSN while the lock is still held.
toku_txn_get_fsync_info(ttxn, &do_fsync, &do_fsync_lsn);
287
// release the multi operation lock before fsyncing the log
289
toku_multi_operation_client_unlock();
291
toku_txn_maybe_fsync_log(logger, do_fsync_lsn, do_fsync);
296
// Prepares txn using a global id of DB_GID_SIZE bytes. The gid is wrapped
// in an XA id and the work is delegated to toku_txn_xa_prepare, whose
// result is returned.
// requires: must hold the multi operation lock. it is
297
// released in toku_txn_xa_prepare before the fsync.
298
static int toku_txn_prepare (DB_TXN *txn, uint8_t gid[DB_GID_SIZE]) {
300
TOKU_ANNOTATE_NEW_MEMORY(&xid, sizeof(xid));
301
xid.formatID=0x756b6f54; // "Toku"
302
// The gid is split evenly: first half is the gtrid, second half the bqual.
xid.gtrid_length=DB_GID_SIZE/2; // The maximum allowed gtrid length is 64. See the XA spec in source:/import/opengroup.org/C193.pdf page 20.
303
xid.bqual_length=DB_GID_SIZE/2; // The maximum allowed bqual length is 64.
304
// All DB_GID_SIZE bytes of gid are copied into xid.data.
memcpy(xid.data, gid, DB_GID_SIZE);
305
return toku_txn_xa_prepare(txn, &xid);
308
// Fills in rollback statistics for txn by calling
// toku_logger_txn_rollback_stats with the struct that *txn_stat points to,
// and returns that call result.
static int toku_txn_txn_stat (DB_TXN *txn, struct txn_stat **txn_stat) {
310
return toku_logger_txn_rollback_stats(db_txn_struct_i(txn)->tokutxn, *txn_stat);
313
// Entry point installed on the DB_TXN handle for txn_stat; delegates to
// toku_txn_txn_stat with the same arguments.
static int locked_txn_txn_stat (DB_TXN *txn, struct txn_stat **txn_stat) {
314
int r = toku_txn_txn_stat(txn, txn_stat);
318
// Commit entry point with progress reporting.
// For a txn that is not read only, the multi operation client lock is taken
// here and toku_txn_commit is told (via holds_mo_lock) to release it.
//
// Parameters:
//   txn             - transaction to commit.
//   flags           - commit flags, forwarded to toku_txn_commit.
//   poll/poll_extra - progress callback and its argument, forwarded unchanged.
static int locked_txn_commit_with_progress(DB_TXN *txn, uint32_t flags,
319
TXN_PROGRESS_POLL_FUNCTION poll, void* poll_extra) {
320
bool holds_mo_lock = false;
321
bool low_priority = false;
322
TOKUTXN tokutxn = db_txn_struct_i(txn)->tokutxn;
323
if (!toku_txn_is_read_only(tokutxn)) {
324
// A readonly transaction does no logging, and therefore does not need the MO lock.
325
holds_mo_lock = true;
326
// A txn with spilled rollback takes the low priority variant of the lock;
// presumably the regular variant below is the alternative branch -- confirm.
if (toku_txn_has_spilled_rollback(tokutxn)) {
328
toku_low_priority_multi_operation_client_lock();
330
toku_multi_operation_client_lock();
333
// cannot begin a checkpoint.
334
// the multi operation lock is taken the first time we
335
// see a non-readonly txn in the recursive commit.
336
// But released in the first-level toku_txn_commit (if taken),
337
// this way, we don't hold it while we fsync the log.
338
int r = toku_txn_commit(txn, flags, poll, poll_extra, holds_mo_lock, low_priority);
342
// Abort entry point with progress reporting.
// For a txn that is not read only, the multi operation client lock is taken
// before toku_txn_abort and released in this function afterwards.
//
// Parameters:
//   txn             - transaction to abort.
//   poll/poll_extra - progress callback and its argument, forwarded to
//                     toku_txn_abort.
static int locked_txn_abort_with_progress(DB_TXN *txn,
343
TXN_PROGRESS_POLL_FUNCTION poll, void* poll_extra) {
344
// cannot begin a checkpoint
345
// the multi operation lock is taken the first time we
346
// see a non-readonly txn in the abort (or recursive commit).
347
// But released here so we don't have to hold additional state.
348
bool holds_mo_lock = false;
349
bool low_priority = false;
350
TOKUTXN tokutxn = db_txn_struct_i(txn)->tokutxn;
351
if (!toku_txn_is_read_only(tokutxn)) {
352
// A readonly transaction does no logging, and therefore does not need the MO lock.
353
holds_mo_lock = true;
354
// A txn with spilled rollback takes the low priority variant of the lock;
// presumably the regular variant below is the alternative branch -- confirm.
if (toku_txn_has_spilled_rollback(tokutxn)) {
356
toku_low_priority_multi_operation_client_lock();
358
toku_multi_operation_client_lock();
361
int r = toku_txn_abort(txn, poll, poll_extra);
364
// Unlock calls mirroring the lock calls above (the selecting conditions
// are not visible here).
toku_low_priority_multi_operation_client_unlock();
366
toku_multi_operation_client_unlock();
372
// Commits txn with the given flags and no progress callback: delegates to
// locked_txn_commit_with_progress with NULL poll and poll_extra.
int locked_txn_commit(DB_TXN *txn, uint32_t flags) {
373
int r = locked_txn_commit_with_progress(txn, flags, NULL, NULL);
377
// Aborts txn with no progress callback: delegates to
// locked_txn_abort_with_progress with NULL poll and poll_extra.
int locked_txn_abort(DB_TXN *txn) {
378
int r = locked_txn_abort_with_progress(txn, NULL, NULL);
382
// Stores client_id on the TOKUTXN underlying txn via toku_txn_set_client_id.
static void locked_txn_set_client_id(DB_TXN *txn, uint64_t client_id) {
383
toku_txn_set_client_id(db_txn_struct_i(txn)->tokutxn, client_id);
386
// Returns the client id of the TOKUTXN underlying txn, as reported by
// toku_txn_get_client_id.
static uint64_t locked_txn_get_client_id(DB_TXN *txn) {
387
return toku_txn_get_client_id(db_txn_struct_i(txn)->tokutxn);
390
// Fills in the function pointers of a DB_TXN handle with the
// implementations defined in this file.
static inline void txn_func_init(DB_TXN *txn) {
391
// STXN(name) binds txn->name to the function locked_txn_<name>.
#define STXN(name) txn->name = locked_txn_ ## name
394
STXN(abort_with_progress);
395
STXN(commit_with_progress);
400
// SUTXN(name) binds txn->name to the function toku_txn_<name>.
#define SUTXN(name) txn->name = toku_txn_ ## name
404
// The id accessors are assigned directly rather than through a macro.
txn->id = toku_txn_id;
405
txn->id64 = toku_txn_id64;
409
// Creates a transaction for the user
410
// In our system, as far as the user is concerned, the rules are as follows:
411
// - one cannot operate on a transaction if a child exists, with the exception of commit/abort
412
// - one cannot operate on a transaction simultaneously in two separate threads
413
// (the reason for this is that some operations may create a child transaction
414
// as part of the function, such as env->dbremove and env->dbrename, and if
415
// transactions could be operated on simultaneously in different threads, the first
416
// rule above is violated)
417
// - if a parent transaction is committed/aborted, the child transactions are recursively
420
// Begins a new transaction in env.
//
// Parameters:
//   env   - environment; logging must be open and DB_INIT_TXN must be set
//           in its open flags, otherwise EINVAL is reported through
//           toku_ydb_do_error.
//   stxn  - parent transaction, or NULL for a transaction without a parent.
//   txn   - output location for the new handle (the store into *txn is not
//           visible here -- confirm against the full file).
//   flags - accepted bits visible in this function: DB_TXN_READ_ONLY, at most
//           one isolation flag out of DB_ISOLATION_FLAGS, DB_TXN_NOWAIT and
//           DB_TXN_NOSYNC. Any bit still set after those are consumed is
//           rejected with EINVAL.
//
// A child must use the same isolation level as its parent, and a child may
// only be declared read only when the parent is read only as well; a child
// of a read only parent is read only whether or not it asks for it.
int toku_txn_begin(DB_ENV *env, DB_TXN * stxn, DB_TXN ** txn, uint32_t flags) {
421
HANDLE_PANICKED_ENV(env);
422
HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, stxn); //Cannot create child while child already exists.
423
if (!toku_logger_is_open(env->i->logger))
424
return toku_ydb_do_error(env, EINVAL, "Environment does not have logging enabled\n");
425
if (!(env->i->open_flags & DB_INIT_TXN))
426
return toku_ydb_do_error(env, EINVAL, "Environment does not have transactions enabled\n");
428
// txn_flags accumulates the flags that get stored on the new transaction.
uint32_t txn_flags = 0;
429
txn_flags |= DB_TXN_NOWAIT; //We do not support blocking locks. RFP remove this?
431
// handle whether txn is declared as read only
432
bool parent_txn_declared_read_only =
434
(db_txn_struct_i(stxn)->flags & DB_TXN_READ_ONLY);
435
bool txn_declared_read_only = false;
436
// DB_TXN_READ_ONLY is moved from the caller flags into txn_flags.
if (flags & DB_TXN_READ_ONLY) {
437
txn_declared_read_only = true;
438
txn_flags |= DB_TXN_READ_ONLY;
439
flags &= ~(DB_TXN_READ_ONLY);
441
// A read only child under a parent that is not read only is an error.
if (txn_declared_read_only && stxn &&
442
!parent_txn_declared_read_only
445
return toku_ydb_do_error(
448
"Current transaction set as read only, but parent transaction is not\n"
451
if (parent_txn_declared_read_only)
453
// don't require child transaction to also set transaction as read only
454
// if parent has already done so
455
txn_flags |= DB_TXN_READ_ONLY;
456
txn_declared_read_only = true;
460
// Isolation handling: the isolation bits must be either empty or exactly
// one of the values listed in the check below.
TOKU_ISOLATION child_isolation = TOKU_ISO_SERIALIZABLE;
461
uint32_t iso_flags = flags & DB_ISOLATION_FLAGS;
462
if (!(iso_flags == 0 ||
463
iso_flags == DB_TXN_SNAPSHOT ||
464
iso_flags == DB_READ_COMMITTED ||
465
iso_flags == DB_READ_UNCOMMITTED ||
466
iso_flags == DB_SERIALIZABLE ||
467
iso_flags == DB_INHERIT_ISOLATION)
470
return toku_ydb_do_error(
473
"Invalid isolation flags set\n"
479
// Map the requested isolation flag onto a TOKU_ISOLATION value.
case (DB_INHERIT_ISOLATION):
481
child_isolation = db_txn_struct_i(stxn)->iso;
484
return toku_ydb_do_error(
487
"Cannot set DB_INHERIT_ISOLATION when no parent exists\n"
491
case (DB_READ_COMMITTED):
492
child_isolation = TOKU_ISO_READ_COMMITTED;
494
case (DB_READ_UNCOMMITTED):
495
child_isolation = TOKU_ISO_READ_UNCOMMITTED;
497
case (DB_TXN_SNAPSHOT):
498
child_isolation = TOKU_ISO_SNAPSHOT;
500
case (DB_SERIALIZABLE):
501
child_isolation = TOKU_ISO_SERIALIZABLE;
504
// With a parent, the parent isolation is used; otherwise serializable.
child_isolation = stxn ? db_txn_struct_i(stxn)->iso : TOKU_ISO_SERIALIZABLE;
507
assert(false); // error path is above, so this should not happen
510
// A child whose isolation differs from that of its parent is rejected.
if (stxn && child_isolation != db_txn_struct_i(stxn)->iso) {
511
return toku_ydb_do_error(
514
"Cannot set isolation level of transaction to something different \
519
if (flags&DB_TXN_NOWAIT) {
520
txn_flags |= DB_TXN_NOWAIT;
521
flags &= ~DB_TXN_NOWAIT;
523
if (flags&DB_TXN_NOSYNC) {
524
txn_flags |= DB_TXN_NOSYNC;
525
flags &= ~DB_TXN_NOSYNC;
527
if (flags!=0) return toku_ydb_do_error(env, EINVAL, "Invalid flags passed to DB_ENV->txn_begin\n");
529
struct __toku_db_txn_external *XCALLOC(eresult); // so the internal stuff is stuck on the end.
530
DB_TXN *result = &eresult->external_part;
533
txn_func_init(result);
535
result->parent = stxn;
536
db_txn_struct_i(result)->flags = txn_flags;
537
db_txn_struct_i(result)->iso = child_isolation;
538
db_txn_struct_i(result)->lt_map.create_no_array();
540
toku_mutex_init(&db_txn_struct_i(result)->txn_mutex, NULL);
542
TXN_SNAPSHOT_TYPE snapshot_type;
543
switch(db_txn_struct_i(result)->iso){
544
case(TOKU_ISO_SNAPSHOT):
546
snapshot_type = TXN_SNAPSHOT_ROOT;
549
case(TOKU_ISO_READ_COMMITTED):
551
snapshot_type = TXN_SNAPSHOT_CHILD;
556
snapshot_type = TXN_SNAPSHOT_NONE;
560
int r = toku_txn_begin_with_xid(
561
stxn ? db_txn_struct_i(stxn)->tokutxn : 0,
562
&db_txn_struct_i(result)->tokutxn,
567
false, // for_recovery
568
txn_declared_read_only // read_only
575
//Add to the list of children for the parent.
576
if (result->parent) {
577
assert(!db_txn_struct_i(result->parent)->child);
578
db_txn_struct_i(result->parent)->child = result;
585
// Builds a DB_TXN wrapper around an already existing TOKUTXN.
// The wrapper gets the function table from txn_func_init, has no parent,
// gets a fresh lt_map and txn_mutex, and is registered on tokutxn through
// toku_txn_set_container_db_txn.
// NOTE(review): env is not referenced in the visible lines -- confirm
// against the full file.
void toku_keep_prepared_txn_callback (DB_ENV *env, TOKUTXN tokutxn) {
586
// XCALLOC presumably yields zero-filled storage for the wrapper -- confirm.
struct __toku_db_txn_external *XCALLOC(eresult);
587
DB_TXN *result = &eresult->external_part;
589
txn_func_init(result);
591
result->parent = NULL;
593
db_txn_struct_i(result)->tokutxn = tokutxn;
594
// Note: this path uses lt_map.create(), whereas toku_txn_begin uses
// lt_map.create_no_array().
db_txn_struct_i(result)->lt_map.create();
596
toku_txn_set_container_db_txn(tokutxn, result);
598
toku_mutex_init(&db_txn_struct_i(result)->txn_mutex, NULL);
601
// Test-only function
// Forwards increment to toku_txn_manager_increase_last_xid on the txn
// manager obtained from the logger of env.
602
void toku_increase_last_xid(DB_ENV *env, uint64_t increment) {
603
toku_txn_manager_increase_last_xid(toku_logger_get_txn_manager(env->i->logger), increment);