3233
// Effect: Insert the key-val pair into ft.
3278
static int ft_compare_keys(FT ft, const DBT *a, const DBT *b)
3279
// Effect: Compare two keys using the given fractal tree's comparator/descriptor
3281
FAKE_DB(db, &ft->cmp_descriptor);
3282
return ft->compare_fun(&db, a, b);
3285
static LEAFENTRY bn_get_le_and_key(BASEMENTNODE bn, int idx, DBT *key)
3286
// Effect: Gets the i'th leafentry from the given basement node and
3287
// fill its key in *key
3288
// Requires: The i'th leafentry exists.
3293
int r = bn->data_buffer.fetch_klpair(idx, &le, &le_len, &le_key);
3295
toku_fill_dbt(key, le_key, le_len);
3299
static LEAFENTRY ft_leaf_leftmost_le_and_key(FTNODE leaf, DBT *leftmost_key)
3300
// Effect: If a leftmost key exists in the given leaf, toku_fill_dbt()
3301
// the key into *leftmost_key
3302
// Requires: Leaf is fully in memory and pinned for read or write.
3303
// Return: leafentry if it exists, nullptr otherwise
3305
for (int i = 0; i < leaf->n_children; i++) {
3306
BASEMENTNODE bn = BLB(leaf, i);
3307
if (bn->data_buffer.num_klpairs() > 0) {
3308
// Get the first (leftmost) leafentry and its key
3309
return bn_get_le_and_key(bn, 0, leftmost_key);
3315
static LEAFENTRY ft_leaf_rightmost_le_and_key(FTNODE leaf, DBT *rightmost_key)
3316
// Effect: If a rightmost key exists in the given leaf, toku_fill_dbt()
3317
// the key into *rightmost_key
3318
// Requires: Leaf is fully in memory and pinned for read or write.
3319
// Return: leafentry if it exists, nullptr otherwise
3321
for (int i = leaf->n_children - 1; i >= 0; i--) {
3322
BASEMENTNODE bn = BLB(leaf, i);
3323
size_t num_les = bn->data_buffer.num_klpairs();
3325
// Get the last (rightmost) leafentry and its key
3326
return bn_get_le_and_key(bn, num_les - 1, rightmost_key);
3332
static int ft_leaf_get_relative_key_pos(FT ft, FTNODE leaf, const DBT *key, bool *nondeleted_key_found, int *target_childnum)
3333
// Effect: Determines what the relative position of the given key is with
3334
// respect to a leaf node, and if it exists.
3335
// Requires: Leaf is fully in memory and pinned for read or write.
3336
// Requires: target_childnum is non-null
3337
// Return: < 0 if key is less than the leftmost key in the leaf OR the relative position is unknown, for any reason.
3338
// 0 if key is in the bounds [leftmost_key, rightmost_key] for this leaf or the leaf is empty
3339
// > 0 if key is greater than the rightmost key in the leaf
3340
// *nondeleted_key_found is set (if non-null) if the target key was found and is not deleted, unmodified otherwise
3341
// *target_childnum is set to the child that (does or would) contain the key, if calculated, unmodified otherwise
3344
LEAFENTRY rightmost_le = ft_leaf_rightmost_le_and_key(leaf, &rightmost_key);
3345
if (rightmost_le == nullptr) {
3346
// If we can't get a rightmost key then the leaf is empty.
3347
// In such a case, we don't have any information about what keys would be in this leaf.
3348
// We have to assume the leaf node that would contain this key is to the left.
3351
// We have a rightmost leafentry, so it must exist in some child node
3352
invariant(leaf->n_children > 0);
3354
int relative_pos = 0;
3355
int c = ft_compare_keys(ft, key, &rightmost_key);
3358
*target_childnum = leaf->n_children - 1;
3359
} else if (c == 0) {
3360
if (nondeleted_key_found != nullptr && !le_latest_is_del(rightmost_le)) {
3361
*nondeleted_key_found = true;
3364
*target_childnum = leaf->n_children - 1;
3366
// The key is less than the rightmost. It may still be in bounds if it's >= the leftmost.
3368
LEAFENTRY leftmost_le = ft_leaf_leftmost_le_and_key(leaf, &leftmost_key);
3369
invariant_notnull(leftmost_le); // Must exist because a rightmost exists
3370
c = ft_compare_keys(ft, key, &leftmost_key);
3372
if (nondeleted_key_found != nullptr) {
3373
// The caller wants to know if a nondeleted key can be found.
3374
LEAFENTRY target_le;
3375
int childnum = toku_ftnode_which_child(leaf, key, &ft->cmp_descriptor, ft->compare_fun);
3376
BASEMENTNODE bn = BLB(leaf, childnum);
3377
struct msg_leafval_heaviside_extra extra = { ft->compare_fun, &ft->cmp_descriptor, key };
3378
int r = bn->data_buffer.find_zero<decltype(extra), toku_msg_leafval_heaviside>(
3381
nullptr, nullptr, nullptr
3383
*target_childnum = childnum;
3384
if (r == 0 && !le_latest_is_del(leftmost_le)) {
3385
*nondeleted_key_found = true;
3389
} else if (c == 0) {
3390
if (nondeleted_key_found != nullptr && !le_latest_is_del(leftmost_le)) {
3391
*nondeleted_key_found = true;
3394
*target_childnum = 0;
3400
return relative_pos;
3403
static void ft_insert_directly_into_leaf(FT ft, FTNODE leaf, int target_childnum, DBT *key, DBT *val,
3404
XIDS message_xids, enum ft_msg_type type, txn_gc_info *gc_info);
3405
static int getf_nothing(ITEMLEN, bytevec, ITEMLEN, bytevec, void *, bool);
3407
static int ft_maybe_insert_into_rightmost_leaf(FT ft, DBT *key, DBT *val, XIDS message_xids, enum ft_msg_type type,
3408
txn_gc_info *gc_info, bool unique)
3409
// Effect: Pins the rightmost leaf node and attempts to do an insert.
3410
// There are three reasons why we may not succeed.
3411
// - The rightmost leaf is too full and needs a split.
3412
// - The key to insert is not within the provable bounds of this leaf node.
3413
// - The key is within bounds, but it already exists.
3414
// Return: 0 if this function did insert, DB_KEYEXIST if a unique key constraint exists and
3415
// some nondeleted leafentry with the same key exists
3416
// < 0 if this function did not insert, for a reason other than DB_KEYEXIST.
3417
// Note: Treat this function as a possible, but not necessary, optimization for insert.
3418
// Rationale: We want O(1) insertions down the rightmost path of the tree.
3422
uint32_t rightmost_fullhash;
3423
BLOCKNUM rightmost_blocknum = ft->rightmost_blocknum;
3424
FTNODE rightmost_leaf = nullptr;
3426
// Don't do the optimization if our heurstic suggests that
3427
// insertion pattern is not sequential.
3428
if (ft->seqinsert_score < FT_SEQINSERT_SCORE_THRESHOLD) {
3432
// We know the seqinsert score is high enough that we should
3433
// attemp to directly insert into the right most leaf. Because
3434
// the score is non-zero, the rightmost blocknum must have been
3435
// set. See inject_message_in_locked_node(), which only increases
3436
// the score if the target node blocknum == rightmost_blocknum
3437
invariant(rightmost_blocknum.b != RESERVED_BLOCKNUM_NULL);
3439
// Pin the rightmost leaf with a write lock.
3440
rightmost_fullhash = toku_cachetable_hash(ft->cf, rightmost_blocknum);
3441
struct ftnode_fetch_extra bfe;
3442
fill_bfe_for_full_read(&bfe, ft);
3443
toku_pin_ftnode(ft, rightmost_blocknum, rightmost_fullhash, &bfe, PL_WRITE_CHEAP, &rightmost_leaf, true);
3445
// The rightmost blocknum never chances once it is initialized to something
3446
// other than null. Verify that the pinned node has the correct blocknum.
3447
invariant(rightmost_leaf->thisnodename.b == rightmost_blocknum.b);
3449
// If the rightmost leaf is reactive, bail out out and let the normal promotion pass
3450
// take care of it. This also ensures that if any of our ancestors are reactive,
3451
// they'll be taken care of too.
3452
if (get_leaf_reactivity(rightmost_leaf, ft->h->nodesize) != RE_STABLE) {
3453
STATUS_INC(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_REACTIVE, 1);
3457
// The groundwork has been laid for an insertion directly into the rightmost
3458
// leaf node. We know that it is pinned for write, fully in memory, has
3459
// no messages above it, and is not reactive.
3461
// Now, two more things must be true for this insertion to actually happen:
3462
// 1. The key to insert is within the bounds of this leafnode, or to the right.
3463
// 2. If there is a uniqueness constraint, it passes.
3464
bool nondeleted_key_found;
3466
int target_childnum;
3468
nondeleted_key_found = false;
3469
target_childnum = -1;
3470
relative_pos = ft_leaf_get_relative_key_pos(ft, rightmost_leaf, key,
3471
unique ? &nondeleted_key_found : nullptr,
3473
if (relative_pos >= 0) {
3474
STATUS_INC(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_SUCCESS, 1);
3475
if (unique && nondeleted_key_found) {
3478
ft_insert_directly_into_leaf(ft, rightmost_leaf, target_childnum,
3479
key, val, message_xids, type, gc_info);
3483
STATUS_INC(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_POS, 1);
3488
// If we did the insert, the rightmost leaf was unpinned for us.
3489
if (r != 0 && rightmost_leaf != nullptr) {
3490
toku_unpin_ftnode(ft, rightmost_leaf);
3496
static void ft_txn_log_insert(FT ft, DBT *key, DBT *val, TOKUTXN txn, bool do_logging, enum ft_msg_type type);
3498
int toku_ft_insert_unique(FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool do_logging) {
3499
// Effect: Insert a unique key-val pair into the fractal tree.
3500
// Return: 0 on success, DB_KEYEXIST if the overwrite constraint failed
3501
XIDS message_xids = txn != nullptr ? toku_txn_get_xids(txn) : xids_get_root_xids();
3503
TXN_MANAGER txn_manager = toku_ft_get_txn_manager(ft_h);
3504
txn_manager_state txn_state_for_gc(txn_manager);
3506
TXNID oldest_referenced_xid_estimate = toku_ft_get_oldest_referenced_xid_estimate(ft_h);
3507
txn_gc_info gc_info(&txn_state_for_gc,
3508
oldest_referenced_xid_estimate,
3509
// no messages above us, we can implicitly promote uxrs based on this xid
3510
oldest_referenced_xid_estimate,
3512
int r = ft_maybe_insert_into_rightmost_leaf(ft_h->ft, key, val, message_xids, FT_INSERT, &gc_info, true);
3513
if (r != 0 && r != DB_KEYEXIST) {
3514
// Default to a regular unique check + insert algorithm if we couldn't
3515
// do it based on the rightmost leaf alone.
3516
int lookup_r = toku_ft_lookup(ft_h, key, getf_nothing, nullptr);
3517
if (lookup_r == DB_NOTFOUND) {
3518
toku_ft_send_insert(ft_h, key, val, message_xids, FT_INSERT, &gc_info);
3526
ft_txn_log_insert(ft_h->ft, key, val, txn, do_logging, FT_INSERT);
3531
// Effect: Insert the key-val pair into an ft.
3234
3532
void toku_ft_insert (FT_HANDLE ft_handle, DBT *key, DBT *val, TOKUTXN txn) {
3235
3533
toku_ft_maybe_insert(ft_handle, key, val, txn, false, ZERO_LSN, true, FT_INSERT);
3356
3654
return txn_manager != nullptr ? toku_txn_manager_get_oldest_referenced_xid_estimate(txn_manager) : TXNID_NONE;
3657
static void ft_txn_log_insert(FT ft, DBT *key, DBT *val, TOKUTXN txn, bool do_logging, enum ft_msg_type type) {
3658
paranoid_invariant(type == FT_INSERT || type == FT_INSERT_NO_OVERWRITE);
3660
//By default use committed messages
3661
TXNID_PAIR xid = toku_txn_get_txnid(txn);
3663
BYTESTRING keybs = {key->size, (char *) key->data};
3664
toku_logger_save_rollback_cmdinsert(txn, toku_cachefile_filenum(ft->cf), &keybs);
3665
toku_txn_maybe_note_ft(txn, ft);
3667
TOKULOGGER logger = toku_txn_logger(txn);
3668
if (do_logging && logger) {
3669
BYTESTRING keybs = {.len=key->size, .data=(char *) key->data};
3670
BYTESTRING valbs = {.len=val->size, .data=(char *) val->data};
3671
if (type == FT_INSERT) {
3672
toku_log_enq_insert(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft->cf), xid, keybs, valbs);
3675
toku_log_enq_insert_no_overwrite(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft->cf), xid, keybs, valbs);
3359
3680
void toku_ft_maybe_insert (FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging, enum ft_msg_type type) {
3360
paranoid_invariant(type==FT_INSERT || type==FT_INSERT_NO_OVERWRITE);
3361
XIDS message_xids = xids_get_root_xids(); //By default use committed messages
3362
TXNID_PAIR xid = toku_txn_get_txnid(txn);
3364
BYTESTRING keybs = {key->size, (char *) key->data};
3365
toku_logger_save_rollback_cmdinsert(txn, toku_cachefile_filenum(ft_h->ft->cf), &keybs);
3366
toku_txn_maybe_note_ft(txn, ft_h->ft);
3367
message_xids = toku_txn_get_xids(txn);
3369
TOKULOGGER logger = toku_txn_logger(txn);
3370
if (do_logging && logger) {
3371
BYTESTRING keybs = {.len=key->size, .data=(char *) key->data};
3372
BYTESTRING valbs = {.len=val->size, .data=(char *) val->data};
3373
if (type == FT_INSERT) {
3374
toku_log_enq_insert(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft_h->ft->cf), xid, keybs, valbs);
3377
toku_log_enq_insert_no_overwrite(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft_h->ft->cf), xid, keybs, valbs);
3681
ft_txn_log_insert(ft_h->ft, key, val, txn, do_logging, type);
3382
3684
if (oplsn_valid && oplsn.lsn <= (treelsn = toku_ft_checkpoint_lsn(ft_h->ft)).lsn) {
3687
XIDS message_xids = txn ? toku_txn_get_xids(txn) : xids_get_root_xids();
3385
3689
TXN_MANAGER txn_manager = toku_ft_get_txn_manager(ft_h);
3386
3690
txn_manager_state txn_state_for_gc(txn_manager);