1
/******************************************************
6
Created 4/20/1996 Heikki Tuuri
7
*******************************************************/
15
#include "dict0dict.h"
16
#include "dict0boot.h"
20
#include "mach0data.h"
26
#include "lock0lock.h"
28
#include "eval0eval.h"
29
#include "data0data.h"
32
#define ROW_INS_PREV 1
33
#define ROW_INS_NEXT 2
35
/*************************************************************************
36
Creates an insert node struct. */
41
/* out, own: insert node struct */
42
ulint ins_type, /* in: INS_VALUES, ... */
43
dict_table_t* table, /* in: table where to insert */
44
mem_heap_t* heap) /* in: mem heap where created */
48
node = mem_heap_alloc(heap, sizeof(ins_node_t));
50
node->common.type = QUE_NODE_INSERT;
52
node->ins_type = ins_type;
54
node->state = INS_NODE_SET_IX_LOCK;
61
node->trx_id = ut_dulint_zero;
63
node->entry_sys_heap = mem_heap_create(128);
65
node->magic_n = INS_NODE_MAGIC_N;
70
/***************************************************************
71
Creates an entry template for each index of a table. */
74
ins_node_create_entry_list(
75
/*=======================*/
76
ins_node_t* node) /* in: row insert node */
81
ut_ad(node->entry_sys_heap);
83
UT_LIST_INIT(node->entry_list);
85
index = dict_table_get_first_index(node->table);
87
while (index != NULL) {
88
entry = row_build_index_entry(node->row, index,
89
node->entry_sys_heap);
90
UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry);
92
index = dict_table_get_next_index(index);
96
/*********************************************************************
97
Adds system field buffers to a row. */
100
row_ins_alloc_sys_fields(
101
/*=====================*/
102
ins_node_t* node) /* in: insert node */
114
heap = node->entry_sys_heap;
116
ut_ad(row && table && heap);
117
ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table));
119
/* 1. Allocate buffer for row id */
121
col = dict_table_get_sys_col(table, DATA_ROW_ID);
123
dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
125
ptr = mem_heap_alloc(heap, DATA_ROW_ID_LEN);
127
dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN);
129
node->row_id_buf = ptr;
131
if (table->type == DICT_TABLE_CLUSTER_MEMBER) {
133
/* 2. Fill in the dfield for mix id */
135
col = dict_table_get_sys_col(table, DATA_MIX_ID);
137
dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
139
len = mach_dulint_get_compressed_size(table->mix_id);
140
ptr = mem_heap_alloc(heap, DATA_MIX_ID_LEN);
142
mach_dulint_write_compressed(ptr, table->mix_id);
143
dfield_set_data(dfield, ptr, len);
146
/* 3. Allocate buffer for trx id */
148
col = dict_table_get_sys_col(table, DATA_TRX_ID);
150
dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
151
ptr = mem_heap_alloc(heap, DATA_TRX_ID_LEN);
153
dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN);
155
node->trx_id_buf = ptr;
157
/* 4. Allocate buffer for roll ptr */
159
col = dict_table_get_sys_col(table, DATA_ROLL_PTR);
161
dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
162
ptr = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN);
164
dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN);
167
/*************************************************************************
168
Sets a new row to insert for an INS_DIRECT node. This function is only used
169
if we have constructed the row separately, which is a rare case; this
170
function is quite slow. */
173
ins_node_set_new_row(
174
/*=================*/
175
ins_node_t* node, /* in: insert node */
176
dtuple_t* row) /* in: new row (or first row) for the node */
178
node->state = INS_NODE_SET_IX_LOCK;
184
mem_heap_empty(node->entry_sys_heap);
186
/* Create templates for index entries */
188
ins_node_create_entry_list(node);
190
/* Allocate from entry_sys_heap buffers for sys fields */
192
row_ins_alloc_sys_fields(node);
194
/* As we allocated a new trx id buf, the trx id should be written
197
node->trx_id = ut_dulint_zero;
200
/***********************************************************************
201
Does an insert operation by updating a delete marked existing record
202
in the index. This situation can occur if the delete marked record is
203
kept in the index for consistent reads. */
206
row_ins_sec_index_entry_by_modify(
207
/*==============================*/
208
/* out: DB_SUCCESS or error code */
209
btr_cur_t* cursor, /* in: B-tree cursor */
210
que_thr_t* thr, /* in: query thread */
211
mtr_t* mtr) /* in: mtr */
215
ut_ad(((cursor->index)->type & DICT_CLUSTERED) == 0);
216
ut_ad(rec_get_deleted_flag(btr_cur_get_rec(cursor)));
218
/* We just remove the delete mark from the secondary index record */
219
err = btr_cur_del_mark_set_sec_rec(0, cursor, FALSE, thr, mtr);
224
/***********************************************************************
225
Does an insert operation by delete unmarking and updating a delete marked
226
existing record in the index. This situation can occur if the delete marked
227
record is kept in the index for consistent reads. */
230
row_ins_clust_index_entry_by_modify(
231
/*================================*/
232
/* out: DB_SUCCESS, DB_FAIL, or error code */
233
ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
234
depending on whether mtr holds just a leaf
235
latch or also a tree latch */
236
btr_cur_t* cursor, /* in: B-tree cursor */
237
dtuple_t* entry, /* in: index entry to insert */
238
que_thr_t* thr, /* in: query thread */
239
mtr_t* mtr) /* in: mtr */
246
ut_ad((cursor->index)->type & DICT_CLUSTERED);
248
rec = btr_cur_get_rec(cursor);
250
ut_ad(rec_get_deleted_flag(rec));
252
heap = mem_heap_create(1024);
254
/* Build an update vector containing all the fields to be modified;
255
NOTE that this vector may contain also system columns! */
257
update = row_upd_build_difference(cursor->index, entry, rec, heap);
259
if (mode == BTR_MODIFY_LEAF) {
260
/* Try optimistic updating of the record, keeping changes
263
err = btr_cur_optimistic_update(0, cursor, update, 0, thr,
265
if ((err == DB_OVERFLOW) || (err == DB_UNDERFLOW)) {
269
ut_ad(mode == BTR_MODIFY_TREE);
270
err = btr_cur_pessimistic_update(0, cursor, update, 0, thr,
279
/*******************************************************************
280
Checks if a unique key violation to rec would occur at the index entry
284
row_ins_dupl_error_with_rec(
285
/*========================*/
286
/* out: TRUE if error */
287
rec_t* rec, /* in: user record */
288
dtuple_t* entry, /* in: entry to insert */
289
dict_index_t* index, /* in: index */
290
trx_t* trx) /* in: inserting transaction */
292
ulint matched_fields;
297
n_unique = dict_index_get_n_unique(index);
302
cmp_dtuple_rec_with_match(entry, rec, &matched_fields, &matched_bytes);
304
if (matched_fields < n_unique) {
309
if (!rec_get_deleted_flag(rec)) {
314
/* If we get here, the record has its delete mark set. It is still
315
a unique key violation if the transaction which set the delete mark
316
is currently active and is not trx itself. We check if some
317
transaction has an implicit x-lock on the record. */
319
mutex_enter(&kernel_mutex);
321
if (index->type & DICT_CLUSTERED) {
322
impl_trx = lock_clust_rec_some_has_impl(rec, index);
324
impl_trx = lock_sec_rec_some_has_impl_off_kernel(rec, index);
327
mutex_exit(&kernel_mutex);
329
if (impl_trx && impl_trx != trx) {
337
/*******************************************************************
338
Scans a unique non-clustered index at a given index entry to determine
339
whether a uniqueness violation has occurred for the key value of the entry. */
342
row_ins_scan_sec_index_for_duplicate(
343
/*=================================*/
344
/* out: DB_SUCCESS or DB_DUPLICATE_KEY */
345
dict_index_t* index, /* in: non-clustered unique index */
346
dtuple_t* entry, /* in: index entry */
347
trx_t* trx) /* in: inserting transaction */
349
ulint dupl_count = 0;
358
/* Store old value on n_fields_cmp */
360
n_fields_cmp = dtuple_get_n_fields_cmp(entry);
362
dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique(index));
364
btr_pcur_open_on_user_rec(index, entry, PAGE_CUR_GE,
365
BTR_SEARCH_LEAF, &pcur, &mtr);
367
/* Scan index records and check that there are no duplicates */
370
if (btr_pcur_is_after_last_in_tree(&pcur, &mtr)) {
375
rec = btr_pcur_get_rec(&pcur);
377
cmp = cmp_dtuple_rec(entry, rec);
380
if (row_ins_dupl_error_with_rec(rec, entry, index,
384
if (dupl_count > 1) {
386
"Duplicate key in index %s\n",
388
dtuple_print(entry); */
399
btr_pcur_move_to_next_user_rec(&pcur, &mtr);
404
/* Restore old value */
405
dtuple_set_n_fields_cmp(entry, n_fields_cmp);
407
ut_a(dupl_count >= 1);
409
if (dupl_count > 1) {
411
return(DB_DUPLICATE_KEY);
417
/*******************************************************************
418
Tries to check if a unique key violation error would occur at an index entry
422
row_ins_duplicate_error(
423
/*====================*/
424
/* out: DB_SUCCESS if no error
425
DB_DUPLICATE_KEY if error,
426
DB_STRONG_FAIL if this is a non-clustered
427
index record and we cannot determine yet
428
if there will be an error: in this last
430
row_ins_scan_sec_index_for_duplicate
431
AFTER the insertion of the record! */
432
btr_cur_t* cursor, /* in: B-tree cursor */
433
dtuple_t* entry, /* in: entry to insert */
434
trx_t* trx, /* in: inserting transaction */
435
mtr_t* mtr, /* in: mtr */
436
rec_t** dupl_rec)/* out: record with which duplicate error */
442
ut_ad(cursor->index->type & DICT_UNIQUE);
444
/* NOTE: For unique non-clustered indexes there may be any number
445
of delete marked records with the same value for the non-clustered
446
index key (remember multiversioning), and which differ only in
447
the row refererence part of the index record, containing the
448
clustered index key fields. For such a secondary index record,
449
to avoid race condition, we must FIRST do the insertion and after
450
that check that the uniqueness condition is not breached! */
452
/* NOTE: A problem is that in the B-tree node pointers on an
453
upper level may match more to the entry than the actual existing
454
user records on the leaf level. So, even if low_match would suggest
455
that a duplicate key violation may occur, this may not be the case. */
457
n_unique = dict_index_get_n_unique(cursor->index);
459
if (cursor->low_match >= n_unique) {
461
rec = btr_cur_get_rec(cursor);
462
page = buf_frame_align(rec);
464
if (rec != page_get_infimum_rec(page)) {
466
if (row_ins_dupl_error_with_rec(rec, entry,
467
cursor->index, trx)) {
470
return(DB_DUPLICATE_KEY);
475
if (cursor->up_match >= n_unique) {
477
rec = page_rec_get_next(btr_cur_get_rec(cursor));
478
page = buf_frame_align(rec);
480
if (rec != page_get_supremum_rec(page)) {
482
if (row_ins_dupl_error_with_rec(rec, entry,
483
cursor->index, trx)) {
486
return(DB_DUPLICATE_KEY);
490
ut_a(!(cursor->index->type & DICT_CLUSTERED));
491
/* This should never happen */
494
if (cursor->index->type & DICT_CLUSTERED) {
499
/* It was a non-clustered index: we must scan the index after the
500
insertion to be sure if there will be duplicate key error */
502
return(DB_STRONG_FAIL);
505
/*******************************************************************
506
Checks if an index entry has long enough common prefix with an existing
507
record so that the intended insert of the entry must be changed to a modify of
508
the existing record. In the case of a clustered index, the prefix must be
509
n_unique fields long, and in the case of a secondary index, all fields must be
515
/* out: 0 if no update, ROW_INS_PREV if
516
previous should be updated; currently we
517
do the search so that only the low_match
518
record can match enough to the search tuple,
519
not the next record */
520
btr_cur_t* cursor) /* in: B-tree cursor */
526
/* NOTE: (compare to the note in row_ins_duplicate_error) Because node
527
pointers on upper levels of the B-tree may match more to entry than
528
to actual user records on the leaf level, we have to check if the
529
candidate record is actually a user record. In a clustered index
530
node pointers contain index->n_unique first fields, and in the case
531
of a secondary index, all fields of the index. */
533
enough_match = dict_index_get_n_unique_in_tree(cursor->index);
535
if (cursor->low_match >= enough_match) {
537
rec = btr_cur_get_rec(cursor);
538
page = buf_frame_align(rec);
540
if (rec != page_get_infimum_rec(page)) {
542
return(ROW_INS_PREV);
549
/*******************************************************************
550
Tries to insert an index entry to an index. If the index is clustered
551
and a record with the same unique key is found, the other record is
552
necessarily marked deleted by a committed transaction, or a unique key
553
violation error occurs. The delete marked record is then updated to an
554
existing record, and we must write an undo log record on the delete
555
marked record. If the index is secondary, and a record with exactly the
556
same fields is found, the other record is necessarily marked deleted.
557
It is then unmarked. Otherwise, the entry is just inserted to the index. */
560
row_ins_index_entry_low(
561
/*====================*/
562
/* out: DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL
563
if pessimistic retry needed, or error code */
564
ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
565
depending on whether we wish optimistic or
566
pessimistic descent down the index tree */
567
dict_index_t* index, /* in: index */
568
dtuple_t* entry, /* in: index entry to insert */
569
que_thr_t* thr) /* in: query thread */
572
ulint dupl = DB_SUCCESS;
576
rec_t* dupl_rec; /* Note that this may be undefined
577
for a non-clustered index even if
578
there is a duplicate key */
588
/* Note that we use PAGE_CUR_LE as the search mode, because then
589
the function will return in both low_match and up_match of the
590
cursor sensible values */
592
btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
593
mode | BTR_INSERT, &cursor, 0, &mtr);
595
if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) {
596
/* The insertion was made to the insert buffer already during
597
the search: we are done */
604
n_unique = dict_index_get_n_unique(index);
606
if (index->type & DICT_UNIQUE && (cursor.up_match >= n_unique
607
|| cursor.low_match >= n_unique)) {
609
dupl = row_ins_duplicate_error(&cursor, entry,
610
thr_get_trx(thr), &mtr, &dupl_rec);
611
if (dupl == DB_DUPLICATE_KEY) {
613
/* printf("Duplicate key in index %s lm %lu\n",
614
cursor->index->name, cursor->low_match);
616
dtuple_print(entry); */
624
modify = row_ins_must_modify(&cursor);
627
/* There is already an index entry with a long enough common
628
prefix, we must convert the insert into a modify of an
631
if (modify == ROW_INS_NEXT) {
632
rec = page_rec_get_next(btr_cur_get_rec(&cursor));
634
btr_cur_position(index, rec, &cursor);
637
if (index->type & DICT_CLUSTERED) {
638
err = row_ins_clust_index_entry_by_modify(mode,
642
err = row_ins_sec_index_entry_by_modify(&cursor,
646
} else if (mode == BTR_MODIFY_LEAF) {
647
err = btr_cur_optimistic_insert(0, &cursor, entry,
648
&dummy_rec, thr, &mtr);
650
ut_ad(mode == BTR_MODIFY_TREE);
651
err = btr_cur_pessimistic_insert(0, &cursor, entry,
652
&dummy_rec, thr, &mtr);
657
if (err == DB_SUCCESS && dupl == DB_STRONG_FAIL) {
658
/* We were not able to determine before the insertion
659
whether there will be a duplicate key error: do the check
662
err = row_ins_scan_sec_index_for_duplicate(index, entry,
666
ut_ad(err != DB_DUPLICATE_KEY || index->type & DICT_CLUSTERED
667
|| DB_DUPLICATE_KEY ==
668
row_ins_scan_sec_index_for_duplicate(index, entry,
673
/*******************************************************************
674
Inserts an index entry to index. Tries first optimistic, then pessimistic
675
descent down the tree. If the entry matches enough to a delete marked record,
676
performs the insert by updating or delete unmarking the delete marked
682
/* out: DB_SUCCESS, DB_LOCK_WAIT,
683
DB_DUPLICATE_KEY, or some other error code */
684
dict_index_t* index, /* in: index */
685
dtuple_t* entry, /* in: index entry to insert */
686
que_thr_t* thr) /* in: query thread */
690
/* Try first optimistic descent to the B-tree */
692
err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry, thr);
694
if (err != DB_FAIL) {
699
/* Try then pessimistic descent to the B-tree */
701
err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry, thr);
706
/***************************************************************
707
Sets the values of the dtuple fields in entry from the values of appropriate
711
row_ins_index_entry_set_vals(
712
/*=========================*/
713
dtuple_t* entry, /* in: index entry to make */
714
dtuple_t* row) /* in: row */
723
n_fields = dtuple_get_n_fields(entry);
725
for (i = 0; i < n_fields; i++) {
726
field = dtuple_get_nth_field(entry, i);
728
row_field = dtuple_get_nth_field(row, field->col_no);
730
field->data = row_field->data;
731
field->len = row_field->len;
735
/***************************************************************
736
Inserts a single index entry to the table. */
739
row_ins_index_entry_step(
740
/*=====================*/
741
/* out: DB_SUCCESS if operation successfully
742
completed, else error code or DB_LOCK_WAIT */
743
ins_node_t* node, /* in: row insert node */
744
que_thr_t* thr) /* in: query thread */
748
ut_ad(dtuple_check_typed(node->row));
750
row_ins_index_entry_set_vals(node->entry, node->row);
752
ut_ad(dtuple_check_typed(node->entry));
754
err = row_ins_index_entry(node->index, node->entry, thr);
759
/***************************************************************
760
Allocates a row id for row and inits the node->index field. */
763
row_ins_alloc_row_id_step(
764
/*======================*/
765
ins_node_t* node) /* in: row insert node */
769
ut_ad(node->state == INS_NODE_ALLOC_ROW_ID);
771
if (dict_table_get_first_index(node->table)->type & DICT_UNIQUE) {
773
/* No row id is stored if the clustered index is unique */
778
/* Fill in row id value to row */
780
row_id = dict_sys_get_new_row_id();
782
dict_sys_write_row_id(node->row_id_buf, row_id);
785
/***************************************************************
786
Gets a row to insert from the values list. */
789
row_ins_get_row_from_values(
790
/*========================*/
791
ins_node_t* node) /* in: row insert node */
793
que_node_t* list_node;
798
/* The field values are copied in the buffers of the select node and
799
it is safe to use them until we fetch from select again: therefore
800
we can just copy the pointers */
805
list_node = node->values_list;
810
dfield = dtuple_get_nth_field(row, i);
811
dfield_copy_data(dfield, que_node_get_val(list_node));
814
list_node = que_node_get_next(list_node);
818
/***************************************************************
819
Gets a row to insert from the select list. */
822
row_ins_get_row_from_select(
823
/*========================*/
824
ins_node_t* node) /* in: row insert node */
826
que_node_t* list_node;
831
/* The field values are copied in the buffers of the select node and
832
it is safe to use them until we fetch from select again: therefore
833
we can just copy the pointers */
838
list_node = node->select->select_list;
841
dfield = dtuple_get_nth_field(row, i);
842
dfield_copy_data(dfield, que_node_get_val(list_node));
845
list_node = que_node_get_next(list_node);
849
/***************************************************************
850
Inserts a row to a table. */
855
/* out: DB_SUCCESS if operation successfully
856
completed, else error code or DB_LOCK_WAIT */
857
ins_node_t* node, /* in: row insert node */
858
que_thr_t* thr) /* in: query thread */
864
if (node->state == INS_NODE_ALLOC_ROW_ID) {
866
row_ins_alloc_row_id_step(node);
868
node->index = dict_table_get_first_index(node->table);
869
node->entry = UT_LIST_GET_FIRST(node->entry_list);
871
if (node->ins_type == INS_SEARCHED) {
873
row_ins_get_row_from_select(node);
875
} else if (node->ins_type == INS_VALUES) {
877
row_ins_get_row_from_values(node);
880
node->state = INS_NODE_INSERT_ENTRIES;
883
ut_ad(node->state == INS_NODE_INSERT_ENTRIES);
885
while (node->index != NULL) {
886
err = row_ins_index_entry_step(node, thr);
888
if (err != DB_SUCCESS) {
893
node->index = dict_table_get_next_index(node->index);
894
node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry);
897
ut_ad(node->entry == NULL);
899
node->state = INS_NODE_ALLOC_ROW_ID;
904
/***************************************************************
905
Inserts a row to a table. This is a high-level function used in SQL execution
911
/* out: query thread to run next or NULL */
912
que_thr_t* thr) /* in: query thread */
916
sel_node_t* sel_node;
922
trx = thr_get_trx(thr);
924
node = thr->run_node;
926
ut_ad(que_node_get_type(node) == QUE_NODE_INSERT);
928
parent = que_node_get_parent(node);
929
sel_node = node->select;
931
if (thr->prev_node == parent) {
932
node->state = INS_NODE_SET_IX_LOCK;
935
/* If this is the first time this node is executed (or when
936
execution resumes after wait for the table IX lock), set an
937
IX lock on the table and reset the possible select node. */
939
if (node->state == INS_NODE_SET_IX_LOCK) {
941
/* It may be that the current session has not yet started
942
its transaction, or it has been committed: */
944
trx_start_if_not_started(trx);
946
if (UT_DULINT_EQ(trx->id, node->trx_id)) {
947
/* No need to do IX-locking or write trx id to buf */
952
trx_write_trx_id(node->trx_id_buf, trx->id);
954
err = lock_table(0, node->table, LOCK_IX, thr);
956
if (err != DB_SUCCESS) {
961
node->trx_id = trx->id;
963
node->state = INS_NODE_ALLOC_ROW_ID;
965
if (node->ins_type == INS_SEARCHED) {
966
/* Reset the cursor */
967
sel_node->state = SEL_NODE_OPEN;
969
/* Fetch a row to insert */
971
thr->run_node = sel_node;
977
if ((node->ins_type == INS_SEARCHED)
978
&& (sel_node->state != SEL_NODE_FETCH)) {
980
ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
982
/* No more rows to insert */
983
thr->run_node = parent;
988
/* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
990
err = row_ins(node, thr);
993
trx->error_state = err;
995
if (err == DB_SUCCESS) {
998
} else if (err == DB_LOCK_WAIT) {
1002
/* SQL error detected */
1007
/* DO THE TRIGGER ACTIONS HERE */
1009
if (node->ins_type == INS_SEARCHED) {
1010
/* Fetch a row to insert */
1012
thr->run_node = sel_node;
1014
thr->run_node = que_node_get_parent(node);