/*****************************************************************************

Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA

*****************************************************************************/
/**************************************************//**
The B-tree

Created 6/2/1994 Heikki Tuuri
*******************************************************/
33
#include "page0page.h"
36
#ifndef UNIV_HOTBACKUP
41
#include "lock0lock.h"
42
#include "ibuf0ibuf.h"
/*
Latching strategy of the InnoDB B-tree
--------------------------------------
A tree latch protects all non-leaf nodes of the tree. Each node of a tree
also has a latch of its own.

A B-tree operation normally first acquires an S-latch on the tree. It
searches down the tree and releases the tree latch when it has the
leaf node latch. To save CPU time we do not acquire any latch on
non-leaf nodes of the tree during a search, those pages are only bufferfixed.

If an operation needs to restructure the tree, it acquires an X-latch on
the tree before searching to a leaf node. If it needs, for example, to
split a leaf,
(1) InnoDB decides the split point in the leaf,
(2) allocates a new page,
(3) inserts the appropriate node pointer to the first non-leaf level,
(4) releases the tree X-latch,
(5) and then moves records from the leaf to the new allocated page.

Leaf pages of a B-tree contain the index records stored in the
tree. On levels n > 0 we store 'node pointers' to pages on level
n - 1. For each page there is exactly one node pointer stored:
thus our tree is an ordinary B-tree, not a B-link tree.

A node pointer contains a prefix P of an index record. The prefix
is long enough so that it determines an index record uniquely.
The file page number of the child page is added as the last
field. To the child page we can store node pointers or index records
which are >= P in the alphabetical order, but < P1 if there is
a next node pointer on the level, and P1 is its prefix.

If a node pointer with a prefix P points to a non-leaf child,
then the leftmost record in the child must have the same
prefix P. If it points to a leaf node, the child is not required
to contain any record with a prefix equal to P. The leaf case
is decided this way to allow arbitrary deletions in a leaf node
without touching upper levels of the tree.

We have predefined a special minimum record which we
define as the smallest record in any alphabetical order.
A minimum record is denoted by setting a bit in the record
header. A minimum record acts as the prefix of a node pointer
which points to a leftmost node on any level of the tree.

In the root node of a B-tree there are two file segment headers.
The leaf pages of a tree are allocated from one file segment, to
make them consecutive on disk if possible. From the other file segment
we allocate pages for the non-leaf levels of the tree.
*/
100
#ifdef UNIV_BTR_DEBUG
101
/**************************************************************//**
102
Checks a file segment header within a B-tree root page.
103
@return TRUE if valid */
106
btr_root_fseg_validate(
107
/*===================*/
108
const fseg_header_t* seg_header, /*!< in: segment header */
109
ulint space) /*!< in: tablespace identifier */
111
ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);
113
ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space);
114
ut_a(offset >= FIL_PAGE_DATA);
115
ut_a(offset <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END);
118
#endif /* UNIV_BTR_DEBUG */
120
/**************************************************************//**
121
Gets the root node of a tree and x-latches it.
122
@return root page, x-latched */
127
dict_index_t* index, /*!< in: index tree */
128
mtr_t* mtr) /*!< in: mtr */
135
space = dict_index_get_space(index);
136
zip_size = dict_table_zip_size(index->table);
137
root_page_no = dict_index_get_page(index);
139
block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);
140
ut_a((ibool)!!page_is_comp(buf_block_get_frame(block))
141
== dict_table_is_comp(index->table));
142
#ifdef UNIV_BTR_DEBUG
143
if (!dict_index_is_ibuf(index)) {
144
const page_t* root = buf_block_get_frame(block);
146
ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
148
ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
151
#endif /* UNIV_BTR_DEBUG */
156
/**************************************************************//**
157
Gets the root node of a tree and x-latches it.
158
@return root page, x-latched */
163
dict_index_t* index, /*!< in: index tree */
164
mtr_t* mtr) /*!< in: mtr */
166
return(buf_block_get_frame(btr_root_block_get(index, mtr)));
169
/*************************************************************//**
170
Gets pointer to the previous user record in the tree. It is assumed that
171
the caller has appropriate latches on the page and its neighbor.
172
@return previous user record, NULL if there is none */
175
btr_get_prev_user_rec(
176
/*==================*/
177
rec_t* rec, /*!< in: record on leaf level */
178
mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if
179
needed, also to the previous page */
185
if (!page_rec_is_infimum(rec)) {
187
rec_t* prev_rec = page_rec_get_prev(rec);
189
if (!page_rec_is_infimum(prev_rec)) {
195
page = page_align(rec);
196
prev_page_no = btr_page_get_prev(page, mtr);
198
if (prev_page_no != FIL_NULL) {
202
buf_block_t* prev_block;
204
space = page_get_space_id(page);
205
zip_size = fil_space_get_zip_size(space);
207
prev_block = buf_page_get_with_no_latch(space, zip_size,
209
prev_page = buf_block_get_frame(prev_block);
210
/* The caller must already have a latch to the brother */
211
ut_ad(mtr_memo_contains(mtr, prev_block,
213
|| mtr_memo_contains(mtr, prev_block,
214
MTR_MEMO_PAGE_X_FIX));
215
#ifdef UNIV_BTR_DEBUG
216
ut_a(page_is_comp(prev_page) == page_is_comp(page));
217
ut_a(btr_page_get_next(prev_page, mtr)
218
== page_get_page_no(page));
219
#endif /* UNIV_BTR_DEBUG */
221
return(page_rec_get_prev(page_get_supremum_rec(prev_page)));
227
/*************************************************************//**
228
Gets pointer to the next user record in the tree. It is assumed that the
229
caller has appropriate latches on the page and its neighbor.
230
@return next user record, NULL if there is none */
233
btr_get_next_user_rec(
234
/*==================*/
235
rec_t* rec, /*!< in: record on leaf level */
236
mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if
237
needed, also to the next page */
243
if (!page_rec_is_supremum(rec)) {
245
rec_t* next_rec = page_rec_get_next(rec);
247
if (!page_rec_is_supremum(next_rec)) {
253
page = page_align(rec);
254
next_page_no = btr_page_get_next(page, mtr);
256
if (next_page_no != FIL_NULL) {
259
buf_block_t* next_block;
261
space = page_get_space_id(page);
262
zip_size = fil_space_get_zip_size(space);
264
next_block = buf_page_get_with_no_latch(space, zip_size,
266
next_page = buf_block_get_frame(next_block);
267
/* The caller must already have a latch to the brother */
268
ut_ad(mtr_memo_contains(mtr, next_block, MTR_MEMO_PAGE_S_FIX)
269
|| mtr_memo_contains(mtr, next_block,
270
MTR_MEMO_PAGE_X_FIX));
271
#ifdef UNIV_BTR_DEBUG
272
ut_a(page_is_comp(next_page) == page_is_comp(page));
273
ut_a(btr_page_get_prev(next_page, mtr)
274
== page_get_page_no(page));
275
#endif /* UNIV_BTR_DEBUG */
277
return(page_rec_get_next(page_get_infimum_rec(next_page)));
283
/**************************************************************//**
284
Creates a new index page (not the root, and also not
285
used in page reorganization). @see btr_page_empty(). */
290
buf_block_t* block, /*!< in/out: page to be created */
291
page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
292
dict_index_t* index, /*!< in: index */
293
ulint level, /*!< in: the B-tree level of the page */
294
mtr_t* mtr) /*!< in: mtr */
296
page_t* page = buf_block_get_frame(block);
298
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
300
if (UNIV_LIKELY_NULL(page_zip)) {
301
page_create_zip(block, index, level, mtr);
303
page_create(block, mtr, dict_table_is_comp(index->table));
304
/* Set the level of the new index page */
305
btr_page_set_level(page, NULL, level, mtr);
308
block->check_index_page_at_flush = TRUE;
310
btr_page_set_index_id(page, page_zip, index->id, mtr);
313
/**************************************************************//**
314
Allocates a new file page to be used in an ibuf tree. Takes the page from
315
the free list of the tree, which must contain pages!
316
@return new allocated block, x-latched */
319
btr_page_alloc_for_ibuf(
320
/*====================*/
321
dict_index_t* index, /*!< in: index tree */
322
mtr_t* mtr) /*!< in: mtr */
324
fil_addr_t node_addr;
327
buf_block_t* new_block;
329
root = btr_root_get(index, mtr);
331
node_addr = flst_get_first(root + PAGE_HEADER
332
+ PAGE_BTR_IBUF_FREE_LIST, mtr);
333
ut_a(node_addr.page != FIL_NULL);
335
new_block = buf_page_get(dict_index_get_space(index),
336
dict_table_zip_size(index->table),
337
node_addr.page, RW_X_LATCH, mtr);
338
new_page = buf_block_get_frame(new_block);
339
buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
341
flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
342
new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
344
ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
350
/**************************************************************//**
351
Allocates a new file page to be used in an index tree. NOTE: we assume
352
that the caller has made the reservation for free extents!
353
@return new allocated block, x-latched; NULL if out of space */
358
dict_index_t* index, /*!< in: index */
359
ulint hint_page_no, /*!< in: hint of a good page */
360
byte file_direction, /*!< in: direction where a possible
361
page split is made */
362
ulint level, /*!< in: level where the page is placed
364
mtr_t* mtr) /*!< in: mtr */
366
fseg_header_t* seg_header;
368
buf_block_t* new_block;
371
if (dict_index_is_ibuf(index)) {
373
return(btr_page_alloc_for_ibuf(index, mtr));
376
root = btr_root_get(index, mtr);
379
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
381
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
384
/* Parameter TRUE below states that the caller has made the
385
reservation for free extents, and thus we know that a page can
388
new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no,
389
file_direction, TRUE, mtr);
390
if (new_page_no == FIL_NULL) {
395
new_block = buf_page_get(dict_index_get_space(index),
396
dict_table_zip_size(index->table),
397
new_page_no, RW_X_LATCH, mtr);
398
buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
403
/**************************************************************//**
404
Gets the number of pages in a B-tree.
405
@return number of pages */
410
dict_index_t* index, /*!< in: index */
411
ulint flag) /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
413
fseg_header_t* seg_header;
421
mtr_s_lock(dict_index_get_lock(index), &mtr);
423
root = btr_root_get(index, &mtr);
425
if (flag == BTR_N_LEAF_PAGES) {
426
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
428
fseg_n_reserved_pages(seg_header, &n, &mtr);
430
} else if (flag == BTR_TOTAL_SIZE) {
431
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
433
n = fseg_n_reserved_pages(seg_header, &dummy, &mtr);
435
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
437
n += fseg_n_reserved_pages(seg_header, &dummy, &mtr);
447
/**************************************************************//**
448
Frees a page used in an ibuf tree. Puts the page to the free list of the
452
btr_page_free_for_ibuf(
453
/*===================*/
454
dict_index_t* index, /*!< in: index tree */
455
buf_block_t* block, /*!< in: block to be freed, x-latched */
456
mtr_t* mtr) /*!< in: mtr */
460
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
461
root = btr_root_get(index, mtr);
463
flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
464
buf_block_get_frame(block)
465
+ PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
467
ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
471
/**************************************************************//**
472
Frees a file page used in an index tree. Can be used also to (BLOB)
473
external storage pages, because the page level 0 can be given as an
479
dict_index_t* index, /*!< in: index tree */
480
buf_block_t* block, /*!< in: block to be freed, x-latched */
481
ulint level, /*!< in: page level */
482
mtr_t* mtr) /*!< in: mtr */
484
fseg_header_t* seg_header;
487
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
488
/* The page gets invalid for optimistic searches: increment the frame
491
buf_block_modify_clock_inc(block);
493
if (dict_index_is_ibuf(index)) {
495
btr_page_free_for_ibuf(index, block, mtr);
500
root = btr_root_get(index, mtr);
503
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
505
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
508
fseg_free_page(seg_header,
509
buf_block_get_space(block),
510
buf_block_get_page_no(block), mtr);
513
/**************************************************************//**
514
Frees a file page used in an index tree. NOTE: cannot free field external
515
storage pages because the page must contain info on its level. */
520
dict_index_t* index, /*!< in: index tree */
521
buf_block_t* block, /*!< in: block to be freed, x-latched */
522
mtr_t* mtr) /*!< in: mtr */
526
level = btr_page_get_level(buf_block_get_frame(block), mtr);
528
btr_page_free_low(index, block, level, mtr);
531
/**************************************************************//**
532
Sets the child node file address in a node pointer. */
535
btr_node_ptr_set_child_page_no(
536
/*===========================*/
537
rec_t* rec, /*!< in: node pointer record */
538
page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
539
part will be updated, or NULL */
540
const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
541
ulint page_no,/*!< in: child node address */
542
mtr_t* mtr) /*!< in: mtr */
547
ut_ad(rec_offs_validate(rec, NULL, offsets));
548
ut_ad(!page_is_leaf(page_align(rec)));
549
ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
551
/* The child address is in the last field */
552
field = rec_get_nth_field(rec, offsets,
553
rec_offs_n_fields(offsets) - 1, &len);
555
ut_ad(len == REC_NODE_PTR_SIZE);
557
if (UNIV_LIKELY_NULL(page_zip)) {
558
page_zip_write_node_ptr(page_zip, rec,
559
rec_offs_data_size(offsets),
562
mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr);
566
/************************************************************//**
567
Returns the child page of a node pointer and x-latches it.
568
@return child page, x-latched */
571
btr_node_ptr_get_child(
572
/*===================*/
573
const rec_t* node_ptr,/*!< in: node pointer */
574
dict_index_t* index, /*!< in: index */
575
const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
576
mtr_t* mtr) /*!< in: mtr */
581
ut_ad(rec_offs_validate(node_ptr, index, offsets));
582
space = page_get_space_id(page_align(node_ptr));
583
page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
585
return(btr_block_get(space, dict_table_zip_size(index->table),
586
page_no, RW_X_LATCH, mtr));
589
/************************************************************//**
590
Returns the upper level node pointer to a page. It is assumed that mtr holds
591
an x-latch on the tree.
592
@return rec_get_offsets() of the node pointer record */
595
btr_page_get_father_node_ptr(
596
/*=========================*/
597
ulint* offsets,/*!< in: work area for the return value */
598
mem_heap_t* heap, /*!< in: memory heap to use */
599
btr_cur_t* cursor, /*!< in: cursor pointing to user record,
600
out: cursor on node pointer record,
601
its page x-latched */
602
mtr_t* mtr) /*!< in: mtr */
611
page_no = buf_block_get_page_no(btr_cur_get_block(cursor));
612
index = btr_cur_get_index(cursor);
614
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
617
ut_ad(dict_index_get_page(index) != page_no);
619
level = btr_page_get_level(btr_cur_get_page(cursor), mtr);
620
user_rec = btr_cur_get_rec(cursor);
621
ut_a(page_rec_is_user_rec(user_rec));
622
tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level);
624
btr_cur_search_to_nth_level(index, level + 1, tuple, PAGE_CUR_LE,
625
BTR_CONT_MODIFY_TREE, cursor, 0, mtr);
627
node_ptr = btr_cur_get_rec(cursor);
628
ut_ad(!page_rec_is_comp(node_ptr)
629
|| rec_get_status(node_ptr) == REC_STATUS_NODE_PTR);
630
offsets = rec_get_offsets(node_ptr, index, offsets,
631
ULINT_UNDEFINED, &heap);
633
if (UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr, offsets)
636
fputs("InnoDB: Dump of the child page:\n", stderr);
637
buf_page_print(page_align(user_rec), 0);
638
fputs("InnoDB: Dump of the parent page:\n", stderr);
639
buf_page_print(page_align(node_ptr), 0);
641
fputs("InnoDB: Corruption of an index tree: table ", stderr);
642
ut_print_name(stderr, NULL, TRUE, index->table_name);
643
fputs(", index ", stderr);
644
ut_print_name(stderr, NULL, FALSE, index->name);
645
fprintf(stderr, ",\n"
646
"InnoDB: father ptr page no %lu, child page no %lu\n",
648
btr_node_ptr_get_child_page_no(node_ptr, offsets),
650
print_rec = page_rec_get_next(
651
page_get_infimum_rec(page_align(user_rec)));
652
offsets = rec_get_offsets(print_rec, index,
653
offsets, ULINT_UNDEFINED, &heap);
654
page_rec_print(print_rec, offsets);
655
offsets = rec_get_offsets(node_ptr, index, offsets,
656
ULINT_UNDEFINED, &heap);
657
page_rec_print(node_ptr, offsets);
659
fputs("InnoDB: You should dump + drop + reimport the table"
661
"InnoDB: corruption. If the crash happens at "
662
"the database startup, see\n"
663
"InnoDB: " REFMAN "forcing-recovery.html about\n"
664
"InnoDB: forcing recovery. "
665
"Then dump + drop + reimport.\n", stderr);
673
/************************************************************//**
674
Returns the upper level node pointer to a page. It is assumed that mtr holds
675
an x-latch on the tree.
676
@return rec_get_offsets() of the node pointer record */
679
btr_page_get_father_block(
680
/*======================*/
681
ulint* offsets,/*!< in: work area for the return value */
682
mem_heap_t* heap, /*!< in: memory heap to use */
683
dict_index_t* index, /*!< in: b-tree index */
684
buf_block_t* block, /*!< in: child page in the index */
685
mtr_t* mtr, /*!< in: mtr */
686
btr_cur_t* cursor) /*!< out: cursor on node pointer record,
687
its page x-latched */
690
= page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
692
btr_cur_position(index, rec, block, cursor);
693
return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr));
696
/************************************************************//**
697
Seeks to the upper level node pointer to a page.
698
It is assumed that mtr holds an x-latch on the tree. */
703
dict_index_t* index, /*!< in: b-tree index */
704
buf_block_t* block, /*!< in: child page in the index */
705
mtr_t* mtr, /*!< in: mtr */
706
btr_cur_t* cursor) /*!< out: cursor on node pointer record,
707
its page x-latched */
711
= page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
713
btr_cur_position(index, rec, block, cursor);
715
heap = mem_heap_create(100);
716
btr_page_get_father_node_ptr(NULL, heap, cursor, mtr);
720
/************************************************************//**
721
Creates the root node for a new index tree.
722
@return page number of the created root, FIL_NULL if did not succeed */
727
ulint type, /*!< in: type of the index */
728
ulint space, /*!< in: space where created */
729
ulint zip_size,/*!< in: compressed page size in bytes
730
or 0 for uncompressed pages */
731
dulint index_id,/*!< in: index id */
732
dict_index_t* index, /*!< in: index */
733
mtr_t* mtr) /*!< in: mini-transaction handle */
739
page_zip_des_t* page_zip;
741
/* Create the two new segments (one, in the case of an ibuf tree) for
742
the index tree; the segment headers are put on the allocated root page
743
(for an ibuf tree, not in the root, but on a separate ibuf header
746
if (type & DICT_IBUF) {
747
/* Allocate first the ibuf header page */
748
buf_block_t* ibuf_hdr_block = fseg_create(
750
IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);
752
buf_block_dbg_add_level(ibuf_hdr_block, SYNC_TREE_NODE_NEW);
754
ut_ad(buf_block_get_page_no(ibuf_hdr_block)
755
== IBUF_HEADER_PAGE_NO);
756
/* Allocate then the next page to the segment: it will be the
759
page_no = fseg_alloc_free_page(buf_block_get_frame(
762
+ IBUF_TREE_SEG_HEADER,
763
IBUF_TREE_ROOT_PAGE_NO,
765
ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);
767
block = buf_page_get(space, zip_size, page_no,
770
block = fseg_create(space, 0,
771
PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
779
page_no = buf_block_get_page_no(block);
780
frame = buf_block_get_frame(block);
782
buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
784
if (type & DICT_IBUF) {
785
/* It is an insert buffer tree: initialize the free list */
787
ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);
789
flst_init(frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
791
/* It is a non-ibuf tree: create a file segment for leaf
793
if (!fseg_create(space, page_no,
794
PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr)) {
795
/* Not enough space for new segment, free root
796
segment before return. */
797
btr_free_root(space, zip_size, page_no, mtr);
802
/* The fseg create acquires a second latch on the page,
803
therefore we must declare it: */
804
buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
807
/* Create a new index page on the allocated segment page */
808
page_zip = buf_block_get_page_zip(block);
810
if (UNIV_LIKELY_NULL(page_zip)) {
811
page = page_create_zip(block, index, 0, mtr);
813
page = page_create(block, mtr,
814
dict_table_is_comp(index->table));
815
/* Set the level of the new index page */
816
btr_page_set_level(page, NULL, 0, mtr);
819
block->check_index_page_at_flush = TRUE;
821
/* Set the index id of the page */
822
btr_page_set_index_id(page, page_zip, index_id, mtr);
824
/* Set the next node and previous node fields */
825
btr_page_set_next(page, page_zip, FIL_NULL, mtr);
826
btr_page_set_prev(page, page_zip, FIL_NULL, mtr);
828
/* We reset the free bits for the page to allow creation of several
829
trees in the same mtr, otherwise the latch on a bitmap page would
830
prevent it because of the latching order */
832
if (!(type & DICT_CLUSTERED)) {
833
ibuf_reset_free_bits(block);
836
/* In the following assertion we test that two records of maximum
837
allowed size fit on the root page: this fact is needed to ensure
838
correctness of split algorithms */
840
ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE);
845
/************************************************************//**
846
Frees a B-tree except the root page, which MUST be freed after this
847
by calling btr_free_root. */
850
btr_free_but_not_root(
851
/*==================*/
852
ulint space, /*!< in: space where created */
853
ulint zip_size, /*!< in: compressed page size in bytes
854
or 0 for uncompressed pages */
855
ulint root_page_no) /*!< in: root page number */
864
root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, &mtr);
865
#ifdef UNIV_BTR_DEBUG
866
ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
868
ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
870
#endif /* UNIV_BTR_DEBUG */
872
/* NOTE: page hash indexes are dropped when a page is freed inside
875
finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
886
root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, &mtr);
887
#ifdef UNIV_BTR_DEBUG
888
ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
890
#endif /* UNIV_BTR_DEBUG */
892
finished = fseg_free_step_not_header(
893
root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);
902
/************************************************************//**
903
Frees the B-tree root page. Other tree MUST already have been freed. */
908
ulint space, /*!< in: space where created */
909
ulint zip_size, /*!< in: compressed page size in bytes
910
or 0 for uncompressed pages */
911
ulint root_page_no, /*!< in: root page number */
912
mtr_t* mtr) /*!< in: a mini-transaction which has already
916
fseg_header_t* header;
918
block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);
920
btr_search_drop_page_hash_index(block);
922
header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP;
923
#ifdef UNIV_BTR_DEBUG
924
ut_a(btr_root_fseg_validate(header, space));
925
#endif /* UNIV_BTR_DEBUG */
927
while (!fseg_free_step(header, mtr));
929
#endif /* !UNIV_HOTBACKUP */
931
/*************************************************************//**
932
Reorganizes an index page. */
935
btr_page_reorganize_low(
936
/*====================*/
937
ibool recovery,/*!< in: TRUE if called in recovery:
938
locks should not be updated, i.e.,
939
there cannot exist locks on the
940
page, and a hash index should not be
941
dropped: it cannot exist */
942
buf_block_t* block, /*!< in: page to be reorganized */
943
dict_index_t* index, /*!< in: record descriptor */
944
mtr_t* mtr) /*!< in: mtr */
946
page_t* page = buf_block_get_frame(block);
947
page_zip_des_t* page_zip = buf_block_get_page_zip(block);
948
buf_block_t* temp_block;
955
ibool success = FALSE;
957
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
958
ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
959
#ifdef UNIV_ZIP_DEBUG
960
ut_a(!page_zip || page_zip_validate(page_zip, page));
961
#endif /* UNIV_ZIP_DEBUG */
962
data_size1 = page_get_data_size(page);
963
max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1);
965
#ifndef UNIV_HOTBACKUP
966
/* Write the log record */
967
mlog_open_and_write_index(mtr, page, index, page_is_comp(page)
968
? MLOG_COMP_PAGE_REORGANIZE
969
: MLOG_PAGE_REORGANIZE, 0);
970
#endif /* !UNIV_HOTBACKUP */
972
/* Turn logging off */
973
log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
975
#ifndef UNIV_HOTBACKUP
976
temp_block = buf_block_alloc(0);
977
#else /* !UNIV_HOTBACKUP */
978
ut_ad(block == back_block1);
979
temp_block = back_block2;
980
#endif /* !UNIV_HOTBACKUP */
981
temp_page = temp_block->frame;
983
/* Copy the old page to temporary space */
984
buf_frame_copy(temp_page, page);
986
#ifndef UNIV_HOTBACKUP
987
if (UNIV_LIKELY(!recovery)) {
988
btr_search_drop_page_hash_index(block);
991
block->check_index_page_at_flush = TRUE;
992
#endif /* !UNIV_HOTBACKUP */
994
/* Recreate the page: note that global data on page (possible
995
segment headers, next page-field, etc.) is preserved intact */
997
page_create(block, mtr, dict_table_is_comp(index->table));
999
/* Copy the records from the temporary space to the recreated page;
1000
do not copy the lock bits yet */
1002
page_copy_rec_list_end_no_locks(block, temp_block,
1003
page_get_infimum_rec(temp_page),
1006
if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
1007
/* Copy max trx id to recreated page */
1008
trx_id_t max_trx_id = page_get_max_trx_id(temp_page);
1009
page_set_max_trx_id(block, NULL, max_trx_id, mtr);
1010
/* In crash recovery, dict_index_is_sec_or_ibuf() always
1011
returns TRUE, even for clustered indexes. max_trx_id is
1012
unused in clustered index pages. */
1013
ut_ad(!ut_dulint_is_zero(max_trx_id) || recovery);
1016
if (UNIV_LIKELY_NULL(page_zip)
1018
(!page_zip_compress(page_zip, page, index, NULL))) {
1020
/* Restore the old page and exit. */
1022
#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
1023
/* Check that the bytes that we skip are identical. */
1024
ut_a(!memcmp(page, temp_page, PAGE_HEADER));
1025
ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
1026
PAGE_HEADER + PAGE_N_RECS + temp_page,
1027
PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
1028
ut_a(!memcmp(UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page,
1029
UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + temp_page,
1030
FIL_PAGE_DATA_END));
1031
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
1033
memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
1034
PAGE_N_RECS - PAGE_N_DIR_SLOTS);
1035
memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
1036
UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END);
1038
#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
1039
ut_a(!memcmp(page, temp_page, UNIV_PAGE_SIZE));
1040
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
1045
#ifndef UNIV_HOTBACKUP
1046
if (UNIV_LIKELY(!recovery)) {
1047
/* Update the record lock bitmaps */
1048
lock_move_reorganize_page(block, temp_block);
1050
#endif /* !UNIV_HOTBACKUP */
1052
data_size2 = page_get_data_size(page);
1053
max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1);
1055
if (UNIV_UNLIKELY(data_size1 != data_size2)
1056
|| UNIV_UNLIKELY(max_ins_size1 != max_ins_size2)) {
1057
buf_page_print(page, 0);
1058
buf_page_print(temp_page, 0);
1060
"InnoDB: Error: page old data size %lu"
1061
" new data size %lu\n"
1062
"InnoDB: Error: page old max ins size %lu"
1063
" new max ins size %lu\n"
1064
"InnoDB: Submit a detailed bug report"
1065
" to http://bugs.mysql.com\n",
1066
(unsigned long) data_size1, (unsigned long) data_size2,
1067
(unsigned long) max_ins_size1,
1068
(unsigned long) max_ins_size2);
1074
#ifdef UNIV_ZIP_DEBUG
1075
ut_a(!page_zip || page_zip_validate(page_zip, page));
1076
#endif /* UNIV_ZIP_DEBUG */
1077
#ifndef UNIV_HOTBACKUP
1078
buf_block_free(temp_block);
1079
#endif /* !UNIV_HOTBACKUP */
1081
/* Restore logging mode */
1082
mtr_set_log_mode(mtr, log_mode);
1087
#ifndef UNIV_HOTBACKUP
1088
/*************************************************************//**
1089
Reorganizes an index page.
1090
IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf
1091
page of a non-clustered index, the caller must update the insert
1092
buffer free bits in the same mini-transaction in such a way that the
1093
modification will be redo-logged.
1094
@return TRUE on success, FALSE on failure */
1097
btr_page_reorganize(
1098
/*================*/
1099
buf_block_t* block, /*!< in: page to be reorganized */
1100
dict_index_t* index, /*!< in: record descriptor */
1101
mtr_t* mtr) /*!< in: mtr */
1103
return(btr_page_reorganize_low(FALSE, block, index, mtr));
1105
#endif /* !UNIV_HOTBACKUP */
1107
/***********************************************************//**
1108
Parses a redo log record of reorganizing a page.
1109
@return end of log record or NULL */
1112
btr_parse_page_reorganize(
1113
/*======================*/
1114
byte* ptr, /*!< in: buffer */
1115
byte* end_ptr __attribute__((unused)),
1116
/*!< in: buffer end */
1117
dict_index_t* index, /*!< in: record descriptor */
1118
buf_block_t* block, /*!< in: page to be reorganized, or NULL */
1119
mtr_t* mtr) /*!< in: mtr or NULL */
1121
ut_ad(ptr && end_ptr);
1123
/* The record is empty, except for the record initial part */
1125
if (UNIV_LIKELY(block != NULL)) {
1126
btr_page_reorganize_low(TRUE, block, index, mtr);
1132
#ifndef UNIV_HOTBACKUP
1133
/*************************************************************//**
1134
Empties an index page. @see btr_page_create(). */
1139
buf_block_t* block, /*!< in: page to be emptied */
1140
page_zip_des_t* page_zip,/*!< out: compressed page, or NULL */
1141
dict_index_t* index, /*!< in: index of the page */
1142
ulint level, /*!< in: the B-tree level of the page */
1143
mtr_t* mtr) /*!< in: mtr */
1145
page_t* page = buf_block_get_frame(block);
1147
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
1148
ut_ad(page_zip == buf_block_get_page_zip(block));
1149
#ifdef UNIV_ZIP_DEBUG
1150
ut_a(!page_zip || page_zip_validate(page_zip, page));
1151
#endif /* UNIV_ZIP_DEBUG */
1153
btr_search_drop_page_hash_index(block);
1155
/* Recreate the page: note that global data on page (possible
1156
segment headers, next page-field, etc.) is preserved intact */
1158
if (UNIV_LIKELY_NULL(page_zip)) {
1159
page_create_zip(block, index, level, mtr);
1161
page_create(block, mtr, dict_table_is_comp(index->table));
1162
btr_page_set_level(page, NULL, level, mtr);
1165
block->check_index_page_at_flush = TRUE;
1168
/*************************************************************//**
1169
Makes tree one level higher by splitting the root, and inserts
1170
the tuple. It is assumed that mtr contains an x-latch on the tree.
1171
NOTE that the operation of this function must always succeed,
1172
we cannot reverse it: therefore enough free disk space must be
1173
guaranteed to be available before this function is called.
1174
@return inserted record */
1177
btr_root_raise_and_insert(
1178
/*======================*/
1179
btr_cur_t* cursor, /*!< in: cursor at which to insert: must be
1180
on the root page; when the function returns,
1181
the cursor is positioned on the predecessor
1182
of the inserted record */
1183
const dtuple_t* tuple, /*!< in: tuple to insert */
1184
ulint n_ext, /*!< in: number of externally stored columns */
1185
mtr_t* mtr) /*!< in: mtr */
1187
dict_index_t* index;
1195
rec_t* node_ptr_rec;
1196
page_cur_t* page_cursor;
1197
page_zip_des_t* root_page_zip;
1198
page_zip_des_t* new_page_zip;
1199
buf_block_t* root_block;
1200
buf_block_t* new_block;
1202
root = btr_cur_get_page(cursor);
1203
root_block = btr_cur_get_block(cursor);
1204
root_page_zip = buf_block_get_page_zip(root_block);
1205
#ifdef UNIV_ZIP_DEBUG
1206
ut_a(!root_page_zip || page_zip_validate(root_page_zip, root));
1207
#endif /* UNIV_ZIP_DEBUG */
1208
index = btr_cur_get_index(cursor);
1209
#ifdef UNIV_BTR_DEBUG
1210
if (!dict_index_is_ibuf(index)) {
1211
ulint space = dict_index_get_space(index);
1213
ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
1215
ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
1219
ut_a(dict_index_get_page(index) == page_get_page_no(root));
1220
#endif /* UNIV_BTR_DEBUG */
1221
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
1223
ut_ad(mtr_memo_contains(mtr, root_block, MTR_MEMO_PAGE_X_FIX));
1225
/* Allocate a new page to the tree. Root splitting is done by first
1226
moving the root records to the new page, emptying the root, putting
1227
a node pointer to the new page, and then splitting the new page. */
1229
level = btr_page_get_level(root, mtr);
1231
new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr);
1232
new_page = buf_block_get_frame(new_block);
1233
new_page_zip = buf_block_get_page_zip(new_block);
1234
ut_a(!new_page_zip == !root_page_zip);
1236
|| page_zip_get_size(new_page_zip)
1237
== page_zip_get_size(root_page_zip));
1239
btr_page_create(new_block, new_page_zip, index, level, mtr);
1241
/* Set the next node and previous node fields of new page */
1242
btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr);
1243
btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr);
1245
/* Copy the records from root to the new page one by one. */
1248
#ifdef UNIV_ZIP_COPY
1250
#endif /* UNIV_ZIP_COPY */
1252
(!page_copy_rec_list_end(new_block, root_block,
1253
page_get_infimum_rec(root),
1257
/* Copy the page byte for byte. */
1258
page_zip_copy_recs(new_page_zip, new_page,
1259
root_page_zip, root, index, mtr);
1261
/* Update the lock table and possible hash index. */
1263
lock_move_rec_list_end(new_block, root_block,
1264
page_get_infimum_rec(root));
1266
btr_search_move_or_delete_hash_entries(new_block, root_block,
1270
/* If this is a pessimistic insert which is actually done to
1271
perform a pessimistic update then we have stored the lock
1272
information of the record to be inserted on the infimum of the
1273
root page: we cannot discard the lock structs on the root page */
1275
lock_update_root_raise(new_block, root_block);
1277
/* Create a memory heap where the node pointer is stored */
1278
heap = mem_heap_create(100);
1280
rec = page_rec_get_next(page_get_infimum_rec(new_page));
1281
new_page_no = buf_block_get_page_no(new_block);
1283
/* Build the node pointer (= node key and page address) for the
1286
node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap,
1288
/* The node pointer must be marked as the predefined minimum record,
1289
as there is no lower alphabetical limit to records in the leftmost
1291
dtuple_set_info_bits(node_ptr,
1292
dtuple_get_info_bits(node_ptr)
1293
| REC_INFO_MIN_REC_FLAG);
1295
/* Rebuild the root page to get free space */
1296
btr_page_empty(root_block, root_page_zip, index, level + 1, mtr);
1298
/* Set the next node and previous node fields, although
1299
they should already have been set. The previous node field
1300
must be FIL_NULL if root_page_zip != NULL, because the
1301
REC_INFO_MIN_REC_FLAG (of the first user record) will be
1302
set if and only if btr_page_get_prev() == FIL_NULL. */
1303
btr_page_set_next(root, root_page_zip, FIL_NULL, mtr);
1304
btr_page_set_prev(root, root_page_zip, FIL_NULL, mtr);
1306
page_cursor = btr_cur_get_page_cur(cursor);
1308
/* Insert node pointer to the root */
1310
page_cur_set_before_first(root_block, page_cursor);
1312
node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
1315
/* The root page should only contain the node pointer
1316
to new_page at this point. Thus, the data should fit. */
1319
/* Free the memory heap */
1320
mem_heap_free(heap);
1322
/* We play safe and reset the free bits for the new page */
1325
fprintf(stderr, "Root raise new page no %lu\n", new_page_no);
1328
if (!dict_index_is_clust(index)) {
1329
ibuf_reset_free_bits(new_block);
1332
/* Reposition the cursor to the child node */
1333
page_cur_search(new_block, index, tuple,
1334
PAGE_CUR_LE, page_cursor);
1336
/* Split the child and insert tuple */
1337
return(btr_page_split_and_insert(cursor, tuple, n_ext, mtr));
1340
/*************************************************************//**
1341
Decides if the page should be split at the convergence point of inserts
1342
converging to the left.
1343
@return TRUE if split recommended */
1346
btr_page_get_split_rec_to_left(
1347
/*===========================*/
1348
btr_cur_t* cursor, /*!< in: cursor at which to insert */
1349
rec_t** split_rec) /*!< out: if split recommended,
1350
the first record on upper half page,
1351
or NULL if tuple to be inserted should
1355
rec_t* insert_point;
1358
page = btr_cur_get_page(cursor);
1359
insert_point = btr_cur_get_rec(cursor);
1361
if (page_header_get_ptr(page, PAGE_LAST_INSERT)
1362
== page_rec_get_next(insert_point)) {
1364
infimum = page_get_infimum_rec(page);
1366
/* If the convergence is in the middle of a page, include also
1367
the record immediately before the new insert to the upper
1368
page. Otherwise, we could repeatedly move from page to page
1369
lots of records smaller than the convergence point. */
1371
if (infimum != insert_point
1372
&& page_rec_get_next(infimum) != insert_point) {
1374
*split_rec = insert_point;
1376
*split_rec = page_rec_get_next(insert_point);
1385
/*************************************************************//**
1386
Decides if the page should be split at the convergence point of inserts
1387
converging to the right.
1388
@return TRUE if split recommended */
1391
btr_page_get_split_rec_to_right(
1392
/*============================*/
1393
btr_cur_t* cursor, /*!< in: cursor at which to insert */
1394
rec_t** split_rec) /*!< out: if split recommended,
1395
the first record on upper half page,
1396
or NULL if tuple to be inserted should
1400
rec_t* insert_point;
1402
page = btr_cur_get_page(cursor);
1403
insert_point = btr_cur_get_rec(cursor);
1405
/* We use eager heuristics: if the new insert would be right after
1406
the previous insert on the same page, we assume that there is a
1407
pattern of sequential inserts here. */
1409
if (UNIV_LIKELY(page_header_get_ptr(page, PAGE_LAST_INSERT)
1414
next_rec = page_rec_get_next(insert_point);
1416
if (page_rec_is_supremum(next_rec)) {
1418
/* Split at the new record to insert */
1421
rec_t* next_next_rec = page_rec_get_next(next_rec);
1422
if (page_rec_is_supremum(next_next_rec)) {
1427
/* If there are >= 2 user records up from the insert
1428
point, split all but 1 off. We want to keep one because
1429
then sequential inserts can use the adaptive hash
1430
index, as they can do the necessary checks of the right
1431
search position just by looking at the records on this
1434
*split_rec = next_next_rec;
1443
/*************************************************************//**
1444
Calculates a split record such that the tuple will certainly fit on
1445
its half-page when the split is performed. We assume in this function
1446
only that the cursor page has at least one user record.
1447
@return split record, or NULL if tuple will be the first record on
1451
btr_page_get_sure_split_rec(
1452
/*========================*/
1453
btr_cur_t* cursor, /*!< in: cursor at which insert should be made */
1454
const dtuple_t* tuple, /*!< in: tuple to insert */
1455
ulint n_ext) /*!< in: number of externally stored columns */
1458
page_zip_des_t* page_zip;
1472
page = btr_cur_get_page(cursor);
1474
insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
1475
free_space = page_get_free_space_of_empty(page_is_comp(page));
1477
page_zip = btr_cur_get_page_zip(cursor);
1478
if (UNIV_LIKELY_NULL(page_zip)) {
1479
/* Estimate the free space of an empty compressed page. */
1480
ulint free_space_zip = page_zip_empty_size(
1481
cursor->index->n_fields,
1482
page_zip_get_size(page_zip));
1484
if (UNIV_LIKELY(free_space > (ulint) free_space_zip)) {
1485
free_space = (ulint) free_space_zip;
1489
/* free_space is now the free space of a created new page */
1491
total_data = page_get_data_size(page) + insert_size;
1492
total_n_recs = page_get_n_recs(page) + 1;
1493
ut_ad(total_n_recs >= 2);
1494
total_space = total_data + page_dir_calc_reserved_space(total_n_recs);
1498
ins_rec = btr_cur_get_rec(cursor);
1499
rec = page_get_infimum_rec(page);
1504
/* We start to include records to the left half, and when the
1505
space reserved by them exceeds half of total_space, then if
1506
the included records fit on the left page, they will be put there
1507
if something was left over also for the right page,
1508
otherwise the last included record will be the first on the right
1512
/* Decide the next record to include */
1513
if (rec == ins_rec) {
1514
rec = NULL; /* NULL denotes that tuple is
1516
} else if (rec == NULL) {
1517
rec = page_rec_get_next(ins_rec);
1519
rec = page_rec_get_next(rec);
1524
incl_data += insert_size;
1526
offsets = rec_get_offsets(rec, cursor->index,
1527
offsets, ULINT_UNDEFINED,
1529
incl_data += rec_offs_size(offsets);
1533
} while (incl_data + page_dir_calc_reserved_space(n)
1536
if (incl_data + page_dir_calc_reserved_space(n) <= free_space) {
1537
/* The next record will be the first on
1538
the right half page if it is not the
1539
supremum record of page */
1541
if (rec == ins_rec) {
1545
} else if (rec == NULL) {
1546
next_rec = page_rec_get_next(ins_rec);
1548
next_rec = page_rec_get_next(rec);
1551
if (!page_rec_is_supremum(next_rec)) {
1557
if (UNIV_LIKELY_NULL(heap)) {
1558
mem_heap_free(heap);
1563
/*************************************************************//**
1564
Returns TRUE if the insert fits on the appropriate half-page with the
1566
@return TRUE if fits */
1569
btr_page_insert_fits(
1570
/*=================*/
1571
btr_cur_t* cursor, /*!< in: cursor at which insert
1573
const rec_t* split_rec,/*!< in: suggestion for first record
1574
on upper half-page, or NULL if
1575
tuple to be inserted should be first */
1576
const ulint* offsets,/*!< in: rec_get_offsets(
1577
split_rec, cursor->index) */
1578
const dtuple_t* tuple, /*!< in: tuple to insert */
1579
ulint n_ext, /*!< in: number of externally stored columns */
1580
mem_heap_t* heap) /*!< in: temporary memory heap */
1588
const rec_t* end_rec;
1591
page = btr_cur_get_page(cursor);
1593
ut_ad(!split_rec == !offsets);
1595
|| !page_is_comp(page) == !rec_offs_comp(offsets));
1597
|| rec_offs_validate(split_rec, cursor->index, offsets));
1599
insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
1600
free_space = page_get_free_space_of_empty(page_is_comp(page));
1602
/* free_space is now the free space of a created new page */
1604
total_data = page_get_data_size(page) + insert_size;
1605
total_n_recs = page_get_n_recs(page) + 1;
1607
/* We determine which records (from rec to end_rec, not including
1608
end_rec) will end up on the other half page from tuple when it is
1611
if (split_rec == NULL) {
1612
rec = page_rec_get_next(page_get_infimum_rec(page));
1613
end_rec = page_rec_get_next(btr_cur_get_rec(cursor));
1615
} else if (cmp_dtuple_rec(tuple, split_rec, offsets) >= 0) {
1617
rec = page_rec_get_next(page_get_infimum_rec(page));
1618
end_rec = split_rec;
1621
end_rec = page_get_supremum_rec(page);
1624
if (total_data + page_dir_calc_reserved_space(total_n_recs)
1627
/* Ok, there will be enough available space on the
1628
half page where the tuple is inserted */
1635
while (rec != end_rec) {
1636
/* In this loop we calculate the amount of reserved
1637
space after rec is removed from page. */
1639
offs = rec_get_offsets(rec, cursor->index, offs,
1640
ULINT_UNDEFINED, &heap);
1642
total_data -= rec_offs_size(offs);
1645
if (total_data + page_dir_calc_reserved_space(total_n_recs)
1648
/* Ok, there will be enough available space on the
1649
half page where the tuple is inserted */
1654
rec = page_rec_get_next_const(rec);
1660
/*******************************************************//**
1661
Inserts a data tuple to a tree on a non-leaf level. It is assumed
1662
that mtr holds an x-latch on the tree. */
1665
btr_insert_on_non_leaf_level(
1666
/*=========================*/
1667
dict_index_t* index, /*!< in: index */
1668
ulint level, /*!< in: level, must be > 0 */
1669
dtuple_t* tuple, /*!< in: the record to be inserted */
1670
mtr_t* mtr) /*!< in: mtr */
1672
big_rec_t* dummy_big_rec;
1679
btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE,
1680
BTR_CONT_MODIFY_TREE,
1683
err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG
1685
| BTR_NO_UNDO_LOG_FLAG,
1686
&cursor, tuple, &rec,
1687
&dummy_big_rec, 0, NULL, mtr);
1688
ut_a(err == DB_SUCCESS);
1691
/**************************************************************//**
1692
Attaches the halves of an index page on the appropriate level in an
1696
btr_attach_half_pages(
1697
/*==================*/
1698
dict_index_t* index, /*!< in: the index tree */
1699
buf_block_t* block, /*!< in/out: page to be split */
1700
rec_t* split_rec, /*!< in: first record on upper
1702
buf_block_t* new_block, /*!< in/out: the new half page */
1703
ulint direction, /*!< in: FSP_UP or FSP_DOWN */
1704
mtr_t* mtr) /*!< in: mtr */
1711
page_t* page = buf_block_get_frame(block);
1714
ulint lower_page_no;
1715
ulint upper_page_no;
1716
page_zip_des_t* lower_page_zip;
1717
page_zip_des_t* upper_page_zip;
1718
dtuple_t* node_ptr_upper;
1721
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
1722
ut_ad(mtr_memo_contains(mtr, new_block, MTR_MEMO_PAGE_X_FIX));
1724
/* Create a memory heap where the data tuple is stored */
1725
heap = mem_heap_create(1024);
1727
/* Based on split direction, decide upper and lower pages */
1728
if (direction == FSP_DOWN) {
1733
lower_page = buf_block_get_frame(new_block);
1734
lower_page_no = buf_block_get_page_no(new_block);
1735
lower_page_zip = buf_block_get_page_zip(new_block);
1736
upper_page = buf_block_get_frame(block);
1737
upper_page_no = buf_block_get_page_no(block);
1738
upper_page_zip = buf_block_get_page_zip(block);
1740
/* Look up the index for the node pointer to page */
1741
offsets = btr_page_get_father_block(NULL, heap, index,
1742
block, mtr, &cursor);
1744
/* Replace the address of the old child node (= page) with the
1745
address of the new lower half */
1747
btr_node_ptr_set_child_page_no(
1748
btr_cur_get_rec(&cursor),
1749
btr_cur_get_page_zip(&cursor),
1750
offsets, lower_page_no, mtr);
1751
mem_heap_empty(heap);
1753
lower_page = buf_block_get_frame(block);
1754
lower_page_no = buf_block_get_page_no(block);
1755
lower_page_zip = buf_block_get_page_zip(block);
1756
upper_page = buf_block_get_frame(new_block);
1757
upper_page_no = buf_block_get_page_no(new_block);
1758
upper_page_zip = buf_block_get_page_zip(new_block);
1761
/* Get the level of the split pages */
1762
level = btr_page_get_level(buf_block_get_frame(block), mtr);
1764
== btr_page_get_level(buf_block_get_frame(new_block), mtr));
1766
/* Build the node pointer (= node key and page address) for the upper
1769
node_ptr_upper = dict_index_build_node_ptr(index, split_rec,
1770
upper_page_no, heap, level);
1772
/* Insert it next to the pointer to the lower half. Note that this
1773
may generate recursion leading to a split on the higher level. */
1775
btr_insert_on_non_leaf_level(index, level + 1, node_ptr_upper, mtr);
1777
/* Free the memory heap */
1778
mem_heap_free(heap);
1780
/* Get the previous and next pages of page */
1782
prev_page_no = btr_page_get_prev(page, mtr);
1783
next_page_no = btr_page_get_next(page, mtr);
1784
space = buf_block_get_space(block);
1785
zip_size = buf_block_get_zip_size(block);
1787
/* Update page links of the level */
1789
if (prev_page_no != FIL_NULL) {
1790
buf_block_t* prev_block = btr_block_get(space, zip_size,
1793
#ifdef UNIV_BTR_DEBUG
1794
ut_a(page_is_comp(prev_block->frame) == page_is_comp(page));
1795
ut_a(btr_page_get_next(prev_block->frame, mtr)
1796
== buf_block_get_page_no(block));
1797
#endif /* UNIV_BTR_DEBUG */
1799
btr_page_set_next(buf_block_get_frame(prev_block),
1800
buf_block_get_page_zip(prev_block),
1801
lower_page_no, mtr);
1804
if (next_page_no != FIL_NULL) {
1805
buf_block_t* next_block = btr_block_get(space, zip_size,
1808
#ifdef UNIV_BTR_DEBUG
1809
ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
1810
ut_a(btr_page_get_prev(next_block->frame, mtr)
1811
== page_get_page_no(page));
1812
#endif /* UNIV_BTR_DEBUG */
1814
btr_page_set_prev(buf_block_get_frame(next_block),
1815
buf_block_get_page_zip(next_block),
1816
upper_page_no, mtr);
1819
btr_page_set_prev(lower_page, lower_page_zip, prev_page_no, mtr);
1820
btr_page_set_next(lower_page, lower_page_zip, upper_page_no, mtr);
1822
btr_page_set_prev(upper_page, upper_page_zip, lower_page_no, mtr);
1823
btr_page_set_next(upper_page, upper_page_zip, next_page_no, mtr);
1826
/*************************************************************//**
1827
Splits an index page to halves and inserts the tuple. It is assumed
1828
that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
1829
released within this function! NOTE that the operation of this
1830
function must always succeed, we cannot reverse it: therefore enough
1831
free disk space (2 pages) must be guaranteed to be available before
1832
this function is called.
1834
@return inserted record */
1837
btr_page_split_and_insert(
1838
/*======================*/
1839
btr_cur_t* cursor, /*!< in: cursor at which to insert; when the
1840
function returns, the cursor is positioned
1841
on the predecessor of the inserted record */
1842
const dtuple_t* tuple, /*!< in: tuple to insert */
1843
ulint n_ext, /*!< in: number of externally stored columns */
1844
mtr_t* mtr) /*!< in: mtr */
1848
page_zip_des_t* page_zip;
1852
buf_block_t* new_block;
1854
page_zip_des_t* new_page_zip;
1856
buf_block_t* left_block;
1857
buf_block_t* right_block;
1858
buf_block_t* insert_block;
1859
page_t* insert_page;
1860
page_cur_t* page_cursor;
1862
byte* buf = 0; /* remove warning */
1864
ibool insert_will_fit;
1866
ulint n_iterations = 0;
1872
heap = mem_heap_create(1024);
1873
n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
1875
mem_heap_empty(heap);
1878
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
1880
#ifdef UNIV_SYNC_DEBUG
1881
ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX));
1882
#endif /* UNIV_SYNC_DEBUG */
1884
block = btr_cur_get_block(cursor);
1885
page = buf_block_get_frame(block);
1886
page_zip = buf_block_get_page_zip(block);
1888
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
1889
ut_ad(page_get_n_recs(page) >= 1);
1891
page_no = buf_block_get_page_no(block);
1893
/* 1. Decide the split record; split_rec == NULL means that the
1894
tuple to be inserted should be the first record on the upper
1897
if (n_iterations > 0) {
1899
hint_page_no = page_no + 1;
1900
split_rec = btr_page_get_sure_split_rec(cursor, tuple, n_ext);
1902
} else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
1904
hint_page_no = page_no + 1;
1906
} else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) {
1907
direction = FSP_DOWN;
1908
hint_page_no = page_no - 1;
1911
hint_page_no = page_no + 1;
1913
if (page_get_n_recs(page) == 1) {
1916
/* There is only one record in the index page
1917
therefore we can't split the node in the middle
1918
by default. We need to determine whether the
1919
new record will be inserted to the left or right. */
1921
/* Read the first (and only) record in the page. */
1922
page_cur_set_before_first(block, &pcur);
1923
page_cur_move_to_next(&pcur);
1924
first_rec = page_cur_get_rec(&pcur);
1926
offsets = rec_get_offsets(
1927
first_rec, cursor->index, offsets,
1930
/* If the new record is less than the existing record
1931
the split in the middle will copy the existing
1932
record to the new node. */
1933
if (cmp_dtuple_rec(tuple, first_rec, offsets) < 0) {
1934
split_rec = page_get_middle_rec(page);
1939
split_rec = page_get_middle_rec(page);
1943
/* 2. Allocate a new page to the index */
1944
new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
1945
btr_page_get_level(page, mtr), mtr);
1946
new_page = buf_block_get_frame(new_block);
1947
new_page_zip = buf_block_get_page_zip(new_block);
1948
btr_page_create(new_block, new_page_zip, cursor->index,
1949
btr_page_get_level(page, mtr), mtr);
1951
/* 3. Calculate the first record on the upper half-page, and the
1952
first record (move_limit) on original page which ends up on the
1956
first_rec = move_limit = split_rec;
1958
offsets = rec_get_offsets(split_rec, cursor->index, offsets,
1961
insert_left = cmp_dtuple_rec(tuple, split_rec, offsets) < 0;
1963
if (UNIV_UNLIKELY(!insert_left && new_page_zip
1964
&& n_iterations > 0)) {
1965
/* If a compressed page has already been split,
1966
avoid further splits by inserting the record
1967
to an empty page. */
1973
insert_left = FALSE;
1974
buf = mem_alloc(rec_get_converted_size(cursor->index,
1977
first_rec = rec_convert_dtuple_to_rec(buf, cursor->index,
1979
move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
1982
/* 4. Do first the modifications in the tree structure */
1984
btr_attach_half_pages(cursor->index, block,
1985
first_rec, new_block, direction, mtr);
1987
/* If the split is made on the leaf level and the insert will fit
1988
on the appropriate half-page, we may release the tree x-latch.
1989
We can then move the records after releasing the tree latch,
1990
thus reducing the tree latch contention. */
1993
insert_will_fit = !new_page_zip
1994
&& btr_page_insert_fits(cursor, split_rec,
1995
offsets, tuple, n_ext, heap);
1998
insert_will_fit = !new_page_zip
1999
&& btr_page_insert_fits(cursor, NULL,
2000
NULL, tuple, n_ext, heap);
2003
if (insert_will_fit && page_is_leaf(page)) {
2005
mtr_memo_release(mtr, dict_index_get_lock(cursor->index),
2009
/* 5. Move then the records to the new page */
2010
if (direction == FSP_DOWN) {
2011
/* fputs("Split left\n", stderr); */
2014
#ifdef UNIV_ZIP_COPY
2016
#endif /* UNIV_ZIP_COPY */
2018
(!page_move_rec_list_start(new_block, block, move_limit,
2019
cursor->index, mtr))) {
2020
/* For some reason, compressing new_page failed,
2021
even though it should contain fewer records than
2022
the original page. Copy the page byte for byte
2023
and then delete the records from both pages
2024
as appropriate. Deleting will always succeed. */
2027
page_zip_copy_recs(new_page_zip, new_page,
2028
page_zip, page, cursor->index, mtr);
2029
page_delete_rec_list_end(move_limit - page + new_page,
2030
new_block, cursor->index,
2032
ULINT_UNDEFINED, mtr);
2034
/* Update the lock table and possible hash index. */
2036
lock_move_rec_list_start(
2037
new_block, block, move_limit,
2038
new_page + PAGE_NEW_INFIMUM);
2040
btr_search_move_or_delete_hash_entries(
2041
new_block, block, cursor->index);
2043
/* Delete the records from the source page. */
2045
page_delete_rec_list_start(move_limit, block,
2046
cursor->index, mtr);
2049
left_block = new_block;
2050
right_block = block;
2052
lock_update_split_left(right_block, left_block);
2054
/* fputs("Split right\n", stderr); */
2057
#ifdef UNIV_ZIP_COPY
2059
#endif /* UNIV_ZIP_COPY */
2061
(!page_move_rec_list_end(new_block, block, move_limit,
2062
cursor->index, mtr))) {
2063
/* For some reason, compressing new_page failed,
2064
even though it should contain fewer records than
2065
the original page. Copy the page byte for byte
2066
and then delete the records from both pages
2067
as appropriate. Deleting will always succeed. */
2070
page_zip_copy_recs(new_page_zip, new_page,
2071
page_zip, page, cursor->index, mtr);
2072
page_delete_rec_list_start(move_limit - page
2073
+ new_page, new_block,
2074
cursor->index, mtr);
2076
/* Update the lock table and possible hash index. */
2078
lock_move_rec_list_end(new_block, block, move_limit);
2080
btr_search_move_or_delete_hash_entries(
2081
new_block, block, cursor->index);
2083
/* Delete the records from the source page. */
2085
page_delete_rec_list_end(move_limit, block,
2088
ULINT_UNDEFINED, mtr);
2092
right_block = new_block;
2094
lock_update_split_right(right_block, left_block);
2097
#ifdef UNIV_ZIP_DEBUG
2098
if (UNIV_LIKELY_NULL(page_zip)) {
2099
ut_a(page_zip_validate(page_zip, page));
2100
ut_a(page_zip_validate(new_page_zip, new_page));
2102
#endif /* UNIV_ZIP_DEBUG */
2104
/* At this point, split_rec, move_limit and first_rec may point
2105
to garbage on the old page. */
2107
/* 6. The split and the tree modification is now completed. Decide the
2108
page where the tuple should be inserted */
2111
insert_block = left_block;
2113
insert_block = right_block;
2116
insert_page = buf_block_get_frame(insert_block);
2118
/* 7. Reposition the cursor for insert and try insertion */
2119
page_cursor = btr_cur_get_page_cur(cursor);
2121
page_cur_search(insert_block, cursor->index, tuple,
2122
PAGE_CUR_LE, page_cursor);
2124
rec = page_cur_tuple_insert(page_cursor, tuple,
2125
cursor->index, n_ext, mtr);
2127
#ifdef UNIV_ZIP_DEBUG
2129
page_zip_des_t* insert_page_zip
2130
= buf_block_get_page_zip(insert_block);
2131
ut_a(!insert_page_zip
2132
|| page_zip_validate(insert_page_zip, insert_page));
2134
#endif /* UNIV_ZIP_DEBUG */
2136
if (UNIV_LIKELY(rec != NULL)) {
2141
/* 8. If insert did not fit, try page reorganization */
2144
(!btr_page_reorganize(insert_block, cursor->index, mtr))) {
2149
page_cur_search(insert_block, cursor->index, tuple,
2150
PAGE_CUR_LE, page_cursor);
2151
rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
2154
if (UNIV_UNLIKELY(rec == NULL)) {
2155
/* The insert did not fit on the page: loop back to the
2156
start of the function for a new split */
2158
/* We play safe and reset the free bits for new_page */
2159
if (!dict_index_is_clust(cursor->index)) {
2160
ibuf_reset_free_bits(new_block);
2163
/* fprintf(stderr, "Split second round %lu\n",
2164
page_get_page_no(page)); */
2166
ut_ad(n_iterations < 2
2167
|| buf_block_get_page_zip(insert_block));
2168
ut_ad(!insert_will_fit);
2174
/* Insert fit on the page: update the free bits for the
2175
left and right pages in the same mtr */
2177
if (!dict_index_is_clust(cursor->index) && page_is_leaf(page)) {
2178
ibuf_update_free_bits_for_two_pages_low(
2179
buf_block_get_zip_size(left_block),
2180
left_block, right_block, mtr);
2184
fprintf(stderr, "Split and insert done %lu %lu\n",
2185
buf_block_get_page_no(left_block),
2186
buf_block_get_page_no(right_block));
2189
ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index));
2190
ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index));
2192
mem_heap_free(heap);
2196
/*************************************************************//**
2197
Removes a page from the level list of pages. */
2200
btr_level_list_remove(
2201
/*==================*/
2202
ulint space, /*!< in: space where removed */
2203
ulint zip_size,/*!< in: compressed page size in bytes
2204
or 0 for uncompressed pages */
2205
page_t* page, /*!< in: page to remove */
2206
mtr_t* mtr) /*!< in: mtr */
2212
ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
2213
ut_ad(space == page_get_space_id(page));
2214
/* Get the previous and next page numbers of page */
2216
prev_page_no = btr_page_get_prev(page, mtr);
2217
next_page_no = btr_page_get_next(page, mtr);
2219
/* Update page links of the level */
2221
if (prev_page_no != FIL_NULL) {
2222
buf_block_t* prev_block
2223
= btr_block_get(space, zip_size, prev_page_no,
2226
= buf_block_get_frame(prev_block);
2227
#ifdef UNIV_BTR_DEBUG
2228
ut_a(page_is_comp(prev_page) == page_is_comp(page));
2229
ut_a(btr_page_get_next(prev_page, mtr)
2230
== page_get_page_no(page));
2231
#endif /* UNIV_BTR_DEBUG */
2233
btr_page_set_next(prev_page,
2234
buf_block_get_page_zip(prev_block),
2238
if (next_page_no != FIL_NULL) {
2239
buf_block_t* next_block
2240
= btr_block_get(space, zip_size, next_page_no,
2243
= buf_block_get_frame(next_block);
2244
#ifdef UNIV_BTR_DEBUG
2245
ut_a(page_is_comp(next_page) == page_is_comp(page));
2246
ut_a(btr_page_get_prev(next_page, mtr)
2247
== page_get_page_no(page));
2248
#endif /* UNIV_BTR_DEBUG */
2250
btr_page_set_prev(next_page,
2251
buf_block_get_page_zip(next_block),
2256
/****************************************************************//**
2257
Writes the redo log record for setting an index record as the predefined
2261
btr_set_min_rec_mark_log(
2262
/*=====================*/
2263
rec_t* rec, /*!< in: record */
2264
byte type, /*!< in: MLOG_COMP_REC_MIN_MARK or MLOG_REC_MIN_MARK */
2265
mtr_t* mtr) /*!< in: mtr */
2267
mlog_write_initial_log_record(rec, type, mtr);
2269
/* Write rec offset as a 2-byte ulint */
2270
mlog_catenate_ulint(mtr, page_offset(rec), MLOG_2BYTES);
2272
#else /* !UNIV_HOTBACKUP */
2273
# define btr_set_min_rec_mark_log(rec,comp,mtr) ((void) 0)
2274
#endif /* !UNIV_HOTBACKUP */
2276
/****************************************************************//**
2277
Parses the redo log record for setting an index record as the predefined
2279
@return end of log record or NULL */
2282
btr_parse_set_min_rec_mark(
2283
/*=======================*/
2284
byte* ptr, /*!< in: buffer */
2285
byte* end_ptr,/*!< in: buffer end */
2286
ulint comp, /*!< in: nonzero=compact page format */
2287
page_t* page, /*!< in: page or NULL */
2288
mtr_t* mtr) /*!< in: mtr or NULL */
2292
if (end_ptr < ptr + 2) {
2298
ut_a(!page_is_comp(page) == !comp);
2300
rec = page + mach_read_from_2(ptr);
2302
btr_set_min_rec_mark(rec, mtr);
2308
/****************************************************************//**
2309
Sets a record as the predefined minimum record. */
2312
btr_set_min_rec_mark(
2313
/*=================*/
2314
rec_t* rec, /*!< in: record */
2315
mtr_t* mtr) /*!< in: mtr */
2319
if (UNIV_LIKELY(page_rec_is_comp(rec))) {
2320
info_bits = rec_get_info_bits(rec, TRUE);
2322
rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG);
2324
btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr);
2326
info_bits = rec_get_info_bits(rec, FALSE);
2328
rec_set_info_bits_old(rec, info_bits | REC_INFO_MIN_REC_FLAG);
2330
btr_set_min_rec_mark_log(rec, MLOG_REC_MIN_MARK, mtr);
2334
#ifndef UNIV_HOTBACKUP
2335
/*************************************************************//**
2336
Deletes on the upper level the node pointer to a page. */
2339
btr_node_ptr_delete(
2340
/*================*/
2341
dict_index_t* index, /*!< in: index tree */
2342
buf_block_t* block, /*!< in: page whose node pointer is deleted */
2343
mtr_t* mtr) /*!< in: mtr */
2349
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2351
/* Delete node pointer on father page */
2352
btr_page_get_father(index, block, mtr, &cursor);
2354
compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor, RB_NONE,
2356
ut_a(err == DB_SUCCESS);
2359
btr_cur_compress_if_useful(&cursor, mtr);
2363
/*************************************************************//**
2364
If page is the only on its level, this function moves its records to the
2365
father page, thus reducing the tree height. */
2370
dict_index_t* index, /*!< in: index tree */
2371
buf_block_t* block, /*!< in: page which is the only on its level;
2372
must not be empty: use
2373
btr_discard_only_page_on_level if the last
2374
record from the page should be removed */
2375
mtr_t* mtr) /*!< in: mtr */
2377
buf_block_t* father_block;
2378
page_t* father_page;
2380
page_zip_des_t* father_page_zip;
2381
page_t* page = buf_block_get_frame(block);
2383
buf_block_t* blocks[BTR_MAX_LEVELS];
2384
ulint n_blocks; /*!< last used index in blocks[] */
2387
ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
2388
ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
2389
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2391
page_level = btr_page_get_level(page, mtr);
2392
root_page_no = dict_index_get_page(index);
2396
mem_heap_t* heap = mem_heap_create(100);
2400
offsets = btr_page_get_father_block(NULL, heap, index,
2401
block, mtr, &cursor);
2402
father_block = btr_cur_get_block(&cursor);
2403
father_page_zip = buf_block_get_page_zip(father_block);
2404
father_page = buf_block_get_frame(father_block);
2408
/* Store all ancestor pages so we can reset their
2409
levels later on. We have to do all the searches on
2410
the tree now because later on, after we've replaced
2411
the first level, the tree is in an inconsistent state
2412
and can not be searched. */
2413
for (b = father_block;
2414
buf_block_get_page_no(b) != root_page_no; ) {
2415
ut_a(n_blocks < BTR_MAX_LEVELS);
2417
offsets = btr_page_get_father_block(offsets, heap,
2421
blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
2424
mem_heap_free(heap);
2427
btr_search_drop_page_hash_index(block);
2429
/* Make the father empty */
2430
btr_page_empty(father_block, father_page_zip, index, page_level, mtr);
2432
/* Copy the records to the father page one by one. */
2434
#ifdef UNIV_ZIP_COPY
2436
#endif /* UNIV_ZIP_COPY */
2438
(!page_copy_rec_list_end(father_block, block,
2439
page_get_infimum_rec(page),
2441
const page_zip_des_t* page_zip
2442
= buf_block_get_page_zip(block);
2443
ut_a(father_page_zip);
2446
/* Copy the page byte for byte. */
2447
page_zip_copy_recs(father_page_zip, father_page,
2448
page_zip, page, index, mtr);
2450
/* Update the lock table and possible hash index. */
2452
lock_move_rec_list_end(father_block, block,
2453
page_get_infimum_rec(page));
2455
btr_search_move_or_delete_hash_entries(father_block, block,
2459
lock_update_copy_and_discard(father_block, block);
2461
/* Go upward to root page, decrementing levels by one. */
2462
for (i = 0; i < n_blocks; i++, page_level++) {
2463
page_t* page = buf_block_get_frame(blocks[i]);
2464
page_zip_des_t* page_zip= buf_block_get_page_zip(blocks[i]);
2466
ut_ad(btr_page_get_level(page, mtr) == page_level + 1);
2468
btr_page_set_level(page, page_zip, page_level, mtr);
2469
#ifdef UNIV_ZIP_DEBUG
2470
ut_a(!page_zip || page_zip_validate(page_zip, page));
2471
#endif /* UNIV_ZIP_DEBUG */
2474
/* Free the file page */
2475
btr_page_free(index, block, mtr);
2477
/* We play it safe and reset the free bits for the father */
2478
if (!dict_index_is_clust(index)) {
2479
ibuf_reset_free_bits(father_block);
2481
ut_ad(page_validate(father_page, index));
2482
ut_ad(btr_check_node_ptr(index, father_block, mtr));
2485
/*************************************************************//**
2486
Tries to merge the page first to the left immediate brother if such a
2487
brother exists, and the node pointers to the current page and to the brother
2488
reside on the same page. If the left brother does not satisfy these
2489
conditions, looks at the right brother. If the page is the only one on that
2490
level lifts the records of the page to the father page, thus reducing the
2491
tree height. It is assumed that mtr holds an x-latch on the tree and on the
2492
page. If cursor is on the leaf level, mtr must also hold x-latches to the
2493
brothers, if they exist.
2494
@return TRUE on success */
2499
btr_cur_t* cursor, /*!< in: cursor on the page to merge or lift;
2500
the page must not be empty: in record delete
2501
use btr_discard_page if the page would become
2503
mtr_t* mtr) /*!< in: mtr */
2505
dict_index_t* index;
2509
ulint right_page_no;
2510
buf_block_t* merge_block;
2512
page_zip_des_t* merge_page_zip;
2516
btr_cur_t father_cursor;
2522
ulint max_ins_size_reorg;
2525
block = btr_cur_get_block(cursor);
2526
page = btr_cur_get_page(cursor);
2527
index = btr_cur_get_index(cursor);
2528
ut_a((ibool) !!page_is_comp(page) == dict_table_is_comp(index->table));
2530
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
2532
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2533
level = btr_page_get_level(page, mtr);
2534
space = dict_index_get_space(index);
2535
zip_size = dict_table_zip_size(index->table);
2537
left_page_no = btr_page_get_prev(page, mtr);
2538
right_page_no = btr_page_get_next(page, mtr);
2541
fprintf(stderr, "Merge left page %lu right %lu \n",
2542
left_page_no, right_page_no);
2545
heap = mem_heap_create(100);
2546
offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
2549
/* Decide the page to which we try to merge and which will inherit
2552
is_left = left_page_no != FIL_NULL;
2556
merge_block = btr_block_get(space, zip_size, left_page_no,
2558
merge_page = buf_block_get_frame(merge_block);
2559
#ifdef UNIV_BTR_DEBUG
2560
ut_a(btr_page_get_next(merge_page, mtr)
2561
== buf_block_get_page_no(block));
2562
#endif /* UNIV_BTR_DEBUG */
2563
} else if (right_page_no != FIL_NULL) {
2565
merge_block = btr_block_get(space, zip_size, right_page_no,
2567
merge_page = buf_block_get_frame(merge_block);
2568
#ifdef UNIV_BTR_DEBUG
2569
ut_a(btr_page_get_prev(merge_page, mtr)
2570
== buf_block_get_page_no(block));
2571
#endif /* UNIV_BTR_DEBUG */
2573
/* The page is the only one on the level, lift the records
2575
btr_lift_page_up(index, block, mtr);
2576
mem_heap_free(heap);
2580
n_recs = page_get_n_recs(page);
2581
data_size = page_get_data_size(page);
2582
#ifdef UNIV_BTR_DEBUG
2583
ut_a(page_is_comp(merge_page) == page_is_comp(page));
2584
#endif /* UNIV_BTR_DEBUG */
2586
max_ins_size_reorg = page_get_max_insert_size_after_reorganize(
2587
merge_page, n_recs);
2588
if (data_size > max_ins_size_reorg) {
2590
/* No space for merge */
2592
/* We play it safe and reset the free bits. */
2594
&& page_is_leaf(merge_page)
2595
&& !dict_index_is_clust(index)) {
2596
ibuf_reset_free_bits(merge_block);
2599
mem_heap_free(heap);
2603
ut_ad(page_validate(merge_page, index));
2605
max_ins_size = page_get_max_insert_size(merge_page, n_recs);
2607
if (UNIV_UNLIKELY(data_size > max_ins_size)) {
2609
/* We have to reorganize merge_page */
2611
if (UNIV_UNLIKELY(!btr_page_reorganize(merge_block,
2617
max_ins_size = page_get_max_insert_size(merge_page, n_recs);
2619
ut_ad(page_validate(merge_page, index));
2620
ut_ad(max_ins_size == max_ins_size_reorg);
2622
if (UNIV_UNLIKELY(data_size > max_ins_size)) {
2624
/* Add fault tolerance, though this should
2631
merge_page_zip = buf_block_get_page_zip(merge_block);
2632
#ifdef UNIV_ZIP_DEBUG
2633
if (UNIV_LIKELY_NULL(merge_page_zip)) {
2634
const page_zip_des_t* page_zip
2635
= buf_block_get_page_zip(block);
2637
ut_a(page_zip_validate(merge_page_zip, merge_page));
2638
ut_a(page_zip_validate(page_zip, page));
2640
#endif /* UNIV_ZIP_DEBUG */
2642
/* Move records to the merge page */
2644
rec_t* orig_pred = page_copy_rec_list_start(
2645
merge_block, block, page_get_supremum_rec(page),
2648
if (UNIV_UNLIKELY(!orig_pred)) {
2652
btr_search_drop_page_hash_index(block);
2654
/* Remove the page from the level list */
2655
btr_level_list_remove(space, zip_size, page, mtr);
2657
btr_node_ptr_delete(index, block, mtr);
2658
lock_update_merge_left(merge_block, orig_pred, block);
2661
#ifdef UNIV_BTR_DEBUG
2662
byte fil_page_prev[4];
2663
#endif /* UNIV_BTR_DEBUG */
2665
if (UNIV_LIKELY_NULL(merge_page_zip)) {
2666
/* The function page_zip_compress(), which will be
2667
invoked by page_copy_rec_list_end() below,
2668
requires that FIL_PAGE_PREV be FIL_NULL.
2669
Clear the field, but prepare to restore it. */
2670
#ifdef UNIV_BTR_DEBUG
2671
memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
2672
#endif /* UNIV_BTR_DEBUG */
2673
#if FIL_NULL != 0xffffffff
2674
# error "FIL_NULL != 0xffffffff"
2676
memset(merge_page + FIL_PAGE_PREV, 0xff, 4);
2679
orig_succ = page_copy_rec_list_end(merge_block, block,
2680
page_get_infimum_rec(page),
2681
cursor->index, mtr);
2683
if (UNIV_UNLIKELY(!orig_succ)) {
2684
ut_a(merge_page_zip);
2685
#ifdef UNIV_BTR_DEBUG
2686
/* FIL_PAGE_PREV was restored from merge_page_zip. */
2687
ut_a(!memcmp(fil_page_prev,
2688
merge_page + FIL_PAGE_PREV, 4));
2689
#endif /* UNIV_BTR_DEBUG */
2693
btr_search_drop_page_hash_index(block);
2695
#ifdef UNIV_BTR_DEBUG
2696
if (UNIV_LIKELY_NULL(merge_page_zip)) {
2697
/* Restore FIL_PAGE_PREV in order to avoid an assertion
2698
failure in btr_level_list_remove(), which will set
2699
the field again to FIL_NULL. Even though this makes
2700
merge_page and merge_page_zip inconsistent for a
2701
split second, it is harmless, because the pages
2703
memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4);
2705
#endif /* UNIV_BTR_DEBUG */
2707
/* Remove the page from the level list */
2708
btr_level_list_remove(space, zip_size, page, mtr);
2710
/* Replace the address of the old child node (= page) with the
2711
address of the merge page to the right */
2713
btr_node_ptr_set_child_page_no(
2714
btr_cur_get_rec(&father_cursor),
2715
btr_cur_get_page_zip(&father_cursor),
2716
offsets, right_page_no, mtr);
2717
btr_node_ptr_delete(index, merge_block, mtr);
2719
lock_update_merge_right(merge_block, orig_succ, block);
2722
mem_heap_free(heap);
2724
if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) {
2725
/* Update the free bits of the B-tree page in the
2726
insert buffer bitmap. This has to be done in a
2727
separate mini-transaction that is committed before the
2728
main mini-transaction. We cannot update the insert
2729
buffer bitmap in this mini-transaction, because
2730
btr_compress() can be invoked recursively without
2731
committing the mini-transaction in between. Since
2732
insert buffer bitmap pages have a lower rank than
2733
B-tree pages, we must not access other pages in the
2734
same mini-transaction after accessing an insert buffer
2737
/* The free bits in the insert buffer bitmap must
2738
never exceed the free space on a page. It is safe to
2739
decrement or reset the bits in the bitmap in a
2740
mini-transaction that is committed before the
2741
mini-transaction that affects the free space. */
2743
/* It is unsafe to increment the bits in a separately
2744
committed mini-transaction, because in crash recovery,
2745
the free bits could momentarily be set too high. */
2748
/* Because the free bits may be incremented
2749
and we cannot update the insert buffer bitmap
2750
in the same mini-transaction, the only safe
2751
thing we can do here is the pessimistic
2752
approach: reset the free bits. */
2753
ibuf_reset_free_bits(merge_block);
2755
/* On uncompressed pages, the free bits will
2756
never increase here. Thus, it is safe to
2757
write the bits accurately in a separate
2758
mini-transaction. */
2759
ibuf_update_free_bits_if_full(merge_block,
2765
ut_ad(page_validate(merge_page, index));
2766
#ifdef UNIV_ZIP_DEBUG
2767
ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page));
2768
#endif /* UNIV_ZIP_DEBUG */
2770
/* Free the file page */
2771
btr_page_free(index, block, mtr);
2773
ut_ad(btr_check_node_ptr(index, merge_block, mtr));
2777
/*************************************************************//**
2778
Discards a page that is the only page on its level. This will empty
2779
the whole B-tree, leaving just an empty root page. This function
2780
should never be reached, because btr_compress(), which is invoked in
2781
delete operations, calls btr_lift_page_up() to flatten the B-tree. */
2784
btr_discard_only_page_on_level(
2785
/*===========================*/
2786
dict_index_t* index, /*!< in: index tree */
2787
buf_block_t* block, /*!< in: page which is the only on its level */
2788
mtr_t* mtr) /*!< in: mtr */
2790
ulint page_level = 0;
2791
trx_id_t max_trx_id;
2793
/* Save the PAGE_MAX_TRX_ID from the leaf page. */
2794
max_trx_id = page_get_max_trx_id(buf_block_get_frame(block));
2796
while (buf_block_get_page_no(block) != dict_index_get_page(index)) {
2798
buf_block_t* father;
2799
const page_t* page = buf_block_get_frame(block);
2801
ut_a(page_get_n_recs(page) == 1);
2802
ut_a(page_level == btr_page_get_level(page, mtr));
2803
ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
2804
ut_a(btr_page_get_next(page, mtr) == FIL_NULL);
2806
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2807
btr_search_drop_page_hash_index(block);
2809
btr_page_get_father(index, block, mtr, &cursor);
2810
father = btr_cur_get_block(&cursor);
2812
lock_update_discard(father, PAGE_HEAP_NO_SUPREMUM, block);
2814
/* Free the file page */
2815
btr_page_free(index, block, mtr);
2821
/* block is the root page, which must be empty, except
2822
for the node pointer to the (now discarded) block(s). */
2824
#ifdef UNIV_BTR_DEBUG
2825
if (!dict_index_is_ibuf(index)) {
2826
const page_t* root = buf_block_get_frame(block);
2827
const ulint space = dict_index_get_space(index);
2828
ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
2830
ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
2833
#endif /* UNIV_BTR_DEBUG */
2835
btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr);
2837
if (!dict_index_is_clust(index)) {
2838
/* We play it safe and reset the free bits for the root */
2839
ibuf_reset_free_bits(block);
2841
if (page_is_leaf(buf_block_get_frame(block))) {
2842
ut_a(!ut_dulint_is_zero(max_trx_id));
2843
page_set_max_trx_id(block,
2844
buf_block_get_page_zip(block),
2850
/*************************************************************//**
2851
Discards a page from a B-tree. This is used to remove the last record from
2852
a B-tree page: the whole page must be removed at the same time. This cannot
2853
be used for the root page, which is allowed to be empty. */
2858
btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
2860
mtr_t* mtr) /*!< in: mtr */
2862
dict_index_t* index;
2866
ulint right_page_no;
2867
buf_block_t* merge_block;
2873
block = btr_cur_get_block(cursor);
2874
index = btr_cur_get_index(cursor);
2876
ut_ad(dict_index_get_page(index) != buf_block_get_page_no(block));
2877
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
2879
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2880
space = dict_index_get_space(index);
2881
zip_size = dict_table_zip_size(index->table);
2883
/* Decide the page which will inherit the locks */
2885
left_page_no = btr_page_get_prev(buf_block_get_frame(block), mtr);
2886
right_page_no = btr_page_get_next(buf_block_get_frame(block), mtr);
2888
if (left_page_no != FIL_NULL) {
2889
merge_block = btr_block_get(space, zip_size, left_page_no,
2891
merge_page = buf_block_get_frame(merge_block);
2892
#ifdef UNIV_BTR_DEBUG
2893
ut_a(btr_page_get_next(merge_page, mtr)
2894
== buf_block_get_page_no(block));
2895
#endif /* UNIV_BTR_DEBUG */
2896
} else if (right_page_no != FIL_NULL) {
2897
merge_block = btr_block_get(space, zip_size, right_page_no,
2899
merge_page = buf_block_get_frame(merge_block);
2900
#ifdef UNIV_BTR_DEBUG
2901
ut_a(btr_page_get_prev(merge_page, mtr)
2902
== buf_block_get_page_no(block));
2903
#endif /* UNIV_BTR_DEBUG */
2905
btr_discard_only_page_on_level(index, block, mtr);
2910
page = buf_block_get_frame(block);
2911
ut_a(page_is_comp(merge_page) == page_is_comp(page));
2912
btr_search_drop_page_hash_index(block);
2914
if (left_page_no == FIL_NULL && !page_is_leaf(page)) {
2916
/* We have to mark the leftmost node pointer on the right
2917
side page as the predefined minimum record */
2918
node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page));
2920
ut_ad(page_rec_is_user_rec(node_ptr));
2922
/* This will make page_zip_validate() fail on merge_page
2923
until btr_level_list_remove() completes. This is harmless,
2924
because everything will take place within a single
2925
mini-transaction and because writing to the redo log
2926
is an atomic operation (performed by mtr_commit()). */
2927
btr_set_min_rec_mark(node_ptr, mtr);
2930
btr_node_ptr_delete(index, block, mtr);
2932
/* Remove the page from the level list */
2933
btr_level_list_remove(space, zip_size, page, mtr);
2934
#ifdef UNIV_ZIP_DEBUG
2936
page_zip_des_t* merge_page_zip
2937
= buf_block_get_page_zip(merge_block);
2938
ut_a(!merge_page_zip
2939
|| page_zip_validate(merge_page_zip, merge_page));
2941
#endif /* UNIV_ZIP_DEBUG */
2943
if (left_page_no != FIL_NULL) {
2944
lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM,
2947
lock_update_discard(merge_block,
2948
lock_get_min_heap_no(merge_block),
2952
/* Free the file page */
2953
btr_page_free(index, block, mtr);
2955
ut_ad(btr_check_node_ptr(index, merge_block, mtr));
2958
#ifdef UNIV_BTR_PRINT
2959
/*************************************************************//**
2960
Prints size info of a B-tree. */
2965
dict_index_t* index) /*!< in: index tree */
2971
if (dict_index_is_ibuf(index)) {
2972
fputs("Sorry, cannot print info of an ibuf tree:"
2973
" use ibuf functions\n", stderr);
2980
root = btr_root_get(index, &mtr);
2982
seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
2984
fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
2985
fseg_print(seg, &mtr);
2987
if (!(index->type & DICT_UNIVERSAL)) {
2989
seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
2991
fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
2992
fseg_print(seg, &mtr);
2998
/************************************************************//**
2999
Prints recursively index tree pages. */
3002
btr_print_recursive(
3003
/*================*/
3004
dict_index_t* index, /*!< in: index tree */
3005
buf_block_t* block, /*!< in: index page */
3006
ulint width, /*!< in: print this many entries from start
3008
mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */
3009
ulint** offsets,/*!< in/out: buffer for rec_get_offsets() */
3010
mtr_t* mtr) /*!< in: mtr */
3012
const page_t* page = buf_block_get_frame(block);
3018
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3019
fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n",
3020
(ulong) btr_page_get_level(page, mtr),
3021
(ulong) buf_block_get_page_no(block));
3023
page_print(block, index, width, width);
3025
n_recs = page_get_n_recs(page);
3027
page_cur_set_before_first(block, &cursor);
3028
page_cur_move_to_next(&cursor);
3030
while (!page_cur_is_after_last(&cursor)) {
3032
if (page_is_leaf(page)) {
3034
/* If this is the leaf level, do nothing */
3036
} else if ((i <= width) || (i >= n_recs - width)) {
3038
const rec_t* node_ptr;
3042
node_ptr = page_cur_get_rec(&cursor);
3044
*offsets = rec_get_offsets(node_ptr, index, *offsets,
3045
ULINT_UNDEFINED, heap);
3046
btr_print_recursive(index,
3047
btr_node_ptr_get_child(node_ptr,
3051
width, heap, offsets, &mtr2);
3055
page_cur_move_to_next(&cursor);
3060
/**************************************************************//**
3061
Prints directories and other info of all nodes in the tree. */
3066
dict_index_t* index, /*!< in: index */
3067
ulint width) /*!< in: print this many entries from start
3072
mem_heap_t* heap = NULL;
3073
ulint offsets_[REC_OFFS_NORMAL_SIZE];
3074
ulint* offsets = offsets_;
3075
rec_offs_init(offsets_);
3077
fputs("--------------------------\n"
3078
"INDEX TREE PRINT\n", stderr);
3082
root = btr_root_block_get(index, &mtr);
3084
btr_print_recursive(index, root, width, &heap, &offsets, &mtr);
3085
if (UNIV_LIKELY_NULL(heap)) {
3086
mem_heap_free(heap);
3091
btr_validate_index(index, NULL);
3093
#endif /* UNIV_BTR_PRINT */
3096
/************************************************************//**
3097
Checks that the node pointer to a page is appropriate.
3103
dict_index_t* index, /*!< in: index tree */
3104
buf_block_t* block, /*!< in: index page */
3105
mtr_t* mtr) /*!< in: mtr */
3111
page_t* page = buf_block_get_frame(block);
3113
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3114
if (dict_index_get_page(index) == buf_block_get_page_no(block)) {
3119
heap = mem_heap_create(256);
3120
offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
3123
if (page_is_leaf(page)) {
3128
tuple = dict_index_build_node_ptr(
3129
index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap,
3130
btr_page_get_level(page, mtr));
3132
ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), offsets));
3134
mem_heap_free(heap);
3138
#endif /* UNIV_DEBUG */
3140
/************************************************************//**
3141
Display identification information for a record. */
3144
btr_index_rec_validate_report(
3145
/*==========================*/
3146
const page_t* page, /*!< in: index page */
3147
const rec_t* rec, /*!< in: index record */
3148
const dict_index_t* index) /*!< in: index */
3150
fputs("InnoDB: Record in ", stderr);
3151
dict_index_name_print(stderr, NULL, index);
3152
fprintf(stderr, ", page %lu, at offset %lu\n",
3153
page_get_page_no(page), (ulint) page_offset(rec));
3156
/************************************************************//**
3157
Checks the size and number of fields in a record based on the definition of
3159
@return TRUE if ok */
3162
btr_index_rec_validate(
3163
/*===================*/
3164
const rec_t* rec, /*!< in: index record */
3165
const dict_index_t* index, /*!< in: index */
3166
ibool dump_on_error) /*!< in: TRUE if the function
3167
should print hex dump of record
3168
and page on error */
3174
mem_heap_t* heap = NULL;
3175
ulint offsets_[REC_OFFS_NORMAL_SIZE];
3176
ulint* offsets = offsets_;
3177
rec_offs_init(offsets_);
3179
page = page_align(rec);
3181
if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
3182
/* The insert buffer index tree can contain records from any
3183
other index: we cannot check the number of fields or
3189
if (UNIV_UNLIKELY((ibool)!!page_is_comp(page)
3190
!= dict_table_is_comp(index->table))) {
3191
btr_index_rec_validate_report(page, rec, index);
3192
fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n",
3193
(ulong) !!page_is_comp(page),
3194
(ulong) dict_table_is_comp(index->table));
3199
n = dict_index_get_n_fields(index);
3201
if (!page_is_comp(page)
3202
&& UNIV_UNLIKELY(rec_get_n_fields_old(rec) != n)) {
3203
btr_index_rec_validate_report(page, rec, index);
3204
fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n",
3205
(ulong) rec_get_n_fields_old(rec), (ulong) n);
3207
if (dump_on_error) {
3208
buf_page_print(page, 0);
3210
fputs("InnoDB: corrupt record ", stderr);
3211
rec_print_old(stderr, rec);
3217
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
3219
for (i = 0; i < n; i++) {
3220
ulint fixed_size = dict_col_get_fixed_size(
3221
dict_index_get_nth_col(index, i), page_is_comp(page));
3223
rec_get_nth_field_offs(offsets, i, &len);
3225
/* Note that if fixed_size != 0, it equals the
3226
length of a fixed-size column in the clustered index.
3227
A prefix index of the column is of fixed, but different
3228
length. When fixed_size == 0, prefix_len is the maximum
3229
length of the prefix index column. */
3231
if ((dict_index_get_nth_field(index, i)->prefix_len == 0
3232
&& len != UNIV_SQL_NULL && fixed_size
3233
&& len != fixed_size)
3234
|| (dict_index_get_nth_field(index, i)->prefix_len > 0
3235
&& len != UNIV_SQL_NULL
3237
> dict_index_get_nth_field(index, i)->prefix_len)) {
3239
btr_index_rec_validate_report(page, rec, index);
3241
"InnoDB: field %lu len is %lu,"
3243
(ulong) i, (ulong) len, (ulong) fixed_size);
3245
if (dump_on_error) {
3246
buf_page_print(page, 0);
3248
fputs("InnoDB: corrupt record ", stderr);
3249
rec_print_new(stderr, rec, offsets);
3252
if (UNIV_LIKELY_NULL(heap)) {
3253
mem_heap_free(heap);
3259
if (UNIV_LIKELY_NULL(heap)) {
3260
mem_heap_free(heap);
3265
/************************************************************//**
3266
Checks the size and number of fields in records based on the definition of
3268
@return TRUE if ok */
3271
btr_index_page_validate(
3272
/*====================*/
3273
buf_block_t* block, /*!< in: index page */
3274
dict_index_t* index) /*!< in: index */
3279
page_cur_set_before_first(block, &cur);
3280
page_cur_move_to_next(&cur);
3283
if (page_cur_is_after_last(&cur)) {
3288
if (!btr_index_rec_validate(cur.rec, index, TRUE)) {
3293
page_cur_move_to_next(&cur);
3299
/************************************************************//**
3300
Report an error on one page of an index tree. */
3303
btr_validate_report1(
3304
/*=================*/
3305
dict_index_t* index, /*!< in: index */
3306
ulint level, /*!< in: B-tree level */
3307
const buf_block_t* block) /*!< in: index page */
3309
fprintf(stderr, "InnoDB: Error in page %lu of ",
3310
buf_block_get_page_no(block));
3311
dict_index_name_print(stderr, NULL, index);
3313
fprintf(stderr, ", index tree level %lu", level);
3318
/************************************************************//**
3319
Report an error on two pages of an index tree. */
3322
btr_validate_report2(
3323
/*=================*/
3324
const dict_index_t* index, /*!< in: index */
3325
ulint level, /*!< in: B-tree level */
3326
const buf_block_t* block1, /*!< in: first index page */
3327
const buf_block_t* block2) /*!< in: second index page */
3329
fprintf(stderr, "InnoDB: Error in pages %lu and %lu of ",
3330
buf_block_get_page_no(block1),
3331
buf_block_get_page_no(block2));
3332
dict_index_name_print(stderr, NULL, index);
3334
fprintf(stderr, ", index tree level %lu", level);
3339
/************************************************************//**
3340
Validates index tree level.
3341
@return TRUE if ok */
3346
dict_index_t* index, /*!< in: index tree */
3347
trx_t* trx, /*!< in: transaction or NULL */
3348
ulint level) /*!< in: level number */
3354
buf_block_t* right_block = 0; /* remove warning */
3355
page_t* right_page = 0; /* remove warning */
3356
page_t* father_page;
3358
btr_cur_t right_node_cur;
3360
ulint right_page_no;
3363
dtuple_t* node_ptr_tuple;
3366
mem_heap_t* heap = mem_heap_create(256);
3367
ulint* offsets = NULL;
3368
ulint* offsets2= NULL;
3369
#ifdef UNIV_ZIP_DEBUG
3370
page_zip_des_t* page_zip;
3371
#endif /* UNIV_ZIP_DEBUG */
3375
mtr_x_lock(dict_index_get_lock(index), &mtr);
3377
block = btr_root_block_get(index, &mtr);
3378
page = buf_block_get_frame(block);
3380
space = dict_index_get_space(index);
3381
zip_size = dict_table_zip_size(index->table);
3383
while (level != btr_page_get_level(page, &mtr)) {
3384
const rec_t* node_ptr;
3386
ut_a(space == buf_block_get_space(block));
3387
ut_a(space == page_get_space_id(page));
3388
#ifdef UNIV_ZIP_DEBUG
3389
page_zip = buf_block_get_page_zip(block);
3390
ut_a(!page_zip || page_zip_validate(page_zip, page));
3391
#endif /* UNIV_ZIP_DEBUG */
3392
ut_a(!page_is_leaf(page));
3394
page_cur_set_before_first(block, &cursor);
3395
page_cur_move_to_next(&cursor);
3397
node_ptr = page_cur_get_rec(&cursor);
3398
offsets = rec_get_offsets(node_ptr, index, offsets,
3399
ULINT_UNDEFINED, &heap);
3400
block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr);
3401
page = buf_block_get_frame(block);
3404
/* Now we are on the desired level. Loop through the pages on that
3407
if (trx_is_interrupted(trx)) {
3409
mem_heap_free(heap);
3412
mem_heap_empty(heap);
3413
offsets = offsets2 = NULL;
3414
mtr_x_lock(dict_index_get_lock(index), &mtr);
3416
#ifdef UNIV_ZIP_DEBUG
3417
page_zip = buf_block_get_page_zip(block);
3418
ut_a(!page_zip || page_zip_validate(page_zip, page));
3419
#endif /* UNIV_ZIP_DEBUG */
3421
/* Check ordering etc. of records */
3423
if (!page_validate(page, index)) {
3424
btr_validate_report1(index, level, block);
3427
} else if (level == 0) {
3428
/* We are on level 0. Check that the records have the right
3429
number of fields, and field lengths are right. */
3431
if (!btr_index_page_validate(block, index)) {
3437
ut_a(btr_page_get_level(page, &mtr) == level);
3439
right_page_no = btr_page_get_next(page, &mtr);
3440
left_page_no = btr_page_get_prev(page, &mtr);
3442
ut_a(page_get_n_recs(page) > 0 || (level == 0
3443
&& page_get_page_no(page)
3444
== dict_index_get_page(index)));
3446
if (right_page_no != FIL_NULL) {
3447
const rec_t* right_rec;
3448
right_block = btr_block_get(space, zip_size, right_page_no,
3450
right_page = buf_block_get_frame(right_block);
3451
if (UNIV_UNLIKELY(btr_page_get_prev(right_page, &mtr)
3452
!= page_get_page_no(page))) {
3453
btr_validate_report2(index, level, block, right_block);
3454
fputs("InnoDB: broken FIL_PAGE_NEXT"
3455
" or FIL_PAGE_PREV links\n", stderr);
3456
buf_page_print(page, 0);
3457
buf_page_print(right_page, 0);
3462
if (UNIV_UNLIKELY(page_is_comp(right_page)
3463
!= page_is_comp(page))) {
3464
btr_validate_report2(index, level, block, right_block);
3465
fputs("InnoDB: 'compact' flag mismatch\n", stderr);
3466
buf_page_print(page, 0);
3467
buf_page_print(right_page, 0);
3471
goto node_ptr_fails;
3474
rec = page_rec_get_prev(page_get_supremum_rec(page));
3475
right_rec = page_rec_get_next(page_get_infimum_rec(
3477
offsets = rec_get_offsets(rec, index,
3478
offsets, ULINT_UNDEFINED, &heap);
3479
offsets2 = rec_get_offsets(right_rec, index,
3480
offsets2, ULINT_UNDEFINED, &heap);
3481
if (UNIV_UNLIKELY(cmp_rec_rec(rec, right_rec,
3485
btr_validate_report2(index, level, block, right_block);
3487
fputs("InnoDB: records in wrong order"
3488
" on adjacent pages\n", stderr);
3490
buf_page_print(page, 0);
3491
buf_page_print(right_page, 0);
3493
fputs("InnoDB: record ", stderr);
3494
rec = page_rec_get_prev(page_get_supremum_rec(page));
3495
rec_print(stderr, rec, index);
3497
fputs("InnoDB: record ", stderr);
3498
rec = page_rec_get_next(
3499
page_get_infimum_rec(right_page));
3500
rec_print(stderr, rec, index);
3507
if (level > 0 && left_page_no == FIL_NULL) {
3508
ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
3509
page_rec_get_next(page_get_infimum_rec(page)),
3510
page_is_comp(page)));
3513
if (buf_block_get_page_no(block) != dict_index_get_page(index)) {
3515
/* Check father node pointers */
3519
offsets = btr_page_get_father_block(offsets, heap, index,
3520
block, &mtr, &node_cur);
3521
father_page = btr_cur_get_page(&node_cur);
3522
node_ptr = btr_cur_get_rec(&node_cur);
3525
index, page_rec_get_prev(page_get_supremum_rec(page)),
3527
offsets = btr_page_get_father_node_ptr(offsets, heap,
3530
if (UNIV_UNLIKELY(node_ptr != btr_cur_get_rec(&node_cur))
3531
|| UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr,
3533
!= buf_block_get_page_no(block))) {
3535
btr_validate_report1(index, level, block);
3537
fputs("InnoDB: node pointer to the page is wrong\n",
3540
buf_page_print(father_page, 0);
3541
buf_page_print(page, 0);
3543
fputs("InnoDB: node ptr ", stderr);
3544
rec_print(stderr, node_ptr, index);
3546
rec = btr_cur_get_rec(&node_cur);
3547
fprintf(stderr, "\n"
3548
"InnoDB: node ptr child page n:o %lu\n",
3549
(ulong) btr_node_ptr_get_child_page_no(
3552
fputs("InnoDB: record on page ", stderr);
3553
rec_print_new(stderr, rec, offsets);
3557
goto node_ptr_fails;
3560
if (!page_is_leaf(page)) {
3561
node_ptr_tuple = dict_index_build_node_ptr(
3563
page_rec_get_next(page_get_infimum_rec(page)),
3564
0, heap, btr_page_get_level(page, &mtr));
3566
if (cmp_dtuple_rec(node_ptr_tuple, node_ptr,
3568
const rec_t* first_rec = page_rec_get_next(
3569
page_get_infimum_rec(page));
3571
btr_validate_report1(index, level, block);
3573
buf_page_print(father_page, 0);
3574
buf_page_print(page, 0);
3576
fputs("InnoDB: Error: node ptrs differ"
3578
"InnoDB: node ptr ", stderr);
3579
rec_print_new(stderr, node_ptr, offsets);
3580
fputs("InnoDB: first rec ", stderr);
3581
rec_print(stderr, first_rec, index);
3585
goto node_ptr_fails;
3589
if (left_page_no == FIL_NULL) {
3590
ut_a(node_ptr == page_rec_get_next(
3591
page_get_infimum_rec(father_page)));
3592
ut_a(btr_page_get_prev(father_page, &mtr) == FIL_NULL);
3595
if (right_page_no == FIL_NULL) {
3596
ut_a(node_ptr == page_rec_get_prev(
3597
page_get_supremum_rec(father_page)));
3598
ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL);
3600
const rec_t* right_node_ptr
3601
= page_rec_get_next(node_ptr);
3603
offsets = btr_page_get_father_block(
3604
offsets, heap, index, right_block,
3605
&mtr, &right_node_cur);
3607
!= page_get_supremum_rec(father_page)) {
3609
if (btr_cur_get_rec(&right_node_cur)
3610
!= right_node_ptr) {
3612
fputs("InnoDB: node pointer to"
3613
" the right page is wrong\n",
3616
btr_validate_report1(index, level,
3619
buf_page_print(father_page, 0);
3620
buf_page_print(page, 0);
3621
buf_page_print(right_page, 0);
3624
page_t* right_father_page
3625
= btr_cur_get_page(&right_node_cur);
3627
if (btr_cur_get_rec(&right_node_cur)
3628
!= page_rec_get_next(
3629
page_get_infimum_rec(
3630
right_father_page))) {
3632
fputs("InnoDB: node pointer 2 to"
3633
" the right page is wrong\n",
3636
btr_validate_report1(index, level,
3639
buf_page_print(father_page, 0);
3640
buf_page_print(right_father_page, 0);
3641
buf_page_print(page, 0);
3642
buf_page_print(right_page, 0);
3645
if (page_get_page_no(right_father_page)
3646
!= btr_page_get_next(father_page, &mtr)) {
3649
fputs("InnoDB: node pointer 3 to"
3650
" the right page is wrong\n",
3653
btr_validate_report1(index, level,
3656
buf_page_print(father_page, 0);
3657
buf_page_print(right_father_page, 0);
3658
buf_page_print(page, 0);
3659
buf_page_print(right_page, 0);
3666
/* Commit the mini-transaction to release the latch on 'page'.
3667
Re-acquire the latch on right_page, which will become 'page'
3668
on the next loop. The page has already been checked. */
3671
if (right_page_no != FIL_NULL) {
3674
block = btr_block_get(space, zip_size, right_page_no,
3676
page = buf_block_get_frame(block);
3681
mem_heap_free(heap);
3685
/**************************************************************//**
3686
Checks the consistency of an index tree.
3687
@return TRUE if ok */
3692
dict_index_t* index, /*!< in: index */
3693
trx_t* trx) /*!< in: transaction or NULL */
3701
mtr_x_lock(dict_index_get_lock(index), &mtr);
3703
root = btr_root_get(index, &mtr);
3704
n = btr_page_get_level(root, &mtr);
3706
for (i = 0; i <= n && !trx_is_interrupted(trx); i++) {
3707
if (!btr_validate_level(index, trx, n - i)) {
3719
#endif /* !UNIV_HOTBACKUP */