~ubuntu-branches/ubuntu/precise/mysql-5.1/precise

« back to all changes in this revision

Viewing changes to storage/innodb_plugin/btr/btr0btr.c

  • Committer: Bazaar Package Importer
  • Author(s): Norbert Tretkowski
  • Date: 2010-03-17 14:56:02 UTC
  • Revision ID: james.westby@ubuntu.com-20100317145602-x7e30l1b2sb5s6w6
Tags: upstream-5.1.45
Import upstream version 5.1.45

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*****************************************************************************
 
2
 
 
3
Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
 
4
 
 
5
This program is free software; you can redistribute it and/or modify it under
 
6
the terms of the GNU General Public License as published by the Free Software
 
7
Foundation; version 2 of the License.
 
8
 
 
9
This program is distributed in the hope that it will be useful, but WITHOUT
 
10
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 
11
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
12
 
 
13
You should have received a copy of the GNU General Public License along with
 
14
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 
15
Place, Suite 330, Boston, MA 02111-1307 USA
 
16
 
 
17
*****************************************************************************/
 
18
 
 
19
/**************************************************//**
 
20
@file btr/btr0btr.c
 
21
The B-tree
 
22
 
 
23
Created 6/2/1994 Heikki Tuuri
 
24
*******************************************************/
 
25
 
 
26
#include "btr0btr.h"
 
27
 
 
28
#ifdef UNIV_NONINL
 
29
#include "btr0btr.ic"
 
30
#endif
 
31
 
 
32
#include "fsp0fsp.h"
 
33
#include "page0page.h"
 
34
#include "page0zip.h"
 
35
 
 
36
#ifndef UNIV_HOTBACKUP
 
37
#include "btr0cur.h"
 
38
#include "btr0sea.h"
 
39
#include "btr0pcur.h"
 
40
#include "rem0cmp.h"
 
41
#include "lock0lock.h"
 
42
#include "ibuf0ibuf.h"
 
43
#include "trx0trx.h"
 
44
 
 
45
/*
 
46
Latching strategy of the InnoDB B-tree
 
47
--------------------------------------
 
48
A tree latch protects all non-leaf nodes of the tree. Each node of a tree
 
49
also has a latch of its own.
 
50
 
 
51
A B-tree operation normally first acquires an S-latch on the tree. It
 
52
searches down the tree and releases the tree latch when it has the
 
53
leaf node latch. To save CPU time we do not acquire any latch on
 
54
non-leaf nodes of the tree during a search, those pages are only bufferfixed.
 
55
 
 
56
If an operation needs to restructure the tree, it acquires an X-latch on
 
57
the tree before searching to a leaf node. If it needs, for example, to
 
58
split a leaf,
 
59
(1) InnoDB decides the split point in the leaf,
 
60
(2) allocates a new page,
 
61
(3) inserts the appropriate node pointer to the first non-leaf level,
 
62
(4) releases the tree X-latch,
 
63
(5) and then moves records from the leaf to the new allocated page.
 
64
 
 
65
Node pointers
 
66
-------------
 
67
Leaf pages of a B-tree contain the index records stored in the
 
68
tree. On levels n > 0 we store 'node pointers' to pages on level
 
69
n - 1. For each page there is exactly one node pointer stored:
 
70
thus the our tree is an ordinary B-tree, not a B-link tree.
 
71
 
 
72
A node pointer contains a prefix P of an index record. The prefix
 
73
is long enough so that it determines an index record uniquely.
 
74
The file page number of the child page is added as the last
 
75
field. To the child page we can store node pointers or index records
 
76
which are >= P in the alphabetical order, but < P1 if there is
 
77
a next node pointer on the level, and P1 is its prefix.
 
78
 
 
79
If a node pointer with a prefix P points to a non-leaf child,
 
80
then the leftmost record in the child must have the same
 
81
prefix P. If it points to a leaf node, the child is not required
 
82
to contain any record with a prefix equal to P. The leaf case
 
83
is decided this way to allow arbitrary deletions in a leaf node
 
84
without touching upper levels of the tree.
 
85
 
 
86
We have predefined a special minimum record which we
 
87
define as the smallest record in any alphabetical order.
 
88
A minimum record is denoted by setting a bit in the record
 
89
header. A minimum record acts as the prefix of a node pointer
 
90
which points to a leftmost node on any level of the tree.
 
91
 
 
92
File page allocation
 
93
--------------------
 
94
In the root node of a B-tree there are two file segment headers.
 
95
The leaf pages of a tree are allocated from one file segment, to
 
96
make them consecutive on disk if possible. From the other file segment
 
97
we allocate pages for the non-leaf levels of the tree.
 
98
*/
 
99
 
 
100
#ifdef UNIV_BTR_DEBUG
/**************************************************************//**
Validates one of the file segment headers stored in a B-tree root page:
the header must refer to this tablespace, and its byte offset must lie
within the usable payload area of a page.
@return TRUE if valid */
static
ibool
btr_root_fseg_validate(
/*===================*/
        const fseg_header_t*    seg_header,     /*!< in: segment header */
        ulint                   space)          /*!< in: tablespace identifier */
{
        ulint   byte_offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);

        /* The header must reference this tablespace... */
        ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space);
        /* ...and its offset must fall inside the page payload area. */
        ut_a(byte_offset >= FIL_PAGE_DATA);
        ut_a(byte_offset <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END);
        return(TRUE);
}
#endif /* UNIV_BTR_DEBUG */
 
119
 
 
120
/**************************************************************//**
 
121
Gets the root node of a tree and x-latches it.
 
122
@return root page, x-latched */
 
123
static
 
124
buf_block_t*
 
125
btr_root_block_get(
 
126
/*===============*/
 
127
        dict_index_t*   index,  /*!< in: index tree */
 
128
        mtr_t*          mtr)    /*!< in: mtr */
 
129
{
 
130
        ulint           space;
 
131
        ulint           zip_size;
 
132
        ulint           root_page_no;
 
133
        buf_block_t*    block;
 
134
 
 
135
        space = dict_index_get_space(index);
 
136
        zip_size = dict_table_zip_size(index->table);
 
137
        root_page_no = dict_index_get_page(index);
 
138
 
 
139
        block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);
 
140
        ut_a((ibool)!!page_is_comp(buf_block_get_frame(block))
 
141
             == dict_table_is_comp(index->table));
 
142
#ifdef UNIV_BTR_DEBUG
 
143
        if (!dict_index_is_ibuf(index)) {
 
144
                const page_t*   root = buf_block_get_frame(block);
 
145
 
 
146
                ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
 
147
                                            + root, space));
 
148
                ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
 
149
                                            + root, space));
 
150
        }
 
151
#endif /* UNIV_BTR_DEBUG */
 
152
 
 
153
        return(block);
 
154
}
 
155
 
 
156
/**************************************************************//**
 
157
Gets the root node of a tree and x-latches it.
 
158
@return root page, x-latched */
 
159
UNIV_INTERN
 
160
page_t*
 
161
btr_root_get(
 
162
/*=========*/
 
163
        dict_index_t*   index,  /*!< in: index tree */
 
164
        mtr_t*          mtr)    /*!< in: mtr */
 
165
{
 
166
        return(buf_block_get_frame(btr_root_block_get(index, mtr)));
 
167
}
 
168
 
 
169
/*************************************************************//**
 
170
Gets pointer to the previous user record in the tree. It is assumed that
 
171
the caller has appropriate latches on the page and its neighbor.
 
172
@return previous user record, NULL if there is none */
 
173
UNIV_INTERN
 
174
rec_t*
 
175
btr_get_prev_user_rec(
 
176
/*==================*/
 
177
        rec_t*  rec,    /*!< in: record on leaf level */
 
178
        mtr_t*  mtr)    /*!< in: mtr holding a latch on the page, and if
 
179
                        needed, also to the previous page */
 
180
{
 
181
        page_t* page;
 
182
        page_t* prev_page;
 
183
        ulint   prev_page_no;
 
184
 
 
185
        if (!page_rec_is_infimum(rec)) {
 
186
 
 
187
                rec_t*  prev_rec = page_rec_get_prev(rec);
 
188
 
 
189
                if (!page_rec_is_infimum(prev_rec)) {
 
190
 
 
191
                        return(prev_rec);
 
192
                }
 
193
        }
 
194
 
 
195
        page = page_align(rec);
 
196
        prev_page_no = btr_page_get_prev(page, mtr);
 
197
 
 
198
        if (prev_page_no != FIL_NULL) {
 
199
 
 
200
                ulint           space;
 
201
                ulint           zip_size;
 
202
                buf_block_t*    prev_block;
 
203
 
 
204
                space = page_get_space_id(page);
 
205
                zip_size = fil_space_get_zip_size(space);
 
206
 
 
207
                prev_block = buf_page_get_with_no_latch(space, zip_size,
 
208
                                                        prev_page_no, mtr);
 
209
                prev_page = buf_block_get_frame(prev_block);
 
210
                /* The caller must already have a latch to the brother */
 
211
                ut_ad(mtr_memo_contains(mtr, prev_block,
 
212
                                        MTR_MEMO_PAGE_S_FIX)
 
213
                      || mtr_memo_contains(mtr, prev_block,
 
214
                                           MTR_MEMO_PAGE_X_FIX));
 
215
#ifdef UNIV_BTR_DEBUG
 
216
                ut_a(page_is_comp(prev_page) == page_is_comp(page));
 
217
                ut_a(btr_page_get_next(prev_page, mtr)
 
218
                     == page_get_page_no(page));
 
219
#endif /* UNIV_BTR_DEBUG */
 
220
 
 
221
                return(page_rec_get_prev(page_get_supremum_rec(prev_page)));
 
222
        }
 
223
 
 
224
        return(NULL);
 
225
}
 
226
 
 
227
/*************************************************************//**
 
228
Gets pointer to the next user record in the tree. It is assumed that the
 
229
caller has appropriate latches on the page and its neighbor.
 
230
@return next user record, NULL if there is none */
 
231
UNIV_INTERN
 
232
rec_t*
 
233
btr_get_next_user_rec(
 
234
/*==================*/
 
235
        rec_t*  rec,    /*!< in: record on leaf level */
 
236
        mtr_t*  mtr)    /*!< in: mtr holding a latch on the page, and if
 
237
                        needed, also to the next page */
 
238
{
 
239
        page_t* page;
 
240
        page_t* next_page;
 
241
        ulint   next_page_no;
 
242
 
 
243
        if (!page_rec_is_supremum(rec)) {
 
244
 
 
245
                rec_t*  next_rec = page_rec_get_next(rec);
 
246
 
 
247
                if (!page_rec_is_supremum(next_rec)) {
 
248
 
 
249
                        return(next_rec);
 
250
                }
 
251
        }
 
252
 
 
253
        page = page_align(rec);
 
254
        next_page_no = btr_page_get_next(page, mtr);
 
255
 
 
256
        if (next_page_no != FIL_NULL) {
 
257
                ulint           space;
 
258
                ulint           zip_size;
 
259
                buf_block_t*    next_block;
 
260
 
 
261
                space = page_get_space_id(page);
 
262
                zip_size = fil_space_get_zip_size(space);
 
263
 
 
264
                next_block = buf_page_get_with_no_latch(space, zip_size,
 
265
                                                        next_page_no, mtr);
 
266
                next_page = buf_block_get_frame(next_block);
 
267
                /* The caller must already have a latch to the brother */
 
268
                ut_ad(mtr_memo_contains(mtr, next_block, MTR_MEMO_PAGE_S_FIX)
 
269
                      || mtr_memo_contains(mtr, next_block,
 
270
                                           MTR_MEMO_PAGE_X_FIX));
 
271
#ifdef UNIV_BTR_DEBUG
 
272
                ut_a(page_is_comp(next_page) == page_is_comp(page));
 
273
                ut_a(btr_page_get_prev(next_page, mtr)
 
274
                     == page_get_page_no(page));
 
275
#endif /* UNIV_BTR_DEBUG */
 
276
 
 
277
                return(page_rec_get_next(page_get_infimum_rec(next_page)));
 
278
        }
 
279
 
 
280
        return(NULL);
 
281
}
 
282
 
 
283
/**************************************************************//**
 
284
Creates a new index page (not the root, and also not
 
285
used in page reorganization).  @see btr_page_empty(). */
 
286
static
 
287
void
 
288
btr_page_create(
 
289
/*============*/
 
290
        buf_block_t*    block,  /*!< in/out: page to be created */
 
291
        page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
 
292
        dict_index_t*   index,  /*!< in: index */
 
293
        ulint           level,  /*!< in: the B-tree level of the page */
 
294
        mtr_t*          mtr)    /*!< in: mtr */
 
295
{
 
296
        page_t*         page = buf_block_get_frame(block);
 
297
 
 
298
        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 
299
 
 
300
        if (UNIV_LIKELY_NULL(page_zip)) {
 
301
                page_create_zip(block, index, level, mtr);
 
302
        } else {
 
303
                page_create(block, mtr, dict_table_is_comp(index->table));
 
304
                /* Set the level of the new index page */
 
305
                btr_page_set_level(page, NULL, level, mtr);
 
306
        }
 
307
 
 
308
        block->check_index_page_at_flush = TRUE;
 
309
 
 
310
        btr_page_set_index_id(page, page_zip, index->id, mtr);
 
311
}
 
312
 
 
313
/**************************************************************//**
 
314
Allocates a new file page to be used in an ibuf tree. Takes the page from
 
315
the free list of the tree, which must contain pages!
 
316
@return new allocated block, x-latched */
 
317
static
 
318
buf_block_t*
 
319
btr_page_alloc_for_ibuf(
 
320
/*====================*/
 
321
        dict_index_t*   index,  /*!< in: index tree */
 
322
        mtr_t*          mtr)    /*!< in: mtr */
 
323
{
 
324
        fil_addr_t      node_addr;
 
325
        page_t*         root;
 
326
        page_t*         new_page;
 
327
        buf_block_t*    new_block;
 
328
 
 
329
        root = btr_root_get(index, mtr);
 
330
 
 
331
        node_addr = flst_get_first(root + PAGE_HEADER
 
332
                                   + PAGE_BTR_IBUF_FREE_LIST, mtr);
 
333
        ut_a(node_addr.page != FIL_NULL);
 
334
 
 
335
        new_block = buf_page_get(dict_index_get_space(index),
 
336
                                 dict_table_zip_size(index->table),
 
337
                                 node_addr.page, RW_X_LATCH, mtr);
 
338
        new_page = buf_block_get_frame(new_block);
 
339
        buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
 
340
 
 
341
        flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
 
342
                    new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
 
343
                    mtr);
 
344
        ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
 
345
                            mtr));
 
346
 
 
347
        return(new_block);
 
348
}
 
349
 
 
350
/**************************************************************//**
 
351
Allocates a new file page to be used in an index tree. NOTE: we assume
 
352
that the caller has made the reservation for free extents!
 
353
@return new allocated block, x-latched; NULL if out of space */
 
354
UNIV_INTERN
 
355
buf_block_t*
 
356
btr_page_alloc(
 
357
/*===========*/
 
358
        dict_index_t*   index,          /*!< in: index */
 
359
        ulint           hint_page_no,   /*!< in: hint of a good page */
 
360
        byte            file_direction, /*!< in: direction where a possible
 
361
                                        page split is made */
 
362
        ulint           level,          /*!< in: level where the page is placed
 
363
                                        in the tree */
 
364
        mtr_t*          mtr)            /*!< in: mtr */
 
365
{
 
366
        fseg_header_t*  seg_header;
 
367
        page_t*         root;
 
368
        buf_block_t*    new_block;
 
369
        ulint           new_page_no;
 
370
 
 
371
        if (dict_index_is_ibuf(index)) {
 
372
 
 
373
                return(btr_page_alloc_for_ibuf(index, mtr));
 
374
        }
 
375
 
 
376
        root = btr_root_get(index, mtr);
 
377
 
 
378
        if (level == 0) {
 
379
                seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
 
380
        } else {
 
381
                seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
 
382
        }
 
383
 
 
384
        /* Parameter TRUE below states that the caller has made the
 
385
        reservation for free extents, and thus we know that a page can
 
386
        be allocated: */
 
387
 
 
388
        new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no,
 
389
                                                   file_direction, TRUE, mtr);
 
390
        if (new_page_no == FIL_NULL) {
 
391
 
 
392
                return(NULL);
 
393
        }
 
394
 
 
395
        new_block = buf_page_get(dict_index_get_space(index),
 
396
                                 dict_table_zip_size(index->table),
 
397
                                 new_page_no, RW_X_LATCH, mtr);
 
398
        buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
 
399
 
 
400
        return(new_block);
 
401
}
 
402
 
 
403
/**************************************************************//**
 
404
Gets the number of pages in a B-tree.
 
405
@return number of pages */
 
406
UNIV_INTERN
 
407
ulint
 
408
btr_get_size(
 
409
/*=========*/
 
410
        dict_index_t*   index,  /*!< in: index */
 
411
        ulint           flag)   /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
 
412
{
 
413
        fseg_header_t*  seg_header;
 
414
        page_t*         root;
 
415
        ulint           n;
 
416
        ulint           dummy;
 
417
        mtr_t           mtr;
 
418
 
 
419
        mtr_start(&mtr);
 
420
 
 
421
        mtr_s_lock(dict_index_get_lock(index), &mtr);
 
422
 
 
423
        root = btr_root_get(index, &mtr);
 
424
 
 
425
        if (flag == BTR_N_LEAF_PAGES) {
 
426
                seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
 
427
 
 
428
                fseg_n_reserved_pages(seg_header, &n, &mtr);
 
429
 
 
430
        } else if (flag == BTR_TOTAL_SIZE) {
 
431
                seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
 
432
 
 
433
                n = fseg_n_reserved_pages(seg_header, &dummy, &mtr);
 
434
 
 
435
                seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
 
436
 
 
437
                n += fseg_n_reserved_pages(seg_header, &dummy, &mtr);
 
438
        } else {
 
439
                ut_error;
 
440
        }
 
441
 
 
442
        mtr_commit(&mtr);
 
443
 
 
444
        return(n);
 
445
}
 
446
 
 
447
/**************************************************************//**
 
448
Frees a page used in an ibuf tree. Puts the page to the free list of the
 
449
ibuf tree. */
 
450
static
 
451
void
 
452
btr_page_free_for_ibuf(
 
453
/*===================*/
 
454
        dict_index_t*   index,  /*!< in: index tree */
 
455
        buf_block_t*    block,  /*!< in: block to be freed, x-latched */
 
456
        mtr_t*          mtr)    /*!< in: mtr */
 
457
{
 
458
        page_t*         root;
 
459
 
 
460
        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 
461
        root = btr_root_get(index, mtr);
 
462
 
 
463
        flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
 
464
                       buf_block_get_frame(block)
 
465
                       + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
 
466
 
 
467
        ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
 
468
                            mtr));
 
469
}
 
470
 
 
471
/**************************************************************//**
 
472
Frees a file page used in an index tree. Can be used also to (BLOB)
 
473
external storage pages, because the page level 0 can be given as an
 
474
argument. */
 
475
UNIV_INTERN
 
476
void
 
477
btr_page_free_low(
 
478
/*==============*/
 
479
        dict_index_t*   index,  /*!< in: index tree */
 
480
        buf_block_t*    block,  /*!< in: block to be freed, x-latched */
 
481
        ulint           level,  /*!< in: page level */
 
482
        mtr_t*          mtr)    /*!< in: mtr */
 
483
{
 
484
        fseg_header_t*  seg_header;
 
485
        page_t*         root;
 
486
 
 
487
        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 
488
        /* The page gets invalid for optimistic searches: increment the frame
 
489
        modify clock */
 
490
 
 
491
        buf_block_modify_clock_inc(block);
 
492
 
 
493
        if (dict_index_is_ibuf(index)) {
 
494
 
 
495
                btr_page_free_for_ibuf(index, block, mtr);
 
496
 
 
497
                return;
 
498
        }
 
499
 
 
500
        root = btr_root_get(index, mtr);
 
501
 
 
502
        if (level == 0) {
 
503
                seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
 
504
        } else {
 
505
                seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
 
506
        }
 
507
 
 
508
        fseg_free_page(seg_header,
 
509
                       buf_block_get_space(block),
 
510
                       buf_block_get_page_no(block), mtr);
 
511
}
 
512
 
 
513
/**************************************************************//**
 
514
Frees a file page used in an index tree. NOTE: cannot free field external
 
515
storage pages because the page must contain info on its level. */
 
516
UNIV_INTERN
 
517
void
 
518
btr_page_free(
 
519
/*==========*/
 
520
        dict_index_t*   index,  /*!< in: index tree */
 
521
        buf_block_t*    block,  /*!< in: block to be freed, x-latched */
 
522
        mtr_t*          mtr)    /*!< in: mtr */
 
523
{
 
524
        ulint           level;
 
525
 
 
526
        level = btr_page_get_level(buf_block_get_frame(block), mtr);
 
527
 
 
528
        btr_page_free_low(index, block, level, mtr);
 
529
}
 
530
 
 
531
/**************************************************************//**
 
532
Sets the child node file address in a node pointer. */
 
533
UNIV_INLINE
 
534
void
 
535
btr_node_ptr_set_child_page_no(
 
536
/*===========================*/
 
537
        rec_t*          rec,    /*!< in: node pointer record */
 
538
        page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
 
539
                                part will be updated, or NULL */
 
540
        const ulint*    offsets,/*!< in: array returned by rec_get_offsets() */
 
541
        ulint           page_no,/*!< in: child node address */
 
542
        mtr_t*          mtr)    /*!< in: mtr */
 
543
{
 
544
        byte*   field;
 
545
        ulint   len;
 
546
 
 
547
        ut_ad(rec_offs_validate(rec, NULL, offsets));
 
548
        ut_ad(!page_is_leaf(page_align(rec)));
 
549
        ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
 
550
 
 
551
        /* The child address is in the last field */
 
552
        field = rec_get_nth_field(rec, offsets,
 
553
                                  rec_offs_n_fields(offsets) - 1, &len);
 
554
 
 
555
        ut_ad(len == REC_NODE_PTR_SIZE);
 
556
 
 
557
        if (UNIV_LIKELY_NULL(page_zip)) {
 
558
                page_zip_write_node_ptr(page_zip, rec,
 
559
                                        rec_offs_data_size(offsets),
 
560
                                        page_no, mtr);
 
561
        } else {
 
562
                mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr);
 
563
        }
 
564
}
 
565
 
 
566
/************************************************************//**
 
567
Returns the child page of a node pointer and x-latches it.
 
568
@return child page, x-latched */
 
569
static
 
570
buf_block_t*
 
571
btr_node_ptr_get_child(
 
572
/*===================*/
 
573
        const rec_t*    node_ptr,/*!< in: node pointer */
 
574
        dict_index_t*   index,  /*!< in: index */
 
575
        const ulint*    offsets,/*!< in: array returned by rec_get_offsets() */
 
576
        mtr_t*          mtr)    /*!< in: mtr */
 
577
{
 
578
        ulint   page_no;
 
579
        ulint   space;
 
580
 
 
581
        ut_ad(rec_offs_validate(node_ptr, index, offsets));
 
582
        space = page_get_space_id(page_align(node_ptr));
 
583
        page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
 
584
 
 
585
        return(btr_block_get(space, dict_table_zip_size(index->table),
 
586
                             page_no, RW_X_LATCH, mtr));
 
587
}
 
588
 
 
589
/************************************************************//**
Returns the upper level node pointer to a page. It is assumed that mtr holds
an x-latch on the tree.
@return rec_get_offsets() of the node pointer record */
static
ulint*
btr_page_get_father_node_ptr(
/*=========================*/
        ulint*          offsets,/*!< in: work area for the return value */
        mem_heap_t*     heap,   /*!< in: memory heap to use */
        btr_cur_t*      cursor, /*!< in: cursor pointing to user record,
                                out: cursor on node pointer record,
                                its page x-latched */
        mtr_t*          mtr)    /*!< in: mtr */
{
        dtuple_t*       tuple;
        rec_t*          user_rec;
        rec_t*          node_ptr;
        ulint           level;
        ulint           page_no;
        dict_index_t*   index;

        page_no = buf_block_get_page_no(btr_cur_get_block(cursor));
        index = btr_cur_get_index(cursor);

        /* The caller must hold an x-lock on the index tree. */
        ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
                                MTR_MEMO_X_LOCK));

        /* The root page has no father. */
        ut_ad(dict_index_get_page(index) != page_no);

        /* Build a node pointer tuple from the user record under the
        cursor and search for it one level up; the PAGE_CUR_LE search
        repositions the cursor onto the father node pointer record. */
        level = btr_page_get_level(btr_cur_get_page(cursor), mtr);
        user_rec = btr_cur_get_rec(cursor);
        ut_a(page_rec_is_user_rec(user_rec));
        tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level);

        btr_cur_search_to_nth_level(index, level + 1, tuple, PAGE_CUR_LE,
                                    BTR_CONT_MODIFY_TREE, cursor, 0, mtr);

        node_ptr = btr_cur_get_rec(cursor);
        ut_ad(!page_rec_is_comp(node_ptr)
              || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR);
        offsets = rec_get_offsets(node_ptr, index, offsets,
                                  ULINT_UNDEFINED, &heap);

        if (UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr, offsets)
                          != page_no)) {
                /* The node pointer found does not point back to the
                child page: the tree is corrupted. Dump both pages and
                the records involved, print repair advice, and crash. */
                rec_t*  print_rec;
                fputs("InnoDB: Dump of the child page:\n", stderr);
                buf_page_print(page_align(user_rec), 0);
                fputs("InnoDB: Dump of the parent page:\n", stderr);
                buf_page_print(page_align(node_ptr), 0);

                fputs("InnoDB: Corruption of an index tree: table ", stderr);
                ut_print_name(stderr, NULL, TRUE, index->table_name);
                fputs(", index ", stderr);
                ut_print_name(stderr, NULL, FALSE, index->name);
                fprintf(stderr, ",\n"
                        "InnoDB: father ptr page no %lu, child page no %lu\n",
                        (ulong)
                        btr_node_ptr_get_child_page_no(node_ptr, offsets),
                        (ulong) page_no);
                /* Print the first user record of the child page and the
                node pointer record found on the upper level. */
                print_rec = page_rec_get_next(
                        page_get_infimum_rec(page_align(user_rec)));
                offsets = rec_get_offsets(print_rec, index,
                                          offsets, ULINT_UNDEFINED, &heap);
                page_rec_print(print_rec, offsets);
                offsets = rec_get_offsets(node_ptr, index, offsets,
                                          ULINT_UNDEFINED, &heap);
                page_rec_print(node_ptr, offsets);

                fputs("InnoDB: You should dump + drop + reimport the table"
                      " to fix the\n"
                      "InnoDB: corruption. If the crash happens at "
                      "the database startup, see\n"
                      "InnoDB: " REFMAN "forcing-recovery.html about\n"
                      "InnoDB: forcing recovery. "
                      "Then dump + drop + reimport.\n", stderr);

                ut_error;
        }

        return(offsets);
}
 
672
 
 
673
/************************************************************//**
 
674
Returns the upper level node pointer to a page. It is assumed that mtr holds
 
675
an x-latch on the tree.
 
676
@return rec_get_offsets() of the node pointer record */
 
677
static
 
678
ulint*
 
679
btr_page_get_father_block(
 
680
/*======================*/
 
681
        ulint*          offsets,/*!< in: work area for the return value */
 
682
        mem_heap_t*     heap,   /*!< in: memory heap to use */
 
683
        dict_index_t*   index,  /*!< in: b-tree index */
 
684
        buf_block_t*    block,  /*!< in: child page in the index */
 
685
        mtr_t*          mtr,    /*!< in: mtr */
 
686
        btr_cur_t*      cursor) /*!< out: cursor on node pointer record,
 
687
                                its page x-latched */
 
688
{
 
689
        rec_t*  rec
 
690
                = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
 
691
                                                                 block)));
 
692
        btr_cur_position(index, rec, block, cursor);
 
693
        return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr));
 
694
}
 
695
 
 
696
/************************************************************//**
 
697
Seeks to the upper level node pointer to a page.
 
698
It is assumed that mtr holds an x-latch on the tree. */
 
699
static
 
700
void
 
701
btr_page_get_father(
 
702
/*================*/
 
703
        dict_index_t*   index,  /*!< in: b-tree index */
 
704
        buf_block_t*    block,  /*!< in: child page in the index */
 
705
        mtr_t*          mtr,    /*!< in: mtr */
 
706
        btr_cur_t*      cursor) /*!< out: cursor on node pointer record,
 
707
                                its page x-latched */
 
708
{
 
709
        mem_heap_t*     heap;
 
710
        rec_t*          rec
 
711
                = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
 
712
                                                                 block)));
 
713
        btr_cur_position(index, rec, block, cursor);
 
714
 
 
715
        heap = mem_heap_create(100);
 
716
        btr_page_get_father_node_ptr(NULL, heap, cursor, mtr);
 
717
        mem_heap_free(heap);
 
718
}
 
719
 
 
720
/************************************************************//**
Creates the root node for a new index tree.
@return page number of the created root, FIL_NULL if did not succeed */
UNIV_INTERN
ulint
btr_create(
/*=======*/
	ulint		type,	/*!< in: type of the index */
	ulint		space,	/*!< in: space where created */
	ulint		zip_size,/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	dulint		index_id,/*!< in: index id */
	dict_index_t*	index,	/*!< in: index */
	mtr_t*		mtr)	/*!< in: mini-transaction handle */
{
	ulint		page_no;
	buf_block_t*	block;
	buf_frame_t*	frame;
	page_t*		page;
	page_zip_des_t*	page_zip;

	/* Create the two new segments (one, in the case of an ibuf tree) for
	the index tree; the segment headers are put on the allocated root page
	(for an ibuf tree, not in the root, but on a separate ibuf header
	page) */

	if (type & DICT_IBUF) {
		/* Allocate first the ibuf header page */
		buf_block_t*	ibuf_hdr_block = fseg_create(
			space, 0,
			IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);

		buf_block_dbg_add_level(ibuf_hdr_block, SYNC_TREE_NODE_NEW);

		/* The ibuf header page must land on its predefined
		page number within the tablespace. */
		ut_ad(buf_block_get_page_no(ibuf_hdr_block)
		      == IBUF_HEADER_PAGE_NO);
		/* Allocate then the next page to the segment: it will be the
		tree root page */

		page_no = fseg_alloc_free_page(buf_block_get_frame(
						       ibuf_hdr_block)
					       + IBUF_HEADER
					       + IBUF_TREE_SEG_HEADER,
					       IBUF_TREE_ROOT_PAGE_NO,
					       FSP_UP, mtr);
		ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);

		block = buf_page_get(space, zip_size, page_no,
				     RW_X_LATCH, mtr);
	} else {
		/* Normal index: the non-leaf (top) segment header
		lives in the root page itself. */
		block = fseg_create(space, 0,
				    PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
	}

	if (block == NULL) {
		/* Out of tablespace: signal failure to the caller. */
		return(FIL_NULL);
	}

	page_no = buf_block_get_page_no(block);
	frame = buf_block_get_frame(block);

	buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);

	if (type & DICT_IBUF) {
		/* It is an insert buffer tree: initialize the free list */

		ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);

		flst_init(frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
	} else {
		/* It is a non-ibuf tree: create a file segment for leaf
		pages */
		if (!fseg_create(space, page_no,
				 PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr)) {
			/* Not enough space for new segment, free root
			segment before return. */
			btr_free_root(space, zip_size, page_no, mtr);

			return(FIL_NULL);
		}

		/* The fseg create acquires a second latch on the page,
		therefore we must declare it: */
		buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
	}

	/* Create a new index page on the allocated segment page */
	page_zip = buf_block_get_page_zip(block);

	if (UNIV_LIKELY_NULL(page_zip)) {
		/* Compressed page: page_create_zip() takes the level
		(0 = leaf) directly. */
		page = page_create_zip(block, index, 0, mtr);
	} else {
		page = page_create(block, mtr,
				   dict_table_is_comp(index->table));
		/* Set the level of the new index page */
		btr_page_set_level(page, NULL, 0, mtr);
	}

	block->check_index_page_at_flush = TRUE;

	/* Set the index id of the page */
	btr_page_set_index_id(page, page_zip, index_id, mtr);

	/* Set the next node and previous node fields: a fresh root has
	no siblings on its level. */
	btr_page_set_next(page, page_zip, FIL_NULL, mtr);
	btr_page_set_prev(page, page_zip, FIL_NULL, mtr);

	/* We reset the free bits for the page to allow creation of several
	trees in the same mtr, otherwise the latch on a bitmap page would
	prevent it because of the latching order */

	if (!(type & DICT_CLUSTERED)) {
		ibuf_reset_free_bits(block);
	}

	/* In the following assertion we test that two records of maximum
	allowed size fit on the root page: this fact is needed to ensure
	correctness of split algorithms */

	ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE);

	return(page_no);
}
 
844
 
 
845
/************************************************************//**
Frees a B-tree except the root page, which MUST be freed after this
by calling btr_free_root. */
UNIV_INTERN
void
btr_free_but_not_root(
/*==================*/
	ulint	space,		/*!< in: space where created */
	ulint	zip_size,	/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	ulint	root_page_no)	/*!< in: root page number */
{
	ibool	finished;
	page_t*	root;
	mtr_t	mtr;

	/* First free the leaf segment, one step at a time.  Each step
	runs in its own mini-transaction, which is committed before the
	next step begins. */
leaf_loop:
	mtr_start(&mtr);

	root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, &mtr);
#ifdef UNIV_BTR_DEBUG
	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
				    + root, space));
	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
				    + root, space));
#endif /* UNIV_BTR_DEBUG */

	/* NOTE: page hash indexes are dropped when a page is freed inside
	fsp0fsp. */

	finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
				  &mtr);
	mtr_commit(&mtr);

	if (!finished) {

		goto leaf_loop;
	}
	/* Then free the non-leaf (top) segment, again stepwise, but
	leave its header page — the root — in place for btr_free_root(). */
top_loop:
	mtr_start(&mtr);

	root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, &mtr);
#ifdef UNIV_BTR_DEBUG
	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
				    + root, space));
#endif /* UNIV_BTR_DEBUG */

	finished = fseg_free_step_not_header(
		root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);
	mtr_commit(&mtr);

	if (!finished) {

		goto top_loop;
	}
}
 
901
 
 
902
/************************************************************//**
Frees the B-tree root page. Other tree MUST already have been freed. */
UNIV_INTERN
void
btr_free_root(
/*==========*/
	ulint	space,		/*!< in: space where created */
	ulint	zip_size,	/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	ulint	root_page_no,	/*!< in: root page number */
	mtr_t*	mtr)		/*!< in: a mini-transaction which has already
				been started */
{
	buf_block_t*	block;
	fseg_header_t*	header;

	block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);

	/* Drop any adaptive hash index entries pointing into the root
	before the page is released. */
	btr_search_drop_page_hash_index(block);

	/* The top (non-leaf) segment header resides in the root page. */
	header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP;
#ifdef UNIV_BTR_DEBUG
	ut_a(btr_root_fseg_validate(header, space));
#endif /* UNIV_BTR_DEBUG */

	/* Keep stepping until fseg_free_step() reports the segment,
	including the root page itself, fully freed. */
	while (!fseg_free_step(header, mtr));
}
 
929
#endif /* !UNIV_HOTBACKUP */
 
930
 
 
931
/*************************************************************//**
Reorganizes an index page: recreates the page and reinserts its
records, reclaiming space lost to fragmentation.
@return TRUE on success, FALSE on failure */
static
ibool
btr_page_reorganize_low(
/*====================*/
	ibool		recovery,/*!< in: TRUE if called in recovery:
				locks should not be updated, i.e.,
				there cannot exist locks on the
				page, and a hash index should not be
				dropped: it cannot exist */
	buf_block_t*	block,	/*!< in: page to be reorganized */
	dict_index_t*	index,	/*!< in: record descriptor */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_t*		page		= buf_block_get_frame(block);
	page_zip_des_t*	page_zip	= buf_block_get_page_zip(block);
	buf_block_t*	temp_block;
	page_t*		temp_page;
	ulint		log_mode;
	ulint		data_size1;
	ulint		data_size2;
	ulint		max_ins_size1;
	ulint		max_ins_size2;
	ibool		success		= FALSE;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
	/* Record sizes before the reorganize; they must be unchanged
	afterwards, which is verified at the end of this function. */
	data_size1 = page_get_data_size(page);
	max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1);

#ifndef UNIV_HOTBACKUP
	/* Write the log record */
	mlog_open_and_write_index(mtr, page, index, page_is_comp(page)
				  ? MLOG_COMP_PAGE_REORGANIZE
				  : MLOG_PAGE_REORGANIZE, 0);
#endif /* !UNIV_HOTBACKUP */

	/* Turn logging off: the single MLOG_*PAGE_REORGANIZE record
	written above stands in for all of the page modifications below. */
	log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);

#ifndef UNIV_HOTBACKUP
	temp_block = buf_block_alloc(0);
#else /* !UNIV_HOTBACKUP */
	ut_ad(block == back_block1);
	temp_block = back_block2;
#endif /* !UNIV_HOTBACKUP */
	temp_page = temp_block->frame;

	/* Copy the old page to temporary space */
	buf_frame_copy(temp_page, page);

#ifndef UNIV_HOTBACKUP
	if (UNIV_LIKELY(!recovery)) {
		btr_search_drop_page_hash_index(block);
	}

	block->check_index_page_at_flush = TRUE;
#endif /* !UNIV_HOTBACKUP */

	/* Recreate the page: note that global data on page (possible
	segment headers, next page-field, etc.) is preserved intact */

	page_create(block, mtr, dict_table_is_comp(index->table));

	/* Copy the records from the temporary space to the recreated page;
	do not copy the lock bits yet */

	page_copy_rec_list_end_no_locks(block, temp_block,
					page_get_infimum_rec(temp_page),
					index, mtr);

	if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
		/* Copy max trx id to recreated page */
		trx_id_t	max_trx_id = page_get_max_trx_id(temp_page);
		page_set_max_trx_id(block, NULL, max_trx_id, mtr);
		/* In crash recovery, dict_index_is_sec_or_ibuf() always
		returns TRUE, even for clustered indexes.  max_trx_id is
		unused in clustered index pages. */
		ut_ad(!ut_dulint_is_zero(max_trx_id) || recovery);
	}

	if (UNIV_LIKELY_NULL(page_zip)
	    && UNIV_UNLIKELY
	    (!page_zip_compress(page_zip, page, index, NULL))) {

		/* Recompression failed: restore the old page contents
		from the temporary copy and exit with success == FALSE. */

#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
		/* Check that the bytes that we skip are identical. */
		ut_a(!memcmp(page, temp_page, PAGE_HEADER));
		ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
			     PAGE_HEADER + PAGE_N_RECS + temp_page,
			     PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
		ut_a(!memcmp(UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page,
			     UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + temp_page,
			     FIL_PAGE_DATA_END));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */

		memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
		       PAGE_N_RECS - PAGE_N_DIR_SLOTS);
		memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
		       UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END);

#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
		ut_a(!memcmp(page, temp_page, UNIV_PAGE_SIZE));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */

		goto func_exit;
	}

#ifndef UNIV_HOTBACKUP
	if (UNIV_LIKELY(!recovery)) {
		/* Update the record lock bitmaps */
		lock_move_reorganize_page(block, temp_block);
	}
#endif /* !UNIV_HOTBACKUP */

	/* A reorganize must never change the amount of data on the
	page nor the available insert space; anything else indicates
	corruption. */
	data_size2 = page_get_data_size(page);
	max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1);

	if (UNIV_UNLIKELY(data_size1 != data_size2)
	    || UNIV_UNLIKELY(max_ins_size1 != max_ins_size2)) {
		buf_page_print(page, 0);
		buf_page_print(temp_page, 0);
		fprintf(stderr,
			"InnoDB: Error: page old data size %lu"
			" new data size %lu\n"
			"InnoDB: Error: page old max ins size %lu"
			" new max ins size %lu\n"
			"InnoDB: Submit a detailed bug report"
			" to http://bugs.mysql.com\n",
			(unsigned long) data_size1, (unsigned long) data_size2,
			(unsigned long) max_ins_size1,
			(unsigned long) max_ins_size2);
	} else {
		success = TRUE;
	}

func_exit:
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
#ifndef UNIV_HOTBACKUP
	buf_block_free(temp_block);
#endif /* !UNIV_HOTBACKUP */

	/* Restore logging mode */
	mtr_set_log_mode(mtr, log_mode);

	return(success);
}
 
1086
 
 
1087
#ifndef UNIV_HOTBACKUP
 
1088
/*************************************************************//**
 
1089
Reorganizes an index page.
 
1090
IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf
 
1091
page of a non-clustered index, the caller must update the insert
 
1092
buffer free bits in the same mini-transaction in such a way that the
 
1093
modification will be redo-logged.
 
1094
@return TRUE on success, FALSE on failure */
 
1095
UNIV_INTERN
 
1096
ibool
 
1097
btr_page_reorganize(
 
1098
/*================*/
 
1099
        buf_block_t*    block,  /*!< in: page to be reorganized */
 
1100
        dict_index_t*   index,  /*!< in: record descriptor */
 
1101
        mtr_t*          mtr)    /*!< in: mtr */
 
1102
{
 
1103
        return(btr_page_reorganize_low(FALSE, block, index, mtr));
 
1104
}
 
1105
#endif /* !UNIV_HOTBACKUP */
 
1106
 
 
1107
/***********************************************************//**
 
1108
Parses a redo log record of reorganizing a page.
 
1109
@return end of log record or NULL */
 
1110
UNIV_INTERN
 
1111
byte*
 
1112
btr_parse_page_reorganize(
 
1113
/*======================*/
 
1114
        byte*           ptr,    /*!< in: buffer */
 
1115
        byte*           end_ptr __attribute__((unused)),
 
1116
                                /*!< in: buffer end */
 
1117
        dict_index_t*   index,  /*!< in: record descriptor */
 
1118
        buf_block_t*    block,  /*!< in: page to be reorganized, or NULL */
 
1119
        mtr_t*          mtr)    /*!< in: mtr or NULL */
 
1120
{
 
1121
        ut_ad(ptr && end_ptr);
 
1122
 
 
1123
        /* The record is empty, except for the record initial part */
 
1124
 
 
1125
        if (UNIV_LIKELY(block != NULL)) {
 
1126
                btr_page_reorganize_low(TRUE, block, index, mtr);
 
1127
        }
 
1128
 
 
1129
        return(ptr);
 
1130
}
 
1131
 
 
1132
#ifndef UNIV_HOTBACKUP
 
1133
/*************************************************************//**
Empties an index page.  @see btr_page_create(). */
static
void
btr_page_empty(
/*===========*/
	buf_block_t*	block,	/*!< in: page to be emptied */
	page_zip_des_t*	page_zip,/*!< out: compressed page, or NULL */
	dict_index_t*	index,	/*!< in: index of the page */
	ulint		level,	/*!< in: the B-tree level of the page */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_t*	page = buf_block_get_frame(block);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(page_zip == buf_block_get_page_zip(block));
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

	/* Any adaptive hash index entries for this page become stale
	once the records are gone. */
	btr_search_drop_page_hash_index(block);

	/* Recreate the page: note that global data on page (possible
	segment headers, next page-field, etc.) is preserved intact */

	if (UNIV_LIKELY_NULL(page_zip)) {
		/* page_create_zip() takes the level itself ... */
		page_create_zip(block, index, level, mtr);
	} else {
		/* ... whereas for uncompressed pages the level is set
		separately after page_create(). */
		page_create(block, mtr, dict_table_is_comp(index->table));
		btr_page_set_level(page, NULL, level, mtr);
	}

	block->check_index_page_at_flush = TRUE;
}
 
1167
 
 
1168
/*************************************************************//**
Makes tree one level higher by splitting the root, and inserts
the tuple. It is assumed that mtr contains an x-latch on the tree.
NOTE that the operation of this function must always succeed,
we cannot reverse it: therefore enough free disk space must be
guaranteed to be available before this function is called.
@return inserted record */
UNIV_INTERN
rec_t*
btr_root_raise_and_insert(
/*======================*/
	btr_cur_t*	cursor,	/*!< in: cursor at which to insert: must be
				on the root page; when the function returns,
				the cursor is positioned on the predecessor
				of the inserted record */
	const dtuple_t*	tuple,	/*!< in: tuple to insert */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	mtr_t*		mtr)	/*!< in: mtr */
{
	dict_index_t*	index;
	page_t*		root;
	page_t*		new_page;
	ulint		new_page_no;
	rec_t*		rec;
	mem_heap_t*	heap;
	dtuple_t*	node_ptr;
	ulint		level;
	rec_t*		node_ptr_rec;
	page_cur_t*	page_cursor;
	page_zip_des_t*	root_page_zip;
	page_zip_des_t*	new_page_zip;
	buf_block_t*	root_block;
	buf_block_t*	new_block;

	root = btr_cur_get_page(cursor);
	root_block = btr_cur_get_block(cursor);
	root_page_zip = buf_block_get_page_zip(root_block);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!root_page_zip || page_zip_validate(root_page_zip, root));
#endif /* UNIV_ZIP_DEBUG */
	index = btr_cur_get_index(cursor);
#ifdef UNIV_BTR_DEBUG
	if (!dict_index_is_ibuf(index)) {
		ulint	space = dict_index_get_space(index);

		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
					    + root, space));
		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
					    + root, space));
	}

	ut_a(dict_index_get_page(index) == page_get_page_no(root));
#endif /* UNIV_BTR_DEBUG */
	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, root_block, MTR_MEMO_PAGE_X_FIX));

	/* Allocate a new page to the tree. Root splitting is done by first
	moving the root records to the new page, emptying the root, putting
	a node pointer to the new page, and then splitting the new page. */

	level = btr_page_get_level(root, mtr);

	new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr);
	new_page = buf_block_get_frame(new_block);
	new_page_zip = buf_block_get_page_zip(new_block);
	/* The new page must have the same compression state and
	compressed page size as the root. */
	ut_a(!new_page_zip == !root_page_zip);
	ut_a(!new_page_zip
	     || page_zip_get_size(new_page_zip)
	     == page_zip_get_size(root_page_zip));

	btr_page_create(new_block, new_page_zip, index, level, mtr);

	/* Set the next node and previous node fields of new page */
	btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr);
	btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr);

	/* Copy the records from root to the new page one by one. */

	if (0
#ifdef UNIV_ZIP_COPY
	    || new_page_zip
#endif /* UNIV_ZIP_COPY */
	    || UNIV_UNLIKELY
	    (!page_copy_rec_list_end(new_block, root_block,
				     page_get_infimum_rec(root),
				     index, mtr))) {
		/* The record-by-record copy failed (or byte-for-byte
		copying is forced for compressed pages); this can only
		be handled for compressed pages. */
		ut_a(new_page_zip);

		/* Copy the page byte for byte. */
		page_zip_copy_recs(new_page_zip, new_page,
				   root_page_zip, root, index, mtr);

		/* Update the lock table and possible hash index. */

		lock_move_rec_list_end(new_block, root_block,
				       page_get_infimum_rec(root));

		btr_search_move_or_delete_hash_entries(new_block, root_block,
						       index);
	}

	/* If this is a pessimistic insert which is actually done to
	perform a pessimistic update then we have stored the lock
	information of the record to be inserted on the infimum of the
	root page: we cannot discard the lock structs on the root page */

	lock_update_root_raise(new_block, root_block);

	/* Create a memory heap where the node pointer is stored */
	heap = mem_heap_create(100);

	rec = page_rec_get_next(page_get_infimum_rec(new_page));
	new_page_no = buf_block_get_page_no(new_block);

	/* Build the node pointer (= node key and page address) for the
	child */

	node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap,
					     level);
	/* The node pointer must be marked as the predefined minimum record,
	as there is no lower alphabetical limit to records in the leftmost
	node of a level: */
	dtuple_set_info_bits(node_ptr,
			     dtuple_get_info_bits(node_ptr)
			     | REC_INFO_MIN_REC_FLAG);

	/* Rebuild the root page to get free space */
	btr_page_empty(root_block, root_page_zip, index, level + 1, mtr);

	/* Set the next node and previous node fields, although
	they should already have been set.  The previous node field
	must be FIL_NULL if root_page_zip != NULL, because the
	REC_INFO_MIN_REC_FLAG (of the first user record) will be
	set if and only if btr_page_get_prev() == FIL_NULL. */
	btr_page_set_next(root, root_page_zip, FIL_NULL, mtr);
	btr_page_set_prev(root, root_page_zip, FIL_NULL, mtr);

	page_cursor = btr_cur_get_page_cur(cursor);

	/* Insert node pointer to the root */

	page_cur_set_before_first(root_block, page_cursor);

	node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
					     index, 0, mtr);

	/* The root page should only contain the node pointer
	to new_page at this point.  Thus, the data should fit. */
	ut_a(node_ptr_rec);

	/* Free the memory heap */
	mem_heap_free(heap);

	/* We play safe and reset the free bits for the new page */

#if 0
	fprintf(stderr, "Root raise new page no %lu\n", new_page_no);
#endif

	if (!dict_index_is_clust(index)) {
		ibuf_reset_free_bits(new_block);
	}

	/* Reposition the cursor to the child node */
	page_cur_search(new_block, index, tuple,
			PAGE_CUR_LE, page_cursor);

	/* Split the child and insert tuple */
	return(btr_page_split_and_insert(cursor, tuple, n_ext, mtr));
}
 
1339
 
 
1340
/*************************************************************//**
 
1341
Decides if the page should be split at the convergence point of inserts
 
1342
converging to the left.
 
1343
@return TRUE if split recommended */
 
1344
UNIV_INTERN
 
1345
ibool
 
1346
btr_page_get_split_rec_to_left(
 
1347
/*===========================*/
 
1348
        btr_cur_t*      cursor, /*!< in: cursor at which to insert */
 
1349
        rec_t**         split_rec) /*!< out: if split recommended,
 
1350
                                the first record on upper half page,
 
1351
                                or NULL if tuple to be inserted should
 
1352
                                be first */
 
1353
{
 
1354
        page_t* page;
 
1355
        rec_t*  insert_point;
 
1356
        rec_t*  infimum;
 
1357
 
 
1358
        page = btr_cur_get_page(cursor);
 
1359
        insert_point = btr_cur_get_rec(cursor);
 
1360
 
 
1361
        if (page_header_get_ptr(page, PAGE_LAST_INSERT)
 
1362
            == page_rec_get_next(insert_point)) {
 
1363
 
 
1364
                infimum = page_get_infimum_rec(page);
 
1365
 
 
1366
                /* If the convergence is in the middle of a page, include also
 
1367
                the record immediately before the new insert to the upper
 
1368
                page. Otherwise, we could repeatedly move from page to page
 
1369
                lots of records smaller than the convergence point. */
 
1370
 
 
1371
                if (infimum != insert_point
 
1372
                    && page_rec_get_next(infimum) != insert_point) {
 
1373
 
 
1374
                        *split_rec = insert_point;
 
1375
                } else {
 
1376
                        *split_rec = page_rec_get_next(insert_point);
 
1377
                }
 
1378
 
 
1379
                return(TRUE);
 
1380
        }
 
1381
 
 
1382
        return(FALSE);
 
1383
}
 
1384
 
 
1385
/*************************************************************//**
 
1386
Decides if the page should be split at the convergence point of inserts
 
1387
converging to the right.
 
1388
@return TRUE if split recommended */
 
1389
UNIV_INTERN
 
1390
ibool
 
1391
btr_page_get_split_rec_to_right(
 
1392
/*============================*/
 
1393
        btr_cur_t*      cursor, /*!< in: cursor at which to insert */
 
1394
        rec_t**         split_rec) /*!< out: if split recommended,
 
1395
                                the first record on upper half page,
 
1396
                                or NULL if tuple to be inserted should
 
1397
                                be first */
 
1398
{
 
1399
        page_t* page;
 
1400
        rec_t*  insert_point;
 
1401
 
 
1402
        page = btr_cur_get_page(cursor);
 
1403
        insert_point = btr_cur_get_rec(cursor);
 
1404
 
 
1405
        /* We use eager heuristics: if the new insert would be right after
 
1406
        the previous insert on the same page, we assume that there is a
 
1407
        pattern of sequential inserts here. */
 
1408
 
 
1409
        if (UNIV_LIKELY(page_header_get_ptr(page, PAGE_LAST_INSERT)
 
1410
                        == insert_point)) {
 
1411
 
 
1412
                rec_t*  next_rec;
 
1413
 
 
1414
                next_rec = page_rec_get_next(insert_point);
 
1415
 
 
1416
                if (page_rec_is_supremum(next_rec)) {
 
1417
split_at_new:
 
1418
                        /* Split at the new record to insert */
 
1419
                        *split_rec = NULL;
 
1420
                } else {
 
1421
                        rec_t*  next_next_rec = page_rec_get_next(next_rec);
 
1422
                        if (page_rec_is_supremum(next_next_rec)) {
 
1423
 
 
1424
                                goto split_at_new;
 
1425
                        }
 
1426
 
 
1427
                        /* If there are >= 2 user records up from the insert
 
1428
                        point, split all but 1 off. We want to keep one because
 
1429
                        then sequential inserts can use the adaptive hash
 
1430
                        index, as they can do the necessary checks of the right
 
1431
                        search position just by looking at the records on this
 
1432
                        page. */
 
1433
 
 
1434
                        *split_rec = next_next_rec;
 
1435
                }
 
1436
 
 
1437
                return(TRUE);
 
1438
        }
 
1439
 
 
1440
        return(FALSE);
 
1441
}
 
1442
 
 
1443
/*************************************************************//**
Calculates a split record such that the tuple will certainly fit on
its half-page when the split is performed. We assume in this function
only that the cursor page has at least one user record.
@return split record, or NULL if tuple will be the first record on
upper half-page */
static
rec_t*
btr_page_get_sure_split_rec(
/*========================*/
	btr_cur_t*	cursor,	/*!< in: cursor at which insert should be made */
	const dtuple_t*	tuple,	/*!< in: tuple to insert */
	ulint		n_ext)	/*!< in: number of externally stored columns */
{
	page_t*		page;
	page_zip_des_t*	page_zip;
	ulint		insert_size;	/* byte size of tuple when converted
					to a physical record */
	ulint		free_space;	/* usable space on an empty page */
	ulint		total_data;	/* data bytes after the insert */
	ulint		total_n_recs;	/* record count after the insert */
	ulint		total_space;	/* total_data plus directory space */
	ulint		incl_data;	/* data bytes included so far in
					the left half */
	rec_t*		ins_rec;	/* record the cursor points to;
					tuple would be inserted after it */
	rec_t*		rec;
	rec_t*		next_rec;
	ulint		n;		/* number of records (including the
					tuple) included so far */
	mem_heap_t*	heap;
	ulint*		offsets;

	page = btr_cur_get_page(cursor);

	insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
	free_space  = page_get_free_space_of_empty(page_is_comp(page));

	page_zip = btr_cur_get_page_zip(cursor);
	if (UNIV_LIKELY_NULL(page_zip)) {
		/* Estimate the free space of an empty compressed page.
		Use the smaller of the two estimates, so that the split
		record chosen also fits on a compressed page. */
		ulint	free_space_zip = page_zip_empty_size(
			cursor->index->n_fields,
			page_zip_get_size(page_zip));

		if (UNIV_LIKELY(free_space > (ulint) free_space_zip)) {
			free_space = (ulint) free_space_zip;
		}
	}

	/* free_space is now the free space of a created new page */

	total_data   = page_get_data_size(page) + insert_size;
	total_n_recs = page_get_n_recs(page) + 1;
	ut_ad(total_n_recs >= 2);
	total_space  = total_data + page_dir_calc_reserved_space(total_n_recs);

	n = 0;
	incl_data = 0;
	ins_rec = btr_cur_get_rec(cursor);
	rec = page_get_infimum_rec(page);

	heap = NULL;
	offsets = NULL;

	/* We start to include records to the left half, and when the
	space reserved by them exceeds half of total_space, then if
	the included records fit on the left page, they will be put there
	if something was left over also for the right page,
	otherwise the last included record will be the first on the right
	half page */

	do {
		/* Decide the next record to include.  The records are
		walked in page order, with the tuple visited logically
		right after ins_rec (the cursor position).  While the
		tuple is the "current" record, rec is NULL. */
		if (rec == ins_rec) {
			rec = NULL;	/* NULL denotes that tuple is
					now included */
		} else if (rec == NULL) {
			/* The tuple was just included; resume the walk
			at the record following the cursor position. */
			rec = page_rec_get_next(ins_rec);
		} else {
			rec = page_rec_get_next(rec);
		}

		if (rec == NULL) {
			/* Include tuple */
			incl_data += insert_size;
		} else {
			offsets = rec_get_offsets(rec, cursor->index,
						  offsets, ULINT_UNDEFINED,
						  &heap);
			incl_data += rec_offs_size(offsets);
		}

		n++;
	} while (incl_data + page_dir_calc_reserved_space(n)
		 < total_space / 2);

	if (incl_data + page_dir_calc_reserved_space(n) <= free_space) {
		/* The next record will be the first on
		the right half page if it is not the
		supremum record of page */

		if (rec == ins_rec) {
			/* The tuple itself would be first on the upper
			half; the NULL return encodes that. */
			rec = NULL;

			goto func_exit;
		} else if (rec == NULL) {
			next_rec = page_rec_get_next(ins_rec);
		} else {
			next_rec = page_rec_get_next(rec);
		}
		ut_ad(next_rec);
		if (!page_rec_is_supremum(next_rec)) {
			rec = next_rec;
		}
	}

func_exit:
	/* heap is only allocated lazily by rec_get_offsets(). */
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(rec);
}
 
1562
 
 
1563
/*************************************************************//**
 
1564
Returns TRUE if the insert fits on the appropriate half-page with the
 
1565
chosen split_rec.
 
1566
@return TRUE if fits */
 
1567
static
 
1568
ibool
 
1569
btr_page_insert_fits(
 
1570
/*=================*/
 
1571
        btr_cur_t*      cursor, /*!< in: cursor at which insert
 
1572
                                should be made */
 
1573
        const rec_t*    split_rec,/*!< in: suggestion for first record
 
1574
                                on upper half-page, or NULL if
 
1575
                                tuple to be inserted should be first */
 
1576
        const ulint*    offsets,/*!< in: rec_get_offsets(
 
1577
                                split_rec, cursor->index) */
 
1578
        const dtuple_t* tuple,  /*!< in: tuple to insert */
 
1579
        ulint           n_ext,  /*!< in: number of externally stored columns */
 
1580
        mem_heap_t*     heap)   /*!< in: temporary memory heap */
 
1581
{
 
1582
        page_t*         page;
 
1583
        ulint           insert_size;
 
1584
        ulint           free_space;
 
1585
        ulint           total_data;
 
1586
        ulint           total_n_recs;
 
1587
        const rec_t*    rec;
 
1588
        const rec_t*    end_rec;
 
1589
        ulint*          offs;
 
1590
 
 
1591
        page = btr_cur_get_page(cursor);
 
1592
 
 
1593
        ut_ad(!split_rec == !offsets);
 
1594
        ut_ad(!offsets
 
1595
              || !page_is_comp(page) == !rec_offs_comp(offsets));
 
1596
        ut_ad(!offsets
 
1597
              || rec_offs_validate(split_rec, cursor->index, offsets));
 
1598
 
 
1599
        insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
 
1600
        free_space  = page_get_free_space_of_empty(page_is_comp(page));
 
1601
 
 
1602
        /* free_space is now the free space of a created new page */
 
1603
 
 
1604
        total_data   = page_get_data_size(page) + insert_size;
 
1605
        total_n_recs = page_get_n_recs(page) + 1;
 
1606
 
 
1607
        /* We determine which records (from rec to end_rec, not including
 
1608
        end_rec) will end up on the other half page from tuple when it is
 
1609
        inserted. */
 
1610
 
 
1611
        if (split_rec == NULL) {
 
1612
                rec = page_rec_get_next(page_get_infimum_rec(page));
 
1613
                end_rec = page_rec_get_next(btr_cur_get_rec(cursor));
 
1614
 
 
1615
        } else if (cmp_dtuple_rec(tuple, split_rec, offsets) >= 0) {
 
1616
 
 
1617
                rec = page_rec_get_next(page_get_infimum_rec(page));
 
1618
                end_rec = split_rec;
 
1619
        } else {
 
1620
                rec = split_rec;
 
1621
                end_rec = page_get_supremum_rec(page);
 
1622
        }
 
1623
 
 
1624
        if (total_data + page_dir_calc_reserved_space(total_n_recs)
 
1625
            <= free_space) {
 
1626
 
 
1627
                /* Ok, there will be enough available space on the
 
1628
                half page where the tuple is inserted */
 
1629
 
 
1630
                return(TRUE);
 
1631
        }
 
1632
 
 
1633
        offs = NULL;
 
1634
 
 
1635
        while (rec != end_rec) {
 
1636
                /* In this loop we calculate the amount of reserved
 
1637
                space after rec is removed from page. */
 
1638
 
 
1639
                offs = rec_get_offsets(rec, cursor->index, offs,
 
1640
                                       ULINT_UNDEFINED, &heap);
 
1641
 
 
1642
                total_data -= rec_offs_size(offs);
 
1643
                total_n_recs--;
 
1644
 
 
1645
                if (total_data + page_dir_calc_reserved_space(total_n_recs)
 
1646
                    <= free_space) {
 
1647
 
 
1648
                        /* Ok, there will be enough available space on the
 
1649
                        half page where the tuple is inserted */
 
1650
 
 
1651
                        return(TRUE);
 
1652
                }
 
1653
 
 
1654
                rec = page_rec_get_next_const(rec);
 
1655
        }
 
1656
 
 
1657
        return(FALSE);
 
1658
}
 
1659
 
 
1660
/*******************************************************//**
 
1661
Inserts a data tuple to a tree on a non-leaf level. It is assumed
 
1662
that mtr holds an x-latch on the tree. */
 
1663
UNIV_INTERN
 
1664
void
 
1665
btr_insert_on_non_leaf_level(
 
1666
/*=========================*/
 
1667
        dict_index_t*   index,  /*!< in: index */
 
1668
        ulint           level,  /*!< in: level, must be > 0 */
 
1669
        dtuple_t*       tuple,  /*!< in: the record to be inserted */
 
1670
        mtr_t*          mtr)    /*!< in: mtr */
 
1671
{
 
1672
        big_rec_t*      dummy_big_rec;
 
1673
        btr_cur_t       cursor;
 
1674
        ulint           err;
 
1675
        rec_t*          rec;
 
1676
 
 
1677
        ut_ad(level > 0);
 
1678
 
 
1679
        btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE,
 
1680
                                    BTR_CONT_MODIFY_TREE,
 
1681
                                    &cursor, 0, mtr);
 
1682
 
 
1683
        err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG
 
1684
                                         | BTR_KEEP_SYS_FLAG
 
1685
                                         | BTR_NO_UNDO_LOG_FLAG,
 
1686
                                         &cursor, tuple, &rec,
 
1687
                                         &dummy_big_rec, 0, NULL, mtr);
 
1688
        ut_a(err == DB_SUCCESS);
 
1689
}
 
1690
 
 
1691
/**************************************************************//**
Attaches the halves of an index page on the appropriate level in an
index tree: updates the parent node pointer(s) and relinks the
doubly-linked page list of the level. */
static
void
btr_attach_half_pages(
/*==================*/
	dict_index_t*	index,		/*!< in: the index tree */
	buf_block_t*	block,		/*!< in/out: page to be split */
	rec_t*		split_rec,	/*!< in: first record on upper
					half page */
	buf_block_t*	new_block,	/*!< in/out: the new half page */
	ulint		direction,	/*!< in: FSP_UP or FSP_DOWN */
	mtr_t*		mtr)		/*!< in: mtr */
{
	ulint		space;
	ulint		zip_size;
	ulint		prev_page_no;
	ulint		next_page_no;
	ulint		level;
	page_t*		page		= buf_block_get_frame(block);
	page_t*		lower_page;
	page_t*		upper_page;
	ulint		lower_page_no;
	ulint		upper_page_no;
	page_zip_des_t*	lower_page_zip;
	page_zip_des_t*	upper_page_zip;
	dtuple_t*	node_ptr_upper;
	mem_heap_t*	heap;

	/* Both halves must be x-latched by this mini-transaction. */
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr_memo_contains(mtr, new_block, MTR_MEMO_PAGE_X_FIX));

	/* Create a memory heap where the data tuple is stored */
	heap = mem_heap_create(1024);

	/* Based on split direction, decide upper and lower pages */
	if (direction == FSP_DOWN) {

		btr_cur_t	cursor;
		ulint*		offsets;

		/* Splitting downwards: the new page becomes the lower
		half and keeps the parent's existing node pointer key,
		so the parent pointer must be redirected to it. */
		lower_page = buf_block_get_frame(new_block);
		lower_page_no = buf_block_get_page_no(new_block);
		lower_page_zip = buf_block_get_page_zip(new_block);
		upper_page = buf_block_get_frame(block);
		upper_page_no = buf_block_get_page_no(block);
		upper_page_zip = buf_block_get_page_zip(block);

		/* Look up the index for the node pointer to page */
		offsets = btr_page_get_father_block(NULL, heap, index,
						    block, mtr, &cursor);

		/* Replace the address of the old child node (= page) with the
		address of the new lower half */

		btr_node_ptr_set_child_page_no(
			btr_cur_get_rec(&cursor),
			btr_cur_get_page_zip(&cursor),
			offsets, lower_page_no, mtr);
		/* Reuse the heap for building the upper node pointer. */
		mem_heap_empty(heap);
	} else {
		/* Splitting upwards: the original page stays the lower
		half, so the existing parent pointer remains valid. */
		lower_page = buf_block_get_frame(block);
		lower_page_no = buf_block_get_page_no(block);
		lower_page_zip = buf_block_get_page_zip(block);
		upper_page = buf_block_get_frame(new_block);
		upper_page_no = buf_block_get_page_no(new_block);
		upper_page_zip = buf_block_get_page_zip(new_block);
	}

	/* Get the level of the split pages */
	level = btr_page_get_level(buf_block_get_frame(block), mtr);
	ut_ad(level
	      == btr_page_get_level(buf_block_get_frame(new_block), mtr));

	/* Build the node pointer (= node key and page address) for the upper
	half */

	node_ptr_upper = dict_index_build_node_ptr(index, split_rec,
						   upper_page_no, heap, level);

	/* Insert it next to the pointer to the lower half. Note that this
	may generate recursion leading to a split on the higher level. */

	btr_insert_on_non_leaf_level(index, level + 1, node_ptr_upper, mtr);

	/* Free the memory heap */
	mem_heap_free(heap);

	/* Get the previous and next pages of page */

	prev_page_no = btr_page_get_prev(page, mtr);
	next_page_no = btr_page_get_next(page, mtr);
	space = buf_block_get_space(block);
	zip_size = buf_block_get_zip_size(block);

	/* Update page links of the level so that the new page is
	spliced in between the old page and its former neighbor. */

	if (prev_page_no != FIL_NULL) {
		buf_block_t*	prev_block = btr_block_get(space, zip_size,
							   prev_page_no,
							   RW_X_LATCH, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(prev_block->frame) == page_is_comp(page));
		ut_a(btr_page_get_next(prev_block->frame, mtr)
		     == buf_block_get_page_no(block));
#endif /* UNIV_BTR_DEBUG */

		btr_page_set_next(buf_block_get_frame(prev_block),
				  buf_block_get_page_zip(prev_block),
				  lower_page_no, mtr);
	}

	if (next_page_no != FIL_NULL) {
		buf_block_t*	next_block = btr_block_get(space, zip_size,
							   next_page_no,
							   RW_X_LATCH, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
		ut_a(btr_page_get_prev(next_block->frame, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		btr_page_set_prev(buf_block_get_frame(next_block),
				  buf_block_get_page_zip(next_block),
				  upper_page_no, mtr);
	}

	/* Finally link the two halves to each other and to the old
	neighbors of the original page. */

	btr_page_set_prev(lower_page, lower_page_zip, prev_page_no, mtr);
	btr_page_set_next(lower_page, lower_page_zip, upper_page_no, mtr);

	btr_page_set_prev(upper_page, upper_page_zip, lower_page_no, mtr);
	btr_page_set_next(upper_page, upper_page_zip, next_page_no, mtr);
}
 
1825
 
 
1826
/*************************************************************//**
 
1827
Splits an index page to halves and inserts the tuple. It is assumed
 
1828
that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
 
1829
released within this function! NOTE that the operation of this
 
1830
function must always succeed, we cannot reverse it: therefore enough
 
1831
free disk space (2 pages) must be guaranteed to be available before
 
1832
this function is called.
 
1833
 
 
1834
@return inserted record */
 
1835
UNIV_INTERN
 
1836
rec_t*
 
1837
btr_page_split_and_insert(
 
1838
/*======================*/
 
1839
        btr_cur_t*      cursor, /*!< in: cursor at which to insert; when the
 
1840
                                function returns, the cursor is positioned
 
1841
                                on the predecessor of the inserted record */
 
1842
        const dtuple_t* tuple,  /*!< in: tuple to insert */
 
1843
        ulint           n_ext,  /*!< in: number of externally stored columns */
 
1844
        mtr_t*          mtr)    /*!< in: mtr */
 
1845
{
 
1846
        buf_block_t*    block;
 
1847
        page_t*         page;
 
1848
        page_zip_des_t* page_zip;
 
1849
        ulint           page_no;
 
1850
        byte            direction;
 
1851
        ulint           hint_page_no;
 
1852
        buf_block_t*    new_block;
 
1853
        page_t*         new_page;
 
1854
        page_zip_des_t* new_page_zip;
 
1855
        rec_t*          split_rec;
 
1856
        buf_block_t*    left_block;
 
1857
        buf_block_t*    right_block;
 
1858
        buf_block_t*    insert_block;
 
1859
        page_t*         insert_page;
 
1860
        page_cur_t*     page_cursor;
 
1861
        rec_t*          first_rec;
 
1862
        byte*           buf = 0; /* remove warning */
 
1863
        rec_t*          move_limit;
 
1864
        ibool           insert_will_fit;
 
1865
        ibool           insert_left;
 
1866
        ulint           n_iterations = 0;
 
1867
        rec_t*          rec;
 
1868
        mem_heap_t*     heap;
 
1869
        ulint           n_uniq;
 
1870
        ulint*          offsets;
 
1871
 
 
1872
        heap = mem_heap_create(1024);
 
1873
        n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
 
1874
func_start:
 
1875
        mem_heap_empty(heap);
 
1876
        offsets = NULL;
 
1877
 
 
1878
        ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
 
1879
                                MTR_MEMO_X_LOCK));
 
1880
#ifdef UNIV_SYNC_DEBUG
 
1881
        ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX));
 
1882
#endif /* UNIV_SYNC_DEBUG */
 
1883
 
 
1884
        block = btr_cur_get_block(cursor);
 
1885
        page = buf_block_get_frame(block);
 
1886
        page_zip = buf_block_get_page_zip(block);
 
1887
 
 
1888
        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 
1889
        ut_ad(page_get_n_recs(page) >= 1);
 
1890
 
 
1891
        page_no = buf_block_get_page_no(block);
 
1892
 
 
1893
        /* 1. Decide the split record; split_rec == NULL means that the
 
1894
        tuple to be inserted should be the first record on the upper
 
1895
        half-page */
 
1896
 
 
1897
        if (n_iterations > 0) {
 
1898
                direction = FSP_UP;
 
1899
                hint_page_no = page_no + 1;
 
1900
                split_rec = btr_page_get_sure_split_rec(cursor, tuple, n_ext);
 
1901
 
 
1902
        } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
 
1903
                direction = FSP_UP;
 
1904
                hint_page_no = page_no + 1;
 
1905
 
 
1906
        } else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) {
 
1907
                direction = FSP_DOWN;
 
1908
                hint_page_no = page_no - 1;
 
1909
        } else {
 
1910
                direction = FSP_UP;
 
1911
                hint_page_no = page_no + 1;
 
1912
 
 
1913
                if (page_get_n_recs(page) == 1) {
 
1914
                        page_cur_t      pcur;
 
1915
 
 
1916
                        /* There is only one record in the index page
 
1917
                        therefore we can't split the node in the middle
 
1918
                        by default. We need to determine whether the
 
1919
                        new record will be inserted to the left or right. */
 
1920
 
 
1921
                        /* Read the first (and only) record in the page. */
 
1922
                        page_cur_set_before_first(block, &pcur);
 
1923
                        page_cur_move_to_next(&pcur);
 
1924
                        first_rec = page_cur_get_rec(&pcur);
 
1925
 
 
1926
                        offsets = rec_get_offsets(
 
1927
                                first_rec, cursor->index, offsets,
 
1928
                                n_uniq, &heap);
 
1929
 
 
1930
                        /* If the new record is less than the existing record
 
1931
                        the split in the middle will copy the existing
 
1932
                        record to the new node. */
 
1933
                        if (cmp_dtuple_rec(tuple, first_rec, offsets) < 0) {
 
1934
                                split_rec = page_get_middle_rec(page);
 
1935
                        } else {
 
1936
                                split_rec = NULL;
 
1937
                        }
 
1938
                } else {
 
1939
                        split_rec = page_get_middle_rec(page);
 
1940
                }
 
1941
        }
 
1942
 
 
1943
        /* 2. Allocate a new page to the index */
 
1944
        new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
 
1945
                                   btr_page_get_level(page, mtr), mtr);
 
1946
        new_page = buf_block_get_frame(new_block);
 
1947
        new_page_zip = buf_block_get_page_zip(new_block);
 
1948
        btr_page_create(new_block, new_page_zip, cursor->index,
 
1949
                        btr_page_get_level(page, mtr), mtr);
 
1950
 
 
1951
        /* 3. Calculate the first record on the upper half-page, and the
 
1952
        first record (move_limit) on original page which ends up on the
 
1953
        upper half */
 
1954
 
 
1955
        if (split_rec) {
 
1956
                first_rec = move_limit = split_rec;
 
1957
 
 
1958
                offsets = rec_get_offsets(split_rec, cursor->index, offsets,
 
1959
                                          n_uniq, &heap);
 
1960
 
 
1961
                insert_left = cmp_dtuple_rec(tuple, split_rec, offsets) < 0;
 
1962
 
 
1963
                if (UNIV_UNLIKELY(!insert_left && new_page_zip
 
1964
                                  && n_iterations > 0)) {
 
1965
                        /* If a compressed page has already been split,
 
1966
                        avoid further splits by inserting the record
 
1967
                        to an empty page. */
 
1968
                        split_rec = NULL;
 
1969
                        goto insert_right;
 
1970
                }
 
1971
        } else {
 
1972
insert_right:
 
1973
                insert_left = FALSE;
 
1974
                buf = mem_alloc(rec_get_converted_size(cursor->index,
 
1975
                                                       tuple, n_ext));
 
1976
 
 
1977
                first_rec = rec_convert_dtuple_to_rec(buf, cursor->index,
 
1978
                                                      tuple, n_ext);
 
1979
                move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
 
1980
        }
 
1981
 
 
1982
        /* 4. Do first the modifications in the tree structure */
 
1983
 
 
1984
        btr_attach_half_pages(cursor->index, block,
 
1985
                              first_rec, new_block, direction, mtr);
 
1986
 
 
1987
        /* If the split is made on the leaf level and the insert will fit
 
1988
        on the appropriate half-page, we may release the tree x-latch.
 
1989
        We can then move the records after releasing the tree latch,
 
1990
        thus reducing the tree latch contention. */
 
1991
 
 
1992
        if (split_rec) {
 
1993
                insert_will_fit = !new_page_zip
 
1994
                        && btr_page_insert_fits(cursor, split_rec,
 
1995
                                                offsets, tuple, n_ext, heap);
 
1996
        } else {
 
1997
                mem_free(buf);
 
1998
                insert_will_fit = !new_page_zip
 
1999
                        && btr_page_insert_fits(cursor, NULL,
 
2000
                                                NULL, tuple, n_ext, heap);
 
2001
        }
 
2002
 
 
2003
        if (insert_will_fit && page_is_leaf(page)) {
 
2004
 
 
2005
                mtr_memo_release(mtr, dict_index_get_lock(cursor->index),
 
2006
                                 MTR_MEMO_X_LOCK);
 
2007
        }
 
2008
 
 
2009
        /* 5. Move then the records to the new page */
 
2010
        if (direction == FSP_DOWN) {
 
2011
                /*              fputs("Split left\n", stderr); */
 
2012
 
 
2013
                if (0
 
2014
#ifdef UNIV_ZIP_COPY
 
2015
                    || page_zip
 
2016
#endif /* UNIV_ZIP_COPY */
 
2017
                    || UNIV_UNLIKELY
 
2018
                    (!page_move_rec_list_start(new_block, block, move_limit,
 
2019
                                               cursor->index, mtr))) {
 
2020
                        /* For some reason, compressing new_page failed,
 
2021
                        even though it should contain fewer records than
 
2022
                        the original page.  Copy the page byte for byte
 
2023
                        and then delete the records from both pages
 
2024
                        as appropriate.  Deleting will always succeed. */
 
2025
                        ut_a(new_page_zip);
 
2026
 
 
2027
                        page_zip_copy_recs(new_page_zip, new_page,
 
2028
                                           page_zip, page, cursor->index, mtr);
 
2029
                        page_delete_rec_list_end(move_limit - page + new_page,
 
2030
                                                 new_block, cursor->index,
 
2031
                                                 ULINT_UNDEFINED,
 
2032
                                                 ULINT_UNDEFINED, mtr);
 
2033
 
 
2034
                        /* Update the lock table and possible hash index. */
 
2035
 
 
2036
                        lock_move_rec_list_start(
 
2037
                                new_block, block, move_limit,
 
2038
                                new_page + PAGE_NEW_INFIMUM);
 
2039
 
 
2040
                        btr_search_move_or_delete_hash_entries(
 
2041
                                new_block, block, cursor->index);
 
2042
 
 
2043
                        /* Delete the records from the source page. */
 
2044
 
 
2045
                        page_delete_rec_list_start(move_limit, block,
 
2046
                                                   cursor->index, mtr);
 
2047
                }
 
2048
 
 
2049
                left_block = new_block;
 
2050
                right_block = block;
 
2051
 
 
2052
                lock_update_split_left(right_block, left_block);
 
2053
        } else {
 
2054
                /*              fputs("Split right\n", stderr); */
 
2055
 
 
2056
                if (0
 
2057
#ifdef UNIV_ZIP_COPY
 
2058
                    || page_zip
 
2059
#endif /* UNIV_ZIP_COPY */
 
2060
                    || UNIV_UNLIKELY
 
2061
                    (!page_move_rec_list_end(new_block, block, move_limit,
 
2062
                                             cursor->index, mtr))) {
 
2063
                        /* For some reason, compressing new_page failed,
 
2064
                        even though it should contain fewer records than
 
2065
                        the original page.  Copy the page byte for byte
 
2066
                        and then delete the records from both pages
 
2067
                        as appropriate.  Deleting will always succeed. */
 
2068
                        ut_a(new_page_zip);
 
2069
 
 
2070
                        page_zip_copy_recs(new_page_zip, new_page,
 
2071
                                           page_zip, page, cursor->index, mtr);
 
2072
                        page_delete_rec_list_start(move_limit - page
 
2073
                                                   + new_page, new_block,
 
2074
                                                   cursor->index, mtr);
 
2075
 
 
2076
                        /* Update the lock table and possible hash index. */
 
2077
 
 
2078
                        lock_move_rec_list_end(new_block, block, move_limit);
 
2079
 
 
2080
                        btr_search_move_or_delete_hash_entries(
 
2081
                                new_block, block, cursor->index);
 
2082
 
 
2083
                        /* Delete the records from the source page. */
 
2084
 
 
2085
                        page_delete_rec_list_end(move_limit, block,
 
2086
                                                 cursor->index,
 
2087
                                                 ULINT_UNDEFINED,
 
2088
                                                 ULINT_UNDEFINED, mtr);
 
2089
                }
 
2090
 
 
2091
                left_block = block;
 
2092
                right_block = new_block;
 
2093
 
 
2094
                lock_update_split_right(right_block, left_block);
 
2095
        }
 
2096
 
 
2097
#ifdef UNIV_ZIP_DEBUG
 
2098
        if (UNIV_LIKELY_NULL(page_zip)) {
 
2099
                ut_a(page_zip_validate(page_zip, page));
 
2100
                ut_a(page_zip_validate(new_page_zip, new_page));
 
2101
        }
 
2102
#endif /* UNIV_ZIP_DEBUG */
 
2103
 
 
2104
        /* At this point, split_rec, move_limit and first_rec may point
 
2105
        to garbage on the old page. */
 
2106
 
 
2107
        /* 6. The split and the tree modification is now completed. Decide the
 
2108
        page where the tuple should be inserted */
 
2109
 
 
2110
        if (insert_left) {
 
2111
                insert_block = left_block;
 
2112
        } else {
 
2113
                insert_block = right_block;
 
2114
        }
 
2115
 
 
2116
        insert_page = buf_block_get_frame(insert_block);
 
2117
 
 
2118
        /* 7. Reposition the cursor for insert and try insertion */
 
2119
        page_cursor = btr_cur_get_page_cur(cursor);
 
2120
 
 
2121
        page_cur_search(insert_block, cursor->index, tuple,
 
2122
                        PAGE_CUR_LE, page_cursor);
 
2123
 
 
2124
        rec = page_cur_tuple_insert(page_cursor, tuple,
 
2125
                                    cursor->index, n_ext, mtr);
 
2126
 
 
2127
#ifdef UNIV_ZIP_DEBUG
 
2128
        {
 
2129
                page_zip_des_t* insert_page_zip
 
2130
                        = buf_block_get_page_zip(insert_block);
 
2131
                ut_a(!insert_page_zip
 
2132
                     || page_zip_validate(insert_page_zip, insert_page));
 
2133
        }
 
2134
#endif /* UNIV_ZIP_DEBUG */
 
2135
 
 
2136
        if (UNIV_LIKELY(rec != NULL)) {
 
2137
 
 
2138
                goto func_exit;
 
2139
        }
 
2140
 
 
2141
        /* 8. If insert did not fit, try page reorganization */
 
2142
 
 
2143
        if (UNIV_UNLIKELY
 
2144
            (!btr_page_reorganize(insert_block, cursor->index, mtr))) {
 
2145
 
 
2146
                goto insert_failed;
 
2147
        }
 
2148
 
 
2149
        page_cur_search(insert_block, cursor->index, tuple,
 
2150
                        PAGE_CUR_LE, page_cursor);
 
2151
        rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
 
2152
                                    n_ext, mtr);
 
2153
 
 
2154
        if (UNIV_UNLIKELY(rec == NULL)) {
 
2155
                /* The insert did not fit on the page: loop back to the
 
2156
                start of the function for a new split */
 
2157
insert_failed:
 
2158
                /* We play safe and reset the free bits for new_page */
 
2159
                if (!dict_index_is_clust(cursor->index)) {
 
2160
                        ibuf_reset_free_bits(new_block);
 
2161
                }
 
2162
 
 
2163
                /* fprintf(stderr, "Split second round %lu\n",
 
2164
                page_get_page_no(page)); */
 
2165
                n_iterations++;
 
2166
                ut_ad(n_iterations < 2
 
2167
                      || buf_block_get_page_zip(insert_block));
 
2168
                ut_ad(!insert_will_fit);
 
2169
 
 
2170
                goto func_start;
 
2171
        }
 
2172
 
 
2173
func_exit:
 
2174
        /* Insert fit on the page: update the free bits for the
 
2175
        left and right pages in the same mtr */
 
2176
 
 
2177
        if (!dict_index_is_clust(cursor->index) && page_is_leaf(page)) {
 
2178
                ibuf_update_free_bits_for_two_pages_low(
 
2179
                        buf_block_get_zip_size(left_block),
 
2180
                        left_block, right_block, mtr);
 
2181
        }
 
2182
 
 
2183
#if 0
 
2184
        fprintf(stderr, "Split and insert done %lu %lu\n",
 
2185
                buf_block_get_page_no(left_block),
 
2186
                buf_block_get_page_no(right_block));
 
2187
#endif
 
2188
 
 
2189
        ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index));
 
2190
        ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index));
 
2191
 
 
2192
        mem_heap_free(heap);
 
2193
        return(rec);
 
2194
}
 
2195
 
 
2196
/*************************************************************//**
 
2197
Removes a page from the level list of pages. */
 
2198
static
 
2199
void
 
2200
btr_level_list_remove(
 
2201
/*==================*/
 
2202
        ulint           space,  /*!< in: space where removed */
 
2203
        ulint           zip_size,/*!< in: compressed page size in bytes
 
2204
                                or 0 for uncompressed pages */
 
2205
        page_t*         page,   /*!< in: page to remove */
 
2206
        mtr_t*          mtr)    /*!< in: mtr */
 
2207
{
 
2208
        ulint   prev_page_no;
 
2209
        ulint   next_page_no;
 
2210
 
 
2211
        ut_ad(page && mtr);
 
2212
        ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
 
2213
        ut_ad(space == page_get_space_id(page));
 
2214
        /* Get the previous and next page numbers of page */
 
2215
 
 
2216
        prev_page_no = btr_page_get_prev(page, mtr);
 
2217
        next_page_no = btr_page_get_next(page, mtr);
 
2218
 
 
2219
        /* Update page links of the level */
 
2220
 
 
2221
        if (prev_page_no != FIL_NULL) {
 
2222
                buf_block_t*    prev_block
 
2223
                        = btr_block_get(space, zip_size, prev_page_no,
 
2224
                                        RW_X_LATCH, mtr);
 
2225
                page_t*         prev_page
 
2226
                        = buf_block_get_frame(prev_block);
 
2227
#ifdef UNIV_BTR_DEBUG
 
2228
                ut_a(page_is_comp(prev_page) == page_is_comp(page));
 
2229
                ut_a(btr_page_get_next(prev_page, mtr)
 
2230
                     == page_get_page_no(page));
 
2231
#endif /* UNIV_BTR_DEBUG */
 
2232
 
 
2233
                btr_page_set_next(prev_page,
 
2234
                                  buf_block_get_page_zip(prev_block),
 
2235
                                  next_page_no, mtr);
 
2236
        }
 
2237
 
 
2238
        if (next_page_no != FIL_NULL) {
 
2239
                buf_block_t*    next_block
 
2240
                        = btr_block_get(space, zip_size, next_page_no,
 
2241
                                        RW_X_LATCH, mtr);
 
2242
                page_t*         next_page
 
2243
                        = buf_block_get_frame(next_block);
 
2244
#ifdef UNIV_BTR_DEBUG
 
2245
                ut_a(page_is_comp(next_page) == page_is_comp(page));
 
2246
                ut_a(btr_page_get_prev(next_page, mtr)
 
2247
                     == page_get_page_no(page));
 
2248
#endif /* UNIV_BTR_DEBUG */
 
2249
 
 
2250
                btr_page_set_prev(next_page,
 
2251
                                  buf_block_get_page_zip(next_block),
 
2252
                                  prev_page_no, mtr);
 
2253
        }
 
2254
}
 
2255
 
 
2256
/****************************************************************//**
Writes the redo log record for setting an index record as the predefined
minimum record. The log body consists only of the record's 2-byte offset
within its page; the page itself is identified by the initial log record
written by mlog_write_initial_log_record(). */
UNIV_INLINE
void
btr_set_min_rec_mark_log(
/*=====================*/
	rec_t*	rec,	/*!< in: record */
	byte	type,	/*!< in: MLOG_COMP_REC_MIN_MARK or MLOG_REC_MIN_MARK */
	mtr_t*	mtr)	/*!< in: mtr */
{
	mlog_write_initial_log_record(rec, type, mtr);

	/* Write rec offset as a 2-byte ulint */
	mlog_catenate_ulint(mtr, page_offset(rec), MLOG_2BYTES);
}
#else /* !UNIV_HOTBACKUP */
/* Hot backup builds do no redo logging: compile the call away. */
# define btr_set_min_rec_mark_log(rec,comp,mtr) ((void) 0)
#endif /* !UNIV_HOTBACKUP */
 
2275
 
 
2276
/****************************************************************//**
 
2277
Parses the redo log record for setting an index record as the predefined
 
2278
minimum record.
 
2279
@return end of log record or NULL */
 
2280
UNIV_INTERN
 
2281
byte*
 
2282
btr_parse_set_min_rec_mark(
 
2283
/*=======================*/
 
2284
        byte*   ptr,    /*!< in: buffer */
 
2285
        byte*   end_ptr,/*!< in: buffer end */
 
2286
        ulint   comp,   /*!< in: nonzero=compact page format */
 
2287
        page_t* page,   /*!< in: page or NULL */
 
2288
        mtr_t*  mtr)    /*!< in: mtr or NULL */
 
2289
{
 
2290
        rec_t*  rec;
 
2291
 
 
2292
        if (end_ptr < ptr + 2) {
 
2293
 
 
2294
                return(NULL);
 
2295
        }
 
2296
 
 
2297
        if (page) {
 
2298
                ut_a(!page_is_comp(page) == !comp);
 
2299
 
 
2300
                rec = page + mach_read_from_2(ptr);
 
2301
 
 
2302
                btr_set_min_rec_mark(rec, mtr);
 
2303
        }
 
2304
 
 
2305
        return(ptr + 2);
 
2306
}
 
2307
 
 
2308
/****************************************************************//**
 
2309
Sets a record as the predefined minimum record. */
 
2310
UNIV_INTERN
 
2311
void
 
2312
btr_set_min_rec_mark(
 
2313
/*=================*/
 
2314
        rec_t*  rec,    /*!< in: record */
 
2315
        mtr_t*  mtr)    /*!< in: mtr */
 
2316
{
 
2317
        ulint   info_bits;
 
2318
 
 
2319
        if (UNIV_LIKELY(page_rec_is_comp(rec))) {
 
2320
                info_bits = rec_get_info_bits(rec, TRUE);
 
2321
 
 
2322
                rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG);
 
2323
 
 
2324
                btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr);
 
2325
        } else {
 
2326
                info_bits = rec_get_info_bits(rec, FALSE);
 
2327
 
 
2328
                rec_set_info_bits_old(rec, info_bits | REC_INFO_MIN_REC_FLAG);
 
2329
 
 
2330
                btr_set_min_rec_mark_log(rec, MLOG_REC_MIN_MARK, mtr);
 
2331
        }
 
2332
}
 
2333
 
 
2334
#ifndef UNIV_HOTBACKUP
 
2335
/*************************************************************//**
 
2336
Deletes on the upper level the node pointer to a page. */
 
2337
UNIV_INTERN
 
2338
void
 
2339
btr_node_ptr_delete(
 
2340
/*================*/
 
2341
        dict_index_t*   index,  /*!< in: index tree */
 
2342
        buf_block_t*    block,  /*!< in: page whose node pointer is deleted */
 
2343
        mtr_t*          mtr)    /*!< in: mtr */
 
2344
{
 
2345
        btr_cur_t       cursor;
 
2346
        ibool           compressed;
 
2347
        ulint           err;
 
2348
 
 
2349
        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 
2350
 
 
2351
        /* Delete node pointer on father page */
 
2352
        btr_page_get_father(index, block, mtr, &cursor);
 
2353
 
 
2354
        compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor, RB_NONE,
 
2355
                                                mtr);
 
2356
        ut_a(err == DB_SUCCESS);
 
2357
 
 
2358
        if (!compressed) {
 
2359
                btr_cur_compress_if_useful(&cursor, mtr);
 
2360
        }
 
2361
}
 
2362
 
 
2363
/*************************************************************//**
If page is the only on its level, this function moves its records to the
father page, thus reducing the tree height. */
static
void
btr_lift_page_up(
/*=============*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: page which is the only on its level;
				must not be empty: use
				btr_discard_only_page_on_level if the last
				record from the page should be removed */
	mtr_t*		mtr)	/*!< in: mtr */
{
	buf_block_t*	father_block;
	page_t*		father_page;
	ulint		page_level;
	page_zip_des_t*	father_page_zip;
	page_t*		page		= buf_block_get_frame(block);
	ulint		root_page_no;
	buf_block_t*	blocks[BTR_MAX_LEVELS];
	ulint		n_blocks;	/*!< last used index in blocks[] */
	ulint		i;

	/* The page must have no siblings on its level. */
	ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
	ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));

	page_level = btr_page_get_level(page, mtr);
	root_page_no = dict_index_get_page(index);

	/* Scope block: locate the father page and collect all further
	ancestors up to (but excluding) the root, using a temporary heap
	for the record offsets. */
	{
		btr_cur_t	cursor;
		mem_heap_t*	heap	= mem_heap_create(100);
		ulint*		offsets;
		buf_block_t*	b;

		offsets = btr_page_get_father_block(NULL, heap, index,
						    block, mtr, &cursor);
		father_block = btr_cur_get_block(&cursor);
		father_page_zip = buf_block_get_page_zip(father_block);
		father_page = buf_block_get_frame(father_block);

		n_blocks = 0;

		/* Store all ancestor pages so we can reset their
		levels later on.  We have to do all the searches on
		the tree now because later on, after we've replaced
		the first level, the tree is in an inconsistent state
		and can not be searched. */
		for (b = father_block;
		     buf_block_get_page_no(b) != root_page_no; ) {
			ut_a(n_blocks < BTR_MAX_LEVELS);

			offsets = btr_page_get_father_block(offsets, heap,
							    index, b,
							    mtr, &cursor);

			blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
		}

		mem_heap_free(heap);
	}

	/* Invalidate any search hash entries for the page that is about
	to be emptied and freed. */
	btr_search_drop_page_hash_index(block);

	/* Make the father empty */
	btr_page_empty(father_block, father_page_zip, index, page_level, mtr);

	/* Copy the records to the father page one by one.  If the
	record-by-record copy fails (which page_copy_rec_list_end() may
	when the father is a compressed page), fall back to copying the
	page byte for byte below. */
	if (0
#ifdef UNIV_ZIP_COPY
	    || father_page_zip
#endif /* UNIV_ZIP_COPY */
	    || UNIV_UNLIKELY
	    (!page_copy_rec_list_end(father_block, block,
				     page_get_infimum_rec(page),
				     index, mtr))) {
		const page_zip_des_t*	page_zip
			= buf_block_get_page_zip(block);
		ut_a(father_page_zip);
		ut_a(page_zip);

		/* Copy the page byte for byte. */
		page_zip_copy_recs(father_page_zip, father_page,
				   page_zip, page, index, mtr);

		/* Update the lock table and possible hash index. */

		lock_move_rec_list_end(father_block, block,
				       page_get_infimum_rec(page));

		btr_search_move_or_delete_hash_entries(father_block, block,
						       index);
	}

	lock_update_copy_and_discard(father_block, block);

	/* Go upward to root page, decrementing levels by one. */
	for (i = 0; i < n_blocks; i++, page_level++) {
		page_t*		page	= buf_block_get_frame(blocks[i]);
		page_zip_des_t*	page_zip= buf_block_get_page_zip(blocks[i]);

		ut_ad(btr_page_get_level(page, mtr) == page_level + 1);

		btr_page_set_level(page, page_zip, page_level, mtr);
#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
	}

	/* Free the file page */
	btr_page_free(index, block, mtr);

	/* We play it safe and reset the free bits for the father */
	if (!dict_index_is_clust(index)) {
		ibuf_reset_free_bits(father_block);
	}
	ut_ad(page_validate(father_page, index));
	ut_ad(btr_check_node_ptr(index, father_block, mtr));
}
 
2484
 
 
2485
/*************************************************************//**
Tries to merge the page first to the left immediate brother if such a
brother exists, and the node pointers to the current page and to the brother
reside on the same page. If the left brother does not satisfy these
conditions, looks at the right brother. If the page is the only one on that
level lifts the records of the page to the father page, thus reducing the
tree height. It is assumed that mtr holds an x-latch on the tree and on the
page. If cursor is on the leaf level, mtr must also hold x-latches to the
brothers, if they exist.
@return TRUE on success */
UNIV_INTERN
ibool
btr_compress(
/*=========*/
	btr_cur_t*	cursor,	/*!< in: cursor on the page to merge or lift;
				the page must not be empty: in record delete
				use btr_discard_page if the page would become
				empty */
	mtr_t*		mtr)	/*!< in: mtr */
{
	dict_index_t*	index;
	ulint		space;
	ulint		zip_size;
	ulint		left_page_no;
	ulint		right_page_no;
	buf_block_t*	merge_block;
	page_t*		merge_page;
	page_zip_des_t*	merge_page_zip;
	ibool		is_left;
	buf_block_t*	block;
	page_t*		page;
	btr_cur_t	father_cursor;
	mem_heap_t*	heap;
	ulint*		offsets;
	ulint		data_size;
	ulint		n_recs;
	ulint		max_ins_size;
	ulint		max_ins_size_reorg;
	ulint		level;

	block = btr_cur_get_block(cursor);
	page = btr_cur_get_page(cursor);
	index = btr_cur_get_index(cursor);
	ut_a((ibool) !!page_is_comp(page) == dict_table_is_comp(index->table));

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	/* NOTE(review): level is computed here but never referenced again
	in this function. */
	level = btr_page_get_level(page, mtr);
	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);

	left_page_no = btr_page_get_prev(page, mtr);
	right_page_no = btr_page_get_next(page, mtr);

#if 0
	fprintf(stderr, "Merge left page %lu right %lu \n",
		left_page_no, right_page_no);
#endif

	heap = mem_heap_create(100);
	offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
					    &father_cursor);

	/* Decide the page to which we try to merge and which will inherit
	the locks */

	is_left = left_page_no != FIL_NULL;

	if (is_left) {

		merge_block = btr_block_get(space, zip_size, left_page_no,
					    RW_X_LATCH, mtr);
		merge_page = buf_block_get_frame(merge_block);
#ifdef UNIV_BTR_DEBUG
		ut_a(btr_page_get_next(merge_page, mtr)
		     == buf_block_get_page_no(block));
#endif /* UNIV_BTR_DEBUG */
	} else if (right_page_no != FIL_NULL) {

		merge_block = btr_block_get(space, zip_size, right_page_no,
					    RW_X_LATCH, mtr);
		merge_page = buf_block_get_frame(merge_block);
#ifdef UNIV_BTR_DEBUG
		ut_a(btr_page_get_prev(merge_page, mtr)
		     == buf_block_get_page_no(block));
#endif /* UNIV_BTR_DEBUG */
	} else {
		/* The page is the only one on the level, lift the records
		to the father */
		btr_lift_page_up(index, block, mtr);
		mem_heap_free(heap);
		return(TRUE);
	}

	n_recs = page_get_n_recs(page);
	data_size = page_get_data_size(page);
#ifdef UNIV_BTR_DEBUG
	ut_a(page_is_comp(merge_page) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */

	/* Check that all the records of this page would fit on the merge
	page, at least after a reorganization of the latter. */
	max_ins_size_reorg = page_get_max_insert_size_after_reorganize(
		merge_page, n_recs);
	if (data_size > max_ins_size_reorg) {

		/* No space for merge */
err_exit:
		/* We play it safe and reset the free bits. */
		if (zip_size
		    && page_is_leaf(merge_page)
		    && !dict_index_is_clust(index)) {
			ibuf_reset_free_bits(merge_block);
		}

		mem_heap_free(heap);
		return(FALSE);
	}

	ut_ad(page_validate(merge_page, index));

	max_ins_size = page_get_max_insert_size(merge_page, n_recs);

	if (UNIV_UNLIKELY(data_size > max_ins_size)) {

		/* We have to reorganize merge_page */

		if (UNIV_UNLIKELY(!btr_page_reorganize(merge_block,
						       index, mtr))) {

			goto err_exit;
		}

		max_ins_size = page_get_max_insert_size(merge_page, n_recs);

		ut_ad(page_validate(merge_page, index));
		ut_ad(max_ins_size == max_ins_size_reorg);

		if (UNIV_UNLIKELY(data_size > max_ins_size)) {

			/* Add fault tolerance, though this should
			never happen */

			goto err_exit;
		}
	}

	merge_page_zip = buf_block_get_page_zip(merge_block);
#ifdef UNIV_ZIP_DEBUG
	if (UNIV_LIKELY_NULL(merge_page_zip)) {
		const page_zip_des_t*	page_zip
			= buf_block_get_page_zip(block);
		ut_a(page_zip);
		ut_a(page_zip_validate(merge_page_zip, merge_page));
		ut_a(page_zip_validate(page_zip, page));
	}
#endif /* UNIV_ZIP_DEBUG */

	/* Move records to the merge page */
	if (is_left) {
		/* Merging into the left brother: copy all records of this
		page to the end of the left page. */
		rec_t*	orig_pred = page_copy_rec_list_start(
			merge_block, block, page_get_supremum_rec(page),
			index, mtr);

		if (UNIV_UNLIKELY(!orig_pred)) {
			goto err_exit;
		}

		btr_search_drop_page_hash_index(block);

		/* Remove the page from the level list */
		btr_level_list_remove(space, zip_size, page, mtr);

		btr_node_ptr_delete(index, block, mtr);
		lock_update_merge_left(merge_block, orig_pred, block);
	} else {
		/* Merging into the right brother: copy all records of this
		page to the beginning of the right page. */
		rec_t*		orig_succ;
#ifdef UNIV_BTR_DEBUG
		byte		fil_page_prev[4];
#endif /* UNIV_BTR_DEBUG */

		if (UNIV_LIKELY_NULL(merge_page_zip)) {
			/* The function page_zip_compress(), which will be
			invoked by page_copy_rec_list_end() below,
			requires that FIL_PAGE_PREV be FIL_NULL.
			Clear the field, but prepare to restore it. */
#ifdef UNIV_BTR_DEBUG
			memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
#endif /* UNIV_BTR_DEBUG */
#if FIL_NULL != 0xffffffff
# error "FIL_NULL != 0xffffffff"
#endif
			memset(merge_page + FIL_PAGE_PREV, 0xff, 4);
		}

		orig_succ = page_copy_rec_list_end(merge_block, block,
						   page_get_infimum_rec(page),
						   cursor->index, mtr);

		if (UNIV_UNLIKELY(!orig_succ)) {
			ut_a(merge_page_zip);
#ifdef UNIV_BTR_DEBUG
			/* FIL_PAGE_PREV was restored from merge_page_zip. */
			ut_a(!memcmp(fil_page_prev,
				     merge_page + FIL_PAGE_PREV, 4));
#endif /* UNIV_BTR_DEBUG */
			goto err_exit;
		}

		btr_search_drop_page_hash_index(block);

#ifdef UNIV_BTR_DEBUG
		if (UNIV_LIKELY_NULL(merge_page_zip)) {
			/* Restore FIL_PAGE_PREV in order to avoid an assertion
			failure in btr_level_list_remove(), which will set
			the field again to FIL_NULL.  Even though this makes
			merge_page and merge_page_zip inconsistent for a
			split second, it is harmless, because the pages
			are X-latched. */
			memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4);
		}
#endif /* UNIV_BTR_DEBUG */

		/* Remove the page from the level list */
		btr_level_list_remove(space, zip_size, page, mtr);

		/* Replace the address of the old child node (= page) with the
		address of the merge page to the right */

		btr_node_ptr_set_child_page_no(
			btr_cur_get_rec(&father_cursor),
			btr_cur_get_page_zip(&father_cursor),
			offsets, right_page_no, mtr);
		btr_node_ptr_delete(index, merge_block, mtr);

		lock_update_merge_right(merge_block, orig_succ, block);
	}

	mem_heap_free(heap);

	if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) {
		/* Update the free bits of the B-tree page in the
		insert buffer bitmap.  This has to be done in a
		separate mini-transaction that is committed before the
		main mini-transaction.  We cannot update the insert
		buffer bitmap in this mini-transaction, because
		btr_compress() can be invoked recursively without
		committing the mini-transaction in between.  Since
		insert buffer bitmap pages have a lower rank than
		B-tree pages, we must not access other pages in the
		same mini-transaction after accessing an insert buffer
		bitmap page. */

		/* The free bits in the insert buffer bitmap must
		never exceed the free space on a page.  It is safe to
		decrement or reset the bits in the bitmap in a
		mini-transaction that is committed before the
		mini-transaction that affects the free space. */

		/* It is unsafe to increment the bits in a separately
		committed mini-transaction, because in crash recovery,
		the free bits could momentarily be set too high. */

		if (zip_size) {
			/* Because the free bits may be incremented
			and we cannot update the insert buffer bitmap
			in the same mini-transaction, the only safe
			thing we can do here is the pessimistic
			approach: reset the free bits. */
			ibuf_reset_free_bits(merge_block);
		} else {
			/* On uncompressed pages, the free bits will
			never increase here.  Thus, it is safe to
			write the bits accurately in a separate
			mini-transaction. */
			ibuf_update_free_bits_if_full(merge_block,
						      UNIV_PAGE_SIZE,
						      ULINT_UNDEFINED);
		}
	}

	ut_ad(page_validate(merge_page, index));
#ifdef UNIV_ZIP_DEBUG
	ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page));
#endif /* UNIV_ZIP_DEBUG */

	/* Free the file page */
	btr_page_free(index, block, mtr);

	ut_ad(btr_check_node_ptr(index, merge_block, mtr));
	return(TRUE);
}
 
2776
 
 
2777
/*************************************************************//**
Discards a page that is the only page on its level.  This will empty
the whole B-tree, leaving just an empty root page.  This function
should never be reached, because btr_compress(), which is invoked in
delete operations, calls btr_lift_page_up() to flatten the B-tree. */
static
void
btr_discard_only_page_on_level(
/*===========================*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: page which is the only on its level */
	mtr_t*		mtr)	/*!< in: mtr */
{
	ulint		page_level = 0;
	trx_id_t	max_trx_id;

	/* Save the PAGE_MAX_TRX_ID from the leaf page. */
	max_trx_id = page_get_max_trx_id(buf_block_get_frame(block));

	/* Walk upward from the given page, freeing each page that is the
	only one on its level, until we reach the root. */
	while (buf_block_get_page_no(block) != dict_index_get_page(index)) {
		btr_cur_t	cursor;
		buf_block_t*	father;
		const page_t*	page	= buf_block_get_frame(block);

		/* Each discarded page must hold exactly one record (the
		node pointer chain down to the leaf) and have no siblings. */
		ut_a(page_get_n_recs(page) == 1);
		ut_a(page_level == btr_page_get_level(page, mtr));
		ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
		ut_a(btr_page_get_next(page, mtr) == FIL_NULL);

		ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
		btr_search_drop_page_hash_index(block);

		btr_page_get_father(index, block, mtr, &cursor);
		father = btr_cur_get_block(&cursor);

		lock_update_discard(father, PAGE_HEAP_NO_SUPREMUM, block);

		/* Free the file page */
		btr_page_free(index, block, mtr);

		block = father;
		page_level++;
	}

	/* block is the root page, which must be empty, except
	for the node pointer to the (now discarded) block(s). */

#ifdef UNIV_BTR_DEBUG
	if (!dict_index_is_ibuf(index)) {
		const page_t*	root	= buf_block_get_frame(block);
		const ulint	space	= dict_index_get_space(index);
		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
					    + root, space));
		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
					    + root, space));
	}
#endif /* UNIV_BTR_DEBUG */

	/* Reset the root to an empty leaf-level page (level 0). */
	btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr);

	if (!dict_index_is_clust(index)) {
		/* We play it safe and reset the free bits for the root */
		ibuf_reset_free_bits(block);

		/* Restore the PAGE_MAX_TRX_ID saved from the original
		leaf page, since the root is now the (empty) leaf. */
		if (page_is_leaf(buf_block_get_frame(block))) {
			ut_a(!ut_dulint_is_zero(max_trx_id));
			page_set_max_trx_id(block,
					    buf_block_get_page_zip(block),
					    max_trx_id, mtr);
		}
	}
}
 
2849
 
 
2850
/*************************************************************//**
Discards a page from a B-tree. This is used to remove the last record from
a B-tree page: the whole page must be removed at the same time. This cannot
be used for the root page, which is allowed to be empty. */
UNIV_INTERN
void
btr_discard_page(
/*=============*/
	btr_cur_t*	cursor,	/*!< in: cursor on the page to discard: not on
				the root page */
	mtr_t*		mtr)	/*!< in: mtr */
{
	dict_index_t*	index;
	ulint		space;
	ulint		zip_size;
	ulint		left_page_no;
	ulint		right_page_no;
	buf_block_t*	merge_block;	/* sibling that inherits the locks */
	page_t*		merge_page;
	buf_block_t*	block;
	page_t*		page;
	rec_t*		node_ptr;

	block = btr_cur_get_block(cursor);
	index = btr_cur_get_index(cursor);

	/* The caller must not pass the root page, and must hold an
	x-lock on the index tree plus an x-latch on the page. */
	ut_ad(dict_index_get_page(index) != buf_block_get_page_no(block));
	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);

	/* Decide the page which will inherit the locks */

	left_page_no = btr_page_get_prev(buf_block_get_frame(block), mtr);
	right_page_no = btr_page_get_next(buf_block_get_frame(block), mtr);

	if (left_page_no != FIL_NULL) {
		/* Prefer the left sibling as the lock-inheriting page. */
		merge_block = btr_block_get(space, zip_size, left_page_no,
					    RW_X_LATCH, mtr);
		merge_page = buf_block_get_frame(merge_block);
#ifdef UNIV_BTR_DEBUG
		ut_a(btr_page_get_next(merge_page, mtr)
		     == buf_block_get_page_no(block));
#endif /* UNIV_BTR_DEBUG */
	} else if (right_page_no != FIL_NULL) {
		merge_block = btr_block_get(space, zip_size, right_page_no,
					    RW_X_LATCH, mtr);
		merge_page = buf_block_get_frame(merge_block);
#ifdef UNIV_BTR_DEBUG
		ut_a(btr_page_get_prev(merge_page, mtr)
		     == buf_block_get_page_no(block));
#endif /* UNIV_BTR_DEBUG */
	} else {
		/* No siblings: this is the only page on its level, so
		a different discard procedure applies. */
		btr_discard_only_page_on_level(index, block, mtr);

		return;
	}

	page = buf_block_get_frame(block);
	ut_a(page_is_comp(merge_page) == page_is_comp(page));
	/* Invalidate any adaptive hash index entries for the page
	before it is freed. */
	btr_search_drop_page_hash_index(block);

	if (left_page_no == FIL_NULL && !page_is_leaf(page)) {

		/* We have to mark the leftmost node pointer on the right
		side page as the predefined minimum record */
		node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page));

		ut_ad(page_rec_is_user_rec(node_ptr));

		/* This will make page_zip_validate() fail on merge_page
		until btr_level_list_remove() completes.  This is harmless,
		because everything will take place within a single
		mini-transaction and because writing to the redo log
		is an atomic operation (performed by mtr_commit()). */
		btr_set_min_rec_mark(node_ptr, mtr);
	}

	/* Remove the father node pointer that points to this page. */
	btr_node_ptr_delete(index, block, mtr);

	/* Remove the page from the level list */
	btr_level_list_remove(space, zip_size, page, mtr);
#ifdef UNIV_ZIP_DEBUG
	{
		page_zip_des_t*	merge_page_zip
			= buf_block_get_page_zip(merge_block);
		ut_a(!merge_page_zip
		     || page_zip_validate(merge_page_zip, merge_page));
	}
#endif /* UNIV_ZIP_DEBUG */

	if (left_page_no != FIL_NULL) {
		/* Locks move to the supremum of the left sibling. */
		lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM,
				    block);
	} else {
		/* Locks move to the first user record of the right
		sibling. */
		lock_update_discard(merge_block,
				    lock_get_min_heap_no(merge_block),
				    block);
	}

	/* Free the file page */
	btr_page_free(index, block, mtr);

	ut_ad(btr_check_node_ptr(index, merge_block, mtr));
}
 
2957
 
 
2958
#ifdef UNIV_BTR_PRINT
 
2959
/*************************************************************//**
Prints size info of a B-tree. */
UNIV_INTERN
void
btr_print_size(
/*===========*/
	dict_index_t*	index)	/*!< in: index tree */
{
	page_t*		root;
	fseg_header_t*	seg;
	mtr_t		mtr;

	if (dict_index_is_ibuf(index)) {
		/* The insert buffer tree is managed by the ibuf module. */
		fputs("Sorry, cannot print info of an ibuf tree:"
		      " use ibuf functions\n", stderr);

		return;
	}

	mtr_start(&mtr);

	root = btr_root_get(index, &mtr);

	/* The file segment header for non-leaf pages is stored on the
	root page. */
	seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;

	fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
	fseg_print(seg, &mtr);

	if (!(index->type & DICT_UNIVERSAL)) {

		/* Non-universal indexes also keep a separate segment
		for the leaf pages. */
		seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;

		fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
		fseg_print(seg, &mtr);
	}

	mtr_commit(&mtr);
}
 
2997
 
 
2998
/************************************************************//**
Prints recursively index tree pages. */
static
void
btr_print_recursive(
/*================*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: index page */
	ulint		width,	/*!< in: print this many entries from start
				and end */
	mem_heap_t**	heap,	/*!< in/out: heap for rec_get_offsets() */
	ulint**		offsets,/*!< in/out: buffer for rec_get_offsets() */
	mtr_t*		mtr)	/*!< in: mtr */
{
	const page_t*	page	= buf_block_get_frame(block);
	page_cur_t	cursor;
	ulint		n_recs;
	ulint		i	= 0;
	mtr_t		mtr2;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n",
		(ulong) btr_page_get_level(page, mtr),
		(ulong) buf_block_get_page_no(block));

	page_print(block, index, width, width);

	n_recs = page_get_n_recs(page);

	/* Position the cursor on the first user record of the page. */
	page_cur_set_before_first(block, &cursor);
	page_cur_move_to_next(&cursor);

	while (!page_cur_is_after_last(&cursor)) {

		if (page_is_leaf(page)) {

			/* If this is the leaf level, do nothing */

		} else if ((i <= width) || (i >= n_recs - width)) {
			/* Only descend through the first and last
			'width' node pointers on this page. */

			const rec_t*	node_ptr;

			/* Use a nested mini-transaction for the child
			subtree so its latches are released when the
			recursive call returns. */
			mtr_start(&mtr2);

			node_ptr = page_cur_get_rec(&cursor);

			*offsets = rec_get_offsets(node_ptr, index, *offsets,
						   ULINT_UNDEFINED, heap);
			btr_print_recursive(index,
					    btr_node_ptr_get_child(node_ptr,
								   index,
								   *offsets,
								   &mtr2),
					    width, heap, offsets, &mtr2);
			mtr_commit(&mtr2);
		}

		page_cur_move_to_next(&cursor);
		i++;
	}
}
 
3059
 
 
3060
/**************************************************************//**
Prints directories and other info of all nodes in the tree. */
UNIV_INTERN
void
btr_print_index(
/*============*/
	dict_index_t*	index,	/*!< in: index */
	ulint		width)	/*!< in: print this many entries from start
				and end */
{
	mtr_t		mtr;
	buf_block_t*	root;
	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	rec_offs_init(offsets_);

	fputs("--------------------------\n"
	      "INDEX TREE PRINT\n", stderr);

	mtr_start(&mtr);

	root = btr_root_block_get(index, &mtr);

	btr_print_recursive(index, root, width, &heap, &offsets, &mtr);
	/* rec_get_offsets() allocates a heap only when the on-stack
	offsets_ array is too small; free it if that happened. */
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	mtr_commit(&mtr);

	/* Also validate the tree after printing it. */
	btr_validate_index(index, NULL);
}
 
3093
#endif /* UNIV_BTR_PRINT */
 
3094
 
 
3095
#ifdef UNIV_DEBUG
 
3096
/************************************************************//**
Checks that the node pointer to a page is appropriate.
@return TRUE */
UNIV_INTERN
ibool
btr_check_node_ptr(
/*===============*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: index page */
	mtr_t*		mtr)	/*!< in: mtr */
{
	mem_heap_t*	heap;
	dtuple_t*	tuple;
	ulint*		offsets;
	btr_cur_t	cursor;
	page_t*		page = buf_block_get_frame(block);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	if (dict_index_get_page(index) == buf_block_get_page_no(block)) {

		/* The root page has no father node pointer to check. */
		return(TRUE);
	}

	heap = mem_heap_create(256);
	/* Look up the father record; this asserts internally if the
	father cannot be found. */
	offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
					    &cursor);

	if (page_is_leaf(page)) {

		/* For a leaf page, finding the father above is the
		whole check. */
		goto func_exit;
	}

	/* Rebuild the node pointer tuple from the first user record on
	the page and require it to match the father record. */
	tuple = dict_index_build_node_ptr(
		index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap,
		btr_page_get_level(page, mtr));

	ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), offsets));
func_exit:
	mem_heap_free(heap);

	return(TRUE);
}
 
3138
#endif /* UNIV_DEBUG */
 
3139
 
 
3140
/************************************************************//**
 
3141
Display identification information for a record. */
 
3142
static
 
3143
void
 
3144
btr_index_rec_validate_report(
 
3145
/*==========================*/
 
3146
        const page_t*           page,   /*!< in: index page */
 
3147
        const rec_t*            rec,    /*!< in: index record */
 
3148
        const dict_index_t*     index)  /*!< in: index */
 
3149
{
 
3150
        fputs("InnoDB: Record in ", stderr);
 
3151
        dict_index_name_print(stderr, NULL, index);
 
3152
        fprintf(stderr, ", page %lu, at offset %lu\n",
 
3153
                page_get_page_no(page), (ulint) page_offset(rec));
 
3154
}
 
3155
 
 
3156
/************************************************************//**
Checks the size and number of fields in a record based on the definition of
the index.
@return TRUE if ok */
UNIV_INTERN
ibool
btr_index_rec_validate(
/*===================*/
	const rec_t*		rec,		/*!< in: index record */
	const dict_index_t*	index,		/*!< in: index */
	ibool			dump_on_error)	/*!< in: TRUE if the function
						should print hex dump of record
						and page on error */
{
	ulint		len;
	ulint		n;
	ulint		i;
	const page_t*	page;
	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	rec_offs_init(offsets_);

	page = page_align(rec);

	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
		/* The insert buffer index tree can contain records from any
		other index: we cannot check the number of fields or
		their length */

		return(TRUE);
	}

	/* The record format (compact or old-style) must agree with the
	table's format flag. */
	if (UNIV_UNLIKELY((ibool)!!page_is_comp(page)
			  != dict_table_is_comp(index->table))) {
		btr_index_rec_validate_report(page, rec, index);
		fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n",
			(ulong) !!page_is_comp(page),
			(ulong) dict_table_is_comp(index->table));

		return(FALSE);
	}

	n = dict_index_get_n_fields(index);

	/* Old-style records store their own field count; it must match
	the index definition. */
	if (!page_is_comp(page)
	    && UNIV_UNLIKELY(rec_get_n_fields_old(rec) != n)) {
		btr_index_rec_validate_report(page, rec, index);
		fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n",
			(ulong) rec_get_n_fields_old(rec), (ulong) n);

		if (dump_on_error) {
			buf_page_print(page, 0);

			fputs("InnoDB: corrupt record ", stderr);
			rec_print_old(stderr, rec);
			putc('\n', stderr);
		}
		return(FALSE);
	}

	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

	/* Validate the stored length of every field against the index
	definition. */
	for (i = 0; i < n; i++) {
		ulint	fixed_size = dict_col_get_fixed_size(
			dict_index_get_nth_col(index, i), page_is_comp(page));

		rec_get_nth_field_offs(offsets, i, &len);

		/* Note that if fixed_size != 0, it equals the
		length of a fixed-size column in the clustered index.
		A prefix index of the column is of fixed, but different
		length.  When fixed_size == 0, prefix_len is the maximum
		length of the prefix index column. */

		if ((dict_index_get_nth_field(index, i)->prefix_len == 0
		     && len != UNIV_SQL_NULL && fixed_size
		     && len != fixed_size)
		    || (dict_index_get_nth_field(index, i)->prefix_len > 0
			&& len != UNIV_SQL_NULL
			&& len
			> dict_index_get_nth_field(index, i)->prefix_len)) {

			btr_index_rec_validate_report(page, rec, index);
			fprintf(stderr,
				"InnoDB: field %lu len is %lu,"
				" should be %lu\n",
				(ulong) i, (ulong) len, (ulong) fixed_size);

			if (dump_on_error) {
				buf_page_print(page, 0);

				fputs("InnoDB: corrupt record ", stderr);
				rec_print_new(stderr, rec, offsets);
				putc('\n', stderr);
			}
			/* Free the offsets heap before the early
			return, if rec_get_offsets() allocated one. */
			if (UNIV_LIKELY_NULL(heap)) {
				mem_heap_free(heap);
			}
			return(FALSE);
		}
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(TRUE);
}
 
3264
 
 
3265
/************************************************************//**
 
3266
Checks the size and number of fields in records based on the definition of
 
3267
the index.
 
3268
@return TRUE if ok */
 
3269
static
 
3270
ibool
 
3271
btr_index_page_validate(
 
3272
/*====================*/
 
3273
        buf_block_t*    block,  /*!< in: index page */
 
3274
        dict_index_t*   index)  /*!< in: index */
 
3275
{
 
3276
        page_cur_t      cur;
 
3277
        ibool           ret     = TRUE;
 
3278
 
 
3279
        page_cur_set_before_first(block, &cur);
 
3280
        page_cur_move_to_next(&cur);
 
3281
 
 
3282
        for (;;) {
 
3283
                if (page_cur_is_after_last(&cur)) {
 
3284
 
 
3285
                        break;
 
3286
                }
 
3287
 
 
3288
                if (!btr_index_rec_validate(cur.rec, index, TRUE)) {
 
3289
 
 
3290
                        return(FALSE);
 
3291
                }
 
3292
 
 
3293
                page_cur_move_to_next(&cur);
 
3294
        }
 
3295
 
 
3296
        return(ret);
 
3297
}
 
3298
 
 
3299
/************************************************************//**
 
3300
Report an error on one page of an index tree. */
 
3301
static
 
3302
void
 
3303
btr_validate_report1(
 
3304
/*=================*/
 
3305
        dict_index_t*           index,  /*!< in: index */
 
3306
        ulint                   level,  /*!< in: B-tree level */
 
3307
        const buf_block_t*      block)  /*!< in: index page */
 
3308
{
 
3309
        fprintf(stderr, "InnoDB: Error in page %lu of ",
 
3310
                buf_block_get_page_no(block));
 
3311
        dict_index_name_print(stderr, NULL, index);
 
3312
        if (level) {
 
3313
                fprintf(stderr, ", index tree level %lu", level);
 
3314
        }
 
3315
        putc('\n', stderr);
 
3316
}
 
3317
 
 
3318
/************************************************************//**
 
3319
Report an error on two pages of an index tree. */
 
3320
static
 
3321
void
 
3322
btr_validate_report2(
 
3323
/*=================*/
 
3324
        const dict_index_t*     index,  /*!< in: index */
 
3325
        ulint                   level,  /*!< in: B-tree level */
 
3326
        const buf_block_t*      block1, /*!< in: first index page */
 
3327
        const buf_block_t*      block2) /*!< in: second index page */
 
3328
{
 
3329
        fprintf(stderr, "InnoDB: Error in pages %lu and %lu of ",
 
3330
                buf_block_get_page_no(block1),
 
3331
                buf_block_get_page_no(block2));
 
3332
        dict_index_name_print(stderr, NULL, index);
 
3333
        if (level) {
 
3334
                fprintf(stderr, ", index tree level %lu", level);
 
3335
        }
 
3336
        putc('\n', stderr);
 
3337
}
 
3338
 
 
3339
/************************************************************//**
 
3340
Validates index tree level.
 
3341
@return TRUE if ok */
 
3342
static
 
3343
ibool
 
3344
btr_validate_level(
 
3345
/*===============*/
 
3346
        dict_index_t*   index,  /*!< in: index tree */
 
3347
        trx_t*          trx,    /*!< in: transaction or NULL */
 
3348
        ulint           level)  /*!< in: level number */
 
3349
{
 
3350
        ulint           space;
 
3351
        ulint           zip_size;
 
3352
        buf_block_t*    block;
 
3353
        page_t*         page;
 
3354
        buf_block_t*    right_block = 0; /* remove warning */
 
3355
        page_t*         right_page = 0; /* remove warning */
 
3356
        page_t*         father_page;
 
3357
        btr_cur_t       node_cur;
 
3358
        btr_cur_t       right_node_cur;
 
3359
        rec_t*          rec;
 
3360
        ulint           right_page_no;
 
3361
        ulint           left_page_no;
 
3362
        page_cur_t      cursor;
 
3363
        dtuple_t*       node_ptr_tuple;
 
3364
        ibool           ret     = TRUE;
 
3365
        mtr_t           mtr;
 
3366
        mem_heap_t*     heap    = mem_heap_create(256);
 
3367
        ulint*          offsets = NULL;
 
3368
        ulint*          offsets2= NULL;
 
3369
#ifdef UNIV_ZIP_DEBUG
 
3370
        page_zip_des_t* page_zip;
 
3371
#endif /* UNIV_ZIP_DEBUG */
 
3372
 
 
3373
        mtr_start(&mtr);
 
3374
 
 
3375
        mtr_x_lock(dict_index_get_lock(index), &mtr);
 
3376
 
 
3377
        block = btr_root_block_get(index, &mtr);
 
3378
        page = buf_block_get_frame(block);
 
3379
 
 
3380
        space = dict_index_get_space(index);
 
3381
        zip_size = dict_table_zip_size(index->table);
 
3382
 
 
3383
        while (level != btr_page_get_level(page, &mtr)) {
 
3384
                const rec_t*    node_ptr;
 
3385
 
 
3386
                ut_a(space == buf_block_get_space(block));
 
3387
                ut_a(space == page_get_space_id(page));
 
3388
#ifdef UNIV_ZIP_DEBUG
 
3389
                page_zip = buf_block_get_page_zip(block);
 
3390
                ut_a(!page_zip || page_zip_validate(page_zip, page));
 
3391
#endif /* UNIV_ZIP_DEBUG */
 
3392
                ut_a(!page_is_leaf(page));
 
3393
 
 
3394
                page_cur_set_before_first(block, &cursor);
 
3395
                page_cur_move_to_next(&cursor);
 
3396
 
 
3397
                node_ptr = page_cur_get_rec(&cursor);
 
3398
                offsets = rec_get_offsets(node_ptr, index, offsets,
 
3399
                                          ULINT_UNDEFINED, &heap);
 
3400
                block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr);
 
3401
                page = buf_block_get_frame(block);
 
3402
        }
 
3403
 
 
3404
        /* Now we are on the desired level. Loop through the pages on that
 
3405
        level. */
 
3406
loop:
 
3407
        if (trx_is_interrupted(trx)) {
 
3408
                mtr_commit(&mtr);
 
3409
                mem_heap_free(heap);
 
3410
                return(ret);
 
3411
        }
 
3412
        mem_heap_empty(heap);
 
3413
        offsets = offsets2 = NULL;
 
3414
        mtr_x_lock(dict_index_get_lock(index), &mtr);
 
3415
 
 
3416
#ifdef UNIV_ZIP_DEBUG
 
3417
        page_zip = buf_block_get_page_zip(block);
 
3418
        ut_a(!page_zip || page_zip_validate(page_zip, page));
 
3419
#endif /* UNIV_ZIP_DEBUG */
 
3420
 
 
3421
        /* Check ordering etc. of records */
 
3422
 
 
3423
        if (!page_validate(page, index)) {
 
3424
                btr_validate_report1(index, level, block);
 
3425
 
 
3426
                ret = FALSE;
 
3427
        } else if (level == 0) {
 
3428
                /* We are on level 0. Check that the records have the right
 
3429
                number of fields, and field lengths are right. */
 
3430
 
 
3431
                if (!btr_index_page_validate(block, index)) {
 
3432
 
 
3433
                        ret = FALSE;
 
3434
                }
 
3435
        }
 
3436
 
 
3437
        ut_a(btr_page_get_level(page, &mtr) == level);
 
3438
 
 
3439
        right_page_no = btr_page_get_next(page, &mtr);
 
3440
        left_page_no = btr_page_get_prev(page, &mtr);
 
3441
 
 
3442
        ut_a(page_get_n_recs(page) > 0 || (level == 0
 
3443
                                           && page_get_page_no(page)
 
3444
                                           == dict_index_get_page(index)));
 
3445
 
 
3446
        if (right_page_no != FIL_NULL) {
 
3447
                const rec_t*    right_rec;
 
3448
                right_block = btr_block_get(space, zip_size, right_page_no,
 
3449
                                            RW_X_LATCH, &mtr);
 
3450
                right_page = buf_block_get_frame(right_block);
 
3451
                if (UNIV_UNLIKELY(btr_page_get_prev(right_page, &mtr)
 
3452
                                  != page_get_page_no(page))) {
 
3453
                        btr_validate_report2(index, level, block, right_block);
 
3454
                        fputs("InnoDB: broken FIL_PAGE_NEXT"
 
3455
                              " or FIL_PAGE_PREV links\n", stderr);
 
3456
                        buf_page_print(page, 0);
 
3457
                        buf_page_print(right_page, 0);
 
3458
 
 
3459
                        ret = FALSE;
 
3460
                }
 
3461
 
 
3462
                if (UNIV_UNLIKELY(page_is_comp(right_page)
 
3463
                                  != page_is_comp(page))) {
 
3464
                        btr_validate_report2(index, level, block, right_block);
 
3465
                        fputs("InnoDB: 'compact' flag mismatch\n", stderr);
 
3466
                        buf_page_print(page, 0);
 
3467
                        buf_page_print(right_page, 0);
 
3468
 
 
3469
                        ret = FALSE;
 
3470
 
 
3471
                        goto node_ptr_fails;
 
3472
                }
 
3473
 
 
3474
                rec = page_rec_get_prev(page_get_supremum_rec(page));
 
3475
                right_rec = page_rec_get_next(page_get_infimum_rec(
 
3476
                                                      right_page));
 
3477
                offsets = rec_get_offsets(rec, index,
 
3478
                                          offsets, ULINT_UNDEFINED, &heap);
 
3479
                offsets2 = rec_get_offsets(right_rec, index,
 
3480
                                           offsets2, ULINT_UNDEFINED, &heap);
 
3481
                if (UNIV_UNLIKELY(cmp_rec_rec(rec, right_rec,
 
3482
                                              offsets, offsets2,
 
3483
                                              index) >= 0)) {
 
3484
 
 
3485
                        btr_validate_report2(index, level, block, right_block);
 
3486
 
 
3487
                        fputs("InnoDB: records in wrong order"
 
3488
                              " on adjacent pages\n", stderr);
 
3489
 
 
3490
                        buf_page_print(page, 0);
 
3491
                        buf_page_print(right_page, 0);
 
3492
 
 
3493
                        fputs("InnoDB: record ", stderr);
 
3494
                        rec = page_rec_get_prev(page_get_supremum_rec(page));
 
3495
                        rec_print(stderr, rec, index);
 
3496
                        putc('\n', stderr);
 
3497
                        fputs("InnoDB: record ", stderr);
 
3498
                        rec = page_rec_get_next(
 
3499
                                page_get_infimum_rec(right_page));
 
3500
                        rec_print(stderr, rec, index);
 
3501
                        putc('\n', stderr);
 
3502
 
 
3503
                        ret = FALSE;
 
3504
                }
 
3505
        }
 
3506
 
 
3507
        if (level > 0 && left_page_no == FIL_NULL) {
 
3508
                ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
 
3509
                             page_rec_get_next(page_get_infimum_rec(page)),
 
3510
                             page_is_comp(page)));
 
3511
        }
 
3512
 
 
3513
        if (buf_block_get_page_no(block) != dict_index_get_page(index)) {
 
3514
 
 
3515
                /* Check father node pointers */
 
3516
 
 
3517
                rec_t*  node_ptr;
 
3518
 
 
3519
                offsets = btr_page_get_father_block(offsets, heap, index,
 
3520
                                                    block, &mtr, &node_cur);
 
3521
                father_page = btr_cur_get_page(&node_cur);
 
3522
                node_ptr = btr_cur_get_rec(&node_cur);
 
3523
 
 
3524
                btr_cur_position(
 
3525
                        index, page_rec_get_prev(page_get_supremum_rec(page)),
 
3526
                        block, &node_cur);
 
3527
                offsets = btr_page_get_father_node_ptr(offsets, heap,
 
3528
                                                       &node_cur, &mtr);
 
3529
 
 
3530
                if (UNIV_UNLIKELY(node_ptr != btr_cur_get_rec(&node_cur))
 
3531
                    || UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr,
 
3532
                                                                    offsets)
 
3533
                                     != buf_block_get_page_no(block))) {
 
3534
 
 
3535
                        btr_validate_report1(index, level, block);
 
3536
 
 
3537
                        fputs("InnoDB: node pointer to the page is wrong\n",
 
3538
                              stderr);
 
3539
 
 
3540
                        buf_page_print(father_page, 0);
 
3541
                        buf_page_print(page, 0);
 
3542
 
 
3543
                        fputs("InnoDB: node ptr ", stderr);
 
3544
                        rec_print(stderr, node_ptr, index);
 
3545
 
 
3546
                        rec = btr_cur_get_rec(&node_cur);
 
3547
                        fprintf(stderr, "\n"
 
3548
                                "InnoDB: node ptr child page n:o %lu\n",
 
3549
                                (ulong) btr_node_ptr_get_child_page_no(
 
3550
                                        rec, offsets));
 
3551
 
 
3552
                        fputs("InnoDB: record on page ", stderr);
 
3553
                        rec_print_new(stderr, rec, offsets);
 
3554
                        putc('\n', stderr);
 
3555
                        ret = FALSE;
 
3556
 
 
3557
                        goto node_ptr_fails;
 
3558
                }
 
3559
 
 
3560
                if (!page_is_leaf(page)) {
 
3561
                        node_ptr_tuple = dict_index_build_node_ptr(
 
3562
                                index,
 
3563
                                page_rec_get_next(page_get_infimum_rec(page)),
 
3564
                                0, heap, btr_page_get_level(page, &mtr));
 
3565
 
 
3566
                        if (cmp_dtuple_rec(node_ptr_tuple, node_ptr,
 
3567
                                           offsets)) {
 
3568
                                const rec_t* first_rec = page_rec_get_next(
 
3569
                                        page_get_infimum_rec(page));
 
3570
 
 
3571
                                btr_validate_report1(index, level, block);
 
3572
 
 
3573
                                buf_page_print(father_page, 0);
 
3574
                                buf_page_print(page, 0);
 
3575
 
 
3576
                                fputs("InnoDB: Error: node ptrs differ"
 
3577
                                      " on levels > 0\n"
 
3578
                                      "InnoDB: node ptr ", stderr);
 
3579
                                rec_print_new(stderr, node_ptr, offsets);
 
3580
                                fputs("InnoDB: first rec ", stderr);
 
3581
                                rec_print(stderr, first_rec, index);
 
3582
                                putc('\n', stderr);
 
3583
                                ret = FALSE;
 
3584
 
 
3585
                                goto node_ptr_fails;
 
3586
                        }
 
3587
                }
 
3588
 
 
3589
                if (left_page_no == FIL_NULL) {
 
3590
                        ut_a(node_ptr == page_rec_get_next(
 
3591
                                     page_get_infimum_rec(father_page)));
 
3592
                        ut_a(btr_page_get_prev(father_page, &mtr) == FIL_NULL);
 
3593
                }
 
3594
 
 
3595
                if (right_page_no == FIL_NULL) {
 
3596
                        ut_a(node_ptr == page_rec_get_prev(
 
3597
                                     page_get_supremum_rec(father_page)));
 
3598
                        ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL);
 
3599
                } else {
 
3600
                        const rec_t*    right_node_ptr
 
3601
                                = page_rec_get_next(node_ptr);
 
3602
 
 
3603
                        offsets = btr_page_get_father_block(
 
3604
                                offsets, heap, index, right_block,
 
3605
                                &mtr, &right_node_cur);
 
3606
                        if (right_node_ptr
 
3607
                            != page_get_supremum_rec(father_page)) {
 
3608
 
 
3609
                                if (btr_cur_get_rec(&right_node_cur)
 
3610
                                    != right_node_ptr) {
 
3611
                                        ret = FALSE;
 
3612
                                        fputs("InnoDB: node pointer to"
 
3613
                                              " the right page is wrong\n",
 
3614
                                              stderr);
 
3615
 
 
3616
                                        btr_validate_report1(index, level,
 
3617
                                                             block);
 
3618
 
 
3619
                                        buf_page_print(father_page, 0);
 
3620
                                        buf_page_print(page, 0);
 
3621
                                        buf_page_print(right_page, 0);
 
3622
                                }
 
3623
                        } else {
 
3624
                                page_t* right_father_page
 
3625
                                        = btr_cur_get_page(&right_node_cur);
 
3626
 
 
3627
                                if (btr_cur_get_rec(&right_node_cur)
 
3628
                                    != page_rec_get_next(
 
3629
                                            page_get_infimum_rec(
 
3630
                                                    right_father_page))) {
 
3631
                                        ret = FALSE;
 
3632
                                        fputs("InnoDB: node pointer 2 to"
 
3633
                                              " the right page is wrong\n",
 
3634
                                              stderr);
 
3635
 
 
3636
                                        btr_validate_report1(index, level,
 
3637
                                                             block);
 
3638
 
 
3639
                                        buf_page_print(father_page, 0);
 
3640
                                        buf_page_print(right_father_page, 0);
 
3641
                                        buf_page_print(page, 0);
 
3642
                                        buf_page_print(right_page, 0);
 
3643
                                }
 
3644
 
 
3645
                                if (page_get_page_no(right_father_page)
 
3646
                                    != btr_page_get_next(father_page, &mtr)) {
 
3647
 
 
3648
                                        ret = FALSE;
 
3649
                                        fputs("InnoDB: node pointer 3 to"
 
3650
                                              " the right page is wrong\n",
 
3651
                                              stderr);
 
3652
 
 
3653
                                        btr_validate_report1(index, level,
 
3654
                                                             block);
 
3655
 
 
3656
                                        buf_page_print(father_page, 0);
 
3657
                                        buf_page_print(right_father_page, 0);
 
3658
                                        buf_page_print(page, 0);
 
3659
                                        buf_page_print(right_page, 0);
 
3660
                                }
 
3661
                        }
 
3662
                }
 
3663
        }
 
3664
 
 
3665
node_ptr_fails:
 
3666
        /* Commit the mini-transaction to release the latch on 'page'.
 
3667
        Re-acquire the latch on right_page, which will become 'page'
 
3668
        on the next loop.  The page has already been checked. */
 
3669
        mtr_commit(&mtr);
 
3670
 
 
3671
        if (right_page_no != FIL_NULL) {
 
3672
                mtr_start(&mtr);
 
3673
 
 
3674
                block = btr_block_get(space, zip_size, right_page_no,
 
3675
                                      RW_X_LATCH, &mtr);
 
3676
                page = buf_block_get_frame(block);
 
3677
 
 
3678
                goto loop;
 
3679
        }
 
3680
 
 
3681
        mem_heap_free(heap);
 
3682
        return(ret);
 
3683
}
 
3684
 
 
3685
/**************************************************************//**
 
3686
Checks the consistency of an index tree.
 
3687
@return TRUE if ok */
 
3688
UNIV_INTERN
 
3689
ibool
 
3690
btr_validate_index(
 
3691
/*===============*/
 
3692
        dict_index_t*   index,  /*!< in: index */
 
3693
        trx_t*          trx)    /*!< in: transaction or NULL */
 
3694
{
 
3695
        mtr_t   mtr;
 
3696
        page_t* root;
 
3697
        ulint   i;
 
3698
        ulint   n;
 
3699
 
 
3700
        mtr_start(&mtr);
 
3701
        mtr_x_lock(dict_index_get_lock(index), &mtr);
 
3702
 
 
3703
        root = btr_root_get(index, &mtr);
 
3704
        n = btr_page_get_level(root, &mtr);
 
3705
 
 
3706
        for (i = 0; i <= n && !trx_is_interrupted(trx); i++) {
 
3707
                if (!btr_validate_level(index, trx, n - i)) {
 
3708
 
 
3709
                        mtr_commit(&mtr);
 
3710
 
 
3711
                        return(FALSE);
 
3712
                }
 
3713
        }
 
3714
 
 
3715
        mtr_commit(&mtr);
 
3716
 
 
3717
        return(TRUE);
 
3718
}
 
3719
#endif /* !UNIV_HOTBACKUP */