/*****************************************************************************

Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.

Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA

*****************************************************************************/

/**************************************************//**
@file btr/btr0cur.c
The index tree cursor

All changes that row operations make to a B-tree or the records
there must go through this module! Undo log records are written here
for every modify or insert of a clustered index record.

			NOTE!!!
To make sure we do not run out of disk space during a pessimistic
insert or update, we have to reserve 2 x the height of the index tree
many pages in the tablespace before we start the operation, because
if leaf splitting has been started, it is difficult to undo, except
by crashing the database and doing a roll-forward.

Created 10/16/1994 Heikki Tuuri
*******************************************************/

#include "btr0cur.h"

#ifdef UNIV_NONINL
#include "btr0cur.ic"
#endif

#include "row0upd.h"
#ifndef UNIV_HOTBACKUP
#include "mtr0log.h"
#include "page0page.h"
#include "page0zip.h"
#include "rem0rec.h"
#include "rem0cmp.h"
#include "buf0lru.h"
#include "btr0btr.h"
#include "btr0sea.h"
#include "trx0rec.h"
#include "trx0roll.h" /* trx_is_recv() */
#include "que0que.h"
#include "row0row.h"
#include "srv0srv.h"
#include "ibuf0ibuf.h"
#include "lock0lock.h"
#include "zlib.h"

#ifdef UNIV_DEBUG
/** If the following is set to TRUE, this module prints a lot of
trace information of individual record operations */
UNIV_INTERN ibool	btr_cur_print_record_ops = FALSE;
#endif /* UNIV_DEBUG */

/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
UNIV_INTERN ulint	btr_cur_n_non_sea	= 0;
/** Number of successful adaptive hash index lookups in
btr_cur_search_to_nth_level(). */
UNIV_INTERN ulint	btr_cur_n_sea		= 0;
/** Old value of btr_cur_n_non_sea.  Copied by
srv_refresh_innodb_monitor_stats().  Referenced by
srv_printf_innodb_monitor(). */
UNIV_INTERN ulint	btr_cur_n_non_sea_old	= 0;
/** Old value of btr_cur_n_sea.  Copied by
srv_refresh_innodb_monitor_stats().  Referenced by
srv_printf_innodb_monitor(). */
UNIV_INTERN ulint	btr_cur_n_sea_old	= 0;

/** In the optimistic insert, if the insert does not fit, but this much space
can be released by page reorganize, then it is reorganized */
#define BTR_CUR_PAGE_REORGANIZE_LIMIT	(UNIV_PAGE_SIZE / 32)
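/* With the default UNIV_PAGE_SIZE of 16 kB, this limit is 512 bytes. */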

/** The structure of a BLOB part header */
/* @{ */
/*--------------------------------------*/
#define BTR_BLOB_HDR_PART_LEN		0	/*!< BLOB part len on this
						page */
#define BTR_BLOB_HDR_NEXT_PAGE_NO	4	/*!< next BLOB part page no,
						FIL_NULL if none */
/*--------------------------------------*/
#define BTR_BLOB_HDR_SIZE		8	/*!< Size of a BLOB
						part header, in bytes */
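/* A BLOB part header is thus 4 bytes of part length followed by
4 bytes of next-page number. */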

/** Estimates table-level stats from a sampled value.
@param value		sampled stats
@param index		index being sampled
@param sample		number of sampled rows
@param ext_size		externally stored data size
@param not_empty	table not empty
@return estimated table-wide stats from the sampled value */
#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty)\
	(((value) * (ib_int64_t) index->stat_n_leaf_pages		\
	  + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
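/* For example, a value of 10 collected over a sample of 8 on an index
with 100 leaf pages and no externally stored data scales to
(10 * 100 + 8 - 1) / 8 = 125. */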

/* @} */
#endif /* !UNIV_HOTBACKUP */

/** A BLOB field reference full of zero, for use in assertions and tests.
Initially, BLOB field references are set to zero, in
dtuple_convert_big_rec(). */
UNIV_INTERN const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE];

#ifndef UNIV_HOTBACKUP
/*******************************************************************//**
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a not delete
marked record always owns all its extern fields. */
static
void
btr_cur_unmark_extern_fields(
/*=========================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	rec_t*		rec,	/*!< in/out: record in a clustered index */
	dict_index_t*	index,	/*!< in: index of the page */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	mtr_t*		mtr);	/*!< in: mtr, or NULL if not logged */
/*******************************************************************//**
Adds path information to the cursor for the current page, for which
the binary search has been performed. */
static
void
btr_cur_add_path_info(
/*==================*/
	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
	ulint		height,		/*!< in: height of the page in tree;
					0 means leaf node */
	ulint		root_height);	/*!< in: root node height in tree */
/***********************************************************//**
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
static
void
btr_rec_free_updated_extern_fields(
/*===============================*/
	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
				X-latched */
	rec_t*		rec,	/*!< in: record */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,	/*!< in: update vector */
	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the tree */
/***********************************************************//**
Frees the externally stored fields for a record. */
static
void
btr_rec_free_externally_stored_fields(
/*==================================*/
	dict_index_t*	index,	/*!< in: index of the data, the index
				tree MUST be X-latched */
	rec_t*		rec,	/*!< in: record */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the index
				tree */
/***********************************************************//**
Gets the externally stored size of a record, in units of a database page.
@return	externally stored part, in units of a database page */
static
ulint
btr_rec_get_externally_stored_len(
/*==============================*/
	const rec_t*	rec,	/*!< in: record */
	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
#endif /* !UNIV_HOTBACKUP */

/******************************************************//**
The following function is used to set the deleted bit of a record. */
UNIV_INLINE
void
btr_rec_set_deleted_flag(
/*=====================*/
	rec_t*		rec,	/*!< in/out: physical record */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page (or NULL) */
	ulint		flag)	/*!< in: nonzero if delete marked */
{
	if (page_rec_is_comp(rec)) {
		rec_set_deleted_flag_new(rec, page_zip, flag);
	} else {
		ut_ad(!page_zip);
		rec_set_deleted_flag_old(rec, flag);
	}
}
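/* Note that old-style (redundant) records never reside on compressed
pages, which is why the old-style branch above may assert !page_zip. */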

#ifndef UNIV_HOTBACKUP
/*==================== B-TREE SEARCH =========================*/

/********************************************************************//**
Latches the leaf page or pages requested. */
static
void
btr_cur_latch_leaves(
/*=================*/
	page_t*		page,		/*!< in: leaf page where the search
					converged */
	ulint		space,		/*!< in: space id */
	ulint		zip_size,	/*!< in: compressed page size in bytes
					or 0 for uncompressed pages */
	ulint		page_no,	/*!< in: page number of the leaf */
	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
	btr_cur_t*	cursor,		/*!< in: cursor */
	mtr_t*		mtr)		/*!< in: mtr */
{
	ulint		mode;
	ulint		left_page_no;
	ulint		right_page_no;
	buf_block_t*	get_block;

	ut_ad(page && mtr);

	switch (latch_mode) {
	case BTR_SEARCH_LEAF:
	case BTR_MODIFY_LEAF:
		mode = latch_mode == BTR_SEARCH_LEAF ? RW_S_LATCH : RW_X_LATCH;
		get_block = btr_block_get(
			space, zip_size, page_no, mode, cursor->index, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		get_block->check_index_page_at_flush = TRUE;
		return;
	case BTR_MODIFY_TREE:
		/* x-latch also brothers from left to right */
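		/* Taking the sibling latches in a fixed left-to-right
		order keeps concurrent tree-modifying operations in a
		single latching order and thus avoids deadlocks. */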
		left_page_no = btr_page_get_prev(page, mtr);

		if (left_page_no != FIL_NULL) {
			get_block = btr_block_get(
				space, zip_size, left_page_no,
				RW_X_LATCH, cursor->index, mtr);
#ifdef UNIV_BTR_DEBUG
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_next(get_block->frame, mtr)
			     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
			get_block->check_index_page_at_flush = TRUE;
		}

		get_block = btr_block_get(
			space, zip_size, page_no,
			RW_X_LATCH, cursor->index, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		get_block->check_index_page_at_flush = TRUE;

		right_page_no = btr_page_get_next(page, mtr);

		if (right_page_no != FIL_NULL) {
			get_block = btr_block_get(
				space, zip_size, right_page_no,
				RW_X_LATCH, cursor->index, mtr);
#ifdef UNIV_BTR_DEBUG
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_prev(get_block->frame, mtr)
			     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
			get_block->check_index_page_at_flush = TRUE;
		}

		return;

	case BTR_SEARCH_PREV:
	case BTR_MODIFY_PREV:
		mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
		/* latch also left brother */
		left_page_no = btr_page_get_prev(page, mtr);

		if (left_page_no != FIL_NULL) {
			get_block = btr_block_get(
				space, zip_size,
				left_page_no, mode, cursor->index, mtr);
			cursor->left_block = get_block;
#ifdef UNIV_BTR_DEBUG
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_next(get_block->frame, mtr)
			     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
			get_block->check_index_page_at_flush = TRUE;
		}

		get_block = btr_block_get(
			space, zip_size, page_no, mode, cursor->index, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		get_block->check_index_page_at_flush = TRUE;
		return;
	}

	ut_error;
}

/********************************************************************//**
Searches an index tree and positions a tree cursor on a given level.
NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
to node pointer page number fields on the upper levels of the tree!
Note that if mode is PAGE_CUR_LE, which is used in inserts, then
cursor->up_match and cursor->low_match both will have sensible values.
If mode is PAGE_CUR_GE, then up_match will have a sensible value.

If mode is PAGE_CUR_LE, the cursor is left at the place where an insert of
the search tuple should be performed in the B-tree. InnoDB does an insert
immediately after the cursor. Thus, the cursor may end up on a user record,
or on a page infimum record. */
UNIV_INTERN
void
btr_cur_search_to_nth_level(
/*========================*/
	dict_index_t*	index,	/*!< in: index */
	ulint		level,	/*!< in: the tree level of search */
	const dtuple_t*	tuple,	/*!< in: data tuple; NOTE: n_fields_cmp in
				tuple must be set so that it cannot get
				compared to the node ptr page number field! */
	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
				Inserts should always be made using
				PAGE_CUR_LE to search the position! */
	ulint		latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
				BTR_INSERT and BTR_ESTIMATE;
				cursor->left_block is used to store a pointer
				to the left neighbor page, in the cases
				BTR_SEARCH_PREV and BTR_MODIFY_PREV;
				NOTE that if has_search_latch
				is != 0, we may not have a latch set
				on the cursor page; we assume
				the caller uses his search latch
				to protect the record! */
	btr_cur_t*	cursor, /*!< in/out: tree cursor; the cursor page is
				s- or x-latched, but see also above! */
	ulint		has_search_latch,/*!< in: info on the latch mode the
				caller currently has on btr_search_latch:
				RW_S_LATCH, or 0 */
	const char*	file,	/*!< in: file name */
	ulint		line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_cur_t*	page_cursor;
	page_t*		page;
	buf_block_t*	guess;
	rec_t*		node_ptr;
	ulint		page_no;
	ulint		space;
	ulint		up_match;
	ulint		up_bytes;
	ulint		low_match;
	ulint		low_bytes;
	ulint		height;
	ulint		savepoint;
	ulint		page_mode;
	ulint		insert_planned;
	ulint		estimate;
	ulint		ignore_sec_unique;
	ulint		root_height = 0; /* remove warning */
#ifdef BTR_CUR_ADAPT
	btr_search_t*	info;
#endif
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);
	/* Currently, PAGE_CUR_LE is the only search mode used for searches
	ending at the upper levels */

	ut_ad(level == 0 || mode == PAGE_CUR_LE);
	ut_ad(dict_index_check_search_tuple(index, tuple));
	ut_ad(!dict_index_is_ibuf(index) || ibuf_inside());
	ut_ad(dtuple_check_typed(tuple));

#ifdef UNIV_DEBUG
	cursor->up_match = ULINT_UNDEFINED;
	cursor->low_match = ULINT_UNDEFINED;
#endif
	insert_planned = latch_mode & BTR_INSERT;
	estimate = latch_mode & BTR_ESTIMATE;
	ignore_sec_unique = latch_mode & BTR_IGNORE_SEC_UNIQUE;
	latch_mode = latch_mode & ~(BTR_INSERT | BTR_ESTIMATE
				    | BTR_IGNORE_SEC_UNIQUE);

	ut_ad(!insert_planned || (mode == PAGE_CUR_LE));

	cursor->flag = BTR_CUR_BINARY;
	cursor->index = index;

#ifndef BTR_CUR_ADAPT
	guess = NULL;
#else
	info = btr_search_get_info(index);

	guess = info->root_guess;

#ifdef BTR_CUR_HASH_ADAPT

#ifdef UNIV_SEARCH_PERF_STAT
	info->n_searches++;
#endif
	if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_NOT_LOCKED
	    && latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ
	    && !estimate
#ifdef PAGE_CUR_LE_OR_EXTENDS
	    && mode != PAGE_CUR_LE_OR_EXTENDS
#endif /* PAGE_CUR_LE_OR_EXTENDS */
	    /* If !has_search_latch, we do a dirty read of
	    btr_search_enabled below, and btr_search_guess_on_hash()
	    will have to check it again. */
	    && UNIV_LIKELY(btr_search_enabled)
	    && btr_search_guess_on_hash(index, info, tuple, mode,
					latch_mode, cursor,
					has_search_latch, mtr)) {

		/* Search using the hash index succeeded */

		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_GE);
		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		ut_ad(cursor->low_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		btr_cur_n_sea++;

		return;
	}
#endif /* BTR_CUR_HASH_ADAPT */
#endif /* BTR_CUR_ADAPT */
	btr_cur_n_non_sea++;

	/* If the hash search did not succeed, do binary search down the
	tree */

	if (has_search_latch) {
		/* Release possible search latch to obey latching order */
		rw_lock_s_unlock(&btr_search_latch);
	}

	/* Store the position of the tree latch we push to mtr so that we
	know how to release it when we have latched leaf node(s) */

	savepoint = mtr_set_savepoint(mtr);

	if (latch_mode == BTR_MODIFY_TREE) {
		mtr_x_lock(dict_index_get_lock(index), mtr);

	} else if (latch_mode == BTR_CONT_MODIFY_TREE) {
		/* Do nothing */
		ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
					MTR_MEMO_X_LOCK));
	} else {
		mtr_s_lock(dict_index_get_lock(index), mtr);
	}

	page_cursor = btr_cur_get_page_cur(cursor);

	space = dict_index_get_space(index);
	page_no = dict_index_get_page(index);

	up_match = 0;
	up_bytes = 0;
	low_match = 0;
	low_bytes = 0;

	height = ULINT_UNDEFINED;

	/* We use these modified search modes on non-leaf levels of the
	B-tree. These let us end up in the right B-tree leaf. In that leaf
	we use the original search mode. */

	switch (mode) {
	case PAGE_CUR_GE:
		page_mode = PAGE_CUR_L;
		break;
	case PAGE_CUR_G:
		page_mode = PAGE_CUR_LE;
		break;
	default:
#ifdef PAGE_CUR_LE_OR_EXTENDS
		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
		      || mode == PAGE_CUR_LE_OR_EXTENDS);
#else /* PAGE_CUR_LE_OR_EXTENDS */
		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
#endif /* PAGE_CUR_LE_OR_EXTENDS */
		page_mode = mode;
		break;
	}
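	/* For example, PAGE_CUR_GE descends with PAGE_CUR_L: when a
	node pointer on an upper level compares equal to the tuple,
	records equal to the tuple may still begin in the subtree to
	the left of that pointer, so the descent must not pass it. */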

	/* Loop and search until we arrive at the desired level */

	for (;;) {
		ulint		zip_size;
		buf_block_t*	block;
		ulint		rw_latch;
		ulint		buf_mode;

		zip_size = dict_table_zip_size(index->table);
		rw_latch = RW_NO_LATCH;
		buf_mode = BUF_GET;

		if (height == 0 && latch_mode <= BTR_MODIFY_LEAF) {

			rw_latch = latch_mode;

			if (insert_planned
			    && ibuf_should_try(index, ignore_sec_unique)) {

				/* Try to insert to the insert buffer if
				the page is not in the buffer pool */

				buf_mode = BUF_GET_IF_IN_POOL;
			}
		}
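		/* With BUF_GET_IF_IN_POOL, buf_page_get_gen() returns
		NULL instead of reading the page from disk; the NULL
		branch below then buffers the insert, avoiding a
		random read of the leaf page. */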

retry_page_get:
		block = buf_page_get_gen(space, zip_size, page_no,
					 rw_latch, guess, buf_mode,
					 file, line, mtr);
		if (block == NULL) {
			/* This must be a search to perform an insert;
			try to insert to the insert buffer */

			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
			ut_ad(insert_planned);
			ut_ad(cursor->thr);

			if (ibuf_insert(tuple, index, space, zip_size,
					page_no, cursor->thr)) {
				/* Insertion to the insert buffer succeeded */
				cursor->flag = BTR_CUR_INSERT_TO_IBUF;
				if (UNIV_LIKELY_NULL(heap)) {
					mem_heap_free(heap);
				}
				goto func_exit;
			}

			/* Insertion to the insert buffer did not succeed:
			retry page get */

			buf_mode = BUF_GET;

			goto retry_page_get;
		}

		page = buf_block_get_frame(block);

		block->check_index_page_at_flush = TRUE;

		if (rw_latch != RW_NO_LATCH) {
#ifdef UNIV_ZIP_DEBUG
			const page_zip_des_t*	page_zip
				= buf_block_get_page_zip(block);
			ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

			buf_block_dbg_add_level(
				block, dict_index_is_ibuf(index)
				? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
		}

		ut_ad(0 == ut_dulint_cmp(index->id,
					 btr_page_get_index_id(page)));

		if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
			/* We are in the root node */

			height = btr_page_get_level(page, mtr);
			root_height = height;
			cursor->tree_height = root_height + 1;
#ifdef BTR_CUR_ADAPT
			if (block != guess) {
				info->root_guess = block;
			}
#endif
		}

		if (height == 0) {
			if (rw_latch == RW_NO_LATCH) {

				btr_cur_latch_leaves(page, space, zip_size,
						     page_no, latch_mode,
						     cursor, mtr);
			}

			if ((latch_mode != BTR_MODIFY_TREE)
			    && (latch_mode != BTR_CONT_MODIFY_TREE)) {

				/* Release the tree s-latch */

				mtr_release_s_latch_at_savepoint(
					mtr, savepoint,
					dict_index_get_lock(index));
			}

			page_mode = mode;
		}

		page_cur_search_with_match(block, index, tuple, page_mode,
					   &up_match, &up_bytes,
					   &low_match, &low_bytes,
					   page_cursor);

		if (estimate) {
			btr_cur_add_path_info(cursor, height, root_height);
		}

		/* If this is the desired level, leave the loop */

		ut_ad(height == btr_page_get_level(
			      page_cur_get_page(page_cursor), mtr));

		if (level == height) {

			if (level > 0) {
				/* x-latch the page */
				page = btr_page_get(space, zip_size, page_no,
						    RW_X_LATCH, index, mtr);
				ut_a((ibool)!!page_is_comp(page)
				     == dict_table_is_comp(index->table));
			}

			break;
		}

		ut_ad(height > 0);

		height--;

		guess = NULL;

		node_ptr = page_cur_get_rec(page_cursor);
		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
					  ULINT_UNDEFINED, &heap);
		/* Go to the child node */
		page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	if (level == 0) {
		cursor->low_match = low_match;
		cursor->low_bytes = low_bytes;
		cursor->up_match = up_match;
		cursor->up_bytes = up_bytes;

#ifdef BTR_CUR_ADAPT
		/* We do a dirty read of btr_search_enabled here.  We
		will properly check btr_search_enabled again in
		btr_search_build_page_hash_index() before building a
		page hash index, while holding btr_search_latch. */
		if (UNIV_LIKELY(btr_search_enabled)) {

			btr_search_info_update(index, cursor);
		}
#endif
		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_GE);
		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		ut_ad(cursor->low_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
	}

func_exit:
	if (has_search_latch) {

		rw_lock_s_lock(&btr_search_latch);
	}
}

/*****************************************************************//**
Opens a cursor at either end of an index. */
UNIV_INTERN
void
btr_cur_open_at_index_side_func(
/*============================*/
	ibool		from_left,	/*!< in: TRUE if open to the low end,
					FALSE if to the high end */
	dict_index_t*	index,		/*!< in: index */
	ulint		latch_mode,	/*!< in: latch mode */
	btr_cur_t*	cursor,		/*!< in: cursor */
	const char*	file,		/*!< in: file name */
	ulint		line,		/*!< in: line where called */
	mtr_t*		mtr)		/*!< in: mtr */
{
	page_cur_t*	page_cursor;
	ulint		page_no;
	ulint		space;
	ulint		zip_size;
	ulint		height;
	ulint		root_height = 0; /* remove warning */
	rec_t*		node_ptr;
	ulint		estimate;
	ulint		savepoint;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	estimate = latch_mode & BTR_ESTIMATE;
	latch_mode = latch_mode & ~BTR_ESTIMATE;

	/* Store the position of the tree latch we push to mtr so that we
	know how to release it when we have latched the leaf node */

	savepoint = mtr_set_savepoint(mtr);

	if (latch_mode == BTR_MODIFY_TREE) {
		mtr_x_lock(dict_index_get_lock(index), mtr);
	} else {
		mtr_s_lock(dict_index_get_lock(index), mtr);
	}

	page_cursor = btr_cur_get_page_cur(cursor);
	cursor->index = index;

	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);
	page_no = dict_index_get_page(index);

	height = ULINT_UNDEFINED;

	for (;;) {
		buf_block_t*	block;
		page_t*		page;
		block = buf_page_get_gen(space, zip_size, page_no,
					 RW_NO_LATCH, NULL, BUF_GET,
					 file, line, mtr);
		page = buf_block_get_frame(block);
		ut_ad(0 == ut_dulint_cmp(index->id,
					 btr_page_get_index_id(page)));

		block->check_index_page_at_flush = TRUE;

		if (height == ULINT_UNDEFINED) {
			/* We are in the root node */

			height = btr_page_get_level(page, mtr);
			root_height = height;
		}

		if (height == 0) {
			btr_cur_latch_leaves(page, space, zip_size, page_no,
					     latch_mode, cursor, mtr);

			/* In versions <= 3.23.52 we had forgotten to
			release the tree latch here. If in an index scan
			we had to scan far to find a record visible to the
			current transaction, that could starve others
			waiting for the tree latch. */

			if ((latch_mode != BTR_MODIFY_TREE)
			    && (latch_mode != BTR_CONT_MODIFY_TREE)) {

				/* Release the tree s-latch */

				mtr_release_s_latch_at_savepoint(
					mtr, savepoint,
					dict_index_get_lock(index));
			}
		}

		if (from_left) {
			page_cur_set_before_first(block, page_cursor);
		} else {
			page_cur_set_after_last(block, page_cursor);
		}

		if (height == 0) {
			if (estimate) {
				btr_cur_add_path_info(cursor, height,
						      root_height);
			}

			break;
		}

		ut_ad(height > 0);

		if (from_left) {
			page_cur_move_to_next(page_cursor);
		} else {
			page_cur_move_to_prev(page_cursor);
		}

		if (estimate) {
			btr_cur_add_path_info(cursor, height, root_height);
		}

		height--;

		node_ptr = page_cur_get_rec(page_cursor);
		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
					  ULINT_UNDEFINED, &heap);
		/* Go to the child node */
		page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
}

/**********************************************************************//**
Positions a cursor at a randomly chosen position within a B-tree. */
UNIV_INTERN
void
btr_cur_open_at_rnd_pos_func(
/*=========================*/
	dict_index_t*	index,		/*!< in: index */
	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
	btr_cur_t*	cursor,		/*!< in/out: B-tree cursor */
	const char*	file,		/*!< in: file name */
	ulint		line,		/*!< in: line where called */
	mtr_t*		mtr)		/*!< in: mtr */
{
	page_cur_t*	page_cursor;
	ulint		page_no;
	ulint		space;
	ulint		zip_size;
	ulint		height;
	rec_t*		node_ptr;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	if (latch_mode == BTR_MODIFY_TREE) {
		mtr_x_lock(dict_index_get_lock(index), mtr);
	} else {
		mtr_s_lock(dict_index_get_lock(index), mtr);
	}

	page_cursor = btr_cur_get_page_cur(cursor);
	cursor->index = index;

	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);
	page_no = dict_index_get_page(index);

	height = ULINT_UNDEFINED;

	for (;;) {
		buf_block_t*	block;
		page_t*		page;

		block = buf_page_get_gen(space, zip_size, page_no,
					 RW_NO_LATCH, NULL, BUF_GET,
					 file, line, mtr);
		page = buf_block_get_frame(block);
		ut_ad(0 == ut_dulint_cmp(index->id,
					 btr_page_get_index_id(page)));

		if (height == ULINT_UNDEFINED) {
			/* We are in the root node */

			height = btr_page_get_level(page, mtr);
		}

		if (height == 0) {
			btr_cur_latch_leaves(page, space, zip_size, page_no,
					     latch_mode, cursor, mtr);
		}

		page_cur_open_on_rnd_user_rec(block, page_cursor);

		if (height == 0) {

			break;
		}

		ut_ad(height > 0);

		height--;

		node_ptr = page_cur_get_rec(page_cursor);
		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
					  ULINT_UNDEFINED, &heap);
		/* Go to the child node */
		page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
}

/*==================== B-TREE INSERT =========================*/

/*************************************************************//**
Inserts a record if there is enough space, or if enough space can
be freed by reorganizing. Differs from btr_cur_optimistic_insert because
no heuristic is applied to whether it pays to use CPU time for
reorganizing the page or not.
@return	pointer to inserted record on success, else NULL */
static
rec_t*
btr_cur_insert_if_possible(
/*=======================*/
	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
				cursor stays valid */
	const dtuple_t*	tuple,	/*!< in: tuple to insert; the size info need not
				have been stored to tuple */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_cur_t*	page_cursor;
	buf_block_t*	block;
	rec_t*		rec;

	ut_ad(dtuple_check_typed(tuple));

	block = btr_cur_get_block(cursor);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	page_cursor = btr_cur_get_page_cur(cursor);

	/* Now, try the insert */
	rec = page_cur_tuple_insert(page_cursor, tuple,
				    cursor->index, n_ext, mtr);

	if (UNIV_UNLIKELY(!rec)) {
		/* If the record did not fit, reorganize */

		if (btr_page_reorganize(block, cursor->index, mtr)) {

			page_cur_search(block, cursor->index, tuple,
					PAGE_CUR_LE, page_cursor);

			rec = page_cur_tuple_insert(page_cursor, tuple,
						    cursor->index, n_ext, mtr);
		}
	}

	return(rec);
}

/*************************************************************//**
For an insert, checks the locks and does the undo logging if desired.
@return	DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
UNIV_INLINE
ulint
btr_cur_ins_lock_and_undo(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags: if
				not zero, the parameters index and thr
				should be specified */
	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert */
	dtuple_t*	entry,	/*!< in/out: entry to insert */
	que_thr_t*	thr,	/*!< in: query thread or NULL */
	mtr_t*		mtr,	/*!< in/out: mini-transaction */
	ibool*		inherit)/*!< out: TRUE if the inserted new record may
				need to inherit LOCK_GAP type locks from the
				successor record */
{
	dict_index_t*	index;
	ulint		err;
	rec_t*		rec;
	roll_ptr_t	roll_ptr;

	/* Check if we have to wait for a lock: enqueue an explicit lock
	request if yes */

	rec = btr_cur_get_rec(cursor);
	index = cursor->index;

	err = lock_rec_insert_check_and_lock(flags, rec,
					     btr_cur_get_block(cursor),
					     index, thr, mtr, inherit);

	if (err != DB_SUCCESS) {

		return(err);
	}

	if (dict_index_is_clust(index) && !dict_index_is_ibuf(index)) {

		err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
						    thr, index, entry,
						    NULL, 0, NULL,
						    &roll_ptr);
		if (err != DB_SUCCESS) {

			return(err);
		}

		/* Now we can fill in the roll ptr field in entry */

		if (!(flags & BTR_KEEP_SYS_FLAG)) {

			row_upd_index_entry_sys_field(entry, index,
						      DATA_ROLL_PTR, roll_ptr);
		}
	}

	return(DB_SUCCESS);
}

#ifdef UNIV_DEBUG
/*************************************************************//**
Reports information about a transaction. */
static
void
btr_cur_trx_report(
/*===============*/
	trx_t*			trx,	/*!< in: transaction */
	const dict_index_t*	index,	/*!< in: index */
	const char*		op)	/*!< in: operation */
{
	fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ",
		TRX_ID_PREP_PRINTF(trx->id));
	fputs(op, stderr);
	dict_index_name_print(stderr, trx, index);
	putc('\n', stderr);
}
#endif /* UNIV_DEBUG */

/*************************************************************//**
Tries to perform an insert to a page in an index tree, next to cursor.
It is assumed that mtr holds an x-latch on the page. The operation does
not succeed if there is too little space on the page. If there is just
one record on the page, the insert will always succeed; this is to
prevent trying to split a page with just one record.
@return	DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
UNIV_INTERN
ulint
btr_cur_optimistic_insert(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags: if not
				zero, the parameters index and thr should be
				specified */
	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
				cursor stays valid */
	dtuple_t*	entry,	/*!< in/out: entry to insert */
	rec_t**		rec,	/*!< out: pointer to inserted record if
				successful */
	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
				be stored externally by the caller, or
				NULL */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	que_thr_t*	thr,	/*!< in: query thread or NULL */
	mtr_t*		mtr)	/*!< in: mtr; if this function returns
				DB_SUCCESS on a leaf page of a secondary
				index in a compressed tablespace, the
				mtr must be committed before latching
				any further pages */
{
	big_rec_t*	big_rec_vec	= NULL;
	dict_index_t*	index;
	page_cur_t*	page_cursor;
	buf_block_t*	block;
	page_t*		page;
	ulint		max_size;
	rec_t*		dummy_rec;
	ibool		leaf;
	ibool		reorg;
	ibool		inherit;
	ulint		zip_size;
	ulint		rec_size;
	ulint		err;

	*big_rec = NULL;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	index = cursor->index;
	zip_size = buf_block_get_zip_size(block);
#ifdef UNIV_DEBUG_VALGRIND
	if (zip_size) {
		UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
		UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
	}
#endif /* UNIV_DEBUG_VALGRIND */

	if (!dtuple_check_typed_no_assert(entry)) {
		fputs("InnoDB: Error in a tuple to insert into ", stderr);
		dict_index_name_print(stderr, thr_get_trx(thr), index);
	}
#ifdef UNIV_DEBUG
	if (btr_cur_print_record_ops && thr) {
		btr_cur_trx_report(thr_get_trx(thr), index, "insert into ");
		dtuple_print(stderr, entry);
	}
#endif /* UNIV_DEBUG */

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	max_size = page_get_max_insert_size_after_reorganize(page, 1);
	leaf = page_is_leaf(page);

	/* Calculate the record size when entry is converted to a record */
	rec_size = rec_get_converted_size(index, entry, n_ext);

	if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
				   dtuple_get_n_fields(entry), zip_size)) {

		/* The record is so big that we have to store some fields
		externally on separate database pages */
		big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);

		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {

			return(DB_TOO_BIG_RECORD);
		}

		rec_size = rec_get_converted_size(index, entry, n_ext);
	}

	if (UNIV_UNLIKELY(zip_size)) {
		/* Estimate the free space of an empty compressed page.
		Subtract one byte for the encoded heap_no in the
		modification log. */
		ulint	free_space_zip = page_zip_empty_size(
			cursor->index->n_fields, zip_size) - 1;
		ulint	n_uniq = dict_index_get_n_unique_in_tree(index);

		ut_ad(dict_table_is_comp(index->table));

		/* There should be enough room for two node pointer
		records on an empty non-leaf page.  This prevents
		infinite page splits. */

		if (UNIV_LIKELY(entry->n_fields >= n_uniq)
		    && UNIV_UNLIKELY(REC_NODE_PTR_SIZE
				     + rec_get_converted_size_comp_prefix(
					     index, entry->fields, n_uniq,
					     NULL)
				     /* On a compressed page, there is
				     a two-byte entry in the dense
				     page directory for every record.
				     But there is no record header. */
				     - (REC_N_NEW_EXTRA_BYTES - 2)
				     > free_space_zip / 2)) {

			if (big_rec_vec) {
				dtuple_convert_back_big_rec(
					index, entry, big_rec_vec);
			}

			return(DB_TOO_BIG_RECORD);
		}
	}

	/* If there have been many consecutive inserts, and we are on the leaf
	level, check if we have to split the page to reserve enough free space
	for future updates of records. */
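	/* dict_index_get_space_reserve() is the portion of each page
	kept free for updates that make existing records grow;
	splitting pre-emptively here preserves that headroom. */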

	if (dict_index_is_clust(index)
	    && (page_get_n_recs(page) >= 2)
	    && UNIV_LIKELY(leaf)
	    && (dict_index_get_space_reserve() + rec_size > max_size)
	    && (btr_page_get_split_rec_to_right(cursor, &dummy_rec)
		|| btr_page_get_split_rec_to_left(cursor, &dummy_rec))) {
fail:
		err = DB_FAIL;
fail_err:

		if (big_rec_vec) {
			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
		}

		return(err);
	}

	if (UNIV_UNLIKELY(max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT
			  || max_size < rec_size)
	    && UNIV_LIKELY(page_get_n_recs(page) > 1)
	    && page_get_max_insert_size(page, 1) < rec_size) {

		goto fail;
	}

	/* Check locks and write to the undo log, if specified */
	err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
					thr, mtr, &inherit);

	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {

		goto fail_err;
	}

	page_cursor = btr_cur_get_page_cur(cursor);

	/* Now, try the insert */

	{
		const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
		*rec = page_cur_tuple_insert(page_cursor, entry, index,
					     n_ext, mtr);
		reorg = page_cursor_rec != page_cur_get_rec(page_cursor);

		if (UNIV_UNLIKELY(reorg)) {
			ut_a(zip_size);
			ut_a(*rec);
		}
	}

	if (UNIV_UNLIKELY(!*rec) && UNIV_LIKELY(!reorg)) {
		/* If the record did not fit, reorganize */
		if (UNIV_UNLIKELY(!btr_page_reorganize(block, index, mtr))) {
			ut_a(zip_size);

			goto fail;
		}

		ut_ad(zip_size
		      || page_get_max_insert_size(page, 1) == max_size);

		reorg = TRUE;

		page_cur_search(block, index, entry, PAGE_CUR_LE, page_cursor);

		*rec = page_cur_tuple_insert(page_cursor, entry, index,
					     n_ext, mtr);

		if (UNIV_UNLIKELY(!*rec)) {
			if (UNIV_LIKELY(zip_size != 0)) {

				goto fail;
			}

			fputs("InnoDB: Error: cannot insert tuple ", stderr);
			dtuple_print(stderr, entry);
			fputs(" into ", stderr);
			dict_index_name_print(stderr, thr_get_trx(thr), index);
			fprintf(stderr, "\nInnoDB: max insert size %lu\n",
				(ulong) max_size);
			ut_error;
		}
	}

#ifdef BTR_CUR_HASH_ADAPT
	if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
		btr_search_update_hash_node_on_insert(cursor);
	} else {
		btr_search_update_hash_on_insert(cursor);
	}
#endif

	if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {

		lock_update_insert(block, *rec);
	}

#if 0
	fprintf(stderr, "Insert into page %lu, max ins size %lu,"
		" rec %lu ind type %lu\n",
		buf_block_get_page_no(block), max_size,
		rec_size + PAGE_DIR_SLOT_SIZE, index->type);
#endif
	if (leaf && !dict_index_is_clust(index)) {
		/* Update the free bits of the B-tree page in the
		insert buffer bitmap. */

		/* The free bits in the insert buffer bitmap must
		never exceed the free space on a page.  It is safe to
		decrement or reset the bits in the bitmap in a
		mini-transaction that is committed before the
		mini-transaction that affects the free space. */

		/* It is unsafe to increment the bits in a separately
		committed mini-transaction, because in crash recovery,
		the free bits could momentarily be set too high. */

		if (zip_size) {
			/* Update the bits in the same mini-transaction. */
			ibuf_update_free_bits_zip(block, mtr);
		} else {
			/* Decrement the bits in a separate
			mini-transaction. */
			ibuf_update_free_bits_if_full(
				block, max_size,
				rec_size + PAGE_DIR_SLOT_SIZE);
		}
	}

	*big_rec = big_rec_vec;

	return(DB_SUCCESS);
}

/*************************************************************//**
Performs an insert on a page of an index tree. It is assumed that mtr
holds an x-latch on the tree and on the cursor page. If the insert is
made on the leaf level, to avoid deadlocks, mtr must also own x-latches
to brothers of page, if those brothers exist.
@return	DB_SUCCESS or error number */
UNIV_INTERN
ulint
btr_cur_pessimistic_insert(
/*=======================*/
	ulint		flags,	/*!< in: undo logging and locking flags: if not
				zero, the parameter thr should be
				specified; if no undo logging is specified,
				then the caller must have reserved enough
				free extents in the file space so that the
				insertion will certainly succeed */
	btr_cur_t*	cursor,	/*!< in: cursor after which to insert;
				cursor stays valid */
	dtuple_t*	entry,	/*!< in/out: entry to insert */
	rec_t**		rec,	/*!< out: pointer to inserted record if
				successful */
	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
				be stored externally by the caller, or
				NULL */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	que_thr_t*	thr,	/*!< in: query thread or NULL */
	mtr_t*		mtr)	/*!< in: mtr */
{
	dict_index_t*	index		= cursor->index;
	ulint		zip_size	= dict_table_zip_size(index->table);
	big_rec_t*	big_rec_vec	= NULL;
	mem_heap_t*	heap		= NULL;
	ulint		err;
	ibool		dummy_inh;
	ibool		success;
	ulint		n_extents	= 0;
	ulint		n_reserved;

	ut_ad(dtuple_check_typed(entry));

	*big_rec = NULL;

	ut_ad(mtr_memo_contains(mtr,
				dict_index_get_lock(btr_cur_get_index(cursor)),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
				MTR_MEMO_PAGE_X_FIX));

	/* Try first an optimistic insert; reset the cursor flag: we do not
	assume anything of how it was positioned */

	cursor->flag = BTR_CUR_BINARY;

	err = btr_cur_optimistic_insert(flags, cursor, entry, rec,
					big_rec, n_ext, thr, mtr);
	if (err != DB_FAIL) {

		return(err);
	}

	/* Retry with a pessimistic insert. Check locks and write to undo log,
	if specified */

	err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
					thr, mtr, &dummy_inh);

	if (err != DB_SUCCESS) {

		return(err);
	}

	if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
		/* First reserve enough free space for the file segments
		of the index tree, so that the insert will not fail because
		of lack of space */

		n_extents = cursor->tree_height / 16 + 3;
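		/* For example, a tree of height 16 reserves
		16 / 16 + 3 = 4 extents; with the default extent size
		of 64 pages that is 256 pages, well above the
		2 x height rule stated at the top of this file. */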

		success = fsp_reserve_free_extents(&n_reserved, index->space,
						   n_extents, FSP_NORMAL, mtr);
		if (!success) {
			return(DB_OUT_OF_FILE_SPACE);
		}
	}

	if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
				   dict_table_is_comp(index->table),
				   dict_index_get_n_fields(index),
				   zip_size)) {
		/* The record is so big that we have to store some fields
		externally on separate database pages */

		if (UNIV_LIKELY_NULL(big_rec_vec)) {
			/* This should never happen, but we handle
			the situation in a robust manner. */
			ut_ad(0);
			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
		}

		big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);

		if (big_rec_vec == NULL) {

			if (n_extents > 0) {
				fil_space_release_free_extents(index->space,
							       n_reserved);
			}
			return(DB_TOO_BIG_RECORD);
		}
	}

	if (dict_index_get_page(index)
	    == buf_block_get_page_no(btr_cur_get_block(cursor))) {

		/* The page is the root page */
		*rec = btr_root_raise_and_insert(cursor, entry, n_ext, mtr);
	} else {
		*rec = btr_page_split_and_insert(cursor, entry, n_ext, mtr);
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec);

#ifdef BTR_CUR_ADAPT
	btr_search_update_hash_on_insert(cursor);
#endif
	if (!(flags & BTR_NO_LOCKING_FLAG)) {

		lock_update_insert(btr_cur_get_block(cursor), *rec);
	}

	if (n_extents > 0) {
		fil_space_release_free_extents(index->space, n_reserved);
	}

	*big_rec = big_rec_vec;

	return(DB_SUCCESS);
}

/*==================== B-TREE UPDATE =========================*/

/*************************************************************//**
For an update, checks the locks and does the undo logging.
@return	DB_SUCCESS, DB_WAIT_LOCK, or error number */
UNIV_INLINE
ulint
btr_cur_upd_lock_and_undo(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor on record to update */
	const upd_t*	update,	/*!< in: update vector */
	ulint		cmpl_info,/*!< in: compiler info on secondary index
				updates */
	que_thr_t*	thr,	/*!< in: query thread */
	mtr_t*		mtr,	/*!< in/out: mini-transaction */
	roll_ptr_t*	roll_ptr)/*!< out: roll pointer */
{
	dict_index_t*	index;
	rec_t*		rec;
	ulint		err;

	ut_ad(cursor && update && thr && roll_ptr);

	rec = btr_cur_get_rec(cursor);
	index = cursor->index;

	if (!dict_index_is_clust(index)) {
		/* We do undo logging only when we update a clustered index
		record */
		return(lock_sec_rec_modify_check_and_lock(
			       flags, btr_cur_get_block(cursor), rec,
			       index, thr, mtr));
	}

	/* Check if we have to wait for a lock: enqueue an explicit lock
	request if yes */

	err = DB_SUCCESS;

	if (!(flags & BTR_NO_LOCKING_FLAG)) {
		mem_heap_t*	heap		= NULL;
		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
		rec_offs_init(offsets_);

		err = lock_clust_rec_modify_check_and_lock(
			flags, btr_cur_get_block(cursor), rec, index,
			rec_get_offsets(rec, index, offsets_,
					ULINT_UNDEFINED, &heap), thr);
		if (UNIV_LIKELY_NULL(heap)) {
			mem_heap_free(heap);
		}
		if (err != DB_SUCCESS) {

			return(err);
		}
	}

	/* Append the info about the update in the undo log */

	err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
					    index, NULL, update,
					    cmpl_info, rec, roll_ptr);
	return(err);
}

/***********************************************************//**
Writes a redo log record of updating a record in-place. */
UNIV_INLINE
void
btr_cur_update_in_place_log(
/*========================*/
	ulint		flags,		/*!< in: flags */
	rec_t*		rec,		/*!< in: record */
	dict_index_t*	index,		/*!< in: index where cursor positioned */
	const upd_t*	update,		/*!< in: update vector */
	trx_t*		trx,		/*!< in: transaction */
	roll_ptr_t	roll_ptr,	/*!< in: roll ptr */
	mtr_t*		mtr)		/*!< in: mtr */
{
	byte*	log_ptr;
	page_t*	page	= page_align(rec);
	ut_ad(flags < 256);
	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));

	log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
					    ? MLOG_COMP_REC_UPDATE_IN_PLACE
					    : MLOG_REC_UPDATE_IN_PLACE,
					    1 + DATA_ROLL_PTR_LEN + 14 + 2
					    + MLOG_BUF_MARGIN);

	if (!log_ptr) {
		/* Logging in mtr is switched off during crash recovery */
		return;
	}

	/* The code below assumes index is a clustered index: change index to
	the clustered index if we are updating a secondary index record (or we
	could as well skip writing the sys col values to the log in this case
	because they are not needed for a secondary index record update) */

	index = dict_table_get_first_index(index->table);

	mach_write_to_1(log_ptr, flags);
	log_ptr++;

	log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
						mtr);
	mach_write_to_2(log_ptr, page_offset(rec));
	log_ptr += 2;

	row_upd_index_write_log(update, log_ptr, mtr);
}
#endif /* UNIV_HOTBACKUP */

/***********************************************************//**
Parses a redo log record of updating a record in-place.
@return	end of log record or NULL */
UNIV_INTERN
byte*
btr_cur_parse_update_in_place(
/*==========================*/
	byte*		ptr,	/*!< in: buffer */
	byte*		end_ptr,/*!< in: buffer end */
	page_t*		page,	/*!< in/out: page or NULL */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
	dict_index_t*	index)	/*!< in: index corresponding to page */
{
	ulint		flags;
	rec_t*		rec;
	upd_t*		update;
	ulint		pos;
	trx_id_t	trx_id;
	roll_ptr_t	roll_ptr;
	ulint		rec_offset;
	mem_heap_t*	heap;
	ulint*		offsets;

	if (end_ptr < ptr + 1) {

		return(NULL);
	}

	flags = mach_read_from_1(ptr);
	ptr++;

	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);

	if (ptr == NULL) {

		return(NULL);
	}

	if (end_ptr < ptr + 2) {

		return(NULL);
	}

	rec_offset = mach_read_from_2(ptr);
	ptr += 2;

	ut_a(rec_offset <= UNIV_PAGE_SIZE);

	heap = mem_heap_create(256);

	ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);

	if (!ptr || !page) {

		goto func_exit;
	}

	ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
	rec = page + rec_offset;

	/* We do not need to reserve btr_search_latch, as the page is only
	being recovered, and there cannot be a hash index to it. */

	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
						   pos, trx_id, roll_ptr);
	}

	row_upd_rec_in_place(rec, index, offsets, update, page_zip);

func_exit:
	mem_heap_free(heap);

	return(ptr);
}

#ifndef UNIV_HOTBACKUP
/*************************************************************//**
See if there is enough room in the page modification log to log
an update-in-place.
@return	TRUE if enough room */
UNIV_INTERN
ibool
btr_cur_update_alloc_zip(
/*=====================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
	buf_block_t*	block,	/*!< in/out: buffer page */
	dict_index_t*	index,	/*!< in: the index corresponding to the block */
	ulint		length,	/*!< in: size needed */
	ibool		create,	/*!< in: TRUE=delete-and-insert,
				FALSE=update-in-place */
	mtr_t*		mtr)	/*!< in: mini-transaction */
{
	ut_a(page_zip == buf_block_get_page_zip(block));
	ut_ad(page_zip);
	ut_ad(!dict_index_is_ibuf(index));

	if (page_zip_available(page_zip, dict_index_is_clust(index),
			       length, create)) {
		return(TRUE);
	}

	if (!page_zip->m_nonempty) {
		/* The page has been freshly compressed, so
		recompressing it will not help. */
		return(FALSE);
	}

	if (!page_zip_compress(page_zip, buf_block_get_frame(block),
			       index, mtr)) {
		/* Unable to compress the page */
		return(FALSE);
	}

	/* After recompressing a page, we must make sure that the free
	bits in the insert buffer bitmap will not exceed the free
	space on the page.  Because this function will not attempt
	recompression unless page_zip_available() fails above, it is
	safe to reset the free bits if page_zip_available() fails
	again, below.  The free bits can safely be reset in a separate
	mini-transaction.  If page_zip_available() succeeds below, we
	can be sure that the page_zip_compress() above did not reduce
	the free space available on the page. */

	if (!page_zip_available(page_zip, dict_index_is_clust(index),
				length, create)) {
		/* Out of space: reset the free bits. */
		if (!dict_index_is_clust(index)
		    && page_is_leaf(buf_block_get_frame(block))) {
			ibuf_reset_free_bits(block);
		}
		return(FALSE);
	}

	return(TRUE);
}
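
/* A minimal caller sketch for btr_cur_update_alloc_zip(); this
mirrors the real call sites in btr_cur_update_in_place() and
btr_cur_optimistic_update() below and is shown only to illustrate the
contract:

	page_zip = buf_block_get_page_zip(block);

	if (UNIV_LIKELY_NULL(page_zip)
	    && !btr_cur_update_alloc_zip(page_zip, block, index,
					 rec_offs_size(offsets),
					 FALSE, mtr)) {
		return(DB_ZIP_OVERFLOW);
	}

Passing create=FALSE means update-in-place; a caller that deletes and
re-inserts the record passes TRUE and the new record size instead. */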

/*************************************************************//**
Updates a record when the update causes no size changes in its fields.
We assume here that the ordering fields of the record do not change.
@return	DB_SUCCESS or error number */
UNIV_INTERN
ulint
btr_cur_update_in_place(
/*====================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
				cursor stays valid and positioned on the
				same record */
	const upd_t*	update,	/*!< in: update vector */
	ulint		cmpl_info,/*!< in: compiler info on secondary index
				updates */
	que_thr_t*	thr,	/*!< in: query thread */
	mtr_t*		mtr)	/*!< in: mtr; must be committed before
				latching any further pages */
{
	dict_index_t*	index;
	buf_block_t*	block;
	page_zip_des_t*	page_zip;
	ulint		err;
	rec_t*		rec;
	roll_ptr_t	roll_ptr	= ut_dulint_zero;
	trx_t*		trx;
	ulint		was_delete_marked;
	ibool		is_hashed;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	rec = btr_cur_get_rec(cursor);
	index = cursor->index;
	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	/* The insert buffer tree should never be updated in place. */
	ut_ad(!dict_index_is_ibuf(index));

	trx = thr_get_trx(thr);
	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
#ifdef UNIV_DEBUG
	if (btr_cur_print_record_ops && thr) {
		btr_cur_trx_report(trx, index, "update ");
		rec_print_new(stderr, rec, offsets);
	}
#endif /* UNIV_DEBUG */

	block = btr_cur_get_block(cursor);
	page_zip = buf_block_get_page_zip(block);

	/* Check that enough space is available on the compressed page. */
	if (UNIV_LIKELY_NULL(page_zip)
	    && !btr_cur_update_alloc_zip(page_zip, block, index,
					 rec_offs_size(offsets), FALSE, mtr)) {
		return(DB_ZIP_OVERFLOW);
	}

	/* Do lock checking and undo logging */
	err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
					thr, mtr, &roll_ptr);
	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {

		if (UNIV_LIKELY_NULL(heap)) {
			mem_heap_free(heap);
		}
		return(err);
	}

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		row_upd_rec_sys_fields(rec, NULL,
				       index, offsets, trx, roll_ptr);
	}

	was_delete_marked = rec_get_deleted_flag(
		rec, page_is_comp(buf_block_get_frame(block)));

	is_hashed = (block->index != NULL);

	if (is_hashed) {
		/* TODO: can we skip this if none of the first
		index->search_info->curr_n_fields
		fields are being updated? */

		/* The function row_upd_changes_ord_field_binary() works only
		if the update vector was built for a clustered index; we must
		NOT call it if the index is secondary. */

		if (!dict_index_is_clust(index)
		    || row_upd_changes_ord_field_binary(index, update, thr,
							NULL, NULL)) {

			/* Remove possible hash index pointer to this record */
			btr_search_update_hash_on_delete(cursor);
		}

		rw_lock_x_lock(&btr_search_latch);
	}

	row_upd_rec_in_place(rec, index, offsets, update, page_zip);

	if (is_hashed) {
		rw_lock_x_unlock(&btr_search_latch);
	}

	if (page_zip && !dict_index_is_clust(index)
	    && page_is_leaf(buf_block_get_frame(block))) {
		/* Update the free bits in the insert buffer. */
		ibuf_update_free_bits_zip(block, mtr);
	}

	btr_cur_update_in_place_log(flags, rec, index, update,
				    trx, roll_ptr, mtr);

	if (was_delete_marked
	    && !rec_get_deleted_flag(rec, page_is_comp(
					     buf_block_get_frame(block)))) {
		/* The new updated record owns its possible externally
		stored fields */

		btr_cur_unmark_extern_fields(page_zip,
					     rec, index, offsets, mtr);
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(DB_SUCCESS);
}

/*************************************************************//**
Tries to update a record on a page in an index tree. It is assumed that mtr
holds an x-latch on the page. The operation does not succeed if there is too
little space on the page or if the update would result in too empty a page,
so that tree compression is recommended. We assume here that the ordering
fields of the record do not change.
@return DB_SUCCESS, or DB_OVERFLOW if the updated record does not fit,
DB_UNDERFLOW if the page would become too empty, or DB_ZIP_OVERFLOW if
there is not enough space left on the compressed page */
UNIV_INTERN
ulint
btr_cur_optimistic_update(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
				cursor stays valid and positioned on the
				same record */
	const upd_t*	update,	/*!< in: update vector; this must also
				contain trx id and roll ptr fields */
	ulint		cmpl_info,/*!< in: compiler info on secondary index
				updates */
	que_thr_t*	thr,	/*!< in: query thread */
	mtr_t*		mtr)	/*!< in: mtr; must be committed before
				latching any further pages */
{
	dict_index_t*	index;
	page_cur_t*	page_cursor;
	ulint		err;
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;
	rec_t*		rec;
	ulint		max_size;
	ulint		new_rec_size;
	ulint		old_rec_size;
	dtuple_t*	new_entry;
	roll_ptr_t	roll_ptr;
	mem_heap_t*	heap;
	ulint		i;
	ulint		n_ext;
	ulint*		offsets;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	rec = btr_cur_get_rec(cursor);
	index = cursor->index;
	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	/* The insert buffer tree should never be updated in place. */
	ut_ad(!dict_index_is_ibuf(index));

	heap = mem_heap_create(1024);
	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
	ut_a(!rec_offs_any_null_extern(rec, offsets)
	     || trx_is_recv(thr_get_trx(thr)));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */

#ifdef UNIV_DEBUG
	if (btr_cur_print_record_ops && thr) {
		btr_cur_trx_report(thr_get_trx(thr), index, "update ");
		rec_print_new(stderr, rec, offsets);
	}
#endif /* UNIV_DEBUG */

	if (!row_upd_changes_field_size_or_external(index, offsets, update)) {

		/* The simplest and the most common case: the update does not
		change the size of any field and none of the updated fields is
		externally stored in rec or update, and there is enough space
		on the compressed page to log the update. */

		mem_heap_free(heap);
		return(btr_cur_update_in_place(flags, cursor, update,
					       cmpl_info, thr, mtr));
	}

	if (rec_offs_any_extern(offsets)) {
any_extern:
		/* Externally stored fields are treated in pessimistic
		update */

		mem_heap_free(heap);
		return(DB_OVERFLOW);
	}

	for (i = 0; i < upd_get_n_fields(update); i++) {
		if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {

			goto any_extern;
		}
	}

	page_cursor = btr_cur_get_page_cur(cursor);

	new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
					   &n_ext, heap);
	/* We checked above that there are no externally stored fields. */
	ut_a(!n_ext);

	/* The page containing the clustered index record
	corresponding to new_entry is latched in mtr.
	Thus the following call is safe. */
	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
						     FALSE, heap);
	old_rec_size = rec_offs_size(offsets);
	new_rec_size = rec_get_converted_size(index, new_entry, 0);

	page_zip = buf_block_get_page_zip(block);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

	if (UNIV_LIKELY_NULL(page_zip)
	    && !btr_cur_update_alloc_zip(page_zip, block, index,
					 new_rec_size, TRUE, mtr)) {
		err = DB_ZIP_OVERFLOW;
		goto err_exit;
	}

	if (UNIV_UNLIKELY(new_rec_size
			  >= (page_get_free_space_of_empty(page_is_comp(page))
			      / 2))) {

		err = DB_OVERFLOW;
		goto err_exit;
	}

	if (UNIV_UNLIKELY(page_get_data_size(page)
			  - old_rec_size + new_rec_size
			  < BTR_CUR_PAGE_COMPRESS_LIMIT)) {

		/* The page would become too empty */

		err = DB_UNDERFLOW;
		goto err_exit;
	}

	max_size = old_rec_size
		+ page_get_max_insert_size_after_reorganize(page, 1);

	if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
	       && (max_size >= new_rec_size))
	      || (page_get_n_recs(page) <= 1))) {

		/* There was not enough space, or it did not pay to
		reorganize: for simplicity, we decide what to do assuming a
		reorganization is needed, though it might not be necessary */

		err = DB_OVERFLOW;
		goto err_exit;
	}

	/* Do lock checking and undo logging */
	err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
					thr, mtr, &roll_ptr);
	if (err != DB_SUCCESS) {

		goto err_exit;
	}

	/* Ok, we may do the replacement. Store on the page infimum the
	explicit locks on rec, before deleting rec (see the comment in
	btr_cur_pessimistic_update). */

	lock_rec_store_on_page_infimum(block, rec);

	btr_search_update_hash_on_delete(cursor);

	/* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above
	invokes rec_offs_make_valid() to point to the copied record that
	the fields of new_entry point to.  We have to undo it here. */
	ut_ad(rec_offs_validate(NULL, index, offsets));
	rec_offs_make_valid(page_cur_get_rec(page_cursor), index, offsets);

	page_cur_delete_rec(page_cursor, index, offsets, mtr);

	page_cur_move_to_prev(page_cursor);

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
					      roll_ptr);
		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
					      thr_get_trx(thr)->id);
	}

	/* There are no externally stored columns in new_entry */
	rec = btr_cur_insert_if_possible(cursor, new_entry, 0/*n_ext*/, mtr);
	ut_a(rec); /* <- We calculated above that the insert would fit */

	if (page_zip && !dict_index_is_clust(index)
	    && page_is_leaf(page)) {
		/* Update the free bits in the insert buffer. */
		ibuf_update_free_bits_zip(block, mtr);
	}

	/* Restore the old explicit lock state on the record */

	lock_rec_restore_from_page_infimum(block, rec, block);

	page_cur_move_to_next(page_cursor);

	err = DB_SUCCESS;
err_exit:
	mem_heap_free(heap);
	return(err);
}
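
/* How the update paths fit together (an overview sketch; the actual
escalation is driven by callers and by btr_cur_pessimistic_update()
below, which starts with exactly this retry):

	err = btr_cur_optimistic_update(flags, cursor, update,
					cmpl_info, thr, mtr);
	switch (err) {
	case DB_UNDERFLOW:	the page would become too empty
	case DB_OVERFLOW:	the new record version does not fit
	case DB_ZIP_OVERFLOW:	the compressed page is out of space
		escalate to btr_cur_pessimistic_update(), which may
		split or compress pages and reserves file space first
	default:
		done (DB_SUCCESS or a lock/undo error)
	}
*/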

/*************************************************************//**
If, in a split, a new supremum record was created as the predecessor of the
updated record, the supremum record must inherit exactly the locks on the
updated record. In the split it may have inherited locks from the successor
of the updated record, which is not correct. This function restores the
right locks for the new supremum. */
static
void
btr_cur_pess_upd_restore_supremum(
/*==============================*/
	buf_block_t*	block,	/*!< in: buffer block of rec */
	const rec_t*	rec,	/*!< in: updated record */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_t*		page;
	buf_block_t*	prev_block;
	ulint		space;
	ulint		zip_size;
	ulint		prev_page_no;

	page = buf_block_get_frame(block);

	if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
		/* Updated record is not the first user record on its page */

		return;
	}

	space = buf_block_get_space(block);
	zip_size = buf_block_get_zip_size(block);
	prev_page_no = btr_page_get_prev(page, mtr);

	ut_ad(prev_page_no != FIL_NULL);
	prev_block = buf_page_get_with_no_latch(space, zip_size,
						prev_page_no, mtr);
#ifdef UNIV_BTR_DEBUG
	ut_a(btr_page_get_next(prev_block->frame, mtr)
	     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

	/* We must already have an x-latch on prev_block! */
	ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));

	lock_rec_reset_and_inherit_gap_locks(prev_block, block,
					     PAGE_HEAP_NO_SUPREMUM,
					     page_rec_get_heap_no(rec));
}

/*************************************************************//**
Performs an update of a record on a page of a tree. It is assumed
that mtr holds an x-latch on the tree and on the cursor page. If the
update is made on the leaf level, to avoid deadlocks, mtr must also
own x-latches to brothers of page, if those brothers exist. We assume
here that the ordering fields of the record do not change.
@return	DB_SUCCESS or error code */
UNIV_INTERN
ulint
btr_cur_pessimistic_update(
/*=======================*/
	ulint		flags,	/*!< in: undo logging, locking, and rollback
				flags */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to update */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
				be stored externally by the caller, or NULL */
	const upd_t*	update,	/*!< in: update vector; this is also allowed
				to contain trx id and roll ptr fields, but
				the values in the update vector have no
				effect */
	ulint		cmpl_info,/*!< in: compiler info on secondary index
				updates */
	que_thr_t*	thr,	/*!< in: query thread */
	mtr_t*		mtr)	/*!< in: mtr; must be committed before
				latching any further pages */
{
	big_rec_t*	big_rec_vec	= NULL;
	big_rec_t*	dummy_big_rec;
	dict_index_t*	index;
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;
	rec_t*		rec;
	page_cur_t*	page_cursor;
	dtuple_t*	new_entry;
	ulint		err;
	ulint		optim_err;
	roll_ptr_t	roll_ptr;
	trx_t*		trx;
	ibool		was_first;
	ulint		n_extents	= 0;
	ulint		n_reserved;
	ulint		n_ext;
	ulint*		offsets		= NULL;

	*big_rec = NULL;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	page_zip = buf_block_get_page_zip(block);
	rec = btr_cur_get_rec(cursor);
	index = cursor->index;

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
	/* The insert buffer tree should never be updated in place. */
	ut_ad(!dict_index_is_ibuf(index));

	optim_err = btr_cur_optimistic_update(flags, cursor, update,
					      cmpl_info, thr, mtr);

	switch (optim_err) {
	case DB_UNDERFLOW:
	case DB_OVERFLOW:
	case DB_ZIP_OVERFLOW:
		break;
	default:
		return(optim_err);
	}

	/* Do lock checking and undo logging */
	err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
					thr, mtr, &roll_ptr);
	if (err != DB_SUCCESS) {

		return(err);
	}

	if (optim_err == DB_OVERFLOW) {
		ulint	reserve_flag;

		/* First reserve enough free space for the file segments
		of the index tree, so that the update will not fail because
		of lack of space */

		n_extents = cursor->tree_height / 16 + 3;

		if (flags & BTR_NO_UNDO_LOG_FLAG) {
			reserve_flag = FSP_CLEANING;
		} else {
			reserve_flag = FSP_NORMAL;
		}

		if (!fsp_reserve_free_extents(&n_reserved, index->space,
					      n_extents, reserve_flag, mtr)) {
			return(DB_OUT_OF_FILE_SPACE);
		}
	}

	if (!*heap) {
		*heap = mem_heap_create(1024);
	}
	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, heap);

	trx = thr_get_trx(thr);

	new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
					   &n_ext, *heap);
	/* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above
	invokes rec_offs_make_valid() to point to the copied record that
	the fields of new_entry point to.  We have to undo it here. */
	ut_ad(rec_offs_validate(NULL, index, offsets));
	rec_offs_make_valid(rec, index, offsets);

	/* The page containing the clustered index record
	corresponding to new_entry is latched in mtr.  If the
	clustered index record is delete-marked, then its externally
	stored fields cannot have been purged yet, because then the
	purge would also have removed the clustered index record
	itself.  Thus the following call is safe. */
	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
						     FALSE, *heap);
	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
					      roll_ptr);
		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
					      trx->id);
	}

	if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(offsets)) {
		/* We are in a transaction rollback undoing a row
		update: we must free possible externally stored fields
		which got new values in the update, if they are not
		inherited values. They can be inherited if we have
		updated the primary key to another value, and then
		update it back again. */

		ut_ad(big_rec_vec == NULL);

		btr_rec_free_updated_extern_fields(
			index, rec, page_zip, offsets, update,
			trx_is_recv(trx) ? RB_RECOVERY : RB_NORMAL, mtr);
	}

	/* We have to set appropriate extern storage bits in the new
	record to be inserted: we have to remember which fields were such */

	ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, heap);
	n_ext += btr_push_update_extern_fields(new_entry, update, *heap);

	if (UNIV_LIKELY_NULL(page_zip)) {
		ut_ad(page_is_comp(page));
		if (page_zip_rec_needs_ext(
			    rec_get_converted_size(index, new_entry, n_ext),
			    TRUE,
			    dict_index_get_n_fields(index),
			    page_zip_get_size(page_zip))) {

			goto make_external;
		}
	} else if (page_zip_rec_needs_ext(
			   rec_get_converted_size(index, new_entry, n_ext),
			   page_is_comp(page), 0, 0)) {
make_external:
		big_rec_vec = dtuple_convert_big_rec(index, new_entry, &n_ext);
		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {

			err = DB_TOO_BIG_RECORD;
			goto return_after_reservations;
		}
	}

	/* Store state of explicit locks on rec on the page infimum record,
	before deleting rec. The page infimum acts as a dummy carrier of the
	locks, taking care also of lock releases, before we can move the locks
	back on the actual record. There is a special case: if we are
	inserting on the root page and the insert causes a call of
	btr_root_raise_and_insert. Therefore we cannot in the lock system
	delete the lock structs set on the root page even if the root
	page carries just node pointers. */

	lock_rec_store_on_page_infimum(block, rec);

	btr_search_update_hash_on_delete(cursor);

#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
	page_cursor = btr_cur_get_page_cur(cursor);

	page_cur_delete_rec(page_cursor, index, offsets, mtr);

	page_cur_move_to_prev(page_cursor);

	rec = btr_cur_insert_if_possible(cursor, new_entry, n_ext, mtr);

	if (rec) {
		lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
						   rec, block);

		offsets = rec_get_offsets(rec, index, offsets,
					  ULINT_UNDEFINED, heap);

		if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
			/* The new inserted record owns its possible externally
			stored fields */
			btr_cur_unmark_extern_fields(page_zip,
						     rec, index, offsets, mtr);
		}

		btr_cur_compress_if_useful(cursor, mtr);

		if (page_zip && !dict_index_is_clust(index)
		    && page_is_leaf(page)) {
			/* Update the free bits in the insert buffer. */
			ibuf_update_free_bits_zip(block, mtr);
		}

		err = DB_SUCCESS;
		goto return_after_reservations;
	} else {
		ut_a(optim_err != DB_UNDERFLOW);

		/* Out of space: reset the free bits. */
		if (!dict_index_is_clust(index)
		    && page_is_leaf(page)) {
			ibuf_reset_free_bits(block);
		}
	}

	/* Was the record to be updated positioned as the first user
	record on its page? */
	was_first = page_cur_is_before_first(page_cursor);

	/* The first parameter means that no lock checking and undo logging
	is made in the insert */

	err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
					 | BTR_NO_LOCKING_FLAG
					 | BTR_KEEP_SYS_FLAG,
					 cursor, new_entry, &rec,
					 &dummy_big_rec, n_ext, NULL, mtr);
	ut_a(rec);
	ut_a(err == DB_SUCCESS);
	ut_a(dummy_big_rec == NULL);

	if (dict_index_is_sec_or_ibuf(index)) {
		/* Update PAGE_MAX_TRX_ID in the index page header.
		It was not updated by btr_cur_pessimistic_insert()
		because of BTR_NO_LOCKING_FLAG. */
		buf_block_t*	rec_block;

		rec_block = btr_cur_get_block(cursor);

		page_update_max_trx_id(rec_block,
				       buf_block_get_page_zip(rec_block),
				       trx->id, mtr);
	}

	if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
		/* The new inserted record owns its possible externally
		stored fields */
		buf_block_t*	rec_block = btr_cur_get_block(cursor);

#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page));
		page = buf_block_get_frame(rec_block);
#endif /* UNIV_ZIP_DEBUG */
		page_zip = buf_block_get_page_zip(rec_block);

		offsets = rec_get_offsets(rec, index, offsets,
					  ULINT_UNDEFINED, heap);
		btr_cur_unmark_extern_fields(page_zip,
					     rec, index, offsets, mtr);
	}

	lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
					   rec, block);

	/* If necessary, restore also the correct lock state for a new,
	preceding supremum record created in a page split. While the old
	record was nonexistent, the supremum might have inherited its locks
	from a wrong record. */

	if (!was_first) {
		btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
						  rec, mtr);
	}

return_after_reservations:
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

	if (n_extents > 0) {
		fil_space_release_free_extents(index->space, n_reserved);
	}

	*big_rec = big_rec_vec;

	return(err);
}
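
/* The *big_rec out parameter shifts BLOB work to the caller: when a
big_rec_t is returned, the record has been updated but its longest
fields still have to be written to external BLOB pages.  A schematic
caller (illustrative only; the real callers live in row0upd.c, and
the exact argument lists are elided here):

	err = btr_cur_pessimistic_update(flags, cursor, &heap, &big_rec,
					 update, cmpl_info, thr, mtr);
	if (big_rec) {
		... store the fields with
		btr_store_big_rec_extern_fields(), then release the
		vector with dtuple_big_rec_free(big_rec) ...
	}
*/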

/*==================== B-TREE DELETE MARK AND UNMARK ===============*/

/****************************************************************//**
Writes the redo log record for delete marking or unmarking of an index
record. */
UNIV_INLINE
void
btr_cur_del_mark_set_clust_rec_log(
/*===============================*/
	ulint		flags,	/*!< in: flags */
	rec_t*		rec,	/*!< in: record */
	dict_index_t*	index,	/*!< in: index of the record */
	ibool		val,	/*!< in: value to set */
	trx_t*		trx,	/*!< in: deleting transaction */
	roll_ptr_t	roll_ptr,/*!< in: roll ptr to the undo log record */
	mtr_t*		mtr)	/*!< in: mtr */
{
	byte*	log_ptr;
	ut_ad(flags < 256);
	ut_ad(val <= 1);

	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));

	log_ptr = mlog_open_and_write_index(mtr, rec, index,
					    page_rec_is_comp(rec)
					    ? MLOG_COMP_REC_CLUST_DELETE_MARK
					    : MLOG_REC_CLUST_DELETE_MARK,
					    1 + 1 + DATA_ROLL_PTR_LEN
					    + 14 + 2);

	if (!log_ptr) {
		/* Logging in mtr is switched off during crash recovery */
		return;
	}

	mach_write_to_1(log_ptr, flags);
	log_ptr++;
	mach_write_to_1(log_ptr, val);
	log_ptr++;

	log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
						mtr);
	mach_write_to_2(log_ptr, page_offset(rec));
	log_ptr += 2;

	mlog_close(mtr, log_ptr);
}
#endif /* !UNIV_HOTBACKUP */

/****************************************************************//**
Parses the redo log record for delete marking or unmarking of a clustered
index record.
@return	end of log record or NULL */
UNIV_INTERN
byte*
btr_cur_parse_del_mark_set_clust_rec(
/*=================================*/
	byte*		ptr,	/*!< in: buffer */
	byte*		end_ptr,/*!< in: buffer end */
	page_t*		page,	/*!< in/out: page or NULL */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
	dict_index_t*	index)	/*!< in: index corresponding to page */
{
	ulint		flags;
	ulint		val;
	ulint		pos;
	trx_id_t	trx_id;
	roll_ptr_t	roll_ptr;
	ulint		offset;
	rec_t*		rec;

	ut_ad(!page
	      || !!page_is_comp(page) == dict_table_is_comp(index->table));

	if (end_ptr < ptr + 2) {

		return(NULL);
	}

	flags = mach_read_from_1(ptr);
	ptr++;
	val = mach_read_from_1(ptr);
	ptr++;

	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);

	if (ptr == NULL) {

		return(NULL);
	}

	if (end_ptr < ptr + 2) {

		return(NULL);
	}

	offset = mach_read_from_2(ptr);
	ptr += 2;

	ut_a(offset <= UNIV_PAGE_SIZE);

	if (page) {
		rec = page + offset;

		/* We do not need to reserve btr_search_latch, as the page
		is only being recovered, and there cannot be a hash index to
		it. Besides, these fields are being updated in place
		and the adaptive hash index does not depend on them. */

		btr_rec_set_deleted_flag(rec, page_zip, val);

		if (!(flags & BTR_KEEP_SYS_FLAG)) {
			mem_heap_t*	heap		= NULL;
			ulint		offsets_[REC_OFFS_NORMAL_SIZE];
			rec_offs_init(offsets_);

			row_upd_rec_sys_fields_in_recovery(
				rec, page_zip,
				rec_get_offsets(rec, index, offsets_,
						ULINT_UNDEFINED, &heap),
				pos, trx_id, roll_ptr);
			if (UNIV_LIKELY_NULL(heap)) {
				mem_heap_free(heap);
			}
		}
	}

	return(ptr);
}
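
/* For reference, the redo record body parsed above mirrors what
btr_cur_del_mark_set_clust_rec_log() writes (a reading aid inferred
from the two functions):

	1 byte		flags
	1 byte		delete-mark value (0 or 1)
	variable	sys vals: field position of DB_TRX_ID, the trx
			id and the roll ptr
	2 bytes		offset of the record within the page

The size 1 + 1 + DATA_ROLL_PTR_LEN + 14 + 2 that the writer passes to
mlog_open_and_write_index() is an upper bound for these parts. */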

#ifndef UNIV_HOTBACKUP
/***********************************************************//**
Marks a clustered index record deleted. Writes an undo log record to
undo log on this delete marking. Writes in the trx id field the id
of the deleting transaction, and in the roll ptr field pointer to the
undo log record created.
@return	DB_SUCCESS, DB_LOCK_WAIT, or error number */
UNIV_INTERN
ulint
btr_cur_del_mark_set_clust_rec(
/*===========================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	buf_block_t*	block,	/*!< in/out: buffer block of the record */
	rec_t*		rec,	/*!< in/out: record */
	dict_index_t*	index,	/*!< in: clustered index of the record */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec) */
	ibool		val,	/*!< in: value to set */
	que_thr_t*	thr,	/*!< in: query thread */
	mtr_t*		mtr)	/*!< in: mtr */
{
	roll_ptr_t	roll_ptr;
	ulint		err;
	page_zip_des_t*	page_zip;
	trx_t*		trx;

	ut_ad(dict_index_is_clust(index));
	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	ut_ad(buf_block_get_frame(block) == page_align(rec));
	ut_ad(page_is_leaf(page_align(rec)));

#ifdef UNIV_DEBUG
	if (btr_cur_print_record_ops && thr) {
		btr_cur_trx_report(thr_get_trx(thr), index, "del mark ");
		rec_print_new(stderr, rec, offsets);
	}
#endif /* UNIV_DEBUG */

	ut_ad(dict_index_is_clust(index));
	ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));

	err = lock_clust_rec_modify_check_and_lock(flags, block,
						   rec, index, offsets, thr);

	if (err != DB_SUCCESS) {

		return(err);
	}

	err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
					    index, NULL, NULL, 0, rec,
					    &roll_ptr);
	if (err != DB_SUCCESS) {

		return(err);
	}

	/* The btr_search_latch is not needed here, because
	the adaptive hash index does not depend on the delete-mark
	and the delete-mark is being updated in place. */

	page_zip = buf_block_get_page_zip(block);

	btr_blob_dbg_set_deleted_flag(rec, index, offsets, val);
	btr_rec_set_deleted_flag(rec, page_zip, val);

	trx = thr_get_trx(thr);

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		row_upd_rec_sys_fields(rec, page_zip,
				       index, offsets, trx, roll_ptr);
	}

	btr_cur_del_mark_set_clust_rec_log(flags, rec, index, val, trx,
					   roll_ptr, mtr);

	return(err);
}

/****************************************************************//**
Writes the redo log record for a delete mark setting of a secondary
index record. */
UNIV_INLINE
void
btr_cur_del_mark_set_sec_rec_log(
/*=============================*/
	rec_t*		rec,	/*!< in: record */
	ibool		val,	/*!< in: value to set */
	mtr_t*		mtr)	/*!< in: mtr */
{
	byte*	log_ptr;
	ut_ad(val <= 1);

	log_ptr = mlog_open(mtr, 11 + 1 + 2);

	if (!log_ptr) {
		/* Logging in mtr is switched off during crash recovery:
		in that case mlog_open returns NULL */
		return;
	}

	log_ptr = mlog_write_initial_log_record_fast(
		rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
	mach_write_to_1(log_ptr, val);
	log_ptr++;

	mach_write_to_2(log_ptr, page_offset(rec));
	log_ptr += 2;

	mlog_close(mtr, log_ptr);
}
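
/* The secondary-index variant above is much simpler than the
clustered-index one: no flags byte and no system columns, because a
secondary index record carries none.  Its body (inferred from the
writer above and the parser below) is:

	1 byte		delete-mark value (0 or 1)
	2 bytes		offset of the record within the page

preceded by the usual initial log record header, for which
mlog_open(mtr, 11 + 1 + 2) reserves at most 11 bytes. */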
#endif /* !UNIV_HOTBACKUP */

/****************************************************************//**
Parses the redo log record for delete marking or unmarking of a secondary
index record.
@return	end of log record or NULL */
UNIV_INTERN
byte*
btr_cur_parse_del_mark_set_sec_rec(
/*===============================*/
	byte*		ptr,	/*!< in: buffer */
	byte*		end_ptr,/*!< in: buffer end */
	page_t*		page,	/*!< in/out: page or NULL */
	page_zip_des_t*	page_zip)/*!< in/out: compressed page, or NULL */
{
	ulint	val;
	ulint	offset;
	rec_t*	rec;

	if (end_ptr < ptr + 3) {

		return(NULL);
	}

	val = mach_read_from_1(ptr);
	ptr++;

	offset = mach_read_from_2(ptr);
	ptr += 2;

	ut_a(offset <= UNIV_PAGE_SIZE);

	if (page) {
		rec = page + offset;

		/* We do not need to reserve btr_search_latch, as the page
		is only being recovered, and there cannot be a hash index to
		it. Besides, the delete-mark flag is being updated in place
		and the adaptive hash index does not depend on it. */

		btr_rec_set_deleted_flag(rec, page_zip, val);
	}

	return(ptr);
}

#ifndef UNIV_HOTBACKUP
/***********************************************************//**
Sets a secondary index record delete mark to TRUE or FALSE.
@return	DB_SUCCESS, DB_LOCK_WAIT, or error number */
UNIV_INTERN
ulint
btr_cur_del_mark_set_sec_rec(
/*=========================*/
	ulint		flags,	/*!< in: locking flag */
	btr_cur_t*	cursor,	/*!< in: cursor */
	ibool		val,	/*!< in: value to set */
	que_thr_t*	thr,	/*!< in: query thread */
	mtr_t*		mtr)	/*!< in: mtr */
{
	buf_block_t*	block;
	rec_t*		rec;
	ulint		err;

	block = btr_cur_get_block(cursor);
	rec = btr_cur_get_rec(cursor);

#ifdef UNIV_DEBUG
	if (btr_cur_print_record_ops && thr) {
		btr_cur_trx_report(thr_get_trx(thr), cursor->index,
				   "del mark ");
		rec_print(stderr, rec, cursor->index);
	}
#endif /* UNIV_DEBUG */

	err = lock_sec_rec_modify_check_and_lock(flags,
						 btr_cur_get_block(cursor),
						 rec, cursor->index, thr, mtr);
	if (err != DB_SUCCESS) {

		return(err);
	}

	ut_ad(!!page_rec_is_comp(rec)
	      == dict_table_is_comp(cursor->index->table));

	/* We do not need to reserve btr_search_latch, as the
	delete-mark flag is being updated in place and the adaptive
	hash index does not depend on it. */
	btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);

	btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);

	return(DB_SUCCESS);
}

/***********************************************************//**
Clear a secondary index record's delete mark.  This function is only
used by the insert buffer insert merge mechanism. */
UNIV_INTERN
void
btr_cur_del_unmark_for_ibuf(
/*========================*/
	rec_t*		rec,		/*!< in/out: record to delete unmark */
	page_zip_des_t*	page_zip,	/*!< in/out: compressed page
					corresponding to rec, or NULL
					when the tablespace is
					uncompressed */
	mtr_t*		mtr)		/*!< in: mtr */
{
	/* We do not need to reserve btr_search_latch, as the page
	has just been read to the buffer pool and there cannot be
	a hash index to it.  Besides, the delete-mark flag is being
	updated in place and the adaptive hash index does not depend
	on it. */

	btr_rec_set_deleted_flag(rec, page_zip, FALSE);

	btr_cur_del_mark_set_sec_rec_log(rec, FALSE, mtr);
}

/*==================== B-TREE RECORD REMOVE =========================*/

/*************************************************************//**
Tries to compress a page of the tree if it seems useful. It is assumed
that mtr holds an x-latch on the tree and on the cursor page. To avoid
deadlocks, mtr must also own x-latches to brothers of page, if those
brothers exist. NOTE: it is assumed that the caller has reserved enough
free extents so that the compression will always succeed if done!
@return	TRUE if compression occurred */
UNIV_INTERN
ibool
btr_cur_compress_if_useful(
/*=======================*/
	btr_cur_t*	cursor,	/*!< in: cursor on the page to compress;
				cursor does not stay valid if compression
				occurs */
	mtr_t*		mtr)	/*!< in: mtr */
{
	ut_ad(mtr_memo_contains(mtr,
				dict_index_get_lock(btr_cur_get_index(cursor)),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
				MTR_MEMO_PAGE_X_FIX));

	return(btr_cur_compress_recommendation(cursor, mtr)
	       && btr_compress(cursor, mtr));
}

/*******************************************************//**
Removes the record on which the tree cursor is positioned on a leaf page.
It is assumed that the mtr has an x-latch on the page where the cursor is
positioned, but no latch on the whole tree.
@return	TRUE if success, i.e., the page did not become too empty */
UNIV_INTERN
ibool
btr_cur_optimistic_delete(
/*======================*/
	btr_cur_t*	cursor,	/*!< in: cursor on leaf page, on the record to
				delete; cursor stays valid: if deletion
				succeeds, on function exit it points to the
				successor of the deleted record */
	mtr_t*		mtr)	/*!< in: mtr; if this function returns
				TRUE on a leaf page of a secondary
				index, the mtr must be committed
				before latching any further pages */
{
	buf_block_t*	block;
	rec_t*		rec;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	ibool		no_compress_needed;
	rec_offs_init(offsets_);

	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
				MTR_MEMO_PAGE_X_FIX));
	/* This is intended only for leaf page deletions */

	block = btr_cur_get_block(cursor);

	ut_ad(page_is_leaf(buf_block_get_frame(block)));

	rec = btr_cur_get_rec(cursor);
	offsets = rec_get_offsets(rec, cursor->index, offsets,
				  ULINT_UNDEFINED, &heap);

	no_compress_needed = !rec_offs_any_extern(offsets)
		&& btr_cur_can_delete_without_compress(
			cursor, rec_offs_size(offsets), mtr);

	if (no_compress_needed) {

		page_t*		page	= buf_block_get_frame(block);
		page_zip_des_t*	page_zip= buf_block_get_page_zip(block);
		ulint		max_ins	= 0;

		lock_update_delete(block, rec);

		btr_search_update_hash_on_delete(cursor);

		if (!page_zip) {
			max_ins = page_get_max_insert_size_after_reorganize(
				page, 1);
		}
#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
		page_cur_delete_rec(btr_cur_get_page_cur(cursor),
				    cursor->index, offsets, mtr);
#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

		if (dict_index_is_clust(cursor->index)
		    || dict_index_is_ibuf(cursor->index)
		    || !page_is_leaf(page)) {
			/* The insert buffer does not handle
			inserts to clustered indexes, to
			non-leaf pages of secondary index B-trees,
			or to the insert buffer. */
		} else if (page_zip) {
			ibuf_update_free_bits_zip(block, mtr);
		} else {
			ibuf_update_free_bits_low(block, max_ins, mtr);
		}
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	return(no_compress_needed);
}
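
/* Deletion escalates the same way updates do.  A schematic caller
(illustrative only; the real callers, e.g. the purge routines, live
outside this file):

	if (btr_cur_optimistic_delete(&cursor, &mtr)) {
		done: the record was removed without restructuring
	} else {
		commit the mtr, restart it with an x-latch on the
		whole tree, reposition the cursor, and call
		btr_cur_pessimistic_delete(&err, FALSE, &cursor,
					   rb_ctx, &mtr);
	}
*/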

/*************************************************************//**
Removes the record on which the tree cursor is positioned. Tries
to compress the page if its fillfactor drops below a threshold
or if it is the only page on the level. It is assumed that mtr holds
an x-latch on the tree and on the cursor page. To avoid deadlocks,
mtr must also own x-latches to brothers of page, if those brothers
exist.
@return	TRUE if compression occurred */
UNIV_INTERN
ibool
btr_cur_pessimistic_delete(
/*=======================*/
	ulint*		err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
				the latter may occur because we may have
				to update node pointers on upper levels,
				and in the case of variable length keys
				these may actually grow in size */
	ibool		has_reserved_extents, /*!< in: TRUE if the
				caller has already reserved enough free
				extents so that the operation is known
				to succeed */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
				if compression does not occur, the cursor
				stays valid: it points to successor of
				deleted record on function exit */
	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
	mtr_t*		mtr)	/*!< in: mtr */
{
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;
	dict_index_t*	index;
	rec_t*		rec;
	dtuple_t*	node_ptr;
	ulint		n_extents	= 0;
	ulint		n_reserved;
	ibool		success;
	ibool		ret		= FALSE;
	ulint		level;
	mem_heap_t*	heap;
	ulint*		offsets;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	index = btr_cur_get_index(cursor);

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	if (!has_reserved_extents) {
		/* First reserve enough free space for the file segments
		of the index tree, so that the node pointer updates will
		not fail because of lack of space */

		n_extents = cursor->tree_height / 32 + 1;

		success = fsp_reserve_free_extents(&n_reserved,
						   index->space,
						   n_extents,
						   FSP_CLEANING, mtr);
		if (!success) {
			*err = DB_OUT_OF_FILE_SPACE;

			return(FALSE);
		}
	}

	heap = mem_heap_create(1024);
	rec = btr_cur_get_rec(cursor);
	page_zip = buf_block_get_page_zip(block);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);

	if (rec_offs_any_extern(offsets)) {
		btr_rec_free_externally_stored_fields(index,
						      rec, offsets, page_zip,
						      rb_ctx, mtr);
#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
	}

	if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
	    && UNIV_UNLIKELY(dict_index_get_page(index)
			     != buf_block_get_page_no(block))) {

		/* If there is only one record, drop the whole page in
		btr_discard_page, if this is not the root page */

		btr_discard_page(cursor, mtr);

		*err = DB_SUCCESS;
		ret = TRUE;

		goto return_after_reservations;
	}

	lock_update_delete(block, rec);
	level = btr_page_get_level(page, mtr);

	if (level > 0
	    && UNIV_UNLIKELY(rec == page_rec_get_next(
				     page_get_infimum_rec(page)))) {

		rec_t*	next_rec = page_rec_get_next(rec);

		if (btr_page_get_prev(page, mtr) == FIL_NULL) {

			/* If we delete the leftmost node pointer on a
			non-leaf level, we must mark the new leftmost node
			pointer as the predefined minimum record */

			/* This will make page_zip_validate() fail until
			page_cur_delete_rec() completes.  This is harmless,
			because everything will take place within a single
			mini-transaction and because writing to the redo log
			is an atomic operation (performed by mtr_commit()). */
			btr_set_min_rec_mark(next_rec, mtr);
		} else {
			/* Otherwise, if we delete the leftmost node pointer
			on a page, we have to change the father node pointer
			so that it is equal to the new leftmost node pointer
			on the page */

			btr_node_ptr_delete(index, block, mtr);

			node_ptr = dict_index_build_node_ptr(
				index, next_rec, buf_block_get_page_no(block),
				heap, level);

			btr_insert_on_non_leaf_level(index,
						     level + 1, node_ptr, mtr);
		}
	}

	btr_search_update_hash_on_delete(cursor);

	page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

	ut_ad(btr_check_node_ptr(index, block, mtr));

	*err = DB_SUCCESS;

return_after_reservations:
	mem_heap_free(heap);

	if (ret == FALSE) {
		ret = btr_cur_compress_if_useful(cursor, mtr);
	}

	if (n_extents > 0) {
		fil_space_release_free_extents(index->space, n_reserved);
	}

	return(ret);
}

/*******************************************************************//**
Adds path information to the cursor for the current page, for which
the binary search has been performed. */
static
void
btr_cur_add_path_info(
/*==================*/
	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
	ulint		height,		/*!< in: height of the page in tree;
					0 means leaf node */
	ulint		root_height)	/*!< in: root node height in tree */
{
	btr_path_t*	slot;
	rec_t*		rec;

	ut_a(cursor->path_arr);

	if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
		/* Do nothing; return empty path */

		slot = cursor->path_arr;
		slot->nth_rec = ULINT_UNDEFINED;

		return;
	}

	if (height == 0) {
		/* Mark end of slots for path */
		slot = cursor->path_arr + root_height + 1;
		slot->nth_rec = ULINT_UNDEFINED;
	}

	rec = btr_cur_get_rec(cursor);

	slot = cursor->path_arr + (root_height - height);

	slot->nth_rec = page_rec_get_n_recs_before(rec);
	slot->n_recs = page_get_n_recs(page_align(rec));
}

/*******************************************************************//**
Estimates the number of rows in a given index range.
@return	estimated number of rows */
UNIV_INTERN
ib_int64_t
btr_estimate_n_rows_in_range(
/*=========================*/
	dict_index_t*	index,	/*!< in: index */
	const dtuple_t*	tuple1,	/*!< in: range start, may also be empty tuple */
	ulint		mode1,	/*!< in: search mode for range start */
	const dtuple_t*	tuple2,	/*!< in: range end, may also be empty tuple */
	ulint		mode2)	/*!< in: search mode for range end */
{
	btr_path_t	path1[BTR_PATH_ARRAY_N_SLOTS];
	btr_path_t	path2[BTR_PATH_ARRAY_N_SLOTS];
	btr_cur_t	cursor;
	btr_path_t*	slot1;
	btr_path_t*	slot2;
	ibool		diverged;
	ibool		diverged_lot;
	ulint		divergence_level;
	ib_int64_t	n_rows;
	ulint		i;
	mtr_t		mtr;

	mtr_start(&mtr);

	cursor.path_arr = path1;

	if (dtuple_get_n_fields(tuple1) > 0) {

		btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
					    &cursor, 0,
					    __FILE__, __LINE__, &mtr);
	} else {
		btr_cur_open_at_index_side(TRUE, index,
					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
					   &cursor, &mtr);
	}

	mtr_commit(&mtr);

	mtr_start(&mtr);

	cursor.path_arr = path2;

	if (dtuple_get_n_fields(tuple2) > 0) {

		btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
					    &cursor, 0,
					    __FILE__, __LINE__, &mtr);
	} else {
		btr_cur_open_at_index_side(FALSE, index,
					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
					   &cursor, &mtr);
	}

	mtr_commit(&mtr);

	/* We have the path information for the range in path1 and path2 */

	n_rows = 1;
	diverged = FALSE;	    /* This becomes true when the path is not
				    the same any more */
	diverged_lot = FALSE;	    /* This becomes true when the paths are
				    not the same or adjacent any more */
	divergence_level = 1000000; /* This is the level where paths diverged
				    a lot */
	for (i = 0; ; i++) {
		ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);

		slot1 = path1 + i;
		slot2 = path2 + i;

		if (slot1->nth_rec == ULINT_UNDEFINED
		    || slot2->nth_rec == ULINT_UNDEFINED) {

			if (i > divergence_level + 1) {
				/* In trees whose height is > 1 our algorithm
				tends to underestimate: multiply the estimate
				by 2: */

				n_rows = n_rows * 2;
			}

			/* Do not estimate the number of rows in the range
			to over 1 / 2 of the estimated rows in the whole
			table */

			if (n_rows > index->table->stat_n_rows / 2) {
				n_rows = index->table->stat_n_rows / 2;

				/* If there are just 0 or 1 rows in the table,
				then we estimate all rows are in the range */

				if (n_rows == 0) {
					n_rows = index->table->stat_n_rows;
				}
			}

			return(n_rows);
		}

		if (!diverged && slot1->nth_rec != slot2->nth_rec) {

			diverged = TRUE;

			if (slot1->nth_rec < slot2->nth_rec) {
				n_rows = slot2->nth_rec - slot1->nth_rec;

				if (n_rows > 1) {
					diverged_lot = TRUE;
					divergence_level = i;
				}
			} else {
				/* Maybe the tree has changed between
				searches */

				return(10);
			}

		} else if (diverged && !diverged_lot) {

			if (slot1->nth_rec < slot1->n_recs
			    || slot2->nth_rec > 1) {

				diverged_lot = TRUE;
				divergence_level = i;

				n_rows = 0;

				if (slot1->nth_rec < slot1->n_recs) {
					n_rows += slot1->n_recs
						- slot1->nth_rec;
				}

				if (slot2->nth_rec > 1) {
					n_rows += slot2->nth_rec - 1;
				}
			}
		} else if (diverged_lot) {

			n_rows = (n_rows * (slot1->n_recs + slot2->n_recs))
				/ 2;
		}
	}
}
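
/* A worked example of the estimation loop above (illustrative
numbers): suppose the two search paths share the root and, one level
down, path1 passes through the 3rd node pointer and path2 through the
7th (slot1->nth_rec = 3, slot2->nth_rec = 7).  The paths diverge, so
n_rows = 7 - 3 = 4 subtrees lie between them, and divergence_level is
set to that level because n_rows > 1.  If the pages on the next level
hold about 100 records each (slot1->n_recs = slot2->n_recs = 100),
the diverged_lot branch refines the estimate to
n_rows * (100 + 100) / 2 = 400, and so on for each further level
until a slot marks the end of the path.  The final figure may be
doubled once for deep divergence and is always capped at half of
index->table->stat_n_rows. */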

/*******************************************************************//**
Record the number of non_null key values in a given index for
each n-column prefix of the index where n < dict_index_get_n_unique(index).
The estimates are eventually stored in the array:
index->stat_n_non_null_key_vals. */
static
void
btr_record_not_null_field_in_rec(
/*=============================*/
	ulint		n_unique,	/*!< in: dict_index_get_n_unique(index),
					number of columns that uniquely
					determine an index entry */
	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
					its size could be for all fields or
					that of "n_unique" */
	ib_int64_t*	n_not_null)	/*!< in/out: array to record number of
					not null rows for n-column prefix */
{
	ulint	i;

	ut_ad(rec_offs_n_fields(offsets) >= n_unique);

	if (n_not_null == NULL) {
		return;
	}

	for (i = 0; i < n_unique; i++) {
		if (rec_offs_nth_sql_null(offsets, i)) {
			break;
		}

		n_not_null[i]++;
	}
}

/*******************************************************************//**
Estimates the number of different key values in a given index, for
each n-column prefix of the index where n <= dict_index_get_n_unique(index).
The estimates are stored in the array index->stat_n_diff_key_vals.
If innodb_stats_method is "nulls_ignored", we also record the number of
non-null values for each prefix and store the estimates in
array index->stat_n_non_null_key_vals. */
UNIV_INTERN
void
btr_estimate_number_of_different_key_vals(
/*======================================*/
	dict_index_t*	index)	/*!< in: index */
{
	btr_cur_t	cursor;
	page_t*		page;
	rec_t*		rec;
	ulint		n_cols;
	ulint		matched_fields;
	ulint		matched_bytes;
	ib_int64_t*	n_diff;
	ib_int64_t*	n_not_null;
	ibool		stats_null_not_equal;
	ullint		n_sample_pages; /* number of pages to sample */
	ulint		not_empty_flag	= 0;
	ulint		total_external_size = 0;
	ulint		i;
	ulint		j;
	ullint		add_on;
	mtr_t		mtr;
	mem_heap_t*	heap		= NULL;
	ulint*		offsets_rec	= NULL;
	ulint*		offsets_next_rec = NULL;

	n_cols = dict_index_get_n_unique(index);

	heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
			       * (n_cols + 1)
			       + dict_index_get_n_fields(index)
			       * (sizeof *offsets_rec
				  + sizeof *offsets_next_rec));

	n_diff = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_int64_t));

	n_not_null = NULL;

	/* Check the srv_innodb_stats_method setting, and decide whether
	we need to record non-null values and whether NULLs are considered
	equal (by setting the stats_null_not_equal value) */
	switch (srv_innodb_stats_method) {
	case SRV_STATS_NULLS_IGNORED:
		n_not_null = mem_heap_zalloc(heap, (n_cols + 1)
					     * sizeof *n_not_null);
		/* fall through */

	case SRV_STATS_NULLS_UNEQUAL:
		/* for both the SRV_STATS_NULLS_IGNORED and
		SRV_STATS_NULLS_UNEQUAL cases, we will treat NULLs as
		unequal values */
		stats_null_not_equal = TRUE;
		break;

	case SRV_STATS_NULLS_EQUAL:
		stats_null_not_equal = FALSE;
		break;

	default:
		ut_error;
	}

	/* It makes no sense to test more pages than are contained
	in the index, thus we lower the number if it is too high */
	if (srv_stats_sample_pages > index->stat_index_size) {
		if (index->stat_index_size > 0) {
			n_sample_pages = index->stat_index_size;
		} else {
			n_sample_pages = 1;
		}
	} else {
		n_sample_pages = srv_stats_sample_pages;
	}

	/* We sample some pages in the index to get an estimate */

	for (i = 0; i < n_sample_pages; i++) {
		mtr_start(&mtr);

		btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr);

		/* Count the number of different key values for each prefix of
		the key on this index page. If the prefix does not determine
		the index record uniquely in the B-tree, then we subtract one
		because otherwise our algorithm would give a wrong estimate
		for an index where there is just one key value. */

		page = btr_cur_get_page(&cursor);

		rec = page_rec_get_next(page_get_infimum_rec(page));

		if (!page_rec_is_supremum(rec)) {
			not_empty_flag = 1;
			offsets_rec = rec_get_offsets(rec, index, offsets_rec,
3361
						      ULINT_UNDEFINED, &heap);
1.1.5 by Marc Deslauriers
Import upstream version 5.1.61
3362
3363
			if (n_not_null) {
3364
				btr_record_not_null_field_in_rec(
3365
					n_cols, offsets_rec, n_not_null);
3366
			}
0.2.4 by Norbert Tretkowski
Import upstream version 5.1.39
3367
		}
3368
1.1.5 by Marc Deslauriers
Import upstream version 5.1.61
3369
		while (!page_rec_is_supremum(rec)) {
0.2.4 by Norbert Tretkowski
Import upstream version 5.1.39
3370
			rec_t*	next_rec = page_rec_get_next(rec);
1.1.5 by Marc Deslauriers
Import upstream version 5.1.61
3371
			if (page_rec_is_supremum(next_rec)) {
3372
				total_external_size +=
3373
					btr_rec_get_externally_stored_len(
3374
						rec, offsets_rec);
0.2.4 by Norbert Tretkowski
Import upstream version 5.1.39
3375
				break;
3376
			}
3377
3378
			matched_fields = 0;
3379
			matched_bytes = 0;
3380
			offsets_next_rec = rec_get_offsets(next_rec, index,
3381
							   offsets_next_rec,
1.1.5 by Marc Deslauriers
Import upstream version 5.1.61
3382
							   ULINT_UNDEFINED,
3383
							   &heap);
0.2.4 by Norbert Tretkowski
Import upstream version 5.1.39
3384
3385
			cmp_rec_rec_with_match(rec, next_rec,
3386
					       offsets_rec, offsets_next_rec,
1.1.5 by Marc Deslauriers
Import upstream version 5.1.61
3387
					       index, stats_null_not_equal,
3388
					       &matched_fields,
0.2.4 by Norbert Tretkowski
Import upstream version 5.1.39
3389
					       &matched_bytes);
3390
3391
			for (j = matched_fields + 1; j <= n_cols; j++) {
3392
				/* We add one if this index record has
3393
				a different prefix from the previous */
3394
3395
				n_diff[j]++;
3396
			}
3397
1.1.5 by Marc Deslauriers
Import upstream version 5.1.61
3398
			if (n_not_null) {
3399
				btr_record_not_null_field_in_rec(
3400
					n_cols, offsets_next_rec, n_not_null);
3401
			}
3402
0.2.4 by Norbert Tretkowski
Import upstream version 5.1.39
3403
			total_external_size
3404
				+= btr_rec_get_externally_stored_len(
3405
					rec, offsets_rec);
3406
3407
			rec = next_rec;
3408
			/* Initialize offsets_rec for the next round
3409
			and assign the old offsets_rec buffer to
3410
			offsets_next_rec. */
3411
			{
3412
				ulint*	offsets_tmp = offsets_rec;
3413
				offsets_rec = offsets_next_rec;
3414
				offsets_next_rec = offsets_tmp;
3415
			}
3416
		}
3417
3418
3419
		if (n_cols == dict_index_get_n_unique_in_tree(index)) {
3420
3421
			/* If there is more than one leaf page in the tree,
3422
			we add one because we know that the first record
3423
			on the page certainly had a different prefix than the
3424
			last record on the previous index page in the
3425
			alphabetical order. Before this fix, if there was
3426
			just one big record on each clustered index page, the
3427
			algorithm grossly underestimated the number of rows
3428
			in the table. */
3429
3430
			if (btr_page_get_prev(page, &mtr) != FIL_NULL
3431
			    || btr_page_get_next(page, &mtr) != FIL_NULL) {
3432
3433
				n_diff[n_cols]++;
3434
			}
3435
		}
3436
3437
		mtr_commit(&mtr);
3438
	}
3439
3440
	/* If we saw k borders between different key values on
3441
	n_sample_pages leaf pages, we can estimate how many
3442
	there will be in index->stat_n_leaf_pages */
3443
3444
	/* We must take into account that our sample actually represents
3445
	also the pages used for external storage of fields (those pages are
3446
	included in index->stat_n_leaf_pages) */
3447
3448
	for (j = 0; j <= n_cols; j++) {
3449
		index->stat_n_diff_key_vals[j]
1.1.5 by Marc Deslauriers
Import upstream version 5.1.61
3450
			= BTR_TABLE_STATS_FROM_SAMPLE(
3451
				n_diff[j], index, n_sample_pages,
3452
				total_external_size, not_empty_flag); 
0.2.4 by Norbert Tretkowski
Import upstream version 5.1.39
3453
3454
		/* If the tree is small, smaller than
3455
		10 * n_sample_pages + total_external_size, then
3456
		the above estimate is ok. For bigger trees it is common that we
3457
		do not see any borders between key values in the few pages
3458
		we pick. But still there may be n_sample_pages
3459
		different key values, or even more. Let us try to approximate
3460
		that: */
3461
3462
		add_on = index->stat_n_leaf_pages
3463
			/ (10 * (n_sample_pages
3464
				 + total_external_size));
3465
3466
		if (add_on > n_sample_pages) {
3467
			add_on = n_sample_pages;
3468
		}
3469
3470
		index->stat_n_diff_key_vals[j] += add_on;
1.1.5 by Marc Deslauriers
Import upstream version 5.1.61
3471
3472
		/* Update the stat_n_non_null_key_vals[] with our
3473
		sampled result. stat_n_non_null_key_vals[] is created
3474
		and initialized to zero in dict_index_add_to_cache(),
3475
		along with stat_n_diff_key_vals[] array */
3476
		if (n_not_null != NULL && (j < n_cols)) {
3477
			index->stat_n_non_null_key_vals[j] =
3478
				 BTR_TABLE_STATS_FROM_SAMPLE(
3479
					n_not_null[j], index, n_sample_pages,
3480
					total_external_size, not_empty_flag);
3481
		}
3482
	}
3483
3484
	mem_heap_free(heap);
0.2.4 by Norbert Tretkowski
Import upstream version 5.1.39
3485
}
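
#if 0
/* A minimal sketch (not compiled) of the extrapolation performed above.
It assumes BTR_TABLE_STATS_FROM_SAMPLE() scales the sampled border count
linearly to the whole leaf level; the real macro additionally accounts
for total_external_size and not_empty_flag. */
static
ib_int64_t
btr_sample_scale_sketch(
/*====================*/
	ib_int64_t	n_diff_sampled,	/*!< in: borders seen in the sample */
	ulint		n_leaf_pages,	/*!< in: index->stat_n_leaf_pages */
	ulint		n_sample_pages)	/*!< in: number of pages sampled */
{
	/* e.g. 40 borders on 8 sampled pages of a 1000-page leaf level
	extrapolate to roughly 40 * 1000 / 8 = 5000 distinct values */
	return(n_diff_sampled * (ib_int64_t) n_leaf_pages
	       / (ib_int64_t) n_sample_pages);
}
#endif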

/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/

/***********************************************************//**
Gets the offset of the pointer to the externally stored part of a field.
@return	offset of the pointer to the externally stored part */
static
ulint
btr_rec_get_field_ref_offs(
/*=======================*/
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	ulint		n)	/*!< in: index of the external field */
{
	ulint	field_ref_offs;
	ulint	local_len;

	ut_a(rec_offs_nth_extern(offsets, n));
	field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
	ut_a(local_len != UNIV_SQL_NULL);
	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
}

/** Gets a pointer to the externally stored part of a field.
@param rec	record
@param offsets	rec_get_offsets(rec)
@param n	index of the externally stored field
@return pointer to the externally stored part */
#define btr_rec_get_field_ref(rec, offsets, n)			\
	((rec) + btr_rec_get_field_ref_offs(offsets, n))
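
/* For reference: the field reference returned by the macro above is a
BTR_EXTERN_FIELD_REF_SIZE (20 byte) structure stored at the end of the
locally stored prefix of the column: the space id (4 bytes at
BTR_EXTERN_SPACE_ID), the page number of the first BLOB page (4 bytes at
BTR_EXTERN_PAGE_NO), the byte offset on that page (4 bytes at
BTR_EXTERN_OFFSET), and the 64-bit length of the externally stored part
(8 bytes at BTR_EXTERN_LEN); the most significant bits of the first
length byte carry BTR_EXTERN_OWNER_FLAG and BTR_EXTERN_INHERITED_FLAG. */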

/***********************************************************//**
Gets the externally stored size of a record, in units of a database page.
@return	externally stored part, in units of a database page */
static
ulint
btr_rec_get_externally_stored_len(
/*==============================*/
	const rec_t*	rec,	/*!< in: record */
	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
{
	ulint	n_fields;
	ulint	total_extern_len = 0;
	ulint	i;

	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));

	if (!rec_offs_any_extern(offsets)) {
		return(0);
	}

	n_fields = rec_offs_n_fields(offsets);

	for (i = 0; i < n_fields; i++) {
		if (rec_offs_nth_extern(offsets, i)) {

			ulint	extern_len = mach_read_from_4(
				btr_rec_get_field_ref(rec, offsets, i)
				+ BTR_EXTERN_LEN + 4);

			total_extern_len += ut_calc_align(extern_len,
							  UNIV_PAGE_SIZE);
		}
	}

	return(total_extern_len / UNIV_PAGE_SIZE);
}
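
/* Example: with the default UNIV_PAGE_SIZE of 16384, an externally
stored part of 70000 bytes is aligned up by ut_calc_align() to
81920 bytes, so it contributes 81920 / 16384 = 5 pages to the total. */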

/*******************************************************************//**
Sets the ownership bit of an externally stored field in a record. */
static
void
btr_cur_set_ownership_of_extern_field(
/*==================================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	rec_t*		rec,	/*!< in/out: clustered index record */
	dict_index_t*	index,	/*!< in: index of the page */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	ulint		i,	/*!< in: field number */
	ibool		val,	/*!< in: value to set */
	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
{
	byte*	data;
	ulint	local_len;
	ulint	byte_val;

	data = rec_get_nth_field(rec, offsets, i, &local_len);
	ut_ad(rec_offs_nth_extern(offsets, i));
	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);

	if (val) {
		byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
	} else {
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
		ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
		byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
	}

	if (UNIV_LIKELY_NULL(page_zip)) {
		mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
		page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
	} else if (UNIV_LIKELY(mtr != NULL)) {

		mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
				 MLOG_1BYTE, mtr);
	} else {
		mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
	}

	btr_blob_dbg_owner(rec, index, offsets, i, val);
}
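
/* Note that val == TRUE clears BTR_EXTERN_OWNER_FLAG (the record owns
the field) while val == FALSE sets it (the field is disowned); the flag
lives in the most significant bit of the first length byte, so for
example a byte value of 0x00 becomes 0x80 when ownership is given up. */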

/*******************************************************************//**
Marks non-updated off-page fields as disowned by this record. The ownership
must be transferred to the updated record which is inserted elsewhere in the
index tree. In purge only the owner of an externally stored field is allowed
to free the field. */
UNIV_INTERN
void
btr_cur_disown_inherited_fields(
/*============================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	rec_t*		rec,	/*!< in/out: record in a clustered index */
	dict_index_t*	index,	/*!< in: index of the page */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	const upd_t*	update,	/*!< in: update vector */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	ulint	i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
	ut_ad(rec_offs_any_extern(offsets));
	ut_ad(mtr);

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (rec_offs_nth_extern(offsets, i)
		    && !upd_get_field_by_field_no(update, i)) {
			btr_cur_set_ownership_of_extern_field(
				page_zip, rec, index, offsets, i, FALSE, mtr);
		}
	}
}

/*******************************************************************//**
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a record that
is not delete-marked always owns all its extern fields. */
static
void
btr_cur_unmark_extern_fields(
/*=========================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	rec_t*		rec,	/*!< in/out: record in a clustered index */
	dict_index_t*	index,	/*!< in: index of the page */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
{
	ulint	n;
	ulint	i;

	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
	n = rec_offs_n_fields(offsets);

	if (!rec_offs_any_extern(offsets)) {

		return;
	}

	for (i = 0; i < n; i++) {
		if (rec_offs_nth_extern(offsets, i)) {

			btr_cur_set_ownership_of_extern_field(
				page_zip, rec, index, offsets, i, TRUE, mtr);
		}
	}
}

/*******************************************************************//**
Flags the data tuple fields that are marked as extern storage in the
update vector.  We use this function to remember which fields we must
mark as extern storage in a record inserted for an update.
@return	number of flagged external columns */
UNIV_INTERN
ulint
btr_push_update_extern_fields(
/*==========================*/
	dtuple_t*	tuple,	/*!< in/out: data tuple */
	const upd_t*	update,	/*!< in: update vector */
	mem_heap_t*	heap)	/*!< in: memory heap */
{
	ulint			n_pushed	= 0;
	ulint			n;
	const upd_field_t*	uf;

	ut_ad(tuple);
	ut_ad(update);

	uf = update->fields;
	n = upd_get_n_fields(update);

	for (; n--; uf++) {
		if (dfield_is_ext(&uf->new_val)) {
			dfield_t*	field
				= dtuple_get_nth_field(tuple, uf->field_no);

			if (!dfield_is_ext(field)) {
				dfield_set_ext(field);
				n_pushed++;
			}

			switch (uf->orig_len) {
				byte*	data;
				ulint	len;
				byte*	buf;
			case 0:
				break;
			case BTR_EXTERN_FIELD_REF_SIZE:
				/* Restore the original locally stored
				part of the column.  In the undo log,
				InnoDB writes a longer prefix of externally
				stored columns, so that column prefixes
				in secondary indexes can be reconstructed. */
				dfield_set_data(field, (byte*) dfield_get_data(field)
						+ dfield_get_len(field)
						- BTR_EXTERN_FIELD_REF_SIZE,
						BTR_EXTERN_FIELD_REF_SIZE);
				dfield_set_ext(field);
				break;
			default:
				/* Reconstruct the original locally
				stored part of the column.  The data
				will have to be copied. */
				ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);

				data = dfield_get_data(field);
				len = dfield_get_len(field);

				buf = mem_heap_alloc(heap, uf->orig_len);
				/* Copy the locally stored prefix. */
				memcpy(buf, data,
				       uf->orig_len
				       - BTR_EXTERN_FIELD_REF_SIZE);
				/* Copy the BLOB pointer. */
				memcpy(buf + uf->orig_len
				       - BTR_EXTERN_FIELD_REF_SIZE,
				       data + len - BTR_EXTERN_FIELD_REF_SIZE,
				       BTR_EXTERN_FIELD_REF_SIZE);

				dfield_set_data(field, buf, uf->orig_len);
				dfield_set_ext(field);
			}
		}
	}

	return(n_pushed);
}
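
/* In the default case above, the reconstructed buffer has the layout
[first uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE bytes of the column
prefix | 20-byte BLOB pointer taken from the end of the current value],
i.e. the locally stored representation the column had before the update
wrote a longer prefix to the undo log. */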

/*******************************************************************//**
Returns the length of a BLOB part stored on the header page.
@return	part length */
static
ulint
btr_blob_get_part_len(
/*==================*/
	const byte*	blob_header)	/*!< in: blob header */
{
	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
}

/*******************************************************************//**
Returns the page number where the next BLOB part is stored.
@return	page number or FIL_NULL if no more pages */
static
ulint
btr_blob_get_next_page_no(
/*======================*/
	const byte*	blob_header)	/*!< in: blob header */
{
	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
}
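
/* An uncompressed BLOB page thus stores, starting at FIL_PAGE_DATA,
a 4-byte part length (BTR_BLOB_HDR_PART_LEN), a 4-byte next page number
(BTR_BLOB_HDR_NEXT_PAGE_NO), and then the payload.  With the default
16 kB UNIV_PAGE_SIZE this leaves UNIV_PAGE_SIZE - FIL_PAGE_DATA
- BTR_BLOB_HDR_SIZE - FIL_PAGE_DATA_END = 16384 - 38 - 8 - 8 = 16330
payload bytes per page, so for example a 100000-byte BLOB part occupies
7 chained pages. */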

/*******************************************************************//**
Deallocate a buffer block that was reserved for a BLOB part. */
static
void
btr_blob_free(
/*==========*/
	buf_block_t*	block,	/*!< in: buffer block */
	ibool		all,	/*!< in: TRUE=remove also the compressed page
				if there is one */
	mtr_t*		mtr)	/*!< in: mini-transaction to commit */
{
	ulint	space	= buf_block_get_space(block);
	ulint	page_no	= buf_block_get_page_no(block);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));

	mtr_commit(mtr);

	buf_pool_mutex_enter();
	mutex_enter(&block->mutex);

	/* Only free the block if it is still allocated to
	the same file page. */

	if (buf_block_get_state(block)
	    == BUF_BLOCK_FILE_PAGE
	    && buf_block_get_space(block) == space
	    && buf_block_get_page_no(block) == page_no) {

		if (!buf_LRU_free_block(&block->page, all)
		    && all && block->page.zip.data) {
			/* Attempt to deallocate the uncompressed page
			if the whole block cannot be deallocated. */

			buf_LRU_free_block(&block->page, FALSE);
		}
	}

	buf_pool_mutex_exit();
	mutex_exit(&block->mutex);
}
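
/* The block identity re-check above is needed because mtr_commit()
releases the page latch before the buffer pool mutex is acquired; in
that window the block may have been evicted and reused for another file
page, in which case it must not be freed here. */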

/*******************************************************************//**
Stores the fields in big_rec_vec to the tablespace and puts pointers to
them in rec.  The extern flags in rec will have to be set beforehand.
The fields are stored on pages allocated from the leaf node
file segment of the index tree.
@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
UNIV_INTERN
ulint
btr_store_big_rec_extern_fields_func(
/*=================================*/
	dict_index_t*	index,		/*!< in: index of rec; the index tree
					MUST be X-latched */
	buf_block_t*	rec_block,	/*!< in/out: block containing rec */
	rec_t*		rec,		/*!< in/out: record */
	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index);
					the "external storage" flags in offsets
					will not correspond to rec when
					this function returns */
#ifdef UNIV_DEBUG
	mtr_t*		local_mtr,	/*!< in: mtr containing the
					latch to rec and to the tree */
#endif /* UNIV_DEBUG */
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
	ibool		update_in_place,/*!< in: TRUE if the record is updated
					in place (not delete+insert) */
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
	const big_rec_t*big_rec_vec)	/*!< in: vector containing fields
					to be stored externally */

{
	ulint	rec_page_no;
	byte*	field_ref;
	ulint	extern_len;
	ulint	store_len;
	ulint	page_no;
	ulint	space_id;
	ulint	zip_size;
	ulint	prev_page_no;
	ulint	hint_page_no;
	ulint	i;
	mtr_t	mtr;
	mem_heap_t* heap = NULL;
	page_zip_des_t*	page_zip;
	z_stream c_stream;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(rec_offs_any_extern(offsets));
	ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
	ut_a(dict_index_is_clust(index));

	page_zip = buf_block_get_page_zip(rec_block);
	ut_a(dict_table_zip_size(index->table)
	     == buf_block_get_zip_size(rec_block));

	space_id = buf_block_get_space(rec_block);
	zip_size = buf_block_get_zip_size(rec_block);
	rec_page_no = buf_block_get_page_no(rec_block);
	ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX);

	if (UNIV_LIKELY_NULL(page_zip)) {
		int	err;

		/* Zlib deflate needs 128 kilobytes for the default
		window size, plus 512 << memLevel, plus a few
		kilobytes for small objects.  We use reduced memLevel
		to limit the memory consumption, and preallocate the
		heap, hoping to avoid memory fragmentation. */
		heap = mem_heap_create(250000);
		page_zip_set_alloc(&c_stream, heap);

		err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION,
				   Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
		ut_a(err == Z_OK);
	}

#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
	/* All pointers to externally stored columns in the record
	must either be zero or they must be pointers to inherited
	columns, owned by this record or an earlier record version. */
	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (!rec_offs_nth_extern(offsets, i)) {
			continue;
		}
		field_ref = btr_rec_get_field_ref(rec, offsets, i);

		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
		/* Either this must be an update in place,
		or the BLOB must be inherited, or the BLOB pointer
		must be zero (will be written in this function). */
		ut_a(update_in_place
		     || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
		     || !memcmp(field_ref, field_ref_zero,
				BTR_EXTERN_FIELD_REF_SIZE));
	}
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
	/* We have to create a file segment to the tablespace
	for each field and put the pointer to the field in rec */

	for (i = 0; i < big_rec_vec->n_fields; i++) {
		field_ref = btr_rec_get_field_ref(
			rec, offsets, big_rec_vec->fields[i].field_no);
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
		/* A zero BLOB pointer should have been initially inserted. */
		ut_a(!memcmp(field_ref, field_ref_zero,
			     BTR_EXTERN_FIELD_REF_SIZE));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
		extern_len = big_rec_vec->fields[i].len;
		UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
				   extern_len);

		ut_a(extern_len > 0);

		prev_page_no = FIL_NULL;

		if (UNIV_LIKELY_NULL(page_zip)) {
			int	err = deflateReset(&c_stream);
			ut_a(err == Z_OK);

			c_stream.next_in = (void*) big_rec_vec->fields[i].data;
			c_stream.avail_in = extern_len;
		}

		for (;;) {
			buf_block_t*	block;
			page_t*		page;

			mtr_start(&mtr);

			if (prev_page_no == FIL_NULL) {
				hint_page_no = 1 + rec_page_no;
			} else {
				hint_page_no = prev_page_no + 1;
			}

			block = btr_page_alloc(index, hint_page_no,
					       FSP_NO_DIR, 0, &mtr);
			if (UNIV_UNLIKELY(block == NULL)) {

				mtr_commit(&mtr);

				if (UNIV_LIKELY_NULL(page_zip)) {
					deflateEnd(&c_stream);
					mem_heap_free(heap);
				}

				return(DB_OUT_OF_FILE_SPACE);
			}

			page_no = buf_block_get_page_no(block);
			page = buf_block_get_frame(block);

			if (prev_page_no != FIL_NULL) {
				buf_block_t*	prev_block;
				page_t*		prev_page;

				prev_block = buf_page_get(space_id, zip_size,
							  prev_page_no,
							  RW_X_LATCH, &mtr);
				buf_block_dbg_add_level(prev_block,
							SYNC_EXTERN_STORAGE);
				prev_page = buf_block_get_frame(prev_block);

				if (UNIV_LIKELY_NULL(page_zip)) {
					mlog_write_ulint(
						prev_page + FIL_PAGE_NEXT,
						page_no, MLOG_4BYTES, &mtr);
					memcpy(buf_block_get_page_zip(
						       prev_block)
					       ->data + FIL_PAGE_NEXT,
					       prev_page + FIL_PAGE_NEXT, 4);
				} else {
					mlog_write_ulint(
						prev_page + FIL_PAGE_DATA
						+ BTR_BLOB_HDR_NEXT_PAGE_NO,
						page_no, MLOG_4BYTES, &mtr);
				}

			}

			if (UNIV_LIKELY_NULL(page_zip)) {
				int		err;
				page_zip_des_t*	blob_page_zip;

				/* Write FIL_PAGE_TYPE to the redo log
				separately, before logging any other
				changes to the page, so that the debug
				assertions in
				recv_parse_or_apply_log_rec_body() can
				be made simpler.  Before InnoDB Plugin
				1.0.4, the initialization of
				FIL_PAGE_TYPE was logged as part of
				the mlog_log_string() below. */

				mlog_write_ulint(page + FIL_PAGE_TYPE,
						 prev_page_no == FIL_NULL
						 ? FIL_PAGE_TYPE_ZBLOB
						 : FIL_PAGE_TYPE_ZBLOB2,
						 MLOG_2BYTES, &mtr);

				c_stream.next_out = page
					+ FIL_PAGE_DATA;
				c_stream.avail_out
					= page_zip_get_size(page_zip)
					- FIL_PAGE_DATA;

				err = deflate(&c_stream, Z_FINISH);
				ut_a(err == Z_OK || err == Z_STREAM_END);
				ut_a(err == Z_STREAM_END
				     || c_stream.avail_out == 0);

				/* Write the "next BLOB page" pointer */
				mlog_write_ulint(page + FIL_PAGE_NEXT,
						 FIL_NULL, MLOG_4BYTES, &mtr);
				/* Initialize the unused "prev page" pointer */
				mlog_write_ulint(page + FIL_PAGE_PREV,
						 FIL_NULL, MLOG_4BYTES, &mtr);
				/* Write a back pointer to the record
				into the otherwise unused area.  This
				information could be useful in
				debugging.  Later, we might want to
				implement the possibility to relocate
				BLOB pages.  Then, we would need to be
				able to adjust the BLOB pointer in the
				record.  We do not store the heap
				number of the record, because it can
				change in page_zip_reorganize() or
				btr_page_reorganize().  However, also
				the page number of the record may
				change when B-tree nodes are split or
				merged. */
				mlog_write_ulint(page
						 + FIL_PAGE_FILE_FLUSH_LSN,
						 space_id,
						 MLOG_4BYTES, &mtr);
				mlog_write_ulint(page
						 + FIL_PAGE_FILE_FLUSH_LSN + 4,
						 rec_page_no,
						 MLOG_4BYTES, &mtr);

				/* Zero out the unused part of the page. */
				memset(page + page_zip_get_size(page_zip)
				       - c_stream.avail_out,
				       0, c_stream.avail_out);
				mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN,
						page_zip_get_size(page_zip)
						- FIL_PAGE_FILE_FLUSH_LSN,
						&mtr);
				/* Copy the page to compressed storage,
				because it will be flushed to disk
				from there. */
				blob_page_zip = buf_block_get_page_zip(block);
				ut_ad(blob_page_zip);
				ut_ad(page_zip_get_size(blob_page_zip)
				      == page_zip_get_size(page_zip));
				memcpy(blob_page_zip->data, page,
				       page_zip_get_size(page_zip));

				if (err == Z_OK && prev_page_no != FIL_NULL) {

					goto next_zip_page;
				}

				rec_block = buf_page_get(space_id, zip_size,
							 rec_page_no,
							 RW_X_LATCH, &mtr);
				buf_block_dbg_add_level(rec_block,
							SYNC_NO_ORDER_CHECK);

				if (err == Z_STREAM_END) {
					mach_write_to_4(field_ref
							+ BTR_EXTERN_LEN, 0);
					mach_write_to_4(field_ref
							+ BTR_EXTERN_LEN + 4,
							c_stream.total_in);
				} else {
					memset(field_ref + BTR_EXTERN_LEN,
					       0, 8);
				}

				if (prev_page_no == FIL_NULL) {
					btr_blob_dbg_add_blob(
						rec, big_rec_vec->fields[i]
						.field_no, page_no, index,
						"store");

					mach_write_to_4(field_ref
							+ BTR_EXTERN_SPACE_ID,
							space_id);

					mach_write_to_4(field_ref
							+ BTR_EXTERN_PAGE_NO,
							page_no);

					mach_write_to_4(field_ref
							+ BTR_EXTERN_OFFSET,
							FIL_PAGE_NEXT);
				}

				page_zip_write_blob_ptr(
					page_zip, rec, index, offsets,
					big_rec_vec->fields[i].field_no, &mtr);

next_zip_page:
				prev_page_no = page_no;

				/* Commit mtr and release the
				uncompressed page frame to save memory. */
				btr_blob_free(block, FALSE, &mtr);

				if (err == Z_STREAM_END) {
					break;
				}
			} else {
				mlog_write_ulint(page + FIL_PAGE_TYPE,
						 FIL_PAGE_TYPE_BLOB,
						 MLOG_2BYTES, &mtr);

				if (extern_len > (UNIV_PAGE_SIZE
						  - FIL_PAGE_DATA
						  - BTR_BLOB_HDR_SIZE
						  - FIL_PAGE_DATA_END)) {
					store_len = UNIV_PAGE_SIZE
						- FIL_PAGE_DATA
						- BTR_BLOB_HDR_SIZE
						- FIL_PAGE_DATA_END;
				} else {
					store_len = extern_len;
				}

				mlog_write_string(page + FIL_PAGE_DATA
						  + BTR_BLOB_HDR_SIZE,
						  (const byte*)
						  big_rec_vec->fields[i].data
						  + big_rec_vec->fields[i].len
						  - extern_len,
						  store_len, &mtr);
				mlog_write_ulint(page + FIL_PAGE_DATA
						 + BTR_BLOB_HDR_PART_LEN,
						 store_len, MLOG_4BYTES, &mtr);
				mlog_write_ulint(page + FIL_PAGE_DATA
						 + BTR_BLOB_HDR_NEXT_PAGE_NO,
						 FIL_NULL, MLOG_4BYTES, &mtr);

				extern_len -= store_len;

				rec_block = buf_page_get(space_id, zip_size,
							 rec_page_no,
							 RW_X_LATCH, &mtr);
				buf_block_dbg_add_level(rec_block,
							SYNC_NO_ORDER_CHECK);

				mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
						 MLOG_4BYTES, &mtr);
				mlog_write_ulint(field_ref
						 + BTR_EXTERN_LEN + 4,
						 big_rec_vec->fields[i].len
						 - extern_len,
						 MLOG_4BYTES, &mtr);

				if (prev_page_no == FIL_NULL) {
					btr_blob_dbg_add_blob(
						rec, big_rec_vec->fields[i]
						.field_no, page_no, index,
						"store");

					mlog_write_ulint(field_ref
							 + BTR_EXTERN_SPACE_ID,
							 space_id,
							 MLOG_4BYTES, &mtr);

					mlog_write_ulint(field_ref
							 + BTR_EXTERN_PAGE_NO,
							 page_no,
							 MLOG_4BYTES, &mtr);

					mlog_write_ulint(field_ref
							 + BTR_EXTERN_OFFSET,
							 FIL_PAGE_DATA,
							 MLOG_4BYTES, &mtr);
				}

				prev_page_no = page_no;

				mtr_commit(&mtr);

				if (extern_len == 0) {
					break;
				}
			}
		}
	}

	if (UNIV_LIKELY_NULL(page_zip)) {
		deflateEnd(&c_stream);
		mem_heap_free(heap);
	}

#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
	/* All pointers to externally stored columns in the record
	must be valid. */
	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (!rec_offs_nth_extern(offsets, i)) {
			continue;
		}

		field_ref = btr_rec_get_field_ref(rec, offsets, i);

		/* The pointer must not be zero. */
		ut_a(0 != memcmp(field_ref, field_ref_zero,
				 BTR_EXTERN_FIELD_REF_SIZE));
		/* The column must not be disowned by this record. */
		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
	}
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
	return(DB_SUCCESS);
}
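
/* The result of the function above is, per stored field, a singly
linked chain of BLOB pages starting at the page recorded in the field
reference: uncompressed pages are linked through
BTR_BLOB_HDR_NEXT_PAGE_NO in their headers, compressed pages through
FIL_PAGE_NEXT, and the last page of a chain carries FIL_NULL as its
next page number. */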

/*******************************************************************//**
Check the FIL_PAGE_TYPE on an uncompressed BLOB page. */
static
void
btr_check_blob_fil_page_type(
/*=========================*/
	ulint		space_id,	/*!< in: space id */
	ulint		page_no,	/*!< in: page number */
	const page_t*	page,		/*!< in: page */
	ibool		read)		/*!< in: TRUE=read, FALSE=purge */
{
	ulint	type = fil_page_get_type(page);

	ut_a(space_id == page_get_space_id(page));
	ut_a(page_no == page_get_page_no(page));

	if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
		ulint	flags = fil_space_get_flags(space_id);

#ifndef UNIV_DEBUG /* Improve debug test coverage */
		if (UNIV_LIKELY
		    ((flags & DICT_TF_FORMAT_MASK) == DICT_TF_FORMAT_51)) {
			/* Old versions of InnoDB did not initialize
			FIL_PAGE_TYPE on BLOB pages.  Do not print
			anything about the type mismatch when reading
			a BLOB page that is in Antelope format. */
			return;
		}
#endif /* !UNIV_DEBUG */

		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: FIL_PAGE_TYPE=%lu"
			" on BLOB %s space %lu page %lu flags %lx\n",
			(ulong) type, read ? "read" : "purge",
			(ulong) space_id, (ulong) page_no, (ulong) flags);
		ut_error;
	}
}

/*******************************************************************//**
Frees the space in an externally stored field to the file space
management if the field in data is owned by the externally stored field.
In a rollback, we may have the additional condition that the field must
not be inherited. */
UNIV_INTERN
void
btr_free_externally_stored_field(
/*=============================*/
	dict_index_t*	index,		/*!< in: index of the data, the index
					tree MUST be X-latched; if the tree
					height is 1, then also the root page
					must be X-latched! (this is relevant
					in the case this function is called
					from purge where 'data' is located on
					an undo log page, not an index
					page) */
	byte*		field_ref,	/*!< in/out: field reference */
	const rec_t*	rec,		/*!< in: record containing field_ref, for
					page_zip_write_blob_ptr(), or NULL */
	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
					or NULL */
	page_zip_des_t*	page_zip,	/*!< in: compressed page corresponding
					to rec, or NULL if rec == NULL */
	ulint		i,		/*!< in: field number of field_ref;
					ignored if rec == NULL */
	enum trx_rb_ctx	rb_ctx,		/*!< in: rollback context */
	mtr_t*		local_mtr __attribute__((unused))) /*!< in: mtr
					containing the latch to data and an
					X-latch to the index tree */
{
	page_t*		page;
	ulint		space_id;
	ulint		rec_zip_size = dict_table_zip_size(index->table);
	ulint		ext_zip_size;
	ulint		page_no;
	ulint		next_page_no;
	mtr_t		mtr;

	ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
				     MTR_MEMO_PAGE_X_FIX));
	ut_ad(!rec || rec_offs_validate(rec, index, offsets));
	ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));

	if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
				  BTR_EXTERN_FIELD_REF_SIZE))) {
		/* In the rollback of uncommitted transactions, we may
		encounter a clustered index record whose BLOBs have
		not been written.  There is nothing to free then. */
		ut_a(rb_ctx == RB_RECOVERY || rb_ctx == RB_RECOVERY_PURGE_REC);
		return;
	}

	space_id = mach_read_from_4(field_ref + BTR_EXTERN_SPACE_ID);

	if (UNIV_UNLIKELY(space_id != dict_index_get_space(index))) {
		ext_zip_size = fil_space_get_zip_size(space_id);
		/* This must be an undo log record in the system tablespace,
		that is, in row_purge_upd_exist_or_extern().
		Currently, externally stored records are stored in the
		same tablespace as the referring records. */
		ut_ad(!page_get_space_id(page_align(field_ref)));
		ut_ad(!rec);
		ut_ad(!page_zip);
	} else {
		ext_zip_size = rec_zip_size;
	}

	if (!rec) {
		/* This is a call from row_purge_upd_exist_or_extern(). */
		ut_ad(!page_zip);
		rec_zip_size = 0;
	}

#ifdef UNIV_BLOB_DEBUG
	if (!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)
	    && !((field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
		 && (rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY))) {
		/* This off-page column will be freed.
		Check that no references remain. */

		btr_blob_dbg_t	b;

		b.blob_page_no = mach_read_from_4(
			field_ref + BTR_EXTERN_PAGE_NO);

		if (rec) {
			/* Remove the reference from the record to the
			BLOB. If the BLOB were not freed, the
			reference would be removed when the record is
			removed. Freeing the BLOB will overwrite the
			BTR_EXTERN_PAGE_NO in the field_ref of the
			record with FIL_NULL, which would make the
			btr_blob_dbg information inconsistent with the
			record. */
			b.ref_page_no = page_get_page_no(page_align(rec));
			b.ref_heap_no = page_rec_get_heap_no(rec);
			b.ref_field_no = i;
			btr_blob_dbg_rbt_delete(index, &b, "free");
		}

		btr_blob_dbg_assert_empty(index, b.blob_page_no);
	}
#endif /* UNIV_BLOB_DEBUG */

	for (;;) {
#ifdef UNIV_SYNC_DEBUG
		buf_block_t*	rec_block;
#endif /* UNIV_SYNC_DEBUG */
		buf_block_t*	ext_block;

		mtr_start(&mtr);

#ifdef UNIV_SYNC_DEBUG
		rec_block =
#endif /* UNIV_SYNC_DEBUG */
			buf_page_get(page_get_space_id(
					     page_align(field_ref)),
				     rec_zip_size,
				     page_get_page_no(
					     page_align(field_ref)),
				     RW_X_LATCH, &mtr);
		buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
		page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);

		if (/* There is no external storage data */
		    page_no == FIL_NULL
		    /* This field does not own the externally stored field */
		    || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
			& BTR_EXTERN_OWNER_FLAG)
		    /* Rollback and inherited field */
		    || ((rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY)
			&& (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
			    & BTR_EXTERN_INHERITED_FLAG))) {

			/* Do not free */
			mtr_commit(&mtr);

			return;
		}

		ext_block = buf_page_get(space_id, ext_zip_size, page_no,
					 RW_X_LATCH, &mtr);
		buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
		page = buf_block_get_frame(ext_block);

		if (ext_zip_size) {
			/* Note that page_zip will be NULL
			in row_purge_upd_exist_or_extern(). */
			switch (fil_page_get_type(page)) {
			case FIL_PAGE_TYPE_ZBLOB:
			case FIL_PAGE_TYPE_ZBLOB2:
				break;
			default:
				ut_error;
			}
			next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);

			btr_page_free_low(index, ext_block, 0, &mtr);

			if (UNIV_LIKELY(page_zip != NULL)) {
				mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
						next_page_no);
				mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
						0);
				page_zip_write_blob_ptr(page_zip, rec, index,
							offsets, i, &mtr);
			} else {
				mlog_write_ulint(field_ref
						 + BTR_EXTERN_PAGE_NO,
						 next_page_no,
						 MLOG_4BYTES, &mtr);
				mlog_write_ulint(field_ref
						 + BTR_EXTERN_LEN + 4, 0,
						 MLOG_4BYTES, &mtr);
			}
		} else {
			ut_a(!page_zip);
			btr_check_blob_fil_page_type(space_id, page_no, page,
						     FALSE);

			next_page_no = mach_read_from_4(
				page + FIL_PAGE_DATA
				+ BTR_BLOB_HDR_NEXT_PAGE_NO);

			/* We must supply the page level (= 0) as an argument
			because we did not store it on the page (we save the
			space overhead of an index page header). */

			btr_page_free_low(index, ext_block, 0, &mtr);

			mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
					 next_page_no,
					 MLOG_4BYTES, &mtr);
			/* Zero out the BLOB length.  If the server
			crashes during the execution of this function,
			trx_rollback_or_clean_all_recovered() could
			dereference the half-deleted BLOB, fetching a
			wrong prefix for the BLOB. */
			mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
					 0,
					 MLOG_4BYTES, &mtr);
		}

		/* Commit mtr and release the BLOB block to save memory. */
		btr_blob_free(ext_block, TRUE, &mtr);
	}
}

/***********************************************************//**
Frees the externally stored fields for a record. */
static
void
btr_rec_free_externally_stored_fields(
/*==================================*/
	dict_index_t*	index,	/*!< in: index of the data, the index
				tree MUST be X-latched */
	rec_t*		rec,	/*!< in/out: record */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the index
				tree */
{
	ulint	n_fields;
	ulint	i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
	/* Free possible externally stored fields in the record */

	ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
	n_fields = rec_offs_n_fields(offsets);

	for (i = 0; i < n_fields; i++) {
		if (rec_offs_nth_extern(offsets, i)) {
			btr_free_externally_stored_field(
				index, btr_rec_get_field_ref(rec, offsets, i),
				rec, offsets, page_zip, i, rb_ctx, mtr);
		}
	}
}

/***********************************************************//**
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
static
void
btr_rec_free_updated_extern_fields(
/*===============================*/
	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
				X-latched */
	rec_t*		rec,	/*!< in/out: record */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,	/*!< in: update vector */
	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the tree */
{
	ulint	n_fields;
	ulint	i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));

	/* Free possible externally stored fields in the record */

	n_fields = upd_get_n_fields(update);

	for (i = 0; i < n_fields; i++) {
		const upd_field_t* ufield = upd_get_nth_field(update, i);

		if (rec_offs_nth_extern(offsets, ufield->field_no)) {
			ulint	len;
			byte*	data = rec_get_nth_field(
				rec, offsets, ufield->field_no, &len);
			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);

			btr_free_externally_stored_field(
				index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
				rec, offsets, page_zip,
				ufield->field_no, rb_ctx, mtr);
		}
	}
}

/*******************************************************************//**
Copies the prefix of an uncompressed BLOB.  The clustered index record
that points to this BLOB must be protected by a lock or a page latch.
@return	number of bytes written to buf */
static
ulint
btr_copy_blob_prefix(
/*=================*/
	byte*		buf,	/*!< out: the externally stored part of
				the field, or a prefix of it */
	ulint		len,	/*!< in: length of buf, in bytes */
	ulint		space_id,/*!< in: space id of the BLOB pages */
	ulint		page_no,/*!< in: page number of the first BLOB page */
	ulint		offset)	/*!< in: offset on the first BLOB page */
{
	ulint	copied_len	= 0;

	for (;;) {
		mtr_t		mtr;
		buf_block_t*	block;
		const page_t*	page;
		const byte*	blob_header;
		ulint		part_len;
		ulint		copy_len;

		mtr_start(&mtr);

		block = buf_page_get(space_id, 0, page_no, RW_S_LATCH, &mtr);
		buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
		page = buf_block_get_frame(block);

		btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);

		blob_header = page + offset;
		part_len = btr_blob_get_part_len(blob_header);
		copy_len = ut_min(part_len, len - copied_len);

		memcpy(buf + copied_len,
		       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
		copied_len += copy_len;

		page_no = btr_blob_get_next_page_no(blob_header);

		mtr_commit(&mtr);

		if (page_no == FIL_NULL || copy_len != part_len) {
			UNIV_MEM_ASSERT_RW(buf, copied_len);
			return(copied_len);
		}

		/* On all BLOB pages except the first, the BLOB header
		is at the start of the page data: */

		offset = FIL_PAGE_DATA;

		ut_ad(copied_len <= len);
	}
}
4629
4630
/*******************************************************************//**
4631
Copies the prefix of a compressed BLOB.  The clustered index record
1.1.5 by Marc Deslauriers
Import upstream version 5.1.61
4632
that points to this BLOB must be protected by a lock or a page latch.
4633
@return	number of bytes written to buf */
0.2.4 by Norbert Tretkowski
Import upstream version 5.1.39
4634
static
1.1.5 by Marc Deslauriers
Import upstream version 5.1.61
4635
ulint
0.2.4 by Norbert Tretkowski
Import upstream version 5.1.39
4636
btr_copy_zblob_prefix(
4637
/*==================*/
1.1.5 by Marc Deslauriers
Import upstream version 5.1.61
4638
	byte*		buf,	/*!< out: the externally stored part of
4639
				the field, or a prefix of it */
4640
	ulint		len,	/*!< in: length of buf, in bytes */
	ulint		zip_size,/*!< in: compressed BLOB page size */
	ulint		space_id,/*!< in: space id of the BLOB pages */
	ulint		page_no,/*!< in: page number of the first BLOB page */
	ulint		offset)	/*!< in: offset on the first BLOB page */
{
	ulint		page_type = FIL_PAGE_TYPE_ZBLOB;
	mem_heap_t*	heap;
	int		err;
	z_stream	d_stream;

	d_stream.next_out = buf;
	d_stream.avail_out = len;
	d_stream.next_in = Z_NULL;
	d_stream.avail_in = 0;

	/* Zlib inflate needs 32 kilobytes for the default
	window size, plus a few kilobytes for small objects. */
	heap = mem_heap_create(40000);
	page_zip_set_alloc(&d_stream, heap);

	ut_ad(ut_is_2pow(zip_size));
	ut_ad(zip_size >= PAGE_ZIP_MIN_SIZE);
	ut_ad(zip_size <= UNIV_PAGE_SIZE);
	ut_ad(space_id);

	err = inflateInit(&d_stream);
	ut_a(err == Z_OK);

	for (;;) {
		buf_page_t*	bpage;
		ulint		next_page_no;

		/* There is no latch on bpage directly.  Instead,
		bpage is protected by the B-tree page latch that
		is being held on the clustered index record, or,
		in row_merge_copy_blobs(), by an exclusive table lock. */
		bpage = buf_page_get_zip(space_id, zip_size, page_no);

		if (UNIV_UNLIKELY(!bpage)) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: Cannot load"
				" compressed BLOB"
				" page %lu space %lu\n",
				(ulong) page_no, (ulong) space_id);
			goto func_exit;
		}

		if (UNIV_UNLIKELY
		    (fil_page_get_type(bpage->zip.data) != page_type)) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: Unexpected type %lu of"
				" compressed BLOB"
				" page %lu space %lu\n",
				(ulong) fil_page_get_type(bpage->zip.data),
				(ulong) page_no, (ulong) space_id);
			goto end_of_blob;
		}

		next_page_no = mach_read_from_4(bpage->zip.data + offset);

		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
			/* When the BLOB begins at page header,
			the compressed data payload does not
			immediately follow the next page pointer. */
			offset = FIL_PAGE_DATA;
		} else {
			offset += 4;
		}

		d_stream.next_in = bpage->zip.data + offset;
		d_stream.avail_in = zip_size - offset;

		err = inflate(&d_stream, Z_NO_FLUSH);
		switch (err) {
		case Z_OK:
			if (!d_stream.avail_out) {
				goto end_of_blob;
			}
			break;
		case Z_STREAM_END:
			if (next_page_no == FIL_NULL) {
				goto end_of_blob;
			}
			/* fall through */
		default:
inflate_error:
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: inflate() of"
				" compressed BLOB"
				" page %lu space %lu returned %d (%s)\n",
				(ulong) page_no, (ulong) space_id,
				err, d_stream.msg);
		case Z_BUF_ERROR:
			goto end_of_blob;
		}

		if (next_page_no == FIL_NULL) {
			if (!d_stream.avail_in) {
				ut_print_timestamp(stderr);
				fprintf(stderr,
					"  InnoDB: unexpected end of"
					" compressed BLOB"
					" page %lu space %lu\n",
					(ulong) page_no,
					(ulong) space_id);
			} else {
				err = inflate(&d_stream, Z_FINISH);
				switch (err) {
				case Z_STREAM_END:
				case Z_BUF_ERROR:
					break;
				default:
					goto inflate_error;
				}
			}

end_of_blob:
			buf_page_release_zip(bpage);
			goto func_exit;
		}

		buf_page_release_zip(bpage);

		/* On BLOB pages other than the first one,
		the BLOB header is always at the page header: */

		page_no = next_page_no;
		offset = FIL_PAGE_NEXT;
		page_type = FIL_PAGE_TYPE_ZBLOB2;
	}

func_exit:
	inflateEnd(&d_stream);
	mem_heap_free(heap);
	UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
	return(d_stream.total_out);
}
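
/* The loop above streams one compressed BLOB page at a time into a
single z_stream.  Below is a minimal, self-contained sketch of that
chunked-inflate pattern using plain zlib, detached from the buffer
pool and on-disk page format.  It is illustrative only: everything
named example_* is hypothetical, and the guard macro is never
defined in the build. */
#ifdef BTR_CUR_BLOB_EXAMPLES
static ulint
example_inflate_chunked(
/*====================*/
	byte*		dst,	/*!< out: decompressed bytes */
	ulint		dst_len,/*!< in: capacity of dst, in bytes */
	const byte**	chunks,	/*!< in: compressed input chunks */
	const ulint*	sizes,	/*!< in: length of each chunk, in bytes */
	ulint		n_chunks)/*!< in: number of chunks */
{
	z_stream	strm;
	ulint		i;

	/* Zeroing the stream sets zalloc/zfree/opaque to Z_NULL, so
	zlib falls back to its default allocator; the real code above
	plugs in a mem_heap_t via page_zip_set_alloc() instead. */
	memset(&strm, 0, sizeof(strm));

	if (inflateInit(&strm) != Z_OK) {
		return(0);
	}

	strm.next_out = dst;
	strm.avail_out = dst_len;

	for (i = 0; i < n_chunks; i++) {
		int	err;

		strm.next_in = (Bytef*) chunks[i];
		strm.avail_in = sizes[i];

		/* Z_NO_FLUSH: more input may follow, just as the
		next BLOB page may not have been fetched yet. */
		err = inflate(&strm, Z_NO_FLUSH);

		if (err == Z_STREAM_END || !strm.avail_out) {
			break;	/* complete stream, or dst is full */
		} else if (err != Z_OK && err != Z_BUF_ERROR) {
			break;	/* corrupt or truncated input */
		}
	}

	dst_len = strm.total_out;
	inflateEnd(&strm);
	return(dst_len);
}
#endif /* BTR_CUR_BLOB_EXAMPLES */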

/*******************************************************************//**
Copies the prefix of an externally stored field of a record.  The
clustered index record that points to this BLOB must be protected by a
lock or a page latch.
@return	number of bytes written to buf */
static
ulint
btr_copy_externally_stored_field_prefix_low(
/*========================================*/
	byte*		buf,	/*!< out: the externally stored part of
				the field, or a prefix of it */
	ulint		len,	/*!< in: length of buf, in bytes */
	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
				zero for uncompressed BLOBs */
	ulint		space_id,/*!< in: space id of the first BLOB page */
	ulint		page_no,/*!< in: page number of the first BLOB page */
	ulint		offset)	/*!< in: offset on the first BLOB page */
{
	if (UNIV_UNLIKELY(len == 0)) {
		return(0);
	}

	if (UNIV_UNLIKELY(zip_size)) {
		return(btr_copy_zblob_prefix(buf, len, zip_size,
					     space_id, page_no, offset));
	} else {
		return(btr_copy_blob_prefix(buf, len, space_id,
					    page_no, offset));
	}
}

/*******************************************************************//**
Copies the prefix of an externally stored field of a record.  The
clustered index record must be protected by a lock or a page latch.
@return the length of the copied field, or 0 if the column was being
or has been deleted */
UNIV_INTERN
ulint
btr_copy_externally_stored_field_prefix(
/*====================================*/
	byte*		buf,	/*!< out: the field, or a prefix of it */
	ulint		len,	/*!< in: length of buf, in bytes */
	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
				zero for uncompressed BLOBs */
	const byte*	data,	/*!< in: 'internally' stored part of the
				field containing also the reference to
				the external part; must be protected by
				a lock or a page latch */
	ulint		local_len)/*!< in: length of data, in bytes */
{
	ulint	space_id;
	ulint	page_no;
	ulint	offset;

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	if (UNIV_UNLIKELY(local_len >= len)) {
		memcpy(buf, data, len);
		return(len);
	}

	memcpy(buf, data, local_len);
	data += local_len;

	ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));

	if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
		/* The externally stored part of the column has been
		(partially) deleted.  Signal the half-deleted BLOB
		to the caller. */

		return(0);
	}

	space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);

	page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);

	offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);

	return(local_len
	       + btr_copy_externally_stored_field_prefix_low(buf + local_len,
							     len - local_len,
							     zip_size,
							     space_id, page_no,
							     offset));
}
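
/* The mach_read_from_4() calls above decode 4-byte big-endian
integers, which is the byte order InnoDB uses on disk for the space
id, page number and offset inside the external field reference.  A
minimal sketch of that decoding follows; the function name is
hypothetical and the guard macro is never defined in the build. */
#ifdef BTR_CUR_BLOB_EXAMPLES
static ulint
example_read_big_endian_4(
/*======================*/
	const byte*	b)	/*!< in: 4 bytes, most significant first */
{
	return(((ulint) b[0] << 24)
	       | ((ulint) b[1] << 16)
	       | ((ulint) b[2] << 8)
	       | ((ulint) b[3]));
}
#endif /* BTR_CUR_BLOB_EXAMPLES */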

/*******************************************************************//**
Copies an externally stored field of a record to mem heap.  The
clustered index record must be protected by a lock or a page latch.
@return	the whole field copied to heap */
static
byte*
btr_copy_externally_stored_field(
/*=============================*/
	ulint*		len,	/*!< out: length of the whole field */
	const byte*	data,	/*!< in: 'internally' stored part of the
				field containing also the reference to
				the external part; must be protected by
				a lock or a page latch */
	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
				zero for uncompressed BLOBs */
	ulint		local_len,/*!< in: length of data */
	mem_heap_t*	heap)	/*!< in: mem heap */
{
	ulint	space_id;
	ulint	page_no;
	ulint	offset;
	ulint	extern_len;
	byte*	buf;

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);

	page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);

	offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);

	/* Currently a BLOB cannot be bigger than 4 GB; we
	leave the 4 upper bytes in the length field unused */

	extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);

	buf = mem_heap_alloc(heap, local_len + extern_len);

	memcpy(buf, data, local_len);
	*len = local_len
		+ btr_copy_externally_stored_field_prefix_low(buf + local_len,
							      extern_len,
							      zip_size,
							      space_id,
							      page_no, offset);

	return(buf);
}
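
/* Both functions above read the same 20-byte external field
reference piecewise: in order, a 4-byte space id, a 4-byte page
number, a 4-byte page offset, and an 8-byte length of which only the
low 4 bytes are used, since a BLOB cannot exceed 4 GB (4+4+4+8 = 20).
The sketch below gathers those reads into one place.  The struct and
function names are hypothetical and the guard macro is never defined
in the build. */
#ifdef BTR_CUR_BLOB_EXAMPLES
typedef struct example_ext_ref_struct {
	ulint	space_id;	/*!< tablespace of the first BLOB page */
	ulint	page_no;	/*!< page number of the first BLOB page */
	ulint	offset;		/*!< byte offset on the first BLOB page */
	ulint	extern_len;	/*!< length of the external part, bytes */
} example_ext_ref_t;

static void
example_decode_ext_ref(
/*===================*/
	const byte*		ref,	/*!< in: 20-byte field reference */
	example_ext_ref_t*	out)	/*!< out: decoded reference */
{
	out->space_id = mach_read_from_4(ref + BTR_EXTERN_SPACE_ID);
	out->page_no = mach_read_from_4(ref + BTR_EXTERN_PAGE_NO);
	out->offset = mach_read_from_4(ref + BTR_EXTERN_OFFSET);
	/* Skip the unused upper 4 bytes of the length field. */
	out->extern_len = mach_read_from_4(ref + BTR_EXTERN_LEN + 4);
}
#endif /* BTR_CUR_BLOB_EXAMPLES */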

/*******************************************************************//**
Copies an externally stored field of a record to mem heap.
@return	the field copied to heap, or NULL if the field is incomplete */
UNIV_INTERN
byte*
btr_rec_copy_externally_stored_field(
/*=================================*/
	const rec_t*	rec,	/*!< in: record in a clustered index;
				must be protected by a lock or a page latch */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
				zero for uncompressed BLOBs */
	ulint		no,	/*!< in: field number */
	ulint*		len,	/*!< out: length of the field */
	mem_heap_t*	heap)	/*!< in: mem heap */
{
	ulint		local_len;
	const byte*	data;

	ut_a(rec_offs_nth_extern(offsets, no));

	/* An externally stored field keeps some initial data from
	the field locally, and in its last 20 bytes it stores the
	space id, page number, and offset where the rest of the
	field data is stored, together with the length of that
	externally stored part.  Some data may be kept locally just
	to push the local record length above the 128-byte limit,
	so that field offsets are stored in two bytes and the extern
	bit is available in those two bytes. */

	data = rec_get_nth_field(rec, offsets, no, &local_len);

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	if (UNIV_UNLIKELY
	    (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
		     field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
		/* The externally stored field was not written yet.
		This record should only be seen by
		recv_recovery_rollback_active() or any
		TRX_ISO_READ_UNCOMMITTED transactions. */
		return(NULL);
	}

	return(btr_copy_externally_stored_field(len, data,
						zip_size, local_len, heap));
}
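
/* A hedged sketch of how a caller might use
btr_rec_copy_externally_stored_field() to fetch a whole column.  The
record, its offsets array, and the required lock or page latch are
assumed to be set up by the caller already; everything named
example_* is hypothetical and the guard macro is never defined in
the build. */
#ifdef BTR_CUR_BLOB_EXAMPLES
static byte*
example_fetch_blob_column(
/*======================*/
	const rec_t*	rec,	/*!< in: latched clustered index record */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, ...) */
	ulint		zip_size,/*!< in: 0, or compressed BLOB page size */
	ulint		field_no,/*!< in: externally stored field number */
	ulint*		blob_len)/*!< out: length of the column, in bytes */
{
	mem_heap_t*	heap = mem_heap_create(1024);
	byte*		field;
	byte*		copy = NULL;

	/* Returns NULL if the BLOB is incomplete, i.e. still being
	written or being rolled back; propagate the NULL. */
	field = btr_rec_copy_externally_stored_field(
		rec, offsets, zip_size, field_no, blob_len, heap);

	if (field != NULL) {
		/* Copy the column out of the heap so that the heap
		can be freed; a real caller would more likely keep
		using the heap directly. */
		copy = ut_malloc(*blob_len);
		memcpy(copy, field, *blob_len);
	}

	mem_heap_free(heap);
	return(copy);	/* caller must ut_free() the copy */
}
#endif /* BTR_CUR_BLOB_EXAMPLES */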
#endif /* !UNIV_HOTBACKUP */