/*****************************************************************************

Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0flu.c
The database buffer buf_pool flush algorithm

Created 11/11/1995 Heikki Tuuri
*******************************************************/
35
#ifndef UNIV_HOTBACKUP
38
#include "page0page.h"
42
#include "ibuf0ibuf.h"
47
/**********************************************************************
These statistics are generated for heuristics used in estimating the
rate at which we should flush the dirty blocks to avoid bursty IO
activity. Note that the rate of flushing not only depends on how many
dirty pages we have in the buffer pool but it is also a function of
how much redo the workload is generating and at what rate. */
55
/** Number of intervals for which we keep the history of these stats.
56
Each interval is 1 second, defined by the rate at which
57
srv_error_monitor_thread() calls buf_flush_stat_update(). */
58
#define BUF_FLUSH_STAT_N_INTERVAL 20
60
/** Sampled values buf_flush_stat_cur.
61
Not protected by any mutex. Updated by buf_flush_stat_update(). */
62
static buf_flush_stat_t buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];
64
/** Cursor to buf_flush_stat_arr[]. Updated in a round-robin fashion. */
65
static ulint buf_flush_stat_arr_ind;
67
/** Values at start of the current interval. Reset by
68
buf_flush_stat_update(). */
69
static buf_flush_stat_t buf_flush_stat_cur;
71
/** Running sum of past values of buf_flush_stat_cur.
72
Updated by buf_flush_stat_update(). Not protected by any mutex. */
73
static buf_flush_stat_t buf_flush_stat_sum;
75
/** Number of pages flushed through non flush_list flushes. */
76
static ulint buf_lru_flush_page_count = 0;
80
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
@return	TRUE if ok */
static
ibool
buf_flush_validate_low(void);
/*========================*/
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
90
/********************************************************************//**
91
Inserts a modified block into the flush list. */
94
buf_flush_insert_into_flush_list(
95
/*=============================*/
96
buf_block_t* block) /*!< in/out: block which is modified */
98
ut_ad(buf_pool_mutex_own());
99
ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
100
|| (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
101
<= block->page.oldest_modification));
103
ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
104
ut_ad(block->page.in_LRU_list);
105
ut_ad(block->page.in_page_hash);
106
ut_ad(!block->page.in_zip_hash);
107
ut_ad(!block->page.in_flush_list);
108
ut_d(block->page.in_flush_list = TRUE);
109
UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
111
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
112
ut_a(buf_flush_validate_low());
113
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
116
/********************************************************************//**
117
Inserts a modified block into the flush list in the right sorted position.
118
This function is used by recovery, because there the modifications do not
119
necessarily come in the order of lsn's. */
122
buf_flush_insert_sorted_into_flush_list(
123
/*====================================*/
124
buf_block_t* block) /*!< in/out: block which is modified */
129
ut_ad(buf_pool_mutex_own());
130
ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
132
ut_ad(block->page.in_LRU_list);
133
ut_ad(block->page.in_page_hash);
134
ut_ad(!block->page.in_zip_hash);
135
ut_ad(!block->page.in_flush_list);
136
ut_d(block->page.in_flush_list = TRUE);
139
b = UT_LIST_GET_FIRST(buf_pool->flush_list);
141
while (b && b->oldest_modification > block->page.oldest_modification) {
142
ut_ad(b->in_flush_list);
144
b = UT_LIST_GET_NEXT(list, b);
147
if (prev_b == NULL) {
148
UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
150
UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
151
prev_b, &block->page);
154
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
155
ut_a(buf_flush_validate_low());
156
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
159
/********************************************************************//**
160
Returns TRUE if the file page block is immediately suitable for replacement,
161
i.e., the transition FILE_PAGE => NOT_USED allowed.
162
@return TRUE if can replace immediately */
165
buf_flush_ready_for_replace(
166
/*========================*/
167
buf_page_t* bpage) /*!< in: buffer control block, must be
168
buf_page_in_file(bpage) and in the LRU list */
170
ut_ad(buf_pool_mutex_own());
171
ut_ad(mutex_own(buf_page_get_mutex(bpage)));
172
ut_ad(bpage->in_LRU_list);
174
if (UNIV_LIKELY(buf_page_in_file(bpage))) {
176
return(bpage->oldest_modification == 0
177
&& buf_page_get_io_fix(bpage) == BUF_IO_NONE
178
&& bpage->buf_fix_count == 0);
181
ut_print_timestamp(stderr);
183
" InnoDB: Error: buffer block state %lu"
184
" in the LRU list!\n",
185
(ulong) buf_page_get_state(bpage));
186
ut_print_buf(stderr, bpage, sizeof(buf_page_t));
192
/********************************************************************//**
193
Returns TRUE if the block is modified and ready for flushing.
194
@return TRUE if can flush immediately */
197
buf_flush_ready_for_flush(
198
/*======================*/
199
buf_page_t* bpage, /*!< in: buffer control block, must be
200
buf_page_in_file(bpage) */
201
enum buf_flush flush_type)/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
203
ut_a(buf_page_in_file(bpage));
204
ut_ad(buf_pool_mutex_own());
205
ut_ad(mutex_own(buf_page_get_mutex(bpage)));
206
ut_ad(flush_type == BUF_FLUSH_LRU || BUF_FLUSH_LIST);
208
if (bpage->oldest_modification != 0
209
&& buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
210
ut_ad(bpage->in_flush_list);
212
if (flush_type != BUF_FLUSH_LRU) {
216
} else if (bpage->buf_fix_count == 0) {
218
/* If we are flushing the LRU list, to avoid deadlocks
219
we require the block not to be bufferfixed, and hence
229
/********************************************************************//**
230
Remove a block from the flush list of modified blocks. */
235
buf_page_t* bpage) /*!< in: pointer to the block in question */
237
ut_ad(buf_pool_mutex_own());
238
ut_ad(mutex_own(buf_page_get_mutex(bpage)));
239
ut_ad(bpage->in_flush_list);
240
ut_d(bpage->in_flush_list = FALSE);
242
switch (buf_page_get_state(bpage)) {
243
case BUF_BLOCK_ZIP_PAGE:
244
/* clean compressed pages should not be on the flush list */
245
case BUF_BLOCK_ZIP_FREE:
246
case BUF_BLOCK_NOT_USED:
247
case BUF_BLOCK_READY_FOR_USE:
248
case BUF_BLOCK_MEMORY:
249
case BUF_BLOCK_REMOVE_HASH:
252
case BUF_BLOCK_ZIP_DIRTY:
253
buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
254
UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
255
buf_LRU_insert_zip_clean(bpage);
257
case BUF_BLOCK_FILE_PAGE:
258
UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
262
bpage->oldest_modification = 0;
264
ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
265
ut_ad(ut_list_node_313->in_flush_list)));
268
/********************************************************************//**
269
Updates the flush system data structures when a write is completed. */
272
buf_flush_write_complete(
273
/*=====================*/
274
buf_page_t* bpage) /*!< in: pointer to the block in question */
276
enum buf_flush flush_type;
280
buf_flush_remove(bpage);
282
flush_type = buf_page_get_flush_type(bpage);
283
buf_pool->n_flush[flush_type]--;
285
if (flush_type == BUF_FLUSH_LRU) {
286
/* Put the block to the end of the LRU list to wait to be
287
moved to the free list */
289
buf_LRU_make_block_old(bpage);
291
buf_pool->LRU_flush_ended++;
294
/* fprintf(stderr, "n pending flush %lu\n",
295
buf_pool->n_flush[flush_type]); */
297
if ((buf_pool->n_flush[flush_type] == 0)
298
&& (buf_pool->init_flush[flush_type] == FALSE)) {
300
/* The running flush batch has ended */
302
os_event_set(buf_pool->no_flush[flush_type]);
306
/********************************************************************//**
307
Flush a batch of writes to the datafiles that have already been
308
written by the OS. */
311
buf_flush_sync_datafiles(void)
312
/*==========================*/
314
/* Wake possible simulated aio thread to actually post the
315
writes to the operating system */
316
os_aio_simulated_wake_handler_threads();
318
/* Wait that all async writes to tablespaces have been posted to
320
os_aio_wait_until_no_pending_writes();
322
/* Now we flush the data to disk (for example, with fsync) */
323
fil_flush_file_spaces(FIL_TABLESPACE);
328
/********************************************************************//**
329
Flushes possible buffered writes from the doublewrite memory buffer to disk,
330
and also wakes up the aio thread if simulated aio is used. It is very
331
important to call this function after a batch of writes has been posted,
332
and also when we may have to wait for a page latch! Otherwise a deadlock
333
of threads can occur. */
336
buf_flush_buffered_writes(void)
337
/*===========================*/
344
if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
345
/* Sync the writes to the disk. */
346
buf_flush_sync_datafiles();
350
mutex_enter(&(trx_doublewrite->mutex));
352
/* Write first to doublewrite buffer blocks. We use synchronous
353
aio and thus know that file write has been completed when the
356
if (trx_doublewrite->first_free == 0) {
358
mutex_exit(&(trx_doublewrite->mutex));
363
for (i = 0; i < trx_doublewrite->first_free; i++) {
365
const buf_block_t* block;
367
block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];
369
if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
370
|| block->page.zip.data) {
371
/* No simple validate for compressed pages exists. */
376
(memcmp(block->frame + (FIL_PAGE_LSN + 4),
377
block->frame + (UNIV_PAGE_SIZE
378
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
380
ut_print_timestamp(stderr);
382
" InnoDB: ERROR: The page to be written"
384
"InnoDB: The lsn fields do not match!"
385
" Noticed in the buffer pool\n"
386
"InnoDB: before posting to the"
387
" doublewrite buffer.\n");
390
if (!block->check_index_page_at_flush) {
391
} else if (page_is_comp(block->frame)) {
393
(!page_simple_validate_new(block->frame))) {
395
buf_page_print(block->frame, 0);
397
ut_print_timestamp(stderr);
399
" InnoDB: Apparent corruption of an"
400
" index page n:o %lu in space %lu\n"
401
"InnoDB: to be written to data file."
402
" We intentionally crash server\n"
403
"InnoDB: to prevent corrupt data"
404
" from ending up in data\n"
406
(ulong) buf_block_get_page_no(block),
407
(ulong) buf_block_get_space(block));
411
} else if (UNIV_UNLIKELY
412
(!page_simple_validate_old(block->frame))) {
418
/* increment the doublewrite flushed pages counter */
419
srv_dblwr_pages_written+= trx_doublewrite->first_free;
422
len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
423
trx_doublewrite->first_free) * UNIV_PAGE_SIZE;
425
write_buf = trx_doublewrite->write_buf;
428
fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
429
trx_doublewrite->block1, 0, len,
430
(void*) write_buf, NULL);
432
for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
433
len2 += UNIV_PAGE_SIZE, i++) {
434
const buf_block_t* block = (buf_block_t*)
435
trx_doublewrite->buf_block_arr[i];
437
if (UNIV_LIKELY(!block->page.zip.data)
438
&& UNIV_LIKELY(buf_block_get_state(block)
439
== BUF_BLOCK_FILE_PAGE)
441
(memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
444
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
445
ut_print_timestamp(stderr);
447
" InnoDB: ERROR: The page to be written"
449
"InnoDB: The lsn fields do not match!"
450
" Noticed in the doublewrite block1.\n");
454
if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
458
len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
461
write_buf = trx_doublewrite->write_buf
462
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
463
ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);
465
fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
466
trx_doublewrite->block2, 0, len,
467
(void*) write_buf, NULL);
469
for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
470
len2 += UNIV_PAGE_SIZE, i++) {
471
const buf_block_t* block = (buf_block_t*)
472
trx_doublewrite->buf_block_arr[i];
474
if (UNIV_LIKELY(!block->page.zip.data)
475
&& UNIV_LIKELY(buf_block_get_state(block)
476
== BUF_BLOCK_FILE_PAGE)
478
(memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
481
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
482
ut_print_timestamp(stderr);
484
" InnoDB: ERROR: The page to be"
485
" written seems corrupt!\n"
486
"InnoDB: The lsn fields do not match!"
488
" the doublewrite block2.\n");
493
/* Now flush the doublewrite buffer data to disk */
495
fil_flush(TRX_SYS_SPACE);
497
/* We know that the writes have been flushed to disk now
498
and in recovery we will find them in the doublewrite buffer
499
blocks. Next do the writes to the intended positions. */
501
for (i = 0; i < trx_doublewrite->first_free; i++) {
502
const buf_block_t* block = (buf_block_t*)
503
trx_doublewrite->buf_block_arr[i];
505
ut_a(buf_page_in_file(&block->page));
506
if (UNIV_LIKELY_NULL(block->page.zip.data)) {
507
fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
508
FALSE, buf_page_get_space(&block->page),
509
buf_page_get_zip_size(&block->page),
510
buf_page_get_page_no(&block->page), 0,
511
buf_page_get_zip_size(&block->page),
512
(void*)block->page.zip.data,
515
/* Increment the counter of I/O operations used
516
for selecting LRU policy. */
517
buf_LRU_stat_inc_io();
522
ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
524
if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
527
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
529
ut_print_timestamp(stderr);
531
" InnoDB: ERROR: The page to be written"
533
"InnoDB: The lsn fields do not match!"
534
" Noticed in the buffer pool\n"
535
"InnoDB: after posting and flushing"
536
" the doublewrite buffer.\n"
537
"InnoDB: Page buf fix count %lu,"
538
" io fix %lu, state %lu\n",
539
(ulong)block->page.buf_fix_count,
540
(ulong)buf_block_get_io_fix(block),
541
(ulong)buf_block_get_state(block));
544
fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
545
FALSE, buf_block_get_space(block), 0,
546
buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
547
(void*)block->frame, (void*)block);
549
/* Increment the counter of I/O operations used
550
for selecting LRU policy. */
551
buf_LRU_stat_inc_io();
554
/* Sync the writes to the disk. */
555
buf_flush_sync_datafiles();
557
/* We can now reuse the doublewrite memory buffer: */
558
trx_doublewrite->first_free = 0;
560
mutex_exit(&(trx_doublewrite->mutex));
563
/********************************************************************//**
564
Posts a buffer page for writing. If the doublewrite memory buffer is
565
full, calls buf_flush_buffered_writes and waits for for free space to
569
buf_flush_post_to_doublewrite_buf(
570
/*==============================*/
571
buf_page_t* bpage) /*!< in: buffer block to write */
575
mutex_enter(&(trx_doublewrite->mutex));
577
ut_a(buf_page_in_file(bpage));
579
if (trx_doublewrite->first_free
580
>= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
581
mutex_exit(&(trx_doublewrite->mutex));
583
buf_flush_buffered_writes();
588
zip_size = buf_page_get_zip_size(bpage);
590
if (UNIV_UNLIKELY(zip_size)) {
591
/* Copy the compressed page and clear the rest. */
592
memcpy(trx_doublewrite->write_buf
593
+ UNIV_PAGE_SIZE * trx_doublewrite->first_free,
594
bpage->zip.data, zip_size);
595
memset(trx_doublewrite->write_buf
596
+ UNIV_PAGE_SIZE * trx_doublewrite->first_free
597
+ zip_size, 0, UNIV_PAGE_SIZE - zip_size);
599
ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
601
memcpy(trx_doublewrite->write_buf
602
+ UNIV_PAGE_SIZE * trx_doublewrite->first_free,
603
((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
606
trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;
608
trx_doublewrite->first_free++;
610
if (trx_doublewrite->first_free
611
>= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
612
mutex_exit(&(trx_doublewrite->mutex));
614
buf_flush_buffered_writes();
619
mutex_exit(&(trx_doublewrite->mutex));
621
#endif /* !UNIV_HOTBACKUP */
623
/********************************************************************//**
624
Initializes a page for writing to the tablespace. */
627
buf_flush_init_for_writing(
628
/*=======================*/
629
byte* page, /*!< in/out: page */
630
void* page_zip_, /*!< in/out: compressed page, or NULL */
631
ib_uint64_t newest_lsn) /*!< in: newest modification lsn
637
page_zip_des_t* page_zip = page_zip_;
638
ulint zip_size = page_zip_get_size(page_zip);
640
ut_ad(ut_is_2pow(zip_size));
641
ut_ad(zip_size <= UNIV_PAGE_SIZE);
643
switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
644
case FIL_PAGE_TYPE_ALLOCATED:
646
case FIL_PAGE_IBUF_BITMAP:
647
case FIL_PAGE_TYPE_FSP_HDR:
648
case FIL_PAGE_TYPE_XDES:
649
/* These are essentially uncompressed pages. */
650
memcpy(page_zip->data, page, zip_size);
652
case FIL_PAGE_TYPE_ZBLOB:
653
case FIL_PAGE_TYPE_ZBLOB2:
655
mach_write_ull(page_zip->data
656
+ FIL_PAGE_LSN, newest_lsn);
657
memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
658
mach_write_to_4(page_zip->data
659
+ FIL_PAGE_SPACE_OR_CHKSUM,
661
? page_zip_calc_checksum(
662
page_zip->data, zip_size)
663
: BUF_NO_CHECKSUM_MAGIC);
667
ut_print_timestamp(stderr);
668
fputs(" InnoDB: ERROR: The compressed page to be written"
669
" seems corrupt:", stderr);
670
ut_print_buf(stderr, page, zip_size);
671
fputs("\nInnoDB: Possibly older version of the page:", stderr);
672
ut_print_buf(stderr, page_zip->data, zip_size);
677
/* Write the newest modification lsn to the page header and trailer */
678
mach_write_ull(page + FIL_PAGE_LSN, newest_lsn);
680
mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
683
/* Store the new formula checksum */
685
mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
687
? buf_calc_page_new_checksum(page)
688
: BUF_NO_CHECKSUM_MAGIC);
690
/* We overwrite the first 4 bytes of the end lsn field to store
691
the old formula checksum. Since it depends also on the field
692
FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
693
new formula checksum. */
695
mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
697
? buf_calc_page_old_checksum(page)
698
: BUF_NO_CHECKSUM_MAGIC);
701
#ifndef UNIV_HOTBACKUP
702
/********************************************************************//**
703
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
704
also when the doublewrite buffer is used, we must call
705
buf_flush_buffered_writes after we have posted a batch of writes! */
708
buf_flush_write_block_low(
709
/*======================*/
710
buf_page_t* bpage) /*!< in: buffer block to write */
712
ulint zip_size = buf_page_get_zip_size(bpage);
713
page_t* frame = NULL;
714
#ifdef UNIV_LOG_DEBUG
715
static ibool univ_log_debug_warned;
716
#endif /* UNIV_LOG_DEBUG */
718
ut_ad(buf_page_in_file(bpage));
720
/* We are not holding buf_pool_mutex or block_mutex here.
721
Nevertheless, it is safe to access bpage, because it is
722
io_fixed and oldest_modification != 0. Thus, it cannot be
723
relocated in the buffer pool or removed from flush_list or
725
ut_ad(!buf_pool_mutex_own());
726
ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
727
ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
728
ut_ad(bpage->oldest_modification != 0);
730
#ifdef UNIV_IBUF_COUNT_DEBUG
731
ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
733
ut_ad(bpage->newest_modification != 0);
735
#ifdef UNIV_LOG_DEBUG
736
if (!univ_log_debug_warned) {
737
univ_log_debug_warned = TRUE;
738
fputs("Warning: cannot force log to disk if"
739
" UNIV_LOG_DEBUG is defined!\n"
740
"Crash recovery will not work!\n",
744
/* Force the log to the disk before writing the modified block */
745
log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
747
switch (buf_page_get_state(bpage)) {
748
case BUF_BLOCK_ZIP_FREE:
749
case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
750
case BUF_BLOCK_NOT_USED:
751
case BUF_BLOCK_READY_FOR_USE:
752
case BUF_BLOCK_MEMORY:
753
case BUF_BLOCK_REMOVE_HASH:
756
case BUF_BLOCK_ZIP_DIRTY:
757
frame = bpage->zip.data;
758
if (UNIV_LIKELY(srv_use_checksums)) {
759
ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
760
== page_zip_calc_checksum(frame, zip_size));
762
mach_write_ull(frame + FIL_PAGE_LSN,
763
bpage->newest_modification);
764
memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
766
case BUF_BLOCK_FILE_PAGE:
767
frame = bpage->zip.data;
769
frame = ((buf_block_t*) bpage)->frame;
772
buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
774
? &bpage->zip : NULL,
775
bpage->newest_modification);
779
if (!srv_use_doublewrite_buf || !trx_doublewrite) {
780
fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
781
FALSE, buf_page_get_space(bpage), zip_size,
782
buf_page_get_page_no(bpage), 0,
783
zip_size ? zip_size : UNIV_PAGE_SIZE,
786
buf_flush_post_to_doublewrite_buf(bpage);
790
/********************************************************************//**
791
Writes a flushable page asynchronously from the buffer pool to a file.
792
NOTE: in simulated aio we must call
793
os_aio_simulated_wake_handler_threads after we have posted a batch of
794
writes! NOTE: buf_pool_mutex and buf_page_get_mutex(bpage) must be
795
held upon entering this function, and they will be released by this
801
buf_page_t* bpage, /*!< in: buffer control block */
802
enum buf_flush flush_type) /*!< in: BUF_FLUSH_LRU
805
mutex_t* block_mutex;
806
ibool is_uncompressed;
808
ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
809
ut_ad(buf_pool_mutex_own());
810
ut_ad(buf_page_in_file(bpage));
812
block_mutex = buf_page_get_mutex(bpage);
813
ut_ad(mutex_own(block_mutex));
815
ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
817
buf_page_set_io_fix(bpage, BUF_IO_WRITE);
819
buf_page_set_flush_type(bpage, flush_type);
821
if (buf_pool->n_flush[flush_type] == 0) {
823
os_event_reset(buf_pool->no_flush[flush_type]);
826
buf_pool->n_flush[flush_type]++;
828
is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
829
ut_ad(is_uncompressed == (block_mutex != &buf_pool_zip_mutex));
831
switch (flush_type) {
834
/* If the simulated aio thread is not running, we must
835
not wait for any latch, as we may end up in a deadlock:
836
if buf_fix_count == 0, then we know we need not wait */
838
is_s_latched = (bpage->buf_fix_count == 0);
839
if (is_s_latched && is_uncompressed) {
840
rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
844
mutex_exit(block_mutex);
845
buf_pool_mutex_exit();
847
/* Even though bpage is not protected by any mutex at
848
this point, it is safe to access bpage, because it is
849
io_fixed and oldest_modification != 0. Thus, it
850
cannot be relocated in the buffer pool or removed from
851
flush_list or LRU_list. */
854
buf_flush_buffered_writes();
856
if (is_uncompressed) {
857
rw_lock_s_lock_gen(&((buf_block_t*) bpage)
858
->lock, BUF_IO_WRITE);
866
Because any thread may call the LRU flush, even when owning
867
locks on pages, to avoid deadlocks, we must make sure that the
868
s-lock is acquired on the page without waiting: this is
869
accomplished because buf_flush_ready_for_flush() must hold,
870
and that requires the page not to be bufferfixed. */
872
if (is_uncompressed) {
873
rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
877
/* Note that the s-latch is acquired before releasing the
878
buf_pool mutex: this ensures that the latch is acquired
881
mutex_exit(block_mutex);
882
buf_pool_mutex_exit();
889
/* Even though bpage is not protected by any mutex at this
890
point, it is safe to access bpage, because it is io_fixed and
891
oldest_modification != 0. Thus, it cannot be relocated in the
892
buffer pool or removed from flush_list or LRU_list. */
895
if (buf_debug_prints) {
897
"Flushing %u space %u page %u\n",
898
flush_type, bpage->space, bpage->offset);
900
#endif /* UNIV_DEBUG */
901
buf_flush_write_block_low(bpage);
904
/***********************************************************//**
905
Flushes to disk all flushable pages within the flush area.
906
@return number of pages flushed */
909
buf_flush_try_neighbors(
910
/*====================*/
911
ulint space, /*!< in: space id */
912
ulint offset, /*!< in: page offset */
913
enum buf_flush flush_type) /*!< in: BUF_FLUSH_LRU or
921
ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
923
if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
924
/* If there is little space, it is better not to flush any
925
block except from the end of the LRU list */
930
/* When flushed, dirty blocks are searched in neighborhoods of
931
this size, and flushed along with the original page. */
933
ulint buf_flush_area = ut_min(BUF_READ_AHEAD_AREA,
934
buf_pool->curr_size / 16);
936
low = (offset / buf_flush_area) * buf_flush_area;
937
high = (offset / buf_flush_area + 1) * buf_flush_area;
940
/* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
942
if (high > fil_space_get_size(space)) {
943
high = fil_space_get_size(space);
946
buf_pool_mutex_enter();
948
for (i = low; i < high; i++) {
950
bpage = buf_page_hash_get(space, i);
957
ut_a(buf_page_in_file(bpage));
959
/* We avoid flushing 'non-old' blocks in an LRU flush,
960
because the flushed blocks are soon freed */
962
if (flush_type != BUF_FLUSH_LRU
964
|| buf_page_is_old(bpage)) {
965
mutex_t* block_mutex = buf_page_get_mutex(bpage);
967
mutex_enter(block_mutex);
969
if (buf_flush_ready_for_flush(bpage, flush_type)
970
&& (i == offset || !bpage->buf_fix_count)) {
971
/* We only try to flush those
972
neighbors != offset where the buf fix count is
973
zero, as we then know that we probably can
974
latch the page without a semaphore wait.
975
Semaphore waits are expensive because we must
976
flush the doublewrite buffer before we start
979
buf_flush_page(bpage, flush_type);
980
ut_ad(!mutex_own(block_mutex));
983
buf_pool_mutex_enter();
985
mutex_exit(block_mutex);
990
buf_pool_mutex_exit();
995
/*******************************************************************//**
996
This utility flushes dirty blocks from the end of the LRU list or flush_list.
997
NOTE 1: in the case of an LRU flush the calling thread may own latches to
998
pages: to avoid deadlocks, this function must be written so that it cannot
999
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
1000
the calling thread is not allowed to own any latches on pages!
1001
@return number of blocks for which the write request was queued;
1002
ULINT_UNDEFINED if there was a flush of the same type already running */
1007
enum buf_flush flush_type, /*!< in: BUF_FLUSH_LRU or
1008
BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
1009
then the caller must not own any
1011
ulint min_n, /*!< in: wished minimum mumber of blocks
1012
flushed (it is not guaranteed that the
1013
actual number is that big, though) */
1014
ib_uint64_t lsn_limit) /*!< in the case BUF_FLUSH_LIST all
1015
blocks whose oldest_modification is
1016
smaller than this should be flushed
1017
(if their number does not exceed
1018
min_n), otherwise ignored */
1021
ulint page_count = 0;
1022
ulint old_page_count;
1026
ut_ad((flush_type == BUF_FLUSH_LRU)
1027
|| (flush_type == BUF_FLUSH_LIST));
1028
#ifdef UNIV_SYNC_DEBUG
1029
ut_ad((flush_type != BUF_FLUSH_LIST)
1030
|| sync_thread_levels_empty_gen(TRUE));
1031
#endif /* UNIV_SYNC_DEBUG */
1032
buf_pool_mutex_enter();
1034
if ((buf_pool->n_flush[flush_type] > 0)
1035
|| (buf_pool->init_flush[flush_type] == TRUE)) {
1037
/* There is already a flush batch of the same type running */
1039
buf_pool_mutex_exit();
1041
return(ULINT_UNDEFINED);
1044
buf_pool->init_flush[flush_type] = TRUE;
1048
/* If we have flushed enough, leave the loop */
1049
if (page_count >= min_n) {
1054
/* Start from the end of the list looking for a suitable
1055
block to be flushed. */
1057
if (flush_type == BUF_FLUSH_LRU) {
1058
bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1060
ut_ad(flush_type == BUF_FLUSH_LIST);
1062
bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
1064
|| bpage->oldest_modification >= lsn_limit) {
1065
/* We have flushed enough */
1069
ut_ad(bpage->in_flush_list);
1072
/* Note that after finding a single flushable page, we try to
1073
flush also all its neighbors, and after that start from the
1074
END of the LRU list or flush list again: the list may change
1075
during the flushing and we cannot safely preserve within this
1076
function a pointer to a block in the list! */
1079
mutex_t*block_mutex = buf_page_get_mutex(bpage);
1082
ut_a(buf_page_in_file(bpage));
1084
mutex_enter(block_mutex);
1085
ready = buf_flush_ready_for_flush(bpage, flush_type);
1086
mutex_exit(block_mutex);
1089
space = buf_page_get_space(bpage);
1090
offset = buf_page_get_page_no(bpage);
1092
buf_pool_mutex_exit();
1094
old_page_count = page_count;
1096
/* Try to flush also all the neighbors */
1097
page_count += buf_flush_try_neighbors(
1098
space, offset, flush_type);
1100
"Flush type %lu, page no %lu, neighb %lu\n",
1102
page_count - old_page_count); */
1104
buf_pool_mutex_enter();
1107
} else if (flush_type == BUF_FLUSH_LRU) {
1108
bpage = UT_LIST_GET_PREV(LRU, bpage);
1110
ut_ad(flush_type == BUF_FLUSH_LIST);
1112
bpage = UT_LIST_GET_PREV(list, bpage);
1113
ut_ad(!bpage || bpage->in_flush_list);
1115
} while (bpage != NULL);
1117
/* If we could not find anything to flush, leave the loop */
1122
buf_pool->init_flush[flush_type] = FALSE;
1124
if (buf_pool->n_flush[flush_type] == 0) {
1126
/* The running flush batch has ended */
1128
os_event_set(buf_pool->no_flush[flush_type]);
1131
buf_pool_mutex_exit();
1133
buf_flush_buffered_writes();
1136
if (buf_debug_prints && page_count > 0) {
1137
ut_a(flush_type == BUF_FLUSH_LRU
1138
|| flush_type == BUF_FLUSH_LIST);
1139
fprintf(stderr, flush_type == BUF_FLUSH_LRU
1140
? "Flushed %lu pages in LRU flush\n"
1141
: "Flushed %lu pages in flush list flush\n",
1142
(ulong) page_count);
1144
#endif /* UNIV_DEBUG */
1146
srv_buf_pool_flushed += page_count;
1148
/* We keep track of all flushes happening as part of LRU
1149
flush. When estimating the desired rate at which flush_list
1150
should be flushed we factor in this value. */
1151
if (flush_type == BUF_FLUSH_LRU) {
1152
buf_lru_flush_page_count += page_count;
1158
/******************************************************************//**
1159
Waits until a flush batch of the given type ends */
1162
buf_flush_wait_batch_end(
1163
/*=====================*/
1164
enum buf_flush type) /*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
1166
ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST));
1168
os_event_wait(buf_pool->no_flush[type]);
1171
/******************************************************************//**
1172
Gives a recommendation of how many blocks should be flushed to establish
1173
a big enough margin of replaceable blocks near the end of the LRU list
1174
and in the free list.
1175
@return number of blocks which should be flushed from the end of the
1179
buf_flush_LRU_recommendation(void)
1180
/*==============================*/
1183
ulint n_replaceable;
1186
buf_pool_mutex_enter();
1188
n_replaceable = UT_LIST_GET_LEN(buf_pool->free);
1190
bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1192
while ((bpage != NULL)
1193
&& (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN
1194
+ BUF_FLUSH_EXTRA_MARGIN)
1195
&& (distance < BUF_LRU_FREE_SEARCH_LEN)) {
1197
mutex_t* block_mutex = buf_page_get_mutex(bpage);
1199
mutex_enter(block_mutex);
1201
if (buf_flush_ready_for_replace(bpage)) {
1205
mutex_exit(block_mutex);
1209
bpage = UT_LIST_GET_PREV(LRU, bpage);
1212
buf_pool_mutex_exit();
1214
if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) {
1219
return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN
1223
/*********************************************************************//**
1224
Flushes pages from the end of the LRU list if there is too small a margin
1225
of replaceable pages there or in the free list. VERY IMPORTANT: this function
1226
is called also by threads which have locks on pages. To avoid deadlocks, we
1227
flush only pages such that the s-lock required for flushing can be acquired
1228
immediately, without waiting. */
1231
buf_flush_free_margin(void)
1232
/*=======================*/
1237
n_to_flush = buf_flush_LRU_recommendation();
1239
if (n_to_flush > 0) {
1240
n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, 0);
1241
if (n_flushed == ULINT_UNDEFINED) {
1242
/* There was an LRU type flush batch already running;
1243
let us wait for it to end */
1245
buf_flush_wait_batch_end(BUF_FLUSH_LRU);
1250
/*********************************************************************
1251
Update the historical stats that we are collecting for flush rate
1252
heuristics at the end of each interval.
1253
Flush rate heuristic depends on (a) rate of redo log generation and
1254
(b) the rate at which LRU flush is happening. */
1257
buf_flush_stat_update(void)
1258
/*=======================*/
1260
buf_flush_stat_t* item;
1261
ib_uint64_t lsn_diff;
1265
lsn = log_get_lsn();
1266
if (buf_flush_stat_cur.redo == 0) {
1267
/* First time around. Just update the current LSN
1269
buf_flush_stat_cur.redo = lsn;
1273
item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];
1275
/* values for this interval */
1276
lsn_diff = lsn - buf_flush_stat_cur.redo;
1277
n_flushed = buf_lru_flush_page_count
1278
- buf_flush_stat_cur.n_flushed;
1280
/* add the current value and subtract the obsolete entry. */
1281
buf_flush_stat_sum.redo += lsn_diff - item->redo;
1282
buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;
1284
/* put current entry in the array. */
1285
item->redo = lsn_diff;
1286
item->n_flushed = n_flushed;
1288
/* update the index */
1289
buf_flush_stat_arr_ind++;
1290
buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;
1292
/* reset the current entry. */
1293
buf_flush_stat_cur.redo = lsn;
1294
buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;
1297
/*********************************************************************
1298
Determines the fraction of dirty pages that need to be flushed based
1299
on the speed at which we generate redo log. Note that if redo log
1300
is generated at a significant rate without corresponding increase
1301
in the number of dirty pages (for example, an in-memory workload)
1302
it can cause IO bursts of flushing. This function implements heuristics
1303
to avoid this burstiness.
1304
@return number of dirty pages to be flushed / second */
1307
buf_flush_get_desired_flush_rate(void)
1308
/*==================================*/
1311
ulint lru_flush_avg;
1315
ib_uint64_t lsn = log_get_lsn();
1316
ulint log_capacity = log_get_capacity();
1318
/* log_capacity should never be zero after the initialization
1319
of log subsystem. */
1320
ut_ad(log_capacity != 0);
1322
/* Get total number of dirty pages. It is OK to access
1323
flush_list without holding any mtex as we are using this
1324
only for heuristics. */
1325
n_dirty = UT_LIST_GET_LEN(buf_pool->flush_list);
1327
/* An overflow can happen if we generate more than 2^32 bytes
1328
of redo in this interval i.e.: 4G of redo in 1 second. We can
1329
safely consider this as infinity because if we ever come close
1330
to 4G we'll start a synchronous flush of dirty pages. */
1331
/* redo_avg below is average at which redo is generated in
1332
past BUF_FLUSH_STAT_N_INTERVAL + redo generated in the current
1334
redo_avg = (ulint) (buf_flush_stat_sum.redo
1335
/ BUF_FLUSH_STAT_N_INTERVAL
1336
+ (lsn - buf_flush_stat_cur.redo));
1338
/* An overflow can happen possibly if we flush more than 2^32
1339
pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very very
1340
unlikely scenario. Even when this happens it means that our
1341
flush rate will be off the mark. It won't affect correctness
1342
of any subsystem. */
1343
/* lru_flush_avg below is rate at which pages are flushed as
1344
part of LRU flush in past BUF_FLUSH_STAT_N_INTERVAL + the
1345
number of pages flushed in the current interval. */
1346
lru_flush_avg = buf_flush_stat_sum.n_flushed
1347
/ BUF_FLUSH_STAT_N_INTERVAL
1348
+ (buf_lru_flush_page_count
1349
- buf_flush_stat_cur.n_flushed);
1351
n_flush_req = (n_dirty * redo_avg) / log_capacity;
1353
/* The number of pages that we want to flush from the flush
1354
list is the difference between the required rate and the
1355
number of pages that we are historically flushing from the
1357
rate = n_flush_req - lru_flush_avg;
1358
return(rate > 0 ? (ulint) rate : 0);
1361
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list: checks list-node consistency and that pages
appear in descending order of oldest_modification.
Caller must hold the buffer pool mutex.
@return	TRUE if ok */
static
ibool
buf_flush_validate_low(void)
/*========================*/
{
	buf_page_t*	bpage;

	UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
			 ut_ad(ut_list_node_313->in_flush_list));

	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);

	while (bpage != NULL) {
		const ib_uint64_t om = bpage->oldest_modification;
		ut_ad(bpage->in_flush_list);
		ut_a(buf_page_in_file(bpage));
		ut_a(om > 0);

		bpage = UT_LIST_GET_NEXT(list, bpage);

		/* The flush list is ordered by decreasing
		oldest_modification. */
		ut_a(!bpage || om >= bpage->oldest_modification);
	}

	return(TRUE);
}
/******************************************************************//**
1392
Validates the flush list.
1393
@return TRUE if ok */
1396
buf_flush_validate(void)
1397
/*====================*/
1401
buf_pool_mutex_enter();
1403
ret = buf_flush_validate_low();
1405
buf_pool_mutex_exit();
1409
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
1410
#endif /* !UNIV_HOTBACKUP */