~ubuntu-branches/ubuntu/precise/mysql-5.1/precise


Viewing changes to storage/innodb_plugin/buf/buf0flu.c

  • Committer: Bazaar Package Importer
  • Author(s): Norbert Tretkowski
  • Date: 2010-03-17 14:56:02 UTC
  • Revision ID: james.westby@ubuntu.com-20100317145602-x7e30l1b2sb5s6w6
Tags: upstream-5.1.45
Import upstream version 5.1.45

 
/*****************************************************************************

Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0flu.c
The database buffer buf_pool flush algorithm

Created 11/11/1995 Heikki Tuuri
*******************************************************/

#include "buf0flu.h"

#ifdef UNIV_NONINL
#include "buf0flu.ic"
#endif

#include "buf0buf.h"
#include "srv0srv.h"
#include "page0zip.h"
#ifndef UNIV_HOTBACKUP
#include "ut0byte.h"
#include "ut0lst.h"
#include "page0page.h"
#include "fil0fil.h"
#include "buf0lru.h"
#include "buf0rea.h"
#include "ibuf0ibuf.h"
#include "log0log.h"
#include "os0file.h"
#include "trx0sys.h"

/**********************************************************************
These statistics are generated for heuristics used in estimating the
rate at which we should flush the dirty blocks to avoid bursty IO
activity. Note that the rate of flushing not only depends on how many
dirty pages we have in the buffer pool but it is also a function of
how much redo the workload is generating and at what rate. */
/* @{ */

/** Number of intervals for which we keep the history of these stats.
Each interval is 1 second, defined by the rate at which
srv_error_monitor_thread() calls buf_flush_stat_update(). */
#define BUF_FLUSH_STAT_N_INTERVAL 20

/** Sampled values of buf_flush_stat_cur.
Not protected by any mutex.  Updated by buf_flush_stat_update(). */
static buf_flush_stat_t buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];

/** Cursor to buf_flush_stat_arr[]. Updated in a round-robin fashion. */
static ulint            buf_flush_stat_arr_ind;

/** Values at start of the current interval. Reset by
buf_flush_stat_update(). */
static buf_flush_stat_t buf_flush_stat_cur;

/** Running sum of past values of buf_flush_stat_cur.
Updated by buf_flush_stat_update(). Not protected by any mutex. */
static buf_flush_stat_t buf_flush_stat_sum;

/** Number of pages flushed through non flush_list flushes. */
static ulint buf_lru_flush_page_count = 0;

/* @} */
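
/* A worked sketch of the bookkeeping above (illustrative, with made-up
numbers): with BUF_FLUSH_STAT_N_INTERVAL == 20 one-second samples,
buf_flush_stat_sum is maintained as a moving-window sum over the 20
entries of buf_flush_stat_arr[].  If the sampled redo values are
r0, ..., r19, then

        buf_flush_stat_sum.redo == r0 + r1 + ... + r19

and buf_flush_stat_sum.redo / BUF_FLUSH_STAT_N_INTERVAL is the average
redo generated per second.  buf_flush_stat_update() keeps the sum
current by adding the newest sample and subtracting the one it
overwrites, instead of re-summing the whole array. */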
 
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
static
ibool
buf_flush_validate_low(void);
/*========================*/
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

/********************************************************************//**
Inserts a modified block into the flush list. */
UNIV_INTERN
void
buf_flush_insert_into_flush_list(
/*=============================*/
        buf_block_t*    block)  /*!< in/out: block which is modified */
{
        ut_ad(buf_pool_mutex_own());
        ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
              || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
                  <= block->page.oldest_modification));

        ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
        ut_ad(block->page.in_LRU_list);
        ut_ad(block->page.in_page_hash);
        ut_ad(!block->page.in_zip_hash);
        ut_ad(!block->page.in_flush_list);
        ut_d(block->page.in_flush_list = TRUE);
        UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
        ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}

/********************************************************************//**
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
necessarily come in the order of lsn's. */
UNIV_INTERN
void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
        buf_block_t*    block)  /*!< in/out: block which is modified */
{
        buf_page_t*     prev_b;
        buf_page_t*     b;

        ut_ad(buf_pool_mutex_own());
        ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

        ut_ad(block->page.in_LRU_list);
        ut_ad(block->page.in_page_hash);
        ut_ad(!block->page.in_zip_hash);
        ut_ad(!block->page.in_flush_list);
        ut_d(block->page.in_flush_list = TRUE);

        prev_b = NULL;
        b = UT_LIST_GET_FIRST(buf_pool->flush_list);

        while (b && b->oldest_modification > block->page.oldest_modification) {
                ut_ad(b->in_flush_list);
                prev_b = b;
                b = UT_LIST_GET_NEXT(list, b);
        }

        if (prev_b == NULL) {
                UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
        } else {
                UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
                                     prev_b, &block->page);
        }

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
        ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}
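
/* An illustration of why the sorted insert is needed (hypothetical
lsn values): the flush list is ordered by decreasing
oldest_modification.  During recovery a block first modified at lsn 70
may be inserted when a block with oldest_modification == 90 is already
listed; the scan above stops at the first entry whose
oldest_modification is <= 70 and places the new block just before it,
so the list stays in the order 90, 70, ... */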
 
/********************************************************************//**
Returns TRUE if the file page block is immediately suitable for replacement,
i.e., the transition FILE_PAGE => NOT_USED is allowed.
@return TRUE if can replace immediately */
UNIV_INTERN
ibool
buf_flush_ready_for_replace(
/*========================*/
        buf_page_t*     bpage)  /*!< in: buffer control block, must be
                                buf_page_in_file(bpage) and in the LRU list */
{
        ut_ad(buf_pool_mutex_own());
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(bpage->in_LRU_list);

        if (UNIV_LIKELY(buf_page_in_file(bpage))) {

                return(bpage->oldest_modification == 0
                       && buf_page_get_io_fix(bpage) == BUF_IO_NONE
                       && bpage->buf_fix_count == 0);
        }

        ut_print_timestamp(stderr);
        fprintf(stderr,
                "  InnoDB: Error: buffer block state %lu"
                " in the LRU list!\n",
                (ulong) buf_page_get_state(bpage));
        ut_print_buf(stderr, bpage, sizeof(buf_page_t));
        putc('\n', stderr);

        return(FALSE);
}

/********************************************************************//**
Returns TRUE if the block is modified and ready for flushing.
@return TRUE if can flush immediately */
UNIV_INLINE
ibool
buf_flush_ready_for_flush(
/*======================*/
        buf_page_t*     bpage,  /*!< in: buffer control block, must be
                                buf_page_in_file(bpage) */
        enum buf_flush  flush_type)/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
        ut_a(buf_page_in_file(bpage));
        ut_ad(buf_pool_mutex_own());
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

        if (bpage->oldest_modification != 0
            && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
                ut_ad(bpage->in_flush_list);

                if (flush_type != BUF_FLUSH_LRU) {

                        return(TRUE);

                } else if (bpage->buf_fix_count == 0) {

                        /* If we are flushing the LRU list, to avoid deadlocks
                        we require the block not to be bufferfixed, and hence
                        not latched. */

                        return(TRUE);
                }
        }

        return(FALSE);
}
 
/********************************************************************//**
Remove a block from the flush list of modified blocks. */
UNIV_INTERN
void
buf_flush_remove(
/*=============*/
        buf_page_t*     bpage)  /*!< in: pointer to the block in question */
{
        ut_ad(buf_pool_mutex_own());
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(bpage->in_flush_list);
        ut_d(bpage->in_flush_list = FALSE);

        switch (buf_page_get_state(bpage)) {
        case BUF_BLOCK_ZIP_PAGE:
                /* clean compressed pages should not be on the flush list */
        case BUF_BLOCK_ZIP_FREE:
        case BUF_BLOCK_NOT_USED:
        case BUF_BLOCK_READY_FOR_USE:
        case BUF_BLOCK_MEMORY:
        case BUF_BLOCK_REMOVE_HASH:
                ut_error;
                return;
        case BUF_BLOCK_ZIP_DIRTY:
                buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
                UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
                buf_LRU_insert_zip_clean(bpage);
                break;
        case BUF_BLOCK_FILE_PAGE:
                UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
                break;
        }

        bpage->oldest_modification = 0;

        ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
                              ut_ad(ut_list_node_313->in_flush_list)));
}

/********************************************************************//**
Updates the flush system data structures when a write is completed. */
UNIV_INTERN
void
buf_flush_write_complete(
/*=====================*/
        buf_page_t*     bpage)  /*!< in: pointer to the block in question */
{
        enum buf_flush  flush_type;

        ut_ad(bpage);

        buf_flush_remove(bpage);

        flush_type = buf_page_get_flush_type(bpage);
        buf_pool->n_flush[flush_type]--;

        if (flush_type == BUF_FLUSH_LRU) {
                /* Put the block to the end of the LRU list to wait to be
                moved to the free list */

                buf_LRU_make_block_old(bpage);

                buf_pool->LRU_flush_ended++;
        }

        /* fprintf(stderr, "n pending flush %lu\n",
        buf_pool->n_flush[flush_type]); */

        if ((buf_pool->n_flush[flush_type] == 0)
            && (buf_pool->init_flush[flush_type] == FALSE)) {

                /* The running flush batch has ended */

                os_event_set(buf_pool->no_flush[flush_type]);
        }
}
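
/* A summary of the batch-completion signalling used above (added for
orientation; the protocol is spread over several functions):
buf_flush_page() resets buf_pool->no_flush[flush_type] when the first
page of a batch is posted and increments n_flush[flush_type] for each
page.  This function decrements the counter as each write completes
and sets the event once the counter drops to zero while no batch is
being initialized (init_flush[flush_type] == FALSE).
buf_flush_wait_batch_end() simply waits on that event. */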
 
/********************************************************************//**
Flushes to disk a batch of writes to the datafiles that have already
been posted to the OS. */
static
void
buf_flush_sync_datafiles(void)
/*==========================*/
{
        /* Wake possible simulated aio thread to actually post the
        writes to the operating system */
        os_aio_simulated_wake_handler_threads();

        /* Wait until all async writes to tablespaces have been posted to
        the OS */
        os_aio_wait_until_no_pending_writes();

        /* Now we flush the data to disk (for example, with fsync) */
        fil_flush_file_spaces(FIL_TABLESPACE);

        return;
}

/********************************************************************//**
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
        byte*           write_buf;
        ulint           len;
        ulint           len2;
        ulint           i;

        if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
                /* Sync the writes to the disk. */
                buf_flush_sync_datafiles();
                return;
        }

        mutex_enter(&(trx_doublewrite->mutex));

        /* Write first to doublewrite buffer blocks. We use synchronous
        aio and thus know that file write has been completed when the
        control returns. */

        if (trx_doublewrite->first_free == 0) {

                mutex_exit(&(trx_doublewrite->mutex));

                return;
        }

        for (i = 0; i < trx_doublewrite->first_free; i++) {

                const buf_block_t*      block;

                block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];

                if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
                    || block->page.zip.data) {
                        /* No simple validate for compressed pages exists. */
                        continue;
                }

                if (UNIV_UNLIKELY
                    (memcmp(block->frame + (FIL_PAGE_LSN + 4),
                            block->frame + (UNIV_PAGE_SIZE
                                            - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
                            4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the buffer pool\n"
                                "InnoDB: before posting to the"
                                " doublewrite buffer.\n");
                }

                if (!block->check_index_page_at_flush) {
                } else if (page_is_comp(block->frame)) {
                        if (UNIV_UNLIKELY
                            (!page_simple_validate_new(block->frame))) {
corrupted_page:
                                buf_page_print(block->frame, 0);

                                ut_print_timestamp(stderr);
                                fprintf(stderr,
                                        "  InnoDB: Apparent corruption of an"
                                        " index page n:o %lu in space %lu\n"
                                        "InnoDB: to be written to data file."
                                        " We intentionally crash server\n"
                                        "InnoDB: to prevent corrupt data"
                                        " from ending up in data\n"
                                        "InnoDB: files.\n",
                                        (ulong) buf_block_get_page_no(block),
                                        (ulong) buf_block_get_space(block));

                                ut_error;
                        }
                } else if (UNIV_UNLIKELY
                           (!page_simple_validate_old(block->frame))) {

                        goto corrupted_page;
                }
        }

        /* increment the doublewrite flushed pages counter */
        srv_dblwr_pages_written += trx_doublewrite->first_free;
        srv_dblwr_writes++;

        len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
                     trx_doublewrite->first_free) * UNIV_PAGE_SIZE;

        write_buf = trx_doublewrite->write_buf;
        i = 0;

        fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
               trx_doublewrite->block1, 0, len,
               (void*) write_buf, NULL);

        for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
             len2 += UNIV_PAGE_SIZE, i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                if (UNIV_LIKELY(!block->page.zip.data)
                    && UNIV_LIKELY(buf_block_get_state(block)
                                   == BUF_BLOCK_FILE_PAGE)
                    && UNIV_UNLIKELY
                    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
                            write_buf + len2
                            + (UNIV_PAGE_SIZE
                               - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the doublewrite block1.\n");
                }
        }

        if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                goto flush;
        }

        len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
                * UNIV_PAGE_SIZE;

        write_buf = trx_doublewrite->write_buf
                + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
        ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);

        fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
               trx_doublewrite->block2, 0, len,
               (void*) write_buf, NULL);

        for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
             len2 += UNIV_PAGE_SIZE, i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                if (UNIV_LIKELY(!block->page.zip.data)
                    && UNIV_LIKELY(buf_block_get_state(block)
                                   == BUF_BLOCK_FILE_PAGE)
                    && UNIV_UNLIKELY
                    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
                            write_buf + len2
                            + (UNIV_PAGE_SIZE
                               - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be"
                                " written seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in"
                                " the doublewrite block2.\n");
                }
        }

flush:
        /* Now flush the doublewrite buffer data to disk */

        fil_flush(TRX_SYS_SPACE);

        /* We know that the writes have been flushed to disk now
        and in recovery we will find them in the doublewrite buffer
        blocks. Next do the writes to the intended positions. */

        for (i = 0; i < trx_doublewrite->first_free; i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                ut_a(buf_page_in_file(&block->page));
                if (UNIV_LIKELY_NULL(block->page.zip.data)) {
                        fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                               FALSE, buf_page_get_space(&block->page),
                               buf_page_get_zip_size(&block->page),
                               buf_page_get_page_no(&block->page), 0,
                               buf_page_get_zip_size(&block->page),
                               (void*)block->page.zip.data,
                               (void*)block);

                        /* Increment the counter of I/O operations used
                        for selecting LRU policy. */
                        buf_LRU_stat_inc_io();

                        continue;
                }

                ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

                if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
                                         block->frame
                                         + (UNIV_PAGE_SIZE
                                            - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
                                         4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the buffer pool\n"
                                "InnoDB: after posting and flushing"
                                " the doublewrite buffer.\n"
                                "InnoDB: Page buf fix count %lu,"
                                " io fix %lu, state %lu\n",
                                (ulong)block->page.buf_fix_count,
                                (ulong)buf_block_get_io_fix(block),
                                (ulong)buf_block_get_state(block));
                }

                fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                       FALSE, buf_block_get_space(block), 0,
                       buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
                       (void*)block->frame, (void*)block);

                /* Increment the counter of I/O operations used
                for selecting LRU policy. */
                buf_LRU_stat_inc_io();
        }

        /* Sync the writes to the disk. */
        buf_flush_sync_datafiles();

        /* We can now reuse the doublewrite memory buffer: */
        trx_doublewrite->first_free = 0;

        mutex_exit(&(trx_doublewrite->mutex));
}
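
/* A sketch of the doublewrite layout assumed by the function above:
trx_doublewrite->write_buf holds up to
2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE page images, flushed as two
contiguous chunks of the system tablespace:

        write_buf: [ pages 0 .. N-1 ][ pages N .. 2N-1 ]
                     -> block1         -> block2

where N == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE.  Both chunks are written
synchronously and fsynced before any real page write is posted, so
after a crash a torn data page can be restored from its intact
doublewrite copy. */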
 
/********************************************************************//**
Posts a buffer page for writing. If the doublewrite memory buffer is
full, calls buf_flush_buffered_writes and waits for free space to
appear. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
        buf_page_t*     bpage)  /*!< in: buffer block to write */
{
        ulint   zip_size;
try_again:
        mutex_enter(&(trx_doublewrite->mutex));

        ut_a(buf_page_in_file(bpage));

        if (trx_doublewrite->first_free
            >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                mutex_exit(&(trx_doublewrite->mutex));

                buf_flush_buffered_writes();

                goto try_again;
        }

        zip_size = buf_page_get_zip_size(bpage);

        if (UNIV_UNLIKELY(zip_size)) {
                /* Copy the compressed page and clear the rest. */
                memcpy(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
                       bpage->zip.data, zip_size);
                memset(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free
                       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
        } else {
                ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

                memcpy(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
                       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
        }

        trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;

        trx_doublewrite->first_free++;

        if (trx_doublewrite->first_free
            >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                mutex_exit(&(trx_doublewrite->mutex));

                buf_flush_buffered_writes();

                return;
        }

        mutex_exit(&(trx_doublewrite->mutex));
}
#endif /* !UNIV_HOTBACKUP */
 
/********************************************************************//**
Initializes a page for writing to the tablespace. */
UNIV_INTERN
void
buf_flush_init_for_writing(
/*=======================*/
        byte*           page,           /*!< in/out: page */
        void*           page_zip_,      /*!< in/out: compressed page, or NULL */
        ib_uint64_t     newest_lsn)     /*!< in: newest modification lsn
                                        to the page */
{
        ut_ad(page);

        if (page_zip_) {
                page_zip_des_t* page_zip = page_zip_;
                ulint           zip_size = page_zip_get_size(page_zip);
                ut_ad(zip_size);
                ut_ad(ut_is_2pow(zip_size));
                ut_ad(zip_size <= UNIV_PAGE_SIZE);

                switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
                case FIL_PAGE_TYPE_ALLOCATED:
                case FIL_PAGE_INODE:
                case FIL_PAGE_IBUF_BITMAP:
                case FIL_PAGE_TYPE_FSP_HDR:
                case FIL_PAGE_TYPE_XDES:
                        /* These are essentially uncompressed pages. */
                        memcpy(page_zip->data, page, zip_size);
                        /* fall through */
                case FIL_PAGE_TYPE_ZBLOB:
                case FIL_PAGE_TYPE_ZBLOB2:
                case FIL_PAGE_INDEX:
                        mach_write_ull(page_zip->data
                                       + FIL_PAGE_LSN, newest_lsn);
                        memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
                        mach_write_to_4(page_zip->data
                                        + FIL_PAGE_SPACE_OR_CHKSUM,
                                        srv_use_checksums
                                        ? page_zip_calc_checksum(
                                                page_zip->data, zip_size)
                                        : BUF_NO_CHECKSUM_MAGIC);
                        return;
                }

                ut_print_timestamp(stderr);
                fputs("  InnoDB: ERROR: The compressed page to be written"
                      " seems corrupt:", stderr);
                ut_print_buf(stderr, page, zip_size);
                fputs("\nInnoDB: Possibly older version of the page:", stderr);
                ut_print_buf(stderr, page_zip->data, zip_size);
                putc('\n', stderr);
                ut_error;
        }

        /* Write the newest modification lsn to the page header and trailer */
        mach_write_ull(page + FIL_PAGE_LSN, newest_lsn);

        mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
                       newest_lsn);

        /* Store the new formula checksum */

        mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
                        srv_use_checksums
                        ? buf_calc_page_new_checksum(page)
                        : BUF_NO_CHECKSUM_MAGIC);

        /* We overwrite the first 4 bytes of the end lsn field to store
        the old formula checksum. Since it depends also on the field
        FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
        new formula checksum. */

        mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
                        srv_use_checksums
                        ? buf_calc_page_old_checksum(page)
                        : BUF_NO_CHECKSUM_MAGIC);
}
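
/* For orientation, the fields written above sit at the two ends of an
uncompressed page (offsets as defined in fil0fil.h):

        FIL_PAGE_SPACE_OR_CHKSUM        new-formula checksum (4 bytes)
        FIL_PAGE_LSN                    newest_lsn (8 bytes)
        ...
        last 8 bytes (FIL_PAGE_END_LSN_OLD_CHKSUM):
                old-formula checksum (4) + low 4 bytes of newest_lsn

This is why the corruption checks in buf_flush_buffered_writes()
compare the 4 bytes at FIL_PAGE_LSN + 4 against the last 4 bytes of
the page: both must contain the low 32 bits of the same lsn. */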
 
#ifndef UNIV_HOTBACKUP
/********************************************************************//**
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
also when the doublewrite buffer is used, we must call
buf_flush_buffered_writes after we have posted a batch of writes! */
static
void
buf_flush_write_block_low(
/*======================*/
        buf_page_t*     bpage)  /*!< in: buffer block to write */
{
        ulint   zip_size        = buf_page_get_zip_size(bpage);
        page_t* frame           = NULL;
#ifdef UNIV_LOG_DEBUG
        static ibool univ_log_debug_warned;
#endif /* UNIV_LOG_DEBUG */

        ut_ad(buf_page_in_file(bpage));

        /* We are not holding buf_pool_mutex or block_mutex here.
        Nevertheless, it is safe to access bpage, because it is
        io_fixed and oldest_modification != 0.  Thus, it cannot be
        relocated in the buffer pool or removed from flush_list or
        LRU_list. */
        ut_ad(!buf_pool_mutex_own());
        ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
        ut_ad(bpage->oldest_modification != 0);

#ifdef UNIV_IBUF_COUNT_DEBUG
        ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
#endif
        ut_ad(bpage->newest_modification != 0);

#ifdef UNIV_LOG_DEBUG
        if (!univ_log_debug_warned) {
                univ_log_debug_warned = TRUE;
                fputs("Warning: cannot force log to disk if"
                      " UNIV_LOG_DEBUG is defined!\n"
                      "Crash recovery will not work!\n",
                      stderr);
        }
#else
        /* Force the log to the disk before writing the modified block */
        log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
#endif
        switch (buf_page_get_state(bpage)) {
        case BUF_BLOCK_ZIP_FREE:
        case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
        case BUF_BLOCK_NOT_USED:
        case BUF_BLOCK_READY_FOR_USE:
        case BUF_BLOCK_MEMORY:
        case BUF_BLOCK_REMOVE_HASH:
                ut_error;
                break;
        case BUF_BLOCK_ZIP_DIRTY:
                frame = bpage->zip.data;
                if (UNIV_LIKELY(srv_use_checksums)) {
                        ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
                             == page_zip_calc_checksum(frame, zip_size));
                }
                mach_write_ull(frame + FIL_PAGE_LSN,
                               bpage->newest_modification);
                memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
                break;
        case BUF_BLOCK_FILE_PAGE:
                frame = bpage->zip.data;
                if (!frame) {
                        frame = ((buf_block_t*) bpage)->frame;
                }

                buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
                                           bpage->zip.data
                                           ? &bpage->zip : NULL,
                                           bpage->newest_modification);
                break;
        }

        if (!srv_use_doublewrite_buf || !trx_doublewrite) {
                fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                       FALSE, buf_page_get_space(bpage), zip_size,
                       buf_page_get_page_no(bpage), 0,
                       zip_size ? zip_size : UNIV_PAGE_SIZE,
                       frame, bpage);
        } else {
                buf_flush_post_to_doublewrite_buf(bpage);
        }
}
 
/********************************************************************//**
Writes a flushable page asynchronously from the buffer pool to a file.
NOTE: in simulated aio we must call
os_aio_simulated_wake_handler_threads after we have posted a batch of
writes! NOTE: buf_pool_mutex and buf_page_get_mutex(bpage) must be
held upon entering this function, and they will be released by this
function. */
static
void
buf_flush_page(
/*===========*/
        buf_page_t*     bpage,          /*!< in: buffer control block */
        enum buf_flush  flush_type)     /*!< in: BUF_FLUSH_LRU
                                        or BUF_FLUSH_LIST */
{
        mutex_t*        block_mutex;
        ibool           is_uncompressed;

        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
        ut_ad(buf_pool_mutex_own());
        ut_ad(buf_page_in_file(bpage));

        block_mutex = buf_page_get_mutex(bpage);
        ut_ad(mutex_own(block_mutex));

        ut_ad(buf_flush_ready_for_flush(bpage, flush_type));

        buf_page_set_io_fix(bpage, BUF_IO_WRITE);

        buf_page_set_flush_type(bpage, flush_type);

        if (buf_pool->n_flush[flush_type] == 0) {

                os_event_reset(buf_pool->no_flush[flush_type]);
        }

        buf_pool->n_flush[flush_type]++;

        is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
        ut_ad(is_uncompressed == (block_mutex != &buf_pool_zip_mutex));

        switch (flush_type) {
                ibool   is_s_latched;
        case BUF_FLUSH_LIST:
                /* If the simulated aio thread is not running, we must
                not wait for any latch, as we may end up in a deadlock:
                if buf_fix_count == 0, then we know we need not wait */

                is_s_latched = (bpage->buf_fix_count == 0);
                if (is_s_latched && is_uncompressed) {
                        rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
                                           BUF_IO_WRITE);
                }

                mutex_exit(block_mutex);
                buf_pool_mutex_exit();

                /* Even though bpage is not protected by any mutex at
                this point, it is safe to access bpage, because it is
                io_fixed and oldest_modification != 0.  Thus, it
                cannot be relocated in the buffer pool or removed from
                flush_list or LRU_list. */

                if (!is_s_latched) {
                        buf_flush_buffered_writes();

                        if (is_uncompressed) {
                                rw_lock_s_lock_gen(&((buf_block_t*) bpage)
                                                   ->lock, BUF_IO_WRITE);
                        }
                }

                break;

        case BUF_FLUSH_LRU:
                /* VERY IMPORTANT:
                Because any thread may call the LRU flush, even when owning
                locks on pages, to avoid deadlocks, we must make sure that the
                s-lock is acquired on the page without waiting: this is
                accomplished because buf_flush_ready_for_flush() must hold,
                and that requires the page not to be bufferfixed. */

                if (is_uncompressed) {
                        rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
                                           BUF_IO_WRITE);
                }

                /* Note that the s-latch is acquired before releasing the
                buf_pool mutex: this ensures that the latch is acquired
                immediately. */

                mutex_exit(block_mutex);
                buf_pool_mutex_exit();
                break;

        default:
                ut_error;
        }

        /* Even though bpage is not protected by any mutex at this
        point, it is safe to access bpage, because it is io_fixed and
        oldest_modification != 0.  Thus, it cannot be relocated in the
        buffer pool or removed from flush_list or LRU_list. */

#ifdef UNIV_DEBUG
        if (buf_debug_prints) {
                fprintf(stderr,
                        "Flushing %u space %u page %u\n",
                        flush_type, bpage->space, bpage->offset);
        }
#endif /* UNIV_DEBUG */
        buf_flush_write_block_low(bpage);
}
 
/***********************************************************//**
Flushes to disk all flushable pages within the flush area.
@return number of pages flushed */
static
ulint
buf_flush_try_neighbors(
/*====================*/
        ulint           space,          /*!< in: space id */
        ulint           offset,         /*!< in: page offset */
        enum buf_flush  flush_type)     /*!< in: BUF_FLUSH_LRU or
                                        BUF_FLUSH_LIST */
{
        buf_page_t*     bpage;
        ulint           low, high;
        ulint           count           = 0;
        ulint           i;

        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

        if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
                /* If there is little space, it is better not to flush any
                block except from the end of the LRU list */

                low = offset;
                high = offset + 1;
        } else {
                /* When flushed, dirty blocks are searched in neighborhoods of
                this size, and flushed along with the original page. */

                ulint   buf_flush_area  = ut_min(BUF_READ_AHEAD_AREA,
                                                 buf_pool->curr_size / 16);

                low = (offset / buf_flush_area) * buf_flush_area;
                high = (offset / buf_flush_area + 1) * buf_flush_area;
        }

        /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */

        if (high > fil_space_get_size(space)) {
                high = fil_space_get_size(space);
        }

        buf_pool_mutex_enter();

        for (i = low; i < high; i++) {

                bpage = buf_page_hash_get(space, i);

                if (!bpage) {

                        continue;
                }

                ut_a(buf_page_in_file(bpage));

                /* We avoid flushing 'non-old' blocks in an LRU flush,
                because the flushed blocks are soon freed */

                if (flush_type != BUF_FLUSH_LRU
                    || i == offset
                    || buf_page_is_old(bpage)) {
                        mutex_t* block_mutex = buf_page_get_mutex(bpage);

                        mutex_enter(block_mutex);

                        if (buf_flush_ready_for_flush(bpage, flush_type)
                            && (i == offset || !bpage->buf_fix_count)) {
                                /* We only try to flush those
                                neighbors != offset where the buf fix count is
                                zero, as we then know that we probably can
                                latch the page without a semaphore wait.
                                Semaphore waits are expensive because we must
                                flush the doublewrite buffer before we start
                                waiting. */

                                buf_flush_page(bpage, flush_type);
                                ut_ad(!mutex_own(block_mutex));
                                count++;

                                buf_pool_mutex_enter();
                        } else {
                                mutex_exit(block_mutex);
                        }
                }
        }

        buf_pool_mutex_exit();

        return(count);
}
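
/* A worked example of the neighborhood arithmetic above (hypothetical
values): suppose buf_flush_area evaluates to 64 (it is capped both by
BUF_READ_AHEAD_AREA and by buf_pool->curr_size / 16).  For
offset == 200:

        low  = (200 / 64) * 64       = 192
        high = (200 / 64 + 1) * 64   = 256

so pages 192..255 of the tablespace are considered for flushing
together with page 200, subject to the readiness checks above. */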
 
/*******************************************************************//**
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it cannot
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
the calling thread is not allowed to own any latches on pages!
@return number of blocks for which the write request was queued;
ULINT_UNDEFINED if there was a flush of the same type already running */
UNIV_INTERN
ulint
buf_flush_batch(
/*============*/
        enum buf_flush  flush_type,     /*!< in: BUF_FLUSH_LRU or
                                        BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
                                        then the caller must not own any
                                        latches on pages */
        ulint           min_n,          /*!< in: wished minimum number of blocks
                                        flushed (it is not guaranteed that the
                                        actual number is that big, though) */
        ib_uint64_t     lsn_limit)      /*!< in: in the case of BUF_FLUSH_LIST,
                                        all blocks whose oldest_modification is
                                        smaller than this should be flushed
                                        (if their number does not exceed
                                        min_n), otherwise ignored */
{
        buf_page_t*     bpage;
        ulint           page_count      = 0;
        ulint           old_page_count;
        ulint           space;
        ulint           offset;

        ut_ad((flush_type == BUF_FLUSH_LRU)
              || (flush_type == BUF_FLUSH_LIST));
#ifdef UNIV_SYNC_DEBUG
        ut_ad((flush_type != BUF_FLUSH_LIST)
              || sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
        buf_pool_mutex_enter();

        if ((buf_pool->n_flush[flush_type] > 0)
            || (buf_pool->init_flush[flush_type] == TRUE)) {

                /* There is already a flush batch of the same type running */

                buf_pool_mutex_exit();

                return(ULINT_UNDEFINED);
        }

        buf_pool->init_flush[flush_type] = TRUE;

        for (;;) {
flush_next:
                /* If we have flushed enough, leave the loop */
                if (page_count >= min_n) {

                        break;
                }

                /* Start from the end of the list looking for a suitable
                block to be flushed. */

                if (flush_type == BUF_FLUSH_LRU) {
                        bpage = UT_LIST_GET_LAST(buf_pool->LRU);
                } else {
                        ut_ad(flush_type == BUF_FLUSH_LIST);

                        bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
                        if (!bpage
                            || bpage->oldest_modification >= lsn_limit) {
                                /* We have flushed enough */

                                break;
                        }
                        ut_ad(bpage->in_flush_list);
                }

                /* Note that after finding a single flushable page, we try to
                flush also all its neighbors, and after that start from the
                END of the LRU list or flush list again: the list may change
                during the flushing and we cannot safely preserve within this
                function a pointer to a block in the list! */

                do {
                        mutex_t* block_mutex = buf_page_get_mutex(bpage);
                        ibool   ready;

                        ut_a(buf_page_in_file(bpage));

                        mutex_enter(block_mutex);
                        ready = buf_flush_ready_for_flush(bpage, flush_type);
                        mutex_exit(block_mutex);

                        if (ready) {
                                space = buf_page_get_space(bpage);
                                offset = buf_page_get_page_no(bpage);

                                buf_pool_mutex_exit();

                                old_page_count = page_count;

                                /* Try to flush also all the neighbors */
                                page_count += buf_flush_try_neighbors(
                                        space, offset, flush_type);
                                /* fprintf(stderr,
                                "Flush type %lu, page no %lu, neighb %lu\n",
                                flush_type, offset,
                                page_count - old_page_count); */

                                buf_pool_mutex_enter();
                                goto flush_next;

                        } else if (flush_type == BUF_FLUSH_LRU) {
                                bpage = UT_LIST_GET_PREV(LRU, bpage);
                        } else {
                                ut_ad(flush_type == BUF_FLUSH_LIST);

                                bpage = UT_LIST_GET_PREV(list, bpage);
                                ut_ad(!bpage || bpage->in_flush_list);
                        }
                } while (bpage != NULL);

                /* If we could not find anything to flush, leave the loop */

                break;
        }

        buf_pool->init_flush[flush_type] = FALSE;

        if (buf_pool->n_flush[flush_type] == 0) {

                /* The running flush batch has ended */

                os_event_set(buf_pool->no_flush[flush_type]);
        }

        buf_pool_mutex_exit();

        buf_flush_buffered_writes();

#ifdef UNIV_DEBUG
        if (buf_debug_prints && page_count > 0) {
                ut_a(flush_type == BUF_FLUSH_LRU
                     || flush_type == BUF_FLUSH_LIST);
                fprintf(stderr, flush_type == BUF_FLUSH_LRU
                        ? "Flushed %lu pages in LRU flush\n"
                        : "Flushed %lu pages in flush list flush\n",
                        (ulong) page_count);
        }
#endif /* UNIV_DEBUG */

        srv_buf_pool_flushed += page_count;

        /* We keep track of all flushes happening as part of LRU
        flush. When estimating the desired rate at which flush_list
        should be flushed we factor in this value. */
        if (flush_type == BUF_FLUSH_LRU) {
                buf_lru_flush_page_count += page_count;
        }

        return(page_count);
}
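
/* A usage sketch (mirroring buf_flush_free_margin() below):

        ulint   n = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, 0);

        if (n == ULINT_UNDEFINED) {
                buf_flush_wait_batch_end(BUF_FLUSH_LRU);
        }

ULINT_UNDEFINED is not an error here: it only means that a batch of
the same type was already running and can be waited for instead. */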
 
/******************************************************************//**
Waits until a flush batch of the given type ends */
UNIV_INTERN
void
buf_flush_wait_batch_end(
/*=====================*/
        enum buf_flush  type)   /*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
        ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST));

        os_event_wait(buf_pool->no_flush[type]);
}

/******************************************************************//**
Gives a recommendation of how many blocks should be flushed to establish
a big enough margin of replaceable blocks near the end of the LRU list
and in the free list.
@return number of blocks which should be flushed from the end of the
LRU list */
static
ulint
buf_flush_LRU_recommendation(void)
/*==============================*/
{
        buf_page_t*     bpage;
        ulint           n_replaceable;
        ulint           distance        = 0;

        buf_pool_mutex_enter();

        n_replaceable = UT_LIST_GET_LEN(buf_pool->free);

        bpage = UT_LIST_GET_LAST(buf_pool->LRU);

        while ((bpage != NULL)
               && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN
                   + BUF_FLUSH_EXTRA_MARGIN)
               && (distance < BUF_LRU_FREE_SEARCH_LEN)) {

                mutex_t* block_mutex = buf_page_get_mutex(bpage);

                mutex_enter(block_mutex);

                if (buf_flush_ready_for_replace(bpage)) {
                        n_replaceable++;
                }

                mutex_exit(block_mutex);

                distance++;

                bpage = UT_LIST_GET_PREV(LRU, bpage);
        }

        buf_pool_mutex_exit();

        if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) {

                return(0);
        }

        return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN
               - n_replaceable);
}

/*********************************************************************//**
Flushes pages from the end of the LRU list if there is too small a margin
of replaceable pages there or in the free list. VERY IMPORTANT: this function
is called also by threads which have locks on pages. To avoid deadlocks, we
flush only pages such that the s-lock required for flushing can be acquired
immediately, without waiting. */
UNIV_INTERN
void
buf_flush_free_margin(void)
/*=======================*/
{
        ulint   n_to_flush;
        ulint   n_flushed;

        n_to_flush = buf_flush_LRU_recommendation();

        if (n_to_flush > 0) {
                n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, 0);
                if (n_flushed == ULINT_UNDEFINED) {
                        /* There was an LRU type flush batch already running;
                        let us wait for it to end */

                        buf_flush_wait_batch_end(BUF_FLUSH_LRU);
                }
        }
}
 
/*********************************************************************
Update the historical stats that we are collecting for flush rate
heuristics at the end of each interval.
Flush rate heuristic depends on (a) rate of redo log generation and
(b) the rate at which LRU flush is happening. */
UNIV_INTERN
void
buf_flush_stat_update(void)
/*=======================*/
{
        buf_flush_stat_t*       item;
        ib_uint64_t             lsn_diff;
        ib_uint64_t             lsn;
        ulint                   n_flushed;

        lsn = log_get_lsn();
        if (buf_flush_stat_cur.redo == 0) {
                /* First time around. Just update the current LSN
                and return. */
                buf_flush_stat_cur.redo = lsn;
                return;
        }

        item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];

        /* values for this interval */
        lsn_diff = lsn - buf_flush_stat_cur.redo;
        n_flushed = buf_lru_flush_page_count
                    - buf_flush_stat_cur.n_flushed;

        /* add the current value and subtract the obsolete entry. */
        buf_flush_stat_sum.redo += lsn_diff - item->redo;
        buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;

        /* put current entry in the array. */
        item->redo = lsn_diff;
        item->n_flushed = n_flushed;

        /* update the index */
        buf_flush_stat_arr_ind++;
        buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;

        /* reset the current entry. */
        buf_flush_stat_cur.redo = lsn;
        buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;
}
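
/* A worked example of the update above (made-up numbers): suppose the
slot being overwritten holds item->redo == 100 and the interval that
just ended generated lsn_diff == 4096 bytes of redo.  Then
buf_flush_stat_sum.redo changes by 4096 - 100: the sample from
BUF_FLUSH_STAT_N_INTERVAL seconds ago is dropped and the newest one
added, so the sum always covers the most recent completed intervals
without rescanning buf_flush_stat_arr[]. */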
 
/*********************************************************************
Determines the fraction of dirty pages that need to be flushed based
on the speed at which we generate redo log. Note that if redo log
is generated at a significant rate without a corresponding increase
in the number of dirty pages (for example, an in-memory workload)
it can cause IO bursts of flushing. This function implements heuristics
to avoid this burstiness.
@return number of dirty pages to be flushed / second */
UNIV_INTERN
ulint
buf_flush_get_desired_flush_rate(void)
/*==================================*/
{
        ulint                   redo_avg;
        ulint                   lru_flush_avg;
        ulint                   n_dirty;
        ulint                   n_flush_req;
        lint                    rate;
        ib_uint64_t             lsn = log_get_lsn();
        ulint                   log_capacity = log_get_capacity();

        /* log_capacity should never be zero after the initialization
        of the log subsystem. */
        ut_ad(log_capacity != 0);

        /* Get total number of dirty pages. It is OK to access
        flush_list without holding any mutex as we are using this
        only for heuristics. */
        n_dirty = UT_LIST_GET_LEN(buf_pool->flush_list);

        /* An overflow can happen if we generate more than 2^32 bytes
        of redo in this interval, i.e., 4G of redo in 1 second. We can
        safely consider this as infinity because if we ever come close
        to 4G we'll start a synchronous flush of dirty pages. */
        /* redo_avg below is the average rate at which redo is generated
        over the past BUF_FLUSH_STAT_N_INTERVAL intervals plus the redo
        generated in the current interval. */
        redo_avg = (ulint) (buf_flush_stat_sum.redo
                            / BUF_FLUSH_STAT_N_INTERVAL
                            + (lsn - buf_flush_stat_cur.redo));

        /* An overflow can happen possibly if we flush more than 2^32
        pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very very
        unlikely scenario. Even when this happens it means that our
        flush rate will be off the mark. It won't affect correctness
        of any subsystem. */
        /* lru_flush_avg below is the rate at which pages are flushed as
        part of the LRU flush over the past BUF_FLUSH_STAT_N_INTERVAL
        intervals plus the number of pages flushed in the current
        interval. */
        lru_flush_avg = buf_flush_stat_sum.n_flushed
                        / BUF_FLUSH_STAT_N_INTERVAL
                        + (buf_lru_flush_page_count
                           - buf_flush_stat_cur.n_flushed);

        n_flush_req = (n_dirty * redo_avg) / log_capacity;

        /* The number of pages that we want to flush from the flush
        list is the difference between the required rate and the
        number of pages that we are historically flushing from the
        LRU list */
        rate = n_flush_req - lru_flush_avg;
        return(rate > 0 ? (ulint) rate : 0);
}
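
/* A worked example of the heuristic above (all numbers hypothetical):
with n_dirty == 10000 pages, redo_avg == 1 MB/s and log_capacity ==
100 MB,

        n_flush_req = (10000 * 1 MB/s) / 100 MB = 100 pages/s

i.e. the goal is to cycle through the dirty pages in roughly the time
the workload would need to fill the redo log.  If the LRU flush
already retires 40 pages/s (lru_flush_avg == 40), only the remaining
60 pages/s are requested from the flush_list. */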
 
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
static
ibool
buf_flush_validate_low(void)
/*========================*/
{
        buf_page_t*     bpage;

        UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
                         ut_ad(ut_list_node_313->in_flush_list));

        bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);

        while (bpage != NULL) {
                const ib_uint64_t om = bpage->oldest_modification;
                ut_ad(bpage->in_flush_list);
                ut_a(buf_page_in_file(bpage));
                ut_a(om > 0);

                bpage = UT_LIST_GET_NEXT(list, bpage);

                ut_a(!bpage || om >= bpage->oldest_modification);
        }

        return(TRUE);
}

/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
UNIV_INTERN
ibool
buf_flush_validate(void)
/*====================*/
{
        ibool   ret;

        buf_pool_mutex_enter();

        ret = buf_flush_validate_low();

        buf_pool_mutex_exit();

        return(ret);
}
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#endif /* !UNIV_HOTBACKUP */