/*****************************************************************************

Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.

Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA

*****************************************************************************/

/**************************************************//**
The database buffer buf_pool

Created 11/5/1995 Heikki Tuuri
*******************************************************/
#ifndef UNIV_HOTBACKUP
#include "buf0buddy.h"
#include "lock0lock.h"
#include "ibuf0ibuf.h"
#endif /* !UNIV_HOTBACKUP */
#include "dict0dict.h"
/*
		IMPLEMENTATION OF THE BUFFER POOL
		=================================

Performance improvement:
------------------------
Thread scheduling in NT may be so slow that the OS wait mechanism should
not be used even in waiting for disk reads to complete.
Rather, we should put waiting query threads to the queue of
waiting jobs, and let the OS thread do something useful while the i/o
is processed. In this way we could remove most OS thread switches in
an i/o-intensive benchmark like TPC-C.

A possibility is to put a user space thread library between the database
and NT. User space thread libraries might be very fast.

SQL Server 7.0 can be configured to use 'fibers' which are lightweight
threads in NT. These should be studied.

		Buffer frames and blocks
		------------------------
Following the terminology of Gray and Reuter, we call the memory
blocks where file pages are loaded buffer frames. For each buffer
frame there is a control block, or shortly, a block, in the buffer
control array. The control info which does not need to be stored
in the file along with the file page, resides in the control block.

The buffer buf_pool contains a single mutex which protects all the
control data structures of the buf_pool. The content of a buffer frame is
protected by a separate read-write lock in its control block, though.
These locks can be locked and unlocked without owning the buf_pool mutex.
The OS events in the buf_pool struct can be waited for without owning the
buf_pool mutex.

The buf_pool mutex is a hot-spot in main memory, causing a lot of
memory bus traffic on multiprocessor systems when processors
alternately access the mutex. On our Pentium, the mutex is accessed
maybe every 10 microseconds. We gave up the solution to have mutexes
for each control block, for instance, because it seemed to be
complicated.

A solution to reduce mutex contention of the buf_pool mutex is to
create a separate mutex for the page hash table. On Pentium,
accessing the hash table takes 2 microseconds, about half
of the total buf_pool mutex hold time.

The control block contains, for instance, the bufferfix count
which is incremented when a thread wants a file page to be fixed
in a buffer frame. The bufferfix operation does not lock the
contents of the frame, however. For this purpose, the control
block contains a read-write lock.
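
As an illustration only (a sketch, not the actual InnoDB code:
rw_lock_s_lock()/rw_lock_s_unlock() are real latching primitives, while
the fix-count helpers named here are hypothetical), reading a frame
involves both levels of protection:

	buf_fix_count_inc(&block->page);	bufferfix: the frame cannot
						be evicted from the pool
	rw_lock_s_lock(&block->lock);		latch: protects the frame
						contents
	... read data from block->frame ...
	rw_lock_s_unlock(&block->lock);
	buf_fix_count_dec(&block->page);	the frame may be evicted again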

The buffer frames have to be aligned so that the start memory
address of a frame is divisible by the universal page size, which
is a power of 2.

We intend to make the buffer buf_pool size on-line reconfigurable,
that is, the buf_pool size can be changed without closing the database.
Then the database administrator may adjust it to be bigger
at night, for example. The control block array must
contain enough control blocks for the maximum buffer buf_pool size
which is used in the particular database.
If the buf_pool size is cut, we exploit the virtual memory mechanism of
the OS, and just refrain from using frames at high addresses. Then the OS
can swap them to disk.

The control blocks containing file pages are put to a hash table
according to the file address of the page.
We could speed up the access to an individual page by using
"pointer swizzling": we could replace the page references on
non-leaf index pages by direct pointers to the page, if it exists
in the buf_pool. We could make a separate hash table where we could
chain all the page references in non-leaf pages residing in the buf_pool,
using the page reference as the hash key,
and at the time of reading of a page update the pointers accordingly.
Drawbacks of this solution are added complexity and,
possibly, extra space required on non-leaf pages for memory pointers.
A simpler solution is just to speed up the hash table mechanism
in the database, using tables whose size is a power of 2.
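
To illustrate the power-of-2 idea (a sketch only, not the actual hash
implementation): if the number of hash cells n_cells is a power of 2,
the bucket of a page can be computed with a mask instead of a modulo:

	fold   = buf_page_address_fold(space, offset);
	bucket = fold & (n_cells - 1);	equal to fold % n_cells
					when n_cells is a power of 2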

There are several lists of control blocks.

The free list (buf_pool->free) contains blocks which are currently not
used.

The common LRU list contains all the blocks holding a file page
except those for which the bufferfix count is non-zero.
The pages are in the LRU list roughly in the order of the last
access to the page, so that the oldest pages are at the end of the
list. We also keep a pointer to near the end of the LRU list,
which we can use when we want to artificially age a page in the
buf_pool. This is used if we know that some page is not needed
again for some time: we insert the block right after the pointer,
causing it to be replaced sooner than would normally be the case.
Currently this aging mechanism is used for the read-ahead of pages,
and it can also be used when there is a scan of a full
table which cannot fit in the memory. Putting the pages near the end
of the LRU list, we make sure that most of the buf_pool stays in the
main memory, undisturbed.
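
A simplified sketch of the two insertion policies (the real logic lives
in buf0lru.c; the flag name below is made up for illustration):

	if (page_is_likely_to_be_reused) {
		UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, bpage);
	} else {
		(artificial aging: insert right after the "old" pointer,
		 so the page is evicted sooner)
		UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU,
				     buf_pool->LRU_old, bpage);
	}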

The unzip_LRU list contains a subset of the common LRU list. The
blocks on the unzip_LRU list hold a compressed file page and the
corresponding uncompressed page frame. A block is in unzip_LRU if and
only if the predicate buf_page_belongs_to_unzip_LRU(&block->page)
holds. The blocks in unzip_LRU will be in the same order as they are in
the common LRU list. That is, each manipulation of the common LRU
list will result in the same manipulation of the unzip_LRU list.
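
Roughly, the predicate holds when the block carries both page images
(a sketch of the condition, not a verbatim copy of the code):

	bpage->zip.data != NULL
	&& buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE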

The chain of modified blocks (buf_pool->flush_list) contains the blocks
holding file pages that have been modified in the memory
but not written to disk yet. The block with the oldest modification
which has not yet been written to disk is at the end of the chain.

The chain of unmodified compressed blocks (buf_pool->zip_clean)
contains the control blocks (buf_page_t) of those compressed pages
that are not in buf_pool->flush_list and for which no uncompressed
page has been allocated in the buffer pool. The control blocks for
uncompressed pages are accessible via buf_block_t objects that are
reachable via buf_pool->chunks[].
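
The flush_list ordering can be stated as an invariant: walking from the
head (newest modification) towards the tail, oldest_modification never
increases. A debug-style check could look like this sketch:

	for (b = UT_LIST_GET_FIRST(buf_pool->flush_list);
	     b && UT_LIST_GET_NEXT(list, b);
	     b = UT_LIST_GET_NEXT(list, b)) {
		ut_a(b->oldest_modification
		     >= UT_LIST_GET_NEXT(list, b)->oldest_modification);
	}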

The chains of free memory blocks (buf_pool->zip_free[]) are used by
the buddy allocator (buf0buddy.c) to keep track of currently unused
memory blocks of size sizeof(buf_page_t)..UNIV_PAGE_SIZE / 2. These
blocks are inside the UNIV_PAGE_SIZE-sized memory blocks of type
BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
pool. The buddy allocator is solely used for allocating control
blocks for compressed pages (buf_page_t) and compressed page frames.
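
A sketch of the size-class idea (simplified; the real classes and limits
are defined in the buddy allocator code): a request of size bytes is
served from the smallest class that fits, and zip_free[i] chains the free
blocks of class i:

	i = 0;
	class_size = smallest_buddy_size;	(on the order of
						 sizeof(buf_page_t))
	while (class_size < size) {
		class_size *= 2;
		i++;
	}
	take a block from buf_pool->zip_free[i], splitting a larger
	buddy into two halves when that list is empty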

First, a victim block for replacement has to be found in the
buf_pool. It is taken from the free list or searched for from the
end of the LRU-list. An exclusive lock is reserved for the frame,
the io_fix field is set in the block fixing the block in buf_pool,
and the io-operation for loading the page is queued. The io-handler thread
releases the X-lock on the frame and resets the io_fix field
when the io operation completes.

A thread may request the above operation using the function
buf_page_get(). It may then continue to request a lock on the frame.
The lock is granted when the io-handler releases the x-lock.
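
In outline (a simplified sketch; the real work is spread over buf0rea.c,
buf0flu.c and the i/o handler threads):

	block = buf_LRU_get_free_block(0);		(find a victim frame)
	... initialize the block and insert it into the page hash ...
	buf_page_set_io_fix(&block->page, BUF_IO_READ);
	rw_lock_x_lock(&block->lock);			(readers must wait)
	... queue the asynchronous read with fil_io() ...

	later, in the i/o handler thread, when the read completes:

	buf_page_set_io_fix(&block->page, BUF_IO_NONE);
	rw_lock_x_unlock(&block->lock);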

The read-ahead mechanism is intended to be intelligent and
isolated from the semantically higher levels of the database
index management. From the higher level we only need the
information if a file page has a natural successor or
predecessor page. On the leaf level of a B-tree index,
these are the next and previous pages in the natural order.

Let us first explain the read-ahead mechanism when the leaves
of a B-tree are scanned in an ascending or descending order.
When a page is referenced in the buf_pool for the first time,
the buffer manager checks if it is at the border of a so-called
linear read-ahead area. The tablespace is divided into these
areas of size 64 blocks, for example. So if the page is at the
border of such an area, the read-ahead mechanism checks if
all the other blocks in the area have been accessed in an
ascending or descending order. If this is the case, the system
looks at the natural successor or predecessor of the page,
checks if that is at the border of another area, and in this case
issues read-requests for all the pages in that area. Maybe
we could relax the condition that all the pages in the area
have to be accessed: if data is deleted from a table, there may
appear holes of unused pages in the area.
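
A sketch of the border test for an area of 64 pages (the real check
lives in buf0rea.c):

	area = 64;
	low  = (offset / area) * area;		(first page of the area)
	high = low + area;			(one past the last page)

	if (offset == low || offset == high - 1) {
		the page is at an area border: check whether the other
		pages of the area were accessed in ascending or
		descending order, and if so, issue reads for the
		neighbouring area
	}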

A different read-ahead mechanism is used when there appears
to be a random access pattern to a file.
If a new page is referenced in the buf_pool, and several pages
of its random access area (for instance, 32 consecutive pages
in a tablespace) have recently been referenced, we may predict
that the whole area may be needed in the near future, and issue
the read requests for the whole area.
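
A sketch of the trigger for an area of 32 pages (the threshold name is
made up for illustration; the real check lives in buf0rea.c):

	low = (offset / 32) * 32;
	recent = 0;
	for (i = low; i < low + 32; i++) {
		if (buf_page_peek(space, i)) {	(page is in the buf_pool)
			recent++;
		}
	}
	if (recent >= random_threshold) {
		issue read requests for every page of the area
	}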
*/
#ifndef UNIV_HOTBACKUP
/** Value in microseconds */
static const int WAIT_FOR_READ	= 5000;

/** The buffer buf_pool of the database */
UNIV_INTERN buf_pool_t*	buf_pool = NULL;

/** mutex protecting the buffer pool struct and control blocks, except the
read-write lock in them */
UNIV_INTERN mutex_t		buf_pool_mutex;
/** mutex protecting the control blocks of compressed-only pages
(of type buf_page_t, not buf_block_t) */
UNIV_INTERN mutex_t		buf_pool_zip_mutex;

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
static ulint	buf_dbg_counter	= 0; /*!< This is used to insert validation
					operations in execution in the
					debug version */
/** Flag to forbid the release of the buffer pool mutex.
Protected by buf_pool_mutex. */
UNIV_INTERN ulint	buf_pool_mutex_exit_forbidden = 0;
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#ifdef UNIV_DEBUG
/** If this is set TRUE, the program prints info whenever
read-ahead or flush occurs */
UNIV_INTERN ibool		buf_debug_prints = FALSE;
#endif /* UNIV_DEBUG */
270
/** A chunk of buffers. The buffer pool is allocated in chunks. */
271
struct buf_chunk_struct{
272
ulint mem_size; /*!< allocated size of the chunk */
273
ulint size; /*!< size of frames[] and blocks[] */
274
void* mem; /*!< pointer to the memory area which
275
was allocated for the frames */
276
buf_block_t* blocks; /*!< array of buffer control blocks */
278
#endif /* !UNIV_HOTBACKUP */
280
/********************************************************************//**
281
Calculates a page checksum which is stored to the page when it is written
282
to a file. Note that we must be careful to calculate the same value on
283
32-bit and 64-bit architectures.
287
buf_calc_page_new_checksum(
288
/*=======================*/
289
const byte* page) /*!< in: buffer page */
293
/* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
294
..._ARCH_LOG_NO, are written outside the buffer pool to the first
295
pages of data files, we have to skip them in the page checksum
297
We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
298
checksum is stored, and also the last 8 bytes of page because
299
there we store the old formula checksum. */
301
checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
302
FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
303
+ ut_fold_binary(page + FIL_PAGE_DATA,
304
UNIV_PAGE_SIZE - FIL_PAGE_DATA
305
- FIL_PAGE_END_LSN_OLD_CHKSUM);
306
checksum = checksum & 0xFFFFFFFFUL;
311
/********************************************************************//**
312
In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
313
looked at the first few bytes of the page. This calculates that old
315
NOTE: we must first store the new formula checksum to
316
FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
317
because this takes that field as an input!
321
buf_calc_page_old_checksum(
322
/*=======================*/
323
const byte* page) /*!< in: buffer page */
327
checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
329
checksum = checksum & 0xFFFFFFFFUL;
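
/* A sketch of the write-time ordering implied by the NOTE above: the new
formula checksum must be stamped into FIL_PAGE_SPACE_OR_CHKSUM before the
old formula checksum is computed, because the old formula folds the start
of the page, which contains that very field. Roughly (illustration only):

	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
			buf_calc_page_new_checksum(page));
	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
			buf_calc_page_old_checksum(page));
*/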
334
/********************************************************************//**
335
Checks if a page is corrupt.
336
@return TRUE if corrupted */
339
buf_page_is_corrupted(
340
/*==================*/
341
const byte* read_buf, /*!< in: a database page */
342
ulint zip_size) /*!< in: size of compressed page;
343
0 for uncompressed pages */
345
ulint checksum_field;
346
ulint old_checksum_field;
348
if (UNIV_LIKELY(!zip_size)
349
&& memcmp(read_buf + FIL_PAGE_LSN + 4,
350
read_buf + UNIV_PAGE_SIZE
351
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
353
/* Stored log sequence numbers at the start and the end
354
of page do not match */
359
#ifndef UNIV_HOTBACKUP
360
if (recv_lsn_checks_on) {
361
ib_uint64_t current_lsn;
363
		if (log_peek_lsn(&current_lsn)
364
&& current_lsn < mach_read_ull(read_buf + FIL_PAGE_LSN)) {
365
ut_print_timestamp(stderr);
368
" InnoDB: Error: page %lu log sequence number"
370
"InnoDB: is in the future! Current system "
371
"log sequence number %"PRIu64".\n"
372
"InnoDB: Your database may be corrupt or "
373
"you may have copied the InnoDB\n"
374
"InnoDB: tablespace but not the InnoDB "
376
"InnoDB: " REFMAN "forcing-recovery.html\n"
377
"InnoDB: for more information.\n",
378
(ulong) mach_read_from_4(read_buf
380
mach_read_ull(read_buf + FIL_PAGE_LSN),
386
/* If we use checksums validation, make additional check before
387
returning TRUE to ensure that the checksum is not equal to
388
BUF_NO_CHECKSUM_MAGIC which might be stored by InnoDB with checksums
389
disabled. Otherwise, skip checksum calculation and return FALSE */
391
if (UNIV_LIKELY(srv_use_checksums)) {
392
checksum_field = mach_read_from_4(read_buf
393
+ FIL_PAGE_SPACE_OR_CHKSUM);
395
if (UNIV_UNLIKELY(zip_size)) {
396
return(checksum_field != BUF_NO_CHECKSUM_MAGIC
398
!= page_zip_calc_checksum(read_buf, zip_size));
401
old_checksum_field = mach_read_from_4(
402
read_buf + UNIV_PAGE_SIZE
403
- FIL_PAGE_END_LSN_OLD_CHKSUM);
405
/* There are 2 valid formulas for old_checksum_field:
407
1. Very old versions of InnoDB only stored 8 byte lsn to the
408
start and the end of the page.
410
2. Newer InnoDB versions store the old formula checksum
413
if (old_checksum_field != mach_read_from_4(read_buf
415
&& old_checksum_field != BUF_NO_CHECKSUM_MAGIC
416
&& old_checksum_field
417
!= buf_calc_page_old_checksum(read_buf)) {
422
/* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
423
(always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */
425
if (checksum_field != 0
426
&& checksum_field != BUF_NO_CHECKSUM_MAGIC
428
!= buf_calc_page_new_checksum(read_buf)) {
437
/********************************************************************//**
438
Prints a page to stderr. */
443
const byte* read_buf, /*!< in: a database page */
444
ulint zip_size) /*!< in: compressed page size, or
445
0 for uncompressed pages */
447
#ifndef UNIV_HOTBACKUP
449
#endif /* !UNIV_HOTBACKUP */
452
ulint size = zip_size;
455
size = UNIV_PAGE_SIZE;
458
ut_print_timestamp(stderr);
459
fprintf(stderr, " InnoDB: Page dump in ascii and hex (%lu bytes):\n",
461
ut_print_buf(stderr, read_buf, size);
462
fputs("\nInnoDB: End of page dump\n", stderr);
465
/* Print compressed page. */
467
switch (fil_page_get_type(read_buf)) {
468
case FIL_PAGE_TYPE_ZBLOB:
469
case FIL_PAGE_TYPE_ZBLOB2:
470
checksum = srv_use_checksums
471
? page_zip_calc_checksum(read_buf, zip_size)
472
: BUF_NO_CHECKSUM_MAGIC;
473
ut_print_timestamp(stderr);
475
" InnoDB: Compressed BLOB page"
476
" checksum %lu, stored %lu\n"
477
"InnoDB: Page lsn %lu %lu\n"
478
"InnoDB: Page number (if stored"
479
" to page already) %lu,\n"
480
"InnoDB: space id (if stored"
481
" to page already) %lu\n",
483
(ulong) mach_read_from_4(
484
read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
485
(ulong) mach_read_from_4(
486
read_buf + FIL_PAGE_LSN),
487
(ulong) mach_read_from_4(
488
read_buf + (FIL_PAGE_LSN + 4)),
489
(ulong) mach_read_from_4(
490
read_buf + FIL_PAGE_OFFSET),
491
(ulong) mach_read_from_4(
493
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
496
ut_print_timestamp(stderr);
498
" InnoDB: unknown page type %lu,"
499
" assuming FIL_PAGE_INDEX\n",
500
fil_page_get_type(read_buf));
503
checksum = srv_use_checksums
504
? page_zip_calc_checksum(read_buf, zip_size)
505
: BUF_NO_CHECKSUM_MAGIC;
507
ut_print_timestamp(stderr);
509
" InnoDB: Compressed page checksum %lu,"
511
"InnoDB: Page lsn %lu %lu\n"
512
"InnoDB: Page number (if stored"
513
" to page already) %lu,\n"
514
"InnoDB: space id (if stored"
515
" to page already) %lu\n",
517
(ulong) mach_read_from_4(
518
read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
519
(ulong) mach_read_from_4(
520
read_buf + FIL_PAGE_LSN),
521
(ulong) mach_read_from_4(
522
read_buf + (FIL_PAGE_LSN + 4)),
523
(ulong) mach_read_from_4(
524
read_buf + FIL_PAGE_OFFSET),
525
(ulong) mach_read_from_4(
527
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
529
case FIL_PAGE_TYPE_XDES:
530
/* This is an uncompressed page. */
535
checksum = srv_use_checksums
536
? buf_calc_page_new_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
537
old_checksum = srv_use_checksums
538
? buf_calc_page_old_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
540
ut_print_timestamp(stderr);
542
" InnoDB: Page checksum %lu, prior-to-4.0.14-form"
544
"InnoDB: stored checksum %lu, prior-to-4.0.14-form"
545
" stored checksum %lu\n"
546
"InnoDB: Page lsn %lu %lu, low 4 bytes of lsn"
548
"InnoDB: Page number (if stored to page already) %lu,\n"
549
"InnoDB: space id (if created with >= MySQL-4.1.1"
550
" and stored already) %lu\n",
551
(ulong) checksum, (ulong) old_checksum,
552
(ulong) mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
553
(ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
554
- FIL_PAGE_END_LSN_OLD_CHKSUM),
555
(ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN),
556
(ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
557
(ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
558
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
559
(ulong) mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
560
(ulong) mach_read_from_4(read_buf
561
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
563
#ifndef UNIV_HOTBACKUP
564
if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE)
565
== TRX_UNDO_INSERT) {
567
"InnoDB: Page may be an insert undo log page\n");
568
} else if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR
569
+ TRX_UNDO_PAGE_TYPE)
570
== TRX_UNDO_UPDATE) {
572
"InnoDB: Page may be an update undo log page\n");
574
#endif /* !UNIV_HOTBACKUP */
576
switch (fil_page_get_type(read_buf)) {
579
"InnoDB: Page may be an index page where"
580
" index id is %lu %lu\n",
581
(ulong) ut_dulint_get_high(
582
btr_page_get_index_id(read_buf)),
583
(ulong) ut_dulint_get_low(
584
btr_page_get_index_id(read_buf)));
585
#ifndef UNIV_HOTBACKUP
586
index = dict_index_find_on_id_low(
587
btr_page_get_index_id(read_buf));
589
fputs("InnoDB: (", stderr);
590
dict_index_name_print(stderr, NULL, index);
591
fputs(")\n", stderr);
593
#endif /* !UNIV_HOTBACKUP */
596
fputs("InnoDB: Page may be an 'inode' page\n", stderr);
598
case FIL_PAGE_IBUF_FREE_LIST:
599
fputs("InnoDB: Page may be an insert buffer free list page\n",
602
case FIL_PAGE_TYPE_ALLOCATED:
603
fputs("InnoDB: Page may be a freshly allocated page\n",
606
case FIL_PAGE_IBUF_BITMAP:
607
fputs("InnoDB: Page may be an insert buffer bitmap page\n",
610
case FIL_PAGE_TYPE_SYS:
611
fputs("InnoDB: Page may be a system page\n",
614
case FIL_PAGE_TYPE_TRX_SYS:
615
fputs("InnoDB: Page may be a transaction system page\n",
618
case FIL_PAGE_TYPE_FSP_HDR:
619
fputs("InnoDB: Page may be a file space header page\n",
622
case FIL_PAGE_TYPE_XDES:
623
fputs("InnoDB: Page may be an extent descriptor page\n",
626
case FIL_PAGE_TYPE_BLOB:
627
fputs("InnoDB: Page may be a BLOB page\n",
630
case FIL_PAGE_TYPE_ZBLOB:
631
case FIL_PAGE_TYPE_ZBLOB2:
632
fputs("InnoDB: Page may be a compressed BLOB page\n",
638
#ifndef UNIV_HOTBACKUP
639
/********************************************************************//**
640
Initializes a buffer control block when the buf_pool is created. */
645
buf_block_t* block, /*!< in: pointer to control block */
646
byte* frame) /*!< in: pointer to buffer frame */
648
UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE, block);
650
block->frame = frame;
652
block->page.state = BUF_BLOCK_NOT_USED;
653
block->page.buf_fix_count = 0;
654
block->page.io_fix = BUF_IO_NONE;
656
block->modify_clock = 0;
658
#ifdef UNIV_DEBUG_FILE_ACCESSES
659
block->page.file_page_was_freed = FALSE;
660
#endif /* UNIV_DEBUG_FILE_ACCESSES */
662
block->check_index_page_at_flush = FALSE;
666
block->page.in_page_hash = FALSE;
667
block->page.in_zip_hash = FALSE;
668
block->page.in_flush_list = FALSE;
669
block->page.in_free_list = FALSE;
670
block->page.in_LRU_list = FALSE;
671
block->in_unzip_LRU_list = FALSE;
672
#endif /* UNIV_DEBUG */
673
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
674
block->n_pointers = 0;
675
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
676
page_zip_des_init(&block->page.zip);
678
mutex_create(&block->mutex, SYNC_BUF_BLOCK);
680
rw_lock_create(&block->lock, SYNC_LEVEL_VARYING);
681
ut_ad(rw_lock_validate(&(block->lock)));
683
#ifdef UNIV_SYNC_DEBUG
684
rw_lock_create(&block->debug_latch, SYNC_NO_ORDER_CHECK);
685
#endif /* UNIV_SYNC_DEBUG */
688
/********************************************************************//**
689
Allocates a chunk of buffer frames.
690
@return chunk, or NULL on failure */
695
buf_chunk_t* chunk, /*!< out: chunk of buffers */
696
ulint mem_size) /*!< in: requested size in bytes */
702
/* Round down to a multiple of page size,
703
although it already should be. */
704
mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE);
705
/* Reserve space for the block descriptors. */
706
mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block)
707
+ (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
709
chunk->mem_size = mem_size;
710
chunk->mem = os_mem_alloc_large(&chunk->mem_size);
712
if (UNIV_UNLIKELY(chunk->mem == NULL)) {
717
/* Allocate the block descriptors from
718
the start of the memory block. */
719
chunk->blocks = chunk->mem;
721
/* Align a pointer to the first frame. Note that when
722
os_large_page_size is smaller than UNIV_PAGE_SIZE,
723
we may allocate one fewer block than requested. When
724
it is bigger, we may allocate more blocks than requested. */
726
frame = ut_align(chunk->mem, UNIV_PAGE_SIZE);
727
chunk->size = chunk->mem_size / UNIV_PAGE_SIZE
728
- (frame != chunk->mem);
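
	/* A worked example of the arithmetic above, with illustrative
	numbers only (assume UNIV_PAGE_SIZE = 16384 and, purely for this
	example, sizeof(buf_block_t) = 600; the real size depends on the
	build):

		requested mem_size	= 128 * 16384	= 2097152
		descriptor space	= ut_2pow_round(128 * 600 + 16383,
							16384)	=   81920
		chunk->mem_size		= 2097152 + 81920 = 2179072

	The block descriptors are placed at chunk->mem, the first frame
	starts at the next 16384-byte boundary above chunk->mem, and the
	loop below advances the first frame past the descriptor array,
	reducing chunk->size accordingly. */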
730
/* Subtract the space needed for block descriptors. */
732
ulint size = chunk->size;
734
while (frame < (byte*) (chunk->blocks + size)) {
735
frame += UNIV_PAGE_SIZE;
742
/* Init block structs and assign frames for them. Then we
743
assign the frames to the first blocks (we already mapped the
746
block = chunk->blocks;
748
for (i = chunk->size; i--; ) {
750
buf_block_init(block, frame);
753
/* Wipe contents of frame to eliminate a Purify warning */
754
memset(block->frame, '\0', UNIV_PAGE_SIZE);
756
/* Add the block to the free list */
757
UT_LIST_ADD_LAST(list, buf_pool->free, (&block->page));
758
ut_d(block->page.in_free_list = TRUE);
761
frame += UNIV_PAGE_SIZE;
768
/*********************************************************************//**
769
Finds a block in the given buffer chunk that points to a
770
given compressed page.
771
@return buffer block pointing to the compressed page, or NULL */
774
buf_chunk_contains_zip(
775
/*===================*/
776
buf_chunk_t* chunk, /*!< in: chunk being checked */
777
const void* data) /*!< in: pointer to compressed page */
783
ut_ad(buf_pool_mutex_own());
785
block = chunk->blocks;
787
for (i = chunk->size; i--; block++) {
788
if (block->page.zip.data == data) {
797
/*********************************************************************//**
798
Finds a block in the buffer pool that points to a
799
given compressed page.
800
@return buffer block pointing to the compressed page, or NULL */
803
buf_pool_contains_zip(
804
/*==================*/
805
const void* data) /*!< in: pointer to compressed page */
808
buf_chunk_t* chunk = buf_pool->chunks;
810
for (n = buf_pool->n_chunks; n--; chunk++) {
811
buf_block_t* block = buf_chunk_contains_zip(chunk, data);
820
#endif /* UNIV_DEBUG */
822
/*********************************************************************//**
823
Checks that all file pages in the buffer chunk are in a replaceable state.
824
@return address of a non-free block, or NULL if all freed */
829
buf_chunk_t* chunk) /*!< in: chunk being checked */
835
ut_ad(buf_pool_mutex_own());
837
block = chunk->blocks;
839
for (i = chunk->size; i--; block++) {
840
mutex_enter(&block->mutex);
842
if (buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE
843
&& !buf_flush_ready_for_replace(&block->page)) {
845
mutex_exit(&block->mutex);
849
mutex_exit(&block->mutex);
855
/*********************************************************************//**
856
Checks that all blocks in the buffer chunk are in BUF_BLOCK_NOT_USED state.
857
@return TRUE if all freed */
862
const buf_chunk_t* chunk) /*!< in: chunk being checked */
864
const buf_block_t* block;
868
ut_ad(buf_pool_mutex_own());
870
block = chunk->blocks;
872
for (i = chunk->size; i--; block++) {
874
if (buf_block_get_state(block) != BUF_BLOCK_NOT_USED) {
883
/********************************************************************//**
884
Frees a chunk of buffer frames. */
889
buf_chunk_t* chunk) /*!< out: chunk of buffers */
892
const buf_block_t* block_end;
894
ut_ad(buf_pool_mutex_own());
896
block_end = chunk->blocks + chunk->size;
898
for (block = chunk->blocks; block < block_end; block++) {
899
ut_a(buf_block_get_state(block) == BUF_BLOCK_NOT_USED);
900
ut_a(!block->page.zip.data);
902
ut_ad(!block->page.in_LRU_list);
903
ut_ad(!block->in_unzip_LRU_list);
904
ut_ad(!block->page.in_flush_list);
905
/* Remove the block from the free list. */
906
ut_ad(block->page.in_free_list);
907
UT_LIST_REMOVE(list, buf_pool->free, (&block->page));
909
/* Free the latches. */
910
mutex_free(&block->mutex);
911
rw_lock_free(&block->lock);
912
#ifdef UNIV_SYNC_DEBUG
913
rw_lock_free(&block->debug_latch);
914
#endif /* UNIV_SYNC_DEBUG */
915
UNIV_MEM_UNDESC(block);
918
os_mem_free_large(chunk->mem, chunk->mem_size);
921
/********************************************************************//**
922
Creates the buffer pool.
923
@return own: buf_pool object, NULL if not enough memory or error */
932
buf_pool = mem_zalloc(sizeof(buf_pool_t));
934
/* 1. Initialize general fields
935
------------------------------- */
936
mutex_create(&buf_pool_mutex, SYNC_BUF_POOL);
937
mutex_create(&buf_pool_zip_mutex, SYNC_BUF_BLOCK);
939
buf_pool_mutex_enter();
941
buf_pool->n_chunks = 1;
942
buf_pool->chunks = chunk = mem_alloc(sizeof *chunk);
944
UT_LIST_INIT(buf_pool->free);
946
if (!buf_chunk_init(chunk, srv_buf_pool_size)) {
953
srv_buf_pool_old_size = srv_buf_pool_size;
954
buf_pool->curr_size = chunk->size;
955
srv_buf_pool_curr_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
957
buf_pool->page_hash = hash_create(2 * buf_pool->curr_size);
958
buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
960
buf_pool->last_printout_time = time(NULL);
962
/* 2. Initialize flushing fields
963
-------------------------------- */
965
for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
966
buf_pool->no_flush[i] = os_event_create(NULL);
969
buf_pool->ulint_clock = 1;
971
/* 3. Initialize LRU fields
972
--------------------------- */
973
/* All fields are initialized by mem_zalloc(). */
975
buf_pool_mutex_exit();
977
btr_search_sys_create(buf_pool->curr_size
978
* UNIV_PAGE_SIZE / sizeof(void*) / 64);
980
/* 4. Initialize the buddy allocator fields */
981
/* All fields are initialized by mem_zalloc(). */
986
/********************************************************************//**
987
Frees the buffer pool at shutdown. This must not be invoked before
988
freeing all mutexes. */
997
chunks = buf_pool->chunks;
998
chunk = chunks + buf_pool->n_chunks;
1000
while (--chunk >= chunks) {
1001
/* Bypass the checks of buf_chunk_free(), since they
1002
would fail at shutdown. */
1003
os_mem_free_large(chunk->mem, chunk->mem_size);
1006
buf_pool->n_chunks = 0;
1009
/********************************************************************//**
1010
Drops the adaptive hash index. To prevent a livelock, this function
1011
is only to be called while holding btr_search_latch and while
1012
btr_search_enabled == FALSE. */
1015
buf_pool_drop_hash_index(void)
1016
/*==========================*/
1018
ibool released_search_latch;
1020
#ifdef UNIV_SYNC_DEBUG
1021
ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
1022
#endif /* UNIV_SYNC_DEBUG */
1023
ut_ad(!btr_search_enabled);
1026
buf_chunk_t* chunks = buf_pool->chunks;
1027
buf_chunk_t* chunk = chunks + buf_pool->n_chunks;
1029
released_search_latch = FALSE;
1031
while (--chunk >= chunks) {
1032
buf_block_t* block = chunk->blocks;
1033
ulint i = chunk->size;
1035
for (; i--; block++) {
1036
/* block->is_hashed cannot be modified
1037
when we have an x-latch on btr_search_latch;
1038
see the comment in buf0buf.h */
1040
if (!block->is_hashed) {
1044
/* To follow the latching order, we
1045
have to release btr_search_latch
1046
before acquiring block->latch. */
1047
rw_lock_x_unlock(&btr_search_latch);
1048
/* When we release the search latch,
1049
we must rescan all blocks, because
1050
some may become hashed again. */
1051
released_search_latch = TRUE;
1053
rw_lock_x_lock(&block->lock);
1055
/* This should be guaranteed by the
1056
callers, which will be holding
1057
btr_search_enabled_mutex. */
1058
ut_ad(!btr_search_enabled);
1060
/* Because we did not buffer-fix the
1061
block by calling buf_block_get_gen(),
1062
it is possible that the block has been
1063
allocated for some other use after
1064
btr_search_latch was released above.
1065
We do not care which file page the
1066
block is mapped to. All we want to do
1067
is to drop any hash entries referring
1070
/* It is possible that
1071
block->page.state != BUF_FILE_PAGE.
1072
Even that does not matter, because
1073
btr_search_drop_page_hash_index() will
1074
check block->is_hashed before doing
1075
anything. block->is_hashed can only
1076
be set on uncompressed file pages. */
1078
btr_search_drop_page_hash_index(block);
1080
rw_lock_x_unlock(&block->lock);
1082
rw_lock_x_lock(&btr_search_latch);
1084
ut_ad(!btr_search_enabled);
1087
} while (released_search_latch);
1090
/********************************************************************//**
1091
Relocate a buffer control block. Relocates the block on the LRU list
1092
and in buf_pool->page_hash. Does not relocate bpage->list.
1093
The caller must take care of relocating bpage->list. */
1098
buf_page_t* bpage, /*!< in/out: control block being relocated;
1099
buf_page_get_state(bpage) must be
1100
BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
1101
buf_page_t* dpage) /*!< in/out: destination control block */
1106
ut_ad(buf_pool_mutex_own());
1107
ut_ad(mutex_own(buf_page_get_mutex(bpage)));
1108
ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
1109
ut_a(bpage->buf_fix_count == 0);
1110
ut_ad(bpage->in_LRU_list);
1111
ut_ad(!bpage->in_zip_hash);
1112
ut_ad(bpage->in_page_hash);
1113
ut_ad(bpage == buf_page_hash_get(bpage->space, bpage->offset));
1115
switch (buf_page_get_state(bpage)) {
1116
case BUF_BLOCK_ZIP_FREE:
1117
case BUF_BLOCK_NOT_USED:
1118
case BUF_BLOCK_READY_FOR_USE:
1119
case BUF_BLOCK_FILE_PAGE:
1120
case BUF_BLOCK_MEMORY:
1121
case BUF_BLOCK_REMOVE_HASH:
1123
case BUF_BLOCK_ZIP_DIRTY:
1124
case BUF_BLOCK_ZIP_PAGE:
1127
#endif /* UNIV_DEBUG */
1129
memcpy(dpage, bpage, sizeof *dpage);
1131
ut_d(bpage->in_LRU_list = FALSE);
1132
ut_d(bpage->in_page_hash = FALSE);
1134
/* relocate buf_pool->LRU */
1135
b = UT_LIST_GET_PREV(LRU, bpage);
1136
UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage);
1139
UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, b, dpage);
1141
UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, dpage);
1144
if (UNIV_UNLIKELY(buf_pool->LRU_old == bpage)) {
1145
buf_pool->LRU_old = dpage;
1146
#ifdef UNIV_LRU_DEBUG
1147
/* buf_pool->LRU_old must be the first item in the LRU list
1148
whose "old" flag is set. */
1149
ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
1150
|| !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
1151
ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
1152
|| UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
1153
#endif /* UNIV_LRU_DEBUG */
1156
ut_d(UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU,
1157
ut_ad(ut_list_node_313->in_LRU_list)));
1159
/* relocate buf_pool->page_hash */
1160
fold = buf_page_address_fold(bpage->space, bpage->offset);
1162
HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
1163
HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage);
1165
UNIV_MEM_INVALID(bpage, sizeof *bpage);
1168
/********************************************************************//**
1169
Shrinks the buffer pool. */
1174
ulint chunk_size) /*!< in: number of pages to remove */
1176
buf_chunk_t* chunks;
1179
ulint max_free_size;
1180
buf_chunk_t* max_chunk;
1181
buf_chunk_t* max_free_chunk;
1183
ut_ad(!buf_pool_mutex_own());
1186
btr_search_disable(); /* Empty the adaptive hash index again */
1187
buf_pool_mutex_enter();
1190
if (buf_pool->n_chunks <= 1) {
1192
/* Cannot shrink if there is only one chunk */
1196
/* Search for the largest free chunk
1197
not larger than the size difference */
1198
chunks = buf_pool->chunks;
1199
chunk = chunks + buf_pool->n_chunks;
1200
max_size = max_free_size = 0;
1201
max_chunk = max_free_chunk = NULL;
1203
while (--chunk >= chunks) {
1204
if (chunk->size <= chunk_size
1205
&& chunk->size > max_free_size) {
1206
if (chunk->size > max_size) {
1207
max_size = chunk->size;
1211
if (buf_chunk_all_free(chunk)) {
1212
max_free_size = chunk->size;
1213
max_free_chunk = chunk;
1218
if (!max_free_size) {
1225
/* Cannot shrink: try again later
1226
(do not assign srv_buf_pool_old_size) */
1232
block = max_chunk->blocks;
1233
bend = block + max_chunk->size;
1235
/* Move the blocks of chunk to the end of the
1236
LRU list and try to flush them. */
1237
for (; block < bend; block++) {
1238
switch (buf_block_get_state(block)) {
1239
case BUF_BLOCK_NOT_USED:
1241
case BUF_BLOCK_FILE_PAGE:
1248
mutex_enter(&block->mutex);
1249
/* The following calls will temporarily
1250
release block->mutex and buf_pool_mutex.
1251
Therefore, we have to always retry,
1252
even if !dirty && !nonfree. */
1254
if (!buf_flush_ready_for_replace(&block->page)) {
1256
buf_LRU_make_block_old(&block->page);
1258
} else if (buf_LRU_free_block(&block->page, TRUE, NULL)
1263
mutex_exit(&block->mutex);
1266
buf_pool_mutex_exit();
1268
/* Request for a flush of the chunk if it helps.
1269
Do not flush if there are non-free blocks, since
1270
flushing will not make the chunk freeable. */
1272
/* Avoid busy-waiting. */
1273
os_thread_sleep(100000);
1275
&& buf_flush_batch(BUF_FLUSH_LRU, dirty, 0)
1276
== ULINT_UNDEFINED) {
1278
buf_flush_wait_batch_end(BUF_FLUSH_LRU);
1284
max_size = max_free_size;
1285
max_chunk = max_free_chunk;
1287
srv_buf_pool_old_size = srv_buf_pool_size;
1289
/* Rewrite buf_pool->chunks. Copy everything but max_chunk. */
1290
chunks = mem_alloc((buf_pool->n_chunks - 1) * sizeof *chunks);
1291
memcpy(chunks, buf_pool->chunks,
1292
(max_chunk - buf_pool->chunks) * sizeof *chunks);
1293
memcpy(chunks + (max_chunk - buf_pool->chunks),
1295
buf_pool->chunks + buf_pool->n_chunks
1297
ut_a(buf_pool->curr_size > max_chunk->size);
1298
buf_pool->curr_size -= max_chunk->size;
1299
srv_buf_pool_curr_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
1300
chunk_size -= max_chunk->size;
1301
buf_chunk_free(max_chunk);
1302
mem_free(buf_pool->chunks);
1303
buf_pool->chunks = chunks;
1304
buf_pool->n_chunks--;
1306
/* Allow a slack of one megabyte. */
1307
if (chunk_size > 1048576 / UNIV_PAGE_SIZE) {
1313
srv_buf_pool_old_size = srv_buf_pool_size;
1315
buf_pool_mutex_exit();
1316
btr_search_enable();
1319
/********************************************************************//**
1320
Rebuild buf_pool->page_hash. */
1323
buf_pool_page_hash_rebuild(void)
1324
/*============================*/
1329
hash_table_t* page_hash;
1330
hash_table_t* zip_hash;
1333
buf_pool_mutex_enter();
1335
/* Free, create, and populate the hash table. */
1336
hash_table_free(buf_pool->page_hash);
1337
buf_pool->page_hash = page_hash = hash_create(2 * buf_pool->curr_size);
1338
zip_hash = hash_create(2 * buf_pool->curr_size);
1340
HASH_MIGRATE(buf_pool->zip_hash, zip_hash, buf_page_t, hash,
1341
BUF_POOL_ZIP_FOLD_BPAGE);
1343
hash_table_free(buf_pool->zip_hash);
1344
buf_pool->zip_hash = zip_hash;
1346
/* Insert the uncompressed file pages to buf_pool->page_hash. */
1348
chunk = buf_pool->chunks;
1349
n_chunks = buf_pool->n_chunks;
1351
for (i = 0; i < n_chunks; i++, chunk++) {
1353
buf_block_t* block = chunk->blocks;
1355
for (j = 0; j < chunk->size; j++, block++) {
1356
if (buf_block_get_state(block)
1357
== BUF_BLOCK_FILE_PAGE) {
1358
ut_ad(!block->page.in_zip_hash);
1359
ut_ad(block->page.in_page_hash);
1361
HASH_INSERT(buf_page_t, hash, page_hash,
1362
buf_page_address_fold(
1364
block->page.offset),
1370
/* Insert the compressed-only pages to buf_pool->page_hash.
1371
All such blocks are either in buf_pool->zip_clean or
1372
in buf_pool->flush_list. */
1374
for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
1375
b = UT_LIST_GET_NEXT(list, b)) {
1376
ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
1377
ut_ad(!b->in_flush_list);
1378
ut_ad(b->in_LRU_list);
1379
ut_ad(b->in_page_hash);
1380
ut_ad(!b->in_zip_hash);
1382
HASH_INSERT(buf_page_t, hash, page_hash,
1383
buf_page_address_fold(b->space, b->offset), b);
1386
for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
1387
b = UT_LIST_GET_NEXT(list, b)) {
1388
ut_ad(b->in_flush_list);
1389
ut_ad(b->in_LRU_list);
1390
ut_ad(b->in_page_hash);
1391
ut_ad(!b->in_zip_hash);
1393
switch (buf_page_get_state(b)) {
1394
case BUF_BLOCK_ZIP_DIRTY:
1395
HASH_INSERT(buf_page_t, hash, page_hash,
1396
buf_page_address_fold(b->space,
1399
case BUF_BLOCK_FILE_PAGE:
1400
/* uncompressed page */
1402
case BUF_BLOCK_ZIP_FREE:
1403
case BUF_BLOCK_ZIP_PAGE:
1404
case BUF_BLOCK_NOT_USED:
1405
case BUF_BLOCK_READY_FOR_USE:
1406
case BUF_BLOCK_MEMORY:
1407
case BUF_BLOCK_REMOVE_HASH:
1413
buf_pool_mutex_exit();
1416
/********************************************************************//**
1417
Resizes the buffer pool. */
1420
buf_pool_resize(void)
1421
/*=================*/
1423
buf_pool_mutex_enter();
1425
if (srv_buf_pool_old_size == srv_buf_pool_size) {
1427
buf_pool_mutex_exit();
1431
if (srv_buf_pool_curr_size + 1048576 > srv_buf_pool_size) {
1433
buf_pool_mutex_exit();
1435
/* Disable adaptive hash indexes and empty the index
1436
in order to free up memory in the buffer pool chunks. */
1437
buf_pool_shrink((srv_buf_pool_curr_size - srv_buf_pool_size)
1439
} else if (srv_buf_pool_curr_size + 1048576 < srv_buf_pool_size) {
1441
/* Enlarge the buffer pool by at least one megabyte */
1444
= srv_buf_pool_size - srv_buf_pool_curr_size;
1445
buf_chunk_t* chunks;
1448
chunks = mem_alloc((buf_pool->n_chunks + 1) * sizeof *chunks);
1450
memcpy(chunks, buf_pool->chunks, buf_pool->n_chunks
1453
chunk = &chunks[buf_pool->n_chunks];
1455
if (!buf_chunk_init(chunk, mem_size)) {
1458
buf_pool->curr_size += chunk->size;
1459
srv_buf_pool_curr_size = buf_pool->curr_size
1461
mem_free(buf_pool->chunks);
1462
buf_pool->chunks = chunks;
1463
buf_pool->n_chunks++;
1466
srv_buf_pool_old_size = srv_buf_pool_size;
1467
buf_pool_mutex_exit();
1470
buf_pool_page_hash_rebuild();
1473
/********************************************************************//**
1474
Moves the block to the start of the LRU list if there is a danger
1475
that the block would drift out of the buffer pool. */
1478
buf_block_make_young(
1479
/*=================*/
1480
buf_page_t* bpage) /*!< in: block to make younger */
1482
ut_ad(!buf_pool_mutex_own());
1484
/* Note that we read freed_page_clock's without holding any mutex:
1485
this is allowed since the result is used only in heuristics */
1487
if (buf_page_peek_if_too_old(bpage)) {
1489
buf_pool_mutex_enter();
1490
/* There has been freeing activity in the LRU list:
1491
best to move to the head of the LRU list */
1493
buf_LRU_make_block_young(bpage);
1494
buf_pool_mutex_exit();
1498
/********************************************************************//**
1499
Moves a page to the start of the buffer pool LRU list. This high-level
function can be used to prevent an important page from slipping out of
1504
buf_page_make_young(
1505
/*================*/
1506
buf_page_t* bpage) /*!< in: buffer block of a file page */
1508
buf_pool_mutex_enter();
1510
ut_a(buf_page_in_file(bpage));
1512
buf_LRU_make_block_young(bpage);
1514
buf_pool_mutex_exit();
1517
/********************************************************************//**
1518
Resets the check_index_page_at_flush field of a page if found in the buffer
1522
buf_reset_check_index_page_at_flush(
1523
/*================================*/
1524
ulint space, /*!< in: space id */
1525
ulint offset) /*!< in: page number */
1529
buf_pool_mutex_enter();
1531
block = (buf_block_t*) buf_page_hash_get(space, offset);
1533
if (block && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE) {
1534
block->check_index_page_at_flush = FALSE;
1537
buf_pool_mutex_exit();
1540
/********************************************************************//**
1541
Returns the current state of is_hashed of a page. FALSE if the page is
1542
not in the pool. NOTE that this operation does not fix the page in the
1543
pool if it is found there.
1544
@return TRUE if page hash index is built in search system */
1547
buf_page_peek_if_search_hashed(
1548
/*===========================*/
1549
ulint space, /*!< in: space id */
1550
ulint offset) /*!< in: page number */
1555
buf_pool_mutex_enter();
1557
block = (buf_block_t*) buf_page_hash_get(space, offset);
1559
if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
1562
is_hashed = block->is_hashed;
1565
buf_pool_mutex_exit();
1570
#ifdef UNIV_DEBUG_FILE_ACCESSES
1571
/********************************************************************//**
1572
Sets file_page_was_freed TRUE if the page is found in the buffer pool.
1573
This function should be called when we free a file page and want the
1574
debug version to check that it is not accessed any more unless
1576
@return control block if found in page hash table, otherwise NULL */
1579
buf_page_set_file_page_was_freed(
1580
/*=============================*/
1581
ulint space, /*!< in: space id */
1582
ulint offset) /*!< in: page number */
1586
buf_pool_mutex_enter();
1588
bpage = buf_page_hash_get(space, offset);
1591
bpage->file_page_was_freed = TRUE;
1594
buf_pool_mutex_exit();
1599
/********************************************************************//**
1600
Sets file_page_was_freed FALSE if the page is found in the buffer pool.
1601
This function should be called when we free a file page and want the
1602
debug version to check that it is not accessed any more unless
1604
@return control block if found in page hash table, otherwise NULL */
1607
buf_page_reset_file_page_was_freed(
1608
/*===============================*/
1609
ulint space, /*!< in: space id */
1610
ulint offset) /*!< in: page number */
1614
buf_pool_mutex_enter();
1616
bpage = buf_page_hash_get(space, offset);
1619
bpage->file_page_was_freed = FALSE;
1622
buf_pool_mutex_exit();
1626
#endif /* UNIV_DEBUG_FILE_ACCESSES */
1628
/********************************************************************//**
1629
Get read access to a compressed page (usually of type
1630
FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
1631
The page must be released with buf_page_release_zip().
1632
NOTE: the page is not protected by any latch. Mutual exclusion has to
1633
be implemented at a higher level. In other words, all possible
1634
accesses to a given page through this function must be protected by
1635
the same set of mutexes or latches.
1636
@return pointer to the block */
1641
ulint space, /*!< in: space id */
1642
ulint zip_size,/*!< in: compressed page size */
1643
ulint offset) /*!< in: page number */
1646
mutex_t* block_mutex;
1649
#ifndef UNIV_LOG_DEBUG
1650
ut_ad(!ibuf_inside());
1652
buf_pool->n_page_gets++;
1655
buf_pool_mutex_enter();
1657
bpage = buf_page_hash_get(space, offset);
1662
/* Page not in buf_pool: needs to be read from file */
1664
buf_pool_mutex_exit();
1666
buf_read_page(space, zip_size, offset);
1668
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
1669
ut_a(++buf_dbg_counter % 37 || buf_validate());
1670
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
1673
if (UNIV_UNLIKELY(!bpage->zip.data)) {
1674
/* There is no compressed page. */
1676
buf_pool_mutex_exit();
1680
switch (buf_page_get_state(bpage)) {
1681
case BUF_BLOCK_NOT_USED:
1682
case BUF_BLOCK_READY_FOR_USE:
1683
case BUF_BLOCK_MEMORY:
1684
case BUF_BLOCK_REMOVE_HASH:
1685
case BUF_BLOCK_ZIP_FREE:
1687
case BUF_BLOCK_ZIP_PAGE:
1688
case BUF_BLOCK_ZIP_DIRTY:
1689
block_mutex = &buf_pool_zip_mutex;
1690
mutex_enter(block_mutex);
1691
bpage->buf_fix_count++;
1693
case BUF_BLOCK_FILE_PAGE:
1694
block_mutex = &((buf_block_t*) bpage)->mutex;
1695
mutex_enter(block_mutex);
1697
/* Discard the uncompressed page frame if possible. */
1698
if (buf_LRU_free_block(bpage, FALSE, NULL)
1701
mutex_exit(block_mutex);
1705
buf_block_buf_fix_inc((buf_block_t*) bpage,
1706
__FILE__, __LINE__);
1714
must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ;
1716
buf_pool_mutex_exit();
1718
buf_page_set_accessed(bpage, TRUE);
1720
mutex_exit(block_mutex);
1722
buf_block_make_young(bpage);
1724
#ifdef UNIV_DEBUG_FILE_ACCESSES
1725
ut_a(!bpage->file_page_was_freed);
1728
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
1729
ut_a(++buf_dbg_counter % 5771 || buf_validate());
1730
ut_a(bpage->buf_fix_count > 0);
1731
ut_a(buf_page_in_file(bpage));
1732
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
1735
/* Let us wait until the read operation
1739
enum buf_io_fix io_fix;
1741
mutex_enter(block_mutex);
1742
io_fix = buf_page_get_io_fix(bpage);
1743
mutex_exit(block_mutex);
1745
if (io_fix == BUF_IO_READ) {
1747
os_thread_sleep(WAIT_FOR_READ);
1754
#ifdef UNIV_IBUF_COUNT_DEBUG
1755
ut_a(ibuf_count_get(buf_page_get_space(bpage),
1756
buf_page_get_page_no(bpage)) == 0);
1761
/********************************************************************//**
1762
Initialize some fields of a control block. */
1767
buf_block_t* block) /*!< in: block to init */
1769
block->check_index_page_at_flush = FALSE;
1770
block->index = NULL;
1772
block->n_hash_helps = 0;
1773
block->is_hashed = FALSE;
1774
block->n_fields = 1;
1776
block->left_side = TRUE;
1778
#endif /* !UNIV_HOTBACKUP */
1780
/********************************************************************//**
1782
@return TRUE if successful */
1787
buf_block_t* block, /*!< in/out: block */
1788
ibool check) /*!< in: TRUE=verify the page checksum */
1790
const byte* frame = block->page.zip.data;
1792
ut_ad(buf_block_get_zip_size(block));
1793
ut_a(buf_block_get_space(block) != 0);
1795
if (UNIV_LIKELY(check)) {
1796
ulint stamp_checksum = mach_read_from_4(
1797
frame + FIL_PAGE_SPACE_OR_CHKSUM);
1798
ulint calc_checksum = page_zip_calc_checksum(
1799
frame, page_zip_get_size(&block->page.zip));
1801
if (UNIV_UNLIKELY(stamp_checksum != calc_checksum)) {
1802
ut_print_timestamp(stderr);
1804
" InnoDB: compressed page checksum mismatch"
1805
" (space %u page %u): %lu != %lu\n",
1806
block->page.space, block->page.offset,
1807
stamp_checksum, calc_checksum);
1812
switch (fil_page_get_type(frame)) {
1813
case FIL_PAGE_INDEX:
1814
if (page_zip_decompress(&block->page.zip,
1820
"InnoDB: unable to decompress space %lu page %lu\n",
1821
(ulong) block->page.space,
1822
(ulong) block->page.offset);
1825
case FIL_PAGE_TYPE_ALLOCATED:
1826
case FIL_PAGE_INODE:
1827
case FIL_PAGE_IBUF_BITMAP:
1828
case FIL_PAGE_TYPE_FSP_HDR:
1829
case FIL_PAGE_TYPE_XDES:
1830
case FIL_PAGE_TYPE_ZBLOB:
1831
case FIL_PAGE_TYPE_ZBLOB2:
1832
/* Copy to uncompressed storage. */
1833
memcpy(block->frame, frame,
1834
buf_block_get_zip_size(block));
1838
ut_print_timestamp(stderr);
1840
" InnoDB: unknown compressed page"
1842
fil_page_get_type(frame));
1846
#ifndef UNIV_HOTBACKUP
1847
/*******************************************************************//**
1848
Gets the block to whose frame the pointer is pointing to.
1849
@return pointer to block, never NULL */
1854
const byte* ptr) /*!< in: pointer to a frame */
1859
/* TODO: protect buf_pool->chunks with a mutex (it will
1860
currently remain constant after buf_pool_init()) */
1861
for (chunk = buf_pool->chunks, i = buf_pool->n_chunks; i--; chunk++) {
1862
lint offs = ptr - chunk->blocks->frame;
1864
if (UNIV_UNLIKELY(offs < 0)) {
1869
offs >>= UNIV_PAGE_SIZE_SHIFT;
1871
if (UNIV_LIKELY((ulint) offs < chunk->size)) {
1872
buf_block_t* block = &chunk->blocks[offs];
1874
/* The function buf_chunk_init() invokes
1875
buf_block_init() so that block[n].frame ==
1876
block->frame + n * UNIV_PAGE_SIZE. Check it. */
1877
ut_ad(block->frame == page_align(ptr));
1879
/* A thread that updates these fields must
1880
hold buf_pool_mutex and block->mutex. Acquire
1882
mutex_enter(&block->mutex);
1884
switch (buf_block_get_state(block)) {
1885
case BUF_BLOCK_ZIP_FREE:
1886
case BUF_BLOCK_ZIP_PAGE:
1887
case BUF_BLOCK_ZIP_DIRTY:
1888
/* These types should only be used in
1889
the compressed buffer pool, whose
1890
memory is allocated from
1891
buf_pool->chunks, in UNIV_PAGE_SIZE
1892
blocks flagged as BUF_BLOCK_MEMORY. */
1895
case BUF_BLOCK_NOT_USED:
1896
case BUF_BLOCK_READY_FOR_USE:
1897
case BUF_BLOCK_MEMORY:
1898
/* Some data structures contain
1899
"guess" pointers to file pages. The
1900
file pages may have been freed and
1901
reused. Do not complain. */
1903
case BUF_BLOCK_REMOVE_HASH:
1904
/* buf_LRU_block_remove_hashed_page()
1905
will overwrite the FIL_PAGE_OFFSET and
1906
FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID with
1907
0xff and set the state to
1908
BUF_BLOCK_REMOVE_HASH. */
1909
ut_ad(page_get_space_id(page_align(ptr))
1911
ut_ad(page_get_page_no(page_align(ptr))
1914
case BUF_BLOCK_FILE_PAGE:
1915
ut_ad(block->page.space
1916
== page_get_space_id(page_align(ptr)));
1917
ut_ad(block->page.offset
1918
== page_get_page_no(page_align(ptr)));
1922
mutex_exit(&block->mutex);
1923
#endif /* UNIV_DEBUG */
1929
/* The block should always be found. */
1934
/********************************************************************//**
1935
Find out if a pointer belongs to a buf_block_t. It can be a pointer to
1936
the buf_block_t itself or a member of it
1937
@return TRUE if ptr belongs to a buf_block_t struct */
1940
buf_pointer_is_block_field(
1941
/*=======================*/
1942
const void* ptr) /*!< in: pointer not
1945
const buf_chunk_t* chunk = buf_pool->chunks;
1946
const buf_chunk_t* const echunk = chunk + buf_pool->n_chunks;
1948
/* TODO: protect buf_pool->chunks with a mutex (it will
1949
currently remain constant after buf_pool_init()) */
1950
while (chunk < echunk) {
1951
if (ptr >= (void *)chunk->blocks
1952
&& ptr < (void *)(chunk->blocks + chunk->size)) {
1963
/********************************************************************//**
1964
Find out if a buffer block was created by buf_chunk_init().
1965
@return TRUE if "block" has been added to buf_pool->free by buf_chunk_init() */
1968
buf_block_is_uncompressed(
1969
/*======================*/
1970
const buf_block_t* block) /*!< in: pointer to block,
1973
ut_ad(buf_pool_mutex_own());
1975
if (UNIV_UNLIKELY((((ulint) block) % sizeof *block) != 0)) {
1976
/* The pointer should be aligned. */
1980
return(buf_pointer_is_block_field((void *)block));
1983
/********************************************************************//**
1984
This is the general function used to get access to a database page.
1985
@return pointer to the block or NULL */
1990
ulint space, /*!< in: space id */
1991
ulint zip_size,/*!< in: compressed page size in bytes
1992
or 0 for uncompressed pages */
1993
ulint offset, /*!< in: page number */
1994
ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
1995
buf_block_t* guess, /*!< in: guessed block or NULL */
1996
ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
1998
const char* file, /*!< in: file name */
1999
ulint line, /*!< in: line where called */
2000
mtr_t* mtr) /*!< in: mini-transaction */
2008
ut_ad((rw_latch == RW_S_LATCH)
2009
|| (rw_latch == RW_X_LATCH)
2010
|| (rw_latch == RW_NO_LATCH));
2011
ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH));
2012
ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL)
2013
|| (mode == BUF_GET_NO_LATCH));
2014
ut_ad(zip_size == fil_space_get_zip_size(space));
2015
ut_ad(ut_is_2pow(zip_size));
2016
#ifndef UNIV_LOG_DEBUG
2017
ut_ad(!ibuf_inside() || ibuf_page(space, zip_size, offset, NULL));
2019
buf_pool->n_page_gets++;
2022
buf_pool_mutex_enter();
2025
/* If the guess is a compressed page descriptor that
2026
has been allocated by buf_buddy_alloc(), it may have
2027
been invalidated by buf_buddy_relocate(). In that
2028
case, block could point to something that happens to
2029
contain the expected bits in block->page. Similarly,
2030
the guess may be pointing to a buffer pool chunk that
2031
has been released when resizing the buffer pool. */
2033
if (!buf_block_is_uncompressed(block)
2034
|| offset != block->page.offset
2035
|| space != block->page.space
2036
|| buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
2038
block = guess = NULL;
2040
ut_ad(!block->page.in_zip_hash);
2041
ut_ad(block->page.in_page_hash);
2045
if (block == NULL) {
2046
block = (buf_block_t*) buf_page_hash_get(space, offset);
2050
if (block == NULL) {
2051
/* Page not in buf_pool: needs to be read from file */
2053
buf_pool_mutex_exit();
2055
if (mode == BUF_GET_IF_IN_POOL) {
2060
buf_read_page(space, zip_size, offset);
2062
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2063
ut_a(++buf_dbg_counter % 37 || buf_validate());
2064
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
2068
ut_ad(page_zip_get_size(&block->page.zip) == zip_size);
2070
must_read = buf_block_get_io_fix(block) == BUF_IO_READ;
2072
if (must_read && mode == BUF_GET_IF_IN_POOL) {
2073
/* The page is only being read to buffer */
2074
buf_pool_mutex_exit();
2079
switch (buf_block_get_state(block)) {
2083
case BUF_BLOCK_FILE_PAGE:
2086
case BUF_BLOCK_ZIP_PAGE:
2087
case BUF_BLOCK_ZIP_DIRTY:
2088
bpage = &block->page;
2089
/* Protect bpage->buf_fix_count. */
2090
mutex_enter(&buf_pool_zip_mutex);
2092
if (bpage->buf_fix_count
2093
|| buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
2094
/* This condition often occurs when the buffer
2095
is not buffer-fixed, but I/O-fixed by
2096
buf_page_init_for_read(). */
2097
mutex_exit(&buf_pool_zip_mutex);
2099
/* The block is buffer-fixed or I/O-fixed.
2101
buf_pool_mutex_exit();
2102
os_thread_sleep(WAIT_FOR_READ);
2107
/* Allocate an uncompressed page. */
2108
buf_pool_mutex_exit();
2109
mutex_exit(&buf_pool_zip_mutex);
2111
block = buf_LRU_get_free_block(0);
2114
buf_pool_mutex_enter();
2115
mutex_enter(&block->mutex);
2118
buf_page_t* hash_bpage
2119
= buf_page_hash_get(space, offset);
2121
if (UNIV_UNLIKELY(bpage != hash_bpage)) {
2122
/* The buf_pool->page_hash was modified
2123
while buf_pool_mutex was released.
2124
Free the block that was allocated. */
2126
buf_LRU_block_free_non_file_page(block);
2127
mutex_exit(&block->mutex);
2129
block = (buf_block_t*) hash_bpage;
2135
(bpage->buf_fix_count
2136
|| buf_page_get_io_fix(bpage) != BUF_IO_NONE)) {
2138
/* The block was buffer-fixed or I/O-fixed
2139
while buf_pool_mutex was not held by this thread.
2140
Free the block that was allocated and try again.
2141
This should be extremely unlikely. */
2143
buf_LRU_block_free_non_file_page(block);
2144
mutex_exit(&block->mutex);
2146
goto wait_until_unfixed;
2149
/* Move the compressed page from bpage to block,
2150
and uncompress it. */
2152
mutex_enter(&buf_pool_zip_mutex);
2154
buf_relocate(bpage, &block->page);
2155
buf_block_init_low(block);
2156
block->lock_hash_val = lock_rec_hash(space, offset);
2158
UNIV_MEM_DESC(&block->page.zip.data,
2159
page_zip_get_size(&block->page.zip), block);
2161
if (buf_page_get_state(&block->page)
2162
== BUF_BLOCK_ZIP_PAGE) {
2163
UT_LIST_REMOVE(list, buf_pool->zip_clean,
2165
ut_ad(!block->page.in_flush_list);
2167
/* Relocate buf_pool->flush_list. */
2170
b = UT_LIST_GET_PREV(list, &block->page);
2171
ut_ad(block->page.in_flush_list);
2172
UT_LIST_REMOVE(list, buf_pool->flush_list,
2176
UT_LIST_INSERT_AFTER(
2177
list, buf_pool->flush_list, b,
2181
list, buf_pool->flush_list,
2186
/* Buffer-fix, I/O-fix, and X-latch the block
2187
for the duration of the decompression.
2188
Also add the block to the unzip_LRU list. */
2189
block->page.state = BUF_BLOCK_FILE_PAGE;
2191
/* Insert at the front of unzip_LRU list */
2192
buf_unzip_LRU_add_block(block, FALSE);
2194
block->page.buf_fix_count = 1;
2195
buf_block_set_io_fix(block, BUF_IO_READ);
2196
rw_lock_x_lock(&block->lock);
2197
mutex_exit(&block->mutex);
2198
mutex_exit(&buf_pool_zip_mutex);
2199
buf_pool->n_pend_unzip++;
2201
buf_buddy_free(bpage, sizeof *bpage);
2203
buf_pool_mutex_exit();
2205
/* Decompress the page and apply buffered operations
2206
while not holding buf_pool_mutex or block->mutex. */
2207
success = buf_zip_decompress(block, srv_use_checksums);
2209
if (UNIV_LIKELY(success)) {
2210
ibuf_merge_or_delete_for_page(block, space, offset,
2214
/* Unfix and unlatch the block. */
2215
buf_pool_mutex_enter();
2216
mutex_enter(&block->mutex);
2217
block->page.buf_fix_count--;
2218
buf_block_set_io_fix(block, BUF_IO_NONE);
2219
mutex_exit(&block->mutex);
2220
buf_pool->n_pend_unzip--;
2221
rw_lock_x_unlock(&block->lock);
2223
if (UNIV_UNLIKELY(!success)) {
2225
buf_pool_mutex_exit();
2231
case BUF_BLOCK_ZIP_FREE:
2232
case BUF_BLOCK_NOT_USED:
2233
case BUF_BLOCK_READY_FOR_USE:
2234
case BUF_BLOCK_MEMORY:
2235
case BUF_BLOCK_REMOVE_HASH:
2240
ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
2242
mutex_enter(&block->mutex);
2243
UNIV_MEM_ASSERT_RW(&block->page, sizeof block->page);
2245
buf_block_buf_fix_inc(block, file, line);
2246
buf_pool_mutex_exit();
2248
/* Check if this is the first access to the page */
2250
accessed = buf_page_is_accessed(&block->page);
2252
buf_page_set_accessed(&block->page, TRUE);
2254
mutex_exit(&block->mutex);
2256
buf_block_make_young(&block->page);
2258
#ifdef UNIV_DEBUG_FILE_ACCESSES
2259
ut_a(!block->page.file_page_was_freed);
2262
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2263
ut_a(++buf_dbg_counter % 5771 || buf_validate());
2264
ut_a(block->page.buf_fix_count > 0);
2265
ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
2266
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
2271
/* Let us wait until the read operation
2275
enum buf_io_fix io_fix;
2277
mutex_enter(&block->mutex);
2278
io_fix = buf_block_get_io_fix(block);
2279
mutex_exit(&block->mutex);
2281
if (io_fix == BUF_IO_READ) {
2283
os_thread_sleep(WAIT_FOR_READ);
2290
fix_type = MTR_MEMO_BUF_FIX;
2294
rw_lock_s_lock_func(&(block->lock), 0, file, line);
2296
fix_type = MTR_MEMO_PAGE_S_FIX;
2300
ut_ad(rw_latch == RW_X_LATCH);
2301
rw_lock_x_lock_func(&(block->lock), 0, file, line);
2303
fix_type = MTR_MEMO_PAGE_X_FIX;
2307
mtr_memo_push(mtr, block, fix_type);
2310
/* In the case of a first access, try to apply linear
2313
buf_read_ahead_linear(space, zip_size, offset);
2316
#ifdef UNIV_IBUF_COUNT_DEBUG
2317
ut_a(ibuf_count_get(buf_block_get_space(block),
2318
buf_block_get_page_no(block)) == 0);
2323
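/* Illustrative usage sketch (not compiled): a typical caller obtains a
page through the buf_page_get() wrapper macro declared in buf0buf.h, which
expands to buf_page_get_gen() with mode BUF_GET and the caller's
__FILE__/__LINE__.  The names space_id, zip_size and page_no below are
placeholders. */
#if 0
	mtr_t		mtr;
	buf_block_t*	block;

	mtr_start(&mtr);

	/* zip_size must equal fil_space_get_zip_size(space_id). */
	block = buf_page_get(space_id, zip_size, page_no, RW_S_LATCH, &mtr);
	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);

	/* ... read the page contents via buf_block_get_frame(block) ... */

	mtr_commit(&mtr);	/* releases the S-latch and the buffer fix */
#endif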
/********************************************************************//**
This is the general function used to get optimistic access to a database
page.
@return	TRUE if success */
UNIV_INTERN
ibool
buf_page_optimistic_get_func(
/*=========================*/
	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
	buf_block_t*	block,	/*!< in: guessed buffer block */
	ib_uint64_t	modify_clock,/*!< in: modify clock value if mode is
				..._GUESS_ON_CLOCK */
	const char*	file,	/*!< in: file name */
	ulint		line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mini-transaction */
{
	ut_ad(mtr && block);
	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));

	mutex_enter(&block->mutex);

	if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) {

		mutex_exit(&block->mutex);

		return(FALSE);
	}

	buf_block_buf_fix_inc(block, file, line);
	accessed = buf_page_is_accessed(&block->page);
	buf_page_set_accessed(&block->page, TRUE);

	mutex_exit(&block->mutex);

	buf_block_make_young(&block->page);

	/* Check if this is the first access to the page */

	ut_ad(!ibuf_inside()
	      || ibuf_page(buf_block_get_space(block),
			   buf_block_get_zip_size(block),
			   buf_block_get_page_no(block), NULL));

	if (rw_latch == RW_S_LATCH) {
		success = rw_lock_s_lock_nowait(&(block->lock),
						file, line);
		fix_type = MTR_MEMO_PAGE_S_FIX;
	} else {
		success = rw_lock_x_lock_func_nowait(&(block->lock),
						     file, line);
		fix_type = MTR_MEMO_PAGE_X_FIX;
	}

	if (UNIV_UNLIKELY(!success)) {
		mutex_enter(&block->mutex);
		buf_block_buf_fix_dec(block);
		mutex_exit(&block->mutex);

		return(FALSE);
	}

	if (UNIV_UNLIKELY(modify_clock != block->modify_clock)) {
		buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);

		if (rw_latch == RW_S_LATCH) {
			rw_lock_s_unlock(&(block->lock));
		} else {
			rw_lock_x_unlock(&(block->lock));
		}

		mutex_enter(&block->mutex);
		buf_block_buf_fix_dec(block);
		mutex_exit(&block->mutex);

		return(FALSE);
	}

	mtr_memo_push(mtr, block, fix_type);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(++buf_dbg_counter % 5771 || buf_validate());
	ut_a(block->page.buf_fix_count > 0);
	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#ifdef UNIV_DEBUG_FILE_ACCESSES
	ut_a(block->page.file_page_was_freed == FALSE);
#endif /* UNIV_DEBUG_FILE_ACCESSES */

	if (UNIV_UNLIKELY(!accessed)) {
		/* In the case of a first access, try to apply linear
		read-ahead */

		buf_read_ahead_linear(buf_block_get_space(block),
				      buf_block_get_zip_size(block),
				      buf_block_get_page_no(block));
	}

#ifdef UNIV_IBUF_COUNT_DEBUG
	ut_a(ibuf_count_get(buf_block_get_space(block),
			    buf_block_get_page_no(block)) == 0);
#endif
	buf_pool->n_page_gets++;

	return(TRUE);
}
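/* Illustrative sketch of the optimistic pattern served by
buf_page_optimistic_get_func(): remember the block pointer and its modify
clock while the page is latched, release everything, and later re-latch
only if the block has not been modified or evicted in between.  The
variables below are placeholders. */
#if 0
	buf_block_t*	guess;
	ib_uint64_t	modify_clock;

	/* While the page is latched: */
	guess = block;
	modify_clock = buf_block_get_modify_clock(block);

	/* ... latch released, other work done ... */

	if (buf_page_optimistic_get_func(RW_S_LATCH, guess, modify_clock,
					 __FILE__, __LINE__, &mtr)) {
		/* The guess was still valid; the page is S-latched and
		buffer-fixed again. */
	} else {
		/* Fall back to a normal buf_page_get() by space/page no. */
	}
#endif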
/********************************************************************//**
This is used to get access to a known database page, when no waiting can be
done. For example, if a search in an adaptive hash index leads us to this
frame.
@return	TRUE if success */
UNIV_INTERN
ibool
buf_page_get_known_nowait(
/*======================*/
	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
	buf_block_t*	block,	/*!< in: the known page */
	ulint		mode,	/*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
	const char*	file,	/*!< in: file name */
	ulint		line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mini-transaction */
{
	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));

	mutex_enter(&block->mutex);

	if (buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH) {
		/* Another thread is just freeing the block from the LRU list
		of the buffer pool: do not try to access this page; this
		attempt to access the page can only come through the hash
		index because when the buffer block state is ..._REMOVE_HASH,
		we have already removed it from the page address hash table
		of the buffer pool. */

		mutex_exit(&block->mutex);

		return(FALSE);
	}

	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

	buf_block_buf_fix_inc(block, file, line);

	mutex_exit(&block->mutex);

	if (mode == BUF_MAKE_YOUNG) {
		buf_block_make_young(&block->page);
	}

	ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD));

	if (rw_latch == RW_S_LATCH) {
		success = rw_lock_s_lock_nowait(&(block->lock),
						file, line);
		fix_type = MTR_MEMO_PAGE_S_FIX;
	} else {
		success = rw_lock_x_lock_func_nowait(&(block->lock),
						     file, line);
		fix_type = MTR_MEMO_PAGE_X_FIX;
	}

	if (!success) {
		mutex_enter(&block->mutex);
		buf_block_buf_fix_dec(block);
		mutex_exit(&block->mutex);

		return(FALSE);
	}

	mtr_memo_push(mtr, block, fix_type);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(++buf_dbg_counter % 5771 || buf_validate());
	ut_a(block->page.buf_fix_count > 0);
	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#ifdef UNIV_DEBUG_FILE_ACCESSES
	ut_a(block->page.file_page_was_freed == FALSE);
#endif /* UNIV_DEBUG_FILE_ACCESSES */

#ifdef UNIV_IBUF_COUNT_DEBUG
	ut_a((mode == BUF_KEEP_OLD)
	     || (ibuf_count_get(buf_block_get_space(block),
				buf_block_get_page_no(block)) == 0));
#endif
	buf_pool->n_page_gets++;

	return(TRUE);
}
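/* Sketch: buf_page_get_known_nowait() is meant for callers that already
hold a safe pointer to the block (for example via the adaptive hash index
or the lock system) and must not wait.  A call might look roughly like
this (placeholders, not compiled): */
#if 0
	if (buf_page_get_known_nowait(RW_S_LATCH, block, BUF_MAKE_YOUNG,
				      __FILE__, __LINE__, &mtr)) {
		/* latched and buffer-fixed; released at mtr_commit() */
	}
#endif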
/*******************************************************************//**
Given a tablespace id and page number tries to get that page. If the
page is not in the buffer pool it is not loaded and NULL is returned.
Suitable for using when holding the kernel mutex.
@return	pointer to a page or NULL */
UNIV_INTERN
const buf_block_t*
buf_page_try_get_func(
/*==================*/
	ulint		space_id,/*!< in: tablespace id */
	ulint		page_no,/*!< in: page number */
	const char*	file,	/*!< in: file name */
	ulint		line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mini-transaction */
{
	buf_block_t*	block;

	buf_pool_mutex_enter();
	block = buf_block_hash_get(space_id, page_no);

	if (!block) {
		buf_pool_mutex_exit();

		return(NULL);
	}

	mutex_enter(&block->mutex);
	buf_pool_mutex_exit();

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
	ut_a(buf_block_get_space(block) == space_id);
	ut_a(buf_block_get_page_no(block) == page_no);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	buf_block_buf_fix_inc(block, file, line);
	mutex_exit(&block->mutex);

	fix_type = MTR_MEMO_PAGE_S_FIX;
	success = rw_lock_s_lock_nowait(&block->lock, file, line);

	if (!success) {
		/* Let us try to get an X-latch. If the current thread
		is holding an X-latch on the page, we cannot get an
		S-latch. */

		fix_type = MTR_MEMO_PAGE_X_FIX;
		success = rw_lock_x_lock_func_nowait(&block->lock,
						     file, line);
	}

	if (!success) {
		mutex_enter(&block->mutex);
		buf_block_buf_fix_dec(block);
		mutex_exit(&block->mutex);

		return(NULL);
	}

	mtr_memo_push(mtr, block, fix_type);
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(++buf_dbg_counter % 5771 || buf_validate());
	ut_a(block->page.buf_fix_count > 0);
	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#ifdef UNIV_DEBUG_FILE_ACCESSES
	ut_a(block->page.file_page_was_freed == FALSE);
#endif /* UNIV_DEBUG_FILE_ACCESSES */
	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);

	buf_pool->n_page_gets++;

#ifdef UNIV_IBUF_COUNT_DEBUG
	ut_a(ibuf_count_get(buf_block_get_space(block),
			    buf_block_get_page_no(block)) == 0);
#endif

	return(block);
}
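/* Sketch: buf_page_try_get_func() is used when the page may or may not be
resident and the caller cannot block, e.g. while printing lock information.
NULL simply means "not cached".  Placeholders, not compiled: */
#if 0
	const buf_block_t*	block;

	block = buf_page_try_get_func(space_id, page_no,
				      __FILE__, __LINE__, &mtr);
	if (block != NULL) {
		/* page was resident and is now latched and buffer-fixed */
	}
#endif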
/********************************************************************//**
Initialize some fields of a control block. */
UNIV_INLINE
void
buf_page_init_low(
/*==============*/
	buf_page_t*	bpage)	/*!< in: block to init */
{
	bpage->flush_type = BUF_FLUSH_LRU;
	bpage->accessed = FALSE;
	bpage->io_fix = BUF_IO_NONE;
	bpage->buf_fix_count = 0;
	bpage->freed_page_clock = 0;
	bpage->newest_modification = 0;
	bpage->oldest_modification = 0;
	HASH_INVALIDATE(bpage, hash);
#ifdef UNIV_DEBUG_FILE_ACCESSES
	bpage->file_page_was_freed = FALSE;
#endif /* UNIV_DEBUG_FILE_ACCESSES */
}
/********************************************************************//**
Inits a page to the buffer buf_pool. */
static
void
buf_page_init(
/*==========*/
	ulint		space,	/*!< in: space id */
	ulint		offset,	/*!< in: offset of the page within space
				in units of a page */
	buf_block_t*	block)	/*!< in: block to init */
{
	buf_page_t*	hash_page;

	ut_ad(buf_pool_mutex_own());
	ut_ad(mutex_own(&(block->mutex)));
	ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);

	/* Set the state of the block */
	buf_block_set_file_page(block, space, offset);

#ifdef UNIV_DEBUG_VALGRIND
	/* Silence valid Valgrind warnings about uninitialized
	data being written to data files. There are some unused
	bytes on some pages that InnoDB does not initialize. */
	UNIV_MEM_VALID(block->frame, UNIV_PAGE_SIZE);
#endif /* UNIV_DEBUG_VALGRIND */

	buf_block_init_low(block);

	block->lock_hash_val = lock_rec_hash(space, offset);

	/* Insert into the hash table of file pages */

	hash_page = buf_page_hash_get(space, offset);

	if (UNIV_LIKELY_NULL(hash_page)) {
		fprintf(stderr,
			"InnoDB: Error: page %lu %lu already found"
			" in the hash table: %p, %p\n",
			(ulong) space, (ulong) offset,
			(const void*) hash_page, (const void*) block);
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
		mutex_exit(&block->mutex);
		buf_pool_mutex_exit();
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

		ut_error;
	}

	buf_page_init_low(&block->page);

	ut_ad(!block->page.in_zip_hash);
	ut_ad(!block->page.in_page_hash);
	ut_d(block->page.in_page_hash = TRUE);
	HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
		    buf_page_address_fold(space, offset), &block->page);
}
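/* Sketch of the lookup that mirrors the HASH_INSERT above: pages are found
again by hashing the (space, offset) pair with buf_page_address_fold(),
which is what buf_page_hash_get() does while buf_pool_mutex is held.
Placeholders, not compiled: */
#if 0
	buf_page_t*	bpage;

	buf_pool_mutex_enter();
	bpage = buf_page_hash_get(space, offset);	/* NULL if not cached */
	buf_pool_mutex_exit();
#endif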
/********************************************************************//**
Function which inits a page for read to the buffer buf_pool. If the page is
(1) already in buf_pool, or
(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
(3) if the space is deleted or being deleted,
then this function does nothing.
Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
on the buffer frame. The io-handler must take care that the flag is cleared
and the lock released later.
@return	pointer to the block or NULL */
UNIV_INTERN
buf_page_t*
buf_page_init_for_read(
/*===================*/
	ulint*		err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
	ulint		mode,	/*!< in: BUF_READ_IBUF_PAGES_ONLY, ... */
	ulint		space,	/*!< in: space id */
	ulint		zip_size,/*!< in: compressed page size, or 0 */
	ibool		unzip,	/*!< in: TRUE=request uncompressed page */
	ib_int64_t	tablespace_version,/*!< in: prevents reading from a
				wrong version of the tablespace in case we
				have done DISCARD + IMPORT */
	ulint		offset)	/*!< in: page number */
{
if (mode == BUF_READ_IBUF_PAGES_ONLY) {
2722
/* It is a read-ahead within an ibuf routine */
2724
ut_ad(!ibuf_bitmap_page(zip_size, offset));
2725
ut_ad(ibuf_inside());
2729
if (!recv_no_ibuf_operations
2730
&& !ibuf_page(space, zip_size, offset, &mtr)) {
2737
ut_ad(mode == BUF_READ_ANY_PAGE);
2740
if (zip_size && UNIV_LIKELY(!unzip)
2741
&& UNIV_LIKELY(!recv_recovery_is_on())) {
2744
block = buf_LRU_get_free_block(0);
2748
buf_pool_mutex_enter();
2750
if (buf_page_hash_get(space, offset)) {
2751
/* The page is already in the buffer pool. */
2754
mutex_enter(&block->mutex);
2755
buf_LRU_block_free_non_file_page(block);
2756
mutex_exit(&block->mutex);
2763
if (fil_tablespace_deleted_or_being_deleted_in_mem(
2764
space, tablespace_version)) {
2765
/* The page belongs to a space which has been
2766
deleted or is being deleted. */
2767
*err = DB_TABLESPACE_DELETED;
2773
bpage = &block->page;
2774
mutex_enter(&block->mutex);
2775
buf_page_init(space, offset, block);
2777
/* The block must be put to the LRU list, to the old blocks */
2778
buf_LRU_add_block(bpage, TRUE/* to old blocks */);
2780
/* We set a pass-type x-lock on the frame because then
2781
the same thread which called for the read operation
2782
(and is running now at this point of code) can wait
2783
for the read to complete by waiting for the x-lock on
2784
the frame; if the x-lock were recursive, the same
2785
thread would illegally get the x-lock before the page
2786
read is completed. The x-lock is cleared by the
2787
io-handler thread. */
2789
rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
2790
buf_page_set_io_fix(bpage, BUF_IO_READ);
2792
if (UNIV_UNLIKELY(zip_size)) {
2793
page_zip_set_size(&block->page.zip, zip_size);
2795
/* buf_pool_mutex may be released and
2796
reacquired by buf_buddy_alloc(). Thus, we
2797
must release block->mutex in order not to
2798
break the latching order in the reacquisition
2799
of buf_pool_mutex. We also must defer this
2800
operation until after the block descriptor has
2801
been added to buf_pool->LRU and
2802
buf_pool->page_hash. */
2803
mutex_exit(&block->mutex);
2804
data = buf_buddy_alloc(zip_size, &lru);
2805
mutex_enter(&block->mutex);
2806
block->page.zip.data = data;
2808
/* To maintain the invariant
2809
block->in_unzip_LRU_list
2810
== buf_page_belongs_to_unzip_LRU(&block->page)
2811
we have to add this block to unzip_LRU
2812
after block->page.zip.data is set. */
2813
ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
2814
buf_unzip_LRU_add_block(block, TRUE);
2817
mutex_exit(&block->mutex);
2819
/* Defer buf_buddy_alloc() until after the block has
2820
been found not to exist. The buf_buddy_alloc() and
2821
buf_buddy_free() calls may be expensive because of
2822
buf_buddy_relocate(). */
2824
/* The compressed page must be allocated before the
2825
control block (bpage), in order to avoid the
2826
invocation of buf_buddy_relocate_block() on
2827
uninitialized data. */
2828
data = buf_buddy_alloc(zip_size, &lru);
2829
bpage = buf_buddy_alloc(sizeof *bpage, &lru);
2831
/* If buf_buddy_alloc() allocated storage from the LRU list,
2832
it released and reacquired buf_pool_mutex. Thus, we must
2833
check the page_hash again, as it may have been modified. */
2834
if (UNIV_UNLIKELY(lru)
2835
&& UNIV_LIKELY_NULL(buf_page_hash_get(space, offset))) {
2837
/* The block was added by some other thread. */
2838
buf_buddy_free(bpage, sizeof *bpage);
2839
buf_buddy_free(data, zip_size);
2845
page_zip_des_init(&bpage->zip);
2846
page_zip_set_size(&bpage->zip, zip_size);
2847
bpage->zip.data = data;
2849
mutex_enter(&buf_pool_zip_mutex);
2850
UNIV_MEM_DESC(bpage->zip.data,
2851
page_zip_get_size(&bpage->zip), bpage);
2852
buf_page_init_low(bpage);
2853
bpage->state = BUF_BLOCK_ZIP_PAGE;
2854
bpage->space = space;
2855
bpage->offset = offset;
2858
bpage->in_page_hash = FALSE;
2859
bpage->in_zip_hash = FALSE;
2860
bpage->in_flush_list = FALSE;
2861
bpage->in_free_list = FALSE;
2862
bpage->in_LRU_list = FALSE;
2863
#endif /* UNIV_DEBUG */
2865
ut_d(bpage->in_page_hash = TRUE);
2866
HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
2867
buf_page_address_fold(space, offset), bpage);
2869
/* The block must be put to the LRU list, to the old blocks */
2870
buf_LRU_add_block(bpage, TRUE/* to old blocks */);
2871
buf_LRU_insert_zip_clean(bpage);
2873
buf_page_set_io_fix(bpage, BUF_IO_READ);
2875
mutex_exit(&buf_pool_zip_mutex);
2878
buf_pool->n_pend_reads++;
2880
buf_pool_mutex_exit();
2882
if (mode == BUF_READ_IBUF_PAGES_ONLY) {
2887
ut_ad(!bpage || buf_page_in_file(bpage));
2891
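/* Rough sketch of how the read path fits together (simplified; the real
code lives in buf0rea.c): buf_read_page_low() first reserves the page with
buf_page_init_for_read(), then issues the file i/o, and the i/o handler
finally calls buf_page_io_complete() on the same descriptor.  Placeholders,
not compiled: */
#if 0
	ulint		err;
	buf_page_t*	bpage;

	bpage = buf_page_init_for_read(&err, BUF_READ_ANY_PAGE, space,
				       zip_size, FALSE, tablespace_version,
				       offset);
	if (bpage != NULL) {
		/* ... post the read via fil_io(); on completion the i/o
		handler thread calls buf_page_io_complete(bpage), which
		clears the io_fix and releases the x-lock ... */
	}
#endif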
/********************************************************************//**
Initializes a page to the buffer buf_pool. The page is usually not read
from a file even if it cannot be found in the buffer buf_pool. This is one
of the functions which perform to a block a state transition NOT_USED =>
FILE_PAGE (the other is buf_page_get_gen).
@return	pointer to the block, page bufferfixed */
UNIV_INTERN
buf_block_t*
buf_page_create(
/*============*/
	ulint	space,	/*!< in: space id */
	ulint	offset,	/*!< in: offset of the page within space in units of
			a page */
	ulint	zip_size,/*!< in: compressed page size, or 0 */
	mtr_t*	mtr)	/*!< in: mini-transaction handle */
{
	buf_block_t*	free_block	= NULL;

	ut_ad(space || !zip_size);

	free_block = buf_LRU_get_free_block(0);

	buf_pool_mutex_enter();

	block = (buf_block_t*) buf_page_hash_get(space, offset);

	if (block && buf_page_in_file(&block->page)) {
#ifdef UNIV_IBUF_COUNT_DEBUG
		ut_a(ibuf_count_get(space, offset) == 0);
#endif
#ifdef UNIV_DEBUG_FILE_ACCESSES
		block->page.file_page_was_freed = FALSE;
#endif /* UNIV_DEBUG_FILE_ACCESSES */

		/* Page can be found in buf_pool */
		buf_pool_mutex_exit();

		buf_block_free(free_block);

		return(buf_page_get_with_no_latch(space, zip_size,
						  offset, mtr));
	}

	/* If we get here, the page was not in buf_pool: init it there */

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr, "Creating space %lu page %lu to buffer\n",
			(ulong) space, (ulong) offset);
	}
#endif /* UNIV_DEBUG */

	block = free_block;

	mutex_enter(&block->mutex);

	buf_page_init(space, offset, block);

	/* The block must be put to the LRU list */
	buf_LRU_add_block(&block->page, FALSE);

	buf_block_buf_fix_inc(block, __FILE__, __LINE__);
	buf_pool->n_pages_created++;

	if (zip_size) {
		void*	data;
		ibool	lru;

		/* Prevent race conditions during buf_buddy_alloc(),
		which may release and reacquire buf_pool_mutex,
		by IO-fixing and X-latching the block. */

		buf_page_set_io_fix(&block->page, BUF_IO_READ);
		rw_lock_x_lock(&block->lock);

		page_zip_set_size(&block->page.zip, zip_size);
		mutex_exit(&block->mutex);
		/* buf_pool_mutex may be released and reacquired by
		buf_buddy_alloc(). Thus, we must release block->mutex
		in order not to break the latching order in
		the reacquisition of buf_pool_mutex. We also must
		defer this operation until after the block descriptor
		has been added to buf_pool->LRU and buf_pool->page_hash. */
		data = buf_buddy_alloc(zip_size, &lru);
		mutex_enter(&block->mutex);
		block->page.zip.data = data;

		/* To maintain the invariant
		block->in_unzip_LRU_list
		== buf_page_belongs_to_unzip_LRU(&block->page)
		we have to add this block to unzip_LRU after
		block->page.zip.data is set. */
		ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
		buf_unzip_LRU_add_block(block, FALSE);

		buf_page_set_io_fix(&block->page, BUF_IO_NONE);
		rw_lock_x_unlock(&block->lock);
	}

	buf_pool_mutex_exit();

	mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX);

	buf_page_set_accessed(&block->page, TRUE);

	mutex_exit(&block->mutex);

	/* Delete possible entries for the page from the insert buffer:
	such can exist if the page belonged to an index which was dropped */

	ibuf_merge_or_delete_for_page(NULL, space, offset, zip_size, TRUE);

	/* Flush pages from the end of the LRU list if necessary */
	buf_flush_free_margin();

	frame = block->frame;

	memset(frame + FIL_PAGE_PREV, 0xff, 4);
	memset(frame + FIL_PAGE_NEXT, 0xff, 4);
	mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);

	/* Reset to zero the file flush lsn field in the page; if the first
	page of an ibdata file is 'created' in this function into the buffer
	pool then we lose the original contents of the file flush lsn stamp.
	Then InnoDB could in a crash recovery print a big, false, corruption
	warning if the stamp contains an lsn bigger than the ib_logfile lsn. */

	memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(++buf_dbg_counter % 357 || buf_validate());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#ifdef UNIV_IBUF_COUNT_DEBUG
	ut_a(ibuf_count_get(buf_block_get_space(block),
			    buf_block_get_page_no(block)) == 0);
#endif
	return(block);
}
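/* Illustrative sketch: buf_page_create() is used when a brand-new page is
allocated (for example when a file segment grows) and its on-disk contents
do not matter.  The returned block is buffer-fixed for the caller; the
caller normally latches it through the mini-transaction before writing to
it.  Placeholders, not compiled: */
#if 0
	buf_block_t*	block;

	block = buf_page_create(space_id, page_no, zip_size, &mtr);
	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
	/* ... initialize the frame under an x-latch obtained e.g. with
	buf_page_get(space_id, zip_size, page_no, RW_X_LATCH, &mtr) ... */
#endif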
/********************************************************************//**
Completes an asynchronous read or write request of a file page to or from
the buffer pool. */
UNIV_INTERN
void
buf_page_io_complete(
/*=================*/
	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
{
	enum buf_io_fix	io_type;
	const ibool	uncompressed = (buf_page_get_state(bpage)
					== BUF_BLOCK_FILE_PAGE);
ut_a(buf_page_in_file(bpage));
3048
/* We do not need protect io_fix here by mutex to read
3049
it because this is the only function where we can change the value
3050
from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code
3051
ensures that this is the only thread that handles the i/o for this
3054
io_type = buf_page_get_io_fix(bpage);
3055
ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
3057
if (io_type == BUF_IO_READ) {
3059
ulint read_space_id;
3062
if (buf_page_get_zip_size(bpage)) {
3063
frame = bpage->zip.data;
3064
buf_pool->n_pend_unzip++;
3066
&& !buf_zip_decompress((buf_block_t*) bpage,
3069
buf_pool->n_pend_unzip--;
3072
buf_pool->n_pend_unzip--;
3075
frame = ((buf_block_t*) bpage)->frame;
3078
/* If this page is not uninitialized and not in the
3079
doublewrite buffer, then the page number and space id
3080
should be the same as in block. */
3081
read_page_no = mach_read_from_4(frame + FIL_PAGE_OFFSET);
3082
read_space_id = mach_read_from_4(
3083
frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
3085
if (bpage->space == TRX_SYS_SPACE
3086
&& trx_doublewrite_page_inside(bpage->offset)) {
3088
ut_print_timestamp(stderr);
3090
" InnoDB: Error: reading page %lu\n"
3091
"InnoDB: which is in the"
3092
" doublewrite buffer!\n",
3093
(ulong) bpage->offset);
3094
} else if (!read_space_id && !read_page_no) {
3095
/* This is likely an uninitialized page. */
3096
} else if ((bpage->space
3097
&& bpage->space != read_space_id)
3098
|| bpage->offset != read_page_no) {
3099
/* We did not compare space_id to read_space_id
3100
if bpage->space == 0, because the field on the
3101
page may contain garbage in MySQL < 4.1.1,
3102
which only supported bpage->space == 0. */
3104
ut_print_timestamp(stderr);
3106
" InnoDB: Error: space id and page n:o"
3107
" stored in the page\n"
3108
"InnoDB: read in are %lu:%lu,"
3109
" should be %lu:%lu!\n",
3110
(ulong) read_space_id, (ulong) read_page_no,
3111
(ulong) bpage->space,
3112
(ulong) bpage->offset);
3115
/* From version 3.23.38 up we store the page checksum
3116
to the 4 first bytes of the page end lsn field */
3118
if (buf_page_is_corrupted(frame,
3119
buf_page_get_zip_size(bpage))) {
3122
"InnoDB: Database page corruption on disk"
3124
"InnoDB: file read of page %lu.\n"
3125
"InnoDB: You may have to recover"
3126
" from a backup.\n",
3127
(ulong) bpage->offset);
3128
buf_page_print(frame, buf_page_get_zip_size(bpage));
3130
"InnoDB: Database page corruption on disk"
3132
"InnoDB: file read of page %lu.\n"
3133
"InnoDB: You may have to recover"
3134
" from a backup.\n",
3135
(ulong) bpage->offset);
3136
fputs("InnoDB: It is also possible that"
3138
"InnoDB: system has corrupted its"
3140
"InnoDB: and rebooting your computer"
3143
"InnoDB: If the corrupt page is an index page\n"
3144
"InnoDB: you can also try to"
3145
" fix the corruption\n"
3146
"InnoDB: by dumping, dropping,"
3147
" and reimporting\n"
3148
"InnoDB: the corrupt table."
3149
" You can use CHECK\n"
3150
"InnoDB: TABLE to scan your"
3151
" table for corruption.\n"
3153
REFMAN "forcing-recovery.html\n"
3154
"InnoDB: about forcing recovery.\n", stderr);
3156
if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) {
3157
fputs("InnoDB: Ending processing because of"
3158
" a corrupt database page.\n",
3164
if (recv_recovery_is_on()) {
3165
/* Pages must be uncompressed for crash recovery. */
3167
recv_recover_page(TRUE, (buf_block_t*) bpage);
3170
if (uncompressed && !recv_no_ibuf_operations) {
3171
ibuf_merge_or_delete_for_page(
3172
(buf_block_t*) bpage, bpage->space,
3173
bpage->offset, buf_page_get_zip_size(bpage),
3178
buf_pool_mutex_enter();
3179
mutex_enter(buf_page_get_mutex(bpage));
3181
#ifdef UNIV_IBUF_COUNT_DEBUG
3182
if (io_type == BUF_IO_WRITE || uncompressed) {
3183
/* For BUF_IO_READ of compressed-only blocks, the
3184
buffered operations will be merged by buf_page_get_gen()
3185
after the block has been uncompressed. */
3186
ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
3189
/* Because this thread which does the unlocking is not the same that
3190
did the locking, we use a pass value != 0 in unlock, which simply
3191
removes the newest lock debug record, without checking the thread
3194
buf_page_set_io_fix(bpage, BUF_IO_NONE);
3198
/* NOTE that the call to ibuf may have moved the ownership of
3199
the x-latch to this OS thread: do not let this confuse you in
3202
ut_ad(buf_pool->n_pend_reads > 0);
3203
buf_pool->n_pend_reads--;
3204
buf_pool->n_pages_read++;
3207
rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock,
3214
/* Write means a flush operation: call the completion
3215
routine in the flush system */
3217
buf_flush_write_complete(bpage);
3220
rw_lock_s_unlock_gen(&((buf_block_t*) bpage)->lock,
3224
buf_pool->n_pages_written++;
3233
if (buf_debug_prints) {
3234
fprintf(stderr, "Has %s page space %lu page no %lu\n",
3235
io_type == BUF_IO_READ ? "read" : "written",
3236
(ulong) buf_page_get_space(bpage),
3237
(ulong) buf_page_get_page_no(bpage));
3239
#endif /* UNIV_DEBUG */
3241
mutex_exit(buf_page_get_mutex(bpage));
3242
buf_pool_mutex_exit();
3245
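/* Simplified picture of who calls buf_page_io_complete(): the i/o handler
threads sit in fil_aio_wait() (fil0fil.c), which collects a finished
asynchronous request and then invokes buf_page_io_complete() on the
corresponding block.  Sketch only, not compiled: */
#if 0
	for (;;) {
		fil_aio_wait(segment);	/* ends up in buf_page_io_complete() */
	}
#endif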
/*********************************************************************//**
Invalidates the file pages in the buffer pool when an archive recovery is
completed. All the file pages buffered must be in a replaceable state when
this function is called: not latched and not modified. */
UNIV_INTERN
void
buf_pool_invalidate(void)
/*=====================*/
{
	ibool	freed;

	ut_ad(buf_all_freed());

	freed = TRUE;

	while (freed) {
		freed = buf_LRU_search_and_free_block(100);
	}

	buf_pool_mutex_enter();

	ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
	ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0);

	buf_pool_mutex_exit();
}
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/*********************************************************************//**
Validates the buffer buf_pool data structure.
@return	TRUE */
UNIV_INTERN
ibool
buf_validate(void)
/*==============*/
{
	ulint		n_single_flush	= 0;
	ulint		n_lru_flush	= 0;
	ulint		n_list_flush	= 0;
buf_pool_mutex_enter();
3296
chunk = buf_pool->chunks;
3298
/* Check the uncompressed blocks. */
3300
for (i = buf_pool->n_chunks; i--; chunk++) {
3303
buf_block_t* block = chunk->blocks;
3305
for (j = chunk->size; j--; block++) {
3307
mutex_enter(&block->mutex);
3309
switch (buf_block_get_state(block)) {
3310
case BUF_BLOCK_ZIP_FREE:
3311
case BUF_BLOCK_ZIP_PAGE:
3312
case BUF_BLOCK_ZIP_DIRTY:
3313
/* These should only occur on
3314
zip_clean, zip_free[], or flush_list. */
3318
case BUF_BLOCK_FILE_PAGE:
3319
ut_a(buf_page_hash_get(buf_block_get_space(
3321
buf_block_get_page_no(
3325
#ifdef UNIV_IBUF_COUNT_DEBUG
3326
ut_a(buf_page_get_io_fix(&block->page)
3328
|| !ibuf_count_get(buf_block_get_space(
3330
buf_block_get_page_no(
3333
switch (buf_page_get_io_fix(&block->page)) {
3338
switch (buf_page_get_flush_type(
3342
ut_a(rw_lock_is_locked(
3346
case BUF_FLUSH_LIST:
3349
case BUF_FLUSH_SINGLE_PAGE:
3360
ut_a(rw_lock_is_locked(&block->lock,
3367
if (block->page.oldest_modification > 0) {
3373
case BUF_BLOCK_NOT_USED:
3377
case BUF_BLOCK_READY_FOR_USE:
3378
case BUF_BLOCK_MEMORY:
3379
case BUF_BLOCK_REMOVE_HASH:
3384
mutex_exit(&block->mutex);
3388
mutex_enter(&buf_pool_zip_mutex);
3390
/* Check clean compressed-only blocks. */
3392
for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
3393
b = UT_LIST_GET_NEXT(list, b)) {
3394
ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
3395
switch (buf_page_get_io_fix(b)) {
3397
/* All clean blocks should be I/O-unfixed. */
3400
/* In buf_LRU_free_block(), we temporarily set
3401
b->io_fix = BUF_IO_READ for a newly allocated
3402
control block in order to prevent
3403
buf_page_get_gen() from decompressing the block. */
3409
ut_a(!b->oldest_modification);
3410
ut_a(buf_page_hash_get(b->space, b->offset) == b);
3416
/* Check dirty compressed-only blocks. */
3418
for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
3419
b = UT_LIST_GET_NEXT(list, b)) {
3420
ut_ad(b->in_flush_list);
3422
switch (buf_page_get_state(b)) {
3423
case BUF_BLOCK_ZIP_DIRTY:
3424
ut_a(b->oldest_modification);
3428
switch (buf_page_get_io_fix(b)) {
3434
switch (buf_page_get_flush_type(b)) {
3438
case BUF_FLUSH_LIST:
3441
case BUF_FLUSH_SINGLE_PAGE:
3450
case BUF_BLOCK_FILE_PAGE:
3451
/* uncompressed page */
3453
case BUF_BLOCK_ZIP_FREE:
3454
case BUF_BLOCK_ZIP_PAGE:
3455
case BUF_BLOCK_NOT_USED:
3456
case BUF_BLOCK_READY_FOR_USE:
3457
case BUF_BLOCK_MEMORY:
3458
case BUF_BLOCK_REMOVE_HASH:
3462
ut_a(buf_page_hash_get(b->space, b->offset) == b);
3465
mutex_exit(&buf_pool_zip_mutex);
3467
if (n_lru + n_free > buf_pool->curr_size + n_zip) {
3468
fprintf(stderr, "n LRU %lu, n free %lu, pool %lu zip %lu\n",
3469
(ulong) n_lru, (ulong) n_free,
3470
(ulong) buf_pool->curr_size, (ulong) n_zip);
3474
ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
3475
if (UT_LIST_GET_LEN(buf_pool->free) != n_free) {
3476
fprintf(stderr, "Free list len %lu, free blocks %lu\n",
3477
(ulong) UT_LIST_GET_LEN(buf_pool->free),
3481
ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
3483
ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush);
3484
ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
3485
ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
3487
buf_pool_mutex_exit();
3489
ut_a(buf_LRU_validate());
3490
ut_a(buf_flush_validate());
3494
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3496
#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/*********************************************************************//**
Prints info of the buffer buf_pool data structure. */
UNIV_INTERN
void
buf_print(void)
/*===========*/
{
dict_index_t* index;
3516
size = buf_pool->curr_size;
3518
index_ids = mem_alloc(sizeof(dulint) * size);
3519
counts = mem_alloc(sizeof(ulint) * size);
3521
buf_pool_mutex_enter();
3524
"buf_pool size %lu\n"
3525
"database pages %lu\n"
3527
"modified database pages %lu\n"
3528
"n pending decompressions %lu\n"
3529
"n pending reads %lu\n"
3530
"n pending flush LRU %lu list %lu single page %lu\n"
3531
"pages read %lu, created %lu, written %lu\n",
3533
(ulong) UT_LIST_GET_LEN(buf_pool->LRU),
3534
(ulong) UT_LIST_GET_LEN(buf_pool->free),
3535
(ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
3536
(ulong) buf_pool->n_pend_unzip,
3537
(ulong) buf_pool->n_pend_reads,
3538
(ulong) buf_pool->n_flush[BUF_FLUSH_LRU],
3539
(ulong) buf_pool->n_flush[BUF_FLUSH_LIST],
3540
(ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE],
3541
(ulong) buf_pool->n_pages_read, buf_pool->n_pages_created,
3542
(ulong) buf_pool->n_pages_written);
3544
/* Count the number of blocks belonging to each index in the buffer */
3548
chunk = buf_pool->chunks;
3550
for (i = buf_pool->n_chunks; i--; chunk++) {
3551
buf_block_t* block = chunk->blocks;
3552
ulint n_blocks = chunk->size;
3554
for (; n_blocks--; block++) {
3555
const buf_frame_t* frame = block->frame;
3557
if (fil_page_get_type(frame) == FIL_PAGE_INDEX) {
3559
id = btr_page_get_index_id(frame);
3561
/* Look for the id in the index_ids array */
3564
while (j < n_found) {
3566
if (ut_dulint_cmp(index_ids[j],
3584
buf_pool_mutex_exit();
3586
for (i = 0; i < n_found; i++) {
3587
index = dict_index_get_if_in_cache(index_ids[i]);
3590
"Block count for index %lu in buffer is about %lu",
3591
(ulong) ut_dulint_get_low(index_ids[i]),
3596
dict_index_name_print(stderr, NULL, index);
3602
mem_free(index_ids);
3605
ut_a(buf_validate());
3607
#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
3610
#ifdef UNIV_DEBUG
/*********************************************************************//**
Returns the number of latched pages in the buffer pool.
@return	number of latched pages */
UNIV_INTERN
ulint
buf_get_latched_pages_number(void)
/*==============================*/
{
ulint fixed_pages_number = 0;
3623
buf_pool_mutex_enter();
3625
chunk = buf_pool->chunks;
3627
for (i = buf_pool->n_chunks; i--; chunk++) {
3631
block = chunk->blocks;
3633
for (j = chunk->size; j--; block++) {
3634
if (buf_block_get_state(block)
3635
!= BUF_BLOCK_FILE_PAGE) {
3640
mutex_enter(&block->mutex);
3642
if (block->page.buf_fix_count != 0
3643
|| buf_page_get_io_fix(&block->page)
3645
fixed_pages_number++;
3648
mutex_exit(&block->mutex);
3652
mutex_enter(&buf_pool_zip_mutex);
3654
/* Traverse the lists of clean and dirty compressed-only blocks. */
3656
for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
3657
b = UT_LIST_GET_NEXT(list, b)) {
3658
ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
3659
ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE);
3661
if (b->buf_fix_count != 0
3662
|| buf_page_get_io_fix(b) != BUF_IO_NONE) {
3663
fixed_pages_number++;
3667
for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
3668
b = UT_LIST_GET_NEXT(list, b)) {
3669
ut_ad(b->in_flush_list);
3671
switch (buf_page_get_state(b)) {
3672
case BUF_BLOCK_ZIP_DIRTY:
3673
if (b->buf_fix_count != 0
3674
|| buf_page_get_io_fix(b) != BUF_IO_NONE) {
3675
fixed_pages_number++;
3678
case BUF_BLOCK_FILE_PAGE:
3679
/* uncompressed page */
3681
case BUF_BLOCK_ZIP_FREE:
3682
case BUF_BLOCK_ZIP_PAGE:
3683
case BUF_BLOCK_NOT_USED:
3684
case BUF_BLOCK_READY_FOR_USE:
3685
case BUF_BLOCK_MEMORY:
3686
case BUF_BLOCK_REMOVE_HASH:
3692
mutex_exit(&buf_pool_zip_mutex);
3693
buf_pool_mutex_exit();
3695
return(fixed_pages_number);
3697
#endif /* UNIV_DEBUG */
3699
/*********************************************************************//**
Returns the number of pending buf pool ios.
@return	number of pending I/O operations */
UNIV_INTERN
ulint
buf_get_n_pending_ios(void)
/*=======================*/
{
	return(buf_pool->n_pend_reads
	       + buf_pool->n_flush[BUF_FLUSH_LRU]
	       + buf_pool->n_flush[BUF_FLUSH_LIST]
	       + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
}
/*********************************************************************//**
Returns the ratio in percents of modified pages in the buffer pool /
database pages in the buffer pool.
@return	modified page percentage ratio */
UNIV_INTERN
ulint
buf_get_modified_ratio_pct(void)
/*============================*/
{
	ulint	ratio;

	buf_pool_mutex_enter();

	ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list))
		/ (1 + UT_LIST_GET_LEN(buf_pool->LRU)
		   + UT_LIST_GET_LEN(buf_pool->free));

	/* 1 + is there to avoid division by zero */

	buf_pool_mutex_exit();

	return(ratio);
}
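/* Worked example of the ratio above: with 300 pages on the flush_list,
900 on the LRU list and 100 on the free list, the result is
(100 * 300) / (1 + 900 + 100) = 30000 / 1001 = 29 (integer division),
i.e. roughly 29 per cent of the cached pages are modified. */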
/*********************************************************************//**
Prints info of the buffer i/o. */
UNIV_INTERN
void
buf_print_io(
/*=========*/
	FILE*	file)	/*!< in/out: buffer where to print */
{
	time_t	current_time;
	double	time_elapsed;
	ulint	size;

	ut_ad(buf_pool);
	size = buf_pool->curr_size;

	buf_pool_mutex_enter();

	fprintf(file,
		"Buffer pool size %lu\n"
		"Free buffers %lu\n"
		"Database pages %lu\n"
		"Modified database pages %lu\n"
		"Pending reads %lu\n"
		"Pending writes: LRU %lu, flush list %lu, single page %lu\n",
		(ulong) size,
		(ulong) UT_LIST_GET_LEN(buf_pool->free),
		(ulong) UT_LIST_GET_LEN(buf_pool->LRU),
		(ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
		(ulong) buf_pool->n_pend_reads,
		(ulong) buf_pool->n_flush[BUF_FLUSH_LRU]
		+ buf_pool->init_flush[BUF_FLUSH_LRU],
		(ulong) buf_pool->n_flush[BUF_FLUSH_LIST]
		+ buf_pool->init_flush[BUF_FLUSH_LIST],
		(ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);

	current_time = time(NULL);
	time_elapsed = 0.001 + difftime(current_time,
					buf_pool->last_printout_time);
	buf_pool->last_printout_time = current_time;

	fprintf(file,
		"Pages read %lu, created %lu, written %lu\n"
		"%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
		(ulong) buf_pool->n_pages_read,
		(ulong) buf_pool->n_pages_created,
		(ulong) buf_pool->n_pages_written,
		(buf_pool->n_pages_read - buf_pool->n_pages_read_old)
		/ time_elapsed,
		(buf_pool->n_pages_created - buf_pool->n_pages_created_old)
		/ time_elapsed,
		(buf_pool->n_pages_written - buf_pool->n_pages_written_old)
		/ time_elapsed);

	if (buf_pool->n_page_gets > buf_pool->n_page_gets_old) {
		fprintf(file, "Buffer pool hit rate %lu / 1000\n",
			(1000 - ((1000 * (buf_pool->n_pages_read
					  - buf_pool->n_pages_read_old))
				 / (buf_pool->n_page_gets
				    - buf_pool->n_page_gets_old))));
	} else {
		fputs("No buffer pool page gets since the last printout\n",
		      file);
	}

	buf_pool->n_page_gets_old = buf_pool->n_page_gets;
	buf_pool->n_pages_read_old = buf_pool->n_pages_read;
	buf_pool->n_pages_created_old = buf_pool->n_pages_created;
	buf_pool->n_pages_written_old = buf_pool->n_pages_written;

	/* Print some values to help us with visualizing what is
	happening with LRU eviction. */
	fprintf(file,
		"LRU len: %lu, unzip_LRU len: %lu\n"
		"I/O sum[%lu]:cur[%lu], unzip sum[%lu]:cur[%lu]\n",
		UT_LIST_GET_LEN(buf_pool->LRU),
		UT_LIST_GET_LEN(buf_pool->unzip_LRU),
		buf_LRU_stat_sum.io, buf_LRU_stat_cur.io,
		buf_LRU_stat_sum.unzip, buf_LRU_stat_cur.unzip);

	buf_pool_mutex_exit();
}
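/* Worked example of the hit rate printed above: if 500 pages were read
from disk while 100000 page gets were served since the last printout, the
line shows 1000 - (1000 * 500) / 100000 = 1000 - 5 = 995, i.e.
"Buffer pool hit rate 995 / 1000". */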
/**********************************************************************//**
Refreshes the statistics used to print per-second averages. */
UNIV_INTERN
void
buf_refresh_io_stats(void)
/*======================*/
{
	buf_pool->last_printout_time = time(NULL);
	buf_pool->n_page_gets_old = buf_pool->n_page_gets;
	buf_pool->n_pages_read_old = buf_pool->n_pages_read;
	buf_pool->n_pages_created_old = buf_pool->n_pages_created;
	buf_pool->n_pages_written_old = buf_pool->n_pages_written;
}
/*********************************************************************//**
Asserts that all file pages in the buffer are in a replaceable state.
@return	TRUE */
UNIV_INTERN
ibool
buf_all_freed(void)
/*===============*/
{
	buf_chunk_t*	chunk;
	ulint		i;

	buf_pool_mutex_enter();

	chunk = buf_pool->chunks;

	for (i = buf_pool->n_chunks; i--; chunk++) {

		const buf_block_t* block = buf_chunk_not_freed(chunk);

		if (UNIV_LIKELY_NULL(block)) {
			fprintf(stderr,
				"Page %lu %lu still fixed or dirty\n",
				(ulong) block->page.space,
				(ulong) block->page.offset);
			ut_error;
		}
	}

	buf_pool_mutex_exit();

	return(TRUE);
}
/*********************************************************************//**
Checks that there currently are no pending i/o-operations for the buffer
pool.
@return	TRUE if there is no pending i/o */
UNIV_INTERN
ibool
buf_pool_check_no_pending_io(void)
/*==============================*/
{
	ibool	ret;

	buf_pool_mutex_enter();

	if (buf_pool->n_pend_reads + buf_pool->n_flush[BUF_FLUSH_LRU]
	    + buf_pool->n_flush[BUF_FLUSH_LIST]
	    + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]) {
		ret = FALSE;
	} else {
		ret = TRUE;
	}

	buf_pool_mutex_exit();

	return(ret);
}
/*********************************************************************//**
Gets the current length of the free list of buffer blocks.
@return	length of the free list */
UNIV_INTERN
ulint
buf_get_free_list_len(void)
/*=======================*/
{
	ulint	len;

	buf_pool_mutex_enter();

	len = UT_LIST_GET_LEN(buf_pool->free);

	buf_pool_mutex_exit();

	return(len);
}
#else /* !UNIV_HOTBACKUP */
/********************************************************************//**
Inits a page to the buffer buf_pool, for use in ibbackup --restore. */
UNIV_INTERN
void
buf_page_init_for_backup_restore(
/*=============================*/
	ulint		space,	/*!< in: space id */
	ulint		offset,	/*!< in: offset of the page within space
				in units of a page */
	ulint		zip_size,/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	buf_block_t*	block)	/*!< in: block to init */
{
	block->page.state = BUF_BLOCK_FILE_PAGE;
	block->page.space = space;
	block->page.offset = offset;

	page_zip_des_init(&block->page.zip);

	/* We assume that block->page.data has been allocated
	with zip_size == UNIV_PAGE_SIZE. */
	ut_ad(zip_size <= UNIV_PAGE_SIZE);
	ut_ad(ut_is_2pow(zip_size));
	page_zip_set_size(&block->page.zip, zip_size);

	if (zip_size) {
		block->page.zip.data = block->frame + UNIV_PAGE_SIZE;
	}
}
#endif /* !UNIV_HOTBACKUP */