1
/*****************************************************************************
3
Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
5
This program is free software; you can redistribute it and/or modify it under
6
the terms of the GNU General Public License as published by the Free Software
7
Foundation; version 2 of the License.
9
This program is distributed in the hope that it will be useful, but WITHOUT
10
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
You should have received a copy of the GNU General Public License along with
14
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15
Place, Suite 330, Boston, MA 02111-1307 USA
17
*****************************************************************************/
19
/**************************************************//**
21
The database buffer read
23
Created 11/5/1995 Heikki Tuuri
24
*******************************************************/
34
#include "ibuf0ibuf.h"
38
#include "srv0start.h"
41
/** The size in blocks of the area in which the random read-ahead algorithm
42
counts the accessed pages when deciding whether to read ahead; defined to be the same as BUF_READ_AHEAD_AREA */
43
#define BUF_READ_AHEAD_RANDOM_AREA BUF_READ_AHEAD_AREA
45
/** There must be at least this many pages in buf_pool in the area to start
46
a random read-ahead (one more than half of the random read-ahead area) */
47
#define BUF_READ_AHEAD_RANDOM_THRESHOLD (1 + BUF_READ_AHEAD_RANDOM_AREA / 2)
49
/** The linear read-ahead area size in pages; defined to be the same as BUF_READ_AHEAD_AREA */
50
#define BUF_READ_AHEAD_LINEAR_AREA BUF_READ_AHEAD_AREA
52
/** If the number of pending reads exceeds buf_pool->curr_size divided by the
53
number below, then read-ahead is not done: this is to prevent flooding the
54
buffer pool with i/o-fixed buffer blocks */
55
#define BUF_READ_AHEAD_PEND_LIMIT 2
57
/********************************************************************//**
58
Low-level function which reads a page asynchronously from a file to the
59
buffer buf_pool if it is not already there, in which case does nothing.
60
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
61
flag is cleared and the x-lock released by an i/o-handler thread.
62
@return 1 if a read request was queued, 0 if the page already resided
63
in buf_pool, or if the page is in the doublewrite buffer blocks in
64
which case it is never read into the pool, or if the tablespace does
65
not exist or is being dropped */
70
ulint* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
71
trying to read from a non-existent tablespace, or a
72
tablespace which is just now being dropped */
73
ibool sync, /*!< in: TRUE if synchronous aio is desired */
74
ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ...,
75
ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
76
at read-ahead functions) */
77
ulint space, /*!< in: space id */
78
ulint zip_size,/*!< in: compressed page size, or 0 */
79
ibool unzip, /*!< in: TRUE=request uncompressed page */
80
ib_int64_t tablespace_version, /*!< in: if the space memory object has
81
this timestamp different from what we are giving here,
82
treat the tablespace as dropped; this is a timestamp we
83
use to stop dangling page reads from a tablespace
84
which we have DISCARDed + IMPORTed back */
85
ulint offset) /*!< in: page number */
92
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
93
mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
95
if (trx_doublewrite && space == TRX_SYS_SPACE
96
&& ( (offset >= trx_doublewrite->block1
97
&& offset < trx_doublewrite->block1
98
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
99
|| (offset >= trx_doublewrite->block2
100
&& offset < trx_doublewrite->block2
101
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
102
ut_print_timestamp(stderr);
104
" InnoDB: Warning: trying to read"
105
" doublewrite buffer page %lu\n",
111
if (ibuf_bitmap_page(zip_size, offset)
112
|| trx_sys_hdr_page(space, offset)) {
114
/* Trx sys header is so low in the latching order that we play
115
safe and do not leave the i/o-completion to an asynchronous
116
i/o-thread. Ibuf bitmap pages must always be read with
117
synchronous i/o, to make sure they do not get involved in
123
/* The following call will also check if the tablespace does not exist
124
or is being dropped; if we succeed in initing the page in the buffer
125
pool for read, then DISCARD cannot proceed until the read has
127
bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip,
128
tablespace_version, offset);
135
if (buf_debug_prints) {
137
"Posting read request for page %lu, sync %lu\n",
143
ut_ad(buf_page_in_file(bpage));
146
*err = fil_io(OS_FILE_READ | wake_later,
147
sync, space, zip_size, offset, 0, zip_size,
148
bpage->zip.data, bpage);
150
ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
152
*err = fil_io(OS_FILE_READ | wake_later,
153
sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
154
((buf_block_t*) bpage)->frame, bpage);
156
ut_a(*err == DB_SUCCESS);
159
/* The i/o is already completed when we arrive from
161
buf_page_io_complete(bpage);
167
/********************************************************************//**
168
Applies a random read-ahead in buf_pool if there are at least a threshold
169
value of accessed pages from the random read-ahead area. Does not read any
170
page, not even the one at the position (space, offset), if the read-ahead
171
mechanism is not activated. NOTE 1: the calling thread may own latches on
172
pages: to avoid deadlocks this function must be written such that it cannot
173
end up waiting for these latches! NOTE 2: the calling thread must want
174
access to the page given: this rule is set to prevent unintended read-aheads
175
performed by ibuf routines, a situation which could result in a deadlock if
176
the OS does not support asynchronous i/o.
177
@return number of page read requests issued; NOTE that if we read ibuf
178
pages, it may happen that the page at the given page number does not
179
get read even if we return a positive value! */
182
buf_read_ahead_random(
183
/*==================*/
184
ulint space, /*!< in: space id */
185
ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
186
ulint offset) /*!< in: page number of a page which the current thread
192
/* We have currently disabled random readahead */
197
/********************************************************************//**
198
High-level function which reads a page asynchronously from a file to the
199
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
200
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
201
released by the i/o-handler thread. Does a random read-ahead if it seems
203
@return number of page read requests issued: this can be greater than
204
1 if read-ahead occurred */
209
ulint space, /*!< in: space id */
210
ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
211
ulint offset) /*!< in: page number */
213
ib_int64_t tablespace_version;
218
tablespace_version = fil_space_get_version(space);
220
count = buf_read_ahead_random(space, zip_size, offset);
222
/* We do the i/o in the synchronous aio mode to save thread
223
switches: hence TRUE */
225
count2 = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
227
tablespace_version, offset);
228
srv_buf_pool_reads+= count2;
229
if (err == DB_TABLESPACE_DELETED) {
230
ut_print_timestamp(stderr);
232
" InnoDB: Error: trying to access"
233
" tablespace %lu page no. %lu,\n"
234
"InnoDB: but the tablespace does not exist"
235
" or is just being dropped.\n",
236
(ulong) space, (ulong) offset);
239
/* Flush pages from the end of the LRU list if necessary */
240
buf_flush_free_margin();
242
/* Increment number of I/O operations used for LRU policy. */
243
buf_LRU_stat_inc_io();
245
return(count + count2);
248
/********************************************************************//**
249
Applies linear read-ahead if in the buf_pool the page is a border page of
250
a linear read-ahead area and all the pages in the area have been accessed.
251
Does not read any page if the read-ahead mechanism is not activated. Note
252
that the algorithm looks at the 'natural' adjacent successor and
253
predecessor of the page, which on the leaf level of a B-tree are the next
254
and previous page in the chain of leaves. To know these, the page specified
255
in (space, offset) must already be present in the buf_pool. Thus, the
256
natural way to use this function is to call it when a page in the buf_pool
257
is accessed the first time, calling this function just after it has been
259
NOTE 1: as this function looks at the natural predecessor and successor
260
fields on the page, what happens, if these are not initialized to any
261
sensible value? No problem, before applying read-ahead we check that the
262
area to read is within the span of the space, if not, read-ahead is not
263
applied. An uninitialized value may result in a useless read operation, but
264
only very improbably.
265
NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
266
function must be written such that it cannot end up waiting for these
268
NOTE 3: the calling thread must want access to the page given: this rule is
269
set to prevent unintended read-aheads performed by ibuf routines, a situation
270
which could result in a deadlock if the OS does not support asynchronous io.
271
@return number of page read requests issued */
274
buf_read_ahead_linear(
275
/*==================*/
276
ulint space, /*!< in: space id */
277
ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
278
ulint offset) /*!< in: page number of a page; NOTE: the current thread
279
must want access to this page (see NOTE 3 above) */
281
ib_int64_t tablespace_version;
284
buf_page_t* pred_bpage = NULL;
295
const ulint buf_read_ahead_linear_area
296
= BUF_READ_AHEAD_LINEAR_AREA;
299
if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
300
/* No read-ahead to avoid thread deadlocks */
304
low = (offset / buf_read_ahead_linear_area)
305
* buf_read_ahead_linear_area;
306
high = (offset / buf_read_ahead_linear_area + 1)
307
* buf_read_ahead_linear_area;
309
if ((offset != low) && (offset != high - 1)) {
310
/* This is not a border page of the area: return */
315
if (ibuf_bitmap_page(zip_size, offset)
316
|| trx_sys_hdr_page(space, offset)) {
318
/* If it is an ibuf bitmap page or trx sys hdr, we do
319
no read-ahead, as that could break the ibuf page access
325
/* Remember the tablespace version before we ask the tablespace size
326
below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
327
do not try to read outside the bounds of the tablespace! */
329
tablespace_version = fil_space_get_version(space);
331
buf_pool_mutex_enter();
333
if (high > fil_space_get_size(space)) {
334
buf_pool_mutex_exit();
335
/* The area is not whole, return */
340
if (buf_pool->n_pend_reads
341
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
342
buf_pool_mutex_exit();
347
/* Check that almost all pages in the area have been accessed; if
348
offset == low, the accesses must be in a descending order, otherwise,
349
in an ascending order. */
357
/* How many out of order accessed pages can we ignore
358
when working out the access pattern for linear readahead */
359
threshold = ut_min((64 - srv_read_ahead_threshold),
360
BUF_READ_AHEAD_AREA);
364
for (i = low; i < high; i++) {
365
bpage = buf_page_hash_get(space, i);
367
if ((bpage == NULL) || !buf_page_is_accessed(bpage)) {
371
} else if (pred_bpage) {
372
int res = (ut_ulint_cmp(
373
buf_page_get_LRU_position(bpage),
374
buf_page_get_LRU_position(pred_bpage)));
375
/* Accesses not in the right order */
376
if (res != 0 && res != asc_or_desc) {
381
if (fail_count > threshold) {
382
/* Too many failures: return */
383
buf_pool_mutex_exit();
387
if (bpage && buf_page_is_accessed(bpage)) {
392
/* If we got this far, we know that enough pages in the area have
393
been accessed in the right order: linear read-ahead can be sensible */
395
bpage = buf_page_hash_get(space, offset);
398
buf_pool_mutex_exit();
403
switch (buf_page_get_state(bpage)) {
404
case BUF_BLOCK_ZIP_PAGE:
405
frame = bpage->zip.data;
407
case BUF_BLOCK_FILE_PAGE:
408
frame = ((buf_block_t*) bpage)->frame;
415
/* Read the natural predecessor and successor page addresses from
416
the page; NOTE that because the calling thread may have an x-latch
417
on the page, we do not acquire an s-latch on the page, this is to
418
prevent deadlocks. Even if we read values which are nonsense, the
419
algorithm will work. */
421
pred_offset = fil_page_get_prev(frame);
422
succ_offset = fil_page_get_next(frame);
424
buf_pool_mutex_exit();
426
if ((offset == low) && (succ_offset == offset + 1)) {
428
/* This is ok, we can continue */
429
new_offset = pred_offset;
431
} else if ((offset == high - 1) && (pred_offset == offset - 1)) {
433
/* This is ok, we can continue */
434
new_offset = succ_offset;
436
/* Successor or predecessor not in the right order */
441
low = (new_offset / buf_read_ahead_linear_area)
442
* buf_read_ahead_linear_area;
443
high = (new_offset / buf_read_ahead_linear_area + 1)
444
* buf_read_ahead_linear_area;
446
if ((new_offset != low) && (new_offset != high - 1)) {
447
/* This is not a border page of the area: return */
452
if (high > fil_space_get_size(space)) {
453
/* The area is not whole, return */
458
/* If we got this far, read-ahead can be sensible: do it */
461
ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
463
ibuf_mode = BUF_READ_ANY_PAGE;
468
/* Since Windows XP seems to schedule the i/o handler thread
469
very eagerly, and consequently it does not wait for the
470
full read batch to be posted, we use special heuristics here */
472
os_aio_simulated_put_read_threads_to_sleep();
474
for (i = low; i < high; i++) {
475
/* It is only sensible to do read-ahead in the non-sync
476
aio mode: hence FALSE as the first parameter */
478
if (!ibuf_bitmap_page(zip_size, i)) {
479
count += buf_read_page_low(
481
ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
482
space, zip_size, FALSE, tablespace_version, i);
483
if (err == DB_TABLESPACE_DELETED) {
484
ut_print_timestamp(stderr);
486
" InnoDB: Warning: in"
487
" linear readahead trying to access\n"
488
"InnoDB: tablespace %lu page %lu,\n"
489
"InnoDB: but the tablespace does not"
490
" exist or is just being dropped.\n",
491
(ulong) space, (ulong) i);
496
/* In simulated aio we wake the aio handler threads only after
497
queuing all aio requests, in native aio the following call does
500
os_aio_simulated_wake_handler_threads();
502
/* Flush pages from the end of the LRU list if necessary */
503
buf_flush_free_margin();
506
if (buf_debug_prints && (count > 0)) {
508
"LINEAR read-ahead space %lu offset %lu pages %lu\n",
509
(ulong) space, (ulong) offset, (ulong) count);
511
#endif /* UNIV_DEBUG */
513
/* Read ahead is considered one I/O operation for the purpose of
514
LRU policy decision. */
515
buf_LRU_stat_inc_io();
517
++srv_read_ahead_seq;
521
/********************************************************************//**
522
Issues read requests for pages which the ibuf module wants to read in, in
523
order to contract the insert buffer tree. Technically, this function is like
524
a read-ahead function. */
527
buf_read_ibuf_merge_pages(
528
/*======================*/
529
ibool sync, /*!< in: TRUE if the caller
530
wants this function to wait
531
for the highest address page
532
to get read in, before this
534
const ulint* space_ids, /*!< in: array of space ids */
535
const ib_int64_t* space_versions,/*!< in: the spaces must have
537
(timestamp), otherwise we
538
discard the read; we use this
539
to cancel reads if DISCARD +
540
IMPORT may have changed the
542
const ulint* page_nos, /*!< in: array of page numbers
543
to read, with the highest page
544
number the last in the
546
ulint n_stored) /*!< in: number of elements
551
ut_ad(!ibuf_inside());
552
#ifdef UNIV_IBUF_DEBUG
553
ut_a(n_stored < UNIV_PAGE_SIZE);
555
while (buf_pool->n_pend_reads
556
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
557
os_thread_sleep(500000);
560
for (i = 0; i < n_stored; i++) {
561
ulint zip_size = fil_space_get_zip_size(space_ids[i]);
564
if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
566
goto tablespace_deleted;
569
buf_read_page_low(&err, sync && (i + 1 == n_stored),
570
BUF_READ_ANY_PAGE, space_ids[i],
571
zip_size, TRUE, space_versions[i],
574
if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) {
576
/* We have deleted or are deleting the single-table
577
tablespace: remove the entries for that page */
579
ibuf_merge_or_delete_for_page(NULL, space_ids[i],
585
os_aio_simulated_wake_handler_threads();
587
/* Flush pages from the end of the LRU list if necessary */
588
buf_flush_free_margin();
591
if (buf_debug_prints) {
593
"Ibuf merge read-ahead space %lu pages %lu\n",
594
(ulong) space_ids[0], (ulong) n_stored);
596
#endif /* UNIV_DEBUG */
599
/********************************************************************//**
600
Issues read requests for pages which recovery wants to read in. */
605
ibool sync, /*!< in: TRUE if the caller
606
wants this function to wait
607
for the highest address page
608
to get read in, before this
610
ulint space, /*!< in: space id */
611
ulint zip_size, /*!< in: compressed page size in
613
const ulint* page_nos, /*!< in: array of page numbers
614
to read, with the highest page
615
number the last in the
617
ulint n_stored) /*!< in: number of page numbers
620
ib_int64_t tablespace_version;
625
zip_size = fil_space_get_zip_size(space);
627
if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
628
/* It is a single table tablespace and the .ibd file is
629
missing: do nothing */
634
tablespace_version = fil_space_get_version(space);
636
for (i = 0; i < n_stored; i++) {
640
os_aio_print_debug = FALSE;
642
while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
644
os_aio_simulated_wake_handler_threads();
645
os_thread_sleep(500000);
651
"InnoDB: Error: InnoDB has waited for"
652
" 50 seconds for pending\n"
653
"InnoDB: reads to the buffer pool to"
655
"InnoDB: Number of pending reads %lu,"
656
" pending pread calls %lu\n",
657
(ulong) buf_pool->n_pend_reads,
658
(ulong)os_file_n_pending_preads);
660
os_aio_print_debug = TRUE;
664
os_aio_print_debug = FALSE;
666
if ((i + 1 == n_stored) && sync) {
667
buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
668
zip_size, TRUE, tablespace_version,
671
buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
672
| OS_AIO_SIMULATED_WAKE_LATER,
673
space, zip_size, TRUE,
674
tablespace_version, page_nos[i]);
678
os_aio_simulated_wake_handler_threads();
680
/* Flush pages from the end of the LRU list if necessary */
681
buf_flush_free_margin();
684
if (buf_debug_prints) {
686
"Recovery applies read-ahead pages %lu\n",
689
#endif /* UNIV_DEBUG */