1
/*****************************************************************************
3
Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
5
This program is free software; you can redistribute it and/or modify it under
6
the terms of the GNU General Public License as published by the Free Software
7
Foundation; version 2 of the License.
9
This program is distributed in the hope that it will be useful, but WITHOUT
10
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
You should have received a copy of the GNU General Public License along with
14
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15
Place, Suite 330, Boston, MA 02111-1307 USA
17
*****************************************************************************/
19
/**************************************************//**
21
Binary buddy allocator for compressed pages
23
Created December 2006 by Marko Makela
24
*******************************************************/
27
#include "buf0buddy.h"
29
# include "buf0buddy.ic"
37
/* Statistic counters */
40
/** Number of frames allocated from the buffer pool to the buddy system.
41
Protected by buf_pool_mutex. */
42
static ulint buf_buddy_n_frames;
43
#endif /* UNIV_DEBUG */
44
/** Statistics of the buddy system, indexed by block size.
45
Protected by buf_pool_mutex. */
46
UNIV_INTERN buf_buddy_stat_t buf_buddy_stat[BUF_BUDDY_SIZES + 1];
48
/**********************************************************************//**
49
Get the offset of the buddy of a compressed page frame.
50
@return the buddy relative of page */
55
byte* page, /*!< in: compressed page */
56
ulint size) /*!< in: page size in bytes */
58
ut_ad(ut_is_2pow(size));
59
ut_ad(size >= BUF_BUDDY_LOW);
60
ut_ad(size < BUF_BUDDY_HIGH);
61
ut_ad(!ut_align_offset(page, size));
63
if (((ulint) page) & size) {
70
/**********************************************************************//**
71
Add a block to the head of the appropriate buddy free list. */
74
buf_buddy_add_to_free(
75
/*==================*/
76
buf_page_t* bpage, /*!< in,own: block to be freed */
77
ulint i) /*!< in: index of buf_pool->zip_free[] */
79
#ifdef UNIV_DEBUG_VALGRIND
80
buf_page_t* b = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
82
if (b) UNIV_MEM_VALID(b, BUF_BUDDY_LOW << i);
83
#endif /* UNIV_DEBUG_VALGRIND */
85
ut_ad(buf_pool_mutex_own());
86
ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
87
ut_ad(buf_pool->zip_free[i].start != bpage);
88
UT_LIST_ADD_FIRST(list, buf_pool->zip_free[i], bpage);
90
#ifdef UNIV_DEBUG_VALGRIND
91
if (b) UNIV_MEM_FREE(b, BUF_BUDDY_LOW << i);
92
UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i);
93
#endif /* UNIV_DEBUG_VALGRIND */
96
/**********************************************************************//**
97
Remove a block from the appropriate buddy free list. */
100
buf_buddy_remove_from_free(
101
/*=======================*/
102
buf_page_t* bpage, /*!< in: block to be removed */
103
ulint i) /*!< in: index of buf_pool->zip_free[] */
105
#ifdef UNIV_DEBUG_VALGRIND
106
buf_page_t* prev = UT_LIST_GET_PREV(list, bpage);
107
buf_page_t* next = UT_LIST_GET_NEXT(list, bpage);
109
if (prev) UNIV_MEM_VALID(prev, BUF_BUDDY_LOW << i);
110
if (next) UNIV_MEM_VALID(next, BUF_BUDDY_LOW << i);
112
ut_ad(!prev || buf_page_get_state(prev) == BUF_BLOCK_ZIP_FREE);
113
ut_ad(!next || buf_page_get_state(next) == BUF_BLOCK_ZIP_FREE);
114
#endif /* UNIV_DEBUG_VALGRIND */
116
ut_ad(buf_pool_mutex_own());
117
ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
118
UT_LIST_REMOVE(list, buf_pool->zip_free[i], bpage);
120
#ifdef UNIV_DEBUG_VALGRIND
121
if (prev) UNIV_MEM_FREE(prev, BUF_BUDDY_LOW << i);
122
if (next) UNIV_MEM_FREE(next, BUF_BUDDY_LOW << i);
123
#endif /* UNIV_DEBUG_VALGRIND */
126
/**********************************************************************//**
127
Try to allocate a block from buf_pool->zip_free[].
128
@return allocated block, or NULL if buf_pool->zip_free[] was empty */
133
ulint i) /*!< in: index of buf_pool->zip_free[] */
137
ut_ad(buf_pool_mutex_own());
138
ut_a(i < BUF_BUDDY_SIZES);
140
#ifndef UNIV_DEBUG_VALGRIND
141
/* Valgrind would complain about accessing free memory. */
142
ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i],
143
ut_ad(buf_page_get_state(ut_list_node_313)
144
== BUF_BLOCK_ZIP_FREE)));
145
#endif /* !UNIV_DEBUG_VALGRIND */
146
bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
149
UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
150
ut_a(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
152
buf_buddy_remove_from_free(bpage, i);
153
} else if (i + 1 < BUF_BUDDY_SIZES) {
154
/* Attempt to split. */
155
bpage = buf_buddy_alloc_zip(i + 1);
158
buf_page_t* buddy = (buf_page_t*)
159
(((char*) bpage) + (BUF_BUDDY_LOW << i));
161
ut_ad(!buf_pool_contains_zip(buddy));
162
ut_d(memset(buddy, i, BUF_BUDDY_LOW << i));
163
buddy->state = BUF_BLOCK_ZIP_FREE;
164
buf_buddy_add_to_free(buddy, i);
170
memset(bpage, ~i, BUF_BUDDY_LOW << i);
172
#endif /* UNIV_DEBUG */
174
UNIV_MEM_ALLOC(bpage, BUF_BUDDY_SIZES << i);
179
/**********************************************************************//**
180
Deallocate a buffer frame of UNIV_PAGE_SIZE. */
183
buf_buddy_block_free(
184
/*=================*/
185
void* buf) /*!< in: buffer frame to deallocate */
187
const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf);
191
ut_ad(buf_pool_mutex_own());
192
ut_ad(!mutex_own(&buf_pool_zip_mutex));
193
ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE));
195
HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage,
196
ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY
197
&& bpage->in_zip_hash && !bpage->in_page_hash),
198
((buf_block_t*) bpage)->frame == buf);
200
ut_a(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY);
201
ut_ad(!bpage->in_page_hash);
202
ut_ad(bpage->in_zip_hash);
203
ut_d(bpage->in_zip_hash = FALSE);
204
HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage);
206
ut_d(memset(buf, 0, UNIV_PAGE_SIZE));
207
UNIV_MEM_INVALID(buf, UNIV_PAGE_SIZE);
209
block = (buf_block_t*) bpage;
210
mutex_enter(&block->mutex);
211
buf_LRU_block_free_non_file_page(block);
212
mutex_exit(&block->mutex);
214
ut_ad(buf_buddy_n_frames > 0);
215
ut_d(buf_buddy_n_frames--);
218
/**********************************************************************//**
219
Allocate a buffer block to the buddy allocator. */
222
buf_buddy_block_register(
223
/*=====================*/
224
buf_block_t* block) /*!< in: buffer frame to allocate */
226
const ulint fold = BUF_POOL_ZIP_FOLD(block);
227
ut_ad(buf_pool_mutex_own());
228
ut_ad(!mutex_own(&buf_pool_zip_mutex));
229
ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE);
231
buf_block_set_state(block, BUF_BLOCK_MEMORY);
234
ut_a(!ut_align_offset(block->frame, UNIV_PAGE_SIZE));
236
ut_ad(!block->page.in_page_hash);
237
ut_ad(!block->page.in_zip_hash);
238
ut_d(block->page.in_zip_hash = TRUE);
239
HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page);
241
ut_d(buf_buddy_n_frames++);
244
/**********************************************************************//**
245
Allocate a block from a bigger object.
246
@return allocated block */
249
buf_buddy_alloc_from(
250
/*=================*/
251
void* buf, /*!< in: a block that is free to use */
252
ulint i, /*!< in: index of buf_pool->zip_free[] */
253
ulint j) /*!< in: size of buf as an index
254
of buf_pool->zip_free[] */
256
ulint offs = BUF_BUDDY_LOW << j;
257
ut_ad(j <= BUF_BUDDY_SIZES);
259
ut_ad(!ut_align_offset(buf, offs));
261
/* Add the unused parts of the block to the free lists. */
268
bpage = (buf_page_t*) ((byte*) buf + offs);
269
ut_d(memset(bpage, j, BUF_BUDDY_LOW << j));
270
bpage->state = BUF_BLOCK_ZIP_FREE;
271
#ifndef UNIV_DEBUG_VALGRIND
272
/* Valgrind would complain about accessing free memory. */
273
ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i],
274
ut_ad(buf_page_get_state(
276
== BUF_BLOCK_ZIP_FREE)));
277
#endif /* !UNIV_DEBUG_VALGRIND */
278
buf_buddy_add_to_free(bpage, j);
284
/**********************************************************************//**
285
Allocate a block. The thread calling this function must hold
286
buf_pool_mutex and must not hold buf_pool_zip_mutex or any block->mutex.
287
The buf_pool_mutex may only be released and reacquired if lru != NULL.
288
@return allocated block, possibly NULL if lru==NULL */
293
ulint i, /*!< in: index of buf_pool->zip_free[],
294
or BUF_BUDDY_SIZES */
295
ibool* lru) /*!< in: pointer to a variable that will be assigned
296
TRUE if storage was allocated from the LRU list
297
and buf_pool_mutex was temporarily released,
298
or NULL if the LRU list should not be used */
302
ut_ad(buf_pool_mutex_own());
303
ut_ad(!mutex_own(&buf_pool_zip_mutex));
305
if (i < BUF_BUDDY_SIZES) {
306
/* Try to allocate from the buddy system. */
307
block = buf_buddy_alloc_zip(i);
315
/* Try allocating from the buf_pool->free list. */
316
block = buf_LRU_get_free_only();
328
/* Try replacing an uncompressed page in the buffer pool. */
329
buf_pool_mutex_exit();
330
block = buf_LRU_get_free_block(0);
332
buf_pool_mutex_enter();
335
buf_buddy_block_register(block);
337
block = buf_buddy_alloc_from(block->frame, i, BUF_BUDDY_SIZES);
340
buf_buddy_stat[i].used++;
344
/**********************************************************************//**
345
Try to relocate the control block of a compressed page.
346
@return TRUE if relocated */
349
buf_buddy_relocate_block(
350
/*=====================*/
351
buf_page_t* bpage, /*!< in: block to relocate */
352
buf_page_t* dpage) /*!< in: free block to relocate to */
356
ut_ad(buf_pool_mutex_own());
358
switch (buf_page_get_state(bpage)) {
359
case BUF_BLOCK_ZIP_FREE:
360
case BUF_BLOCK_NOT_USED:
361
case BUF_BLOCK_READY_FOR_USE:
362
case BUF_BLOCK_FILE_PAGE:
363
case BUF_BLOCK_MEMORY:
364
case BUF_BLOCK_REMOVE_HASH:
366
case BUF_BLOCK_ZIP_DIRTY:
367
/* Cannot relocate dirty pages. */
370
case BUF_BLOCK_ZIP_PAGE:
374
mutex_enter(&buf_pool_zip_mutex);
376
if (!buf_page_can_relocate(bpage)) {
377
mutex_exit(&buf_pool_zip_mutex);
381
buf_relocate(bpage, dpage);
382
ut_d(bpage->state = BUF_BLOCK_ZIP_FREE);
384
/* relocate buf_pool->zip_clean */
385
b = UT_LIST_GET_PREV(list, dpage);
386
UT_LIST_REMOVE(list, buf_pool->zip_clean, dpage);
389
UT_LIST_INSERT_AFTER(list, buf_pool->zip_clean, b, dpage);
391
UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, dpage);
394
mutex_exit(&buf_pool_zip_mutex);
398
/**********************************************************************//**
399
Try to relocate a block.
400
@return TRUE if relocated */
405
void* src, /*!< in: block to relocate */
406
void* dst, /*!< in: free block to relocate to */
407
ulint i) /*!< in: index of buf_pool->zip_free[] */
410
const ulint size = BUF_BUDDY_LOW << i;
411
ullint usec = ut_time_us(NULL);
413
ut_ad(buf_pool_mutex_own());
414
ut_ad(!mutex_own(&buf_pool_zip_mutex));
415
ut_ad(!ut_align_offset(src, size));
416
ut_ad(!ut_align_offset(dst, size));
417
UNIV_MEM_ASSERT_W(dst, size);
419
/* We assume that all memory from buf_buddy_alloc()
420
is used for either compressed pages or buf_page_t
421
objects covering compressed pages. */
423
/* We look inside the allocated objects returned by
424
buf_buddy_alloc() and assume that anything of
425
PAGE_ZIP_MIN_SIZE or larger is a compressed page that contains
426
a valid space_id and page_no in the page header. Should the
427
fields be invalid, we will be unable to relocate the block.
428
We also assume that anything that fits sizeof(buf_page_t)
429
actually is a properly initialized buf_page_t object. */
431
if (size >= PAGE_ZIP_MIN_SIZE) {
432
/* This is a compressed page. */
435
/* The src block may be split into smaller blocks,
436
some of which may be free. Thus, the
437
mach_read_from_4() calls below may attempt to read
438
from free memory. The memory is "owned" by the buddy
439
allocator (and it has been allocated from the buffer
440
pool), so there is nothing wrong about this. The
441
mach_read_from_4() calls here will only trigger bogus
442
Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds. */
443
bpage = buf_page_hash_get(
444
mach_read_from_4((const byte*) src
445
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID),
446
mach_read_from_4((const byte*) src
449
if (!bpage || bpage->zip.data != src) {
450
/* The block has probably been freshly
451
allocated by buf_LRU_get_free_block() but not
452
added to buf_pool->page_hash yet. Obviously,
453
it cannot be relocated. */
458
if (page_zip_get_size(&bpage->zip) != size) {
459
/* The block is of different size. We would
460
have to relocate all blocks covered by src.
461
For the sake of simplicity, give up. */
462
ut_ad(page_zip_get_size(&bpage->zip) < size);
467
/* The block must have been allocated, but it may
468
contain uninitialized data. */
469
UNIV_MEM_ASSERT_W(src, size);
471
mutex = buf_page_get_mutex(bpage);
475
if (buf_page_can_relocate(bpage)) {
476
/* Relocate the compressed page. */
477
ut_a(bpage->zip.data == src);
478
memcpy(dst, src, size);
479
bpage->zip.data = dst;
482
UNIV_MEM_INVALID(src, size);
484
buf_buddy_stat_t* buddy_stat
485
= &buf_buddy_stat[i];
486
buddy_stat->relocated++;
487
buddy_stat->relocated_usec
488
+= ut_time_us(NULL) - usec;
494
} else if (i == buf_buddy_get_slot(sizeof(buf_page_t))) {
495
/* This must be a buf_page_t object. */
496
UNIV_MEM_ASSERT_RW(src, size);
497
if (buf_buddy_relocate_block(src, dst)) {
506
/**********************************************************************//**
507
Deallocate a block. */
512
void* buf, /*!< in: block to be freed, must not be
513
pointed to by the buffer pool */
514
ulint i) /*!< in: index of buf_pool->zip_free[],
515
or BUF_BUDDY_SIZES */
520
ut_ad(buf_pool_mutex_own());
521
ut_ad(!mutex_own(&buf_pool_zip_mutex));
522
ut_ad(i <= BUF_BUDDY_SIZES);
523
ut_ad(buf_buddy_stat[i].used > 0);
525
buf_buddy_stat[i].used--;
527
UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i);
528
ut_d(((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE);
530
if (i == BUF_BUDDY_SIZES) {
531
buf_buddy_block_free(buf);
535
ut_ad(i < BUF_BUDDY_SIZES);
536
ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
537
ut_ad(!buf_pool_contains_zip(buf));
539
/* Try to combine adjacent blocks. */
541
buddy = (buf_page_t*) buf_buddy_get(((byte*) buf), BUF_BUDDY_LOW << i);
543
#ifndef UNIV_DEBUG_VALGRIND
544
/* Valgrind would complain about accessing free memory. */
546
if (buddy->state != BUF_BLOCK_ZIP_FREE) {
551
/* The field buddy->state can only be trusted for free blocks.
552
If buddy->state == BUF_BLOCK_ZIP_FREE, the block is free if
553
it is in the free list. */
554
#endif /* !UNIV_DEBUG_VALGRIND */
556
for (bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); bpage; ) {
557
UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
558
ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
560
if (bpage == buddy) {
562
/* The buddy is free: recombine */
563
buf_buddy_remove_from_free(bpage, i);
565
ut_ad(buf_page_get_state(buddy) == BUF_BLOCK_ZIP_FREE);
566
ut_ad(!buf_pool_contains_zip(buddy));
568
buf = ut_align_down(buf, BUF_BUDDY_LOW << i);
576
buf_page_t* next = UT_LIST_GET_NEXT(list, bpage);
577
UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i);
582
#ifndef UNIV_DEBUG_VALGRIND
584
/* Valgrind would complain about accessing free memory. */
585
ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i],
586
ut_ad(buf_page_get_state(ut_list_node_313)
587
== BUF_BLOCK_ZIP_FREE)));
588
#endif /* UNIV_DEBUG_VALGRIND */
590
/* The buddy is not free. Is there a free block of this size? */
591
bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
594
/* Remove the block from the free list, because a successful
595
buf_buddy_relocate() will overwrite bpage->list. */
597
UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
598
buf_buddy_remove_from_free(bpage, i);
600
/* Try to relocate the buddy of buf to the free block. */
601
if (buf_buddy_relocate(buddy, bpage, i)) {
603
ut_d(buddy->state = BUF_BLOCK_ZIP_FREE);
607
buf_buddy_add_to_free(bpage, i);
609
/* Try to relocate the buddy of the free block to buf. */
610
buddy = (buf_page_t*) buf_buddy_get(((byte*) bpage),
613
#ifndef UNIV_DEBUG_VALGRIND
614
/* Valgrind would complain about accessing free memory. */
616
/* The buddy must not be (completely) free, because we
617
always recombine adjacent free blocks.
619
(Parts of the buddy can be free in
620
buf_pool->zip_free[j] with j < i.) */
621
ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i],
622
ut_ad(buf_page_get_state(
624
== BUF_BLOCK_ZIP_FREE
625
&& ut_list_node_313 != buddy)));
626
#endif /* !UNIV_DEBUG_VALGRIND */
628
if (buf_buddy_relocate(buddy, buf, i)) {
631
UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
632
ut_d(buddy->state = BUF_BLOCK_ZIP_FREE);
637
/* Free the block to the buddy list. */
640
if (i < buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)) {
641
/* This area has most likely been allocated for at
642
least one compressed-only block descriptor. Check
643
that there are no live objects in the area. This is
644
not a complete check: it may yield false positives as
645
well as false negatives. Also, due to buddy blocks
646
being recombined, it is possible (although unlikely)
647
that this branch is never reached. */
651
# ifndef UNIV_DEBUG_VALGRIND
652
/* Valgrind would complain about accessing
653
uninitialized memory. Besides, Valgrind performs a
654
more exhaustive check, at every memory access. */
655
const buf_page_t* b = buf;
656
const buf_page_t* const b_end = (buf_page_t*)
657
((char*) b + (BUF_BUDDY_LOW << i));
659
for (; b < b_end; b++) {
660
/* Avoid false positives (and cause false
661
negatives) by checking for b->space < 1000. */
663
if ((b->state == BUF_BLOCK_ZIP_PAGE
664
|| b->state == BUF_BLOCK_ZIP_DIRTY)
665
&& b->space > 0 && b->space < 1000) {
667
"buddy dirty %p %u (%u,%u) %p,%lu\n",
669
b->state, b->space, b->offset,
673
# endif /* !UNIV_DEBUG_VALGRIND */
675
/* Scramble the block. This should make any pointers
676
invalid and trigger a segmentation violation. Because
677
the scrambling can be reversed, it may be possible to
678
track down the object pointing to the freed data by
679
dereferencing the unscrambled bpage->LRU or
680
bpage->list pointers. */
681
for (c = (char*) buf + (BUF_BUDDY_LOW << i);
682
c-- > (char*) buf; ) {
686
/* Fill large blocks with a constant pattern. */
687
memset(bpage, i, BUF_BUDDY_LOW << i);
689
#endif /* UNIV_DEBUG */
690
bpage->state = BUF_BLOCK_ZIP_FREE;
691
buf_buddy_add_to_free(bpage, i);