/*-------------------------------------------------------------------------
 *
 * freespace.c
 *	  POSTGRES free space map for quickly finding free space in relations
 *
 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/storage/freespace/freespace.c
 *
 * NOTES:
 *
 *	Free Space Map keeps track of the amount of free space on pages, and
 *	allows quickly searching for a page with enough free space. The FSM is
 *	stored in a dedicated relation fork of all heap relations, and those
 *	index access methods that need it (see also indexfsm.c). See README for
 *	more information.
 *
 *-------------------------------------------------------------------------
 */
#include "access/htup.h"
27
#include "access/xlogutils.h"
28
#include "miscadmin.h"
29
#include "storage/bufmgr.h"
30
#include "storage/bufpage.h"
31
#include "storage/freespace.h"
32
#include "storage/fsm_internals.h"
33
#include "storage/lmgr.h"
34
#include "storage/lwlock.h"
35
#include "storage/smgr.h"
36
#include "utils/rel.h"
40
* We use just one byte to store the amount of free space on a page, so we
41
* divide the amount of free space a page can have into 256 different
42
* categories. The highest category, 255, represents a page with at least
43
* MaxFSMRequestSize bytes of free space, and the second highest category
44
* represents the range from 254 * FSM_CAT_STEP, inclusive, to
45
* MaxFSMRequestSize, exclusive.
47
* MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
48
* default 8k BLCKSZ, and that MaxFSMRequestSize is 24 bytes, the categories
60
* The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
61
* isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
62
* bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
63
* bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
64
* completely empty page, that would mean that we could never satisfy a
65
* request of exactly MaxFSMRequestSize bytes.
67
#define FSM_CATEGORIES 256
68
#define FSM_CAT_STEP (BLCKSZ / FSM_CATEGORIES)
69
#define MaxFSMRequestSize MaxHeapTupleSize
72
* Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
73
* and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
74
* 216 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
75
* this means that 4096 bytes is the smallest BLCKSZ that we can get away
76
* with a 3-level tree, and 512 is the smallest we support.
78
#define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
80
#define FSM_ROOT_LEVEL (FSM_TREE_DEPTH - 1)
81
#define FSM_BOTTOM_LEVEL 0
84
* The internal FSM routines work on a logical addressing scheme. Each
85
* level of the tree can be thought of as a separately addressable file.
89
int level; /* level */
90
int logpageno; /* page number within the level */
93
/* Address of the root page. */
94
static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
96
/* functions to navigate the tree */
97
static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
98
static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
99
static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
100
static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
101
static BlockNumber fsm_logical_to_physical(FSMAddress addr);
103
static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
104
static void fsm_extend(Relation rel, BlockNumber fsm_nblocks);
106
/* functions to convert amount of free space to a FSM category */
107
static uint8 fsm_space_avail_to_cat(Size avail);
108
static uint8 fsm_space_needed_to_cat(Size needed);
109
static Size fsm_space_cat_to_avail(uint8 cat);
111
/* workhorse functions for various operations */
112
static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
113
uint8 newValue, uint8 minValue);
114
static BlockNumber fsm_search(Relation rel, uint8 min_cat);
115
static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof);
118
/******** Public API ********/
121
* GetPageWithFreeSpace - try to find a page in the given relation with
122
* at least the specified amount of free space.
124
* If successful, return the block number; if not, return InvalidBlockNumber.
126
* The caller must be prepared for the possibility that the returned page
127
* will turn out to have too little space available by the time the caller
128
* gets a lock on it. In that case, the caller should report the actual
129
* amount of free space available on that page and then try again (see
130
* RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
131
* extend the relation.
134
GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
136
uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded);
138
return fsm_search(rel, min_cat);
142
* RecordAndGetPageWithFreeSpace - update info about a page and try again.
144
* We provide this combo form to save some locking overhead, compared to
145
* separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
146
* also some effort to return a page close to the old page; if there's a
147
* page with enough free space on the same FSM page where the old one page
148
* is located, it is preferred.
151
RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
152
Size oldSpaceAvail, Size spaceNeeded)
154
int old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
155
int search_cat = fsm_space_needed_to_cat(spaceNeeded);
160
/* Get the location of the FSM byte representing the heap block */
161
addr = fsm_get_location(oldPage, &slot);
163
search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
166
* If fsm_set_and_search found a suitable new block, return that.
167
* Otherwise, search as usual.
169
if (search_slot != -1)
170
return fsm_get_heap_blk(addr, search_slot);
172
return fsm_search(rel, search_cat);
176
* RecordPageWithFreeSpace - update info about a page.
178
* Note that if the new spaceAvail value is higher than the old value stored
179
* in the FSM, the space might not become visible to searchers until the next
180
* FreeSpaceMapVacuum call, which updates the upper level pages.
183
RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
185
int new_cat = fsm_space_avail_to_cat(spaceAvail);
189
/* Get the location of the FSM byte representing the heap block */
190
addr = fsm_get_location(heapBlk, &slot);
192
fsm_set_and_search(rel, addr, slot, new_cat, 0);
196
* XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
200
XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
203
int new_cat = fsm_space_avail_to_cat(spaceAvail);
210
/* Get the location of the FSM byte representing the heap block */
211
addr = fsm_get_location(heapBlk, &slot);
212
blkno = fsm_logical_to_physical(addr);
214
/* If the page doesn't exist already, extend */
215
buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR);
216
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
218
page = BufferGetPage(buf);
220
PageInit(page, BLCKSZ, 0);
222
if (fsm_set_avail(page, slot, new_cat))
223
MarkBufferDirty(buf);
224
UnlockReleaseBuffer(buf);
228
* GetRecordedFreePage - return the amount of free space on a particular page,
229
* according to the FSM.
232
GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
239
/* Get the location of the FSM byte representing the heap block */
240
addr = fsm_get_location(heapBlk, &slot);
242
buf = fsm_readbuf(rel, addr, false);
243
if (!BufferIsValid(buf))
245
cat = fsm_get_avail(BufferGetPage(buf), slot);
248
return fsm_space_cat_to_avail(cat);
252
* FreeSpaceMapTruncateRel - adjust for truncation of a relation.
254
* The caller must hold AccessExclusiveLock on the relation, to ensure that
255
* other backends receive the smgr invalidation event that this function sends
256
* before they access the FSM again.
258
* nblocks is the new size of the heap.
261
FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
263
BlockNumber new_nfsmblocks;
264
FSMAddress first_removed_address;
265
uint16 first_removed_slot;
268
RelationOpenSmgr(rel);
271
* If no FSM has been created yet for this relation, there's nothing to
274
if (!smgrexists(rel->rd_smgr, FSM_FORKNUM))
277
/* Get the location in the FSM of the first removed heap block */
278
first_removed_address = fsm_get_location(nblocks, &first_removed_slot);
281
* Zero out the tail of the last remaining FSM page. If the slot
282
* representing the first removed heap block is at a page boundary, as the
283
* first slot on the FSM page that first_removed_address points to, we can
284
* just truncate that page altogether.
286
if (first_removed_slot > 0)
288
buf = fsm_readbuf(rel, first_removed_address, false);
289
if (!BufferIsValid(buf))
290
return; /* nothing to do; the FSM was already smaller */
291
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
292
fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
293
MarkBufferDirty(buf);
294
UnlockReleaseBuffer(buf);
296
new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
300
new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
301
if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks)
302
return; /* nothing to do; the FSM was already smaller */
305
/* Truncate the unused FSM pages, and send smgr inval message */
306
smgrtruncate(rel->rd_smgr, FSM_FORKNUM, new_nfsmblocks);
309
* We might as well update the local smgr_fsm_nblocks setting.
310
* smgrtruncate sent an smgr cache inval message, which will cause other
311
* backends to invalidate their copy of smgr_fsm_nblocks, and this one too
312
* at the next command boundary. But this ensures it isn't outright wrong
316
rel->rd_smgr->smgr_fsm_nblocks = new_nfsmblocks;
320
* FreeSpaceMapVacuum - scan and fix any inconsistencies in the FSM
323
FreeSpaceMapVacuum(Relation rel)
328
* Traverse the tree in depth-first order. The tree is stored physically
329
* in depth-first order, so this should be pretty I/O efficient.
331
fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, &dummy);
334
/******** Internal routines ********/
337
* Return category corresponding x bytes of free space
340
fsm_space_avail_to_cat(Size avail)
344
Assert(avail < BLCKSZ);
346
if (avail >= MaxFSMRequestSize)
349
cat = avail / FSM_CAT_STEP;
352
* The highest category, 255, is reserved for MaxFSMRequestSize bytes or
362
* Return the lower bound of the range of free space represented by given
366
fsm_space_cat_to_avail(uint8 cat)
368
/* The highest category represents exactly MaxFSMRequestSize bytes. */
370
return MaxFSMRequestSize;
372
return cat * FSM_CAT_STEP;
376
* Which category does a page need to have, to accommodate x bytes of data?
377
* While fsm_size_to_avail_cat() rounds down, this needs to round up.
380
fsm_space_needed_to_cat(Size needed)
384
/* Can't ask for more space than the highest category represents */
385
if (needed > MaxFSMRequestSize)
386
elog(ERROR, "invalid FSM request size %lu",
387
(unsigned long) needed);
392
cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
401
* Returns the physical block number an FSM page
404
fsm_logical_to_physical(FSMAddress addr)
411
* Calculate the logical page number of the first leaf page below the
414
leafno = addr.logpageno;
415
for (l = 0; l < addr.level; l++)
416
leafno *= SlotsPerFSMPage;
418
/* Count upper level nodes required to address the leaf page */
420
for (l = 0; l < FSM_TREE_DEPTH; l++)
423
leafno /= SlotsPerFSMPage;
427
* If the page we were asked for wasn't at the bottom level, subtract the
428
* additional lower level pages we counted above.
432
/* Turn the page count into 0-based block number */
437
* Return the FSM location corresponding to given heap block.
440
fsm_get_location(BlockNumber heapblk, uint16 *slot)
444
addr.level = FSM_BOTTOM_LEVEL;
445
addr.logpageno = heapblk / SlotsPerFSMPage;
446
*slot = heapblk % SlotsPerFSMPage;
452
* Return the heap block number corresponding to given location in the FSM.
455
fsm_get_heap_blk(FSMAddress addr, uint16 slot)
457
Assert(addr.level == FSM_BOTTOM_LEVEL);
458
return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
462
* Given a logical address of a child page, get the logical page number of
463
* the parent, and the slot within the parent corresponding to the child.
466
fsm_get_parent(FSMAddress child, uint16 *slot)
470
Assert(child.level < FSM_ROOT_LEVEL);
472
parent.level = child.level + 1;
473
parent.logpageno = child.logpageno / SlotsPerFSMPage;
474
*slot = child.logpageno % SlotsPerFSMPage;
480
* Given a logical address of a parent page, and a slot number get the
481
* logical address of the corresponding child page.
484
fsm_get_child(FSMAddress parent, uint16 slot)
488
Assert(parent.level > FSM_BOTTOM_LEVEL);
490
child.level = parent.level - 1;
491
child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
499
* If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
500
* true, the FSM file is extended.
503
fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
505
BlockNumber blkno = fsm_logical_to_physical(addr);
508
RelationOpenSmgr(rel);
511
* If we haven't cached the size of the FSM yet, check it first. Also
512
* recheck if the requested block seems to be past end, since our cached
513
* value might be stale. (We send smgr inval messages on truncation, but
516
if (rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber ||
517
blkno >= rel->rd_smgr->smgr_fsm_nblocks)
519
if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
520
rel->rd_smgr->smgr_fsm_nblocks = smgrnblocks(rel->rd_smgr,
523
rel->rd_smgr->smgr_fsm_nblocks = 0;
526
/* Handle requests beyond EOF */
527
if (blkno >= rel->rd_smgr->smgr_fsm_nblocks)
530
fsm_extend(rel, blkno + 1);
532
return InvalidBuffer;
536
* Use ZERO_ON_ERROR mode, and initialize the page if necessary. The FSM
537
* information is not accurate anyway, so it's better to clear corrupt
538
* pages than error out. Since the FSM changes are not WAL-logged, the
539
* so-called torn page problem on crash can lead to pages with corrupt
540
* headers, for example.
542
buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
543
if (PageIsNew(BufferGetPage(buf)))
544
PageInit(BufferGetPage(buf), BLCKSZ, 0);
549
* Ensure that the FSM fork is at least fsm_nblocks long, extending
550
* it if necessary with empty pages. And by empty, I mean pages filled
551
* with zeros, meaning there's no free space.
554
fsm_extend(Relation rel, BlockNumber fsm_nblocks)
556
BlockNumber fsm_nblocks_now;
559
pg = (Page) palloc(BLCKSZ);
560
PageInit(pg, BLCKSZ, 0);
563
* We use the relation extension lock to lock out other backends trying to
564
* extend the FSM at the same time. It also locks out extension of the
565
* main fork, unnecessarily, but extending the FSM happens seldom enough
566
* that it doesn't seem worthwhile to have a separate lock tag type for
569
* Note that another backend might have extended or created the relation
570
* by the time we get the lock.
572
LockRelationForExtension(rel, ExclusiveLock);
574
/* Might have to re-open if a cache flush happened */
575
RelationOpenSmgr(rel);
578
* Create the FSM file first if it doesn't exist. If smgr_fsm_nblocks is
579
* positive then it must exist, no need for an smgrexists call.
581
if ((rel->rd_smgr->smgr_fsm_nblocks == 0 ||
582
rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber) &&
583
!smgrexists(rel->rd_smgr, FSM_FORKNUM))
584
smgrcreate(rel->rd_smgr, FSM_FORKNUM, false);
586
fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
588
while (fsm_nblocks_now < fsm_nblocks)
590
smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now,
595
/* Update local cache with the up-to-date size */
596
rel->rd_smgr->smgr_fsm_nblocks = fsm_nblocks_now;
598
UnlockRelationForExtension(rel, ExclusiveLock);
604
* Set value in given FSM page and slot.
606
* If minValue > 0, the updated page is also searched for a page with at
607
* least minValue of free space. If one is found, its slot number is
608
* returned, -1 otherwise.
611
fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
612
uint8 newValue, uint8 minValue)
618
buf = fsm_readbuf(rel, addr, true);
619
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
621
page = BufferGetPage(buf);
623
if (fsm_set_avail(page, slot, newValue))
624
MarkBufferDirty(buf);
628
/* Search while we still hold the lock */
629
newslot = fsm_search_avail(buf, minValue,
630
addr.level == FSM_BOTTOM_LEVEL,
634
UnlockReleaseBuffer(buf);
640
* Search the tree for a heap page with at least min_cat of free space
643
fsm_search(Relation rel, uint8 min_cat)
646
FSMAddress addr = FSM_ROOT_ADDRESS;
654
/* Read the FSM page. */
655
buf = fsm_readbuf(rel, addr, false);
657
/* Search within the page */
658
if (BufferIsValid(buf))
660
LockBuffer(buf, BUFFER_LOCK_SHARE);
661
slot = fsm_search_avail(buf, min_cat,
662
(addr.level == FSM_BOTTOM_LEVEL),
665
max_avail = fsm_get_max_avail(BufferGetPage(buf));
666
UnlockReleaseBuffer(buf);
674
* Descend the tree, or return the found block if we're at the
677
if (addr.level == FSM_BOTTOM_LEVEL)
678
return fsm_get_heap_blk(addr, slot);
680
addr = fsm_get_child(addr, slot);
682
else if (addr.level == FSM_ROOT_LEVEL)
685
* At the root, failure means there's no page with enough free
686
* space in the FSM. Give up.
688
return InvalidBlockNumber;
696
* At lower level, failure can happen if the value in the upper-
697
* level node didn't reflect the value on the lower page. Update
698
* the upper node, to avoid falling into the same trap again, and
701
* There's a race condition here, if another backend updates this
702
* page right after we release it, and gets the lock on the parent
703
* page before us. We'll then update the parent page with the now
704
* stale information we had. It's OK, because it should happen
705
* rarely, and will be fixed by the next vacuum.
707
parent = fsm_get_parent(addr, &parentslot);
708
fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
711
* If the upper pages are badly out of date, we might need to loop
712
* quite a few times, updating them as we go. Any inconsistencies
713
* should eventually be corrected and the loop should end. Looping
714
* indefinitely is nevertheless scary, so provide an emergency
717
if (restarts++ > 10000)
718
return InvalidBlockNumber;
720
/* Start search all over from the root */
721
addr = FSM_ROOT_ADDRESS;
728
* Recursive guts of FreeSpaceMapVacuum
731
fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p)
737
/* Read the page if it exists, or return EOF */
738
buf = fsm_readbuf(rel, addr, false);
739
if (!BufferIsValid(buf))
747
page = BufferGetPage(buf);
750
* Recurse into children, and fix the information stored about them at
753
if (addr.level > FSM_BOTTOM_LEVEL)
758
for (slot = 0; slot < SlotsPerFSMPage; slot++)
762
CHECK_FOR_INTERRUPTS();
764
/* After we hit end-of-file, just clear the rest of the slots */
766
child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), &eof);
770
/* Update information about the child */
771
if (fsm_get_avail(page, slot) != child_avail)
773
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
774
fsm_set_avail(BufferGetPage(buf), slot, child_avail);
775
MarkBufferDirty(buf);
776
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
781
max_avail = fsm_get_max_avail(BufferGetPage(buf));
784
* Reset the next slot pointer. This encourages the use of low-numbered
785
* pages, increasing the chances that a later vacuum can truncate the
788
((FSMPage) PageGetContents(page))->fp_next_slot = 0;