1
/*-------------------------------------------------------------------------
4
* buffer manager interface routines
6
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
7
* Portions Copyright (c) 1994, Regents of the University of California
11
* $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.185 2005-01-10 20:02:21 tgl Exp $
13
*-------------------------------------------------------------------------
16
* ReadBuffer() -- find or create a buffer holding the requested page,
17
* and pin it so that no one can destroy it while this process
20
* ReleaseBuffer() -- unpin the buffer
22
* WriteNoReleaseBuffer() -- mark the buffer contents as "dirty"
23
* but don't unpin. The disk IO is delayed until buffer
26
* WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer()
28
* BufferSync() -- flush all (or some) dirty buffers in the buffer pool.
30
* InitBufferPool() -- Init the buffer module.
33
* freelist.c -- chooses victim for buffer replacement
34
* buf_table.c -- manages the buffer lookup table
41
#include "lib/stringinfo.h"
42
#include "miscadmin.h"
43
#include "storage/buf_internals.h"
44
#include "storage/bufmgr.h"
45
#include "storage/bufpage.h"
46
#include "storage/proc.h"
47
#include "storage/smgr.h"
48
#include "utils/relcache.h"
49
#include "utils/resowner.h"
53
#define BufferGetLSN(bufHdr) \
54
(*((XLogRecPtr*) MAKE_PTR((bufHdr)->data)))
58
bool zero_damaged_pages = false;
61
bool ShowPinTrace = false;
64
long NDirectFileRead; /* some I/O's are direct file access.
66
long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */
69
/* local state for StartBufferIO and related functions */
70
static BufferDesc *InProgressBuf = NULL;
71
static bool IsForInput;
72
/* local state for LockBufferForCleanup */
73
static BufferDesc *PinCountWaitBuf = NULL;
76
static void PinBuffer(BufferDesc *buf, bool fixOwner);
77
static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
78
static void WaitIO(BufferDesc *buf);
79
static void StartBufferIO(BufferDesc *buf, bool forInput);
80
static void TerminateBufferIO(BufferDesc *buf, int err_flag);
81
static void ContinueBufferIO(BufferDesc *buf, bool forInput);
82
static void buffer_write_error_callback(void *arg);
83
static Buffer ReadBufferInternal(Relation reln, BlockNumber blockNum,
85
static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
87
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, bool earlylock);
88
static void write_buffer(Buffer buffer, bool unpin);
92
* ReadBuffer -- returns a buffer containing the requested
93
* block of the requested relation. If the blknum
94
* requested is P_NEW, extend the relation file and
95
* allocate a new block. (Caller is responsible for
96
* ensuring that only one backend tries to extend a
97
* relation at the same time!)
99
* Returns: the buffer number for the buffer containing
100
* the block read. The returned buffer has been pinned.
101
* Does not return on error --- elog's instead.
103
* Assume when this function is called, that reln has been
107
ReadBuffer(Relation reln, BlockNumber blockNum)
109
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
110
return ReadBufferInternal(reln, blockNum, false);
114
* ReadBufferInternal -- internal version of ReadBuffer with more options
116
* bufferLockHeld: if true, caller already acquired the bufmgr lock.
117
* (This is assumed never to be true if dealing with a local buffer!)
119
* The caller must have done ResourceOwnerEnlargeBuffers(CurrentResourceOwner)
122
ReadBufferInternal(Relation reln, BlockNumber blockNum,
130
isExtend = (blockNum == P_NEW);
131
isLocalBuf = reln->rd_istemp;
133
/* Open it at the smgr level if not already done */
134
RelationOpenSmgr(reln);
136
/* Substitute proper block number if caller asked for P_NEW */
138
blockNum = smgrnblocks(reln->rd_smgr);
142
ReadLocalBufferCount++;
143
pgstat_count_buffer_read(&reln->pgstat_info, reln);
144
bufHdr = LocalBufferAlloc(reln, blockNum, &found);
146
LocalBufferHitCount++;
151
pgstat_count_buffer_read(&reln->pgstat_info, reln);
154
* lookup the buffer. IO_IN_PROGRESS is set if the requested
155
* block is not currently in memory.
158
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
159
bufHdr = BufferAlloc(reln, blockNum, &found);
164
/* At this point we do NOT hold the bufmgr lock. */
166
/* if it was already in the buffer pool, we're done */
169
/* Just need to update stats before we exit */
170
pgstat_count_buffer_hit(&reln->pgstat_info, reln);
172
if (VacuumCostActive)
173
VacuumCostBalance += VacuumCostPageHit;
175
return BufferDescriptorGetBuffer(bufHdr);
179
* if we have gotten to this point, we have allocated a buffer for the
180
* page but its contents are not yet valid. IO_IN_PROGRESS is set for
181
* it, if it's a shared buffer.
183
* Note: if smgrextend fails, we will end up with a buffer that is
184
* allocated but not marked BM_VALID. P_NEW will still select the
185
* same block number (because the relation didn't get any longer on
186
* disk) and so future attempts to extend the relation will find the
187
* same buffer (if it's not been recycled) but come right back here to
188
* try smgrextend again.
190
Assert(!(bufHdr->flags & BM_VALID));
194
/* new buffers are zero-filled */
195
MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ);
196
smgrextend(reln->rd_smgr, blockNum, (char *) MAKE_PTR(bufHdr->data),
201
smgrread(reln->rd_smgr, blockNum, (char *) MAKE_PTR(bufHdr->data));
202
/* check for garbage data */
203
if (!PageHeaderIsValid((PageHeader) MAKE_PTR(bufHdr->data)))
206
* During WAL recovery, the first access to any data page
207
* should overwrite the whole page from the WAL; so a
208
* clobbered page header is not reason to fail. Hence, when
209
* InRecovery we may always act as though zero_damaged_pages
212
if (zero_damaged_pages || InRecovery)
215
(errcode(ERRCODE_DATA_CORRUPTED),
216
errmsg("invalid page header in block %u of relation \"%s\"; zeroing out page",
217
blockNum, RelationGetRelationName(reln))));
218
MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ);
222
(errcode(ERRCODE_DATA_CORRUPTED),
223
errmsg("invalid page header in block %u of relation \"%s\"",
224
blockNum, RelationGetRelationName(reln))));
230
/* Only need to adjust flags */
231
bufHdr->flags |= BM_VALID;
235
/* lock buffer manager again to update IO IN PROGRESS */
236
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
238
/* IO Succeeded, so mark data valid */
239
bufHdr->flags |= BM_VALID;
241
/* If anyone was waiting for IO to complete, wake them up now */
242
TerminateBufferIO(bufHdr, 0);
244
LWLockRelease(BufMgrLock);
247
if (VacuumCostActive)
248
VacuumCostBalance += VacuumCostPageMiss;
250
return BufferDescriptorGetBuffer(bufHdr);
254
* BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
255
* buffer. If no buffer exists already, selects a replacement
256
* victim and evicts the old page, but does NOT read in new page.
258
* The returned buffer is pinned and is already marked as holding the
259
* desired page. If it already did have the desired page, *foundPtr is
260
* set TRUE. Otherwise, *foundPtr is set FALSE and the buffer is marked
261
* as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
263
* *foundPtr is actually redundant with the buffer's BM_VALID flag, but
264
* we keep it for simplicity in ReadBuffer.
266
* BufMgrLock must be held at entry. When this routine returns,
267
* the BufMgrLock is guaranteed NOT to be held.
270
BufferAlloc(Relation reln,
271
BlockNumber blockNum,
274
BufferTag newTag; /* identity of requested block */
279
bool inProgress; /* did we already do StartBufferIO? */
281
/* create a tag so we can lookup the buffer */
282
INIT_BUFFERTAG(newTag, reln, blockNum);
284
/* see if the block is in the buffer pool already */
285
buf = StrategyBufferLookup(&newTag, false, &cdb_found_index);
289
* Found it. Now, pin the buffer so no one can steal it from the
290
* buffer pool, and check to see if someone else is still reading
291
* data into the buffer. (Formerly, we'd always block here if
292
* IO_IN_PROGRESS is set, but there's no need to wait when someone
293
* is writing rather than reading.)
297
PinBuffer(buf, true);
299
if (!(buf->flags & BM_VALID))
301
if (buf->flags & BM_IO_IN_PROGRESS)
303
/* someone else is reading it, wait for them */
306
if (!(buf->flags & BM_VALID))
309
* If we get here, previous attempts to read the buffer
310
* must have failed ... but we shall bravely try again.
313
StartBufferIO(buf, true);
317
LWLockRelease(BufMgrLock);
325
* Didn't find it in the buffer pool. We'll have to initialize a new
326
* buffer. First, grab one from the free list. If it's dirty, flush
327
* it to disk. Remember to unlock BufMgrLock while doing the IO.
332
buf = StrategyGetBuffer(&cdb_replace_index);
334
/* StrategyGetBuffer will elog if it can't find a free buffer */
338
* There should be exactly one pin on the buffer after it is
339
* allocated -- ours. If it had a pin it wouldn't have been on
340
* the free list. No one else could have pinned it between
341
* StrategyGetBuffer and here because we have the BufMgrLock.
343
* (We must pin the buffer before releasing BufMgrLock ourselves,
344
* to ensure StrategyGetBuffer won't give the same buffer to someone
347
Assert(buf->refcount == 0);
349
PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1;
351
ResourceOwnerRememberBuffer(CurrentResourceOwner,
352
BufferDescriptorGetBuffer(buf));
354
if ((buf->flags & BM_VALID) &&
355
(buf->flags & BM_DIRTY || buf->cntxDirty))
358
* Set BM_IO_IN_PROGRESS to show the buffer is being written.
359
* It cannot already be set because the buffer would be pinned
360
* if someone were writing it.
362
* Note: it's okay to grab the io_in_progress lock while holding
363
* BufMgrLock. All code paths that acquire this lock pin the
364
* buffer first; since no one had it pinned (it just came off
365
* the free list), no one else can have the lock.
367
StartBufferIO(buf, false);
372
* Write the buffer out, being careful to release BufMgrLock
373
* while doing the I/O. We also tell FlushBuffer to share-lock
374
* the buffer before releasing BufMgrLock. This is safe because
375
* we know no other backend currently has the buffer pinned,
376
* therefore no one can have it locked either, so we can always
377
* get the lock without blocking. It is necessary because if
378
* we release BufMgrLock first, it's possible for someone else
379
* to pin and exclusive-lock the buffer before we get to the
380
* share-lock, causing us to block. If the someone else then
381
* blocks on a lock we hold, deadlock ensues. This has been
382
* observed to happen when two backends are both trying to split
383
* btree index pages, and the second one just happens to be
384
* trying to split the page the first one got from the freelist.
386
FlushBuffer(buf, NULL, true);
389
* Somebody could have allocated another buffer for the same
390
* block we are about to read in. While we flush out the dirty
391
* buffer, we don't hold the lock and someone could have
392
* allocated another buffer for the same block. The problem is
393
* we haven't yet inserted the new tag into the buffer table.
394
* So we need to check here. -ay 3/95
396
* Another reason we have to do this is to update
397
* cdb_found_index, since the CDB could have disappeared from
398
* B1/B2 list while we were writing.
400
buf2 = StrategyBufferLookup(&newTag, true, &cdb_found_index);
404
* Found it. Someone has already done what we were about
405
* to do. We'll just handle this as if it were found in
406
* the buffer pool in the first place. First, give up the
407
* buffer we were planning to use.
409
TerminateBufferIO(buf, 0);
410
UnpinBuffer(buf, true);
414
/* remaining code should match code at top of routine */
418
PinBuffer(buf, true);
420
if (!(buf->flags & BM_VALID))
422
if (buf->flags & BM_IO_IN_PROGRESS)
424
/* someone else is reading it, wait for them */
427
if (!(buf->flags & BM_VALID))
430
* If we get here, previous attempts to read the
431
* buffer must have failed ... but we shall
435
StartBufferIO(buf, true);
439
LWLockRelease(BufMgrLock);
445
* Somebody could have pinned the buffer while we were doing
446
* the I/O and had given up the BufMgrLock. If so, we can't
447
* recycle this buffer --- we need to clear the I/O flags,
448
* remove our pin and choose a new victim buffer. Similarly,
449
* we have to start over if somebody re-dirtied the buffer.
451
if (buf->refcount > 1 || buf->flags & BM_DIRTY || buf->cntxDirty)
453
TerminateBufferIO(buf, 0);
454
UnpinBuffer(buf, true);
459
} while (buf == NULL);
462
* At this point we should have the sole pin on a non-dirty buffer and
463
* we may or may not already have the BM_IO_IN_PROGRESS flag set.
467
* Tell the buffer replacement strategy that we are replacing the
468
* buffer content. Then rename the buffer. Clearing BM_VALID here is
469
* necessary, clearing the dirtybits is just paranoia.
471
StrategyReplaceBuffer(buf, &newTag, cdb_found_index, cdb_replace_index);
473
buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
474
buf->cntxDirty = false;
477
* Buffer contents are currently invalid. Have to mark IO IN PROGRESS
478
* so no one fiddles with them until the read completes. We may have
479
* already marked it, in which case we just flip from write to read
483
StartBufferIO(buf, true);
485
ContinueBufferIO(buf, true);
487
LWLockRelease(BufMgrLock);
493
* write_buffer -- common functionality for
494
* WriteBuffer and WriteNoReleaseBuffer
497
write_buffer(Buffer buffer, bool release)
501
if (!BufferIsValid(buffer))
502
elog(ERROR, "bad buffer id: %d", buffer);
504
if (BufferIsLocal(buffer))
506
WriteLocalBuffer(buffer, release);
510
bufHdr = &BufferDescriptors[buffer - 1];
512
Assert(PrivateRefCount[buffer - 1] > 0);
514
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
515
Assert(bufHdr->refcount > 0);
518
* If the buffer was not dirty already, do vacuum cost accounting.
520
if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
521
VacuumCostBalance += VacuumCostPageDirty;
523
bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
526
UnpinBuffer(bufHdr, true);
527
LWLockRelease(BufMgrLock);
533
* Marks buffer contents as dirty (actual write happens later).
535
* Assume that buffer is pinned. Assume that reln is valid.
538
* Pin count is decremented.
541
WriteBuffer(Buffer buffer)
543
write_buffer(buffer, true);
547
* WriteNoReleaseBuffer -- like WriteBuffer, but do not unpin the buffer
548
* when the operation is complete.
551
WriteNoReleaseBuffer(Buffer buffer)
553
write_buffer(buffer, false);
557
* ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
558
* to save a lock release/acquire.
560
* Also, if the passed buffer is valid and already contains the desired block
561
* number, we simply return it without ever acquiring the lock at all.
562
* Since the passed buffer must be pinned, it's OK to examine its block
563
* number without getting the lock first.
565
* Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
566
* buffer actually needs to be released. This case is the same as ReadBuffer,
567
* but can save some tests in the caller.
569
* Also note: while it will work to call this routine with blockNum == P_NEW,
570
* it's best to avoid doing so, since that would result in calling
571
* smgrnblocks() while holding the bufmgr lock, hence some loss of
575
ReleaseAndReadBuffer(Buffer buffer,
577
BlockNumber blockNum)
581
if (BufferIsValid(buffer))
583
if (BufferIsLocal(buffer))
585
Assert(LocalRefCount[-buffer - 1] > 0);
586
bufHdr = &LocalBufferDescriptors[-buffer - 1];
587
if (bufHdr->tag.blockNum == blockNum &&
588
RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node))
590
ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
591
/* owner now has a free slot, so no need for Enlarge() */
592
LocalRefCount[-buffer - 1]--;
596
Assert(PrivateRefCount[buffer - 1] > 0);
597
bufHdr = &BufferDescriptors[buffer - 1];
598
if (bufHdr->tag.blockNum == blockNum &&
599
RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node))
601
ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
602
/* owner now has a free slot, so no need for Enlarge() */
603
if (PrivateRefCount[buffer - 1] > 1)
604
PrivateRefCount[buffer - 1]--;
607
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
608
UnpinBuffer(bufHdr, false);
609
return ReadBufferInternal(relation, blockNum, true);
614
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
616
return ReadBufferInternal(relation, blockNum, false);
620
* PinBuffer -- make buffer unavailable for replacement.
622
* This should be applied only to shared buffers, never local ones.
623
* Bufmgr lock must be held by caller.
625
* Most but not all callers want CurrentResourceOwner to be adjusted.
626
* Note that ResourceOwnerEnlargeBuffers must have been done already.
629
PinBuffer(BufferDesc *buf, bool fixOwner)
631
int b = BufferDescriptorGetBuffer(buf) - 1;
633
if (PrivateRefCount[b] == 0)
635
PrivateRefCount[b]++;
636
Assert(PrivateRefCount[b] > 0);
638
ResourceOwnerRememberBuffer(CurrentResourceOwner,
639
BufferDescriptorGetBuffer(buf));
643
* UnpinBuffer -- make buffer available for replacement.
645
* This should be applied only to shared buffers, never local ones.
646
* Bufmgr lock must be held by caller.
648
* Most but not all callers want CurrentResourceOwner to be adjusted.
651
UnpinBuffer(BufferDesc *buf, bool fixOwner)
653
int b = BufferDescriptorGetBuffer(buf) - 1;
656
ResourceOwnerForgetBuffer(CurrentResourceOwner,
657
BufferDescriptorGetBuffer(buf));
659
Assert(buf->refcount > 0);
660
Assert(PrivateRefCount[b] > 0);
661
PrivateRefCount[b]--;
662
if (PrivateRefCount[b] == 0)
665
/* I'd better not still hold any locks on the buffer */
666
Assert(!LWLockHeldByMe(buf->cntx_lock));
667
Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
670
if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
673
/* we just released the last pin other than the waiter's */
674
buf->flags &= ~BM_PIN_COUNT_WAITER;
675
ProcSendSignal(buf->wait_backend_id);
684
* BufferSync -- Write out dirty buffers in the pool.
686
* This is called at checkpoint time to write out all dirty shared buffers,
687
* and by the background writer process to write out some of the dirty blocks.
688
* percent/maxpages should be -1 in the former case, and limit values (>= 0)
691
* Returns the number of buffers written.
694
BufferSync(int percent, int maxpages)
696
BufferDesc **dirty_buffers;
698
int num_buffer_dirty;
701
/* If either limit is zero then we are disabled from doing anything... */
702
if (percent == 0 || maxpages == 0)
706
* Get a list of all currently dirty buffers and how many there are.
707
* We do not flush buffers that get dirtied after we started. They
708
* have to wait until the next checkpoint.
710
dirty_buffers = (BufferDesc **) palloc(NBuffers * sizeof(BufferDesc *));
711
buftags = (BufferTag *) palloc(NBuffers * sizeof(BufferTag));
713
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
714
num_buffer_dirty = StrategyDirtyBufferList(dirty_buffers, buftags,
718
* If called by the background writer, we are usually asked to only
719
* write out some portion of dirty buffers now, to prevent the IO
720
* storm at checkpoint time.
724
Assert(percent <= 100);
725
num_buffer_dirty = (num_buffer_dirty * percent + 99) / 100;
727
if (maxpages > 0 && num_buffer_dirty > maxpages)
728
num_buffer_dirty = maxpages;
730
/* Make sure we can handle the pin inside the loop */
731
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
734
* Loop over buffers to be written. Note the BufMgrLock is held at
735
* loop top, but is released and reacquired within FlushBuffer, so we
736
* aren't holding it long.
738
for (i = 0; i < num_buffer_dirty; i++)
740
BufferDesc *bufHdr = dirty_buffers[i];
743
* Check it is still the same page and still needs writing.
745
* We can check bufHdr->cntxDirty here *without* holding any lock on
746
* buffer context as long as we set this flag in access methods
747
* *before* logging changes with XLogInsert(): if someone will set
748
* cntxDirty just after our check we don't worry because of our
749
* checkpoint.redo points before log record for upcoming changes
750
* and so we are not required to write such dirty buffer.
752
if (!(bufHdr->flags & BM_VALID))
754
if (!BUFFERTAGS_EQUAL(bufHdr->tag, buftags[i]))
756
if (!(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty))
760
* IO synchronization. Note that we do it with unpinned buffer to
761
* avoid conflicts with FlushRelationBuffers.
763
if (bufHdr->flags & BM_IO_IN_PROGRESS)
766
/* Still need writing? */
767
if (!(bufHdr->flags & BM_VALID))
769
if (!BUFFERTAGS_EQUAL(bufHdr->tag, buftags[i]))
771
if (!(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty))
776
* Here: no one doing IO for this buffer and it's dirty. Pin
777
* buffer now and set IO state for it *before* acquiring shlock to
778
* avoid conflicts with FlushRelationBuffers.
780
PinBuffer(bufHdr, true);
781
StartBufferIO(bufHdr, false);
783
FlushBuffer(bufHdr, NULL, false);
785
TerminateBufferIO(bufHdr, 0);
786
UnpinBuffer(bufHdr, true);
789
LWLockRelease(BufMgrLock);
791
pfree(dirty_buffers);
794
return num_buffer_dirty;
798
* WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
800
* Should be entered with buffer manager lock held; releases it before
801
* waiting and re-acquires it afterwards.
804
WaitIO(BufferDesc *buf)
807
* Changed to wait until there's no IO - Inoue 01/13/2000
809
* Note this is *necessary* because an error abort in the process doing
810
* I/O could release the io_in_progress_lock prematurely. See
813
while ((buf->flags & BM_IO_IN_PROGRESS) != 0)
815
LWLockRelease(BufMgrLock);
816
LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
817
LWLockRelease(buf->io_in_progress_lock);
818
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
824
* Return a palloc'd string containing buffer usage statistics.
827
ShowBufferUsage(void)
833
initStringInfo(&str);
835
if (ReadBufferCount == 0)
838
hitrate = (float) BufferHitCount *100.0 / ReadBufferCount;
840
if (ReadLocalBufferCount == 0)
843
localhitrate = (float) LocalBufferHitCount *100.0 / ReadLocalBufferCount;
845
appendStringInfo(&str,
846
"!\tShared blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
847
ReadBufferCount - BufferHitCount, BufferFlushCount, hitrate);
848
appendStringInfo(&str,
849
"!\tLocal blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
850
ReadLocalBufferCount - LocalBufferHitCount, LocalBufferFlushCount, localhitrate);
851
appendStringInfo(&str,
852
"!\tDirect blocks: %10ld read, %10ld written\n",
853
NDirectFileRead, NDirectFileWrite);
859
ResetBufferUsage(void)
863
BufferFlushCount = 0;
864
LocalBufferHitCount = 0;
865
ReadLocalBufferCount = 0;
866
LocalBufferFlushCount = 0;
868
NDirectFileWrite = 0;
872
* AtEOXact_Buffers - clean up at end of transaction.
874
* As of PostgreSQL 8.0, buffer pins should get released by the
875
* ResourceOwner mechanism. This routine is just a debugging
876
* cross-check that no pins remain.
879
AtEOXact_Buffers(bool isCommit)
881
#ifdef USE_ASSERT_CHECKING
884
for (i = 0; i < NBuffers; i++)
886
Assert(PrivateRefCount[i] == 0);
889
AtEOXact_LocalBuffers(isCommit);
894
* Ensure we have released all shared-buffer locks and pins during backend exit
897
AtProcExit_Buffers(void)
904
for (i = 0; i < NBuffers; i++)
906
if (PrivateRefCount[i] != 0)
908
BufferDesc *buf = &(BufferDescriptors[i]);
911
* We don't worry about updating ResourceOwner; if we even got
912
* here, it suggests that ResourceOwners are messed up.
914
PrivateRefCount[i] = 1; /* make sure we release shared pin */
915
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
916
UnpinBuffer(buf, false);
917
LWLockRelease(BufMgrLock);
918
Assert(PrivateRefCount[i] == 0);
924
* Helper routine to issue warnings when a buffer is unexpectedly pinned
927
PrintBufferLeakWarning(Buffer buffer)
932
Assert(BufferIsValid(buffer));
933
if (BufferIsLocal(buffer))
935
buf = &LocalBufferDescriptors[-buffer - 1];
936
loccount = LocalRefCount[-buffer - 1];
940
buf = &BufferDescriptors[buffer - 1];
941
loccount = PrivateRefCount[buffer - 1];
945
"buffer refcount leak: [%03d] "
946
"(rel=%u/%u/%u, blockNum=%u, flags=0x%x, refcount=%u %d)",
948
buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
949
buf->tag.rnode.relNode,
950
buf->tag.blockNum, buf->flags,
951
buf->refcount, loccount);
957
* Flush all dirty blocks in buffer pool to disk at the checkpoint time.
958
* Local relations do not participate in checkpoints, so they don't need to be
962
FlushBufferPool(void)
970
* Do whatever is needed to prepare for commit at the bufmgr and smgr levels
975
/* Nothing to do in bufmgr anymore... */
981
* BufferGetBlockNumber
982
* Returns the block number associated with a buffer.
985
* Assumes that the buffer is valid and pinned, else the
986
* value may be obsolete immediately...
989
BufferGetBlockNumber(Buffer buffer)
991
Assert(BufferIsPinned(buffer));
993
if (BufferIsLocal(buffer))
994
return LocalBufferDescriptors[-buffer - 1].tag.blockNum;
996
return BufferDescriptors[buffer - 1].tag.blockNum;
1001
* Returns the relation ID (RelFileNode) associated with a buffer.
1003
* This should make the same checks as BufferGetBlockNumber, but since the
1004
* two are generally called together, we don't bother.
1007
BufferGetFileNode(Buffer buffer)
1011
if (BufferIsLocal(buffer))
1012
bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
1014
bufHdr = &BufferDescriptors[buffer - 1];
1016
return (bufHdr->tag.rnode);
1021
* Physically write out a shared buffer.
1023
* NOTE: this actually just passes the buffer contents to the kernel; the
1024
* real write to disk won't happen until the kernel feels like it. This
1025
* is okay from our point of view since we can redo the changes from WAL.
1026
* However, we will need to force the changes to disk via fsync before
1027
* we can checkpoint WAL.
1029
* BufMgrLock must be held at entry, and the buffer must be pinned. The
1030
* caller is also responsible for doing StartBufferIO/TerminateBufferIO.
1032
* If the caller has an smgr reference for the buffer's relation, pass it
1033
* as the second parameter. If not, pass NULL. (Do not open relation
1034
* while holding BufMgrLock!)
1036
* When earlylock is TRUE, we grab the per-buffer sharelock before releasing
1037
* BufMgrLock, rather than after. Normally this would be a bad idea since
1038
* we might deadlock, but it is safe and necessary when called from
1039
* BufferAlloc() --- see comments therein.
1042
FlushBuffer(BufferDesc *buf, SMgrRelation reln, bool earlylock)
1044
Buffer buffer = BufferDescriptorGetBuffer(buf);
1046
ErrorContextCallback errcontext;
1048
/* Transpose cntxDirty into flags while holding BufMgrLock */
1049
buf->cntxDirty = false;
1050
buf->flags |= BM_DIRTY;
1052
/* To check if block content changed while flushing. - vadim 01/17/97 */
1053
buf->flags &= ~BM_JUST_DIRTIED;
1056
* If earlylock, grab buffer sharelock before anyone else could re-lock
1060
LockBuffer(buffer, BUFFER_LOCK_SHARE);
1062
/* Release BufMgrLock while doing xlog work */
1063
LWLockRelease(BufMgrLock);
1065
/* Setup error traceback support for ereport() */
1066
errcontext.callback = buffer_write_error_callback;
1067
errcontext.arg = buf;
1068
errcontext.previous = error_context_stack;
1069
error_context_stack = &errcontext;
1071
/* Find smgr relation for buffer while holding minimal locks */
1073
reln = smgropen(buf->tag.rnode);
1076
* Protect buffer content against concurrent update. (Note that
1077
* hint-bit updates can still occur while the write is in progress,
1078
* but we assume that that will not invalidate the data written.)
1081
LockBuffer(buffer, BUFFER_LOCK_SHARE);
1084
* Force XLOG flush for buffer' LSN. This implements the basic WAL
1085
* rule that log updates must hit disk before any of the data-file
1086
* changes they describe do.
1088
recptr = BufferGetLSN(buf);
1092
* Now it's safe to write buffer to disk. Note that no one else should
1093
* have been able to write it while we were busy with locking and log
1094
* flushing because caller has set the IO flag.
1096
* It would be better to clear BM_JUST_DIRTIED right here, but we'd have
1097
* to reacquire the BufMgrLock and it doesn't seem worth it.
1101
(char *) MAKE_PTR(buf->data),
1104
/* Pop the error context stack */
1105
error_context_stack = errcontext.previous;
1108
* Release the per-buffer readlock, reacquire BufMgrLock.
1110
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1112
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
1117
* If this buffer was marked by someone as DIRTY while we were
1118
* flushing it out we must not clear DIRTY flag - vadim 01/17/97
1120
if (!(buf->flags & BM_JUST_DIRTIED))
1121
buf->flags &= ~BM_DIRTY;
1125
* RelationGetNumberOfBlocks
1126
* Determines the current number of pages in the relation.
1129
RelationGetNumberOfBlocks(Relation relation)
1131
/* Open it at the smgr level if not already done */
1132
RelationOpenSmgr(relation);
1134
return smgrnblocks(relation->rd_smgr);
1139
* Physically truncate a relation to the specified number of blocks.
1141
* Caller should already have done something to flush any buffered pages
1142
* that are to be dropped.
1145
RelationTruncate(Relation rel, BlockNumber nblocks)
1147
/* Open it at the smgr level if not already done */
1148
RelationOpenSmgr(rel);
1150
/* Make sure rd_targblock isn't pointing somewhere past end */
1151
rel->rd_targblock = InvalidBlockNumber;
1153
/* Do the real work */
1154
smgrtruncate(rel->rd_smgr, nblocks, rel->rd_istemp);
1157
/* ---------------------------------------------------------------------
1158
* DropRelationBuffers
1160
* This function removes all the buffered pages for a relation
1161
* from the buffer pool. Dirty pages are simply dropped, without
1162
* bothering to write them out first. This is NOT rollback-able,
1163
* and so should be used only with extreme caution!
1165
* There is no particularly good reason why this doesn't have a
1166
* firstDelBlock parameter, except that current callers don't need it.
1168
* We assume that the caller holds an exclusive lock on the relation,
1169
* which should assure that no new buffers will be acquired for the rel
1171
* --------------------------------------------------------------------
1174
DropRelationBuffers(Relation rel)
1176
DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp, 0);
1179
/* ---------------------------------------------------------------------
1180
* DropRelFileNodeBuffers
1182
* This is the same as DropRelationBuffers, except that the target
1183
* relation is specified by RelFileNode and temp status, and one
1184
* may specify the first block to drop.
1186
* This is NOT rollback-able. One legitimate use is to clear the
1187
* buffer cache of buffers for a relation that is being deleted
1188
* during transaction abort.
1189
* --------------------------------------------------------------------
1192
DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
1193
BlockNumber firstDelBlock)
1200
for (i = 0; i < NLocBuffer; i++)
1202
bufHdr = &LocalBufferDescriptors[i];
1203
if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
1204
bufHdr->tag.blockNum >= firstDelBlock)
1206
if (LocalRefCount[i] != 0)
1207
elog(ERROR, "block %u of %u/%u/%u is still referenced (local %u)",
1208
bufHdr->tag.blockNum,
1209
bufHdr->tag.rnode.spcNode,
1210
bufHdr->tag.rnode.dbNode,
1211
bufHdr->tag.rnode.relNode,
1213
bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
1214
bufHdr->cntxDirty = false;
1215
bufHdr->tag.rnode.relNode = InvalidOid;
1221
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
1223
for (i = 1; i <= NBuffers; i++)
1225
bufHdr = &BufferDescriptors[i - 1];
1227
if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
1228
bufHdr->tag.blockNum >= firstDelBlock)
1231
* If there is I/O in progress, better wait till it's done;
1232
* don't want to delete the relation out from under someone
1233
* who's just trying to flush the buffer!
1235
if (bufHdr->flags & BM_IO_IN_PROGRESS)
1240
* By now, the buffer very possibly belongs to some other
1241
* rel, so check again before proceeding.
1247
* There should be no pin on the buffer.
1249
if (bufHdr->refcount != 0)
1250
elog(ERROR, "block %u of %u/%u/%u is still referenced (private %d, global %u)",
1251
bufHdr->tag.blockNum,
1252
bufHdr->tag.rnode.spcNode,
1253
bufHdr->tag.rnode.dbNode,
1254
bufHdr->tag.rnode.relNode,
1255
PrivateRefCount[i - 1], bufHdr->refcount);
1257
/* Now we can do what we came for */
1258
bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
1259
bufHdr->cntxDirty = false;
1262
* And mark the buffer as no longer occupied by this rel.
1264
StrategyInvalidateBuffer(bufHdr);
1268
LWLockRelease(BufMgrLock);
1271
/* ---------------------------------------------------------------------
1274
* This function removes all the buffers in the buffer cache for a
1275
* particular database. Dirty pages are simply dropped, without
1276
* bothering to write them out first. This is used when we destroy a
1277
* database, to avoid trying to flush data to disk when the directory
1278
* tree no longer exists. Implementation is pretty similar to
1279
* DropRelationBuffers() which is for destroying just one relation.
1280
* --------------------------------------------------------------------
1283
DropBuffers(Oid dbid)
1288
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
1290
for (i = 1; i <= NBuffers; i++)
1292
bufHdr = &BufferDescriptors[i - 1];
1294
if (bufHdr->tag.rnode.dbNode == dbid)
1297
* If there is I/O in progress, better wait till it's done;
1298
* don't want to delete the database out from under someone
1299
* who's just trying to flush the buffer!
1301
if (bufHdr->flags & BM_IO_IN_PROGRESS)
1306
* By now, the buffer very possibly belongs to some other
1307
* DB, so check again before proceeding.
1311
/* Now we can do what we came for */
1312
bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
1313
bufHdr->cntxDirty = false;
1316
* The thing should be free, if caller has checked that no
1317
* backends are running in that database.
1319
Assert(bufHdr->refcount == 0);
1322
* And mark the buffer as no longer occupied by this page.
1324
StrategyInvalidateBuffer(bufHdr);
1328
LWLockRelease(BufMgrLock);
1331
/* -----------------------------------------------------------------
1334
* this function prints all the buffer descriptors, for debugging
1336
* -----------------------------------------------------------------
1340
PrintBufferDescs(void)
1343
BufferDesc *buf = BufferDescriptors;
1345
if (IsUnderPostmaster)
1347
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
1348
for (i = 0; i < NBuffers; ++i, ++buf)
1351
"[%02d] (freeNext=%d, freePrev=%d, rel=%u/%u/%u, "
1352
"blockNum=%u, flags=0x%x, refcount=%u %d)",
1353
i, buf->freeNext, buf->freePrev,
1354
buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
1355
buf->tag.rnode.relNode,
1356
buf->tag.blockNum, buf->flags,
1357
buf->refcount, PrivateRefCount[i]);
1359
LWLockRelease(BufMgrLock);
1363
/* interactive backend */
1364
for (i = 0; i < NBuffers; ++i, ++buf)
1366
printf("[%-2d] (%u/%u/%u, %u) flags=0x%x, refcount=%u %d)\n",
1367
i, buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
1368
buf->tag.rnode.relNode, buf->tag.blockNum,
1369
buf->flags, buf->refcount, PrivateRefCount[i]);
1377
PrintPinnedBufs(void)
1380
BufferDesc *buf = BufferDescriptors;
1382
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
1383
for (i = 0; i < NBuffers; ++i, ++buf)
1385
if (PrivateRefCount[i] > 0)
1387
"[%02d] (freeNext=%d, freePrev=%d, rel=%u/%u/%u, "
1388
"blockNum=%u, flags=0x%x, refcount=%u %d)",
1389
i, buf->freeNext, buf->freePrev,
1390
buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
1391
buf->tag.rnode.relNode,
1392
buf->tag.blockNum, buf->flags,
1393
buf->refcount, PrivateRefCount[i]);
1395
LWLockRelease(BufMgrLock);
1399
/* ---------------------------------------------------------------------
1400
* FlushRelationBuffers
1402
* This function writes all dirty pages of a relation out to disk.
1403
* Furthermore, pages that have blocknumber >= firstDelBlock are
1404
* actually removed from the buffer pool.
1406
* This is called by DROP TABLE to clear buffers for the relation
1407
* from the buffer pool. Note that we must write dirty buffers,
1408
* rather than just dropping the changes, because our transaction
1409
* might abort later on; we want to roll back safely in that case.
1411
* This is also called by VACUUM before truncating the relation to the
1412
* given number of blocks. It might seem unnecessary for VACUUM to
1413
* write dirty pages before firstDelBlock, since VACUUM should already
1414
* have committed its changes. However, it is possible for there still
1415
* to be dirty pages: if some page had unwritten on-row tuple status
1416
* updates from a prior transaction, and VACUUM had no additional
1417
* changes to make to that page, then VACUUM won't have written it.
1418
* This is harmless in most cases but will break pg_upgrade, which
1419
* relies on VACUUM to ensure that *all* tuples have correct on-row
1420
* status. So, we check and flush all dirty pages of the rel
1421
* regardless of block number.
1423
* In all cases, the caller should be holding AccessExclusiveLock on
1424
* the target relation to ensure that no other backend is busy reading
1425
* more blocks of the relation (or might do so before we commit).
1426
* This should also ensure that no one is busy dirtying these blocks.
1428
* Formerly, we considered it an error condition if we found dirty
1429
* buffers here. However, since BufferSync no longer forces out all
1430
* dirty buffers at every xact commit, it's possible for dirty buffers
1431
* to still be present in the cache due to failure of an earlier
1432
* transaction. So, must flush dirty buffers without complaint.
1434
* XXX currently it sequentially searches the buffer pool, should be
1435
* changed to more clever ways of searching.
1436
* --------------------------------------------------------------------
1439
FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
1444
/* Open rel at the smgr level if not already done */
1445
RelationOpenSmgr(rel);
1449
for (i = 0; i < NLocBuffer; i++)
1451
bufHdr = &LocalBufferDescriptors[i];
1452
if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
1454
if ((bufHdr->flags & BM_VALID) &&
1455
(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty))
1457
ErrorContextCallback errcontext;
1459
/* Setup error traceback support for ereport() */
1460
errcontext.callback = buffer_write_error_callback;
1461
errcontext.arg = bufHdr;
1462
errcontext.previous = error_context_stack;
1463
error_context_stack = &errcontext;
1465
smgrwrite(rel->rd_smgr,
1466
bufHdr->tag.blockNum,
1467
(char *) MAKE_PTR(bufHdr->data),
1470
bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
1471
bufHdr->cntxDirty = false;
1473
/* Pop the error context stack */
1474
error_context_stack = errcontext.previous;
1476
if (LocalRefCount[i] > 0)
1477
elog(ERROR, "FlushRelationBuffers(\"%s\" (local), %u): block %u is referenced (%d)",
1478
RelationGetRelationName(rel), firstDelBlock,
1479
bufHdr->tag.blockNum, LocalRefCount[i]);
1480
if (bufHdr->tag.blockNum >= firstDelBlock)
1481
bufHdr->tag.rnode.relNode = InvalidOid;
1488
/* Make sure we can handle the pin inside the loop */
1489
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1491
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
1493
for (i = 0; i < NBuffers; i++)
1495
bufHdr = &BufferDescriptors[i];
1496
if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
1498
if ((bufHdr->flags & BM_VALID) &&
1499
(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty))
1501
PinBuffer(bufHdr, true);
1502
/* Someone else might be flushing buffer */
1503
if (bufHdr->flags & BM_IO_IN_PROGRESS)
1506
if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty)
1508
StartBufferIO(bufHdr, false);
1510
FlushBuffer(bufHdr, rel->rd_smgr, false);
1512
TerminateBufferIO(bufHdr, 0);
1514
UnpinBuffer(bufHdr, true);
1515
if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty)
1516
elog(ERROR, "FlushRelationBuffers(\"%s\", %u): block %u was re-dirtied",
1517
RelationGetRelationName(rel), firstDelBlock,
1518
bufHdr->tag.blockNum);
1520
if (bufHdr->refcount != 0)
1521
elog(ERROR, "FlushRelationBuffers(\"%s\", %u): block %u is referenced (private %d, global %u)",
1522
RelationGetRelationName(rel), firstDelBlock,
1523
bufHdr->tag.blockNum,
1524
PrivateRefCount[i], bufHdr->refcount);
1525
if (bufHdr->tag.blockNum >= firstDelBlock)
1526
StrategyInvalidateBuffer(bufHdr);
1530
LWLockRelease(BufMgrLock);
1534
* ReleaseBuffer -- remove the pin on a buffer without
1538
ReleaseBuffer(Buffer buffer)
1542
if (!BufferIsValid(buffer))
1543
elog(ERROR, "bad buffer id: %d", buffer);
1545
ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
1547
if (BufferIsLocal(buffer))
1549
Assert(LocalRefCount[-buffer - 1] > 0);
1550
LocalRefCount[-buffer - 1]--;
1554
bufHdr = &BufferDescriptors[buffer - 1];
1556
Assert(PrivateRefCount[buffer - 1] > 0);
1558
if (PrivateRefCount[buffer - 1] > 1)
1559
PrivateRefCount[buffer - 1]--;
1562
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
1563
UnpinBuffer(bufHdr, false);
1564
LWLockRelease(BufMgrLock);
1569
* IncrBufferRefCount
1570
* Increment the pin count on a buffer that we have *already* pinned
1573
* This function cannot be used on a buffer we do not have pinned,
1574
* because it doesn't change the shared buffer state.
1577
IncrBufferRefCount(Buffer buffer)
1579
Assert(BufferIsPinned(buffer));
1580
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1581
ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
1582
if (BufferIsLocal(buffer))
1583
LocalRefCount[-buffer - 1]++;
1585
PrivateRefCount[buffer - 1]++;
1590
#ifdef NOT_USED
/*
 * IncrBufferRefCount_Debug -- IncrBufferRefCount plus optional pin tracing.
 *
 * NOTE(review): reconstructed debug-only code; is_userbuffer() is only
 * available in the NOT_USED debug build -- confirm guard matches file.
 */
void
IncrBufferRefCount_Debug(char *file, int line, Buffer buffer)
{
	IncrBufferRefCount(buffer);
	if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer))
	{
		BufferDesc *buf = &BufferDescriptors[buffer - 1];

		fprintf(stderr,
				"PIN(Incr) %d rel = %u/%u/%u, blockNum = %u, "
				"refcount = %d, file: %s, line: %d\n",
				buffer,
				buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
				buf->tag.rnode.relNode, buf->tag.blockNum,
				PrivateRefCount[buffer - 1], file, line);
	}
}
#endif
#ifdef NOT_USED
/*
 * ReleaseBuffer_Debug -- ReleaseBuffer plus optional unpin tracing.
 *
 * NOTE(review): reconstructed debug-only code; is_userbuffer() is only
 * available in the NOT_USED debug build -- confirm guard matches file.
 */
void
ReleaseBuffer_Debug(char *file, int line, Buffer buffer)
{
	ReleaseBuffer(buffer);
	if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer))
	{
		BufferDesc *buf = &BufferDescriptors[buffer - 1];

		fprintf(stderr,
				"UNPIN(Rel) %d rel = %u/%u/%u, blockNum = %u, "
				"refcount = %d, file: %s, line: %d\n",
				buffer,
				buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
				buf->tag.rnode.relNode, buf->tag.blockNum,
				PrivateRefCount[buffer - 1], file, line);
	}
}
#endif
#ifdef NOT_USED
/*
 * ReleaseAndReadBuffer_Debug -- ReleaseAndReadBuffer plus optional tracing
 * of both the unpin of the old buffer and the pin of the new one.
 *
 * NOTE(review): reconstructed debug-only code; is_userbuffer() is only
 * available in the NOT_USED debug build -- confirm guard matches file.
 */
Buffer
ReleaseAndReadBuffer_Debug(char *file,
						   int line,
						   Buffer buffer,
						   Relation relation,
						   BlockNumber blockNum)
{
	bool		bufferValid;
	Buffer		b;

	bufferValid = BufferIsValid(buffer);
	b = ReleaseAndReadBuffer(buffer, relation, blockNum);
	if (ShowPinTrace && bufferValid && BufferIsLocal(buffer)
		&& is_userbuffer(buffer))
	{
		BufferDesc *buf = &BufferDescriptors[buffer - 1];

		fprintf(stderr,
				"UNPIN(Rel&Rd) %d rel = %u/%u/%u, blockNum = %u, "
				"refcount = %d, file: %s, line: %d\n",
				buffer,
				buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
				buf->tag.rnode.relNode, buf->tag.blockNum,
				PrivateRefCount[buffer - 1], file, line);
	}
	if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer))
	{
		BufferDesc *buf = &BufferDescriptors[b - 1];

		fprintf(stderr,
				"PIN(Rel&Rd) %d rel = %u/%u/%u, blockNum = %u, "
				"refcount = %d, file: %s, line: %d\n",
				b,
				buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
				buf->tag.rnode.relNode, buf->tag.blockNum,
				PrivateRefCount[b - 1], file, line);
	}
	return b;
}
#endif
* SetBufferCommitInfoNeedsSave
1673
* Mark a buffer dirty when we have updated tuple commit-status bits in it.
1675
* This is essentially the same as WriteNoReleaseBuffer. We preserve the
1676
* distinction as a way of documenting that the caller has not made a critical
1677
* data change --- the status-bit update could be redone by someone else just
1678
* as easily. Therefore, no WAL log record need be generated, whereas calls
1679
* to WriteNoReleaseBuffer really ought to be associated with a WAL-entry-
1682
* This routine might get called many times on the same page, if we are making
1683
* the first scan after commit of an xact that added/deleted many tuples.
1684
* So, be as quick as we can if the buffer is already dirty. We do this by
1685
* not acquiring BufMgrLock if it looks like the status bits are already OK.
1686
* (Note it is okay if someone else clears BM_JUST_DIRTIED immediately after
1687
* we look, because the buffer content update is already done and will be
1688
* reflected in the I/O.)
1691
SetBufferCommitInfoNeedsSave(Buffer buffer)
1695
if (!BufferIsValid(buffer))
1696
elog(ERROR, "bad buffer id: %d", buffer);
1698
if (BufferIsLocal(buffer))
1700
WriteLocalBuffer(buffer, false);
1704
bufHdr = &BufferDescriptors[buffer - 1];
1706
if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
1707
(BM_DIRTY | BM_JUST_DIRTIED))
1709
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
1710
Assert(bufHdr->refcount > 0);
1711
bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
1712
LWLockRelease(BufMgrLock);
1717
* Release buffer context locks for shared buffers.
1719
* Used to clean up after errors.
1721
* Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
1722
* of releasing buffer context locks per se; the only thing we need to deal
1723
* with here is clearing any PIN_COUNT request that was in progress.
1728
BufferDesc *buf = PinCountWaitBuf;
1732
HOLD_INTERRUPTS(); /* don't want to die() partway through... */
1734
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
1737
* Don't complain if flag bit not set; it could have been
1738
* reset but we got a cancel/die interrupt before getting the
1741
if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
1742
buf->wait_backend_id == MyBackendId)
1743
buf->flags &= ~BM_PIN_COUNT_WAITER;
1744
LWLockRelease(BufMgrLock);
1746
ProcCancelWaitForSignal();
1748
RESUME_INTERRUPTS();
1751
PinCountWaitBuf = NULL;
1755
* Acquire or release the cntx_lock for the buffer.
1758
LockBuffer(Buffer buffer, int mode)
1762
Assert(BufferIsValid(buffer));
1763
if (BufferIsLocal(buffer))
1766
buf = &(BufferDescriptors[buffer - 1]);
1768
if (mode == BUFFER_LOCK_UNLOCK)
1769
LWLockRelease(buf->cntx_lock);
1770
else if (mode == BUFFER_LOCK_SHARE)
1771
LWLockAcquire(buf->cntx_lock, LW_SHARED);
1772
else if (mode == BUFFER_LOCK_EXCLUSIVE)
1774
LWLockAcquire(buf->cntx_lock, LW_EXCLUSIVE);
1777
* This is not the best place to set cntxDirty flag (eg indices do
1778
* not always change buffer they lock in excl mode). But please
1779
* remember that it's critical to set cntxDirty *before* logging
1780
* changes with XLogInsert() - see comments in BufferSync().
1782
buf->cntxDirty = true;
1785
elog(ERROR, "unrecognized buffer lock mode: %d", mode);
1789
* Acquire the cntx_lock for the buffer, but only if we don't have to wait.
1791
* This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
1794
ConditionalLockBuffer(Buffer buffer)
1798
Assert(BufferIsValid(buffer));
1799
if (BufferIsLocal(buffer))
1800
return true; /* act as though we got it */
1802
buf = &(BufferDescriptors[buffer - 1]);
1804
if (LWLockConditionalAcquire(buf->cntx_lock, LW_EXCLUSIVE))
1807
* This is not the best place to set cntxDirty flag (eg indices do
1808
* not always change buffer they lock in excl mode). But please
1809
* remember that it's critical to set cntxDirty *before* logging
1810
* changes with XLogInsert() - see comments in BufferSync().
1812
buf->cntxDirty = true;
1820
* LockBufferForCleanup - lock a buffer in preparation for deleting items
1822
* Items may be deleted from a disk page only when the caller (a) holds an
1823
* exclusive lock on the buffer and (b) has observed that no other backend
1824
* holds a pin on the buffer. If there is a pin, then the other backend
1825
* might have a pointer into the buffer (for example, a heapscan reference
1826
* to an item --- see README for more details). It's OK if a pin is added
1827
* after the cleanup starts, however; the newly-arrived backend will be
1828
* unable to look at the page until we release the exclusive lock.
1830
* To implement this protocol, a would-be deleter must pin the buffer and
1831
* then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
1832
* LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
1833
* it has successfully observed pin count = 1.
1836
LockBufferForCleanup(Buffer buffer)
1840
Assert(BufferIsValid(buffer));
1841
Assert(PinCountWaitBuf == NULL);
1843
if (BufferIsLocal(buffer))
1845
/* There should be exactly one pin */
1846
if (LocalRefCount[-buffer - 1] != 1)
1847
elog(ERROR, "incorrect local pin count: %d",
1848
LocalRefCount[-buffer - 1]);
1849
/* Nobody else to wait for */
1853
/* There should be exactly one local pin */
1854
if (PrivateRefCount[buffer - 1] != 1)
1855
elog(ERROR, "incorrect local pin count: %d",
1856
PrivateRefCount[buffer - 1]);
1858
bufHdr = &BufferDescriptors[buffer - 1];
1862
/* Try to acquire lock */
1863
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
1864
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
1865
Assert(bufHdr->refcount > 0);
1866
if (bufHdr->refcount == 1)
1868
/* Successfully acquired exclusive lock with pincount 1 */
1869
LWLockRelease(BufMgrLock);
1872
/* Failed, so mark myself as waiting for pincount 1 */
1873
if (bufHdr->flags & BM_PIN_COUNT_WAITER)
1875
LWLockRelease(BufMgrLock);
1876
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1877
elog(ERROR, "multiple backends attempting to wait for pincount 1");
1879
bufHdr->wait_backend_id = MyBackendId;
1880
bufHdr->flags |= BM_PIN_COUNT_WAITER;
1881
PinCountWaitBuf = bufHdr;
1882
LWLockRelease(BufMgrLock);
1883
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1884
/* Wait to be signaled by UnpinBuffer() */
1885
ProcWaitForSignal();
1886
PinCountWaitBuf = NULL;
1887
/* Loop back and try again */
1892
* Functions for IO error handling
1894
* Note: We assume that nested buffer IO never occurs.
1895
* i.e at most one io_in_progress lock is held per proc.
1899
* Function:StartBufferIO
1901
* My process is executing no IO
1902
* BufMgrLock is held
1903
* BM_IO_IN_PROGRESS mask is not set for the buffer
1904
* The buffer is Pinned
1906
* Because BufMgrLock is held, we are already in an interrupt holdoff here,
1907
* and do not need another.
1910
StartBufferIO(BufferDesc *buf, bool forInput)
1912
Assert(!InProgressBuf);
1913
Assert(!(buf->flags & BM_IO_IN_PROGRESS));
1914
buf->flags |= BM_IO_IN_PROGRESS;
1916
LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
1918
InProgressBuf = buf;
1919
IsForInput = forInput;
1923
* Function:TerminateBufferIO
1925
* My process is executing IO for the buffer
1926
* BufMgrLock is held
1927
* BM_IO_IN_PROGRESS mask is set for the buffer
1928
* The buffer is Pinned
1930
* err_flag must be 0 for successful completion and BM_IO_ERROR for failure.
1932
* Because BufMgrLock is held, we are already in an interrupt holdoff here,
1933
* and do not need another.
1936
TerminateBufferIO(BufferDesc *buf, int err_flag)
1938
Assert(buf == InProgressBuf);
1939
Assert(buf->flags & BM_IO_IN_PROGRESS);
1940
buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
1941
buf->flags |= err_flag;
1943
LWLockRelease(buf->io_in_progress_lock);
1945
InProgressBuf = NULL;
1949
* Function:ContinueBufferIO
1951
* My process is executing IO for the buffer
1952
* BufMgrLock is held
1953
* The buffer is Pinned
1955
* Because BufMgrLock is held, we are already in an interrupt holdoff here,
1956
* and do not need another.
1959
ContinueBufferIO(BufferDesc *buf, bool forInput)
1961
Assert(buf == InProgressBuf);
1962
Assert(buf->flags & BM_IO_IN_PROGRESS);
1963
IsForInput = forInput;
1970
#ifdef NOT_USED
/*
 * Reset the local I/O-in-progress bookkeeping.
 *
 * NOTE(review): reconstructed as the historical NOT_USED helper;
 * confirm guard matches file.
 */
void
InitBufferIO(void)
{
	InProgressBuf = NULL;
}
#endif
* Clean up any active buffer I/O after an error.
1976
* BufMgrLock isn't held when this function is called,
1977
* but we haven't yet released buffer pins, so the buffer is still pinned.
1979
* If I/O was in progress, we always set BM_IO_ERROR.
1984
BufferDesc *buf = InProgressBuf;
1989
* Since LWLockReleaseAll has already been called, we're not
1990
* holding the buffer's io_in_progress_lock. We have to re-acquire
1991
* it so that we can use TerminateBufferIO. Anyone who's executing
1992
* WaitIO on the buffer will be in a busy spin until we succeed in
1995
LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
1997
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
1998
Assert(buf->flags & BM_IO_IN_PROGRESS);
2001
Assert(!(buf->flags & BM_DIRTY || buf->cntxDirty));
2002
/* We'd better not think buffer is valid yet */
2003
Assert(!(buf->flags & BM_VALID));
2007
Assert(buf->flags & BM_DIRTY || buf->cntxDirty);
2008
/* Issue notice if this is not the first failure... */
2009
if (buf->flags & BM_IO_ERROR)
2012
(errcode(ERRCODE_IO_ERROR),
2013
errmsg("could not write block %u of %u/%u/%u",
2015
buf->tag.rnode.spcNode,
2016
buf->tag.rnode.dbNode,
2017
buf->tag.rnode.relNode),
2018
errdetail("Multiple failures --- write error may be permanent.")));
2020
buf->flags |= BM_DIRTY;
2022
TerminateBufferIO(buf, BM_IO_ERROR);
2023
LWLockRelease(BufMgrLock);
2028
* Error context callback for errors occurring during buffer writes.
2031
buffer_write_error_callback(void *arg)
2033
BufferDesc *bufHdr = (BufferDesc *) arg;
2036
errcontext("writing block %u of relation %u/%u/%u",
2037
bufHdr->tag.blockNum,
2038
bufHdr->tag.rnode.spcNode,
2039
bufHdr->tag.rnode.dbNode,
2040
bufHdr->tag.rnode.relNode);