~vcs-imports/mammoth-replicator/trunk

« back to all changes in this revision

Viewing changes to src/backend/storage/buffer/bufmgr.c

  • Committer: alvherre
  • Date: 2005-12-16 21:24:52 UTC
  • Revision ID: svn-v4:db760fc0-0f08-0410-9d63-cc6633f64896:trunk:1
Initial import of the REL8_0_3 sources from the Pgsql CVS repository.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*-------------------------------------------------------------------------
 
2
 *
 
3
 * bufmgr.c
 
4
 *        buffer manager interface routines
 
5
 *
 
6
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 
7
 * Portions Copyright (c) 1994, Regents of the University of California
 
8
 *
 
9
 *
 
10
 * IDENTIFICATION
 
11
 *        $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.185 2005-01-10 20:02:21 tgl Exp $
 
12
 *
 
13
 *-------------------------------------------------------------------------
 
14
 */
 
15
/*
 
16
 * ReadBuffer() -- find or create a buffer holding the requested page,
 
17
 *              and pin it so that no one can destroy it while this process
 
18
 *              is using it.
 
19
 *
 
20
 * ReleaseBuffer() -- unpin the buffer
 
21
 *
 
22
 * WriteNoReleaseBuffer() -- mark the buffer contents as "dirty"
 
23
 *              but don't unpin.  The disk IO is delayed until buffer
 
24
 *              replacement.
 
25
 *
 
26
 * WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer()
 
27
 *
 
28
 * BufferSync() -- flush all (or some) dirty buffers in the buffer pool.
 
29
 *
 
30
 * InitBufferPool() -- Init the buffer module.
 
31
 *
 
32
 * See other files:
 
33
 *              freelist.c -- chooses victim for buffer replacement
 
34
 *              buf_table.c -- manages the buffer lookup table
 
35
 */
 
36
#include "postgres.h"
 
37
 
 
38
#include <sys/file.h>
 
39
#include <unistd.h>
 
40
 
 
41
#include "lib/stringinfo.h"
 
42
#include "miscadmin.h"
 
43
#include "storage/buf_internals.h"
 
44
#include "storage/bufmgr.h"
 
45
#include "storage/bufpage.h"
 
46
#include "storage/proc.h"
 
47
#include "storage/smgr.h"
 
48
#include "utils/relcache.h"
 
49
#include "utils/resowner.h"
 
50
#include "pgstat.h"
 
51
 
 
52
 
 
53
#define BufferGetLSN(bufHdr)    \
 
54
        (*((XLogRecPtr*) MAKE_PTR((bufHdr)->data)))
 
55
 
 
56
 
 
57
/* GUC variable */
 
58
bool            zero_damaged_pages = false;
 
59
 
 
60
#ifdef NOT_USED
 
61
bool            ShowPinTrace = false;
 
62
#endif
 
63
 
 
64
long            NDirectFileRead;        /* some I/O's are direct file access.
 
65
                                                                 * bypass bufmgr */
 
66
long            NDirectFileWrite;       /* e.g., I/O in psort and hashjoin. */
 
67
 
 
68
 
 
69
/* local state for StartBufferIO and related functions */
 
70
static BufferDesc *InProgressBuf = NULL;
 
71
static bool IsForInput;
 
72
/* local state for LockBufferForCleanup */
 
73
static BufferDesc *PinCountWaitBuf = NULL;
 
74
 
 
75
 
 
76
static void PinBuffer(BufferDesc *buf, bool fixOwner);
 
77
static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
 
78
static void WaitIO(BufferDesc *buf);
 
79
static void StartBufferIO(BufferDesc *buf, bool forInput);
 
80
static void TerminateBufferIO(BufferDesc *buf, int err_flag);
 
81
static void ContinueBufferIO(BufferDesc *buf, bool forInput);
 
82
static void buffer_write_error_callback(void *arg);
 
83
static Buffer ReadBufferInternal(Relation reln, BlockNumber blockNum,
 
84
                                   bool bufferLockHeld);
 
85
static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
 
86
                        bool *foundPtr);
 
87
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, bool earlylock);
 
88
static void write_buffer(Buffer buffer, bool unpin);
 
89
 
 
90
 
 
91
/*
 
92
 * ReadBuffer -- returns a buffer containing the requested
 
93
 *              block of the requested relation.  If the blknum
 
94
 *              requested is P_NEW, extend the relation file and
 
95
 *              allocate a new block.  (Caller is responsible for
 
96
 *              ensuring that only one backend tries to extend a
 
97
 *              relation at the same time!)
 
98
 *
 
99
 * Returns: the buffer number for the buffer containing
 
100
 *              the block read.  The returned buffer has been pinned.
 
101
 *              Does not return on error --- elog's instead.
 
102
 *
 
103
 * Assume when this function is called, that reln has been
 
104
 *              opened already.
 
105
 */
 
106
Buffer
 
107
ReadBuffer(Relation reln, BlockNumber blockNum)
 
108
{
 
109
        ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
 
110
        return ReadBufferInternal(reln, blockNum, false);
 
111
}
 
112
 
 
113
/*
 
114
 * ReadBufferInternal -- internal version of ReadBuffer with more options
 
115
 *
 
116
 * bufferLockHeld: if true, caller already acquired the bufmgr lock.
 
117
 * (This is assumed never to be true if dealing with a local buffer!)
 
118
 *
 
119
 * The caller must have done ResourceOwnerEnlargeBuffers(CurrentResourceOwner)
 
120
 */
 
121
static Buffer
 
122
ReadBufferInternal(Relation reln, BlockNumber blockNum,
 
123
                                   bool bufferLockHeld)
 
124
{
 
125
        BufferDesc *bufHdr;
 
126
        bool            found;
 
127
        bool            isExtend;
 
128
        bool            isLocalBuf;
 
129
 
 
130
        isExtend = (blockNum == P_NEW);
 
131
        isLocalBuf = reln->rd_istemp;
 
132
 
 
133
        /* Open it at the smgr level if not already done */
 
134
        RelationOpenSmgr(reln);
 
135
 
 
136
        /* Substitute proper block number if caller asked for P_NEW */
 
137
        if (isExtend)
 
138
                blockNum = smgrnblocks(reln->rd_smgr);
 
139
 
 
140
        if (isLocalBuf)
 
141
        {
 
142
                ReadLocalBufferCount++;
 
143
                pgstat_count_buffer_read(&reln->pgstat_info, reln);
 
144
                bufHdr = LocalBufferAlloc(reln, blockNum, &found);
 
145
                if (found)
 
146
                        LocalBufferHitCount++;
 
147
        }
 
148
        else
 
149
        {
 
150
                ReadBufferCount++;
 
151
                pgstat_count_buffer_read(&reln->pgstat_info, reln);
 
152
 
 
153
                /*
 
154
                 * lookup the buffer.  IO_IN_PROGRESS is set if the requested
 
155
                 * block is not currently in memory.
 
156
                 */
 
157
                if (!bufferLockHeld)
 
158
                        LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
 
159
                bufHdr = BufferAlloc(reln, blockNum, &found);
 
160
                if (found)
 
161
                        BufferHitCount++;
 
162
        }
 
163
 
 
164
        /* At this point we do NOT hold the bufmgr lock. */
 
165
 
 
166
        /* if it was already in the buffer pool, we're done */
 
167
        if (found)
 
168
        {
 
169
                /* Just need to update stats before we exit */
 
170
                pgstat_count_buffer_hit(&reln->pgstat_info, reln);
 
171
 
 
172
                if (VacuumCostActive)
 
173
                        VacuumCostBalance += VacuumCostPageHit;
 
174
 
 
175
                return BufferDescriptorGetBuffer(bufHdr);
 
176
        }
 
177
 
 
178
        /*
 
179
         * if we have gotten to this point, we have allocated a buffer for the
 
180
         * page but its contents are not yet valid.  IO_IN_PROGRESS is set for
 
181
         * it, if it's a shared buffer.
 
182
         *
 
183
         * Note: if smgrextend fails, we will end up with a buffer that is
 
184
         * allocated but not marked BM_VALID.  P_NEW will still select the
 
185
         * same block number (because the relation didn't get any longer on
 
186
         * disk) and so future attempts to extend the relation will find the
 
187
         * same buffer (if it's not been recycled) but come right back here to
 
188
         * try smgrextend again.
 
189
         */
 
190
        Assert(!(bufHdr->flags & BM_VALID));
 
191
 
 
192
        if (isExtend)
 
193
        {
 
194
                /* new buffers are zero-filled */
 
195
                MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ);
 
196
                smgrextend(reln->rd_smgr, blockNum, (char *) MAKE_PTR(bufHdr->data),
 
197
                                   reln->rd_istemp);
 
198
        }
 
199
        else
 
200
        {
 
201
                smgrread(reln->rd_smgr, blockNum, (char *) MAKE_PTR(bufHdr->data));
 
202
                /* check for garbage data */
 
203
                if (!PageHeaderIsValid((PageHeader) MAKE_PTR(bufHdr->data)))
 
204
                {
 
205
                        /*
 
206
                         * During WAL recovery, the first access to any data page
 
207
                         * should overwrite the whole page from the WAL; so a
 
208
                         * clobbered page header is not reason to fail.  Hence, when
 
209
                         * InRecovery we may always act as though zero_damaged_pages
 
210
                         * is ON.
 
211
                         */
 
212
                        if (zero_damaged_pages || InRecovery)
 
213
                        {
 
214
                                ereport(WARNING,
 
215
                                                (errcode(ERRCODE_DATA_CORRUPTED),
 
216
                                                 errmsg("invalid page header in block %u of relation \"%s\"; zeroing out page",
 
217
                                                          blockNum, RelationGetRelationName(reln))));
 
218
                                MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ);
 
219
                        }
 
220
                        else
 
221
                                ereport(ERROR,
 
222
                                                (errcode(ERRCODE_DATA_CORRUPTED),
 
223
                                                 errmsg("invalid page header in block %u of relation \"%s\"",
 
224
                                                          blockNum, RelationGetRelationName(reln))));
 
225
                }
 
226
        }
 
227
 
 
228
        if (isLocalBuf)
 
229
        {
 
230
                /* Only need to adjust flags */
 
231
                bufHdr->flags |= BM_VALID;
 
232
        }
 
233
        else
 
234
        {
 
235
                /* lock buffer manager again to update IO IN PROGRESS */
 
236
                LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
 
237
 
 
238
                /* IO Succeeded, so mark data valid */
 
239
                bufHdr->flags |= BM_VALID;
 
240
 
 
241
                /* If anyone was waiting for IO to complete, wake them up now */
 
242
                TerminateBufferIO(bufHdr, 0);
 
243
 
 
244
                LWLockRelease(BufMgrLock);
 
245
        }
 
246
 
 
247
        if (VacuumCostActive)
 
248
                VacuumCostBalance += VacuumCostPageMiss;
 
249
 
 
250
        return BufferDescriptorGetBuffer(bufHdr);
 
251
}
 
252
 
 
253
/*
 
254
 * BufferAlloc -- subroutine for ReadBuffer.  Handles lookup of a shared
 
255
 *              buffer.  If no buffer exists already, selects a replacement
 
256
 *              victim and evicts the old page, but does NOT read in new page.
 
257
 *
 
258
 * The returned buffer is pinned and is already marked as holding the
 
259
 * desired page.  If it already did have the desired page, *foundPtr is
 
260
 * set TRUE.  Otherwise, *foundPtr is set FALSE and the buffer is marked
 
261
 * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
 
262
 *
 
263
 * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
 
264
 * we keep it for simplicity in ReadBuffer.
 
265
 *
 
266
 * BufMgrLock must be held at entry.  When this routine returns,
 
267
 * the BufMgrLock is guaranteed NOT to be held.
 
268
 */
 
269
static BufferDesc *
 
270
BufferAlloc(Relation reln,
 
271
                        BlockNumber blockNum,
 
272
                        bool *foundPtr)
 
273
{
 
274
        BufferTag       newTag;                 /* identity of requested block */
 
275
        BufferDesc *buf,
 
276
                           *buf2;
 
277
        int                     cdb_found_index,
 
278
                                cdb_replace_index;
 
279
        bool            inProgress;             /* did we already do StartBufferIO? */
 
280
 
 
281
        /* create a tag so we can lookup the buffer */
 
282
        INIT_BUFFERTAG(newTag, reln, blockNum);
 
283
 
 
284
        /* see if the block is in the buffer pool already */
 
285
        buf = StrategyBufferLookup(&newTag, false, &cdb_found_index);
 
286
        if (buf != NULL)
 
287
        {
 
288
                /*
 
289
                 * Found it.  Now, pin the buffer so no one can steal it from the
 
290
                 * buffer pool, and check to see if someone else is still reading
 
291
                 * data into the buffer.  (Formerly, we'd always block here if
 
292
                 * IO_IN_PROGRESS is set, but there's no need to wait when someone
 
293
                 * is writing rather than reading.)
 
294
                 */
 
295
                *foundPtr = TRUE;
 
296
 
 
297
                PinBuffer(buf, true);
 
298
 
 
299
                if (!(buf->flags & BM_VALID))
 
300
                {
 
301
                        if (buf->flags & BM_IO_IN_PROGRESS)
 
302
                        {
 
303
                                /* someone else is reading it, wait for them */
 
304
                                WaitIO(buf);
 
305
                        }
 
306
                        if (!(buf->flags & BM_VALID))
 
307
                        {
 
308
                                /*
 
309
                                 * If we get here, previous attempts to read the buffer
 
310
                                 * must have failed ... but we shall bravely try again.
 
311
                                 */
 
312
                                *foundPtr = FALSE;
 
313
                                StartBufferIO(buf, true);
 
314
                        }
 
315
                }
 
316
 
 
317
                LWLockRelease(BufMgrLock);
 
318
 
 
319
                return buf;
 
320
        }
 
321
 
 
322
        *foundPtr = FALSE;
 
323
 
 
324
        /*
 
325
         * Didn't find it in the buffer pool.  We'll have to initialize a new
 
326
         * buffer.      First, grab one from the free list.  If it's dirty, flush
 
327
         * it to disk. Remember to unlock BufMgrLock while doing the IO.
 
328
         */
 
329
        inProgress = FALSE;
 
330
        do
 
331
        {
 
332
                buf = StrategyGetBuffer(&cdb_replace_index);
 
333
 
 
334
                /* StrategyGetBuffer will elog if it can't find a free buffer */
 
335
                Assert(buf);
 
336
 
 
337
                /*
 
338
                 * There should be exactly one pin on the buffer after it is
 
339
                 * allocated -- ours.  If it had a pin it wouldn't have been on
 
340
                 * the free list.  No one else could have pinned it between
 
341
                 * StrategyGetBuffer and here because we have the BufMgrLock.
 
342
                 *
 
343
                 * (We must pin the buffer before releasing BufMgrLock ourselves,
 
344
                 * to ensure StrategyGetBuffer won't give the same buffer to someone
 
345
                 * else.)
 
346
                 */
 
347
                Assert(buf->refcount == 0);
 
348
                buf->refcount = 1;
 
349
                PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1;
 
350
 
 
351
                ResourceOwnerRememberBuffer(CurrentResourceOwner,
 
352
                                                                        BufferDescriptorGetBuffer(buf));
 
353
 
 
354
                if ((buf->flags & BM_VALID) &&
 
355
                        (buf->flags & BM_DIRTY || buf->cntxDirty))
 
356
                {
 
357
                        /*
 
358
                         * Set BM_IO_IN_PROGRESS to show the buffer is being written.
 
359
                         * It cannot already be set because the buffer would be pinned
 
360
                         * if someone were writing it.
 
361
                         *
 
362
                         * Note: it's okay to grab the io_in_progress lock while holding
 
363
                         * BufMgrLock.  All code paths that acquire this lock pin the
 
364
                         * buffer first; since no one had it pinned (it just came off
 
365
                         * the free list), no one else can have the lock.
 
366
                         */
 
367
                        StartBufferIO(buf, false);
 
368
 
 
369
                        inProgress = TRUE;
 
370
 
 
371
                        /*
 
372
                         * Write the buffer out, being careful to release BufMgrLock
 
373
                         * while doing the I/O.  We also tell FlushBuffer to share-lock
 
374
                         * the buffer before releasing BufMgrLock.  This is safe because
 
375
                         * we know no other backend currently has the buffer pinned,
 
376
                         * therefore no one can have it locked either, so we can always
 
377
                         * get the lock without blocking.  It is necessary because if
 
378
                         * we release BufMgrLock first, it's possible for someone else
 
379
                         * to pin and exclusive-lock the buffer before we get to the
 
380
                         * share-lock, causing us to block.  If the someone else then
 
381
                         * blocks on a lock we hold, deadlock ensues.  This has been
 
382
                         * observed to happen when two backends are both trying to split
 
383
                         * btree index pages, and the second one just happens to be
 
384
                         * trying to split the page the first one got from the freelist.
 
385
                         */
 
386
                        FlushBuffer(buf, NULL, true);
 
387
 
 
388
                        /*
 
389
                         * Somebody could have allocated another buffer for the same
 
390
                         * block we are about to read in. While we flush out the dirty
 
391
                         * buffer, we don't hold the lock and someone could have
 
392
                         * allocated another buffer for the same block. The problem is
 
393
                         * we haven't yet inserted the new tag into the buffer table.
 
394
                         * So we need to check here.            -ay 3/95
 
395
                         *
 
396
                         * Another reason we have to do this is to update
 
397
                         * cdb_found_index, since the CDB could have disappeared from
 
398
                         * B1/B2 list while we were writing.
 
399
                         */
 
400
                        buf2 = StrategyBufferLookup(&newTag, true, &cdb_found_index);
 
401
                        if (buf2 != NULL)
 
402
                        {
 
403
                                /*
 
404
                                 * Found it. Someone has already done what we were about
 
405
                                 * to do. We'll just handle this as if it were found in
 
406
                                 * the buffer pool in the first place.  First, give up the
 
407
                                 * buffer we were planning to use.
 
408
                                 */
 
409
                                TerminateBufferIO(buf, 0);
 
410
                                UnpinBuffer(buf, true);
 
411
 
 
412
                                buf = buf2;
 
413
 
 
414
                                /* remaining code should match code at top of routine */
 
415
 
 
416
                                *foundPtr = TRUE;
 
417
 
 
418
                                PinBuffer(buf, true);
 
419
 
 
420
                                if (!(buf->flags & BM_VALID))
 
421
                                {
 
422
                                        if (buf->flags & BM_IO_IN_PROGRESS)
 
423
                                        {
 
424
                                                /* someone else is reading it, wait for them */
 
425
                                                WaitIO(buf);
 
426
                                        }
 
427
                                        if (!(buf->flags & BM_VALID))
 
428
                                        {
 
429
                                                /*
 
430
                                                 * If we get here, previous attempts to read the
 
431
                                                 * buffer must have failed ... but we shall
 
432
                                                 * bravely try again.
 
433
                                                 */
 
434
                                                *foundPtr = FALSE;
 
435
                                                StartBufferIO(buf, true);
 
436
                                        }
 
437
                                }
 
438
 
 
439
                                LWLockRelease(BufMgrLock);
 
440
 
 
441
                                return buf;
 
442
                        }
 
443
 
 
444
                        /*
 
445
                         * Somebody could have pinned the buffer while we were doing
 
446
                         * the I/O and had given up the BufMgrLock.  If so, we can't
 
447
                         * recycle this buffer --- we need to clear the I/O flags,
 
448
                         * remove our pin and choose a new victim buffer.  Similarly,
 
449
                         * we have to start over if somebody re-dirtied the buffer.
 
450
                         */
 
451
                        if (buf->refcount > 1 || buf->flags & BM_DIRTY || buf->cntxDirty)
 
452
                        {
 
453
                                TerminateBufferIO(buf, 0);
 
454
                                UnpinBuffer(buf, true);
 
455
                                inProgress = FALSE;
 
456
                                buf = NULL;
 
457
                        }
 
458
                }
 
459
        } while (buf == NULL);
 
460
 
 
461
        /*
 
462
         * At this point we should have the sole pin on a non-dirty buffer and
 
463
         * we may or may not already have the BM_IO_IN_PROGRESS flag set.
 
464
         */
 
465
 
 
466
        /*
 
467
         * Tell the buffer replacement strategy that we are replacing the
 
468
         * buffer content. Then rename the buffer.      Clearing BM_VALID here is
 
469
         * necessary, clearing the dirtybits is just paranoia.
 
470
         */
 
471
        StrategyReplaceBuffer(buf, &newTag, cdb_found_index, cdb_replace_index);
 
472
        buf->tag = newTag;
 
473
        buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
 
474
        buf->cntxDirty = false;
 
475
 
 
476
        /*
 
477
         * Buffer contents are currently invalid.  Have to mark IO IN PROGRESS
 
478
         * so no one fiddles with them until the read completes.  We may have
 
479
         * already marked it, in which case we just flip from write to read
 
480
         * status.
 
481
         */
 
482
        if (!inProgress)
 
483
                StartBufferIO(buf, true);
 
484
        else
 
485
                ContinueBufferIO(buf, true);
 
486
 
 
487
        LWLockRelease(BufMgrLock);
 
488
 
 
489
        return buf;
 
490
}
 
491
 
 
492
/*
 
493
 * write_buffer -- common functionality for
 
494
 *                                 WriteBuffer and WriteNoReleaseBuffer
 
495
 */
 
496
static void
 
497
write_buffer(Buffer buffer, bool release)
 
498
{
 
499
        BufferDesc *bufHdr;
 
500
 
 
501
        if (!BufferIsValid(buffer))
 
502
                elog(ERROR, "bad buffer id: %d", buffer);
 
503
 
 
504
        if (BufferIsLocal(buffer))
 
505
        {
 
506
                WriteLocalBuffer(buffer, release);
 
507
                return;
 
508
        }
 
509
 
 
510
        bufHdr = &BufferDescriptors[buffer - 1];
 
511
 
 
512
        Assert(PrivateRefCount[buffer - 1] > 0);
 
513
 
 
514
        LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
 
515
        Assert(bufHdr->refcount > 0);
 
516
 
 
517
        /*
 
518
         * If the buffer was not dirty already, do vacuum cost accounting.
 
519
         */
 
520
        if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
 
521
                VacuumCostBalance += VacuumCostPageDirty;
 
522
 
 
523
        bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
 
524
 
 
525
        if (release)
 
526
                UnpinBuffer(bufHdr, true);
 
527
        LWLockRelease(BufMgrLock);
 
528
}
 
529
 
 
530
/*
 
531
 * WriteBuffer
 
532
 *
 
533
 *              Marks buffer contents as dirty (actual write happens later).
 
534
 *
 
535
 * Assume that buffer is pinned.  Assume that reln is valid.
 
536
 *
 
537
 * Side Effects:
 
538
 *              Pin count is decremented.
 
539
 */
 
540
void
 
541
WriteBuffer(Buffer buffer)
 
542
{
 
543
        write_buffer(buffer, true);
 
544
}
 
545
 
 
546
/*
 
547
 * WriteNoReleaseBuffer -- like WriteBuffer, but do not unpin the buffer
 
548
 *                                                 when the operation is complete.
 
549
 */
 
550
void
 
551
WriteNoReleaseBuffer(Buffer buffer)
 
552
{
 
553
        write_buffer(buffer, false);
 
554
}
 
555
 
 
556
/*
 
557
 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
 
558
 *              to save a lock release/acquire.
 
559
 *
 
560
 * Also, if the passed buffer is valid and already contains the desired block
 
561
 * number, we simply return it without ever acquiring the lock at all.
 
562
 * Since the passed buffer must be pinned, it's OK to examine its block
 
563
 * number without getting the lock first.
 
564
 *
 
565
 * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
 
566
 * buffer actually needs to be released.  This case is the same as ReadBuffer,
 
567
 * but can save some tests in the caller.
 
568
 *
 
569
 * Also note: while it will work to call this routine with blockNum == P_NEW,
 
570
 * it's best to avoid doing so, since that would result in calling
 
571
 * smgrnblocks() while holding the bufmgr lock, hence some loss of
 
572
 * concurrency.
 
573
 */
 
574
Buffer
 
575
ReleaseAndReadBuffer(Buffer buffer,
 
576
                                         Relation relation,
 
577
                                         BlockNumber blockNum)
 
578
{
 
579
        BufferDesc *bufHdr;
 
580
 
 
581
        if (BufferIsValid(buffer))
 
582
        {
 
583
                if (BufferIsLocal(buffer))
 
584
                {
 
585
                        Assert(LocalRefCount[-buffer - 1] > 0);
 
586
                        bufHdr = &LocalBufferDescriptors[-buffer - 1];
 
587
                        if (bufHdr->tag.blockNum == blockNum &&
 
588
                                RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node))
 
589
                                return buffer;
 
590
                        ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
 
591
                        /* owner now has a free slot, so no need for Enlarge() */
 
592
                        LocalRefCount[-buffer - 1]--;
 
593
                }
 
594
                else
 
595
                {
 
596
                        Assert(PrivateRefCount[buffer - 1] > 0);
 
597
                        bufHdr = &BufferDescriptors[buffer - 1];
 
598
                        if (bufHdr->tag.blockNum == blockNum &&
 
599
                                RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node))
 
600
                                return buffer;
 
601
                        ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
 
602
                        /* owner now has a free slot, so no need for Enlarge() */
 
603
                        if (PrivateRefCount[buffer - 1] > 1)
 
604
                                PrivateRefCount[buffer - 1]--;
 
605
                        else
 
606
                        {
 
607
                                LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
 
608
                                UnpinBuffer(bufHdr, false);
 
609
                                return ReadBufferInternal(relation, blockNum, true);
 
610
                        }
 
611
                }
 
612
        }
 
613
        else
 
614
                ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
 
615
 
 
616
        return ReadBufferInternal(relation, blockNum, false);
 
617
}
 
618
 
 
619
/*
 
620
 * PinBuffer -- make buffer unavailable for replacement.
 
621
 *
 
622
 * This should be applied only to shared buffers, never local ones.
 
623
 * Bufmgr lock must be held by caller.
 
624
 *
 
625
 * Most but not all callers want CurrentResourceOwner to be adjusted.
 
626
 * Note that ResourceOwnerEnlargeBuffers must have been done already.
 
627
 */
 
628
static void
 
629
PinBuffer(BufferDesc *buf, bool fixOwner)
 
630
{
 
631
        int                     b = BufferDescriptorGetBuffer(buf) - 1;
 
632
 
 
633
        if (PrivateRefCount[b] == 0)
 
634
                buf->refcount++;
 
635
        PrivateRefCount[b]++;
 
636
        Assert(PrivateRefCount[b] > 0);
 
637
        if (fixOwner)
 
638
                ResourceOwnerRememberBuffer(CurrentResourceOwner,
 
639
                                                                        BufferDescriptorGetBuffer(buf));
 
640
}
 
641
 
 
642
/*
 
643
 * UnpinBuffer -- make buffer available for replacement.
 
644
 *
 
645
 * This should be applied only to shared buffers, never local ones.
 
646
 * Bufmgr lock must be held by caller.
 
647
 *
 
648
 * Most but not all callers want CurrentResourceOwner to be adjusted.
 
649
 */
 
650
static void
 
651
UnpinBuffer(BufferDesc *buf, bool fixOwner)
 
652
{
 
653
        int                     b = BufferDescriptorGetBuffer(buf) - 1;
 
654
 
 
655
        if (fixOwner)
 
656
                ResourceOwnerForgetBuffer(CurrentResourceOwner,
 
657
                                                                  BufferDescriptorGetBuffer(buf));
 
658
 
 
659
        Assert(buf->refcount > 0);
 
660
        Assert(PrivateRefCount[b] > 0);
 
661
        PrivateRefCount[b]--;
 
662
        if (PrivateRefCount[b] == 0)
 
663
        {
 
664
                buf->refcount--;
 
665
                /* I'd better not still hold any locks on the buffer */
 
666
                Assert(!LWLockHeldByMe(buf->cntx_lock));
 
667
                Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
 
668
        }
 
669
 
 
670
        if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
 
671
                buf->refcount == 1)
 
672
        {
 
673
                /* we just released the last pin other than the waiter's */
 
674
                buf->flags &= ~BM_PIN_COUNT_WAITER;
 
675
                ProcSendSignal(buf->wait_backend_id);
 
676
        }
 
677
        else
 
678
        {
 
679
                /* do nothing */
 
680
        }
 
681
}
 
682
 
 
683
/*
 * BufferSync -- Write out dirty buffers in the pool.
 *
 * This is called at checkpoint time to write out all dirty shared buffers,
 * and by the background writer process to write out some of the dirty blocks.
 * percent/maxpages should be -1 in the former case, and limit values (>= 0)
 * in the latter.
 *
 * Returns the number of buffers written.  (NOTE: the returned count is the
 * number of candidate buffers after applying percent/maxpages limits; any
 * buffers skipped inside the loop because they no longer needed writing are
 * still included in this count.)
 */
int
BufferSync(int percent, int maxpages)
{
	BufferDesc **dirty_buffers;
	BufferTag  *buftags;
	int			num_buffer_dirty;
	int			i;

	/* If either limit is zero then we are disabled from doing anything... */
	if (percent == 0 || maxpages == 0)
		return 0;

	/*
	 * Get a list of all currently dirty buffers and how many there are.
	 * We do not flush buffers that get dirtied after we started. They
	 * have to wait until the next checkpoint.
	 */
	dirty_buffers = (BufferDesc **) palloc(NBuffers * sizeof(BufferDesc *));
	buftags = (BufferTag *) palloc(NBuffers * sizeof(BufferTag));

	LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
	num_buffer_dirty = StrategyDirtyBufferList(dirty_buffers, buftags,
											   NBuffers);

	/*
	 * If called by the background writer, we are usually asked to only
	 * write out some portion of dirty buffers now, to prevent the IO
	 * storm at checkpoint time.
	 */
	if (percent > 0)
	{
		Assert(percent <= 100);
		/* round up, so a nonzero percentage always writes at least one */
		num_buffer_dirty = (num_buffer_dirty * percent + 99) / 100;
	}
	if (maxpages > 0 && num_buffer_dirty > maxpages)
		num_buffer_dirty = maxpages;

	/* Make sure we can handle the pin inside the loop */
	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);

	/*
	 * Loop over buffers to be written.  Note the BufMgrLock is held at
	 * loop top, but is released and reacquired within FlushBuffer, so we
	 * aren't holding it long.
	 */
	for (i = 0; i < num_buffer_dirty; i++)
	{
		BufferDesc *bufHdr = dirty_buffers[i];

		/*
		 * Check it is still the same page and still needs writing.
		 *
		 * We can check bufHdr->cntxDirty here *without* holding any lock on
		 * buffer context as long as we set this flag in access methods
		 * *before* logging changes with XLogInsert(): if someone will set
		 * cntxDirty just after our check we don't worry because of our
		 * checkpoint.redo points before log record for upcoming changes
		 * and so we are not required to write such dirty buffer.
		 */
		if (!(bufHdr->flags & BM_VALID))
			continue;
		if (!BUFFERTAGS_EQUAL(bufHdr->tag, buftags[i]))
			continue;
		if (!(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty))
			continue;

		/*
		 * IO synchronization. Note that we do it with unpinned buffer to
		 * avoid conflicts with FlushRelationBuffers.
		 */
		if (bufHdr->flags & BM_IO_IN_PROGRESS)
		{
			/* WaitIO releases and reacquires BufMgrLock internally */
			WaitIO(bufHdr);
			/* Still need writing? */
			if (!(bufHdr->flags & BM_VALID))
				continue;
			if (!BUFFERTAGS_EQUAL(bufHdr->tag, buftags[i]))
				continue;
			if (!(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty))
				continue;
		}

		/*
		 * Here: no one doing IO for this buffer and it's dirty. Pin
		 * buffer now and set IO state for it *before* acquiring shlock to
		 * avoid conflicts with FlushRelationBuffers.
		 */
		PinBuffer(bufHdr, true);
		StartBufferIO(bufHdr, false);

		FlushBuffer(bufHdr, NULL, false);

		TerminateBufferIO(bufHdr, 0);
		UnpinBuffer(bufHdr, true);
	}

	LWLockRelease(BufMgrLock);

	pfree(dirty_buffers);
	pfree(buftags);

	return num_buffer_dirty;
}
 
796
 
 
797
/*
 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
 *
 * Should be entered with buffer manager lock held; releases it before
 * waiting and re-acquires it afterwards.
 */
static void
WaitIO(BufferDesc *buf)
{
	/*
	 * Changed to wait until there's no IO - Inoue 01/13/2000
	 *
	 * Note this is *necessary* because an error abort in the process doing
	 * I/O could release the io_in_progress_lock prematurely. See
	 * AbortBufferIO.
	 */
	while ((buf->flags & BM_IO_IN_PROGRESS) != 0)
	{
		/*
		 * Drop BufMgrLock before sleeping on the per-buffer lock, then
		 * retake it so the flag test at loop top is made under the lock.
		 * Acquiring and immediately releasing io_in_progress_lock just
		 * waits out the current I/O holder.
		 */
		LWLockRelease(BufMgrLock);
		LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
		LWLockRelease(buf->io_in_progress_lock);
		LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
	}
}
 
821
 
 
822
 
 
823
/*
 
824
 * Return a palloc'd string containing buffer usage statistics.
 
825
 */
 
826
char *
 
827
ShowBufferUsage(void)
 
828
{
 
829
        StringInfoData str;
 
830
        float           hitrate;
 
831
        float           localhitrate;
 
832
 
 
833
        initStringInfo(&str);
 
834
 
 
835
        if (ReadBufferCount == 0)
 
836
                hitrate = 0.0;
 
837
        else
 
838
                hitrate = (float) BufferHitCount *100.0 / ReadBufferCount;
 
839
 
 
840
        if (ReadLocalBufferCount == 0)
 
841
                localhitrate = 0.0;
 
842
        else
 
843
                localhitrate = (float) LocalBufferHitCount *100.0 / ReadLocalBufferCount;
 
844
 
 
845
        appendStringInfo(&str,
 
846
                                         "!\tShared blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
 
847
                        ReadBufferCount - BufferHitCount, BufferFlushCount, hitrate);
 
848
        appendStringInfo(&str,
 
849
                                         "!\tLocal  blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
 
850
                                         ReadLocalBufferCount - LocalBufferHitCount, LocalBufferFlushCount, localhitrate);
 
851
        appendStringInfo(&str,
 
852
                                         "!\tDirect blocks: %10ld read, %10ld written\n",
 
853
                                         NDirectFileRead, NDirectFileWrite);
 
854
 
 
855
        return str.data;
 
856
}
 
857
 
 
858
void
 
859
ResetBufferUsage(void)
 
860
{
 
861
        BufferHitCount = 0;
 
862
        ReadBufferCount = 0;
 
863
        BufferFlushCount = 0;
 
864
        LocalBufferHitCount = 0;
 
865
        ReadLocalBufferCount = 0;
 
866
        LocalBufferFlushCount = 0;
 
867
        NDirectFileRead = 0;
 
868
        NDirectFileWrite = 0;
 
869
}
 
870
 
 
871
/*
 
872
 *              AtEOXact_Buffers - clean up at end of transaction.
 
873
 *
 
874
 *              As of PostgreSQL 8.0, buffer pins should get released by the
 
875
 *              ResourceOwner mechanism.  This routine is just a debugging
 
876
 *              cross-check that no pins remain.
 
877
 */
 
878
void
 
879
AtEOXact_Buffers(bool isCommit)
 
880
{
 
881
#ifdef USE_ASSERT_CHECKING
 
882
        int                     i;
 
883
 
 
884
        for (i = 0; i < NBuffers; i++)
 
885
        {
 
886
                Assert(PrivateRefCount[i] == 0);
 
887
        }
 
888
 
 
889
        AtEOXact_LocalBuffers(isCommit);
 
890
#endif
 
891
}
 
892
 
 
893
/*
 * Ensure we have released all shared-buffer locks and pins during backend exit
 */
void
AtProcExit_Buffers(void)
{
	int			i;

	/* Clean up any I/O this backend had in progress, and drop buffer locks */
	AbortBufferIO();
	UnlockBuffers();

	for (i = 0; i < NBuffers; i++)
	{
		if (PrivateRefCount[i] != 0)
		{
			BufferDesc *buf = &(BufferDescriptors[i]);

			/*
			 * We don't worry about updating ResourceOwner; if we even got
			 * here, it suggests that ResourceOwners are messed up.
			 */
			PrivateRefCount[i] = 1;		/* make sure we release shared pin */
			LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
			UnpinBuffer(buf, false);
			LWLockRelease(BufMgrLock);
			Assert(PrivateRefCount[i] == 0);
		}
	}
}
 
922
 
 
923
/*
 
924
 * Helper routine to issue warnings when a buffer is unexpectedly pinned
 
925
 */
 
926
void
 
927
PrintBufferLeakWarning(Buffer buffer)
 
928
{
 
929
        BufferDesc *buf;
 
930
        int32           loccount;
 
931
 
 
932
        Assert(BufferIsValid(buffer));
 
933
        if (BufferIsLocal(buffer))
 
934
        {
 
935
                buf = &LocalBufferDescriptors[-buffer - 1];
 
936
                loccount = LocalRefCount[-buffer - 1];
 
937
        }
 
938
        else
 
939
        {
 
940
                buf = &BufferDescriptors[buffer - 1];
 
941
                loccount = PrivateRefCount[buffer - 1];
 
942
        }
 
943
 
 
944
        elog(WARNING,
 
945
                 "buffer refcount leak: [%03d] "
 
946
                 "(rel=%u/%u/%u, blockNum=%u, flags=0x%x, refcount=%u %d)",
 
947
                 buffer,
 
948
                 buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
 
949
                 buf->tag.rnode.relNode,
 
950
                 buf->tag.blockNum, buf->flags,
 
951
                 buf->refcount, loccount);
 
952
}
 
953
 
 
954
/*
 * FlushBufferPool
 *
 * Flush all dirty blocks in buffer pool to disk at the checkpoint time.
 * Local relations do not participate in checkpoints, so they don't need to be
 * flushed.
 */
void
FlushBufferPool(void)
{
	/* write out every dirty shared buffer (-1/-1 means no limits) ... */
	BufferSync(-1, -1);
	/* ... then force the kernel to push the writes to disk */
	smgrsync();
}
 
967
 
 
968
 
 
969
/*
 * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
 */
void
BufmgrCommit(void)
{
	/*
	 * The buffer manager itself has no commit-time work anymore; just let
	 * the storage manager do its part.
	 */
	smgrcommit();
}
 
979
 
 
980
/*
 
981
 * BufferGetBlockNumber
 
982
 *              Returns the block number associated with a buffer.
 
983
 *
 
984
 * Note:
 
985
 *              Assumes that the buffer is valid and pinned, else the
 
986
 *              value may be obsolete immediately...
 
987
 */
 
988
BlockNumber
 
989
BufferGetBlockNumber(Buffer buffer)
 
990
{
 
991
        Assert(BufferIsPinned(buffer));
 
992
 
 
993
        if (BufferIsLocal(buffer))
 
994
                return LocalBufferDescriptors[-buffer - 1].tag.blockNum;
 
995
        else
 
996
                return BufferDescriptors[buffer - 1].tag.blockNum;
 
997
}
 
998
 
 
999
/*
 
1000
 * BufferGetFileNode
 
1001
 *              Returns the relation ID (RelFileNode) associated with a buffer.
 
1002
 *
 
1003
 * This should make the same checks as BufferGetBlockNumber, but since the
 
1004
 * two are generally called together, we don't bother.
 
1005
 */
 
1006
RelFileNode
 
1007
BufferGetFileNode(Buffer buffer)
 
1008
{
 
1009
        BufferDesc *bufHdr;
 
1010
 
 
1011
        if (BufferIsLocal(buffer))
 
1012
                bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
 
1013
        else
 
1014
                bufHdr = &BufferDescriptors[buffer - 1];
 
1015
 
 
1016
        return (bufHdr->tag.rnode);
 
1017
}
 
1018
 
 
1019
/*
 * FlushBuffer
 *		Physically write out a shared buffer.
 *
 * NOTE: this actually just passes the buffer contents to the kernel; the
 * real write to disk won't happen until the kernel feels like it.  This
 * is okay from our point of view since we can redo the changes from WAL.
 * However, we will need to force the changes to disk via fsync before
 * we can checkpoint WAL.
 *
 * BufMgrLock must be held at entry, and the buffer must be pinned.  The
 * caller is also responsible for doing StartBufferIO/TerminateBufferIO.
 *
 * If the caller has an smgr reference for the buffer's relation, pass it
 * as the second parameter.  If not, pass NULL.  (Do not open relation
 * while holding BufMgrLock!)
 *
 * When earlylock is TRUE, we grab the per-buffer sharelock before releasing
 * BufMgrLock, rather than after.  Normally this would be a bad idea since
 * we might deadlock, but it is safe and necessary when called from
 * BufferAlloc() --- see comments therein.
 *
 * On return, BufMgrLock is held again (it was released while doing the
 * XLOG flush and the actual write).
 */
static void
FlushBuffer(BufferDesc *buf, SMgrRelation reln, bool earlylock)
{
	Buffer		buffer = BufferDescriptorGetBuffer(buf);
	XLogRecPtr	recptr;
	ErrorContextCallback errcontext;

	/* Transpose cntxDirty into flags while holding BufMgrLock */
	buf->cntxDirty = false;
	buf->flags |= BM_DIRTY;

	/* To check if block content changed while flushing. - vadim 01/17/97 */
	buf->flags &= ~BM_JUST_DIRTIED;

	/*
	 * If earlylock, grab buffer sharelock before anyone else could re-lock
	 * the buffer.
	 */
	if (earlylock)
		LockBuffer(buffer, BUFFER_LOCK_SHARE);

	/* Release BufMgrLock while doing xlog work */
	LWLockRelease(BufMgrLock);

	/* Setup error traceback support for ereport() */
	errcontext.callback = buffer_write_error_callback;
	errcontext.arg = buf;
	errcontext.previous = error_context_stack;
	error_context_stack = &errcontext;

	/* Find smgr relation for buffer while holding minimal locks */
	if (reln == NULL)
		reln = smgropen(buf->tag.rnode);

	/*
	 * Protect buffer content against concurrent update.  (Note that
	 * hint-bit updates can still occur while the write is in progress,
	 * but we assume that that will not invalidate the data written.)
	 */
	if (!earlylock)
		LockBuffer(buffer, BUFFER_LOCK_SHARE);

	/*
	 * Force XLOG flush for buffer' LSN.  This implements the basic WAL
	 * rule that log updates must hit disk before any of the data-file
	 * changes they describe do.
	 */
	recptr = BufferGetLSN(buf);
	XLogFlush(recptr);

	/*
	 * Now it's safe to write buffer to disk. Note that no one else should
	 * have been able to write it while we were busy with locking and log
	 * flushing because caller has set the IO flag.
	 *
	 * It would be better to clear BM_JUST_DIRTIED right here, but we'd have
	 * to reacquire the BufMgrLock and it doesn't seem worth it.
	 */
	smgrwrite(reln,
			  buf->tag.blockNum,
			  (char *) MAKE_PTR(buf->data),
			  false);

	/* Pop the error context stack */
	error_context_stack = errcontext.previous;

	/*
	 * Release the per-buffer readlock, reacquire BufMgrLock.
	 */
	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

	LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);

	BufferFlushCount++;

	/*
	 * If this buffer was marked by someone as DIRTY while we were
	 * flushing it out we must not clear DIRTY flag - vadim 01/17/97
	 */
	if (!(buf->flags & BM_JUST_DIRTIED))
		buf->flags &= ~BM_DIRTY;
}
 
1123
 
 
1124
/*
 
1125
 * RelationGetNumberOfBlocks
 
1126
 *              Determines the current number of pages in the relation.
 
1127
 */
 
1128
BlockNumber
 
1129
RelationGetNumberOfBlocks(Relation relation)
 
1130
{
 
1131
        /* Open it at the smgr level if not already done */
 
1132
        RelationOpenSmgr(relation);
 
1133
 
 
1134
        return smgrnblocks(relation->rd_smgr);
 
1135
}
 
1136
 
 
1137
/*
 
1138
 * RelationTruncate
 
1139
 *              Physically truncate a relation to the specified number of blocks.
 
1140
 *
 
1141
 * Caller should already have done something to flush any buffered pages
 
1142
 * that are to be dropped.
 
1143
 */
 
1144
void
 
1145
RelationTruncate(Relation rel, BlockNumber nblocks)
 
1146
{
 
1147
        /* Open it at the smgr level if not already done */
 
1148
        RelationOpenSmgr(rel);
 
1149
 
 
1150
        /* Make sure rd_targblock isn't pointing somewhere past end */
 
1151
        rel->rd_targblock = InvalidBlockNumber;
 
1152
 
 
1153
        /* Do the real work */
 
1154
        smgrtruncate(rel->rd_smgr, nblocks, rel->rd_istemp);
 
1155
}
 
1156
 
 
1157
/* ---------------------------------------------------------------------
 
1158
 *              DropRelationBuffers
 
1159
 *
 
1160
 *              This function removes all the buffered pages for a relation
 
1161
 *              from the buffer pool.  Dirty pages are simply dropped, without
 
1162
 *              bothering to write them out first.      This is NOT rollback-able,
 
1163
 *              and so should be used only with extreme caution!
 
1164
 *
 
1165
 *              There is no particularly good reason why this doesn't have a
 
1166
 *              firstDelBlock parameter, except that current callers don't need it.
 
1167
 *
 
1168
 *              We assume that the caller holds an exclusive lock on the relation,
 
1169
 *              which should assure that no new buffers will be acquired for the rel
 
1170
 *              meanwhile.
 
1171
 * --------------------------------------------------------------------
 
1172
 */
 
1173
void
 
1174
DropRelationBuffers(Relation rel)
 
1175
{
 
1176
        DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp, 0);
 
1177
}
 
1178
 
 
1179
/* ---------------------------------------------------------------------
 *		DropRelFileNodeBuffers
 *
 *		This is the same as DropRelationBuffers, except that the target
 *		relation is specified by RelFileNode and temp status, and one
 *		may specify the first block to drop.
 *
 *		This is NOT rollback-able.      One legitimate use is to clear the
 *		buffer cache of buffers for a relation that is being deleted
 *		during transaction abort.
 * --------------------------------------------------------------------
 */
void
DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
					   BlockNumber firstDelBlock)
{
	int			i;
	BufferDesc *bufHdr;

	if (istemp)
	{
		/* Temp relations live only in this backend's local buffers */
		for (i = 0; i < NLocBuffer; i++)
		{
			bufHdr = &LocalBufferDescriptors[i];
			if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
				bufHdr->tag.blockNum >= firstDelBlock)
			{
				if (LocalRefCount[i] != 0)
					elog(ERROR, "block %u of %u/%u/%u is still referenced (local %u)",
						 bufHdr->tag.blockNum,
						 bufHdr->tag.rnode.spcNode,
						 bufHdr->tag.rnode.dbNode,
						 bufHdr->tag.rnode.relNode,
						 LocalRefCount[i]);
				/* discard the page: clear dirty state and detach from rel */
				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
				bufHdr->cntxDirty = false;
				bufHdr->tag.rnode.relNode = InvalidOid;
			}
		}
		return;
	}

	LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);

	for (i = 1; i <= NBuffers; i++)
	{
		bufHdr = &BufferDescriptors[i - 1];
recheck:
		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
			bufHdr->tag.blockNum >= firstDelBlock)
		{
			/*
			 * If there is I/O in progress, better wait till it's done;
			 * don't want to delete the relation out from under someone
			 * who's just trying to flush the buffer!
			 */
			if (bufHdr->flags & BM_IO_IN_PROGRESS)
			{
				/* WaitIO releases and reacquires BufMgrLock */
				WaitIO(bufHdr);

				/*
				 * By now, the buffer very possibly belongs to some other
				 * rel, so check again before proceeding.
				 */
				goto recheck;
			}

			/*
			 * There should be no pin on the buffer.
			 */
			if (bufHdr->refcount != 0)
				elog(ERROR, "block %u of %u/%u/%u is still referenced (private %d, global %u)",
					 bufHdr->tag.blockNum,
					 bufHdr->tag.rnode.spcNode,
					 bufHdr->tag.rnode.dbNode,
					 bufHdr->tag.rnode.relNode,
					 PrivateRefCount[i - 1], bufHdr->refcount);

			/* Now we can do what we came for */
			bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
			bufHdr->cntxDirty = false;

			/*
			 * And mark the buffer as no longer occupied by this rel.
			 */
			StrategyInvalidateBuffer(bufHdr);
		}
	}

	LWLockRelease(BufMgrLock);
}
 
1270
 
 
1271
/* ---------------------------------------------------------------------
 *		DropBuffers
 *
 *		This function removes all the buffers in the buffer cache for a
 *		particular database.  Dirty pages are simply dropped, without
 *		bothering to write them out first.      This is used when we destroy a
 *		database, to avoid trying to flush data to disk when the directory
 *		tree no longer exists.  Implementation is pretty similar to
 *		DropRelationBuffers() which is for destroying just one relation.
 * --------------------------------------------------------------------
 */
void
DropBuffers(Oid dbid)
{
	int			i;
	BufferDesc *bufHdr;

	LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);

	for (i = 1; i <= NBuffers; i++)
	{
		bufHdr = &BufferDescriptors[i - 1];
recheck:
		if (bufHdr->tag.rnode.dbNode == dbid)
		{
			/*
			 * If there is I/O in progress, better wait till it's done;
			 * don't want to delete the database out from under someone
			 * who's just trying to flush the buffer!
			 */
			if (bufHdr->flags & BM_IO_IN_PROGRESS)
			{
				/* WaitIO releases and reacquires BufMgrLock */
				WaitIO(bufHdr);

				/*
				 * By now, the buffer very possibly belongs to some other
				 * DB, so check again before proceeding.
				 */
				goto recheck;
			}
			/* Now we can do what we came for */
			bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
			bufHdr->cntxDirty = false;

			/*
			 * The thing should be free, if caller has checked that no
			 * backends are running in that database.
			 */
			Assert(bufHdr->refcount == 0);

			/*
			 * And mark the buffer as no longer occupied by this page.
			 */
			StrategyInvalidateBuffer(bufHdr);
		}
	}

	LWLockRelease(BufMgrLock);
}
 
1330
 
 
1331
/* -----------------------------------------------------------------
 *		PrintBufferDescs
 *
 *		this function prints all the buffer descriptors, for debugging
 *		use only.
 * -----------------------------------------------------------------
 */
#ifdef NOT_USED
void
PrintBufferDescs(void)
{
	int			i;

	if (IsUnderPostmaster)
	{
		/* shared-memory case: take the bufmgr lock and report via elog */
		LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
		for (i = 0; i < NBuffers; i++)
		{
			BufferDesc *buf = &BufferDescriptors[i];

			elog(LOG,
				 "[%02d] (freeNext=%d, freePrev=%d, rel=%u/%u/%u, "
				 "blockNum=%u, flags=0x%x, refcount=%u %d)",
				 i, buf->freeNext, buf->freePrev,
				 buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
				 buf->tag.rnode.relNode,
				 buf->tag.blockNum, buf->flags,
				 buf->refcount, PrivateRefCount[i]);
		}
		LWLockRelease(BufMgrLock);
	}
	else
	{
		/* interactive backend */
		for (i = 0; i < NBuffers; i++)
		{
			BufferDesc *buf = &BufferDescriptors[i];

			printf("[%-2d] (%u/%u/%u, %u) flags=0x%x, refcount=%u %d)\n",
				   i, buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
				   buf->tag.rnode.relNode, buf->tag.blockNum,
				   buf->flags, buf->refcount, PrivateRefCount[i]);
		}
	}
}
#endif
 
1374
 
 
1375
#ifdef NOT_USED
/*
 * PrintPinnedBufs
 *		Report every shared buffer this backend currently holds a pin on.
 *		Debugging use only.
 */
void
PrintPinnedBufs(void)
{
	int			i;

	LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
	for (i = 0; i < NBuffers; i++)
	{
		BufferDesc *buf = &BufferDescriptors[i];

		if (PrivateRefCount[i] > 0)
			elog(NOTICE,
				 "[%02d] (freeNext=%d, freePrev=%d, rel=%u/%u/%u, "
				 "blockNum=%u, flags=0x%x, refcount=%u %d)",
				 i, buf->freeNext, buf->freePrev,
				 buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
				 buf->tag.rnode.relNode,
				 buf->tag.blockNum, buf->flags,
				 buf->refcount, PrivateRefCount[i]);
	}
	LWLockRelease(BufMgrLock);
}
#endif
 
1398
 
 
1399
/* ---------------------------------------------------------------------
 *		FlushRelationBuffers
 *
 *		This function writes all dirty pages of a relation out to disk.
 *		Furthermore, pages that have blocknumber >= firstDelBlock are
 *		actually removed from the buffer pool.
 *
 *		This is called by DROP TABLE to clear buffers for the relation
 *		from the buffer pool.  Note that we must write dirty buffers,
 *		rather than just dropping the changes, because our transaction
 *		might abort later on; we want to roll back safely in that case.
 *
 *		This is also called by VACUUM before truncating the relation to the
 *		given number of blocks.  It might seem unnecessary for VACUUM to
 *		write dirty pages before firstDelBlock, since VACUUM should already
 *		have committed its changes.  However, it is possible for there still
 *		to be dirty pages: if some page had unwritten on-row tuple status
 *		updates from a prior transaction, and VACUUM had no additional
 *		changes to make to that page, then VACUUM won't have written it.
 *		This is harmless in most cases but will break pg_upgrade, which
 *		relies on VACUUM to ensure that *all* tuples have correct on-row
 *		status.  So, we check and flush all dirty pages of the rel
 *		regardless of block number.
 *
 *		In all cases, the caller should be holding AccessExclusiveLock on
 *		the target relation to ensure that no other backend is busy reading
 *		more blocks of the relation (or might do so before we commit).
 *		This should also ensure that no one is busy dirtying these blocks.
 *
 *		Formerly, we considered it an error condition if we found dirty
 *		buffers here.   However, since BufferSync no longer forces out all
 *		dirty buffers at every xact commit, it's possible for dirty buffers
 *		to still be present in the cache due to failure of an earlier
 *		transaction.  So, must flush dirty buffers without complaint.
 *
 *		XXX currently it sequentially searches the buffer pool, should be
 *		changed to more clever ways of searching.
 * --------------------------------------------------------------------
 */
void
FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
{
	int			i;
	BufferDesc *bufHdr;

	/* Open rel at the smgr level if not already done */
	RelationOpenSmgr(rel);

	if (rel->rd_istemp)
	{
		/*
		 * Temp relations use backend-local buffers, which no other backend
		 * can see, so we need no locking in this branch.
		 */
		for (i = 0; i < NLocBuffer; i++)
		{
			bufHdr = &LocalBufferDescriptors[i];
			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
			{
				if ((bufHdr->flags & BM_VALID) &&
					(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty))
				{
					ErrorContextCallback errcontext;

					/* Setup error traceback support for ereport() */
					errcontext.callback = buffer_write_error_callback;
					errcontext.arg = bufHdr;
					errcontext.previous = error_context_stack;
					error_context_stack = &errcontext;

					/* Write the dirty page out (last arg: isTemp = true) */
					smgrwrite(rel->rd_smgr,
							  bufHdr->tag.blockNum,
							  (char *) MAKE_PTR(bufHdr->data),
							  true);

					bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
					bufHdr->cntxDirty = false;

					/* Pop the error context stack */
					error_context_stack = errcontext.previous;
				}
				/* Caller's AccessExclusiveLock should preclude any pins */
				if (LocalRefCount[i] > 0)
					elog(ERROR, "FlushRelationBuffers(\"%s\" (local), %u): block %u is referenced (%d)",
						 RelationGetRelationName(rel), firstDelBlock,
						 bufHdr->tag.blockNum, LocalRefCount[i]);
				/* Invalidate buffers for blocks being truncated away */
				if (bufHdr->tag.blockNum >= firstDelBlock)
					bufHdr->tag.rnode.relNode = InvalidOid;
			}
		}

		return;
	}

	/* Make sure we can handle the pin inside the loop */
	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);

	LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);

	for (i = 0; i < NBuffers; i++)
	{
		bufHdr = &BufferDescriptors[i];
		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
		{
			if ((bufHdr->flags & BM_VALID) &&
				(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty))
			{
				/* Pin so the buffer can't be recycled while we write it */
				PinBuffer(bufHdr, true);
				/* Someone else might be flushing buffer */
				if (bufHdr->flags & BM_IO_IN_PROGRESS)
					WaitIO(bufHdr);
				/* Still dirty? */
				if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty)
				{
					StartBufferIO(bufHdr, false);

					FlushBuffer(bufHdr, rel->rd_smgr, false);

					TerminateBufferIO(bufHdr, 0);
				}
				UnpinBuffer(bufHdr, true);
				/* If it got re-dirtied, someone violated the caller's lock */
				if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty)
					elog(ERROR, "FlushRelationBuffers(\"%s\", %u): block %u was re-dirtied",
						 RelationGetRelationName(rel), firstDelBlock,
						 bufHdr->tag.blockNum);
			}
			/* No one should still hold a pin on this relation's pages */
			if (bufHdr->refcount != 0)
				elog(ERROR, "FlushRelationBuffers(\"%s\", %u): block %u is referenced (private %d, global %u)",
					 RelationGetRelationName(rel), firstDelBlock,
					 bufHdr->tag.blockNum,
					 PrivateRefCount[i], bufHdr->refcount);
			/* Remove buffers for blocks at or past the truncation point */
			if (bufHdr->tag.blockNum >= firstDelBlock)
				StrategyInvalidateBuffer(bufHdr);
		}
	}

	LWLockRelease(BufMgrLock);
}
 
1532
 
 
1533
/*
 
1534
 * ReleaseBuffer -- remove the pin on a buffer without
 
1535
 *              marking it dirty.
 
1536
 */
 
1537
void
 
1538
ReleaseBuffer(Buffer buffer)
 
1539
{
 
1540
        BufferDesc *bufHdr;
 
1541
 
 
1542
        if (!BufferIsValid(buffer))
 
1543
                elog(ERROR, "bad buffer id: %d", buffer);
 
1544
 
 
1545
        ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
 
1546
 
 
1547
        if (BufferIsLocal(buffer))
 
1548
        {
 
1549
                Assert(LocalRefCount[-buffer - 1] > 0);
 
1550
                LocalRefCount[-buffer - 1]--;
 
1551
                return;
 
1552
        }
 
1553
 
 
1554
        bufHdr = &BufferDescriptors[buffer - 1];
 
1555
 
 
1556
        Assert(PrivateRefCount[buffer - 1] > 0);
 
1557
 
 
1558
        if (PrivateRefCount[buffer - 1] > 1)
 
1559
                PrivateRefCount[buffer - 1]--;
 
1560
        else
 
1561
        {
 
1562
                LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
 
1563
                UnpinBuffer(bufHdr, false);
 
1564
                LWLockRelease(BufMgrLock);
 
1565
        }
 
1566
}
 
1567
 
 
1568
/*
 
1569
 * IncrBufferRefCount
 
1570
 *              Increment the pin count on a buffer that we have *already* pinned
 
1571
 *              at least once.
 
1572
 *
 
1573
 *              This function cannot be used on a buffer we do not have pinned,
 
1574
 *              because it doesn't change the shared buffer state.
 
1575
 */
 
1576
void
 
1577
IncrBufferRefCount(Buffer buffer)
 
1578
{
 
1579
        Assert(BufferIsPinned(buffer));
 
1580
        ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
 
1581
        ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
 
1582
        if (BufferIsLocal(buffer))
 
1583
                LocalRefCount[-buffer - 1]++;
 
1584
        else
 
1585
                PrivateRefCount[buffer - 1]++;
 
1586
}
 
1587
 
 
1588
#ifdef NOT_USED
/*
 * IncrBufferRefCount_Debug
 *		Same as IncrBufferRefCount, but when ShowPinTrace is set, also
 *		prints a pin-trace line to stderr for shared user-relation buffers.
 */
void
IncrBufferRefCount_Debug(char *file, int line, Buffer buffer)
{
	BufferDesc *buf;

	IncrBufferRefCount(buffer);

	/* Trace only shared buffers belonging to user relations */
	if (!ShowPinTrace || BufferIsLocal(buffer) || !is_userbuffer(buffer))
		return;

	buf = &BufferDescriptors[buffer - 1];
	fprintf(stderr,
			"PIN(Incr) %d rel = %u/%u/%u, blockNum = %u, "
			"refcount = %d, file: %s, line: %d\n",
			buffer,
			buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
			buf->tag.rnode.relNode, buf->tag.blockNum,
			PrivateRefCount[buffer - 1], file, line);
}
#endif
 
1607
 
 
1608
#ifdef NOT_USED
/*
 * ReleaseBuffer_Debug
 *		Same as ReleaseBuffer, but when ShowPinTrace is set, also prints
 *		an unpin-trace line to stderr for shared user-relation buffers.
 */
void
ReleaseBuffer_Debug(char *file, int line, Buffer buffer)
{
	BufferDesc *buf;

	ReleaseBuffer(buffer);

	/* Trace only shared buffers belonging to user relations */
	if (!ShowPinTrace || BufferIsLocal(buffer) || !is_userbuffer(buffer))
		return;

	buf = &BufferDescriptors[buffer - 1];
	fprintf(stderr,
			"UNPIN(Rel) %d rel = %u/%u/%u, blockNum = %u, "
			"refcount = %d, file: %s, line: %d\n",
			buffer,
			buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
			buf->tag.rnode.relNode, buf->tag.blockNum,
			PrivateRefCount[buffer - 1], file, line);
}
#endif
 
1627
 
 
1628
#ifdef NOT_USED
/*
 * ReleaseAndReadBuffer_Debug
 *		Same as ReleaseAndReadBuffer, but when ShowPinTrace is set, also
 *		prints unpin/pin trace lines to stderr for shared user-relation
 *		buffers.
 *
 * Bug fix: both trace guards previously tested BufferIsLocal() without
 * negation, unlike IncrBufferRefCount_Debug and ReleaseBuffer_Debug above,
 * which test !BufferIsLocal().  Local buffers have negative Buffer numbers,
 * so indexing BufferDescriptors[buffer - 1] for a local buffer would read
 * far outside the shared descriptor array.  Also, the second trace reports
 * the newly pinned buffer b (and indexes by b), so its guard must test b,
 * not the just-released buffer.
 */
Buffer
ReleaseAndReadBuffer_Debug(char *file,
						   int line,
						   Buffer buffer,
						   Relation relation,
						   BlockNumber blockNum)
{
	bool		bufferValid;
	Buffer		b;

	/* Capture validity before the old buffer number is released */
	bufferValid = BufferIsValid(buffer);
	b = ReleaseAndReadBuffer(buffer, relation, blockNum);
	if (ShowPinTrace && bufferValid && !BufferIsLocal(buffer)
		&& is_userbuffer(buffer))
	{
		BufferDesc *buf = &BufferDescriptors[buffer - 1];

		fprintf(stderr,
				"UNPIN(Rel&Rd) %d rel = %u/%u/%u, blockNum = %u, "
				"refcount = %d, file: %s, line: %d\n",
				buffer,
				buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
				buf->tag.rnode.relNode, buf->tag.blockNum,
				PrivateRefCount[buffer - 1], file, line);
	}
	if (ShowPinTrace && !BufferIsLocal(b) && is_userbuffer(b))
	{
		BufferDesc *buf = &BufferDescriptors[b - 1];

		fprintf(stderr,
				"PIN(Rel&Rd) %d rel = %u/%u/%u, blockNum = %u, "
				"refcount = %d, file: %s, line: %d\n",
				b,
				buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
				buf->tag.rnode.relNode, buf->tag.blockNum,
				PrivateRefCount[b - 1], file, line);
	}
	return b;
}
#endif
 
1669
 
 
1670
/*
 
1671
 * SetBufferCommitInfoNeedsSave
 
1672
 *
 
1673
 *      Mark a buffer dirty when we have updated tuple commit-status bits in it.
 
1674
 *
 
1675
 * This is essentially the same as WriteNoReleaseBuffer.  We preserve the
 
1676
 * distinction as a way of documenting that the caller has not made a critical
 
1677
 * data change --- the status-bit update could be redone by someone else just
 
1678
 * as easily.  Therefore, no WAL log record need be generated, whereas calls
 
1679
 * to WriteNoReleaseBuffer really ought to be associated with a WAL-entry-
 
1680
 * creating action.
 
1681
 *
 
1682
 * This routine might get called many times on the same page, if we are making
 
1683
 * the first scan after commit of an xact that added/deleted many tuples.
 
1684
 * So, be as quick as we can if the buffer is already dirty.  We do this by
 
1685
 * not acquiring BufMgrLock if it looks like the status bits are already OK.
 
1686
 * (Note it is okay if someone else clears BM_JUST_DIRTIED immediately after
 
1687
 * we look, because the buffer content update is already done and will be
 
1688
 * reflected in the I/O.)
 
1689
 */
 
1690
void
 
1691
SetBufferCommitInfoNeedsSave(Buffer buffer)
 
1692
{
 
1693
        BufferDesc *bufHdr;
 
1694
 
 
1695
        if (!BufferIsValid(buffer))
 
1696
                elog(ERROR, "bad buffer id: %d", buffer);
 
1697
 
 
1698
        if (BufferIsLocal(buffer))
 
1699
        {
 
1700
                WriteLocalBuffer(buffer, false);
 
1701
                return;
 
1702
        }
 
1703
 
 
1704
        bufHdr = &BufferDescriptors[buffer - 1];
 
1705
 
 
1706
        if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
 
1707
                (BM_DIRTY | BM_JUST_DIRTIED))
 
1708
        {
 
1709
                LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
 
1710
                Assert(bufHdr->refcount > 0);
 
1711
                bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
 
1712
                LWLockRelease(BufMgrLock);
 
1713
        }
 
1714
}
 
1715
 
 
1716
/*
 
1717
 * Release buffer context locks for shared buffers.
 
1718
 *
 
1719
 * Used to clean up after errors.
 
1720
 *
 
1721
 * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
 
1722
 * of releasing buffer context locks per se; the only thing we need to deal
 
1723
 * with here is clearing any PIN_COUNT request that was in progress.
 
1724
 */
 
1725
void
 
1726
UnlockBuffers(void)
 
1727
{
 
1728
        BufferDesc *buf = PinCountWaitBuf;
 
1729
 
 
1730
        if (buf)
 
1731
        {
 
1732
                HOLD_INTERRUPTS();              /* don't want to die() partway through... */
 
1733
 
 
1734
                LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
 
1735
 
 
1736
                /*
 
1737
                 * Don't complain if flag bit not set; it could have been
 
1738
                 * reset but we got a cancel/die interrupt before getting the
 
1739
                 * signal.
 
1740
                 */
 
1741
                if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
 
1742
                        buf->wait_backend_id == MyBackendId)
 
1743
                        buf->flags &= ~BM_PIN_COUNT_WAITER;
 
1744
                LWLockRelease(BufMgrLock);
 
1745
 
 
1746
                ProcCancelWaitForSignal();
 
1747
 
 
1748
                RESUME_INTERRUPTS();
 
1749
        }
 
1750
 
 
1751
        PinCountWaitBuf = NULL;
 
1752
}
 
1753
 
 
1754
/*
 
1755
 * Acquire or release the cntx_lock for the buffer.
 
1756
 */
 
1757
void
 
1758
LockBuffer(Buffer buffer, int mode)
 
1759
{
 
1760
        BufferDesc *buf;
 
1761
 
 
1762
        Assert(BufferIsValid(buffer));
 
1763
        if (BufferIsLocal(buffer))
 
1764
                return;
 
1765
 
 
1766
        buf = &(BufferDescriptors[buffer - 1]);
 
1767
 
 
1768
        if (mode == BUFFER_LOCK_UNLOCK)
 
1769
                LWLockRelease(buf->cntx_lock);
 
1770
        else if (mode == BUFFER_LOCK_SHARE)
 
1771
                LWLockAcquire(buf->cntx_lock, LW_SHARED);
 
1772
        else if (mode == BUFFER_LOCK_EXCLUSIVE)
 
1773
        {
 
1774
                LWLockAcquire(buf->cntx_lock, LW_EXCLUSIVE);
 
1775
 
 
1776
                /*
 
1777
                 * This is not the best place to set cntxDirty flag (eg indices do
 
1778
                 * not always change buffer they lock in excl mode). But please
 
1779
                 * remember that it's critical to set cntxDirty *before* logging
 
1780
                 * changes with XLogInsert() - see comments in BufferSync().
 
1781
                 */
 
1782
                buf->cntxDirty = true;
 
1783
        }
 
1784
        else
 
1785
                elog(ERROR, "unrecognized buffer lock mode: %d", mode);
 
1786
}
 
1787
 
 
1788
/*
 
1789
 * Acquire the cntx_lock for the buffer, but only if we don't have to wait.
 
1790
 *
 
1791
 * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
 
1792
 */
 
1793
bool
 
1794
ConditionalLockBuffer(Buffer buffer)
 
1795
{
 
1796
        BufferDesc *buf;
 
1797
 
 
1798
        Assert(BufferIsValid(buffer));
 
1799
        if (BufferIsLocal(buffer))
 
1800
                return true;                    /* act as though we got it */
 
1801
 
 
1802
        buf = &(BufferDescriptors[buffer - 1]);
 
1803
 
 
1804
        if (LWLockConditionalAcquire(buf->cntx_lock, LW_EXCLUSIVE))
 
1805
        {
 
1806
                /*
 
1807
                 * This is not the best place to set cntxDirty flag (eg indices do
 
1808
                 * not always change buffer they lock in excl mode). But please
 
1809
                 * remember that it's critical to set cntxDirty *before* logging
 
1810
                 * changes with XLogInsert() - see comments in BufferSync().
 
1811
                 */
 
1812
                buf->cntxDirty = true;
 
1813
 
 
1814
                return true;
 
1815
        }
 
1816
        return false;
 
1817
}
 
1818
 
 
1819
/*
 * LockBufferForCleanup - lock a buffer in preparation for deleting items
 *
 * Items may be deleted from a disk page only when the caller (a) holds an
 * exclusive lock on the buffer and (b) has observed that no other backend
 * holds a pin on the buffer.  If there is a pin, then the other backend
 * might have a pointer into the buffer (for example, a heapscan reference
 * to an item --- see README for more details).  It's OK if a pin is added
 * after the cleanup starts, however; the newly-arrived backend will be
 * unable to look at the page until we release the exclusive lock.
 *
 * To implement this protocol, a would-be deleter must pin the buffer and
 * then call LockBufferForCleanup().  LockBufferForCleanup() is similar to
 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
 * it has successfully observed pin count = 1.
 */
void
LockBufferForCleanup(Buffer buffer)
{
	BufferDesc *bufHdr;

	Assert(BufferIsValid(buffer));
	Assert(PinCountWaitBuf == NULL);	/* we support one wait at a time */

	if (BufferIsLocal(buffer))
	{
		/* There should be exactly one pin */
		if (LocalRefCount[-buffer - 1] != 1)
			elog(ERROR, "incorrect local pin count: %d",
				 LocalRefCount[-buffer - 1]);
		/* Nobody else to wait for */
		return;
	}

	/* There should be exactly one local pin */
	if (PrivateRefCount[buffer - 1] != 1)
		elog(ERROR, "incorrect local pin count: %d",
			 PrivateRefCount[buffer - 1]);

	bufHdr = &BufferDescriptors[buffer - 1];

	for (;;)
	{
		/* Try to acquire lock */
		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
		Assert(bufHdr->refcount > 0);
		if (bufHdr->refcount == 1)
		{
			/* Successfully acquired exclusive lock with pincount 1 */
			LWLockRelease(BufMgrLock);
			return;
		}
		/* Failed, so mark myself as waiting for pincount 1 */
		if (bufHdr->flags & BM_PIN_COUNT_WAITER)
		{
			/* Only one waiter per buffer is supported */
			LWLockRelease(BufMgrLock);
			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
			elog(ERROR, "multiple backends attempting to wait for pincount 1");
		}
		bufHdr->wait_backend_id = MyBackendId;
		bufHdr->flags |= BM_PIN_COUNT_WAITER;
		PinCountWaitBuf = bufHdr;
		LWLockRelease(BufMgrLock);
		/* Drop the content lock before sleeping, so others can unpin */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		/* Wait to be signaled by UnpinBuffer() */
		ProcWaitForSignal();
		PinCountWaitBuf = NULL;
		/* Loop back and try again */
	}
}
 
1890
 
 
1891
/*
 
1892
 *      Functions for IO error handling
 
1893
 *
 
1894
 *      Note: We assume that nested buffer IO never occurs.
 
1895
 *      i.e at most one io_in_progress lock is held per proc.
 
1896
 */
 
1897
 
 
1898
/*
 * Function:StartBufferIO
 *	(Assumptions)
 *	My process is executing no IO
 *	BufMgrLock is held
 *	BM_IO_IN_PROGRESS mask is not set for the buffer
 *	The buffer is Pinned
 *
 * Because BufMgrLock is held, we are already in an interrupt holdoff here,
 * and do not need another.
 */
static void
StartBufferIO(BufferDesc *buf, bool forInput)
{
	Assert(!InProgressBuf);
	Assert(!(buf->flags & BM_IO_IN_PROGRESS));
	/* Flag the buffer busy while we still hold BufMgrLock (see above) */
	buf->flags |= BM_IO_IN_PROGRESS;

	LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);

	/* Record our in-progress I/O for TerminateBufferIO/AbortBufferIO */
	InProgressBuf = buf;
	IsForInput = forInput;
}
 
1921
 
 
1922
/*
 * Function:TerminateBufferIO
 *	(Assumptions)
 *	My process is executing IO for the buffer
 *	BufMgrLock is held
 *	BM_IO_IN_PROGRESS mask is set for the buffer
 *	The buffer is Pinned
 *
 * err_flag must be 0 for successful completion and BM_IO_ERROR for failure.
 *
 * Because BufMgrLock is held, we are already in an interrupt holdoff here,
 * and do not need another.
 */
static void
TerminateBufferIO(BufferDesc *buf, int err_flag)
{
	Assert(buf == InProgressBuf);
	Assert(buf->flags & BM_IO_IN_PROGRESS);
	/* Clear in-progress state and any stale error, then record outcome */
	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
	buf->flags |= err_flag;

	/* Releasing the lock lets anyone in WaitIO on this buffer proceed */
	LWLockRelease(buf->io_in_progress_lock);

	InProgressBuf = NULL;
}
 
1947
 
 
1948
/*
 * Function:ContinueBufferIO
 *	(Assumptions)
 *	My process is executing IO for the buffer
 *	BufMgrLock is held
 *	The buffer is Pinned
 *
 * Because BufMgrLock is held, we are already in an interrupt holdoff here,
 * and do not need another.
 */
static void
ContinueBufferIO(BufferDesc *buf, bool forInput)
{
	Assert(buf == InProgressBuf);
	Assert(buf->flags & BM_IO_IN_PROGRESS);
	/* Only the I/O direction changes; the buffer stays busy */
	IsForInput = forInput;
}
 
1965
 
 
1966
#ifdef NOT_USED
/* Reset the per-backend I/O-in-progress bookkeeping. */
void
InitBufferIO(void)
{
	InProgressBuf = NULL;
}
#endif
 
1973
 
 
1974
/*
 *	Clean up any active buffer I/O after an error.
 *	BufMgrLock isn't held when this function is called,
 *	but we haven't yet released buffer pins, so the buffer is still pinned.
 *
 *	If I/O was in progress, we always set BM_IO_ERROR.
 */
void
AbortBufferIO(void)
{
	BufferDesc *buf = InProgressBuf;

	if (buf)
	{
		/*
		 * Since LWLockReleaseAll has already been called, we're not
		 * holding the buffer's io_in_progress_lock. We have to re-acquire
		 * it so that we can use TerminateBufferIO. Anyone who's executing
		 * WaitIO on the buffer will be in a busy spin until we succeed in
		 * doing this.
		 */
		LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);

		LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
		Assert(buf->flags & BM_IO_IN_PROGRESS);
		if (IsForInput)
		{
			/* A failed read can't have left the page dirty */
			Assert(!(buf->flags & BM_DIRTY || buf->cntxDirty));
			/* We'd better not think buffer is valid yet */
			Assert(!(buf->flags & BM_VALID));
		}
		else
		{
			Assert(buf->flags & BM_DIRTY || buf->cntxDirty);
			/* Issue notice if this is not the first failure... */
			if (buf->flags & BM_IO_ERROR)
			{
				ereport(WARNING,
						(errcode(ERRCODE_IO_ERROR),
						 errmsg("could not write block %u of %u/%u/%u",
								buf->tag.blockNum,
								buf->tag.rnode.spcNode,
								buf->tag.rnode.dbNode,
								buf->tag.rnode.relNode),
						 errdetail("Multiple failures --- write error may be permanent.")));
			}
			/* Keep the page marked dirty so a later write is retried */
			buf->flags |= BM_DIRTY;
		}
		/* Clears BM_IO_IN_PROGRESS and records the error flag */
		TerminateBufferIO(buf, BM_IO_ERROR);
		LWLockRelease(BufMgrLock);
	}
}
 
2026
 
 
2027
/*
 
2028
 * Error context callback for errors occurring during buffer writes.
 
2029
 */
 
2030
static void
 
2031
buffer_write_error_callback(void *arg)
 
2032
{
 
2033
        BufferDesc *bufHdr = (BufferDesc *) arg;
 
2034
 
 
2035
        if (bufHdr != NULL)
 
2036
                errcontext("writing block %u of relation %u/%u/%u",
 
2037
                                   bufHdr->tag.blockNum,
 
2038
                                   bufHdr->tag.rnode.spcNode,
 
2039
                                   bufHdr->tag.rnode.dbNode,
 
2040
                                   bufHdr->tag.rnode.relNode);
 
2041
}