1
/*-------------------------------------------------------------------------
5
* PostgreSQL transaction log manager utility routines
7
* This file contains support routines that are used by XLOG replay functions.
8
* None of this code is used during normal system operation.
11
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
12
* Portions Copyright (c) 1994, Regents of the University of California
14
* src/backend/access/transam/xlogutils.c
16
*-------------------------------------------------------------------------
20
#include "access/xlogutils.h"
21
#include "catalog/catalog.h"
22
#include "storage/bufmgr.h"
23
#include "storage/smgr.h"
24
#include "utils/guc.h"
25
#include "utils/hsearch.h"
26
#include "utils/rel.h"
30
* During XLOG replay, we may see XLOG records for incremental updates of
31
* pages that no longer exist, because their relation was later dropped or
32
* truncated. (Note: this is only possible when full_page_writes = OFF,
33
* since when it's ON, the first reference we see to a page should always
34
* be a full-page rewrite not an incremental update.) Rather than simply
35
* ignoring such records, we make a note of the referenced page, and then
36
* complain if we don't actually see a drop or truncate covering the page
39
/*
 * Hash key identifying a single invalid page: the relation, the fork within
 * that relation, and the block number.  It is embedded as the first member
 * of xl_invalid_page and its size is used as the hash table's key size.
 * log_invalid_page notes that the struct is currently assumed to contain
 * no padding.
 */
typedef struct xl_invalid_page_key
41
RelFileNode node; /* the relation */
42
ForkNumber forkno; /* the fork number */
43
BlockNumber blkno; /* the page */
44
} xl_invalid_page_key;
46
/*
 * One entry in the invalid-page hash table.  The key has to be the first
 * member.  "present" distinguishes the two kinds of invalid reference made
 * by XLogReadBufferExtended: true when the page was read but failed the
 * initialization check, false when the block lies beyond the end of the
 * file.
 *
 * NOTE(review): the closing line of this typedef is not visible in this
 * listing.
 */
typedef struct xl_invalid_page
48
xl_invalid_page_key key; /* hash key ... must be first */
49
bool present; /* page existed but contained zeroes */
52
/*
 * Hash table of invalid page references seen so far.  It stays NULL until
 * log_invalid_page creates it on first use, and XLogCheckInvalidPages sets
 * it back to NULL after destroying the table.
 */
static HTAB *invalid_page_tab = NULL;
55
/*
 * Log a reference to an invalid page.
 *
 * Emits a DEBUG1 trace message when the message-level settings allow it,
 * creates the invalid-page hash table if it does not exist yet, and then
 * enters the page's key into the table with HASH_ENTER.
 *
 * node/forkno/blkno identify the page.  "present" is stored in the entry;
 * callers in this file pass true for a page that exists but is
 * uninitialized and false for a page beyond the end of the file.
 *
 * NOTE(review): several lines of this function are not visible in this
 * listing (the end of the parameter list, some local declarations, the
 * conditions selecting between the paired statements below, and the code
 * that fills in "key").  Confirm details against the complete file.
 */
57
log_invalid_page(RelFileNode node, ForkNumber forkno, BlockNumber blkno,
60
xl_invalid_page_key key;
61
xl_invalid_page *hentry;
65
* Log references to invalid pages at DEBUG1 level. This allows some
66
* tracing of the cause (note the elog context mechanism will tell us
67
* something about the XLOG record that generated the reference).
69
/* Only do the tracing work if a DEBUG1 message could be emitted. */
if (log_min_messages <= DEBUG1 || client_min_messages <= DEBUG1)
71
char *path = relpathperm(node, forkno);
74
/*
 * Two alternative messages; presumably chosen by "present" (the selecting
 * condition is not visible here).
 */
elog(DEBUG1, "page %u of relation %s is uninitialized",
77
elog(DEBUG1, "page %u of relation %s does not exist",
82
if (invalid_page_tab == NULL)
84
/* create hash table when first needed */
87
memset(&ctl, 0, sizeof(ctl));
88
ctl.keysize = sizeof(xl_invalid_page_key);
89
ctl.entrysize = sizeof(xl_invalid_page);
92
invalid_page_tab = hash_create("XLOG invalid-page table",
95
HASH_ELEM | HASH_FUNCTION);
98
/* we currently assume xl_invalid_page_key contains no padding */
102
/* Find or create the entry; "found" reports whether it already existed. */
hentry = (xl_invalid_page *)
103
hash_search(invalid_page_tab, (void *) &key, HASH_ENTER, &found);
107
/* hash_search already filled in the key */
108
hentry->present = present;
112
/* repeat reference ... leave "present" as it was */
116
/*
 * Forget any invalid pages >= minblkno, because they've been dropped.
 *
 * Walks the entire invalid-page table and removes each entry whose relation
 * equals "node", whose fork equals "forkno", and whose block number is at
 * least "minblkno".  Returns immediately if the table was never created.
 * A DEBUG2 message is emitted for each removed page when the message-level
 * settings allow it.
 *
 * NOTE(review): braces and possibly other lines are not visible in this
 * listing.
 */
118
forget_invalid_pages(RelFileNode node, ForkNumber forkno, BlockNumber minblkno)
120
HASH_SEQ_STATUS status;
121
xl_invalid_page *hentry;
123
if (invalid_page_tab == NULL)
124
return; /* nothing to do */
126
hash_seq_init(&status, invalid_page_tab);
128
while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
130
/* Match on relation, fork, and block number at or beyond the cutoff. */
if (RelFileNodeEquals(hentry->key.node, node) &&
131
hentry->key.forkno == forkno &&
132
hentry->key.blkno >= minblkno)
134
if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2)
136
char *path = relpathperm(hentry->key.node, forkno);
138
elog(DEBUG2, "page %u of relation %s has been dropped",
139
hentry->key.blkno, path);
143
/* The entry came from the scan, so failing to remove it is an error. */
if (hash_search(invalid_page_tab,
144
(void *) &hentry->key,
145
HASH_REMOVE, NULL) == NULL)
146
elog(ERROR, "hash table corrupted");
151
/*
 * Forget any invalid pages in a whole database.
 *
 * Walks the entire invalid-page table and removes each entry whose
 * relation's dbNode equals "dbid", regardless of relation, fork or block
 * number.  Returns immediately if the table was never created.  A DEBUG2
 * message is emitted for each removed page when the message-level settings
 * allow it.
 *
 * NOTE(review): braces and possibly other lines are not visible in this
 * listing.
 */
153
forget_invalid_pages_db(Oid dbid)
155
HASH_SEQ_STATUS status;
156
xl_invalid_page *hentry;
158
if (invalid_page_tab == NULL)
159
return; /* nothing to do */
161
hash_seq_init(&status, invalid_page_tab);
163
while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
165
if (hentry->key.node.dbNode == dbid)
167
if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2)
169
char *path = relpathperm(hentry->key.node, hentry->key.forkno);
171
elog(DEBUG2, "page %u of relation %s has been dropped",
172
hentry->key.blkno, path);
176
/* The entry came from the scan, so failing to remove it is an error. */
if (hash_search(invalid_page_tab,
177
(void *) &hentry->key,
178
HASH_REMOVE, NULL) == NULL)
179
elog(ERROR, "hash table corrupted");
184
/*
 * Complain about any remaining invalid-page entries.
 *
 * Returns immediately if the invalid-page table was never created.
 * Otherwise it scans the table, emitting one WARNING per entry, then raises
 * a PANIC, and finally destroys the table and resets invalid_page_tab to
 * NULL.
 *
 * NOTE(review): the condition choosing between the two WARNING texts
 * (presumably hentry->present), the place where "foundone" is set, and the
 * condition guarding the PANIC (presumably "foundone") are not visible in
 * this listing.  Confirm against the complete file.
 */
186
XLogCheckInvalidPages(void)
188
HASH_SEQ_STATUS status;
189
xl_invalid_page *hentry;
190
bool foundone = false;
192
if (invalid_page_tab == NULL)
193
return; /* nothing to do */
195
hash_seq_init(&status, invalid_page_tab);
198
* Our strategy is to emit WARNING messages for all remaining entries and
199
* only PANIC after we've dumped all the available info.
201
while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
203
char *path = relpathperm(hentry->key.node, hentry->key.forkno);
206
elog(WARNING, "page %u of relation %s was uninitialized",
207
hentry->key.blkno, path);
209
elog(WARNING, "page %u of relation %s did not exist",
210
hentry->key.blkno, path);
216
elog(PANIC, "WAL contains references to invalid pages");
218
/* Drop the table and clear the pointer so a later call is a no-op. */
hash_destroy(invalid_page_tab);
219
invalid_page_tab = NULL;
224
* Read a page during XLOG replay.
226
* This is shorthand for XLogReadBufferExtended() followed by
227
* LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), for reading from the main
230
* (Getting the buffer lock is not really necessary during single-process
231
* crash recovery, but some subroutines such as MarkBufferDirty will complain
232
* if we don't have the lock. In hot standby mode it's definitely necessary.)
234
* The returned buffer is exclusively-locked.
236
* For historical reasons, instead of a ReadBufferMode argument, this only
237
* supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes.
240
/*
 * Reads block "blkno" of the main fork of "rnode" through
 * XLogReadBufferExtended, using RBM_ZERO when "init" is true and RBM_NORMAL
 * otherwise, and takes an exclusive lock on the buffer only if a valid
 * buffer came back.
 *
 * NOTE(review): the return type, the declaration of "buf" and the return
 * statement are not visible in this listing.
 */
XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
244
buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
245
init ? RBM_ZERO : RBM_NORMAL);
246
/* XLogReadBufferExtended can return InvalidBuffer; do not lock that. */
if (BufferIsValid(buf))
247
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
253
* XLogReadBufferExtended
254
* Read a page during XLOG replay
256
* This is functionally comparable to ReadBufferExtended. There are some
257
* differences in the behavior wrt. the "mode" argument:
259
* In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we
260
* return InvalidBuffer. In this case the caller should silently skip the
261
* update on this page. (In this situation, we expect that the page was later
262
* dropped or truncated. If we don't see evidence of that later in the WAL
263
* sequence, we'll complain at the end of WAL replay.)
265
* In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
266
* relation is extended with all-zeroes pages up to the given block number.
269
/*
 * Summary of the control flow visible below:
 *   - open the relation at smgr level and call smgrcreate() for the fork;
 *   - get the fork's current block count into "lastblock";
 *   - if blkno is below lastblock, read the page directly;
 *   - otherwise, in RBM_NORMAL mode, record the page with present = false
 *     and return InvalidBuffer;
 *   - otherwise read repeatedly while blkno >= lastblock, releasing each
 *     previously obtained buffer before the next read;
 *   - finally, in RBM_NORMAL mode, a page failing the initialization check
 *     is released, recorded with present = true, and InvalidBuffer is
 *     returned.
 *
 * NOTE(review): many lines are not visible in this listing, including the
 * return type, the declarations of "buffer" and "smgr", the trailing
 * arguments of both ReadBufferWithoutRelcache calls, whatever advances
 * "lastblock" inside the loop, the initialization test itself (the comment
 * mentions PageIsNew), and the normal return.  Confirm against the complete
 * file.
 */
XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
270
BlockNumber blkno, ReadBufferMode mode)
272
BlockNumber lastblock;
276
/* A concrete block number is required; P_NEW is not accepted here. */
Assert(blkno != P_NEW);
278
/* Open the relation at smgr level */
279
smgr = smgropen(rnode, InvalidBackendId);
282
* Create the target file if it doesn't already exist. This lets us cope
283
* if the replay sequence contains writes to a relation that is later
284
* deleted. (The original coding of this routine would instead suppress
285
* the writes, but that seems like it risks losing valuable data if the
286
* filesystem loses an inode during a crash. Better to write the data
287
* until we are actually told to delete the file.)
289
smgrcreate(smgr, forknum, true);
291
lastblock = smgrnblocks(smgr, forknum);
293
if (blkno < lastblock)
295
/* page exists in file */
296
buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
301
/* hm, page doesn't exist in file */
302
if (mode == RBM_NORMAL)
304
/* present = false: the block is beyond the end of the file */
log_invalid_page(rnode, forknum, blkno, false);
305
return InvalidBuffer;
307
/* OK to extend the file */
308
/* we do this in recovery only - no rel-extension lock needed */
310
buffer = InvalidBuffer;
311
while (blkno >= lastblock)
313
/* Keep only the most recently read buffer pinned. */
if (buffer != InvalidBuffer)
314
ReleaseBuffer(buffer);
315
buffer = ReadBufferWithoutRelcache(rnode, forknum,
319
Assert(BufferGetBlockNumber(buffer) == blkno);
322
if (mode == RBM_NORMAL)
324
/* check that page has been initialized */
325
Page page = (Page) BufferGetPage(buffer);
328
* We assume that PageIsNew is safe without a lock. During recovery,
329
* there should be no other backends that could modify the buffer at
334
ReleaseBuffer(buffer);
335
/* present = true: the page exists but is not initialized */
log_invalid_page(rnode, forknum, blkno, true);
336
return InvalidBuffer;
345
* Struct actually returned by XLogFakeRelcacheEntry, though the declared
346
* return type is Relation.
350
RelationData reldata; /* Note: this must be first */
351
FormData_pg_class pgc; /* storage that rd_rel is pointed at by CreateFakeRelcacheEntry */
352
} FakeRelCacheEntryData;
354
/* Pointer form used by CreateFakeRelcacheEntry for its local entry variable. */
typedef FakeRelCacheEntryData *FakeRelCacheEntry;
357
* Create a fake relation cache entry for a physical relation
359
* It's often convenient to use the same functions in XLOG replay as in the
360
* main codepath, but those functions typically work with a relcache entry.
361
* We don't have a working relation cache during XLOG replay, but this
362
* function can be used to create a fake relcache entry instead. Only the
363
* fields related to physical storage, like rd_rel, are initialized, so the
364
* fake entry is only usable in low-level operations like ReadBuffer().
366
* Caller must free the returned entry with FreeFakeRelcacheEntry().
369
/*
 * Builds the fake entry in a single zero-filled allocation of
 * FakeRelCacheEntryData and fills in rd_rel, rd_node, rd_backend, the
 * relation name (the relfilenode printed as a decimal number) and the
 * lockRelId fields from "rnode".
 *
 * NOTE(review): the return type, the declaration of "rel" and the return
 * statement are not visible in this listing.
 */
CreateFakeRelcacheEntry(RelFileNode rnode)
371
FakeRelCacheEntry fakeentry;
374
/* Allocate the Relation struct and all related space in one block. */
375
fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
376
/* The RelationData member comes first in the struct, so this cast works. */
rel = (Relation) fakeentry;
378
/* rd_rel points at the pg_class storage inside the same allocation. */
rel->rd_rel = &fakeentry->pgc;
379
rel->rd_node = rnode;
380
/* We will never be working with temp rels during recovery */
381
rel->rd_backend = InvalidBackendId;
383
/* We don't know the name of the relation; use relfilenode instead */
/* NOTE(review): unbounded sprintf; presumably the name buffer is large enough for "%u" -- confirm */
384
sprintf(RelationGetRelationName(rel), "%u", rnode.relNode);
387
* We set up the lockRelId in case anything tries to lock the dummy
388
* relation. Note that this is fairly bogus since relNode may be
389
* different from the relation's OID. It shouldn't really matter though,
390
* since we are presumably running by ourselves and can't have any lock
393
rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode;
394
rel->rd_lockInfo.lockRelId.relId = rnode.relNode;
402
* Free a fake relation cache entry.
405
FreeFakeRelcacheEntry(Relation fakerel)
411
* Drop a relation during XLOG replay
413
* This is called when the relation is about to be deleted; we need to remove
414
* any open "invalid-page" records for the relation.
417
/*
 * Removes every invalid-page entry for the given fork of "rnode" by calling
 * forget_invalid_pages with a minimum block number of 0, which matches all
 * blocks.
 */
XLogDropRelation(RelFileNode rnode, ForkNumber forknum)
419
forget_invalid_pages(rnode, forknum, 0);
423
* Drop a whole database during XLOG replay
425
* As above, but for DROP DATABASE instead of dropping a single rel
428
/*
 * Removes all invalid-page entries belonging to database "dbid" via
 * forget_invalid_pages_db.
 *
 * NOTE(review): the comment text below discusses closing SMgrRelation
 * objects, but the statement it describes is not visible in this listing.
 */
XLogDropDatabase(Oid dbid)
431
* This is unnecessarily heavy-handed, as it will close SMgrRelation
432
* objects for other databases as well. DROP DATABASE occurs seldom enough
433
* that it's not worth introducing a variant of smgrclose for just this
434
* purpose. XXX: Or should we rather leave the smgr entries dangling?
438
forget_invalid_pages_db(dbid);
442
* Truncate a relation during XLOG replay
444
* We need to clean up any open "invalid-page" records for the dropped pages.
447
/*
 * Removes invalid-page entries for the given fork of "rnode" whose block
 * number is at or beyond "nblocks", by passing nblocks as the minimum block
 * number to forget_invalid_pages.
 *
 * NOTE(review): the rest of the parameter list is not visible in this
 * listing; "nblocks" is presumably the remaining parameter.
 */
XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
450
forget_invalid_pages(rnode, forkNum, nblocks);