~vcs-imports/mammoth-replicator/trunk

« back to all changes in this revision

Viewing changes to src/backend/access/transam/xlog.c

  • Committer: alvherre
  • Date: 2005-12-16 21:24:52 UTC
  • Revision ID: svn-v4:db760fc0-0f08-0410-9d63-cc6633f64896:trunk:1
Initial import of the REL8_0_3 sources from the Pgsql CVS repository.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*-------------------------------------------------------------------------
 
2
 *
 
3
 * xlog.c
 
4
 *              PostgreSQL transaction log manager
 
5
 *
 
6
 *
 
7
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 
8
 * Portions Copyright (c) 1994, Regents of the University of California
 
9
 *
 
10
 * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.180.4.2 2005-04-15 22:49:45 momjian Exp $
 
11
 *
 
12
 *-------------------------------------------------------------------------
 
13
 */
 
14
 
 
15
#include "postgres.h"
 
16
 
 
17
#include <ctype.h>
 
18
#include <fcntl.h>
 
19
#include <signal.h>
 
20
#include <time.h>
 
21
#include <unistd.h>
 
22
#include <sys/stat.h>
 
23
#include <sys/time.h>
 
24
 
 
25
#include "access/clog.h"
 
26
#include "access/subtrans.h"
 
27
#include "access/xact.h"
 
28
#include "access/xlog.h"
 
29
#include "access/xlog_internal.h"
 
30
#include "access/xlogutils.h"
 
31
#include "catalog/catversion.h"
 
32
#include "catalog/pg_control.h"
 
33
#include "miscadmin.h"
 
34
#include "postmaster/bgwriter.h"
 
35
#include "storage/bufpage.h"
 
36
#include "storage/fd.h"
 
37
#include "storage/lwlock.h"
 
38
#include "storage/pmsignal.h"
 
39
#include "storage/proc.h"
 
40
#include "storage/sinval.h"
 
41
#include "storage/spin.h"
 
42
#include "utils/builtins.h"
 
43
#include "utils/guc.h"
 
44
#include "utils/relcache.h"
 
45
 
 
46
 
 
47
/*
 
48
 * This chunk of hackery attempts to determine which file sync methods
 
49
 * are available on the current platform, and to choose an appropriate
 
50
 * default method.  We assume that fsync() is always available, and that
 
51
 * configure determined whether fdatasync() is.
 
52
 */
 
53
#define SYNC_METHOD_FSYNC               0
 
54
#define SYNC_METHOD_FDATASYNC   1
 
55
#define SYNC_METHOD_OPEN                2               /* used for both O_SYNC and
 
56
                                                                                 * O_DSYNC */
 
57
 
 
58
/*
 * Pick the open() flag used for "open_sync": O_SYNC when the platform
 * defines it, otherwise O_FSYNC when that is defined.  If neither exists,
 * OPEN_SYNC_FLAG is left undefined.
 */
#if defined(O_SYNC)
 
59
#define OPEN_SYNC_FLAG     O_SYNC
 
60
#else /* !defined(O_SYNC) */
 
61
#if defined(O_FSYNC)
 
62
#define OPEN_SYNC_FLAG    O_FSYNC
 
63
#endif /* O_FSYNC */
 
64
#endif /* O_SYNC */
 
65
 
 
66
/*
 * Pick the open() flag used for "open_datasync".  OPEN_DATASYNC_FLAG is
 * defined only when O_DSYNC exists and is not merely another spelling of
 * OPEN_SYNC_FLAG (or when there is no OPEN_SYNC_FLAG at all).
 */
#if defined(O_DSYNC)
 
67
#if defined(OPEN_SYNC_FLAG)
 
68
#if O_DSYNC != OPEN_SYNC_FLAG
 
69
#define OPEN_DATASYNC_FLAG        O_DSYNC
 
70
#endif /* O_DSYNC != OPEN_SYNC_FLAG */
 
71
#else /* !defined(OPEN_SYNC_FLAG) */
 
72
/* Win32 only has O_DSYNC */
 
73
#define OPEN_DATASYNC_FLAG        O_DSYNC
 
74
#endif /* OPEN_SYNC_FLAG */
 
75
#endif /* O_DSYNC */
 
76
 
 
77
/*
 * Choose the default sync method, in order of preference:
 *   1. "open_datasync" if OPEN_DATASYNC_FLAG is defined,
 *   2. "fdatasync" if HAVE_FDATASYNC is defined,
 *   3. fsync otherwise (only the name string becomes "fsync_writethrough"
 *      when FSYNC_IS_WRITE_THROUGH is defined; the method code is still
 *      SYNC_METHOD_FSYNC).
 * DEFAULT_SYNC_FLAGBIT is nonzero only for the open_datasync case.
 */
#if defined(OPEN_DATASYNC_FLAG)
 
78
#define DEFAULT_SYNC_METHOD_STR    "open_datasync"
 
79
#define DEFAULT_SYNC_METHOD                SYNC_METHOD_OPEN
 
80
#define DEFAULT_SYNC_FLAGBIT       OPEN_DATASYNC_FLAG
 
81
#else /* !defined(OPEN_DATASYNC_FLAG) */
 
82
#if defined(HAVE_FDATASYNC)
 
83
#define DEFAULT_SYNC_METHOD_STR   "fdatasync"
 
84
#define DEFAULT_SYNC_METHOD               SYNC_METHOD_FDATASYNC
 
85
#define DEFAULT_SYNC_FLAGBIT      0
 
86
#else /* !defined(HAVE_FDATASYNC) */
 
87
#ifndef FSYNC_IS_WRITE_THROUGH
 
88
#define DEFAULT_SYNC_METHOD_STR   "fsync"
 
89
#else /* FSYNC_IS_WRITE_THROUGH */
 
90
#define DEFAULT_SYNC_METHOD_STR   "fsync_writethrough"
 
91
#endif /* FSYNC_IS_WRITE_THROUGH */
 
92
#define DEFAULT_SYNC_METHOD               SYNC_METHOD_FSYNC
 
93
#define DEFAULT_SYNC_FLAGBIT      0
 
94
#endif /* HAVE_FDATASYNC */
 
95
#endif /* OPEN_DATASYNC_FLAG */
 
96
 
 
97
 
 
98
/* User-settable parameters */
 
99
int                     CheckPointSegments = 3;
 
100
int                     XLOGbuffers = 8;
 
101
char       *XLogArchiveCommand = NULL;
 
102
char       *XLOG_sync_method = NULL;
 
103
const char      XLOG_sync_method_default[] = DEFAULT_SYNC_METHOD_STR;
 
104
 
 
105
#ifdef WAL_DEBUG
 
106
bool            XLOG_DEBUG = false;
 
107
#endif
 
108
 
 
109
/*
 
110
 * XLOGfileslop is used in the code as the allowed "fuzz" in the number of
 
111
 * preallocated XLOG segments --- we try to have at least XLOGfiles advance
 
112
 * segments but no more than XLOGfileslop segments.  This could
 
113
 * be made a separate GUC variable, but at present I think it's sufficient
 
114
 * to hardwire it as 2*CheckPointSegments+1.  Under normal conditions, a
 
115
 * checkpoint will free no more than 2*CheckPointSegments log segments, and
 
116
 * we want to recycle all of them; the +1 allows boundary cases to happen
 
117
 * without wasting a delete/create-segment cycle.
 
118
 */
 
119
 
 
120
#define XLOGfileslop    (2*CheckPointSegments + 1)
 
121
 
 
122
 
 
123
/* these are derived from XLOG_sync_method by assign_xlog_sync_method */
 
124
static int      sync_method = DEFAULT_SYNC_METHOD;
 
125
static int      open_sync_bit = DEFAULT_SYNC_FLAGBIT;
 
126
 
 
127
#define XLOG_SYNC_BIT  (enableFsync ? open_sync_bit : 0)
 
128
 
 
129
#define MinXLOGbuffers  4
 
130
 
 
131
 
 
132
/*
 
133
 * ThisTimeLineID will be same in all backends --- it identifies current
 
134
 * WAL timeline for the database system.
 
135
 */
 
136
TimeLineID      ThisTimeLineID = 0;
 
137
 
 
138
/* Are we doing recovery from XLOG? */
 
139
bool            InRecovery = false;
 
140
 
 
141
/* Are we recovering using offline XLOG archives? */
 
142
static bool InArchiveRecovery = false;
 
143
 
 
144
/* Was the last xlog file restored from archive, or local? */
 
145
static bool restoredFromArchive = false;
 
146
 
 
147
/* options taken from recovery.conf */
 
148
static char *recoveryRestoreCommand = NULL;
 
149
static bool recoveryTarget = false;
 
150
static bool recoveryTargetExact = false;
 
151
static bool recoveryTargetInclusive = true;
 
152
static TransactionId recoveryTargetXid;
 
153
static time_t recoveryTargetTime;
 
154
 
 
155
/* if recoveryStopsHere returns true, it saves actual stop xid/time here */
 
156
static TransactionId recoveryStopXid;
 
157
static time_t recoveryStopTime;
 
158
static bool recoveryStopAfter;
 
159
 
 
160
/* constraint set by read_backup_label */
 
161
static XLogRecPtr recoveryMinXlogOffset = {0, 0};
 
162
 
 
163
/*
 
164
 * During normal operation, the only timeline we care about is ThisTimeLineID.
 
165
 * During recovery, however, things are more complicated.  To simplify life
 
166
 * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 
167
 * scan through the WAL history (that is, it is the line that was active when
 
168
 * the currently-scanned WAL record was generated).  We also need these
 
169
 * timeline values:
 
170
 *
 
171
 * recoveryTargetTLI: the desired timeline that we want to end in.
 
172
 *
 
173
 * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
 
174
 * its known parents, newest first (so recoveryTargetTLI is always the
 
175
 * first list member).  Only these TLIs are expected to be seen in the WAL
 
176
 * segments we read, and indeed only these TLIs will be considered as
 
177
 * candidate WAL files to open at all.
 
178
 *
 
179
 * curFileTLI: the TLI appearing in the name of the current input WAL file.
 
180
 * (This is not necessarily the same as ThisTimeLineID, because we could
 
181
 * be scanning data that was copied from an ancestor timeline when the current
 
182
 * file was created.)  During a sequential scan we do not allow this value
 
183
 * to decrease.
 
184
 */
 
185
static TimeLineID recoveryTargetTLI;
 
186
static List *expectedTLIs;
 
187
static TimeLineID curFileTLI;
 
188
 
 
189
/*
 
190
 * MyLastRecPtr points to the start of the last XLOG record inserted by the
 
191
 * current transaction.  If MyLastRecPtr.xrecoff == 0, then the current
 
192
 * xact hasn't yet inserted any transaction-controlled XLOG records.
 
193
 *
 
194
 * Note that XLOG records inserted outside transaction control are not
 
195
 * reflected into MyLastRecPtr.  They do, however, cause MyXactMadeXLogEntry
 
196
 * to be set true.      The latter can be used to test whether the current xact
 
197
 * made any loggable changes (including out-of-xact changes, such as
 
198
 * sequence updates).
 
199
 *
 
200
 * When we insert/update/delete a tuple in a temporary relation, we do not
 
201
 * make any XLOG record, since we don't care about recovering the state of
 
202
 * the temp rel after a crash.  However, we will still need to remember
 
203
 * whether our transaction committed or aborted in that case.  So, we must
 
204
 * set MyXactMadeTempRelUpdate true to indicate that the XID will be of
 
205
 * interest later.
 
206
 */
 
207
XLogRecPtr      MyLastRecPtr = {0, 0};
 
208
 
 
209
bool            MyXactMadeXLogEntry = false;
 
210
 
 
211
bool            MyXactMadeTempRelUpdate = false;
 
212
 
 
213
/*
 
214
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 
215
 * current backend.  It is updated for all inserts, transaction-controlled
 
216
 * or not.      ProcLastRecEnd is similar but points to end+1 of last record.
 
217
 */
 
218
static XLogRecPtr ProcLastRecPtr = {0, 0};
 
219
 
 
220
XLogRecPtr      ProcLastRecEnd = {0, 0};
 
221
 
 
222
/*
 
223
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 
224
 * (which is almost but not quite the same as a pointer to the most recent
 
225
 * CHECKPOINT record).  We update this from the shared-memory copy,
 
226
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 
227
 * hold the Insert lock).  See XLogInsert for details.  We are also allowed
 
228
 * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
 
229
 * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 
230
 * InitXLOGAccess.
 
231
 */
 
232
static XLogRecPtr RedoRecPtr;
 
233
 
 
234
/*----------
 
235
 * Shared-memory data structures for XLOG control
 
236
 *
 
237
 * LogwrtRqst indicates a byte position that we need to write and/or fsync
 
238
 * the log up to (all records before that point must be written or fsynced).
 
239
 * LogwrtResult indicates the byte positions we have already written/fsynced.
 
240
 * These structs are identical but are declared separately to indicate their
 
241
 * slightly different functions.
 
242
 *
 
243
 * We do a lot of pushups to minimize the amount of access to lockable
 
244
 * shared memory values.  There are actually three shared-memory copies of
 
245
 * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 
246
 *              XLogCtl->LogwrtResult is protected by info_lck
 
247
 *              XLogCtl->Write.LogwrtResult is protected by WALWriteLock
 
248
 *              XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
 
249
 * One must hold the associated lock to read or write any of these, but
 
250
 * of course no lock is needed to read/write the unshared LogwrtResult.
 
251
 *
 
252
 * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 
253
 * right", since both are updated by a write or flush operation before
 
254
 * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
 
255
 * is that it can be examined/modified by code that already holds WALWriteLock
 
256
 * without needing to grab info_lck as well.
 
257
 *
 
258
 * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
 
259
 * but is updated when convenient.      Again, it exists for the convenience of
 
260
 * code that is already holding WALInsertLock but not the other locks.
 
261
 *
 
262
 * The unshared LogwrtResult may lag behind any or all of these, and again
 
263
 * is updated when convenient.
 
264
 *
 
265
 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 
266
 * (protected by info_lck), but we don't need to cache any copies of it.
 
267
 *
 
268
 * Note that this all works because the request and result positions can only
 
269
 * advance forward, never back up, and so we can easily determine which of two
 
270
 * values is "more up to date".
 
271
 *
 
272
 * info_lck is only held long enough to read/update the protected variables,
 
273
 * so it's a plain spinlock.  The other locks are held longer (potentially
 
274
 * over I/O operations), so we use LWLocks for them.  These locks are:
 
275
 *
 
276
 * WALInsertLock: must be held to insert a record into the WAL buffers.
 
277
 *
 
278
 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 
279
 * XLogFlush).
 
280
 *
 
281
 * ControlFileLock: must be held to read/update control file or create
 
282
 * new log file.
 
283
 *
 
284
 * CheckpointLock: must be held to do a checkpoint (ensures only one
 
285
 * checkpointer at a time; even though the postmaster won't launch
 
286
 * parallel checkpoint processes, we need this because manual checkpoints
 
287
 * could be launched simultaneously).
 
288
 *
 
289
 *----------
 
290
 */
 
291
 
 
292
/*
 * A write/flush request: the log positions that need to be written out
 * and/or fsynced.  Same layout as XLogwrtResult; kept as a separate type
 * to mark the different role.
 */
typedef struct XLogwrtRqst
 
293
{
 
294
        XLogRecPtr      Write;                  /* last byte + 1 that must be written out */
 
295
        XLogRecPtr      Flush;                  /* last byte + 1 that must be flushed */
 
296
} XLogwrtRqst;
 
297
 
 
298
/*
 * Progress of writing the log: the positions already written out and
 * already flushed.  Same layout as XLogwrtRqst; kept as a separate type
 * to mark the different role.
 */
typedef struct XLogwrtResult
 
299
{
 
300
        XLogRecPtr      Write;                  /* last byte + 1 already written out */
 
301
        XLogRecPtr      Flush;                  /* last byte + 1 already flushed */
 
302
} XLogwrtResult;
 
303
 
 
304
/*
 
305
 * Shared state data for XLogInsert.
 
306
 */
 
307
/*
 * Insertion-side state.  Embedded in XLogCtlData as the "Insert" member,
 * where it is marked as protected by WALInsertLock.
 */
typedef struct XLogCtlInsert
 
308
{
 
309
        XLogwrtResult LogwrtResult; /* a recent (possibly lagging) copy of
                                                                 * LogwrtResult */
 
310
        XLogRecPtr      PrevRecord;             /* start of previously-inserted record */
 
311
        uint16          curridx;                /* current block index in cache; used to
                                                                 * index XLogCtl->xlblocks[] */
 
312
        XLogPageHeader currpage;        /* points to header of block in cache */
 
313
        char       *currpos;            /* current insertion point in cache;
                                                                 * INSERT_FREESPACE measures it from
                                                                 * currpage */
 
314
        XLogRecPtr      RedoRecPtr;             /* current redo point for insertions;
                                                                 * backends refresh their local
                                                                 * RedoRecPtr from this */
 
315
} XLogCtlInsert;
 
316
 
 
317
/*
 
318
 * Shared state data for XLogWrite/XLogFlush.
 
319
 */
 
320
/*
 * Write-side state.  Embedded in XLogCtlData as the "Write" member, where
 * it is marked as protected by WALWriteLock.
 */
typedef struct XLogCtlWrite
 
321
{
 
322
        XLogwrtResult LogwrtResult; /* current value of LogwrtResult (updated
                                                                 * before WALWriteLock is released) */
 
323
        uint16          curridx;                /* cache index of next block to write */
 
324
} XLogCtlWrite;
 
325
 
 
326
/*
 
327
 * Total shared-memory state for XLOG.
 
328
 */
 
329
/*
 * All XLOG shared state, reached through the file-level XLogCtl pointer.
 * Members are grouped by the lock that guards them.
 */
typedef struct XLogCtlData
 
330
{
 
331
        /* Protected by WALInsertLock: */
 
332
        XLogCtlInsert Insert;
 
333
        /* Protected by info_lck: */
 
334
        XLogwrtRqst LogwrtRqst;
 
335
        XLogwrtResult LogwrtResult;
 
336
        /* Protected by WALWriteLock: */
 
337
        XLogCtlWrite Write;
 
338
 
 
339
        /*
 
340
         * These values do not change after startup, although the pointed-to
 
341
         * pages and xlblocks values certainly do.  Permission to read/write
 
342
         * the pages and xlblocks values depends on WALInsertLock and
 
343
         * WALWriteLock.
 
344
         */
 
345
        char       *pages;                      /* buffers for unwritten XLOG pages */
 
346
        XLogRecPtr *xlblocks;           /* per buffer: XLOG position of the page's
                                                                 * first byte, plus BLCKSZ */
 
347
        uint32          XLogCacheByte;  /* # bytes in xlog buffers */
 
348
        uint32          XLogCacheBlck;  /* highest allocated xlog buffer index;
                                                                 * wraparound bound for PrevBufIdx and
                                                                 * NextBufIdx */
 
349
        TimeLineID      ThisTimeLineID; /* NOTE(review): presumably the shared copy
                                                                 * of the global ThisTimeLineID --
                                                                 * confirm against its users */
 
350
 
 
351
        slock_t         info_lck;               /* locks shared LogwrtRqst/LogwrtResult */
 
352
} XLogCtlData;
 
353
 
 
354
static XLogCtlData *XLogCtl = NULL;
 
355
 
 
356
/*
 
357
 * We maintain an image of pg_control in shared memory.
 
358
 */
 
359
static ControlFileData *ControlFile = NULL;
 
360
 
 
361
/*
 
362
 * Macros for managing XLogInsert state.  In most cases, the calling routine
 
363
 * has local copies of XLogCtl->Insert and/or XLogCtl->Insert.curridx,
 
364
 * so these are passed as parameters instead of being fetched via XLogCtl.
 
365
 */
 
366
 
 
367
/*
 * Free space remaining in the current xlog page buffer: BLCKSZ minus the
 * distance from the start of the page (currpage) to the insertion point
 * (currpos).  "Insert" is a pointer to an XLogCtlInsert; note that the
 * argument is evaluated twice.
 */
 
368
#define INSERT_FREESPACE(Insert)  \
 
369
        (BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
 
370
 
 
371
/*
 * Construct XLogRecPtr value for current insertion point.
 *
 * Stores into recptr: the xlogid of xlblocks[curridx], and that entry's
 * xrecoff less the free space still left on the page.  The expansion is a
 * comma expression performing both assignments, and it reads the global
 * XLogCtl directly.
 */
 
372
#define INSERT_RECPTR(recptr,Insert,curridx)  \
 
373
        ( \
 
374
          (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
 
375
          (recptr).xrecoff = \
 
376
                XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
 
377
        )
 
378
 
 
379
/*
 * Index of the buffer before idx, wrapping from 0 back to
 * XLogCtl->XLogCacheBlck (the highest buffer index).  idx may be
 * evaluated twice.
 */
#define PrevBufIdx(idx)         \
 
380
                (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))
 
381
 
 
382
/*
 * Index of the buffer after idx, wrapping from XLogCtl->XLogCacheBlck
 * (the highest buffer index) back to 0.  idx may be evaluated twice.
 */
#define NextBufIdx(idx)         \
 
383
                (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 
384
 
 
385
 
 
386
/* File path names */
 
387
char            XLogDir[MAXPGPATH];
 
388
static char ControlFilePath[MAXPGPATH];
 
389
 
 
390
/*
 
391
 * Private, possibly out-of-date copy of shared LogwrtResult.
 
392
 * See discussion above.
 
393
 */
 
394
static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
 
395
 
 
396
/*
 
397
 * openLogFile is -1 or a kernel FD for an open log file segment.
 
398
 * When it's open, openLogOff is the current seek offset in the file.
 
399
 * openLogId/openLogSeg identify the segment.  These variables are only
 
400
 * used to write the XLOG, and so will normally refer to the active segment.
 
401
 */
 
402
static int      openLogFile = -1;
 
403
static uint32 openLogId = 0;
 
404
static uint32 openLogSeg = 0;
 
405
static uint32 openLogOff = 0;
 
406
 
 
407
/*
 
408
 * These variables are used similarly to the ones above, but for reading
 
409
 * the XLOG.  Note, however, that readOff generally represents the offset
 
410
 * of the page just read, not the seek position of the FD itself, which
 
411
 * will be just past that page.
 
412
 */
 
413
static int      readFile = -1;
 
414
static uint32 readId = 0;
 
415
static uint32 readSeg = 0;
 
416
static uint32 readOff = 0;
 
417
 
 
418
/* Buffer for currently read page (BLCKSZ bytes) */
 
419
static char *readBuf = NULL;
 
420
 
 
421
/* Buffer for current ReadRecord result (expandable) */
 
422
static char *readRecordBuf = NULL;
 
423
static uint32 readRecordBufSize = 0;
 
424
 
 
425
/* State information for XLOG reading */
 
426
static XLogRecPtr ReadRecPtr;
 
427
static XLogRecPtr EndRecPtr;
 
428
static XLogRecord *nextRecord = NULL;
 
429
static TimeLineID lastPageTLI = 0;
 
430
 
 
431
static bool InRedo = false;
 
432
 
 
433
 
 
434
static void XLogArchiveNotify(const char *xlog);
 
435
static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
 
436
static bool XLogArchiveIsDone(const char *xlog);
 
437
static void XLogArchiveCleanup(const char *xlog);
 
438
static void readRecoveryCommandFile(void);
 
439
static void exitArchiveRecovery(TimeLineID endTLI,
 
440
                                        uint32 endLogId, uint32 endLogSeg);
 
441
static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
 
442
 
 
443
static bool AdvanceXLInsertBuffer(void);
 
444
static void XLogWrite(XLogwrtRqst WriteRqst);
 
445
static int XLogFileInit(uint32 log, uint32 seg,
 
446
                         bool *use_existent, bool use_lock);
 
447
static bool InstallXLogFileSegment(uint32 log, uint32 seg, char *tmppath,
 
448
                                           bool find_free, int max_advance,
 
449
                                           bool use_lock);
 
450
static int      XLogFileOpen(uint32 log, uint32 seg);
 
451
static int      XLogFileRead(uint32 log, uint32 seg, int emode);
 
452
static bool RestoreArchivedFile(char *path, const char *xlogfname,
 
453
                                        const char *recovername, off_t expectedSize);
 
454
static int      PreallocXlogFiles(XLogRecPtr endptr);
 
455
static void MoveOfflineLogs(uint32 log, uint32 seg, XLogRecPtr endptr,
 
456
                                                        int *nsegsremoved, int *nsegsrecycled);
 
457
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode);
 
458
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
 
459
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
 
460
static List *readTimeLineHistory(TimeLineID targetTLI);
 
461
static bool existsTimeLineHistory(TimeLineID probeTLI);
 
462
static TimeLineID findNewestTimeLine(TimeLineID startTLI);
 
463
static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 
464
                                         TimeLineID endTLI,
 
465
                                         uint32 endLogId, uint32 endLogSeg);
 
466
static void WriteControlFile(void);
 
467
static void ReadControlFile(void);
 
468
static char *str_time(time_t tnow);
 
469
static void issue_xlog_fsync(void);
 
470
 
 
471
#ifdef WAL_DEBUG
 
472
static void xlog_outrec(char *buf, XLogRecord *record);
 
473
#endif
 
474
static bool read_backup_label(XLogRecPtr *checkPointLoc);
 
475
static void remove_backup_label(void);
 
476
 
 
477
 
 
478
/*
 
479
 * Insert an XLOG record having the specified RMID and info bytes,
 
480
 * with the body of the record being the data chunk(s) described by
 
481
 * the rdata list (see xlog.h for notes about rdata).
 
482
 *
 
483
 * Returns XLOG pointer to end of record (beginning of next record).
 
484
 * This can be used as LSN for data pages affected by the logged action.
 
485
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 
486
 * before the data page can be written out.  This implements the basic
 
487
 * WAL rule "write the log before the data".)
 
488
 *
 
489
 * NB: this routine feels free to scribble on the XLogRecData structs,
 
490
 * though not on the data they reference.  This is OK since the XLogRecData
 
491
 * structs are always just temporaries in the calling code.
 
492
 */
 
493
XLogRecPtr
 
494
XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 
495
{
 
496
        XLogCtlInsert *Insert = &XLogCtl->Insert;
 
497
        XLogRecord *record;
 
498
        XLogContRecord *contrecord;
 
499
        XLogRecPtr      RecPtr;
 
500
        XLogRecPtr      WriteRqst;
 
501
        uint32          freespace;
 
502
        uint16          curridx;
 
503
        XLogRecData *rdt;
 
504
        Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
 
505
        bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
 
506
        BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
 
507
        XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
 
508
        XLogRecData dtbuf_rdt[2 * XLR_MAX_BKP_BLOCKS];
 
509
        crc64           rdata_crc;
 
510
        uint32          len,
 
511
                                write_len;
 
512
        unsigned        i;
 
513
        XLogwrtRqst LogwrtRqst;
 
514
        bool            updrqst;
 
515
        bool            no_tran = (rmid == RM_XLOG_ID) ? true : false;
 
516
 
 
517
        if (info & XLR_INFO_MASK)
 
518
        {
 
519
                if ((info & XLR_INFO_MASK) != XLOG_NO_TRAN)
 
520
                        elog(PANIC, "invalid xlog info mask %02X", (info & XLR_INFO_MASK));
 
521
                no_tran = true;
 
522
                info &= ~XLR_INFO_MASK;
 
523
        }
 
524
 
 
525
        /*
 
526
         * In bootstrap mode, we don't actually log anything but XLOG
 
527
         * resources; return a phony record pointer.
 
528
         */
 
529
        if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
 
530
        {
 
531
                RecPtr.xlogid = 0;
 
532
                RecPtr.xrecoff = SizeOfXLogLongPHD;             /* start of 1st chkpt
 
533
                                                                                                 * record */
 
534
                return (RecPtr);
 
535
        }
 
536
 
 
537
        /*
 
538
         * Here we scan the rdata list, determine which buffers must be backed
 
539
         * up, and compute the CRC values for the data.  Note that the record
 
540
         * header isn't added into the CRC yet since we don't know the final
 
541
         * length or info bits quite yet.
 
542
         *
 
543
         * We may have to loop back to here if a race condition is detected
 
544
         * below. We could prevent the race by doing all this work while
 
545
         * holding the insert lock, but it seems better to avoid doing CRC
 
546
         * calculations while holding the lock.  This means we have to be
 
547
         * careful about modifying the rdata list until we know we aren't
 
548
         * going to loop back again.  The only change we allow ourselves to
 
549
         * make earlier is to set rdt->data = NULL in list items we have
 
550
         * decided we will have to back up the whole buffer for.  This is OK
 
551
         * because we will certainly decide the same thing again for those
 
552
         * items if we do it over; doing it here saves an extra pass over the
 
553
         * list later.
 
554
         */
 
555
begin:;
 
556
        for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 
557
        {
 
558
                dtbuf[i] = InvalidBuffer;
 
559
                dtbuf_bkp[i] = false;
 
560
        }
 
561
 
 
562
        INIT_CRC64(rdata_crc);
 
563
        len = 0;
 
564
        for (rdt = rdata;;)
 
565
        {
 
566
                if (rdt->buffer == InvalidBuffer)
 
567
                {
 
568
                        /* Simple data, just include it */
 
569
                        len += rdt->len;
 
570
                        COMP_CRC64(rdata_crc, rdt->data, rdt->len);
 
571
                }
 
572
                else
 
573
                {
 
574
                        /* Find info for buffer */
 
575
                        for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 
576
                        {
 
577
                                if (rdt->buffer == dtbuf[i])
 
578
                                {
 
579
                                        /* Buffer already referenced by earlier list item */
 
580
                                        if (dtbuf_bkp[i])
 
581
                                                rdt->data = NULL;
 
582
                                        else if (rdt->data)
 
583
                                        {
 
584
                                                len += rdt->len;
 
585
                                                COMP_CRC64(rdata_crc, rdt->data, rdt->len);
 
586
                                        }
 
587
                                        break;
 
588
                                }
 
589
                                if (dtbuf[i] == InvalidBuffer)
 
590
                                {
 
591
                                        /* OK, put it in this slot */
 
592
                                        dtbuf[i] = rdt->buffer;
 
593
 
 
594
                                        /*
 
595
                                         * XXX We assume page LSN is first data on page
 
596
                                         */
 
597
                                        dtbuf_lsn[i] = *((XLogRecPtr *) BufferGetBlock(rdt->buffer));
 
598
                                        if (XLByteLE(dtbuf_lsn[i], RedoRecPtr))
 
599
                                        {
 
600
                                                crc64           dtcrc;
 
601
 
 
602
                                                dtbuf_bkp[i] = true;
 
603
                                                rdt->data = NULL;
 
604
                                                INIT_CRC64(dtcrc);
 
605
                                                COMP_CRC64(dtcrc,
 
606
                                                                   BufferGetBlock(dtbuf[i]),
 
607
                                                                   BLCKSZ);
 
608
                                                dtbuf_xlg[i].node = BufferGetFileNode(dtbuf[i]);
 
609
                                                dtbuf_xlg[i].block = BufferGetBlockNumber(dtbuf[i]);
 
610
                                                COMP_CRC64(dtcrc,
 
611
                                                                (char *) &(dtbuf_xlg[i]) + sizeof(crc64),
 
612
                                                                   sizeof(BkpBlock) - sizeof(crc64));
 
613
                                                FIN_CRC64(dtcrc);
 
614
                                                dtbuf_xlg[i].crc = dtcrc;
 
615
                                        }
 
616
                                        else if (rdt->data)
 
617
                                        {
 
618
                                                len += rdt->len;
 
619
                                                COMP_CRC64(rdata_crc, rdt->data, rdt->len);
 
620
                                        }
 
621
                                        break;
 
622
                                }
 
623
                        }
 
624
                        if (i >= XLR_MAX_BKP_BLOCKS)
 
625
                                elog(PANIC, "can backup at most %d blocks per xlog record",
 
626
                                         XLR_MAX_BKP_BLOCKS);
 
627
                }
 
628
                /* Break out of loop when rdt points to last list item */
 
629
                if (rdt->next == NULL)
 
630
                        break;
 
631
                rdt = rdt->next;
 
632
        }
 
633
 
 
634
        /*
 
635
         * NOTE: the test for len == 0 here is somewhat fishy, since in theory
 
636
         * all of the rmgr data might have been suppressed in favor of backup
 
637
         * blocks.      Currently, all callers of XLogInsert provide at least some
 
638
         * not-in-a-buffer data and so len == 0 should never happen, but that
 
639
         * may not be true forever.  If you need to remove the len == 0 check,
 
640
         * also remove the check for xl_len == 0 in ReadRecord, below.
 
641
         */
 
642
        if (len == 0)
 
643
                elog(PANIC, "invalid xlog record length %u", len);
 
644
 
 
645
        START_CRIT_SECTION();
 
646
 
 
647
        /* update LogwrtResult before doing cache fill check */
 
648
        {
 
649
                /* use volatile pointer to prevent code rearrangement */
 
650
                volatile XLogCtlData *xlogctl = XLogCtl;
 
651
 
 
652
                SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
 
653
                LogwrtRqst = xlogctl->LogwrtRqst;
 
654
                LogwrtResult = xlogctl->LogwrtResult;
 
655
                SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
 
656
        }
 
657
 
 
658
        /*
 
659
         * If cache is half filled then try to acquire write lock and do
 
660
         * XLogWrite. Ignore any fractional blocks in performing this check.
 
661
         */
 
662
        LogwrtRqst.Write.xrecoff -= LogwrtRqst.Write.xrecoff % BLCKSZ;
 
663
        if (LogwrtRqst.Write.xlogid != LogwrtResult.Write.xlogid ||
 
664
                (LogwrtRqst.Write.xrecoff >= LogwrtResult.Write.xrecoff +
 
665
                 XLogCtl->XLogCacheByte / 2))
 
666
        {
 
667
                if (LWLockConditionalAcquire(WALWriteLock, LW_EXCLUSIVE))
 
668
                {
 
669
                        LogwrtResult = XLogCtl->Write.LogwrtResult;
 
670
                        if (XLByteLT(LogwrtResult.Write, LogwrtRqst.Write))
 
671
                                XLogWrite(LogwrtRqst);
 
672
                        LWLockRelease(WALWriteLock);
 
673
                }
 
674
        }
 
675
 
 
676
        /* Now wait to get insert lock */
 
677
        LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 
678
 
 
679
        /*
 
680
         * Check to see if my RedoRecPtr is out of date.  If so, may have to
 
681
         * go back and recompute everything.  This can only happen just after
 
682
         * a checkpoint, so it's better to be slow in this case and fast
 
683
         * otherwise.
 
684
         */
 
685
        if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
 
686
        {
 
687
                Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
 
688
                RedoRecPtr = Insert->RedoRecPtr;
 
689
 
 
690
                for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 
691
                {
 
692
                        if (dtbuf[i] == InvalidBuffer)
 
693
                                continue;
 
694
                        if (dtbuf_bkp[i] == false &&
 
695
                                XLByteLE(dtbuf_lsn[i], RedoRecPtr))
 
696
                        {
 
697
                                /*
 
698
                                 * Oops, this buffer now needs to be backed up, but we
 
699
                                 * didn't think so above.  Start over.
 
700
                                 */
 
701
                                LWLockRelease(WALInsertLock);
 
702
                                END_CRIT_SECTION();
 
703
                                goto begin;
 
704
                        }
 
705
                }
 
706
        }
 
707
 
 
708
        /*
 
709
         * Make additional rdata list entries for the backup blocks, so that
 
710
         * we don't need to special-case them in the write loop.  Note that we
 
711
         * have now irrevocably changed the input rdata list.  At the exit of
 
712
         * this loop, write_len includes the backup block data.
 
713
         *
 
714
         * Also set the appropriate info bits to show which buffers were backed
 
715
         * up.  The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th
 
716
         * distinct buffer value (ignoring InvalidBuffer) appearing in the
 
717
         * rdata list.
 
718
         */
 
719
        write_len = len;
 
720
        for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 
721
        {
 
722
                if (dtbuf[i] == InvalidBuffer || !(dtbuf_bkp[i]))
 
723
                        continue;
 
724
 
 
725
                info |= XLR_SET_BKP_BLOCK(i);
 
726
 
 
727
                rdt->next = &(dtbuf_rdt[2 * i]);
 
728
 
 
729
                dtbuf_rdt[2 * i].data = (char *) &(dtbuf_xlg[i]);
 
730
                dtbuf_rdt[2 * i].len = sizeof(BkpBlock);
 
731
                write_len += sizeof(BkpBlock);
 
732
 
 
733
                rdt = dtbuf_rdt[2 * i].next = &(dtbuf_rdt[2 * i + 1]);
 
734
 
 
735
                dtbuf_rdt[2 * i + 1].data = (char *) BufferGetBlock(dtbuf[i]);
 
736
                dtbuf_rdt[2 * i + 1].len = BLCKSZ;
 
737
                write_len += BLCKSZ;
 
738
                dtbuf_rdt[2 * i + 1].next = NULL;
 
739
        }
 
740
 
 
741
        /*
 
742
         * If there isn't enough space on the current XLOG page for a record
 
743
         * header, advance to the next page (leaving the unused space as
 
744
         * zeroes).
 
745
         */
 
746
        updrqst = false;
 
747
        freespace = INSERT_FREESPACE(Insert);
 
748
        if (freespace < SizeOfXLogRecord)
 
749
        {
 
750
                updrqst = AdvanceXLInsertBuffer();
 
751
                freespace = INSERT_FREESPACE(Insert);
 
752
        }
 
753
 
 
754
        curridx = Insert->curridx;
 
755
        record = (XLogRecord *) Insert->currpos;
 
756
 
 
757
        /* Insert record header */
 
758
 
 
759
        record->xl_prev = Insert->PrevRecord;
 
760
        record->xl_xid = GetCurrentTransactionIdIfAny();
 
761
        record->xl_len = len;           /* doesn't include backup blocks */
 
762
        record->xl_info = info;
 
763
        record->xl_rmid = rmid;
 
764
 
 
765
        /* Now we can finish computing the main CRC */
 
766
        COMP_CRC64(rdata_crc, (char *) record + sizeof(crc64),
 
767
                           SizeOfXLogRecord - sizeof(crc64));
 
768
        FIN_CRC64(rdata_crc);
 
769
        record->xl_crc = rdata_crc;
 
770
 
 
771
        /* Compute record's XLOG location */
 
772
        INSERT_RECPTR(RecPtr, Insert, curridx);
 
773
 
 
774
        /* If first XLOG record of transaction, save it in PGPROC array */
 
775
        if (MyLastRecPtr.xrecoff == 0 && !no_tran)
 
776
        {
 
777
                /*
 
778
                 * We do not acquire SInvalLock here because of possible deadlock.
 
779
                 * Anyone who wants to inspect other procs' logRec must acquire
 
780
                 * WALInsertLock, instead.      A better solution would be a per-PROC
 
781
                 * spinlock, but no time for that before 7.2 --- tgl 12/19/01.
 
782
                 */
 
783
                MyProc->logRec = RecPtr;
 
784
        }
 
785
 
 
786
#ifdef WAL_DEBUG
 
787
        if (XLOG_DEBUG)
 
788
        {
 
789
                char            buf[8192];
 
790
 
 
791
                sprintf(buf, "INSERT @ %X/%X: ", RecPtr.xlogid, RecPtr.xrecoff);
 
792
                xlog_outrec(buf, record);
 
793
                if (rdata->data != NULL)
 
794
                {
 
795
                        strcat(buf, " - ");
 
796
                        RmgrTable[record->xl_rmid].rm_desc(buf, record->xl_info, rdata->data);
 
797
                }
 
798
                elog(LOG, "%s", buf);
 
799
        }
 
800
#endif
 
801
 
 
802
        /* Record begin of record in appropriate places */
 
803
        if (!no_tran)
 
804
                MyLastRecPtr = RecPtr;
 
805
        ProcLastRecPtr = RecPtr;
 
806
        Insert->PrevRecord = RecPtr;
 
807
        MyXactMadeXLogEntry = true;
 
808
 
 
809
        Insert->currpos += SizeOfXLogRecord;
 
810
        freespace -= SizeOfXLogRecord;
 
811
 
 
812
        /*
 
813
         * Append the data, including backup blocks if any
 
814
         */
 
815
        while (write_len)
 
816
        {
 
817
                while (rdata->data == NULL)
 
818
                        rdata = rdata->next;
 
819
 
 
820
                if (freespace > 0)
 
821
                {
 
822
                        if (rdata->len > freespace)
 
823
                        {
 
824
                                memcpy(Insert->currpos, rdata->data, freespace);
 
825
                                rdata->data += freespace;
 
826
                                rdata->len -= freespace;
 
827
                                write_len -= freespace;
 
828
                        }
 
829
                        else
 
830
                        {
 
831
                                memcpy(Insert->currpos, rdata->data, rdata->len);
 
832
                                freespace -= rdata->len;
 
833
                                write_len -= rdata->len;
 
834
                                Insert->currpos += rdata->len;
 
835
                                rdata = rdata->next;
 
836
                                continue;
 
837
                        }
 
838
                }
 
839
 
 
840
                /* Use next buffer */
 
841
                updrqst = AdvanceXLInsertBuffer();
 
842
                curridx = Insert->curridx;
 
843
                /* Insert cont-record header */
 
844
                Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
 
845
                contrecord = (XLogContRecord *) Insert->currpos;
 
846
                contrecord->xl_rem_len = write_len;
 
847
                Insert->currpos += SizeOfXLogContRecord;
 
848
                freespace = INSERT_FREESPACE(Insert);
 
849
        }
 
850
 
 
851
        /* Ensure next record will be properly aligned */
 
852
        Insert->currpos = (char *) Insert->currpage +
 
853
                MAXALIGN(Insert->currpos - (char *) Insert->currpage);
 
854
        freespace = INSERT_FREESPACE(Insert);
 
855
 
 
856
        /*
 
857
         * The recptr I return is the beginning of the *next* record. This
 
858
         * will be stored as LSN for changed data pages...
 
859
         */
 
860
        INSERT_RECPTR(RecPtr, Insert, curridx);
 
861
 
 
862
        /* Need to update shared LogwrtRqst if some block was filled up */
 
863
        if (freespace < SizeOfXLogRecord)
 
864
                updrqst = true;                 /* curridx is filled and available for
 
865
                                                                 * writing out */
 
866
        else
 
867
                curridx = PrevBufIdx(curridx);
 
868
        WriteRqst = XLogCtl->xlblocks[curridx];
 
869
 
 
870
        LWLockRelease(WALInsertLock);
 
871
 
 
872
        if (updrqst)
 
873
        {
 
874
                /* use volatile pointer to prevent code rearrangement */
 
875
                volatile XLogCtlData *xlogctl = XLogCtl;
 
876
 
 
877
                SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
 
878
                /* advance global request to include new block(s) */
 
879
                if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
 
880
                        xlogctl->LogwrtRqst.Write = WriteRqst;
 
881
                /* update local result copy while I have the chance */
 
882
                LogwrtResult = xlogctl->LogwrtResult;
 
883
                SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
 
884
        }
 
885
 
 
886
        ProcLastRecEnd = RecPtr;
 
887
 
 
888
        END_CRIT_SECTION();
 
889
 
 
890
        return (RecPtr);
 
891
}
 
892
 
 
893
/*
 
894
 * XLogArchiveNotify
 
895
 *
 
896
 * Create an archive notification file
 
897
 *
 
898
 * The name of the notification file is the message that will be picked up
 
899
 * by the archiver, e.g. we write 0000000100000001000000C6.ready
 
900
 * and the archiver then knows to archive XLogDir/0000000100000001000000C6,
 
901
 * then when complete, rename it to 0000000100000001000000C6.done
 
902
 */
 
903
static void
 
904
XLogArchiveNotify(const char *xlog)
 
905
{
 
906
        char            archiveStatusPath[MAXPGPATH];
 
907
        FILE       *fd;
 
908
 
 
909
        /* insert an otherwise empty file called <XLOG>.ready */
 
910
        StatusFilePath(archiveStatusPath, xlog, ".ready");
 
911
        fd = AllocateFile(archiveStatusPath, "w");
 
912
        if (fd == NULL)
 
913
        {
 
914
                ereport(LOG,
 
915
                                (errcode_for_file_access(),
 
916
                                 errmsg("could not create archive status file \"%s\": %m",
 
917
                                                archiveStatusPath)));
 
918
                return;
 
919
        }
 
920
        if (FreeFile(fd))
 
921
        {
 
922
                ereport(LOG,
 
923
                                (errcode_for_file_access(),
 
924
                                 errmsg("could not write archive status file \"%s\": %m",
 
925
                                                archiveStatusPath)));
 
926
                return;
 
927
        }
 
928
 
 
929
        /* Notify archiver that it's got something to do */
 
930
        if (IsUnderPostmaster)
 
931
                SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
 
932
}
 
933
 
 
934
/*
 
935
 * Convenience routine to notify using log/seg representation of filename
 
936
 */
 
937
static void
 
938
XLogArchiveNotifySeg(uint32 log, uint32 seg)
 
939
{
 
940
        char            xlog[MAXFNAMELEN];
 
941
 
 
942
        XLogFileName(xlog, ThisTimeLineID, log, seg);
 
943
        XLogArchiveNotify(xlog);
 
944
}
 
945
 
 
946
/*
 
947
 * XLogArchiveIsDone
 
948
 *
 
949
 * Checks for a ".done" archive notification file.      This is called when we
 
950
 * are ready to delete or recycle an old XLOG segment file.  If it is okay
 
951
 * to delete it then return true.
 
952
 *
 
953
 * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
 
954
 * then return false; else create <XLOG>.ready and return false.  The
 
955
 * last case covers the possibility that the original attempt to create
 
956
 * <XLOG>.ready failed.
 
957
 */
 
958
static bool
 
959
XLogArchiveIsDone(const char *xlog)
 
960
{
 
961
        char            archiveStatusPath[MAXPGPATH];
 
962
        struct stat stat_buf;
 
963
 
 
964
        /* First check for .done --- this is the expected case */
 
965
        StatusFilePath(archiveStatusPath, xlog, ".done");
 
966
        if (stat(archiveStatusPath, &stat_buf) == 0)
 
967
                return true;
 
968
 
 
969
        /* check for .ready --- this means archiver is still busy with it */
 
970
        StatusFilePath(archiveStatusPath, xlog, ".ready");
 
971
        if (stat(archiveStatusPath, &stat_buf) == 0)
 
972
                return false;
 
973
 
 
974
        /* Race condition --- maybe archiver just finished, so recheck */
 
975
        StatusFilePath(archiveStatusPath, xlog, ".done");
 
976
        if (stat(archiveStatusPath, &stat_buf) == 0)
 
977
                return true;
 
978
 
 
979
        /* Retry creation of the .ready file */
 
980
        XLogArchiveNotify(xlog);
 
981
        return false;
 
982
}
 
983
 
 
984
/*
 
985
 * XLogArchiveCleanup
 
986
 *
 
987
 * Cleanup archive notification file(s) for a particular xlog segment
 
988
 */
 
989
static void
 
990
XLogArchiveCleanup(const char *xlog)
 
991
{
 
992
        char            archiveStatusPath[MAXPGPATH];
 
993
 
 
994
        /* Remove the .done file */
 
995
        StatusFilePath(archiveStatusPath, xlog, ".done");
 
996
        unlink(archiveStatusPath);
 
997
        /* should we complain about failure? */
 
998
 
 
999
        /* Remove the .ready file if present --- normally it shouldn't be */
 
1000
        StatusFilePath(archiveStatusPath, xlog, ".ready");
 
1001
        unlink(archiveStatusPath);
 
1002
        /* should we complain about failure? */
 
1003
}
 
1004
 
 
1005
/*
 
1006
 * Advance the Insert state to the next buffer page, writing out the next
 
1007
 * buffer if it still contains unwritten data.
 
1008
 *
 
1009
 * The global LogwrtRqst.Write pointer needs to be advanced to include the
 
1010
 * just-filled page.  If we can do this for free (without an extra lock),
 
1011
 * we do so here.  Otherwise the caller must do it.  We return TRUE if the
 
1012
 * request update still needs to be done, FALSE if we did it internally.
 
1013
 *
 
1014
 * Must be called with WALInsertLock held.
 
1015
 */
 
1016
static bool
 
1017
AdvanceXLInsertBuffer(void)
 
1018
{
 
1019
        XLogCtlInsert *Insert = &XLogCtl->Insert;
 
1020
        XLogCtlWrite *Write = &XLogCtl->Write;
 
1021
        uint16          nextidx = NextBufIdx(Insert->curridx);
 
1022
        bool            update_needed = true;
 
1023
        XLogRecPtr      OldPageRqstPtr;
 
1024
        XLogwrtRqst WriteRqst;
 
1025
        XLogRecPtr      NewPageEndPtr;
 
1026
        XLogPageHeader NewPage;
 
1027
 
 
1028
        /* Use Insert->LogwrtResult copy if it's more fresh */
 
1029
        if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
 
1030
                LogwrtResult = Insert->LogwrtResult;
 
1031
 
 
1032
        /*
 
1033
         * Get ending-offset of the buffer page we need to replace (this may
 
1034
         * be zero if the buffer hasn't been used yet).  Fall through if it's
 
1035
         * already written out.
 
1036
         */
 
1037
        OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
 
1038
        if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
 
1039
        {
 
1040
                /* nope, got work to do... */
 
1041
                XLogRecPtr      FinishedPageRqstPtr;
 
1042
 
 
1043
                FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
 
1044
 
 
1045
                /* Before waiting, get info_lck and update LogwrtResult */
 
1046
                {
 
1047
                        /* use volatile pointer to prevent code rearrangement */
 
1048
                        volatile XLogCtlData *xlogctl = XLogCtl;
 
1049
 
 
1050
                        SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
 
1051
                        if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
 
1052
                                xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
 
1053
                        LogwrtResult = xlogctl->LogwrtResult;
 
1054
                        SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
 
1055
                }
 
1056
 
 
1057
                update_needed = false;  /* Did the shared-request update */
 
1058
 
 
1059
                if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
 
1060
                {
 
1061
                        /* OK, someone wrote it already */
 
1062
                        Insert->LogwrtResult = LogwrtResult;
 
1063
                }
 
1064
                else
 
1065
                {
 
1066
                        /* Must acquire write lock */
 
1067
                        LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 
1068
                        LogwrtResult = Write->LogwrtResult;
 
1069
                        if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
 
1070
                        {
 
1071
                                /* OK, someone wrote it already */
 
1072
                                LWLockRelease(WALWriteLock);
 
1073
                                Insert->LogwrtResult = LogwrtResult;
 
1074
                        }
 
1075
                        else
 
1076
                        {
 
1077
                                /*
 
1078
                                 * Have to write buffers while holding insert lock. This
 
1079
                                 * is not good, so only write as much as we absolutely
 
1080
                                 * must.
 
1081
                                 */
 
1082
                                WriteRqst.Write = OldPageRqstPtr;
 
1083
                                WriteRqst.Flush.xlogid = 0;
 
1084
                                WriteRqst.Flush.xrecoff = 0;
 
1085
                                XLogWrite(WriteRqst);
 
1086
                                LWLockRelease(WALWriteLock);
 
1087
                                Insert->LogwrtResult = LogwrtResult;
 
1088
                        }
 
1089
                }
 
1090
        }
 
1091
 
 
1092
        /*
 
1093
         * Now the next buffer slot is free and we can set it up to be the
 
1094
         * next output page.
 
1095
         */
 
1096
        NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
 
1097
        if (NewPageEndPtr.xrecoff >= XLogFileSize)
 
1098
        {
 
1099
                /* crossing a logid boundary */
 
1100
                NewPageEndPtr.xlogid += 1;
 
1101
                NewPageEndPtr.xrecoff = BLCKSZ;
 
1102
        }
 
1103
        else
 
1104
                NewPageEndPtr.xrecoff += BLCKSZ;
 
1105
        XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
 
1106
        NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * BLCKSZ);
 
1107
        Insert->curridx = nextidx;
 
1108
        Insert->currpage = NewPage;
 
1109
        Insert->currpos = ((char *) NewPage) + SizeOfXLogShortPHD;
 
1110
 
 
1111
        /*
 
1112
         * Be sure to re-zero the buffer so that bytes beyond what we've
 
1113
         * written will look like zeroes and not valid XLOG records...
 
1114
         */
 
1115
        MemSet((char *) NewPage, 0, BLCKSZ);
 
1116
 
 
1117
        /*
 
1118
         * Fill the new page's header
 
1119
         */
 
1120
        NewPage->xlp_magic = XLOG_PAGE_MAGIC;
 
1121
        /* NewPage->xlp_info = 0; */    /* done by memset */
 
1122
        NewPage->xlp_tli = ThisTimeLineID;
 
1123
        NewPage->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
 
1124
        NewPage->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - BLCKSZ;
 
1125
 
 
1126
        /*
 
1127
         * If first page of an XLOG segment file, make it a long header.
 
1128
         */
 
1129
        if ((NewPage->xlp_pageaddr.xrecoff % XLogSegSize) == 0)
 
1130
        {
 
1131
                XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
 
1132
 
 
1133
                NewLongPage->xlp_sysid = ControlFile->system_identifier;
 
1134
                NewLongPage->xlp_seg_size = XLogSegSize;
 
1135
                NewPage->xlp_info |= XLP_LONG_HEADER;
 
1136
                Insert->currpos = ((char *) NewPage) + SizeOfXLogLongPHD;
 
1137
        }
 
1138
 
 
1139
        return update_needed;
 
1140
}
 
1141
 
 
1142
/*
 
1143
 * Write and/or fsync the log at least as far as WriteRqst indicates.
 
1144
 *
 
1145
 * Must be called with WALWriteLock held.
 
1146
 */
 
1147
static void
 
1148
XLogWrite(XLogwrtRqst WriteRqst)
 
1149
{
 
1150
        XLogCtlWrite *Write = &XLogCtl->Write;
 
1151
        char       *from;
 
1152
        bool            ispartialpage;
 
1153
        bool            use_existent;
 
1154
 
 
1155
        /*
 
1156
         * Update local LogwrtResult (caller probably did this already,
 
1157
         * but...)
 
1158
         */
 
1159
        LogwrtResult = Write->LogwrtResult;
 
1160
 
 
1161
        while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
 
1162
        {
 
1163
                /*
 
1164
                 * Make sure we're not ahead of the insert process.  This could
 
1165
                 * happen if we're passed a bogus WriteRqst.Write that is past the
 
1166
                 * end of the last page that's been initialized by
 
1167
                 * AdvanceXLInsertBuffer.
 
1168
                 */
 
1169
                if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[Write->curridx]))
 
1170
                        elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
 
1171
                                 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
 
1172
                                 XLogCtl->xlblocks[Write->curridx].xlogid,
 
1173
                                 XLogCtl->xlblocks[Write->curridx].xrecoff);
 
1174
 
 
1175
                /* Advance LogwrtResult.Write to end of current buffer page */
 
1176
                LogwrtResult.Write = XLogCtl->xlblocks[Write->curridx];
 
1177
                ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);
 
1178
 
 
1179
                if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
 
1180
                {
 
1181
                        /*
 
1182
                         * Switch to new logfile segment.
 
1183
                         */
 
1184
                        if (openLogFile >= 0)
 
1185
                        {
 
1186
                                if (close(openLogFile))
 
1187
                                        ereport(PANIC,
 
1188
                                                        (errcode_for_file_access(),
 
1189
                                        errmsg("could not close log file %u, segment %u: %m",
 
1190
                                                   openLogId, openLogSeg)));
 
1191
                                openLogFile = -1;
 
1192
                        }
 
1193
                        XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
 
1194
 
 
1195
                        /* create/use new log file */
 
1196
                        use_existent = true;
 
1197
                        openLogFile = XLogFileInit(openLogId, openLogSeg,
 
1198
                                                                           &use_existent, true);
 
1199
                        openLogOff = 0;
 
1200
 
 
1201
                        /* update pg_control, unless someone else already did */
 
1202
                        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 
1203
                        if (ControlFile->logId < openLogId ||
 
1204
                                (ControlFile->logId == openLogId &&
 
1205
                                 ControlFile->logSeg < openLogSeg + 1))
 
1206
                        {
 
1207
                                ControlFile->logId = openLogId;
 
1208
                                ControlFile->logSeg = openLogSeg + 1;
 
1209
                                ControlFile->time = time(NULL);
 
1210
                                UpdateControlFile();
 
1211
 
 
1212
                                /*
 
1213
                                 * Signal bgwriter to start a checkpoint if it's been
 
1214
                                 * too long since the last one.  (We look at local copy of
 
1215
                                 * RedoRecPtr which might be a little out of date, but
 
1216
                                 * should be close enough for this purpose.)
 
1217
                                 *
 
1218
                                 * A straight computation of segment number could overflow
 
1219
                                 * 32 bits.  Rather than assuming we have working 64-bit
 
1220
                                 * arithmetic, we compare the highest-order bits separately,
 
1221
                                 * and force a checkpoint immediately when they change.
 
1222
                                 */
 
1223
                                if (IsUnderPostmaster)
 
1224
                                {
 
1225
                                        uint32          old_segno,
 
1226
                                                                new_segno;
 
1227
                                        uint32          old_highbits,
 
1228
                                                                new_highbits;
 
1229
 
 
1230
                                        old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
 
1231
                                                (RedoRecPtr.xrecoff / XLogSegSize);
 
1232
                                        old_highbits = RedoRecPtr.xlogid / XLogSegSize;
 
1233
                                        new_segno = (openLogId % XLogSegSize) * XLogSegsPerFile +
 
1234
                                                openLogSeg;
 
1235
                                        new_highbits = openLogId / XLogSegSize;
 
1236
                                        if (new_highbits != old_highbits ||
 
1237
                                                new_segno >= old_segno + (uint32) CheckPointSegments)
 
1238
                                        {
 
1239
#ifdef WAL_DEBUG
 
1240
                                                if (XLOG_DEBUG)
 
1241
                                                        elog(LOG, "time for a checkpoint, signaling bgwriter");
 
1242
#endif
 
1243
                                                RequestCheckpoint(false);
 
1244
                                        }
 
1245
                                }
 
1246
                        }
 
1247
                        LWLockRelease(ControlFileLock);
 
1248
                }
 
1249
 
 
1250
                if (openLogFile < 0)
 
1251
                {
 
1252
                        XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
 
1253
                        openLogFile = XLogFileOpen(openLogId, openLogSeg);
 
1254
                        openLogOff = 0;
 
1255
                }
 
1256
 
 
1257
                /* Need to seek in the file? */
 
1258
                if (openLogOff != (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize)
 
1259
                {
 
1260
                        openLogOff = (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize;
 
1261
                        if (lseek(openLogFile, (off_t) openLogOff, SEEK_SET) < 0)
 
1262
                                ereport(PANIC,
 
1263
                                                (errcode_for_file_access(),
 
1264
                                                 errmsg("could not seek in log file %u, segment %u to offset %u: %m",
 
1265
                                                                openLogId, openLogSeg, openLogOff)));
 
1266
                }
 
1267
 
 
1268
                /* OK to write the page */
 
1269
                from = XLogCtl->pages + Write->curridx * BLCKSZ;
 
1270
                errno = 0;
 
1271
                if (write(openLogFile, from, BLCKSZ) != BLCKSZ)
 
1272
                {
 
1273
                        /* if write didn't set errno, assume problem is no disk space */
 
1274
                        if (errno == 0)
 
1275
                                errno = ENOSPC;
 
1276
                        ereport(PANIC,
 
1277
                                        (errcode_for_file_access(),
 
1278
                                         errmsg("could not write to log file %u, segment %u at offset %u: %m",
 
1279
                                                        openLogId, openLogSeg, openLogOff)));
 
1280
                }
 
1281
                openLogOff += BLCKSZ;
 
1282
 
 
1283
                /*
 
1284
                 * If we just wrote the whole last page of a logfile segment,
 
1285
                 * fsync the segment immediately.  This avoids having to go back
 
1286
                 * and re-open prior segments when an fsync request comes along
 
1287
                 * later. Doing it here ensures that one and only one backend will
 
1288
                 * perform this fsync.
 
1289
                 *
 
1290
                 * This is also the right place to notify the Archiver that the
 
1291
                 * segment is ready to copy to archival storage.
 
1292
                 */
 
1293
                if (openLogOff >= XLogSegSize && !ispartialpage)
 
1294
                {
 
1295
                        issue_xlog_fsync();
 
1296
                        LogwrtResult.Flush = LogwrtResult.Write;        /* end of current page */
 
1297
 
 
1298
                        if (XLogArchivingActive())
 
1299
                                XLogArchiveNotifySeg(openLogId, openLogSeg);
 
1300
                }
 
1301
 
 
1302
                if (ispartialpage)
 
1303
                {
 
1304
                        /* Only asked to write a partial page */
 
1305
                        LogwrtResult.Write = WriteRqst.Write;
 
1306
                        break;
 
1307
                }
 
1308
                Write->curridx = NextBufIdx(Write->curridx);
 
1309
        }
 
1310
 
 
1311
        /*
 
1312
         * If asked to flush, do so
 
1313
         */
 
1314
        if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
 
1315
                XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
 
1316
        {
 
1317
                /*
 
1318
                 * Could get here without iterating above loop, in which case we
 
1319
                 * might have no open file or the wrong one.  However, we do not
 
1320
                 * need to fsync more than one file.
 
1321
                 */
 
1322
                if (sync_method != SYNC_METHOD_OPEN)
 
1323
                {
 
1324
                        if (openLogFile >= 0 &&
 
1325
                         !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
 
1326
                        {
 
1327
                                if (close(openLogFile))
 
1328
                                        ereport(PANIC,
 
1329
                                                        (errcode_for_file_access(),
 
1330
                                        errmsg("could not close log file %u, segment %u: %m",
 
1331
                                                   openLogId, openLogSeg)));
 
1332
                                openLogFile = -1;
 
1333
                        }
 
1334
                        if (openLogFile < 0)
 
1335
                        {
 
1336
                                XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
 
1337
                                openLogFile = XLogFileOpen(openLogId, openLogSeg);
 
1338
                                openLogOff = 0;
 
1339
                        }
 
1340
                        issue_xlog_fsync();
 
1341
                }
 
1342
                LogwrtResult.Flush = LogwrtResult.Write;
 
1343
        }
 
1344
 
 
1345
        /*
 
1346
         * Update shared-memory status
 
1347
         *
 
1348
         * We make sure that the shared 'request' values do not fall behind the
 
1349
         * 'result' values.  This is not absolutely essential, but it saves
 
1350
         * some code in a couple of places.
 
1351
         */
 
1352
        {
 
1353
                /* use volatile pointer to prevent code rearrangement */
 
1354
                volatile XLogCtlData *xlogctl = XLogCtl;
 
1355
 
 
1356
                SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
 
1357
                xlogctl->LogwrtResult = LogwrtResult;
 
1358
                if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
 
1359
                        xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
 
1360
                if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
 
1361
                        xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
 
1362
                SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
 
1363
        }
 
1364
 
 
1365
        Write->LogwrtResult = LogwrtResult;
 
1366
}
 
1367
 
 
1368
/*
 
1369
 * Ensure that all XLOG data through the given position is flushed to disk.
 
1370
 *
 
1371
 * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
 
1372
 * already held, and we try to avoid acquiring it if possible.
 
1373
 */
 
1374
void
 
1375
XLogFlush(XLogRecPtr record)
 
1376
{
 
1377
        XLogRecPtr      WriteRqstPtr;
 
1378
        XLogwrtRqst WriteRqst;
 
1379
 
 
1380
        /* Disabled during REDO */
 
1381
        if (InRedo)
 
1382
                return;
 
1383
 
 
1384
        /* Quick exit if already known flushed */
 
1385
        if (XLByteLE(record, LogwrtResult.Flush))
 
1386
                return;
 
1387
 
 
1388
#ifdef WAL_DEBUG
 
1389
        if (XLOG_DEBUG)
 
1390
                elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
 
1391
                         record.xlogid, record.xrecoff,
 
1392
                         LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
 
1393
                         LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
 
1394
#endif
 
1395
 
 
1396
        START_CRIT_SECTION();
 
1397
 
 
1398
        /*
 
1399
         * Since fsync is usually a horribly expensive operation, we try to
 
1400
         * piggyback as much data as we can on each fsync: if we see any more
 
1401
         * data entered into the xlog buffer, we'll write and fsync that too,
 
1402
         * so that the final value of LogwrtResult.Flush is as large as
 
1403
         * possible. This gives us some chance of avoiding another fsync
 
1404
         * immediately after.
 
1405
         */
 
1406
 
 
1407
        /* initialize to given target; may increase below */
 
1408
        WriteRqstPtr = record;
 
1409
 
 
1410
        /* read LogwrtResult and update local state */
 
1411
        {
 
1412
                /* use volatile pointer to prevent code rearrangement */
 
1413
                volatile XLogCtlData *xlogctl = XLogCtl;
 
1414
 
 
1415
                SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
 
1416
                if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write))
 
1417
                        WriteRqstPtr = xlogctl->LogwrtRqst.Write;
 
1418
                LogwrtResult = xlogctl->LogwrtResult;
 
1419
                SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
 
1420
        }
 
1421
 
 
1422
        /* done already? */
 
1423
        if (!XLByteLE(record, LogwrtResult.Flush))
 
1424
        {
 
1425
                /* now wait for the write lock */
 
1426
                LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 
1427
                LogwrtResult = XLogCtl->Write.LogwrtResult;
 
1428
                if (!XLByteLE(record, LogwrtResult.Flush))
 
1429
                {
 
1430
                        /* try to write/flush later additions to XLOG as well */
 
1431
                        if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
 
1432
                        {
 
1433
                                XLogCtlInsert *Insert = &XLogCtl->Insert;
 
1434
                                uint32          freespace = INSERT_FREESPACE(Insert);
 
1435
 
 
1436
                                if (freespace < SizeOfXLogRecord)               /* buffer is full */
 
1437
                                        WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
 
1438
                                else
 
1439
                                {
 
1440
                                        WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
 
1441
                                        WriteRqstPtr.xrecoff -= freespace;
 
1442
                                }
 
1443
                                LWLockRelease(WALInsertLock);
 
1444
                                WriteRqst.Write = WriteRqstPtr;
 
1445
                                WriteRqst.Flush = WriteRqstPtr;
 
1446
                        }
 
1447
                        else
 
1448
                        {
 
1449
                                WriteRqst.Write = WriteRqstPtr;
 
1450
                                WriteRqst.Flush = record;
 
1451
                        }
 
1452
                        XLogWrite(WriteRqst);
 
1453
                }
 
1454
                LWLockRelease(WALWriteLock);
 
1455
        }
 
1456
 
 
1457
        END_CRIT_SECTION();
 
1458
 
 
1459
        /*
 
1460
         * If we still haven't flushed to the request point then we have a
 
1461
         * problem; most likely, the requested flush point is past end of
 
1462
         * XLOG. This has been seen to occur when a disk page has a corrupted
 
1463
         * LSN.
 
1464
         *
 
1465
         * Formerly we treated this as a PANIC condition, but that hurts the
 
1466
         * system's robustness rather than helping it: we do not want to take
 
1467
         * down the whole system due to corruption on one data page.  In
 
1468
         * particular, if the bad page is encountered again during recovery
 
1469
         * then we would be unable to restart the database at all!      (This
 
1470
         * scenario has actually happened in the field several times with 7.1
 
1471
         * releases. Note that we cannot get here while InRedo is true, but if
 
1472
         * the bad page is brought in and marked dirty during recovery then
 
1473
         * CreateCheckPoint will try to flush it at the end of recovery.)
 
1474
         *
 
1475
         * The current approach is to ERROR under normal conditions, but only
 
1476
         * WARNING during recovery, so that the system can be brought up even
 
1477
         * if there's a corrupt LSN.  Note that for calls from xact.c, the
 
1478
         * ERROR will be promoted to PANIC since xact.c calls this routine
 
1479
         * inside a critical section.  However, calls from bufmgr.c are not
 
1480
         * within critical sections and so we will not force a restart for a
 
1481
         * bad LSN on a data page.
 
1482
         */
 
1483
        if (XLByteLT(LogwrtResult.Flush, record))
 
1484
                elog(InRecovery ? WARNING : ERROR,
 
1485
                         "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
 
1486
                         record.xlogid, record.xrecoff,
 
1487
                         LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
 
1488
}
 
1489
 
 
1490
/*
 
1491
 * Create a new XLOG file segment, or open a pre-existing one.
 
1492
 *
 
1493
 * log, seg: identify segment to be created/opened.
 
1494
 *
 
1495
 * *use_existent: if TRUE, OK to use a pre-existing file (else, any
 
1496
 * pre-existing file will be deleted).  On return, TRUE if a pre-existing
 
1497
 * file was used.
 
1498
 *
 
1499
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
 
1500
 * place.  This should be TRUE except during bootstrap log creation.  The
 
1501
 * caller must *not* hold the lock at call.
 
1502
 *
 
1503
 * Returns FD of opened file.
 
1504
 */
 
1505
static int
 
1506
XLogFileInit(uint32 log, uint32 seg,
 
1507
                         bool *use_existent, bool use_lock)
 
1508
{
 
1509
        char            path[MAXPGPATH];
 
1510
        char            tmppath[MAXPGPATH];
 
1511
        char            zbuffer[BLCKSZ];
 
1512
        int                     fd;
 
1513
        int                     nbytes;
 
1514
 
 
1515
        XLogFilePath(path, ThisTimeLineID, log, seg);
 
1516
 
 
1517
        /*
 
1518
         * Try to use existent file (checkpoint maker may have created it
 
1519
         * already)
 
1520
         */
 
1521
        if (*use_existent)
 
1522
        {
 
1523
                fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
 
1524
                                                   S_IRUSR | S_IWUSR);
 
1525
                if (fd < 0)
 
1526
                {
 
1527
                        if (errno != ENOENT)
 
1528
                                ereport(PANIC,
 
1529
                                                (errcode_for_file_access(),
 
1530
                                                 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
 
1531
                                                                path, log, seg)));
 
1532
                }
 
1533
                else
 
1534
                        return (fd);
 
1535
        }
 
1536
 
 
1537
        /*
 
1538
         * Initialize an empty (all zeroes) segment.  NOTE: it is possible
 
1539
         * that another process is doing the same thing.  If so, we will end
 
1540
         * up pre-creating an extra log segment.  That seems OK, and better
 
1541
         * than holding the lock throughout this lengthy process.
 
1542
         */
 
1543
        snprintf(tmppath, MAXPGPATH, "%s/xlogtemp.%d", XLogDir, (int)getpid());
 
1544
 
 
1545
        unlink(tmppath);
 
1546
 
 
1547
        /* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */
 
1548
        fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
 
1549
                                           S_IRUSR | S_IWUSR);
 
1550
        if (fd < 0)
 
1551
                ereport(PANIC,
 
1552
                                (errcode_for_file_access(),
 
1553
                                 errmsg("could not create file \"%s\": %m", tmppath)));
 
1554
 
 
1555
        /*
 
1556
         * Zero-fill the file.  We have to do this the hard way to ensure that
 
1557
         * all the file space has really been allocated --- on platforms that
 
1558
         * allow "holes" in files, just seeking to the end doesn't allocate
 
1559
         * intermediate space.  This way, we know that we have all the space
 
1560
         * and (after the fsync below) that all the indirect blocks are down
 
1561
         * on disk.  Therefore, fdatasync(2) or O_DSYNC will be sufficient to
 
1562
         * sync future writes to the log file.
 
1563
         */
 
1564
        MemSet(zbuffer, 0, sizeof(zbuffer));
 
1565
        for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
 
1566
        {
 
1567
                errno = 0;
 
1568
                if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
 
1569
                {
 
1570
                        int                     save_errno = errno;
 
1571
 
 
1572
                        /*
 
1573
                         * If we fail to make the file, delete it to release disk
 
1574
                         * space
 
1575
                         */
 
1576
                        unlink(tmppath);
 
1577
                        /* if write didn't set errno, assume problem is no disk space */
 
1578
                        errno = save_errno ? save_errno : ENOSPC;
 
1579
 
 
1580
                        ereport(PANIC,
 
1581
                                        (errcode_for_file_access(),
 
1582
                                 errmsg("could not write to file \"%s\": %m", tmppath)));
 
1583
                }
 
1584
        }
 
1585
 
 
1586
        if (pg_fsync(fd) != 0)
 
1587
                ereport(PANIC,
 
1588
                                (errcode_for_file_access(),
 
1589
                                 errmsg("could not fsync file \"%s\": %m", tmppath)));
 
1590
 
 
1591
        if (close(fd))
 
1592
                ereport(PANIC,
 
1593
                                (errcode_for_file_access(),
 
1594
                                 errmsg("could not close file \"%s\": %m", tmppath)));
 
1595
 
 
1596
        /*
 
1597
         * Now move the segment into place with its final name.
 
1598
         *
 
1599
         * If caller didn't want to use a pre-existing file, get rid of any
 
1600
         * pre-existing file.  Otherwise, cope with possibility that someone
 
1601
         * else has created the file while we were filling ours: if so, use
 
1602
         * ours to pre-create a future log segment.
 
1603
         */
 
1604
        if (!InstallXLogFileSegment(log, seg, tmppath,
 
1605
                                                                *use_existent, XLOGfileslop,
 
1606
                                                                use_lock))
 
1607
        {
 
1608
                /* No need for any more future segments... */
 
1609
                unlink(tmppath);
 
1610
        }
 
1611
 
 
1612
        /* Set flag to tell caller there was no existent file */
 
1613
        *use_existent = false;
 
1614
 
 
1615
        /* Now open original target segment (might not be file I just made) */
 
1616
        fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
 
1617
                                           S_IRUSR | S_IWUSR);
 
1618
        if (fd < 0)
 
1619
                ereport(PANIC,
 
1620
                                (errcode_for_file_access(),
 
1621
                errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
 
1622
                           path, log, seg)));
 
1623
 
 
1624
        return (fd);
 
1625
}
 
1626
 
 
1627
/*
 
1628
 * Create a new XLOG file segment by copying a pre-existing one.
 
1629
 *
 
1630
 * log, seg: identify segment to be created.
 
1631
 *
 
1632
 * srcTLI, srclog, srcseg: identify segment to be copied (could be from
 
1633
 *              a different timeline)
 
1634
 *
 
1635
 * Currently this is only used during recovery, and so there are no locking
 
1636
 * considerations.      But we should be just as tense as XLogFileInit to avoid
 
1637
 * emplacing a bogus file.
 
1638
 */
 
1639
static void
 
1640
XLogFileCopy(uint32 log, uint32 seg,
 
1641
                         TimeLineID srcTLI, uint32 srclog, uint32 srcseg)
 
1642
{
 
1643
        char            path[MAXPGPATH];
 
1644
        char            tmppath[MAXPGPATH];
 
1645
        char            buffer[BLCKSZ];
 
1646
        int                     srcfd;
 
1647
        int                     fd;
 
1648
        int                     nbytes;
 
1649
 
 
1650
        /*
 
1651
         * Open the source file
 
1652
         */
 
1653
        XLogFilePath(path, srcTLI, srclog, srcseg);
 
1654
        srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
 
1655
        if (srcfd < 0)
 
1656
                ereport(PANIC,
 
1657
                                (errcode_for_file_access(),
 
1658
                                 errmsg("could not open file \"%s\": %m", path)));
 
1659
 
 
1660
        /*
 
1661
         * Copy into a temp file name.
 
1662
         */
 
1663
        snprintf(tmppath, MAXPGPATH, "%s/xlogtemp.%d", XLogDir, (int)getpid());
 
1664
 
 
1665
        unlink(tmppath);
 
1666
 
 
1667
        /* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */
 
1668
        fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
 
1669
                                           S_IRUSR | S_IWUSR);
 
1670
        if (fd < 0)
 
1671
                ereport(PANIC,
 
1672
                                (errcode_for_file_access(),
 
1673
                                 errmsg("could not create file \"%s\": %m", tmppath)));
 
1674
 
 
1675
        /*
 
1676
         * Do the data copying.
 
1677
         */
 
1678
        for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
 
1679
        {
 
1680
                errno = 0;
 
1681
                if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
 
1682
                {
 
1683
                        if (errno != 0)
 
1684
                                ereport(PANIC,
 
1685
                                                (errcode_for_file_access(),
 
1686
                                                 errmsg("could not read file \"%s\": %m", path)));
 
1687
                        else
 
1688
                                ereport(PANIC,
 
1689
                                         (errmsg("not enough data in file \"%s\"", path)));
 
1690
                }
 
1691
                errno = 0;
 
1692
                if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
 
1693
                {
 
1694
                        int                     save_errno = errno;
 
1695
 
 
1696
                        /*
 
1697
                         * If we fail to make the file, delete it to release disk
 
1698
                         * space
 
1699
                         */
 
1700
                        unlink(tmppath);
 
1701
                        /* if write didn't set errno, assume problem is no disk space */
 
1702
                        errno = save_errno ? save_errno : ENOSPC;
 
1703
 
 
1704
                        ereport(PANIC,
 
1705
                                        (errcode_for_file_access(),
 
1706
                                 errmsg("could not write to file \"%s\": %m", tmppath)));
 
1707
                }
 
1708
        }
 
1709
 
 
1710
        if (pg_fsync(fd) != 0)
 
1711
                ereport(PANIC,
 
1712
                                (errcode_for_file_access(),
 
1713
                                 errmsg("could not fsync file \"%s\": %m", tmppath)));
 
1714
 
 
1715
        if (close(fd))
 
1716
                ereport(PANIC,
 
1717
                                (errcode_for_file_access(),
 
1718
                                 errmsg("could not close file \"%s\": %m", tmppath)));
 
1719
 
 
1720
        close(srcfd);
 
1721
 
 
1722
        /*
 
1723
         * Now move the segment into place with its final name.
 
1724
         */
 
1725
        if (!InstallXLogFileSegment(log, seg, tmppath, false, 0, false))
 
1726
                elog(PANIC, "InstallXLogFileSegment should not have failed");
 
1727
}
 
1728
 
 
1729
/*
 
1730
 * Install a new XLOG segment file as a current or future log segment.
 
1731
 *
 
1732
 * This is used both to install a newly-created segment (which has a temp
 
1733
 * filename while it's being created) and to recycle an old segment.
 
1734
 *
 
1735
 * log, seg: identify segment to install as (or first possible target).
 
1736
 *
 
1737
 * tmppath: initial name of file to install.  It will be renamed into place.
 
1738
 *
 
1739
 * find_free: if TRUE, install the new segment at the first empty log/seg
 
1740
 * number at or after the passed numbers.  If FALSE, install the new segment
 
1741
 * exactly where specified, deleting any existing segment file there.
 
1742
 *
 
1743
 * max_advance: maximum number of log/seg slots to advance past the starting
 
1744
 * point.  Fail if no free slot is found in this range.  (Irrelevant if
 
1745
 * find_free is FALSE.)
 
1746
 *
 
1747
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
 
1748
 * place.  This should be TRUE except during bootstrap log creation.  The
 
1749
 * caller must *not* hold the lock at call.
 
1750
 *
 
1751
 * Returns TRUE if file installed, FALSE if not installed because of
 
1752
 * exceeding max_advance limit.  (Any other kind of failure causes ereport().)
 
1753
 */
 
1754
static bool
 
1755
InstallXLogFileSegment(uint32 log, uint32 seg, char *tmppath,
 
1756
                                           bool find_free, int max_advance,
 
1757
                                           bool use_lock)
 
1758
{
 
1759
        char            path[MAXPGPATH];
 
1760
        struct stat stat_buf;
 
1761
 
 
1762
        XLogFilePath(path, ThisTimeLineID, log, seg);
 
1763
 
 
1764
        /*
 
1765
         * We want to be sure that only one process does this at a time.
 
1766
         */
 
1767
        if (use_lock)
 
1768
                LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 
1769
 
 
1770
        if (!find_free)
 
1771
        {
 
1772
                /* Force installation: get rid of any pre-existing segment file */
 
1773
                unlink(path);
 
1774
        }
 
1775
        else
 
1776
        {
 
1777
                /* Find a free slot to put it in */
 
1778
                while (stat(path, &stat_buf) == 0)
 
1779
                {
 
1780
                        if (--max_advance < 0)
 
1781
                        {
 
1782
                                /* Failed to find a free slot within specified range */
 
1783
                                if (use_lock)
 
1784
                                        LWLockRelease(ControlFileLock);
 
1785
                                return false;
 
1786
                        }
 
1787
                        NextLogSeg(log, seg);
 
1788
                        XLogFilePath(path, ThisTimeLineID, log, seg);
 
1789
                }
 
1790
        }
 
1791
 
 
1792
        /*
 
1793
         * Prefer link() to rename() here just to be really sure that we don't
 
1794
         * overwrite an existing logfile.  However, there shouldn't be one, so
 
1795
         * rename() is an acceptable substitute except for the truly paranoid.
 
1796
         */
 
1797
#if HAVE_WORKING_LINK
 
1798
        if (link(tmppath, path) < 0)
 
1799
                ereport(PANIC,
 
1800
                                (errcode_for_file_access(),
 
1801
                                 errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
 
1802
                                                tmppath, path, log, seg)));
 
1803
        unlink(tmppath);
 
1804
#else
 
1805
        if (rename(tmppath, path) < 0)
 
1806
                ereport(PANIC,
 
1807
                                (errcode_for_file_access(),
 
1808
                                 errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
 
1809
                                                tmppath, path, log, seg)));
 
1810
#endif
 
1811
 
 
1812
        if (use_lock)
 
1813
                LWLockRelease(ControlFileLock);
 
1814
 
 
1815
        return true;
 
1816
}
 
1817
 
 
1818
/*
 
1819
 * Open a pre-existing logfile segment for writing.
 
1820
 */
 
1821
static int
 
1822
XLogFileOpen(uint32 log, uint32 seg)
 
1823
{
 
1824
        char            path[MAXPGPATH];
 
1825
        int                     fd;
 
1826
 
 
1827
        XLogFilePath(path, ThisTimeLineID, log, seg);
 
1828
 
 
1829
        fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
 
1830
                                           S_IRUSR | S_IWUSR);
 
1831
        if (fd < 0)
 
1832
                ereport(PANIC,
 
1833
                                (errcode_for_file_access(),
 
1834
                errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
 
1835
                           path, log, seg)));
 
1836
 
 
1837
        return fd;
 
1838
}
 
1839
 
 
1840
/*
 
1841
 * Open a logfile segment for reading (during recovery).
 
1842
 */
 
1843
static int
 
1844
XLogFileRead(uint32 log, uint32 seg, int emode)
 
1845
{
 
1846
        char            path[MAXPGPATH];
 
1847
        char            xlogfname[MAXFNAMELEN];
 
1848
        ListCell   *cell;
 
1849
        int                     fd;
 
1850
 
 
1851
        /*
 
1852
         * Loop looking for a suitable timeline ID: we might need to read any
 
1853
         * of the timelines listed in expectedTLIs.
 
1854
         *
 
1855
         * We expect curFileTLI on entry to be the TLI of the preceding file in
 
1856
         * sequence, or 0 if there was no predecessor.  We do not allow
 
1857
         * curFileTLI to go backwards; this prevents us from picking up the
 
1858
         * wrong file when a parent timeline extends to higher segment numbers
 
1859
         * than the child we want to read.
 
1860
         */
 
1861
        foreach(cell, expectedTLIs)
 
1862
        {
 
1863
                TimeLineID      tli = (TimeLineID) lfirst_int(cell);
 
1864
 
 
1865
                if (tli < curFileTLI)
 
1866
                        break;                          /* don't bother looking at too-old TLIs */
 
1867
 
 
1868
                if (InArchiveRecovery)
 
1869
                {
 
1870
                        XLogFileName(xlogfname, tli, log, seg);
 
1871
                        restoredFromArchive = RestoreArchivedFile(path, xlogfname,
 
1872
                                                                                                          "RECOVERYXLOG",
 
1873
                                                                                                          XLogSegSize);
 
1874
                }
 
1875
                else
 
1876
                        XLogFilePath(path, tli, log, seg);
 
1877
 
 
1878
                fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
 
1879
                if (fd >= 0)
 
1880
                {
 
1881
                        /* Success! */
 
1882
                        curFileTLI = tli;
 
1883
                        return fd;
 
1884
                }
 
1885
                if (errno != ENOENT)    /* unexpected failure? */
 
1886
                        ereport(PANIC,
 
1887
                                        (errcode_for_file_access(),
 
1888
                                         errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
 
1889
                                                        path, log, seg)));
 
1890
        }
 
1891
 
 
1892
        /* Couldn't find it.  For simplicity, complain about front timeline */
 
1893
        XLogFilePath(path, recoveryTargetTLI, log, seg);
 
1894
        errno = ENOENT;
 
1895
        ereport(emode,
 
1896
                        (errcode_for_file_access(),
 
1897
           errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
 
1898
                          path, log, seg)));
 
1899
        return -1;
 
1900
}
 
1901
 
 
1902
/*
 
1903
 * Attempt to retrieve the specified file from off-line archival storage.
 
1904
 * If successful, fill "path" with its complete path (note that this will be
 
1905
 * a temp file name that doesn't follow the normal naming convention), and
 
1906
 * return TRUE.
 
1907
 *
 
1908
 * If not successful, fill "path" with the name of the normal on-line file
 
1909
 * (which may or may not actually exist, but we'll try to use it), and return
 
1910
 * FALSE.
 
1911
 *
 
1912
 * For fixed-size files, the caller may pass the expected size as an
 
1913
 * additional crosscheck on successful recovery.  If the file size is not
 
1914
 * known, set expectedSize = 0.
 
1915
 */
 
1916
static bool
 
1917
RestoreArchivedFile(char *path, const char *xlogfname,
 
1918
                                        const char *recovername, off_t expectedSize)
 
1919
{
 
1920
        char            xlogpath[MAXPGPATH];
 
1921
        char            xlogRestoreCmd[MAXPGPATH];
 
1922
        char       *dp;
 
1923
        char       *endp;
 
1924
        const char *sp;
 
1925
        int                     rc;
 
1926
        struct stat stat_buf;
 
1927
 
 
1928
        /*
 
1929
         * When doing archive recovery, we always prefer an archived log file
 
1930
         * even if a file of the same name exists in XLogDir.  The reason is
 
1931
         * that the file in XLogDir could be an old, un-filled or
 
1932
         * partly-filled version that was copied and restored as part of
 
1933
         * backing up $PGDATA.
 
1934
         *
 
1935
         * We could try to optimize this slightly by checking the local copy
 
1936
         * lastchange timestamp against the archived copy, but we have no API
 
1937
         * to do this, nor can we guarantee that the lastchange timestamp was
 
1938
         * preserved correctly when we copied to archive. Our aim is
 
1939
         * robustness, so we elect not to do this.
 
1940
         *
 
1941
         * If we cannot obtain the log file from the archive, however, we will
 
1942
         * try to use the XLogDir file if it exists.  This is so that we can
 
1943
         * make use of log segments that weren't yet transferred to the
 
1944
         * archive.
 
1945
         *
 
1946
         * Notice that we don't actually overwrite any files when we copy back
 
1947
         * from archive because the recoveryRestoreCommand may inadvertently
 
1948
         * restore inappropriate xlogs, or they may be corrupt, so we may wish
 
1949
         * to fallback to the segments remaining in current XLogDir later. The
 
1950
         * copy-from-archive filename is always the same, ensuring that we
 
1951
         * don't run out of disk space on long recoveries.
 
1952
         */
 
1953
        snprintf(xlogpath, MAXPGPATH, "%s/%s", XLogDir, recovername);
 
1954
 
 
1955
        /*
 
1956
         * Make sure there is no existing file named recovername.
 
1957
         */
 
1958
        if (stat(xlogpath, &stat_buf) != 0)
 
1959
        {
 
1960
                if (errno != ENOENT)
 
1961
                        ereport(FATAL,
 
1962
                                        (errcode_for_file_access(),
 
1963
                                         errmsg("could not stat file \"%s\": %m",
 
1964
                                                        xlogpath)));
 
1965
        }
 
1966
        else
 
1967
        {
 
1968
                if (unlink(xlogpath) != 0)
 
1969
                        ereport(FATAL,
 
1970
                                        (errcode_for_file_access(),
 
1971
                                         errmsg("could not remove file \"%s\": %m",
 
1972
                                                        xlogpath)));
 
1973
        }
 
1974
 
 
1975
        /*
 
1976
         * construct the command to be executed
 
1977
         */
 
1978
        dp = xlogRestoreCmd;
 
1979
        endp = xlogRestoreCmd + MAXPGPATH - 1;
 
1980
        *endp = '\0';
 
1981
 
 
1982
        for (sp = recoveryRestoreCommand; *sp; sp++)
 
1983
        {
 
1984
                if (*sp == '%')
 
1985
                {
 
1986
                        switch (sp[1])
 
1987
                        {
 
1988
                                case 'p':
 
1989
                                        /* %p: full path of target file */
 
1990
                                        sp++;
 
1991
                                        StrNCpy(dp, xlogpath, endp - dp);
 
1992
                                        make_native_path(dp);
 
1993
                                        dp += strlen(dp);
 
1994
                                        break;
 
1995
                                case 'f':
 
1996
                                        /* %f: filename of desired file */
 
1997
                                        sp++;
 
1998
                                        StrNCpy(dp, xlogfname, endp - dp);
 
1999
                                        dp += strlen(dp);
 
2000
                                        break;
 
2001
                                case '%':
 
2002
                                        /* convert %% to a single % */
 
2003
                                        sp++;
 
2004
                                        if (dp < endp)
 
2005
                                                *dp++ = *sp;
 
2006
                                        break;
 
2007
                                default:
 
2008
                                        /* otherwise treat the % as not special */
 
2009
                                        if (dp < endp)
 
2010
                                                *dp++ = *sp;
 
2011
                                        break;
 
2012
                        }
 
2013
                }
 
2014
                else
 
2015
                {
 
2016
                        if (dp < endp)
 
2017
                                *dp++ = *sp;
 
2018
                }
 
2019
        }
 
2020
        *dp = '\0';
 
2021
 
 
2022
        ereport(DEBUG3,
 
2023
                        (errmsg_internal("executing restore command \"%s\"",
 
2024
                                                         xlogRestoreCmd)));
 
2025
 
 
2026
        /*
 
2027
         * Copy xlog from archival storage to XLogDir
 
2028
         */
 
2029
        rc = system(xlogRestoreCmd);
 
2030
        if (rc == 0)
 
2031
        {
 
2032
                /*
 
2033
                 * command apparently succeeded, but let's make sure the file is
 
2034
                 * really there now and has the correct size.
 
2035
                 *
 
2036
                 * XXX I made wrong-size a fatal error to ensure the DBA would notice
 
2037
                 * it, but is that too strong?  We could try to plow ahead with a
 
2038
                 * local copy of the file ... but the problem is that there
 
2039
                 * probably isn't one, and we'd incorrectly conclude we've reached
 
2040
                 * the end of WAL and we're done recovering ...
 
2041
                 */
 
2042
                if (stat(xlogpath, &stat_buf) == 0)
 
2043
                {
 
2044
                        if (expectedSize > 0 && stat_buf.st_size != expectedSize)
 
2045
                                ereport(FATAL,
 
2046
                                                (errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
 
2047
                                                                xlogfname,
 
2048
                                                                (unsigned long) stat_buf.st_size,
 
2049
                                                                (unsigned long) expectedSize)));
 
2050
                        else
 
2051
                        {
 
2052
                                ereport(LOG,
 
2053
                                                (errmsg("restored log file \"%s\" from archive",
 
2054
                                                                xlogfname)));
 
2055
                                strcpy(path, xlogpath);
 
2056
                                return true;
 
2057
                        }
 
2058
                }
 
2059
                else
 
2060
                {
 
2061
                        /* stat failed */
 
2062
                        if (errno != ENOENT)
 
2063
                                ereport(FATAL,
 
2064
                                                (errcode_for_file_access(),
 
2065
                                                 errmsg("could not stat file \"%s\": %m",
 
2066
                                                                xlogpath)));
 
2067
                }
 
2068
        }
 
2069
 
 
2070
        /*
 
2071
         * remember, we rollforward UNTIL the restore fails so failure here is
 
2072
         * just part of the process... that makes it difficult to determine
 
2073
         * whether the restore failed because there isn't an archive to
 
2074
         * restore, or because the administrator has specified the restore
 
2075
         * program incorrectly.  We have to assume the former.
 
2076
         */
 
2077
        ereport(DEBUG1,
 
2078
                 (errmsg("could not restore file \"%s\" from archive: return code %d",
 
2079
                                 xlogfname, rc)));
 
2080
 
 
2081
        /*
 
2082
         * if an archived file is not available, there might still be a
 
2083
         * version of this file in XLogDir, so return that as the filename to
 
2084
         * open.
 
2085
         *
 
2086
         * In many recovery scenarios we expect this to fail also, but if so that
 
2087
         * just means we've reached the end of WAL.
 
2088
         */
 
2089
        snprintf(path, MAXPGPATH, "%s/%s", XLogDir, xlogfname);
 
2090
        return false;
 
2091
}
 
2092
 
 
2093
/*
 
2094
 * Preallocate log files beyond the specified log endpoint, according to
 
2095
 * the XLOGfile user parameter.
 
2096
 */
 
2097
static int
 
2098
PreallocXlogFiles(XLogRecPtr endptr)
 
2099
{
 
2100
        int                     nsegsadded = 0;
 
2101
        uint32          _logId;
 
2102
        uint32          _logSeg;
 
2103
        int                     lf;
 
2104
        bool            use_existent;
 
2105
 
 
2106
        XLByteToPrevSeg(endptr, _logId, _logSeg);
 
2107
        if ((endptr.xrecoff - 1) % XLogSegSize >=
 
2108
                (uint32) (0.75 * XLogSegSize))
 
2109
        {
 
2110
                NextLogSeg(_logId, _logSeg);
 
2111
                use_existent = true;
 
2112
                lf = XLogFileInit(_logId, _logSeg, &use_existent, true);
 
2113
                close(lf);
 
2114
                if (!use_existent)
 
2115
                        nsegsadded++;
 
2116
        }
 
2117
        return nsegsadded;
 
2118
}
 
2119
 
 
2120
/*
 
2121
 * Remove or move offline all log files older or equal to passed log/seg#
 
2122
 *
 
2123
 * endptr is current (or recent) end of xlog; this is used to determine
 
2124
 * whether we want to recycle rather than delete no-longer-wanted log files.
 
2125
 */
 
2126
static void
 
2127
MoveOfflineLogs(uint32 log, uint32 seg, XLogRecPtr endptr,
 
2128
                                int *nsegsremoved, int *nsegsrecycled)
 
2129
{
 
2130
        uint32          endlogId;
 
2131
        uint32          endlogSeg;
 
2132
        DIR                *xldir;
 
2133
        struct dirent *xlde;
 
2134
        char            lastoff[MAXFNAMELEN];
 
2135
        char            path[MAXPGPATH];
 
2136
 
 
2137
        *nsegsremoved = 0;
 
2138
        *nsegsrecycled = 0;
 
2139
 
 
2140
        XLByteToPrevSeg(endptr, endlogId, endlogSeg);
 
2141
 
 
2142
        xldir = AllocateDir(XLogDir);
 
2143
        if (xldir == NULL)
 
2144
                ereport(ERROR,
 
2145
                                (errcode_for_file_access(),
 
2146
                        errmsg("could not open transaction log directory \"%s\": %m",
 
2147
                                   XLogDir)));
 
2148
 
 
2149
        XLogFileName(lastoff, ThisTimeLineID, log, seg);
 
2150
 
 
2151
        errno = 0;
 
2152
        while ((xlde = readdir(xldir)) != NULL)
 
2153
        {
 
2154
                /*
 
2155
                 * We ignore the timeline part of the XLOG segment identifiers in
 
2156
                 * deciding whether a segment is still needed.  This ensures that
 
2157
                 * we won't prematurely remove a segment from a parent timeline.
 
2158
                 * We could probably be a little more proactive about removing
 
2159
                 * segments of non-parent timelines, but that would be a whole lot
 
2160
                 * more complicated.
 
2161
                 *
 
2162
                 * We use the alphanumeric sorting property of the filenames to
 
2163
                 * decide which ones are earlier than the lastoff segment.
 
2164
                 */
 
2165
                if (strlen(xlde->d_name) == 24 &&
 
2166
                        strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
 
2167
                        strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
 
2168
                {
 
2169
                        bool            recycle;
 
2170
 
 
2171
                        if (XLogArchivingActive())
 
2172
                                recycle = XLogArchiveIsDone(xlde->d_name);
 
2173
                        else
 
2174
                                recycle = true;
 
2175
 
 
2176
                        if (recycle)
 
2177
                        {
 
2178
                                snprintf(path, MAXPGPATH, "%s/%s", XLogDir, xlde->d_name);
 
2179
 
 
2180
                                /*
 
2181
                                 * Before deleting the file, see if it can be recycled as
 
2182
                                 * a future log segment.  We allow recycling segments up
 
2183
                                 * to XLOGfileslop segments beyond the current XLOG
 
2184
                                 * location.
 
2185
                                 */
 
2186
                                if (InstallXLogFileSegment(endlogId, endlogSeg, path,
 
2187
                                                                                   true, XLOGfileslop,
 
2188
                                                                                   true))
 
2189
                                {
 
2190
                                        ereport(DEBUG2,
 
2191
                                                  (errmsg("recycled transaction log file \"%s\"",
 
2192
                                                                  xlde->d_name)));
 
2193
                                        (*nsegsrecycled)++;
 
2194
                                }
 
2195
                                else
 
2196
                                {
 
2197
                                        /* No need for any more future segments... */
 
2198
                                        ereport(DEBUG2,
 
2199
                                                  (errmsg("removing transaction log file \"%s\"",
 
2200
                                                                  xlde->d_name)));
 
2201
                                        unlink(path);
 
2202
                                        (*nsegsremoved)++;
 
2203
                                }
 
2204
 
 
2205
                                XLogArchiveCleanup(xlde->d_name);
 
2206
                        }
 
2207
                }
 
2208
                errno = 0;
 
2209
        }
 
2210
#ifdef WIN32
 
2211
 
 
2212
        /*
 
2213
         * This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but
 
2214
         * not in released version
 
2215
         */
 
2216
        if (GetLastError() == ERROR_NO_MORE_FILES)
 
2217
                errno = 0;
 
2218
#endif
 
2219
        if (errno)
 
2220
                ereport(ERROR,
 
2221
                                (errcode_for_file_access(),
 
2222
                        errmsg("could not read transaction log directory \"%s\": %m",
 
2223
                                   XLogDir)));
 
2224
        FreeDir(xldir);
 
2225
}
 
2226
 
 
2227
/*
 
2228
 * Restore the backup blocks present in an XLOG record, if any.
 
2229
 *
 
2230
 * We assume all of the record has been read into memory at *record.
 
2231
 *
 
2232
 * Note: when a backup block is available in XLOG, we restore it
 
2233
 * unconditionally, even if the page in the database appears newer.
 
2234
 * This is to protect ourselves against database pages that were partially
 
2235
 * or incorrectly written during a crash.  We assume that the XLOG data
 
2236
 * must be good because it has passed a CRC check, while the database
 
2237
 * page might not be.  This will force us to replay all subsequent
 
2238
 * modifications of the page that appear in XLOG, rather than possibly
 
2239
 * ignoring them as already applied, but that's not a huge drawback.
 
2240
 */
 
2241
static void
 
2242
RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
 
2243
{
 
2244
        Relation        reln;
 
2245
        Buffer          buffer;
 
2246
        Page            page;
 
2247
        BkpBlock        bkpb;
 
2248
        char       *blk;
 
2249
        int                     i;
 
2250
 
 
2251
        blk = (char *) XLogRecGetData(record) + record->xl_len;
 
2252
        for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 
2253
        {
 
2254
                if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
 
2255
                        continue;
 
2256
 
 
2257
                memcpy((char *) &bkpb, blk, sizeof(BkpBlock));
 
2258
                blk += sizeof(BkpBlock);
 
2259
 
 
2260
                reln = XLogOpenRelation(true, record->xl_rmid, bkpb.node);
 
2261
 
 
2262
                if (reln)
 
2263
                {
 
2264
                        buffer = XLogReadBuffer(true, reln, bkpb.block);
 
2265
                        if (BufferIsValid(buffer))
 
2266
                        {
 
2267
                                page = (Page) BufferGetPage(buffer);
 
2268
                                memcpy((char *) page, blk, BLCKSZ);
 
2269
                                PageSetLSN(page, lsn);
 
2270
                                PageSetTLI(page, ThisTimeLineID);
 
2271
                                LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 
2272
                                WriteBuffer(buffer);
 
2273
                        }
 
2274
                }
 
2275
 
 
2276
                blk += BLCKSZ;
 
2277
        }
 
2278
}
 
2279
 
 
2280
/*
 
2281
 * CRC-check an XLOG record.  We do not believe the contents of an XLOG
 
2282
 * record (other than to the minimal extent of computing the amount of
 
2283
 * data to read in) until we've checked the CRCs.
 
2284
 *
 
2285
 * We assume all of the record has been read into memory at *record.
 
2286
 */
 
2287
static bool
 
2288
RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
 
2289
{
 
2290
        crc64           crc;
 
2291
        crc64           cbuf;
 
2292
        int                     i;
 
2293
        uint32          len = record->xl_len;
 
2294
        char       *blk;
 
2295
 
 
2296
        /* Check CRC of rmgr data and record header */
 
2297
        INIT_CRC64(crc);
 
2298
        COMP_CRC64(crc, XLogRecGetData(record), len);
 
2299
        COMP_CRC64(crc, (char *) record + sizeof(crc64),
 
2300
                           SizeOfXLogRecord - sizeof(crc64));
 
2301
        FIN_CRC64(crc);
 
2302
 
 
2303
        if (!EQ_CRC64(record->xl_crc, crc))
 
2304
        {
 
2305
                ereport(emode,
 
2306
                                (errmsg("incorrect resource manager data checksum in record at %X/%X",
 
2307
                                                recptr.xlogid, recptr.xrecoff)));
 
2308
                return (false);
 
2309
        }
 
2310
 
 
2311
        /* Check CRCs of backup blocks, if any */
 
2312
        blk = (char *) XLogRecGetData(record) + len;
 
2313
        for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 
2314
        {
 
2315
                if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
 
2316
                        continue;
 
2317
 
 
2318
                INIT_CRC64(crc);
 
2319
                COMP_CRC64(crc, blk + sizeof(BkpBlock), BLCKSZ);
 
2320
                COMP_CRC64(crc, blk + sizeof(crc64),
 
2321
                                   sizeof(BkpBlock) - sizeof(crc64));
 
2322
                FIN_CRC64(crc);
 
2323
                memcpy((char *) &cbuf, blk, sizeof(crc64));             /* don't assume
 
2324
                                                                                                                 * alignment */
 
2325
 
 
2326
                if (!EQ_CRC64(cbuf, crc))
 
2327
                {
 
2328
                        ereport(emode,
 
2329
                                        (errmsg("incorrect checksum of backup block %d in record at %X/%X",
 
2330
                                                        i + 1, recptr.xlogid, recptr.xrecoff)));
 
2331
                        return (false);
 
2332
                }
 
2333
                blk += sizeof(BkpBlock) + BLCKSZ;
 
2334
        }
 
2335
 
 
2336
        return (true);
 
2337
}
 
2338
 
 
2339
/*
 
2340
 * Attempt to read an XLOG record.
 
2341
 *
 
2342
 * If RecPtr is not NULL, try to read a record at that position.  Otherwise
 
2343
 * try to read a record just after the last one previously read.
 
2344
 *
 
2345
 * If no valid record is available, returns NULL, or fails if emode is PANIC.
 
2346
 * (emode must be either PANIC or LOG.)
 
2347
 *
 
2348
 * The record is copied into readRecordBuf, so that on successful return,
 
2349
 * the returned record pointer always points there.
 
2350
 */
 
2351
static XLogRecord *
 
2352
ReadRecord(XLogRecPtr *RecPtr, int emode)
 
2353
{
 
2354
        XLogRecord *record;
 
2355
        char       *buffer;
 
2356
        XLogRecPtr      tmpRecPtr = EndRecPtr;
 
2357
        bool            randAccess = false;
 
2358
        uint32          len,
 
2359
                                total_len;
 
2360
        uint32          targetPageOff;
 
2361
        uint32          targetRecOff;
 
2362
        uint32          pageHeaderSize;
 
2363
        unsigned        i;
 
2364
 
 
2365
        if (readBuf == NULL)
 
2366
        {
 
2367
                /*
 
2368
                 * First time through, permanently allocate readBuf.  We do it
 
2369
                 * this way, rather than just making a static array, for two
 
2370
                 * reasons: (1) no need to waste the storage in most
 
2371
                 * instantiations of the backend; (2) a static char array isn't
 
2372
                 * guaranteed to have any particular alignment, whereas malloc()
 
2373
                 * will provide MAXALIGN'd storage.
 
2374
                 */
 
2375
                readBuf = (char *) malloc(BLCKSZ);
 
2376
                Assert(readBuf != NULL);
 
2377
        }
 
2378
 
 
2379
        if (RecPtr == NULL)
 
2380
        {
 
2381
                RecPtr = &tmpRecPtr;
 
2382
                /* fast case if next record is on same page */
 
2383
                if (nextRecord != NULL)
 
2384
                {
 
2385
                        record = nextRecord;
 
2386
                        goto got_record;
 
2387
                }
 
2388
                /* align old recptr to next page */
 
2389
                if (tmpRecPtr.xrecoff % BLCKSZ != 0)
 
2390
                        tmpRecPtr.xrecoff += (BLCKSZ - tmpRecPtr.xrecoff % BLCKSZ);
 
2391
                if (tmpRecPtr.xrecoff >= XLogFileSize)
 
2392
                {
 
2393
                        (tmpRecPtr.xlogid)++;
 
2394
                        tmpRecPtr.xrecoff = 0;
 
2395
                }
 
2396
                /* We will account for page header size below */
 
2397
        }
 
2398
        else
 
2399
        {
 
2400
                if (!XRecOffIsValid(RecPtr->xrecoff))
 
2401
                        ereport(PANIC,
 
2402
                                        (errmsg("invalid record offset at %X/%X",
 
2403
                                                        RecPtr->xlogid, RecPtr->xrecoff)));
 
2404
 
 
2405
                /*
 
2406
                 * Since we are going to a random position in WAL, forget any
 
2407
                 * prior state about what timeline we were in, and allow it to be
 
2408
                 * any timeline in expectedTLIs.  We also set a flag to allow
 
2409
                 * curFileTLI to go backwards (but we can't reset that variable
 
2410
                 * right here, since we might not change files at all).
 
2411
                 */
 
2412
                lastPageTLI = 0;                /* see comment in ValidXLOGHeader */
 
2413
                randAccess = true;              /* allow curFileTLI to go backwards too */
 
2414
        }
 
2415
 
 
2416
        if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
 
2417
        {
 
2418
                close(readFile);
 
2419
                readFile = -1;
 
2420
        }
 
2421
        XLByteToSeg(*RecPtr, readId, readSeg);
 
2422
        if (readFile < 0)
 
2423
        {
 
2424
                /* Now it's okay to reset curFileTLI if random fetch */
 
2425
                if (randAccess)
 
2426
                        curFileTLI = 0;
 
2427
 
 
2428
                readFile = XLogFileRead(readId, readSeg, emode);
 
2429
                if (readFile < 0)
 
2430
                        goto next_record_is_invalid;
 
2431
                readOff = (uint32) (-1);        /* force read to occur below */
 
2432
        }
 
2433
 
 
2434
        targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / BLCKSZ) * BLCKSZ;
 
2435
        if (readOff != targetPageOff)
 
2436
        {
 
2437
                readOff = targetPageOff;
 
2438
                if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
 
2439
                {
 
2440
                        ereport(emode,
 
2441
                                        (errcode_for_file_access(),
 
2442
                                         errmsg("could not seek in log file %u, segment %u to offset %u: %m",
 
2443
                                                        readId, readSeg, readOff)));
 
2444
                        goto next_record_is_invalid;
 
2445
                }
 
2446
                if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
 
2447
                {
 
2448
                        ereport(emode,
 
2449
                                        (errcode_for_file_access(),
 
2450
                                         errmsg("could not read from log file %u, segment %u at offset %u: %m",
 
2451
                                                        readId, readSeg, readOff)));
 
2452
                        goto next_record_is_invalid;
 
2453
                }
 
2454
                if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
 
2455
                        goto next_record_is_invalid;
 
2456
        }
 
2457
        pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
 
2458
        targetRecOff = RecPtr->xrecoff % BLCKSZ;
 
2459
        if (targetRecOff == 0)
 
2460
        {
 
2461
                /*
 
2462
                 * Can only get here in the continuing-from-prev-page case,
 
2463
                 * because XRecOffIsValid eliminated the zero-page-offset case
 
2464
                 * otherwise. Need to skip over the new page's header.
 
2465
                 */
 
2466
                tmpRecPtr.xrecoff += pageHeaderSize;
 
2467
                targetRecOff = pageHeaderSize;
 
2468
        }
 
2469
        else if (targetRecOff < pageHeaderSize)
 
2470
        {
 
2471
                ereport(emode,
 
2472
                                (errmsg("invalid record offset at %X/%X",
 
2473
                                                RecPtr->xlogid, RecPtr->xrecoff)));
 
2474
                goto next_record_is_invalid;
 
2475
        }
 
2476
        if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
 
2477
                targetRecOff == pageHeaderSize)
 
2478
        {
 
2479
                ereport(emode,
 
2480
                                (errmsg("contrecord is requested by %X/%X",
 
2481
                                                RecPtr->xlogid, RecPtr->xrecoff)));
 
2482
                goto next_record_is_invalid;
 
2483
        }
 
2484
        record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % BLCKSZ);
 
2485
 
 
2486
got_record:;
 
2487
 
 
2488
        /*
 
2489
         * Currently, xl_len == 0 must be bad data, but that might not be true
 
2490
         * forever.  See note in XLogInsert.
 
2491
         */
 
2492
        if (record->xl_len == 0)
 
2493
        {
 
2494
                ereport(emode,
 
2495
                                (errmsg("record with zero length at %X/%X",
 
2496
                                                RecPtr->xlogid, RecPtr->xrecoff)));
 
2497
                goto next_record_is_invalid;
 
2498
        }
 
2499
        if (record->xl_rmid > RM_MAX_ID)
 
2500
        {
 
2501
                ereport(emode,
 
2502
                                (errmsg("invalid resource manager ID %u at %X/%X",
 
2503
                                         record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff)));
 
2504
                goto next_record_is_invalid;
 
2505
        }
 
2506
 
 
2507
        /*
 
2508
         * Compute total length of record including any appended backup
 
2509
         * blocks.
 
2510
         */
 
2511
        total_len = SizeOfXLogRecord + record->xl_len;
 
2512
        for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 
2513
        {
 
2514
                if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
 
2515
                        continue;
 
2516
                total_len += sizeof(BkpBlock) + BLCKSZ;
 
2517
        }
 
2518
 
 
2519
        /*
 
2520
         * Allocate or enlarge readRecordBuf as needed.  To avoid useless
 
2521
         * small increases, round its size to a multiple of BLCKSZ, and make
 
2522
         * sure it's at least 4*BLCKSZ to start with.  (That is enough for all
 
2523
         * "normal" records, but very large commit or abort records might need
 
2524
         * more space.)
 
2525
         */
 
2526
        if (total_len > readRecordBufSize)
 
2527
        {
 
2528
                uint32          newSize = total_len;
 
2529
 
 
2530
                newSize += BLCKSZ - (newSize % BLCKSZ);
 
2531
                newSize = Max(newSize, 4 * BLCKSZ);
 
2532
                if (readRecordBuf)
 
2533
                        free(readRecordBuf);
 
2534
                readRecordBuf = (char *) malloc(newSize);
 
2535
                if (!readRecordBuf)
 
2536
                {
 
2537
                        readRecordBufSize = 0;
 
2538
                        /* We treat this as a "bogus data" condition */
 
2539
                        ereport(emode,
 
2540
                                        (errmsg("record length %u at %X/%X too long",
 
2541
                                                        total_len, RecPtr->xlogid, RecPtr->xrecoff)));
 
2542
                        goto next_record_is_invalid;
 
2543
                }
 
2544
                readRecordBufSize = newSize;
 
2545
        }
 
2546
 
 
2547
        buffer = readRecordBuf;
 
2548
        nextRecord = NULL;
 
2549
        len = BLCKSZ - RecPtr->xrecoff % BLCKSZ;
 
2550
        if (total_len > len)
 
2551
        {
 
2552
                /* Need to reassemble record */
 
2553
                XLogContRecord *contrecord;
 
2554
                uint32          gotlen = len;
 
2555
 
 
2556
                memcpy(buffer, record, len);
 
2557
                record = (XLogRecord *) buffer;
 
2558
                buffer += len;
 
2559
                for (;;)
 
2560
                {
 
2561
                        readOff += BLCKSZ;
 
2562
                        if (readOff >= XLogSegSize)
 
2563
                        {
 
2564
                                close(readFile);
 
2565
                                readFile = -1;
 
2566
                                NextLogSeg(readId, readSeg);
 
2567
                                readFile = XLogFileRead(readId, readSeg, emode);
 
2568
                                if (readFile < 0)
 
2569
                                        goto next_record_is_invalid;
 
2570
                                readOff = 0;
 
2571
                        }
 
2572
                        if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
 
2573
                        {
 
2574
                                ereport(emode,
 
2575
                                                (errcode_for_file_access(),
 
2576
                                                 errmsg("could not read from log file %u, segment %u, offset %u: %m",
 
2577
                                                                readId, readSeg, readOff)));
 
2578
                                goto next_record_is_invalid;
 
2579
                        }
 
2580
                        if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
 
2581
                                goto next_record_is_invalid;
 
2582
                        if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
 
2583
                        {
 
2584
                                ereport(emode,
 
2585
                                                (errmsg("there is no contrecord flag in log file %u, segment %u, offset %u",
 
2586
                                                                readId, readSeg, readOff)));
 
2587
                                goto next_record_is_invalid;
 
2588
                        }
 
2589
                        pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
 
2590
                        contrecord = (XLogContRecord *) ((char *) readBuf + pageHeaderSize);
 
2591
                        if (contrecord->xl_rem_len == 0 ||
 
2592
                                total_len != (contrecord->xl_rem_len + gotlen))
 
2593
                        {
 
2594
                                ereport(emode,
 
2595
                                                (errmsg("invalid contrecord length %u in log file %u, segment %u, offset %u",
 
2596
                                                                contrecord->xl_rem_len,
 
2597
                                                                readId, readSeg, readOff)));
 
2598
                                goto next_record_is_invalid;
 
2599
                        }
 
2600
                        len = BLCKSZ - pageHeaderSize - SizeOfXLogContRecord;
 
2601
                        if (contrecord->xl_rem_len > len)
 
2602
                        {
 
2603
                                memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
 
2604
                                gotlen += len;
 
2605
                                buffer += len;
 
2606
                                continue;
 
2607
                        }
 
2608
                        memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
 
2609
                                   contrecord->xl_rem_len);
 
2610
                        break;
 
2611
                }
 
2612
                if (!RecordIsValid(record, *RecPtr, emode))
 
2613
                        goto next_record_is_invalid;
 
2614
                pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
 
2615
                if (BLCKSZ - SizeOfXLogRecord >= pageHeaderSize +
 
2616
                        SizeOfXLogContRecord + MAXALIGN(contrecord->xl_rem_len))
 
2617
                {
 
2618
                        nextRecord = (XLogRecord *) ((char *) contrecord +
 
2619
                                SizeOfXLogContRecord + MAXALIGN(contrecord->xl_rem_len));
 
2620
                }
 
2621
                EndRecPtr.xlogid = readId;
 
2622
                EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
 
2623
                        pageHeaderSize + SizeOfXLogContRecord +
 
2624
                        MAXALIGN(contrecord->xl_rem_len);
 
2625
                ReadRecPtr = *RecPtr;
 
2626
                return record;
 
2627
        }
 
2628
 
 
2629
        /* Record does not cross a page boundary */
 
2630
        if (!RecordIsValid(record, *RecPtr, emode))
 
2631
                goto next_record_is_invalid;
 
2632
        if (BLCKSZ - SizeOfXLogRecord >= RecPtr->xrecoff % BLCKSZ +
 
2633
                MAXALIGN(total_len))
 
2634
                nextRecord = (XLogRecord *) ((char *) record + MAXALIGN(total_len));
 
2635
        EndRecPtr.xlogid = RecPtr->xlogid;
 
2636
        EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
 
2637
        ReadRecPtr = *RecPtr;
 
2638
        memcpy(buffer, record, total_len);
 
2639
        return (XLogRecord *) buffer;
 
2640
 
 
2641
next_record_is_invalid:;
 
2642
        close(readFile);
 
2643
        readFile = -1;
 
2644
        nextRecord = NULL;
 
2645
        return NULL;
 
2646
}
 
2647
 
 
2648
/*
 
2649
 * Check whether the xlog header of a page just read in looks valid.
 
2650
 *
 
2651
 * This is just a convenience subroutine to avoid duplicated code in
 
2652
 * ReadRecord.  It's not intended for use from anywhere else.
 
2653
 */
 
2654
static bool
 
2655
ValidXLOGHeader(XLogPageHeader hdr, int emode)
 
2656
{
 
2657
        XLogRecPtr      recaddr;
 
2658
 
 
2659
        if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
 
2660
        {
 
2661
                ereport(emode,
 
2662
                                (errmsg("invalid magic number %04X in log file %u, segment %u, offset %u",
 
2663
                                                hdr->xlp_magic, readId, readSeg, readOff)));
 
2664
                return false;
 
2665
        }
 
2666
        if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
 
2667
        {
 
2668
                ereport(emode,
 
2669
                                (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
 
2670
                                                hdr->xlp_info, readId, readSeg, readOff)));
 
2671
                return false;
 
2672
        }
 
2673
        if (hdr->xlp_info & XLP_LONG_HEADER)
 
2674
        {
 
2675
                XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;
 
2676
 
 
2677
                if (longhdr->xlp_sysid != ControlFile->system_identifier)
 
2678
                {
 
2679
                        char            fhdrident_str[32];
 
2680
                        char            sysident_str[32];
 
2681
 
 
2682
                        /*
 
2683
                         * Format sysids separately to keep platform-dependent format
 
2684
                         * code out of the translatable message string.
 
2685
                         */
 
2686
                        snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT,
 
2687
                                         longhdr->xlp_sysid);
 
2688
                        snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
 
2689
                                         ControlFile->system_identifier);
 
2690
                        ereport(emode,
 
2691
                                        (errmsg("WAL file is from different system"),
 
2692
                                errdetail("WAL file SYSID is %s, pg_control SYSID is %s",
 
2693
                                                  fhdrident_str, sysident_str)));
 
2694
                        return false;
 
2695
                }
 
2696
                if (longhdr->xlp_seg_size != XLogSegSize)
 
2697
                {
 
2698
                        ereport(emode,
 
2699
                                        (errmsg("WAL file is from different system"),
 
2700
                                  errdetail("Incorrect XLOG_SEG_SIZE in page header.")));
 
2701
                        return false;
 
2702
                }
 
2703
        }
 
2704
        recaddr.xlogid = readId;
 
2705
        recaddr.xrecoff = readSeg * XLogSegSize + readOff;
 
2706
        if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
 
2707
        {
 
2708
                ereport(emode,
 
2709
                                (errmsg("unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
 
2710
                                         hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
 
2711
                                                readId, readSeg, readOff)));
 
2712
                return false;
 
2713
        }
 
2714
 
 
2715
        /*
 
2716
         * Check page TLI is one of the expected values.
 
2717
         */
 
2718
        if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli))
 
2719
        {
 
2720
                ereport(emode,
 
2721
                                (errmsg("unexpected timeline ID %u in log file %u, segment %u, offset %u",
 
2722
                                                hdr->xlp_tli,
 
2723
                                                readId, readSeg, readOff)));
 
2724
                return false;
 
2725
        }
 
2726
 
 
2727
        /*
 
2728
         * Since child timelines are always assigned a TLI greater than their
 
2729
         * immediate parent's TLI, we should never see TLI go backwards across
 
2730
         * successive pages of a consistent WAL sequence.
 
2731
         *
 
2732
         * Of course this check should only be applied when advancing
 
2733
         * sequentially across pages; therefore ReadRecord resets lastPageTLI
 
2734
         * to zero when going to a random page.
 
2735
         */
 
2736
        if (hdr->xlp_tli < lastPageTLI)
 
2737
        {
 
2738
                ereport(emode,
 
2739
                                (errmsg("out-of-sequence timeline ID %u (after %u) in log file %u, segment %u, offset %u",
 
2740
                                                hdr->xlp_tli, lastPageTLI,
 
2741
                                                readId, readSeg, readOff)));
 
2742
                return false;
 
2743
        }
 
2744
        lastPageTLI = hdr->xlp_tli;
 
2745
        return true;
 
2746
}
 
2747
 
 
2748
/*
 
2749
 * Try to read a timeline's history file.
 
2750
 *
 
2751
 * If successful, return the list of component TLIs (the given TLI followed by
 
2752
 * its ancestor TLIs).  If we can't find the history file, assume that the
 
2753
 * timeline has no parents, and return a list of just the specified timeline
 
2754
 * ID.
 
2755
 */
 
2756
static List *
 
2757
readTimeLineHistory(TimeLineID targetTLI)
 
2758
{
 
2759
        List       *result;
 
2760
        char            path[MAXPGPATH];
 
2761
        char            histfname[MAXFNAMELEN];
 
2762
        char            fline[MAXPGPATH];
 
2763
        FILE       *fd;
 
2764
 
 
2765
        if (InArchiveRecovery)
 
2766
        {
 
2767
                TLHistoryFileName(histfname, targetTLI);
 
2768
                RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
 
2769
        }
 
2770
        else
 
2771
                TLHistoryFilePath(path, targetTLI);
 
2772
 
 
2773
        fd = AllocateFile(path, "r");
 
2774
        if (fd == NULL)
 
2775
        {
 
2776
                if (errno != ENOENT)
 
2777
                        ereport(FATAL,
 
2778
                                        (errcode_for_file_access(),
 
2779
                                         errmsg("could not open file \"%s\": %m", path)));
 
2780
                /* Not there, so assume no parents */
 
2781
                return list_make1_int((int) targetTLI);
 
2782
        }
 
2783
 
 
2784
        result = NIL;
 
2785
 
 
2786
        /*
 
2787
         * Parse the file...
 
2788
         */
 
2789
        while (fgets(fline, MAXPGPATH, fd) != NULL)
 
2790
        {
 
2791
                /* skip leading whitespace and check for # comment */
 
2792
                char       *ptr;
 
2793
                char       *endptr;
 
2794
                TimeLineID      tli;
 
2795
 
 
2796
                for (ptr = fline; *ptr; ptr++)
 
2797
                {
 
2798
                        if (!isspace((unsigned char) *ptr))
 
2799
                                break;
 
2800
                }
 
2801
                if (*ptr == '\0' || *ptr == '#')
 
2802
                        continue;
 
2803
 
 
2804
                /* expect a numeric timeline ID as first field of line */
 
2805
                tli = (TimeLineID) strtoul(ptr, &endptr, 0);
 
2806
                if (endptr == ptr)
 
2807
                        ereport(FATAL,
 
2808
                                        (errmsg("syntax error in history file: %s", fline),
 
2809
                                         errhint("Expected a numeric timeline ID.")));
 
2810
 
 
2811
                if (result &&
 
2812
                        tli <= (TimeLineID) linitial_int(result))
 
2813
                        ereport(FATAL,
 
2814
                                        (errmsg("invalid data in history file: %s", fline),
 
2815
                           errhint("Timeline IDs must be in increasing sequence.")));
 
2816
 
 
2817
                /* Build list with newest item first */
 
2818
                result = lcons_int((int) tli, result);
 
2819
 
 
2820
                /* we ignore the remainder of each line */
 
2821
        }
 
2822
 
 
2823
        FreeFile(fd);
 
2824
 
 
2825
        if (result &&
 
2826
                targetTLI <= (TimeLineID) linitial_int(result))
 
2827
                ereport(FATAL,
 
2828
                                (errmsg("invalid data in history file \"%s\"", path),
 
2829
                errhint("Timeline IDs must be less than child timeline's ID.")));
 
2830
 
 
2831
        result = lcons_int((int) targetTLI, result);
 
2832
 
 
2833
        ereport(DEBUG3,
 
2834
                        (errmsg_internal("history of timeline %u is %s",
 
2835
                                                         targetTLI, nodeToString(result))));
 
2836
 
 
2837
        return result;
 
2838
}
 
2839
 
 
2840
/*
 
2841
 * Probe whether a timeline history file exists for the given timeline ID
 
2842
 */
 
2843
static bool
 
2844
existsTimeLineHistory(TimeLineID probeTLI)
 
2845
{
 
2846
        char            path[MAXPGPATH];
 
2847
        char            histfname[MAXFNAMELEN];
 
2848
        FILE       *fd;
 
2849
 
 
2850
        if (InArchiveRecovery)
 
2851
        {
 
2852
                TLHistoryFileName(histfname, probeTLI);
 
2853
                RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
 
2854
        }
 
2855
        else
 
2856
                TLHistoryFilePath(path, probeTLI);
 
2857
 
 
2858
        fd = AllocateFile(path, "r");
 
2859
        if (fd != NULL)
 
2860
        {
 
2861
                FreeFile(fd);
 
2862
                return true;
 
2863
        }
 
2864
        else
 
2865
        {
 
2866
                if (errno != ENOENT)
 
2867
                        ereport(FATAL,
 
2868
                                        (errcode_for_file_access(),
 
2869
                                         errmsg("could not open file \"%s\": %m", path)));
 
2870
                return false;
 
2871
        }
 
2872
}
 
2873
 
 
2874
/*
 
2875
 * Find the newest existing timeline, assuming that startTLI exists.
 
2876
 *
 
2877
 * Note: while this is somewhat heuristic, it does positively guarantee
 
2878
 * that (result + 1) is not a known timeline, and therefore it should
 
2879
 * be safe to assign that ID to a new timeline.
 
2880
 */
 
2881
static TimeLineID
 
2882
findNewestTimeLine(TimeLineID startTLI)
 
2883
{
 
2884
        TimeLineID      newestTLI;
 
2885
        TimeLineID      probeTLI;
 
2886
 
 
2887
        /*
 
2888
         * The algorithm is just to probe for the existence of timeline
 
2889
         * history files.  XXX is it useful to allow gaps in the sequence?
 
2890
         */
 
2891
        newestTLI = startTLI;
 
2892
 
 
2893
        for (probeTLI = startTLI + 1;; probeTLI++)
 
2894
        {
 
2895
                if (existsTimeLineHistory(probeTLI))
 
2896
                {
 
2897
                        newestTLI = probeTLI;           /* probeTLI exists */
 
2898
                }
 
2899
                else
 
2900
                {
 
2901
                        /* doesn't exist, assume we're done */
 
2902
                        break;
 
2903
                }
 
2904
        }
 
2905
 
 
2906
        return newestTLI;
 
2907
}
 
2908
 
 
2909
/*
 
2910
 * Create a new timeline history file.
 
2911
 *
 
2912
 *      newTLI: ID of the new timeline
 
2913
 *      parentTLI: ID of its immediate parent
 
2914
 *      endTLI et al: ID of the last used WAL file, for annotation purposes
 
2915
 *
 
2916
 * Currently this is only used during recovery, and so there are no locking
 
2917
 * considerations.      But we should be just as tense as XLogFileInit to avoid
 
2918
 * emplacing a bogus file.
 
2919
 */
 
2920
static void
 
2921
writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 
2922
                                         TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
 
2923
{
 
2924
        char            path[MAXPGPATH];
 
2925
        char            tmppath[MAXPGPATH];
 
2926
        char            histfname[MAXFNAMELEN];
 
2927
        char            xlogfname[MAXFNAMELEN];
 
2928
        char            buffer[BLCKSZ];
 
2929
        int                     srcfd;
 
2930
        int                     fd;
 
2931
        int                     nbytes;
 
2932
 
 
2933
        Assert(newTLI > parentTLI); /* else bad selection of newTLI */
 
2934
 
 
2935
        /*
 
2936
         * Write into a temp file name.
 
2937
         */
 
2938
        snprintf(tmppath, MAXPGPATH, "%s/xlogtemp.%d", XLogDir, (int)getpid());
 
2939
 
 
2940
        unlink(tmppath);
 
2941
 
 
2942
        /* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */
 
2943
        fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
 
2944
                                           S_IRUSR | S_IWUSR);
 
2945
        if (fd < 0)
 
2946
                ereport(PANIC,
 
2947
                                (errcode_for_file_access(),
 
2948
                                 errmsg("could not create file \"%s\": %m", tmppath)));
 
2949
 
 
2950
        /*
 
2951
         * If a history file exists for the parent, copy it verbatim
 
2952
         */
 
2953
        if (InArchiveRecovery)
 
2954
        {
 
2955
                TLHistoryFileName(histfname, parentTLI);
 
2956
                RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
 
2957
        }
 
2958
        else
 
2959
                TLHistoryFilePath(path, parentTLI);
 
2960
 
 
2961
        srcfd = BasicOpenFile(path, O_RDONLY, 0);
 
2962
        if (srcfd < 0)
 
2963
        {
 
2964
                if (errno != ENOENT)
 
2965
                        ereport(FATAL,
 
2966
                                        (errcode_for_file_access(),
 
2967
                                         errmsg("could not open file \"%s\": %m", path)));
 
2968
                /* Not there, so assume parent has no parents */
 
2969
        }
 
2970
        else
 
2971
        {
 
2972
                for (;;)
 
2973
                {
 
2974
                        errno = 0;
 
2975
                        nbytes = (int) read(srcfd, buffer, sizeof(buffer));
 
2976
                        if (nbytes < 0 || errno != 0)
 
2977
                                ereport(PANIC,
 
2978
                                                (errcode_for_file_access(),
 
2979
                                                 errmsg("could not read file \"%s\": %m", path)));
 
2980
                        if (nbytes == 0)
 
2981
                                break;
 
2982
                        errno = 0;
 
2983
                        if ((int) write(fd, buffer, nbytes) != nbytes)
 
2984
                        {
 
2985
                                int                     save_errno = errno;
 
2986
 
 
2987
                                /*
 
2988
                                 * If we fail to make the file, delete it to release disk
 
2989
                                 * space
 
2990
                                 */
 
2991
                                unlink(tmppath);
 
2992
 
 
2993
                                /*
 
2994
                                 * if write didn't set errno, assume problem is no disk
 
2995
                                 * space
 
2996
                                 */
 
2997
                                errno = save_errno ? save_errno : ENOSPC;
 
2998
 
 
2999
                                ereport(PANIC,
 
3000
                                                (errcode_for_file_access(),
 
3001
                                 errmsg("could not write to file \"%s\": %m", tmppath)));
 
3002
                        }
 
3003
                }
 
3004
                close(srcfd);
 
3005
        }
 
3006
 
 
3007
        /*
 
3008
         * Append one line with the details of this timeline split.
 
3009
         *
 
3010
         * If we did have a parent file, insert an extra newline just in case the
 
3011
         * parent file failed to end with one.
 
3012
         */
 
3013
        XLogFileName(xlogfname, endTLI, endLogId, endLogSeg);
 
3014
 
 
3015
        snprintf(buffer, sizeof(buffer),
 
3016
                         "%s%u\t%s\t%s transaction %u at %s\n",
 
3017
                         (srcfd < 0) ? "" : "\n",
 
3018
                         parentTLI,
 
3019
                         xlogfname,
 
3020
                         recoveryStopAfter ? "after" : "before",
 
3021
                         recoveryStopXid,
 
3022
                         str_time(recoveryStopTime));
 
3023
 
 
3024
        nbytes = strlen(buffer);
 
3025
        errno = 0;
 
3026
        if ((int) write(fd, buffer, nbytes) != nbytes)
 
3027
        {
 
3028
                int                     save_errno = errno;
 
3029
 
 
3030
                /*
 
3031
                 * If we fail to make the file, delete it to release disk space
 
3032
                 */
 
3033
                unlink(tmppath);
 
3034
                /* if write didn't set errno, assume problem is no disk space */
 
3035
                errno = save_errno ? save_errno : ENOSPC;
 
3036
 
 
3037
                ereport(PANIC,
 
3038
                                (errcode_for_file_access(),
 
3039
                                 errmsg("could not write to file \"%s\": %m", tmppath)));
 
3040
        }
 
3041
 
 
3042
        if (pg_fsync(fd) != 0)
 
3043
                ereport(PANIC,
 
3044
                                (errcode_for_file_access(),
 
3045
                                 errmsg("could not fsync file \"%s\": %m", tmppath)));
 
3046
 
 
3047
        if (close(fd))
 
3048
                ereport(PANIC,
 
3049
                                (errcode_for_file_access(),
 
3050
                                 errmsg("could not close file \"%s\": %m", tmppath)));
 
3051
 
 
3052
 
 
3053
        /*
 
3054
         * Now move the completed history file into place with its final name.
 
3055
         */
 
3056
        TLHistoryFilePath(path, newTLI);
 
3057
 
 
3058
        /*
 
3059
         * Prefer link() to rename() here just to be really sure that we don't
 
3060
         * overwrite an existing logfile.  However, there shouldn't be one, so
 
3061
         * rename() is an acceptable substitute except for the truly paranoid.
 
3062
         */
 
3063
#if HAVE_WORKING_LINK
 
3064
        if (link(tmppath, path) < 0)
 
3065
                ereport(PANIC,
 
3066
                                (errcode_for_file_access(),
 
3067
                                 errmsg("could not link file \"%s\" to \"%s\": %m",
 
3068
                                                tmppath, path)));
 
3069
        unlink(tmppath);
 
3070
#else
 
3071
        if (rename(tmppath, path) < 0)
 
3072
                ereport(PANIC,
 
3073
                                (errcode_for_file_access(),
 
3074
                                 errmsg("could not rename file \"%s\" to \"%s\": %m",
 
3075
                                                tmppath, path)));
 
3076
#endif
 
3077
 
 
3078
        /* The history file can be archived immediately. */
 
3079
        TLHistoryFileName(histfname, newTLI);
 
3080
        XLogArchiveNotify(histfname);
 
3081
}
 
3082
 
 
3083
/*
 
3084
 * I/O routines for pg_control
 
3085
 *
 
3086
 * *ControlFile is a buffer in shared memory that holds an image of the
 
3087
 * contents of pg_control.      WriteControlFile() initializes pg_control
 
3088
 * given a preloaded buffer, ReadControlFile() loads the buffer from
 
3089
 * the pg_control file (during postmaster or standalone-backend startup),
 
3090
 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
 
3091
 *
 
3092
 * For simplicity, WriteControlFile() initializes the fields of pg_control
 
3093
 * that are related to checking backend/database compatibility, and
 
3094
 * ReadControlFile() verifies they are correct.  We could split out the
 
3095
 * I/O and compatibility-check functions, but there seems no need currently.
 
3096
 */
 
3097
 
 
3098
void
 
3099
XLOGPathInit(void)
 
3100
{
 
3101
        /* Init XLOG file paths */
 
3102
        snprintf(XLogDir, MAXPGPATH, "%s/pg_xlog", DataDir);
 
3103
        snprintf(ControlFilePath, MAXPGPATH, "%s/global/pg_control", DataDir);
 
3104
}
 
3105
 
 
3106
static void
 
3107
WriteControlFile(void)
 
3108
{
 
3109
        int                     fd;
 
3110
        char            buffer[BLCKSZ]; /* need not be aligned */
 
3111
        char       *localeptr;
 
3112
 
 
3113
        /*
 
3114
         * Initialize version and compatibility-check fields
 
3115
         */
 
3116
        ControlFile->pg_control_version = PG_CONTROL_VERSION;
 
3117
        ControlFile->catalog_version_no = CATALOG_VERSION_NO;
 
3118
        ControlFile->blcksz = BLCKSZ;
 
3119
        ControlFile->relseg_size = RELSEG_SIZE;
 
3120
        ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
 
3121
 
 
3122
        ControlFile->nameDataLen = NAMEDATALEN;
 
3123
        ControlFile->funcMaxArgs = FUNC_MAX_ARGS;
 
3124
 
 
3125
#ifdef HAVE_INT64_TIMESTAMP
 
3126
        ControlFile->enableIntTimes = TRUE;
 
3127
#else
 
3128
        ControlFile->enableIntTimes = FALSE;
 
3129
#endif
 
3130
 
 
3131
        ControlFile->localeBuflen = LOCALE_NAME_BUFLEN;
 
3132
        localeptr = setlocale(LC_COLLATE, NULL);
 
3133
        if (!localeptr)
 
3134
                ereport(PANIC,
 
3135
                                (errmsg("invalid LC_COLLATE setting")));
 
3136
        StrNCpy(ControlFile->lc_collate, localeptr, LOCALE_NAME_BUFLEN);
 
3137
        localeptr = setlocale(LC_CTYPE, NULL);
 
3138
        if (!localeptr)
 
3139
                ereport(PANIC,
 
3140
                                (errmsg("invalid LC_CTYPE setting")));
 
3141
        StrNCpy(ControlFile->lc_ctype, localeptr, LOCALE_NAME_BUFLEN);
 
3142
 
 
3143
        /* Contents are protected with a CRC */
 
3144
        INIT_CRC64(ControlFile->crc);
 
3145
        COMP_CRC64(ControlFile->crc,
 
3146
                           (char *) ControlFile + sizeof(crc64),
 
3147
                           sizeof(ControlFileData) - sizeof(crc64));
 
3148
        FIN_CRC64(ControlFile->crc);
 
3149
 
 
3150
        /*
 
3151
         * We write out BLCKSZ bytes into pg_control, zero-padding the excess
 
3152
         * over sizeof(ControlFileData).  This reduces the odds of
 
3153
         * premature-EOF errors when reading pg_control.  We'll still fail
 
3154
         * when we check the contents of the file, but hopefully with a more
 
3155
         * specific error than "couldn't read pg_control".
 
3156
         */
 
3157
        if (sizeof(ControlFileData) > BLCKSZ)
 
3158
                ereport(PANIC,
 
3159
                                (errmsg("sizeof(ControlFileData) is larger than BLCKSZ; fix either one")));
 
3160
 
 
3161
        memset(buffer, 0, BLCKSZ);
 
3162
        memcpy(buffer, ControlFile, sizeof(ControlFileData));
 
3163
 
 
3164
        fd = BasicOpenFile(ControlFilePath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
 
3165
                                           S_IRUSR | S_IWUSR);
 
3166
        if (fd < 0)
 
3167
                ereport(PANIC,
 
3168
                                (errcode_for_file_access(),
 
3169
                                 errmsg("could not create control file \"%s\": %m",
 
3170
                                                ControlFilePath)));
 
3171
 
 
3172
        errno = 0;
 
3173
        if (write(fd, buffer, BLCKSZ) != BLCKSZ)
 
3174
        {
 
3175
                /* if write didn't set errno, assume problem is no disk space */
 
3176
                if (errno == 0)
 
3177
                        errno = ENOSPC;
 
3178
                ereport(PANIC,
 
3179
                                (errcode_for_file_access(),
 
3180
                                 errmsg("could not write to control file: %m")));
 
3181
        }
 
3182
 
 
3183
        if (pg_fsync(fd) != 0)
 
3184
                ereport(PANIC,
 
3185
                                (errcode_for_file_access(),
 
3186
                                 errmsg("could not fsync control file: %m")));
 
3187
 
 
3188
        if (close(fd))
 
3189
                ereport(PANIC,
 
3190
                                (errcode_for_file_access(),
 
3191
                                 errmsg("could not close control file: %m")));
 
3192
}
 
3193
 
 
3194
static void
 
3195
ReadControlFile(void)
 
3196
{
 
3197
        crc64           crc;
 
3198
        int                     fd;
 
3199
 
 
3200
        /*
 
3201
         * Read data...
 
3202
         */
 
3203
        fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
 
3204
        if (fd < 0)
 
3205
                ereport(PANIC,
 
3206
                                (errcode_for_file_access(),
 
3207
                                 errmsg("could not open control file \"%s\": %m",
 
3208
                                                ControlFilePath)));
 
3209
 
 
3210
        if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
 
3211
                ereport(PANIC,
 
3212
                                (errcode_for_file_access(),
 
3213
                                 errmsg("could not read from control file: %m")));
 
3214
 
 
3215
        close(fd);
 
3216
 
 
3217
        /*
 
3218
         * Check for expected pg_control format version.  If this is wrong,
 
3219
         * the CRC check will likely fail because we'll be checking the wrong
 
3220
         * number of bytes.  Complaining about wrong version will probably be
 
3221
         * more enlightening than complaining about wrong CRC.
 
3222
         */
 
3223
        if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
 
3224
                ereport(FATAL,
 
3225
                                (errmsg("database files are incompatible with server"),
 
3226
                                 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
 
3227
                          " but the server was compiled with PG_CONTROL_VERSION %d.",
 
3228
                                        ControlFile->pg_control_version, PG_CONTROL_VERSION),
 
3229
                                 errhint("It looks like you need to initdb.")));
 
3230
        /* Now check the CRC. */
 
3231
        INIT_CRC64(crc);
 
3232
        COMP_CRC64(crc,
 
3233
                           (char *) ControlFile + sizeof(crc64),
 
3234
                           sizeof(ControlFileData) - sizeof(crc64));
 
3235
        FIN_CRC64(crc);
 
3236
 
 
3237
        if (!EQ_CRC64(crc, ControlFile->crc))
 
3238
                ereport(FATAL,
 
3239
                                (errmsg("incorrect checksum in control file")));
 
3240
 
 
3241
        /*
 
3242
         * Do compatibility checking immediately.  We do this here for 2
 
3243
         * reasons:
 
3244
         *
 
3245
         * (1) if the database isn't compatible with the backend executable, we
 
3246
         * want to abort before we can possibly do any damage;
 
3247
         *
 
3248
         * (2) this code is executed in the postmaster, so the setlocale() will
 
3249
         * propagate to forked backends, which aren't going to read this file
 
3250
         * for themselves.      (These locale settings are considered critical
 
3251
         * compatibility items because they can affect sort order of indexes.)
 
3252
         */
 
3253
        if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
 
3254
                ereport(FATAL,
 
3255
                                (errmsg("database files are incompatible with server"),
 
3256
                                 errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
 
3257
                          " but the server was compiled with CATALOG_VERSION_NO %d.",
 
3258
                                        ControlFile->catalog_version_no, CATALOG_VERSION_NO),
 
3259
                                 errhint("It looks like you need to initdb.")));
 
3260
        if (ControlFile->blcksz != BLCKSZ)
 
3261
                ereport(FATAL,
 
3262
                                (errmsg("database files are incompatible with server"),
 
3263
                 errdetail("The database cluster was initialized with BLCKSZ %d,"
 
3264
                                   " but the server was compiled with BLCKSZ %d.",
 
3265
                                   ControlFile->blcksz, BLCKSZ),
 
3266
                         errhint("It looks like you need to recompile or initdb.")));
 
3267
        if (ControlFile->relseg_size != RELSEG_SIZE)
 
3268
                ereport(FATAL,
 
3269
                                (errmsg("database files are incompatible with server"),
 
3270
                                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
 
3271
                                         " but the server was compiled with RELSEG_SIZE %d.",
 
3272
                                                   ControlFile->relseg_size, RELSEG_SIZE),
 
3273
                         errhint("It looks like you need to recompile or initdb.")));
 
3274
        if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
 
3275
                ereport(FATAL,
 
3276
                                (errmsg("database files are incompatible with server"),
 
3277
                                 errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
 
3278
                                   " but the server was compiled with XLOG_SEG_SIZE %d.",
 
3279
                                                   ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
 
3280
                         errhint("It looks like you need to recompile or initdb.")));
 
3281
        if (ControlFile->nameDataLen != NAMEDATALEN)
 
3282
                ereport(FATAL,
 
3283
                                (errmsg("database files are incompatible with server"),
 
3284
                                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
 
3285
                                         " but the server was compiled with NAMEDATALEN %d.",
 
3286
                                                   ControlFile->nameDataLen, NAMEDATALEN),
 
3287
                         errhint("It looks like you need to recompile or initdb.")));
 
3288
        if (ControlFile->funcMaxArgs != FUNC_MAX_ARGS)
 
3289
                ereport(FATAL,
 
3290
                                (errmsg("database files are incompatible with server"),
 
3291
                                 errdetail("The database cluster was initialized with FUNC_MAX_ARGS %d,"
 
3292
                                   " but the server was compiled with FUNC_MAX_ARGS %d.",
 
3293
                                                   ControlFile->funcMaxArgs, FUNC_MAX_ARGS),
 
3294
                         errhint("It looks like you need to recompile or initdb.")));
 
3295
 
 
3296
#ifdef HAVE_INT64_TIMESTAMP
 
3297
        if (ControlFile->enableIntTimes != TRUE)
 
3298
                ereport(FATAL,
 
3299
                                (errmsg("database files are incompatible with server"),
 
3300
                                 errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
 
3301
                          " but the server was compiled with HAVE_INT64_TIMESTAMP."),
 
3302
                         errhint("It looks like you need to recompile or initdb.")));
 
3303
#else
 
3304
        if (ControlFile->enableIntTimes != FALSE)
 
3305
                ereport(FATAL,
 
3306
                                (errmsg("database files are incompatible with server"),
 
3307
                                 errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
 
3308
                   " but the server was compiled without HAVE_INT64_TIMESTAMP."),
 
3309
                         errhint("It looks like you need to recompile or initdb.")));
 
3310
#endif
 
3311
 
 
3312
        if (ControlFile->localeBuflen != LOCALE_NAME_BUFLEN)
 
3313
                ereport(FATAL,
 
3314
                                (errmsg("database files are incompatible with server"),
 
3315
                                 errdetail("The database cluster was initialized with LOCALE_NAME_BUFLEN %d,"
 
3316
                          " but the server was compiled with LOCALE_NAME_BUFLEN %d.",
 
3317
                                                   ControlFile->localeBuflen, LOCALE_NAME_BUFLEN),
 
3318
                         errhint("It looks like you need to recompile or initdb.")));
 
3319
        if (setlocale(LC_COLLATE, ControlFile->lc_collate) == NULL)
 
3320
                ereport(FATAL,
 
3321
                (errmsg("database files are incompatible with operating system"),
 
3322
                 errdetail("The database cluster was initialized with LC_COLLATE \"%s\","
 
3323
                                   " which is not recognized by setlocale().",
 
3324
                                   ControlFile->lc_collate),
 
3325
                 errhint("It looks like you need to initdb or install locale support.")));
 
3326
        if (setlocale(LC_CTYPE, ControlFile->lc_ctype) == NULL)
 
3327
                ereport(FATAL,
 
3328
                (errmsg("database files are incompatible with operating system"),
 
3329
                 errdetail("The database cluster was initialized with LC_CTYPE \"%s\","
 
3330
                                   " which is not recognized by setlocale().",
 
3331
                                   ControlFile->lc_ctype),
 
3332
                 errhint("It looks like you need to initdb or install locale support.")));
 
3333
 
 
3334
        /* Make the fixed locale settings visible as GUC variables, too */
 
3335
        SetConfigOption("lc_collate", ControlFile->lc_collate,
 
3336
                                        PGC_INTERNAL, PGC_S_OVERRIDE);
 
3337
        SetConfigOption("lc_ctype", ControlFile->lc_ctype,
 
3338
                                        PGC_INTERNAL, PGC_S_OVERRIDE);
 
3339
}
 
3340
 
 
3341
void
 
3342
UpdateControlFile(void)
 
3343
{
 
3344
        int                     fd;
 
3345
 
 
3346
        INIT_CRC64(ControlFile->crc);
 
3347
        COMP_CRC64(ControlFile->crc,
 
3348
                           (char *) ControlFile + sizeof(crc64),
 
3349
                           sizeof(ControlFileData) - sizeof(crc64));
 
3350
        FIN_CRC64(ControlFile->crc);
 
3351
 
 
3352
        fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
 
3353
        if (fd < 0)
 
3354
                ereport(PANIC,
 
3355
                                (errcode_for_file_access(),
 
3356
                                 errmsg("could not open control file \"%s\": %m",
 
3357
                                                ControlFilePath)));
 
3358
 
 
3359
        errno = 0;
 
3360
        if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
 
3361
        {
 
3362
                /* if write didn't set errno, assume problem is no disk space */
 
3363
                if (errno == 0)
 
3364
                        errno = ENOSPC;
 
3365
                ereport(PANIC,
 
3366
                                (errcode_for_file_access(),
 
3367
                                 errmsg("could not write to control file: %m")));
 
3368
        }
 
3369
 
 
3370
        if (pg_fsync(fd) != 0)
 
3371
                ereport(PANIC,
 
3372
                                (errcode_for_file_access(),
 
3373
                                 errmsg("could not fsync control file: %m")));
 
3374
 
 
3375
        if (close(fd))
 
3376
                ereport(PANIC,
 
3377
                                (errcode_for_file_access(),
 
3378
                                 errmsg("could not close control file: %m")));
 
3379
}
 
3380
 
 
3381
/*
 
3382
 * Initialization of shared memory for XLOG
 
3383
 */
 
3384
 
 
3385
int
 
3386
XLOGShmemSize(void)
 
3387
{
 
3388
        if (XLOGbuffers < MinXLOGbuffers)
 
3389
                XLOGbuffers = MinXLOGbuffers;
 
3390
 
 
3391
        return MAXALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers)
 
3392
                + BLCKSZ * XLOGbuffers +
 
3393
                MAXALIGN(sizeof(ControlFileData));
 
3394
}
 
3395
 
 
3396
void
 
3397
XLOGShmemInit(void)
 
3398
{
 
3399
        bool            foundXLog,
 
3400
                                foundCFile;
 
3401
 
 
3402
        /* this must agree with space requested by XLOGShmemSize() */
 
3403
        if (XLOGbuffers < MinXLOGbuffers)
 
3404
                XLOGbuffers = MinXLOGbuffers;
 
3405
 
 
3406
        XLogCtl = (XLogCtlData *)
 
3407
                ShmemInitStruct("XLOG Ctl",
 
3408
                                                MAXALIGN(sizeof(XLogCtlData) +
 
3409
                                                                 sizeof(XLogRecPtr) * XLOGbuffers)
 
3410
                                                + BLCKSZ * XLOGbuffers,
 
3411
                                                &foundXLog);
 
3412
        ControlFile = (ControlFileData *)
 
3413
                ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
 
3414
 
 
3415
        if (foundXLog || foundCFile)
 
3416
        {
 
3417
                /* both should be present or neither */
 
3418
                Assert(foundXLog && foundCFile);
 
3419
                return;
 
3420
        }
 
3421
 
 
3422
        memset(XLogCtl, 0, sizeof(XLogCtlData));
 
3423
 
 
3424
        /*
 
3425
         * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be
 
3426
         * a multiple of the alignment for same, so no extra alignment padding
 
3427
         * is needed here.
 
3428
         */
 
3429
        XLogCtl->xlblocks = (XLogRecPtr *)
 
3430
                (((char *) XLogCtl) + sizeof(XLogCtlData));
 
3431
        memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
 
3432
 
 
3433
        /*
 
3434
         * Here, on the other hand, we must MAXALIGN to ensure the page
 
3435
         * buffers have worst-case alignment.
 
3436
         */
 
3437
        XLogCtl->pages =
 
3438
                ((char *) XLogCtl) + MAXALIGN(sizeof(XLogCtlData) +
 
3439
                                                                          sizeof(XLogRecPtr) * XLOGbuffers);
 
3440
        memset(XLogCtl->pages, 0, BLCKSZ * XLOGbuffers);
 
3441
 
 
3442
        /*
 
3443
         * Do basic initialization of XLogCtl shared data. (StartupXLOG will
 
3444
         * fill in additional info.)
 
3445
         */
 
3446
        XLogCtl->XLogCacheByte = BLCKSZ * XLOGbuffers;
 
3447
        XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
 
3448
        XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
 
3449
        SpinLockInit(&XLogCtl->info_lck);
 
3450
 
 
3451
        /*
 
3452
         * If we are not in bootstrap mode, pg_control should already exist.
 
3453
         * Read and validate it immediately (see comments in ReadControlFile()
 
3454
         * for the reasons why).
 
3455
         */
 
3456
        if (!IsBootstrapProcessingMode())
 
3457
                ReadControlFile();
 
3458
}
 
3459
 
 
3460
/*
 
3461
 * This func must be called ONCE on system install.  It creates pg_control
 
3462
 * and the initial XLOG segment.
 
3463
 */
 
3464
void
 
3465
BootStrapXLOG(void)
 
3466
{
 
3467
        CheckPoint      checkPoint;
 
3468
        char       *buffer;
 
3469
        XLogPageHeader page;
 
3470
        XLogLongPageHeader longpage;
 
3471
        XLogRecord *record;
 
3472
        bool            use_existent;
 
3473
        uint64          sysidentifier;
 
3474
        struct timeval tv;
 
3475
        crc64           crc;
 
3476
 
 
3477
        /*
 
3478
         * Select a hopefully-unique system identifier code for this
 
3479
         * installation. We use the result of gettimeofday(), including the
 
3480
         * fractional seconds field, as being about as unique as we can easily
 
3481
         * get.  (Think not to use random(), since it hasn't been seeded and
 
3482
         * there's no portable way to seed it other than the system clock
 
3483
         * value...)  The upper half of the uint64 value is just the tv_sec
 
3484
         * part, while the lower half is the XOR of tv_sec and tv_usec.  This
 
3485
         * is to ensure that we don't lose uniqueness unnecessarily if
 
3486
         * "uint64" is really only 32 bits wide.  A person knowing this
 
3487
         * encoding can determine the initialization time of the installation,
 
3488
         * which could perhaps be useful sometimes.
 
3489
         */
 
3490
        gettimeofday(&tv, NULL);
 
3491
        sysidentifier = ((uint64) tv.tv_sec) << 32;
 
3492
        sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
 
3493
 
 
3494
        /* First timeline ID is always 1 */
 
3495
        ThisTimeLineID = 1;
 
3496
 
 
3497
        /* Use malloc() to ensure buffer is MAXALIGNED */
 
3498
        buffer = (char *) malloc(BLCKSZ);
 
3499
        page = (XLogPageHeader) buffer;
 
3500
        memset(buffer, 0, BLCKSZ);
 
3501
 
 
3502
        /* Set up information for the initial checkpoint record */
 
3503
        checkPoint.redo.xlogid = 0;
 
3504
        checkPoint.redo.xrecoff = SizeOfXLogLongPHD;
 
3505
        checkPoint.undo = checkPoint.redo;
 
3506
        checkPoint.ThisTimeLineID = ThisTimeLineID;
 
3507
        checkPoint.nextXid = FirstNormalTransactionId;
 
3508
        checkPoint.nextOid = BootstrapObjectIdData;
 
3509
        checkPoint.time = time(NULL);
 
3510
 
 
3511
        ShmemVariableCache->nextXid = checkPoint.nextXid;
 
3512
        ShmemVariableCache->nextOid = checkPoint.nextOid;
 
3513
        ShmemVariableCache->oidCount = 0;
 
3514
 
 
3515
        /* Set up the XLOG page header */
 
3516
        page->xlp_magic = XLOG_PAGE_MAGIC;
 
3517
        page->xlp_info = XLP_LONG_HEADER;
 
3518
        page->xlp_tli = ThisTimeLineID;
 
3519
        page->xlp_pageaddr.xlogid = 0;
 
3520
        page->xlp_pageaddr.xrecoff = 0;
 
3521
        longpage = (XLogLongPageHeader) page;
 
3522
        longpage->xlp_sysid = sysidentifier;
 
3523
        longpage->xlp_seg_size = XLogSegSize;
 
3524
 
 
3525
        /* Insert the initial checkpoint record */
 
3526
        record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
 
3527
        record->xl_prev.xlogid = 0;
 
3528
        record->xl_prev.xrecoff = 0;
 
3529
        record->xl_xid = InvalidTransactionId;
 
3530
        record->xl_len = sizeof(checkPoint);
 
3531
        record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
 
3532
        record->xl_rmid = RM_XLOG_ID;
 
3533
        memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
 
3534
 
 
3535
        INIT_CRC64(crc);
 
3536
        COMP_CRC64(crc, &checkPoint, sizeof(checkPoint));
 
3537
        COMP_CRC64(crc, (char *) record + sizeof(crc64),
 
3538
                           SizeOfXLogRecord - sizeof(crc64));
 
3539
        FIN_CRC64(crc);
 
3540
        record->xl_crc = crc;
 
3541
 
 
3542
        /* Create first XLOG segment file */
 
3543
        use_existent = false;
 
3544
        openLogFile = XLogFileInit(0, 0, &use_existent, false);
 
3545
 
 
3546
        /* Write the first page with the initial record */
 
3547
        errno = 0;
 
3548
        if (write(openLogFile, buffer, BLCKSZ) != BLCKSZ)
 
3549
        {
 
3550
                /* if write didn't set errno, assume problem is no disk space */
 
3551
                if (errno == 0)
 
3552
                        errno = ENOSPC;
 
3553
                ereport(PANIC,
 
3554
                                (errcode_for_file_access(),
 
3555
                  errmsg("could not write bootstrap transaction log file: %m")));
 
3556
        }
 
3557
 
 
3558
        if (pg_fsync(openLogFile) != 0)
 
3559
                ereport(PANIC,
 
3560
                                (errcode_for_file_access(),
 
3561
                  errmsg("could not fsync bootstrap transaction log file: %m")));
 
3562
 
 
3563
        if (close(openLogFile))
 
3564
                ereport(PANIC,
 
3565
                                (errcode_for_file_access(),
 
3566
                  errmsg("could not close bootstrap transaction log file: %m")));
 
3567
 
 
3568
        openLogFile = -1;
 
3569
 
 
3570
        /* Now create pg_control */
 
3571
 
 
3572
        memset(ControlFile, 0, sizeof(ControlFileData));
 
3573
        /* Initialize pg_control status fields */
 
3574
        ControlFile->system_identifier = sysidentifier;
 
3575
        ControlFile->state = DB_SHUTDOWNED;
 
3576
        ControlFile->time = checkPoint.time;
 
3577
        ControlFile->logId = 0;
 
3578
        ControlFile->logSeg = 1;
 
3579
        ControlFile->checkPoint = checkPoint.redo;
 
3580
        ControlFile->checkPointCopy = checkPoint;
 
3581
        /* some additional ControlFile fields are set in WriteControlFile() */
 
3582
 
 
3583
        WriteControlFile();
 
3584
 
 
3585
        /* Bootstrap the commit log, too */
 
3586
        BootStrapCLOG();
 
3587
        BootStrapSUBTRANS();
 
3588
}
 
3589
 
 
3590
/*
 * Format a time_t as "YYYY-MM-DD HH:MM:SS TZ" in the local time zone.
 *
 * The result lives in a static buffer that is overwritten by the next
 * call, so callers must consume it before calling str_time() again.
 *
 * If the value cannot be broken down by localtime() (it returns NULL for
 * out-of-range input) or cannot be formatted, the placeholder string
 * "(invalid time)" is returned instead of passing NULL to strftime() or
 * returning a buffer with unspecified contents.
 */
static char *
str_time(time_t tnow)
{
	static char buf[128];
	struct tm  *tm = localtime(&tnow);

	if (tm == NULL ||
		strftime(buf, sizeof(buf),
				 "%Y-%m-%d %H:%M:%S %Z",
				 tm) == 0)
		snprintf(buf, sizeof(buf), "(invalid time)");

	return buf;
}
 
3601
 
 
3602
/*
 
3603
 * See if there is a recovery command file (recovery.conf), and if so
 
3604
 * read in parameters for archive recovery.
 
3605
 *
 
3606
 * XXX longer term intention is to expand this to
 
3607
 * cater for additional parameters and controls
 
3608
 * possibly use a flex lexer similar to the GUC one
 
3609
 */
 
3610
static void
 
3611
readRecoveryCommandFile(void)
 
3612
{
 
3613
        char            recoveryCommandFile[MAXPGPATH];
 
3614
        FILE       *fd;
 
3615
        char            cmdline[MAXPGPATH];
 
3616
        TimeLineID      rtli = 0;
 
3617
        bool            rtliGiven = false;
 
3618
        bool            syntaxError = false;
 
3619
 
 
3620
        snprintf(recoveryCommandFile, MAXPGPATH, "%s/recovery.conf", DataDir);
 
3621
        fd = AllocateFile(recoveryCommandFile, "r");
 
3622
        if (fd == NULL)
 
3623
        {
 
3624
                if (errno == ENOENT)
 
3625
                        return;                         /* not there, so no archive recovery */
 
3626
                ereport(FATAL,
 
3627
                                (errcode_for_file_access(),
 
3628
                                 errmsg("could not open recovery command file \"%s\": %m",
 
3629
                                                recoveryCommandFile)));
 
3630
        }
 
3631
 
 
3632
        ereport(LOG,
 
3633
                        (errmsg("starting archive recovery")));
 
3634
 
 
3635
        /*
 
3636
         * Parse the file...
 
3637
         */
 
3638
        while (fgets(cmdline, MAXPGPATH, fd) != NULL)
 
3639
        {
 
3640
                /* skip leading whitespace and check for # comment */
 
3641
                char       *ptr;
 
3642
                char       *tok1;
 
3643
                char       *tok2;
 
3644
 
 
3645
                for (ptr = cmdline; *ptr; ptr++)
 
3646
                {
 
3647
                        if (!isspace((unsigned char) *ptr))
 
3648
                                break;
 
3649
                }
 
3650
                if (*ptr == '\0' || *ptr == '#')
 
3651
                        continue;
 
3652
 
 
3653
                /* identify the quoted parameter value */
 
3654
                tok1 = strtok(ptr, "'");
 
3655
                if (!tok1)
 
3656
                {
 
3657
                        syntaxError = true;
 
3658
                        break;
 
3659
                }
 
3660
                tok2 = strtok(NULL, "'");
 
3661
                if (!tok2)
 
3662
                {
 
3663
                        syntaxError = true;
 
3664
                        break;
 
3665
                }
 
3666
                /* reparse to get just the parameter name */
 
3667
                tok1 = strtok(ptr, " \t=");
 
3668
                if (!tok1)
 
3669
                {
 
3670
                        syntaxError = true;
 
3671
                        break;
 
3672
                }
 
3673
 
 
3674
                if (strcmp(tok1, "restore_command") == 0)
 
3675
                {
 
3676
                        recoveryRestoreCommand = pstrdup(tok2);
 
3677
                        ereport(LOG,
 
3678
                                        (errmsg("restore_command = \"%s\"",
 
3679
                                                        recoveryRestoreCommand)));
 
3680
                }
 
3681
                else if (strcmp(tok1, "recovery_target_timeline") == 0)
 
3682
                {
 
3683
                        rtliGiven = true;
 
3684
                        if (strcmp(tok2, "latest") == 0)
 
3685
                                rtli = 0;
 
3686
                        else
 
3687
                        {
 
3688
                                errno = 0;
 
3689
                                rtli = (TimeLineID) strtoul(tok2, NULL, 0);
 
3690
                                if (errno == EINVAL || errno == ERANGE)
 
3691
                                        ereport(FATAL,
 
3692
                                                        (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
 
3693
                                                                        tok2)));
 
3694
                        }
 
3695
                        if (rtli)
 
3696
                                ereport(LOG,
 
3697
                                                (errmsg("recovery_target_timeline = %u", rtli)));
 
3698
                        else
 
3699
                                ereport(LOG,
 
3700
                                                (errmsg("recovery_target_timeline = latest")));
 
3701
                }
 
3702
                else if (strcmp(tok1, "recovery_target_xid") == 0)
 
3703
                {
 
3704
                        errno = 0;
 
3705
                        recoveryTargetXid = (TransactionId) strtoul(tok2, NULL, 0);
 
3706
                        if (errno == EINVAL || errno == ERANGE)
 
3707
                                ereport(FATAL,
 
3708
                                                (errmsg("recovery_target_xid is not a valid number: \"%s\"",
 
3709
                                                                tok2)));
 
3710
                        ereport(LOG,
 
3711
                                        (errmsg("recovery_target_xid = %u",
 
3712
                                                        recoveryTargetXid)));
 
3713
                        recoveryTarget = true;
 
3714
                        recoveryTargetExact = true;
 
3715
                }
 
3716
                else if (strcmp(tok1, "recovery_target_time") == 0)
 
3717
                {
 
3718
                        /*
 
3719
                         * if recovery_target_xid specified, then this overrides
 
3720
                         * recovery_target_time
 
3721
                         */
 
3722
                        if (recoveryTargetExact)
 
3723
                                continue;
 
3724
                        recoveryTarget = true;
 
3725
                        recoveryTargetExact = false;
 
3726
 
 
3727
                        /*
 
3728
                         * Convert the time string given by the user to the time_t
 
3729
                         * format. We use type abstime's input converter because we
 
3730
                         * know abstime has the same representation as time_t.
 
3731
                         */
 
3732
                        recoveryTargetTime = (time_t)
 
3733
                                DatumGetAbsoluteTime(DirectFunctionCall1(abstimein,
 
3734
                                                                                                 CStringGetDatum(tok2)));
 
3735
                        ereport(LOG,
 
3736
                                        (errmsg("recovery_target_time = %s",
 
3737
                                                  DatumGetCString(DirectFunctionCall1(abstimeout,
 
3738
                        AbsoluteTimeGetDatum((AbsoluteTime) recoveryTargetTime))))));
 
3739
                }
 
3740
                else if (strcmp(tok1, "recovery_target_inclusive") == 0)
 
3741
                {
 
3742
                        /*
 
3743
                         * does nothing if a recovery_target is not also set
 
3744
                         */
 
3745
                        if (strcmp(tok2, "true") == 0)
 
3746
                                recoveryTargetInclusive = true;
 
3747
                        else
 
3748
                        {
 
3749
                                recoveryTargetInclusive = false;
 
3750
                                tok2 = "false";
 
3751
                        }
 
3752
                        ereport(LOG,
 
3753
                                        (errmsg("recovery_target_inclusive = %s", tok2)));
 
3754
                }
 
3755
                else
 
3756
                        ereport(FATAL,
 
3757
                                        (errmsg("unrecognized recovery parameter \"%s\"",
 
3758
                                                        tok1)));
 
3759
        }
 
3760
 
 
3761
        FreeFile(fd);
 
3762
 
 
3763
        if (syntaxError)
 
3764
                ereport(FATAL,
 
3765
                                (errmsg("syntax error in recovery command file: %s",
 
3766
                                                cmdline),
 
3767
                  errhint("Lines should have the format parameter = 'value'.")));
 
3768
 
 
3769
        /* Check that required parameters were supplied */
 
3770
        if (recoveryRestoreCommand == NULL)
 
3771
                ereport(FATAL,
 
3772
                                (errmsg("recovery command file \"%s\" did not specify restore_command",
 
3773
                                                recoveryCommandFile)));
 
3774
 
 
3775
        /* Enable fetching from archive recovery area */
 
3776
        InArchiveRecovery = true;
 
3777
 
 
3778
        /*
 
3779
         * If user specified recovery_target_timeline, validate it or compute
 
3780
         * the "latest" value.  We can't do this until after we've gotten the
 
3781
         * restore command and set InArchiveRecovery, because we need to fetch
 
3782
         * timeline history files from the archive.
 
3783
         */
 
3784
        if (rtliGiven)
 
3785
        {
 
3786
                if (rtli)
 
3787
                {
 
3788
                        /* Timeline 1 does not have a history file, all else should */
 
3789
                        if (rtli != 1 && !existsTimeLineHistory(rtli))
 
3790
                                ereport(FATAL,
 
3791
                                        (errmsg("recovery_target_timeline %u does not exist",
 
3792
                                                        rtli)));
 
3793
                        recoveryTargetTLI = rtli;
 
3794
                }
 
3795
                else
 
3796
                {
 
3797
                        /* We start the "latest" search from pg_control's timeline */
 
3798
                        recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
 
3799
                }
 
3800
        }
 
3801
}
 
3802
 
 
3803
/*
 
3804
 * Exit archive-recovery state
 
3805
 */
 
3806
static void
 
3807
exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
 
3808
{
 
3809
        char            recoveryPath[MAXPGPATH];
 
3810
        char            xlogpath[MAXPGPATH];
 
3811
        char            recoveryCommandFile[MAXPGPATH];
 
3812
        char            recoveryCommandDone[MAXPGPATH];
 
3813
 
 
3814
        /*
 
3815
         * We are no longer in archive recovery state.
 
3816
         */
 
3817
        InArchiveRecovery = false;
 
3818
 
 
3819
        /*
 
3820
         * We should have the ending log segment currently open.  Verify, and
 
3821
         * then close it (to avoid problems on Windows with trying to rename
 
3822
         * or delete an open file).
 
3823
         */
 
3824
        Assert(readFile >= 0);
 
3825
        Assert(readId == endLogId);
 
3826
        Assert(readSeg == endLogSeg);
 
3827
 
 
3828
        close(readFile);
 
3829
        readFile = -1;
 
3830
 
 
3831
        /*
 
3832
         * If the segment was fetched from archival storage, we want to
 
3833
         * replace the existing xlog segment (if any) with the archival
 
3834
         * version.  This is because whatever is in XLogDir is very possibly
 
3835
         * older than what we have from the archives, since it could have come
 
3836
         * from restoring a PGDATA backup.      In any case, the archival version
 
3837
         * certainly is more descriptive of what our current database state
 
3838
         * is, because that is what we replayed from.
 
3839
         *
 
3840
         * Note that if we are establishing a new timeline, ThisTimeLineID is
 
3841
         * already set to the new value, and so we will create a new file
 
3842
         * instead of overwriting any existing file.
 
3843
         */
 
3844
        snprintf(recoveryPath, MAXPGPATH, "%s/RECOVERYXLOG", XLogDir);
 
3845
        XLogFilePath(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
 
3846
 
 
3847
        if (restoredFromArchive)
 
3848
        {
 
3849
                ereport(DEBUG3,
 
3850
                                (errmsg_internal("moving last restored xlog to \"%s\"",
 
3851
                                                                 xlogpath)));
 
3852
                unlink(xlogpath);               /* might or might not exist */
 
3853
                if (rename(recoveryPath, xlogpath) != 0)
 
3854
                        ereport(FATAL,
 
3855
                                        (errcode_for_file_access(),
 
3856
                                         errmsg("could not rename file \"%s\" to \"%s\": %m",
 
3857
                                                        recoveryPath, xlogpath)));
 
3858
                /* XXX might we need to fix permissions on the file? */
 
3859
        }
 
3860
        else
 
3861
        {
 
3862
                /*
 
3863
                 * If the latest segment is not archival, but there's still a
 
3864
                 * RECOVERYXLOG laying about, get rid of it.
 
3865
                 */
 
3866
                unlink(recoveryPath);   /* ignore any error */
 
3867
 
 
3868
                /*
 
3869
                 * If we are establishing a new timeline, we have to copy data
 
3870
                 * from the last WAL segment of the old timeline to create a
 
3871
                 * starting WAL segment for the new timeline.
 
3872
                 */
 
3873
                if (endTLI != ThisTimeLineID)
 
3874
                        XLogFileCopy(endLogId, endLogSeg,
 
3875
                                                 endTLI, endLogId, endLogSeg);
 
3876
        }
 
3877
 
 
3878
        /*
 
3879
         * Let's just make real sure there are not .ready or .done flags
 
3880
         * posted for the new segment.
 
3881
         */
 
3882
        XLogFileName(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
 
3883
        XLogArchiveCleanup(xlogpath);
 
3884
 
 
3885
        /* Get rid of any remaining recovered timeline-history file, too */
 
3886
        snprintf(recoveryPath, MAXPGPATH, "%s/RECOVERYHISTORY", XLogDir);
 
3887
        unlink(recoveryPath);           /* ignore any error */
 
3888
 
 
3889
        /*
 
3890
         * Rename the config file out of the way, so that we don't
 
3891
         * accidentally re-enter archive recovery mode in a subsequent crash.
 
3892
         */
 
3893
        snprintf(recoveryCommandFile, MAXPGPATH, "%s/recovery.conf", DataDir);
 
3894
        snprintf(recoveryCommandDone, MAXPGPATH, "%s/recovery.done", DataDir);
 
3895
        unlink(recoveryCommandDone);
 
3896
        if (rename(recoveryCommandFile, recoveryCommandDone) != 0)
 
3897
                ereport(FATAL,
 
3898
                                (errcode_for_file_access(),
 
3899
                                 errmsg("could not rename file \"%s\" to \"%s\": %m",
 
3900
                                                recoveryCommandFile, recoveryCommandDone)));
 
3901
 
 
3902
        ereport(LOG,
 
3903
                        (errmsg("archive recovery complete")));
 
3904
}
 
3905
 
 
3906
/*
 
3907
 * For point-in-time recovery, this function decides whether we want to
 
3908
 * stop applying the XLOG at or after the current record.
 
3909
 *
 
3910
 * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
 
3911
 * *includeThis is set TRUE if we should apply this record before stopping.
 
3912
 * Also, some information is saved in recoveryStopXid et al for use in
 
3913
 * annotating the new timeline's history file.
 
3914
 */
 
3915
static bool
 
3916
recoveryStopsHere(XLogRecord *record, bool *includeThis)
 
3917
{
 
3918
        bool            stopsHere;
 
3919
        uint8           record_info;
 
3920
        time_t          recordXtime;
 
3921
 
 
3922
        /* Do we have a PITR target at all? */
 
3923
        if (!recoveryTarget)
 
3924
                return false;
 
3925
 
 
3926
        /* We only consider stopping at COMMIT or ABORT records */
 
3927
        if (record->xl_rmid != RM_XACT_ID)
 
3928
                return false;
 
3929
        record_info = record->xl_info & ~XLR_INFO_MASK;
 
3930
        if (record_info == XLOG_XACT_COMMIT)
 
3931
        {
 
3932
                xl_xact_commit *recordXactCommitData;
 
3933
 
 
3934
                recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
 
3935
                recordXtime = recordXactCommitData->xtime;
 
3936
        }
 
3937
        else if (record_info == XLOG_XACT_ABORT)
 
3938
        {
 
3939
                xl_xact_abort *recordXactAbortData;
 
3940
 
 
3941
                recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
 
3942
                recordXtime = recordXactAbortData->xtime;
 
3943
        }
 
3944
        else
 
3945
                return false;
 
3946
 
 
3947
        if (recoveryTargetExact)
 
3948
        {
 
3949
                /*
 
3950
                 * there can be only one transaction end record with this exact
 
3951
                 * transactionid
 
3952
                 *
 
3953
                 * when testing for an xid, we MUST test for equality only, since
 
3954
                 * transactions are numbered in the order they start, not the
 
3955
                 * order they complete. A higher numbered xid will complete before
 
3956
                 * you about 50% of the time...
 
3957
                 */
 
3958
                stopsHere = (record->xl_xid == recoveryTargetXid);
 
3959
                if (stopsHere)
 
3960
                        *includeThis = recoveryTargetInclusive;
 
3961
        }
 
3962
        else
 
3963
        {
 
3964
                /*
 
3965
                 * there can be many transactions that share the same commit time,
 
3966
                 * so we stop after the last one, if we are inclusive, or stop at
 
3967
                 * the first one if we are exclusive
 
3968
                 */
 
3969
                if (recoveryTargetInclusive)
 
3970
                        stopsHere = (recordXtime > recoveryTargetTime);
 
3971
                else
 
3972
                        stopsHere = (recordXtime >= recoveryTargetTime);
 
3973
                if (stopsHere)
 
3974
                        *includeThis = false;
 
3975
        }
 
3976
 
 
3977
        if (stopsHere)
 
3978
        {
 
3979
                recoveryStopXid = record->xl_xid;
 
3980
                recoveryStopTime = recordXtime;
 
3981
                recoveryStopAfter = *includeThis;
 
3982
 
 
3983
                if (record_info == XLOG_XACT_COMMIT)
 
3984
                {
 
3985
                        if (recoveryStopAfter)
 
3986
                                ereport(LOG,
 
3987
                                                (errmsg("recovery stopping after commit of transaction %u, time %s",
 
3988
                                                  recoveryStopXid, str_time(recoveryStopTime))));
 
3989
                        else
 
3990
                                ereport(LOG,
 
3991
                                                (errmsg("recovery stopping before commit of transaction %u, time %s",
 
3992
                                                  recoveryStopXid, str_time(recoveryStopTime))));
 
3993
                }
 
3994
                else
 
3995
                {
 
3996
                        if (recoveryStopAfter)
 
3997
                                ereport(LOG,
 
3998
                                                (errmsg("recovery stopping after abort of transaction %u, time %s",
 
3999
                                                  recoveryStopXid, str_time(recoveryStopTime))));
 
4000
                        else
 
4001
                                ereport(LOG,
 
4002
                                                (errmsg("recovery stopping before abort of transaction %u, time %s",
 
4003
                                                  recoveryStopXid, str_time(recoveryStopTime))));
 
4004
                }
 
4005
        }
 
4006
 
 
4007
        return stopsHere;
 
4008
}
 
4009
 
 
4010
/*
 * StartupXLOG -- bring up the transaction log at system start.
 *
 * This must be called ONCE during postmaster or standalone-backend startup.
 *
 * In order, the function:
 *  - reads pg_control, sanity-checks it, and logs the state it records;
 *  - calls readRecoveryCommandFile() and builds the expected-timeline list;
 *  - picks the starting checkpoint: the one named by backup_label if
 *    read_backup_label() succeeds, else pg_control's primary checkpoint,
 *    else its previous checkpoint (PANIC if none can be read);
 *  - decides whether recovery is needed and, if so, replays records
 *    through each resource manager's rm_redo until ReadRecord() returns
 *    NULL or recoveryStopsHere() reports that the target was reached;
 *  - re-reads the last record to fix EndOfLog, selects a new timeline ID
 *    if replay stopped at a recovery target, and sets up the shared
 *    insert/write state so that new WAL starts at EndOfLog;
 *  - after a recovery, runs the rm_cleanup hooks and writes a shutdown
 *    checkpoint;
 *  - marks pg_control DB_IN_PRODUCTION, starts CLOG and SUBTRANS, and
 *    releases the file descriptor and buffers used for reading WAL.
 *
 * CritSectionCount is held incremented across nearly the whole function.
 */
 
4013
void
 
4014
StartupXLOG(void)
 
4015
{
 
4016
        XLogCtlInsert *Insert;
 
4017
        CheckPoint      checkPoint;
 
4018
        bool            wasShutdown;
 
4019
        bool            needNewTimeLine = false;
 
4020
        XLogRecPtr      RecPtr,
 
4021
                                LastRec,
 
4022
                                checkPointLoc,
 
4023
                                EndOfLog;
 
4024
        uint32          endLogId;
 
4025
        uint32          endLogSeg;
 
4026
        XLogRecord *record;
 
4027
        uint32          freespace;
 
4028
 
 
4029
        /*
         * Raised here and lowered again just before the read buffers are
         * released at the bottom of the function.
         * NOTE(review): presumably this makes any ERROR raised during startup
         * fatal -- confirm against the elog machinery.
         */
        CritSectionCount++;
 
4030
 
 
4031
        /*
 
4032
         * Read control file and check XLOG status looks valid.
 
4033
         *
 
4034
         * Note: in most control paths, *ControlFile is already valid and we need
 
4035
         * not do ReadControlFile() here, but might as well do it to be sure.
 
4036
         */
 
4037
        ReadControlFile();
 
4038
 
 
4039
        /*
         * Reject a control file whose segment number is zero, whose state lies
         * outside DB_SHUTDOWNED..DB_IN_PRODUCTION, or whose checkpoint offset
         * is not valid.
         */
        if (ControlFile->logSeg == 0 ||
 
4040
                ControlFile->state < DB_SHUTDOWNED ||
 
4041
                ControlFile->state > DB_IN_PRODUCTION ||
 
4042
                !XRecOffIsValid(ControlFile->checkPoint.xrecoff))
 
4043
                ereport(FATAL,
 
4044
                                (errmsg("control file contains invalid data")));
 
4045
 
 
4046
        /* Report the state in which the previous incarnation left the system */
        if (ControlFile->state == DB_SHUTDOWNED)
 
4047
                ereport(LOG,
 
4048
                                (errmsg("database system was shut down at %s",
 
4049
                                                str_time(ControlFile->time))));
 
4050
        else if (ControlFile->state == DB_SHUTDOWNING)
 
4051
                ereport(LOG,
 
4052
                                (errmsg("database system shutdown was interrupted at %s",
 
4053
                                                str_time(ControlFile->time))));
 
4054
        else if (ControlFile->state == DB_IN_RECOVERY)
 
4055
                ereport(LOG,
 
4056
                (errmsg("database system was interrupted while in recovery at %s",
 
4057
                                str_time(ControlFile->time)),
 
4058
                 errhint("This probably means that some data is corrupted and"
 
4059
                                 " you will have to use the last backup for recovery.")));
 
4060
        else if (ControlFile->state == DB_IN_PRODUCTION)
 
4061
                ereport(LOG,
 
4062
                                (errmsg("database system was interrupted at %s",
 
4063
                                                str_time(ControlFile->time))));
 
4064
 
 
4065
        /* This is just to allow attaching to startup process with a debugger */
 
4066
#ifdef XLOG_REPLAY_DELAY
 
4067
        if (ControlFile->state != DB_SHUTDOWNED)
 
4068
                pg_usleep(60000000L);
 
4069
#endif
 
4070
 
 
4071
        /*
 
4072
         * Initialize on the assumption we want to recover to the same
 
4073
         * timeline that's active according to pg_control.
 
4074
         */
 
4075
        recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
 
4076
 
 
4077
        /*
 
4078
         * Check for recovery control file, and if so set up state for offline
 
4079
         * recovery
 
4080
         */
 
4081
        readRecoveryCommandFile();
 
4082
 
 
4083
        /* Now we can determine the list of expected TLIs */
 
4084
        expectedTLIs = readTimeLineHistory(recoveryTargetTLI);
 
4085
 
 
4086
        /*
 
4087
         * If pg_control's timeline is not in expectedTLIs, then we cannot
 
4088
         * proceed: the backup is not part of the history of the requested
 
4089
         * timeline.
 
4090
         */
 
4091
        if (!list_member_int(expectedTLIs,
 
4092
                                           (int) ControlFile->checkPointCopy.ThisTimeLineID))
 
4093
                ereport(FATAL,
 
4094
                                (errmsg("requested timeline %u is not a child of database system timeline %u",
 
4095
                                                recoveryTargetTLI,
 
4096
                                                ControlFile->checkPointCopy.ThisTimeLineID)));
 
4097
 
 
4098
        if (read_backup_label(&checkPointLoc))
 
4099
        {
 
4100
                /*
 
4101
                 * When a backup_label file is present, we want to roll forward
 
4102
                 * from the checkpoint it identifies, rather than using
 
4103
                 * pg_control.
 
4104
                 */
 
4105
                record = ReadCheckpointRecord(checkPointLoc, 0);
 
4106
                if (record != NULL)
 
4107
                {
 
4108
                        ereport(LOG,
 
4109
                                        (errmsg("checkpoint record is at %X/%X",
 
4110
                                                  checkPointLoc.xlogid, checkPointLoc.xrecoff)));
 
4111
                        InRecovery = true;      /* force recovery even if SHUTDOWNED */
 
4112
                }
 
4113
                else
 
4114
                {
 
4115
                        ereport(PANIC,
 
4116
                                  (errmsg("could not locate required checkpoint record"),
 
4117
                                   errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
 
4118
                }
 
4119
        }
 
4120
        else
 
4121
        {
 
4122
                /*
 
4123
                 * Get the last valid checkpoint record.  If the latest one
 
4124
                 * according to pg_control is broken, try the next-to-last one.
 
4125
                 */
 
4126
                checkPointLoc = ControlFile->checkPoint;
 
4127
                record = ReadCheckpointRecord(checkPointLoc, 1);
 
4128
                if (record != NULL)
 
4129
                {
 
4130
                        ereport(LOG,
 
4131
                                        (errmsg("checkpoint record is at %X/%X",
 
4132
                                                  checkPointLoc.xlogid, checkPointLoc.xrecoff)));
 
4133
                }
 
4134
                else
 
4135
                {
 
4136
                        checkPointLoc = ControlFile->prevCheckPoint;
 
4137
                        record = ReadCheckpointRecord(checkPointLoc, 2);
 
4138
                        if (record != NULL)
 
4139
                        {
 
4140
                                ereport(LOG,
 
4141
                                         (errmsg("using previous checkpoint record at %X/%X",
 
4142
                                                  checkPointLoc.xlogid, checkPointLoc.xrecoff)));
 
4143
                                InRecovery = true;              /* force recovery even if
 
4144
                                                                                 * SHUTDOWNED */
 
4145
                        }
 
4146
                        else
 
4147
                                ereport(PANIC,
 
4148
                                 (errmsg("could not locate a valid checkpoint record")));
 
4149
                }
 
4150
        }
 
4151
 
 
4152
        /*
         * Start both pointers at the chosen checkpoint and take a local copy
         * of its payload, since "record" is reassigned by later reads.
         */
        LastRec = RecPtr = checkPointLoc;
 
4153
        memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
 
4154
        wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
 
4155
 
 
4156
        ereport(LOG,
 
4157
                        (errmsg("redo record is at %X/%X; undo record is at %X/%X; shutdown %s",
 
4158
                                        checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
 
4159
                                        checkPoint.undo.xlogid, checkPoint.undo.xrecoff,
 
4160
                                        wasShutdown ? "TRUE" : "FALSE")));
 
4161
        ereport(LOG,
 
4162
                        (errmsg("next transaction ID: %u; next OID: %u",
 
4163
                                        checkPoint.nextXid, checkPoint.nextOid)));
 
4164
        if (!TransactionIdIsNormal(checkPoint.nextXid))
 
4165
                ereport(PANIC,
 
4166
                                (errmsg("invalid next transaction ID")));
 
4167
 
 
4168
        /* Seed the shared XID/OID counters from the checkpoint */
        ShmemVariableCache->nextXid = checkPoint.nextXid;
 
4169
        ShmemVariableCache->nextOid = checkPoint.nextOid;
 
4170
        ShmemVariableCache->oidCount = 0;
 
4171
 
 
4172
        /*
 
4173
         * We must replay WAL entries using the same TimeLineID they were
 
4174
         * created under, so temporarily adopt the TLI indicated by the
 
4175
         * checkpoint (see also xlog_redo()).
 
4176
         */
 
4177
        ThisTimeLineID = checkPoint.ThisTimeLineID;
 
4178
 
 
4179
        RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
 
4180
 
 
4181
        /* The redo pointer may not lie beyond the checkpoint record itself */
        if (XLByteLT(RecPtr, checkPoint.redo))
 
4182
                ereport(PANIC,
 
4183
                                (errmsg("invalid redo in checkpoint record")));
 
4184
        /*
         * A zero undo offset is replaced by the checkpoint's own location, so
         * that it does not compare as earlier than RecPtr in the test below
         * (assuming XLByteLT is a strict less-than -- TODO confirm).
         */
        if (checkPoint.undo.xrecoff == 0)
 
4185
                checkPoint.undo = RecPtr;
 
4186
 
 
4187
        /*
 
4188
         * Check whether we need to force recovery from WAL.  If it appears to
 
4189
         * have been a clean shutdown and we did not have a recovery.conf
 
4190
         * file, then assume no recovery needed.
 
4191
         */
 
4192
        if (XLByteLT(checkPoint.undo, RecPtr) ||
 
4193
                XLByteLT(checkPoint.redo, RecPtr))
 
4194
        {
 
4195
                if (wasShutdown)
 
4196
                        ereport(PANIC,
 
4197
                        (errmsg("invalid redo/undo record in shutdown checkpoint")));
 
4198
                InRecovery = true;
 
4199
        }
 
4200
        else if (ControlFile->state != DB_SHUTDOWNED)
 
4201
                InRecovery = true;
 
4202
        else if (InArchiveRecovery)
 
4203
        {
 
4204
                /* force recovery due to presence of recovery.conf */
 
4205
                InRecovery = true;
 
4206
        }
 
4207
 
 
4208
        /* REDO */
 
4209
        if (InRecovery)
 
4210
        {
 
4211
                int                     rmid;
 
4212
 
 
4213
                if (InArchiveRecovery)
 
4214
                        ereport(LOG,
 
4215
                                        (errmsg("automatic recovery in progress")));
 
4216
                else
 
4217
                        ereport(LOG,
 
4218
                                        (errmsg("database system was not properly shut down; "
 
4219
                                                        "automatic recovery in progress")));
 
4220
                /* Record in pg_control that recovery is under way before replaying */
                ControlFile->state = DB_IN_RECOVERY;
 
4221
                ControlFile->time = time(NULL);
 
4222
                UpdateControlFile();
 
4223
 
 
4224
                /* Start up the recovery environment */
 
4225
                XLogInitRelationCache();
 
4226
 
 
4227
                /* Give every resource manager that has a startup hook a chance to run it */
                for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
 
4228
                {
 
4229
                        if (RmgrTable[rmid].rm_startup != NULL)
 
4230
                                RmgrTable[rmid].rm_startup();
 
4231
                }
 
4232
 
 
4233
                /*
 
4234
                 * Find the first record that logically follows the checkpoint ---
 
4235
                 * it might physically precede it, though.
 
4236
                 */
 
4237
                if (XLByteLT(checkPoint.redo, RecPtr))
 
4238
                {
 
4239
                        /* back up to find the record */
 
4240
                        record = ReadRecord(&(checkPoint.redo), PANIC);
 
4241
                }
 
4242
                else
 
4243
                {
 
4244
                        /* just have to read next record after CheckPoint */
 
4245
                        record = ReadRecord(NULL, LOG);
 
4246
                }
 
4247
 
 
4248
                if (record != NULL)
 
4249
                {
 
4250
                        bool            recoveryContinue = true;
 
4251
                        bool            recoveryApply = true;
 
4252
 
 
4253
                        InRedo = true;
 
4254
                        ereport(LOG,
 
4255
                                        (errmsg("redo starts at %X/%X",
 
4256
                                                        ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
 
4257
 
 
4258
                        /*
 
4259
                         * main redo apply loop
 
4260
                         */
 
4261
                        do
 
4262
                        {
 
4263
#ifdef WAL_DEBUG
 
4264
                                if (XLOG_DEBUG)
 
4265
                                {
 
4266
                                        char            buf[8192];
 
4267
 
 
4268
                                        sprintf(buf, "REDO @ %X/%X; LSN %X/%X: ",
 
4269
                                                        ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
 
4270
                                                        EndRecPtr.xlogid, EndRecPtr.xrecoff);
 
4271
                                        xlog_outrec(buf, record);
 
4272
                                        strcat(buf, " - ");
 
4273
                                        RmgrTable[record->xl_rmid].rm_desc(buf,
 
4274
                                                                record->xl_info, XLogRecGetData(record));
 
4275
                                        elog(LOG, "%s", buf);
 
4276
                                }
 
4277
#endif
 
4278
 
 
4279
                                /*
 
4280
                                 * Have we reached our recovery target?
 
4281
                                 */
 
4282
                                if (recoveryStopsHere(record, &recoveryApply))
 
4283
                                {
 
4284
                                        needNewTimeLine = true;         /* see below */
 
4285
                                        recoveryContinue = false;
 
4286
                                        /*
                                         * Either leave now without applying this record
                                         * (LastRec then still names the previous one), or
                                         * fall through, apply it, and let the loop condition
                                         * end the loop.
                                         */
                                        if (!recoveryApply)
 
4287
                                                break;
 
4288
                                }
 
4289
 
 
4290
                                /* nextXid must be beyond record's xid */
 
4291
                                if (TransactionIdFollowsOrEquals(record->xl_xid,
 
4292
                                                                                        ShmemVariableCache->nextXid))
 
4293
                                {
 
4294
                                        ShmemVariableCache->nextXid = record->xl_xid;
 
4295
                                        TransactionIdAdvance(ShmemVariableCache->nextXid);
 
4296
                                }
 
4297
 
 
4298
                                /* Restore any backup blocks attached to the record first */
                                if (record->xl_info & XLR_BKP_BLOCK_MASK)
 
4299
                                        RestoreBkpBlocks(record, EndRecPtr);
 
4300
 
 
4301
                                /* Hand the record to its resource manager's redo routine */
                                RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
 
4302
 
 
4303
                                /*
                                 * Remember the record just applied; presumably ReadRecPtr
                                 * is the start of the record most recently read -- confirm
                                 * against ReadRecord().
                                 */
                                LastRec = ReadRecPtr;
 
4304
 
 
4305
                                record = ReadRecord(NULL, LOG);
 
4306
                        } while (record != NULL && recoveryContinue);
 
4307
 
 
4308
                        /*
 
4309
                         * end of main redo apply loop
 
4310
                         */
 
4311
 
 
4312
                        ereport(LOG,
 
4313
                                        (errmsg("redo done at %X/%X",
 
4314
                                                        ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
 
4315
                        InRedo = false;
 
4316
                }
 
4317
                else
 
4318
                {
 
4319
                        /* there are no WAL records following the checkpoint */
 
4320
                        ereport(LOG,
 
4321
                                        (errmsg("redo is not required")));
 
4322
                }
 
4323
        }
 
4324
 
 
4325
        /*
 
4326
         * Re-fetch the last valid or last applied record, so we can identify
 
4327
         * the exact endpoint of what we consider the valid portion of WAL.
 
4328
         */
 
4329
        record = ReadRecord(&LastRec, PANIC);
 
4330
        EndOfLog = EndRecPtr;
 
4331
        XLByteToPrevSeg(EndOfLog, endLogId, endLogSeg);
 
4332
 
 
4333
        /*
 
4334
         * Complain if we did not roll forward far enough to render the backup
 
4335
         * dump consistent.
 
4336
         */
 
4337
        if (XLByteLT(EndOfLog, recoveryMinXlogOffset))
 
4338
        {
 
4339
                if (needNewTimeLine)    /* stopped because of stop request */
 
4340
                        ereport(FATAL,
 
4341
                                        (errmsg("requested recovery stop point is before end time of backup dump")));
 
4342
                else
 
4343
                        /* ran off end of WAL */
 
4344
                        ereport(FATAL,
 
4345
                                        (errmsg("WAL ends before end time of backup dump")));
 
4346
        }
 
4347
 
 
4348
        /*
 
4349
         * Consider whether we need to assign a new timeline ID.
 
4350
         *
 
4351
         * If we stopped short of the end of WAL during recovery, then we are
 
4352
         * generating a new timeline and must assign it a unique new ID.
 
4353
         * Otherwise, we can just extend the timeline we were in when we ran
 
4354
         * out of WAL.
 
4355
         */
 
4356
        if (needNewTimeLine)
 
4357
        {
 
4358
                ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
 
4359
                ereport(LOG,
 
4360
                                (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
 
4361
                writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
 
4362
                                                         curFileTLI, endLogId, endLogSeg);
 
4363
        }
 
4364
 
 
4365
        /* Save the selected TimeLineID in shared memory, too */
 
4366
        XLogCtl->ThisTimeLineID = ThisTimeLineID;
 
4367
 
 
4368
        /*
 
4369
         * We are now done reading the old WAL.  Turn off archive fetching if
 
4370
         * it was active, and make a writable copy of the last WAL segment.
 
4371
         * (Note that we also have a copy of the last block of the old WAL in
 
4372
         * readBuf; we will use that below.)
 
4373
         */
 
4374
        if (InArchiveRecovery)
 
4375
                exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);
 
4376
 
 
4377
        /*
 
4378
         * Prepare to write WAL starting at EndOfLog position, and init xlog
 
4379
         * buffer cache using the block containing the last record from the
 
4380
         * previous incarnation.
 
4381
         */
 
4382
        openLogId = endLogId;
 
4383
        openLogSeg = endLogSeg;
 
4384
        openLogFile = XLogFileOpen(openLogId, openLogSeg);
 
4385
        openLogOff = 0;
 
4386
        ControlFile->logId = openLogId;
 
4387
        ControlFile->logSeg = openLogSeg + 1;
 
4388
        Insert = &XLogCtl->Insert;
 
4389
        Insert->PrevRecord = LastRec;
 
4390
        XLogCtl->xlblocks[0].xlogid = openLogId;
 
4391
        /*
         * Round EndOfLog's offset up to a multiple of BLCKSZ; an offset that
         * already sits on a block boundary is left unchanged.
         */
        XLogCtl->xlblocks[0].xrecoff =
 
4392
                ((EndOfLog.xrecoff - 1) / BLCKSZ + 1) * BLCKSZ;
 
4393
 
 
4394
        /*
 
4395
         * Tricky point here: readBuf contains the *last* block that the
 
4396
         * LastRec record spans, not the one it starts in.      The last block is
 
4397
         * indeed the one we want to use.
 
4398
         */
 
4399
        Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - BLCKSZ) % XLogSegSize);
 
4400
        memcpy((char *) Insert->currpage, readBuf, BLCKSZ);
 
4401
        /*
         * Position the insert pointer at EndOfLog's offset within that block
         * (xlblocks[0] holds the block's end, hence the "+ BLCKSZ"); the
         * result equals BLCKSZ when EndOfLog is exactly on a block boundary.
         */
        Insert->currpos = (char *) Insert->currpage +
 
4402
                (EndOfLog.xrecoff + BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
 
4403
 
 
4404
        /* Everything up to EndOfLog counts as already written and flushed */
        LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
 
4405
 
 
4406
        XLogCtl->Write.LogwrtResult = LogwrtResult;
 
4407
        Insert->LogwrtResult = LogwrtResult;
 
4408
        XLogCtl->LogwrtResult = LogwrtResult;
 
4409
 
 
4410
        XLogCtl->LogwrtRqst.Write = EndOfLog;
 
4411
        XLogCtl->LogwrtRqst.Flush = EndOfLog;
 
4412
 
 
4413
        freespace = INSERT_FREESPACE(Insert);
 
4414
        if (freespace > 0)
 
4415
        {
 
4416
                /* Make sure rest of page is zero */
 
4417
                MemSet(Insert->currpos, 0, freespace);
 
4418
                XLogCtl->Write.curridx = 0;
 
4419
        }
 
4420
        else
 
4421
        {
 
4422
                /*
 
4423
                 * Whenever Write.LogwrtResult points to exactly the end of a
 
4424
                 * page, Write.curridx must point to the *next* page (see
 
4425
                 * XLogWrite()).
 
4426
                 *
 
4427
                 * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
 
4428
                 * this is sufficient.  The first actual attempt to insert a log
 
4429
                 * record will advance the insert state.
 
4430
                 */
 
4431
                XLogCtl->Write.curridx = NextBufIdx(0);
 
4432
        }
 
4433
 
 
4434
/* The UNDO pass below is compiled only when NOT_USED is defined */
#ifdef NOT_USED
 
4435
        /* UNDO */
 
4436
        if (InRecovery)
 
4437
        {
 
4438
                RecPtr = ReadRecPtr;
 
4439
                if (XLByteLT(checkPoint.undo, RecPtr))
 
4440
                {
 
4441
                        ereport(LOG,
 
4442
                                        (errmsg("undo starts at %X/%X",
 
4443
                                                        RecPtr.xlogid, RecPtr.xrecoff)));
 
4444
                        do
 
4445
                        {
 
4446
                                record = ReadRecord(&RecPtr, PANIC);
 
4447
                                if (TransactionIdIsValid(record->xl_xid) &&
 
4448
                                        !TransactionIdDidCommit(record->xl_xid))
 
4449
                                        RmgrTable[record->xl_rmid].rm_undo(EndRecPtr, record);
 
4450
                                RecPtr = record->xl_prev;
 
4451
                        } while (XLByteLE(checkPoint.undo, RecPtr));
 
4452
                        ereport(LOG,
 
4453
                                        (errmsg("undo done at %X/%X",
 
4454
                                                        ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
 
4455
                }
 
4456
                else
 
4457
                        ereport(LOG,
 
4458
                                        (errmsg("undo is not required")));
 
4459
        }
 
4460
#endif
 
4461
 
 
4462
        if (InRecovery)
 
4463
        {
 
4464
                int                     rmid;
 
4465
 
 
4466
                /*
 
4467
                 * Allow resource managers to do any required cleanup.
 
4468
                 */
 
4469
                for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
 
4470
                {
 
4471
                        if (RmgrTable[rmid].rm_cleanup != NULL)
 
4472
                                RmgrTable[rmid].rm_cleanup();
 
4473
                }
 
4474
 
 
4475
                /*
 
4476
                 * Perform a new checkpoint to update our recovery activity to
 
4477
                 * disk.
 
4478
                 *
 
4479
                 * Note that we write a shutdown checkpoint rather than an on-line
 
4480
                 * one.  This is not particularly critical, but since we may be
 
4481
                 * assigning a new TLI, using a shutdown checkpoint allows us to
 
4482
                 * have the rule that TLI only changes in shutdown checkpoints,
 
4483
                 * which allows some extra error checking in xlog_redo.
 
4484
                 *
 
4485
                 * In case we had to use the secondary checkpoint, make sure that it
 
4486
                 * will still be shown as the secondary checkpoint after this
 
4487
                 * CreateCheckPoint operation; we don't want the broken primary
 
4488
                 * checkpoint to become prevCheckPoint...
 
4489
                 */
 
4490
                if (XLByteEQ(checkPointLoc, ControlFile->prevCheckPoint))
 
4491
                        ControlFile->checkPoint = checkPointLoc;
 
4492
 
 
4493
                CreateCheckPoint(true, true);
 
4494
 
 
4495
                /*
 
4496
                 * Close down recovery environment
 
4497
                 */
 
4498
                XLogCloseRelationCache();
 
4499
 
 
4500
                /*
 
4501
                 * Now that we've checkpointed the recovery, it's safe to flush
 
4502
                 * old backup_label, if present.
 
4503
                 */
 
4504
                remove_backup_label();
 
4505
        }
 
4506
 
 
4507
        /*
 
4508
         * Preallocate additional log files, if wanted.
 
4509
         */
 
4510
        (void) PreallocXlogFiles(EndOfLog);
 
4511
 
 
4512
        /*
 
4513
         * Okay, we're officially UP.
 
4514
         */
 
4515
        InRecovery = false;
 
4516
 
 
4517
        ControlFile->state = DB_IN_PRODUCTION;
 
4518
        ControlFile->time = time(NULL);
 
4519
        UpdateControlFile();
 
4520
 
 
4521
        /* Start up the commit log, too */
 
4522
        StartupCLOG();
 
4523
        StartupSUBTRANS();
 
4524
 
 
4525
        ereport(LOG,
 
4526
                        (errmsg("database system is ready")));
 
4527
        /* Matches the increment at the top of the function */
        CritSectionCount--;
 
4528
 
 
4529
        /* Shut down readFile facility, free space */
 
4530
        if (readFile >= 0)
 
4531
        {
 
4532
                close(readFile);
 
4533
                readFile = -1;
 
4534
        }
 
4535
        if (readBuf)
 
4536
        {
 
4537
                free(readBuf);
 
4538
                readBuf = NULL;
 
4539
        }
 
4540
        if (readRecordBuf)
 
4541
        {
 
4542
                free(readRecordBuf);
 
4543
                readRecordBuf = NULL;
 
4544
                readRecordBufSize = 0;
 
4545
        }
 
4546
}
 
4547
 
 
4548
/*
 
4549
 * Subroutine to try to fetch and validate a prior checkpoint record.
 
4550
 *
 
4551
 * whichChkpt identifies the checkpoint (merely for reporting purposes).
 
4552
 * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
 
4553
 */
 
4554
static XLogRecord *
 
4555
ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
 
4556
{
 
4557
        XLogRecord *record;
 
4558
 
 
4559
        if (!XRecOffIsValid(RecPtr.xrecoff))
 
4560
        {
 
4561
                switch (whichChkpt)
 
4562
                {
 
4563
                        case 1:
 
4564
                                ereport(LOG,
 
4565
                                                (errmsg("invalid primary checkpoint link in control file")));
 
4566
                                break;
 
4567
                        case 2:
 
4568
                                ereport(LOG,
 
4569
                                                (errmsg("invalid secondary checkpoint link in control file")));
 
4570
                                break;
 
4571
                        default:
 
4572
                                ereport(LOG,
 
4573
                                (errmsg("invalid checkpoint link in backup_label file")));
 
4574
                                break;
 
4575
                }
 
4576
                return NULL;
 
4577
        }
 
4578
 
 
4579
        record = ReadRecord(&RecPtr, LOG);
 
4580
 
 
4581
        if (record == NULL)
 
4582
        {
 
4583
                switch (whichChkpt)
 
4584
                {
 
4585
                        case 1:
 
4586
                                ereport(LOG,
 
4587
                                                (errmsg("invalid primary checkpoint record")));
 
4588
                                break;
 
4589
                        case 2:
 
4590
                                ereport(LOG,
 
4591
                                                (errmsg("invalid secondary checkpoint record")));
 
4592
                                break;
 
4593
                        default:
 
4594
                                ereport(LOG,
 
4595
                                                (errmsg("invalid checkpoint record")));
 
4596
                                break;
 
4597
                }
 
4598
                return NULL;
 
4599
        }
 
4600
        if (record->xl_rmid != RM_XLOG_ID)
 
4601
        {
 
4602
                switch (whichChkpt)
 
4603
                {
 
4604
                        case 1:
 
4605
                                ereport(LOG,
 
4606
                                                (errmsg("invalid resource manager ID in primary checkpoint record")));
 
4607
                                break;
 
4608
                        case 2:
 
4609
                                ereport(LOG,
 
4610
                                                (errmsg("invalid resource manager ID in secondary checkpoint record")));
 
4611
                                break;
 
4612
                        default:
 
4613
                                ereport(LOG,
 
4614
                                                (errmsg("invalid resource manager ID in checkpoint record")));
 
4615
                                break;
 
4616
                }
 
4617
                return NULL;
 
4618
        }
 
4619
        if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
 
4620
                record->xl_info != XLOG_CHECKPOINT_ONLINE)
 
4621
        {
 
4622
                switch (whichChkpt)
 
4623
                {
 
4624
                        case 1:
 
4625
                                ereport(LOG,
 
4626
                                (errmsg("invalid xl_info in primary checkpoint record")));
 
4627
                                break;
 
4628
                        case 2:
 
4629
                                ereport(LOG,
 
4630
                                                (errmsg("invalid xl_info in secondary checkpoint record")));
 
4631
                                break;
 
4632
                        default:
 
4633
                                ereport(LOG,
 
4634
                                                (errmsg("invalid xl_info in checkpoint record")));
 
4635
                                break;
 
4636
                }
 
4637
                return NULL;
 
4638
        }
 
4639
        if (record->xl_len != sizeof(CheckPoint))
 
4640
        {
 
4641
                switch (whichChkpt)
 
4642
                {
 
4643
                        case 1:
 
4644
                                ereport(LOG,
 
4645
                                (errmsg("invalid length of primary checkpoint record")));
 
4646
                                break;
 
4647
                        case 2:
 
4648
                                ereport(LOG,
 
4649
                                                (errmsg("invalid length of secondary checkpoint record")));
 
4650
                                break;
 
4651
                        default:
 
4652
                                ereport(LOG,
 
4653
                                                (errmsg("invalid length of checkpoint record")));
 
4654
                                break;
 
4655
                }
 
4656
                return NULL;
 
4657
        }
 
4658
        return record;
 
4659
}
 
4660
 
 
4661
/*
 
4662
 * This must be called during startup of a backend process, except that
 
4663
 * it need not be called in a standalone backend (which does StartupXLOG
 
4664
 * instead).  We need to initialize the local copies of ThisTimeLineID and
 
4665
 * RedoRecPtr.
 
4666
 *
 
4667
 * Note: before Postgres 8.0, we went to some effort to keep the postmaster
 
4668
 * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
 
4669
 * unnecessary however, since the postmaster itself never touches XLOG anyway.
 
4670
 */
 
4671
void
 
4672
InitXLOGAccess(void)
 
4673
{
 
4674
        /* ThisTimeLineID doesn't change so we need no lock to copy it */
 
4675
        ThisTimeLineID = XLogCtl->ThisTimeLineID;
 
4676
        /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
 
4677
        (void) GetRedoRecPtr();
 
4678
}
 
4679
 
 
4680
/*
 
4681
 * Once spawned, a backend may update its local RedoRecPtr from
 
4682
 * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
 
4683
 * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
 
4684
 */
 
4685
XLogRecPtr
 
4686
GetRedoRecPtr(void)
 
4687
{
 
4688
        /* use volatile pointer to prevent code rearrangement */
 
4689
        volatile XLogCtlData *xlogctl = XLogCtl;
 
4690
 
 
4691
        SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
 
4692
        Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
 
4693
        RedoRecPtr = xlogctl->Insert.RedoRecPtr;
 
4694
        SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
 
4695
 
 
4696
        return RedoRecPtr;
 
4697
}
 
4698
 
 
4699
/*
 
4700
 * This must be called ONCE during postmaster or standalone-backend shutdown
 
4701
 */
 
4702
void
 
4703
ShutdownXLOG(int code, Datum arg)
 
4704
{
 
4705
        ereport(LOG,
 
4706
                        (errmsg("shutting down")));
 
4707
 
 
4708
        CritSectionCount++;
 
4709
        CreateCheckPoint(true, true);
 
4710
        ShutdownCLOG();
 
4711
        ShutdownSUBTRANS();
 
4712
        CritSectionCount--;
 
4713
 
 
4714
        ereport(LOG,
 
4715
                        (errmsg("database system is shut down")));
 
4716
}
 
4717
 
 
4718
/*
 
4719
 * Perform a checkpoint --- either during shutdown, or on-the-fly
 
4720
 *
 
4721
 * If force is true, we force a checkpoint regardless of whether any XLOG
 
4722
 * activity has occurred since the last one.
 
4723
 */
 
4724
void
 
4725
CreateCheckPoint(bool shutdown, bool force)
 
4726
{
 
4727
        CheckPoint      checkPoint;
 
4728
        XLogRecPtr      recptr;
 
4729
        XLogCtlInsert *Insert = &XLogCtl->Insert;
 
4730
        XLogRecData rdata;
 
4731
        uint32          freespace;
 
4732
        uint32          _logId;
 
4733
        uint32          _logSeg;
 
4734
        int                     nsegsadded = 0;
 
4735
        int                     nsegsremoved = 0;
 
4736
        int                     nsegsrecycled = 0;
 
4737
 
 
4738
        /*
 
4739
         * Acquire CheckpointLock to ensure only one checkpoint happens at a
 
4740
         * time.  (This is just pro forma, since in the present system
 
4741
         * structure there is only one process that is allowed to issue
 
4742
         * checkpoints at any given time.)
 
4743
         */
 
4744
        LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
 
4745
 
 
4746
        /*
 
4747
         * Use a critical section to force system panic if we have trouble.
 
4748
         */
 
4749
        START_CRIT_SECTION();
 
4750
 
 
4751
        if (shutdown)
 
4752
        {
 
4753
                ControlFile->state = DB_SHUTDOWNING;
 
4754
                ControlFile->time = time(NULL);
 
4755
                UpdateControlFile();
 
4756
        }
 
4757
 
 
4758
        MemSet(&checkPoint, 0, sizeof(checkPoint));
 
4759
        checkPoint.ThisTimeLineID = ThisTimeLineID;
 
4760
        checkPoint.time = time(NULL);
 
4761
 
 
4762
        /*
 
4763
         * We must hold CheckpointStartLock while determining the checkpoint
 
4764
         * REDO pointer.  This ensures that any concurrent transaction commits
 
4765
         * will be either not yet logged, or logged and recorded in pg_clog.
 
4766
         * See notes in RecordTransactionCommit().
 
4767
         */
 
4768
        LWLockAcquire(CheckpointStartLock, LW_EXCLUSIVE);
 
4769
 
 
4770
        /* And we need WALInsertLock too */
 
4771
        LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 
4772
 
 
4773
        /*
 
4774
         * If this isn't a shutdown or forced checkpoint, and we have not
 
4775
         * inserted any XLOG records since the start of the last checkpoint,
 
4776
         * skip the checkpoint.  The idea here is to avoid inserting duplicate
 
4777
         * checkpoints when the system is idle. That wastes log space, and
 
4778
         * more importantly it exposes us to possible loss of both current and
 
4779
         * previous checkpoint records if the machine crashes just as we're
 
4780
         * writing the update. (Perhaps it'd make even more sense to
 
4781
         * checkpoint only when the previous checkpoint record is in a
 
4782
         * different xlog page?)
 
4783
         *
 
4784
         * We have to make two tests to determine that nothing has happened since
 
4785
         * the start of the last checkpoint: current insertion point must
 
4786
         * match the end of the last checkpoint record, and its redo pointer
 
4787
         * must point to itself.
 
4788
         */
 
4789
        if (!shutdown && !force)
 
4790
        {
 
4791
                XLogRecPtr      curInsert;
 
4792
 
 
4793
                INSERT_RECPTR(curInsert, Insert, Insert->curridx);
 
4794
                if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
 
4795
                        curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
 
4796
                        MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
 
4797
                        ControlFile->checkPoint.xlogid ==
 
4798
                        ControlFile->checkPointCopy.redo.xlogid &&
 
4799
                        ControlFile->checkPoint.xrecoff ==
 
4800
                        ControlFile->checkPointCopy.redo.xrecoff)
 
4801
                {
 
4802
                        LWLockRelease(WALInsertLock);
 
4803
                        LWLockRelease(CheckpointStartLock);
 
4804
                        LWLockRelease(CheckpointLock);
 
4805
                        END_CRIT_SECTION();
 
4806
                        return;
 
4807
                }
 
4808
        }
 
4809
 
 
4810
        /*
 
4811
         * Compute new REDO record ptr = location of next XLOG record.
 
4812
         *
 
4813
         * NB: this is NOT necessarily where the checkpoint record itself will
 
4814
         * be, since other backends may insert more XLOG records while we're
 
4815
         * off doing the buffer flush work.  Those XLOG records are logically
 
4816
         * after the checkpoint, even though physically before it.      Got that?
 
4817
         */
 
4818
        freespace = INSERT_FREESPACE(Insert);
 
4819
        if (freespace < SizeOfXLogRecord)
 
4820
        {
 
4821
                (void) AdvanceXLInsertBuffer();
 
4822
                /* OK to ignore update return flag, since we will do flush anyway */
 
4823
                freespace = INSERT_FREESPACE(Insert);
 
4824
        }
 
4825
        INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
 
4826
 
 
4827
        /*
 
4828
         * Here we update the shared RedoRecPtr for future XLogInsert calls;
 
4829
         * this must be done while holding the insert lock AND the info_lck.
 
4830
         *
 
4831
         * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
 
4832
         * pointing past where it really needs to point.  This is okay; the
 
4833
         * only consequence is that XLogInsert might back up whole buffers
 
4834
         * that it didn't really need to.  We can't postpone advancing
 
4835
         * RedoRecPtr because XLogInserts that happen while we are dumping
 
4836
         * buffers must assume that their buffer changes are not included in
 
4837
         * the checkpoint.
 
4838
         */
 
4839
        {
 
4840
                /* use volatile pointer to prevent code rearrangement */
 
4841
                volatile XLogCtlData *xlogctl = XLogCtl;
 
4842
 
 
4843
                SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
 
4844
                RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
 
4845
                SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
 
4846
        }
 
4847
 
 
4848
        /*
 
4849
         * Get UNDO record ptr - this is oldest of PGPROC->logRec values. We
 
4850
         * do this while holding insert lock to ensure that we won't miss any
 
4851
         * about-to-commit transactions (UNDO must include all xacts that have
 
4852
         * commits after REDO point).
 
4853
         *
 
4854
         * XXX temporarily ifdef'd out to avoid three-way deadlock condition:
 
4855
         * GetUndoRecPtr needs to grab SInvalLock to ensure that it is looking
 
4856
         * at a stable set of proc records, but grabbing SInvalLock while
 
4857
         * holding WALInsertLock is no good.  GetNewTransactionId may cause a
 
4858
         * WAL record to be written while holding XidGenLock, and
 
4859
         * GetSnapshotData needs to get XidGenLock while holding SInvalLock,
 
4860
         * so there's a risk of deadlock. Need to find a better solution.  See
 
4861
         * pgsql-hackers discussion of 17-Dec-01.
 
4862
         *
 
4863
         * XXX actually, the whole UNDO code is dead code and unlikely to ever be
 
4864
         * revived, so the lack of a good solution here is not troubling.
 
4865
         */
 
4866
#ifdef NOT_USED
 
4867
        checkPoint.undo = GetUndoRecPtr();
 
4868
 
 
4869
        if (shutdown && checkPoint.undo.xrecoff != 0)
 
4870
                elog(PANIC, "active transaction while database system is shutting down");
 
4871
#endif
 
4872
 
 
4873
        /*
 
4874
         * Now we can release insert lock and checkpoint start lock, allowing
 
4875
         * other xacts to proceed even while we are flushing disk buffers.
 
4876
         */
 
4877
        LWLockRelease(WALInsertLock);
 
4878
 
 
4879
        LWLockRelease(CheckpointStartLock);
 
4880
 
 
4881
        /*
 
4882
         * Get the other info we need for the checkpoint record.
 
4883
         */
 
4884
        LWLockAcquire(XidGenLock, LW_SHARED);
 
4885
        checkPoint.nextXid = ShmemVariableCache->nextXid;
 
4886
        LWLockRelease(XidGenLock);
 
4887
 
 
4888
        LWLockAcquire(OidGenLock, LW_SHARED);
 
4889
        checkPoint.nextOid = ShmemVariableCache->nextOid;
 
4890
        if (!shutdown)
 
4891
                checkPoint.nextOid += ShmemVariableCache->oidCount;
 
4892
        LWLockRelease(OidGenLock);
 
4893
 
 
4894
        /*
 
4895
         * Having constructed the checkpoint record, ensure all shmem disk
 
4896
         * buffers and commit-log buffers are flushed to disk.
 
4897
         *
 
4898
         * This I/O could fail for various reasons.  If so, we will fail to
 
4899
         * complete the checkpoint, but there is no reason to force a system
 
4900
         * panic.  Accordingly, exit critical section while doing it.
 
4901
         */
 
4902
        END_CRIT_SECTION();
 
4903
 
 
4904
        if (!shutdown)
 
4905
                ereport(DEBUG1,
 
4906
                                (errmsg("checkpoint starting")));
 
4907
 
 
4908
        CheckPointCLOG();
 
4909
        CheckPointSUBTRANS();
 
4910
        FlushBufferPool();
 
4911
 
 
4912
        START_CRIT_SECTION();
 
4913
 
 
4914
        /*
 
4915
         * Now insert the checkpoint record into XLOG.
 
4916
         */
 
4917
        rdata.buffer = InvalidBuffer;
 
4918
        rdata.data = (char *) (&checkPoint);
 
4919
        rdata.len = sizeof(checkPoint);
 
4920
        rdata.next = NULL;
 
4921
 
 
4922
        recptr = XLogInsert(RM_XLOG_ID,
 
4923
                                                shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
 
4924
                                                XLOG_CHECKPOINT_ONLINE,
 
4925
                                                &rdata);
 
4926
 
 
4927
        XLogFlush(recptr);
 
4928
 
 
4929
        /*
 
4930
         * We now have ProcLastRecPtr = start of actual checkpoint record,
 
4931
         * recptr = end of actual checkpoint record.
 
4932
         */
 
4933
        if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
 
4934
                ereport(PANIC,
 
4935
                                (errmsg("concurrent transaction log activity while database system is shutting down")));
 
4936
 
 
4937
        /*
 
4938
         * Select point at which we can truncate the log, which we base on the
 
4939
         * prior checkpoint's earliest info.
 
4940
         *
 
4941
         * With UNDO support: oldest item is redo or undo, whichever is older;
 
4942
         * but watch out for case that undo = 0.
 
4943
         *
 
4944
         * Without UNDO support: just use the redo pointer.  This allows xlog
 
4945
         * space to be freed much faster when there are long-running
 
4946
         * transactions.
 
4947
         */
 
4948
#ifdef NOT_USED
 
4949
        if (ControlFile->checkPointCopy.undo.xrecoff != 0 &&
 
4950
                XLByteLT(ControlFile->checkPointCopy.undo,
 
4951
                                 ControlFile->checkPointCopy.redo))
 
4952
                XLByteToSeg(ControlFile->checkPointCopy.undo, _logId, _logSeg);
 
4953
        else
 
4954
#endif
 
4955
                XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
 
4956
 
 
4957
        /*
 
4958
         * Update the control file.
 
4959
         */
 
4960
        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 
4961
        if (shutdown)
 
4962
                ControlFile->state = DB_SHUTDOWNED;
 
4963
        ControlFile->prevCheckPoint = ControlFile->checkPoint;
 
4964
        ControlFile->checkPoint = ProcLastRecPtr;
 
4965
        ControlFile->checkPointCopy = checkPoint;
 
4966
        ControlFile->time = time(NULL);
 
4967
        UpdateControlFile();
 
4968
        LWLockRelease(ControlFileLock);
 
4969
 
 
4970
        /*
 
4971
         * We are now done with critical updates; no need for system panic if
 
4972
         * we have trouble while fooling with offline log segments.
 
4973
         */
 
4974
        END_CRIT_SECTION();
 
4975
 
 
4976
        /*
 
4977
         * Delete offline log files (those no longer needed even for previous
 
4978
         * checkpoint).
 
4979
         */
 
4980
        if (_logId || _logSeg)
 
4981
        {
 
4982
                PrevLogSeg(_logId, _logSeg);
 
4983
                MoveOfflineLogs(_logId, _logSeg, recptr,
 
4984
                                                &nsegsremoved, &nsegsrecycled);
 
4985
        }
 
4986
 
 
4987
        /*
 
4988
         * Make more log segments if needed.  (Do this after deleting offline
 
4989
         * log segments, to avoid having peak disk space usage higher than
 
4990
         * necessary.)
 
4991
         */
 
4992
        if (!shutdown)
 
4993
                nsegsadded = PreallocXlogFiles(recptr);
 
4994
 
 
4995
        /*
 
4996
         * Truncate pg_subtrans if possible.  We can throw away all data
 
4997
         * before the oldest XMIN of any running transaction.  No future
 
4998
         * transaction will attempt to reference any pg_subtrans entry older
 
4999
         * than that (see Asserts in subtrans.c).  During recovery, though, we
 
5000
         * mustn't do this because StartupSUBTRANS hasn't been called yet.
 
5001
         */
 
5002
        if (!InRecovery)
 
5003
                TruncateSUBTRANS(GetOldestXmin(true));
 
5004
 
 
5005
        if (!shutdown)
 
5006
                ereport(DEBUG1,
 
5007
                                (errmsg("checkpoint complete; %d transaction log file(s) added, %d removed, %d recycled",
 
5008
                                                nsegsadded, nsegsremoved, nsegsrecycled)));
 
5009
 
 
5010
        LWLockRelease(CheckpointLock);
 
5011
}
 
5012
 
 
5013
/*
 
5014
 * Write a NEXTOID log record
 
5015
 */
 
5016
void
 
5017
XLogPutNextOid(Oid nextOid)
 
5018
{
 
5019
        XLogRecData rdata;
 
5020
 
 
5021
        rdata.buffer = InvalidBuffer;
 
5022
        rdata.data = (char *) (&nextOid);
 
5023
        rdata.len = sizeof(Oid);
 
5024
        rdata.next = NULL;
 
5025
        (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
 
5026
}
 
5027
 
 
5028
/*
 
5029
 * XLOG resource manager's routines
 
5030
 */
 
5031
void
 
5032
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 
5033
{
 
5034
        uint8           info = record->xl_info & ~XLR_INFO_MASK;
 
5035
 
 
5036
        if (info == XLOG_NEXTOID)
 
5037
        {
 
5038
                Oid                     nextOid;
 
5039
 
 
5040
                memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
 
5041
                if (ShmemVariableCache->nextOid < nextOid)
 
5042
                {
 
5043
                        ShmemVariableCache->nextOid = nextOid;
 
5044
                        ShmemVariableCache->oidCount = 0;
 
5045
                }
 
5046
        }
 
5047
        else if (info == XLOG_CHECKPOINT_SHUTDOWN)
 
5048
        {
 
5049
                CheckPoint      checkPoint;
 
5050
 
 
5051
                memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
 
5052
                /* In a SHUTDOWN checkpoint, believe the counters exactly */
 
5053
                ShmemVariableCache->nextXid = checkPoint.nextXid;
 
5054
                ShmemVariableCache->nextOid = checkPoint.nextOid;
 
5055
                ShmemVariableCache->oidCount = 0;
 
5056
 
 
5057
                /*
 
5058
                 * TLI may change in a shutdown checkpoint, but it shouldn't
 
5059
                 * decrease
 
5060
                 */
 
5061
                if (checkPoint.ThisTimeLineID != ThisTimeLineID)
 
5062
                {
 
5063
                        if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
 
5064
                                !list_member_int(expectedTLIs,
 
5065
                                                                 (int) checkPoint.ThisTimeLineID))
 
5066
                                ereport(PANIC,
 
5067
                                                (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
 
5068
                                                        checkPoint.ThisTimeLineID, ThisTimeLineID)));
 
5069
                        /* Following WAL records should be run with new TLI */
 
5070
                        ThisTimeLineID = checkPoint.ThisTimeLineID;
 
5071
                }
 
5072
        }
 
5073
        else if (info == XLOG_CHECKPOINT_ONLINE)
 
5074
        {
 
5075
                CheckPoint      checkPoint;
 
5076
 
 
5077
                memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
 
5078
                /* In an ONLINE checkpoint, treat the counters like NEXTOID */
 
5079
                if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
 
5080
                                                                  checkPoint.nextXid))
 
5081
                        ShmemVariableCache->nextXid = checkPoint.nextXid;
 
5082
                if (ShmemVariableCache->nextOid < checkPoint.nextOid)
 
5083
                {
 
5084
                        ShmemVariableCache->nextOid = checkPoint.nextOid;
 
5085
                        ShmemVariableCache->oidCount = 0;
 
5086
                }
 
5087
                /* TLI should not change in an on-line checkpoint */
 
5088
                if (checkPoint.ThisTimeLineID != ThisTimeLineID)
 
5089
                        ereport(PANIC,
 
5090
                                        (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
 
5091
                                                        checkPoint.ThisTimeLineID, ThisTimeLineID)));
 
5092
        }
 
5093
}
 
5094
 
 
5095
void
 
5096
xlog_undo(XLogRecPtr lsn, XLogRecord *record)
 
5097
{
 
5098
}
 
5099
 
 
5100
void
 
5101
xlog_desc(char *buf, uint8 xl_info, char *rec)
 
5102
{
 
5103
        uint8           info = xl_info & ~XLR_INFO_MASK;
 
5104
 
 
5105
        if (info == XLOG_CHECKPOINT_SHUTDOWN ||
 
5106
                info == XLOG_CHECKPOINT_ONLINE)
 
5107
        {
 
5108
                CheckPoint *checkpoint = (CheckPoint *) rec;
 
5109
 
 
5110
                sprintf(buf + strlen(buf), "checkpoint: redo %X/%X; undo %X/%X; "
 
5111
                                "tli %u; xid %u; oid %u; %s",
 
5112
                                checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
 
5113
                                checkpoint->undo.xlogid, checkpoint->undo.xrecoff,
 
5114
                                checkpoint->ThisTimeLineID, checkpoint->nextXid,
 
5115
                                checkpoint->nextOid,
 
5116
                         (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
 
5117
        }
 
5118
        else if (info == XLOG_NEXTOID)
 
5119
        {
 
5120
                Oid                     nextOid;
 
5121
 
 
5122
                memcpy(&nextOid, rec, sizeof(Oid));
 
5123
                sprintf(buf + strlen(buf), "nextOid: %u", nextOid);
 
5124
        }
 
5125
        else
 
5126
                strcat(buf, "UNKNOWN");
 
5127
}
 
5128
 
 
5129
#ifdef WAL_DEBUG
 
5130
 
 
5131
static void
 
5132
xlog_outrec(char *buf, XLogRecord *record)
 
5133
{
 
5134
        int                     bkpb;
 
5135
        int                     i;
 
5136
 
 
5137
        sprintf(buf + strlen(buf), "prev %X/%X; xid %u",
 
5138
                        record->xl_prev.xlogid, record->xl_prev.xrecoff,
 
5139
                        record->xl_xid);
 
5140
 
 
5141
        for (i = 0, bkpb = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 
5142
        {
 
5143
                if (!(record->xl_info & (XLR_SET_BKP_BLOCK(i))))
 
5144
                        continue;
 
5145
                bkpb++;
 
5146
        }
 
5147
 
 
5148
        if (bkpb)
 
5149
                sprintf(buf + strlen(buf), "; bkpb %d", bkpb);
 
5150
 
 
5151
        sprintf(buf + strlen(buf), ": %s",
 
5152
                        RmgrTable[record->xl_rmid].rm_name);
 
5153
}
 
5154
#endif   /* WAL_DEBUG */
 
5155
 
 
5156
 
 
5157
/*
 
5158
 * GUC support
 
5159
 */
 
5160
const char *
 
5161
assign_xlog_sync_method(const char *method, bool doit, GucSource source)
 
5162
{
 
5163
        int                     new_sync_method;
 
5164
        int                     new_sync_bit;
 
5165
 
 
5166
#ifndef FSYNC_IS_WRITE_THROUGH
 
5167
        if (pg_strcasecmp(method, "fsync") == 0)
 
5168
#else
 
5169
        /* Win32 fsync() == _commit(), which writes through a write cache */
 
5170
        if (pg_strcasecmp(method, "fsync_writethrough") == 0)
 
5171
#endif
 
5172
        {
 
5173
                new_sync_method = SYNC_METHOD_FSYNC;
 
5174
                new_sync_bit = 0;
 
5175
        }
 
5176
#ifdef HAVE_FDATASYNC
 
5177
        else if (pg_strcasecmp(method, "fdatasync") == 0)
 
5178
        {
 
5179
                new_sync_method = SYNC_METHOD_FDATASYNC;
 
5180
                new_sync_bit = 0;
 
5181
        }
 
5182
#endif
 
5183
#ifdef OPEN_SYNC_FLAG
 
5184
        else if (pg_strcasecmp(method, "open_sync") == 0)
 
5185
        {
 
5186
                new_sync_method = SYNC_METHOD_OPEN;
 
5187
                new_sync_bit = OPEN_SYNC_FLAG;
 
5188
        }
 
5189
#endif
 
5190
#ifdef OPEN_DATASYNC_FLAG
 
5191
        else if (pg_strcasecmp(method, "open_datasync") == 0)
 
5192
        {
 
5193
                new_sync_method = SYNC_METHOD_OPEN;
 
5194
                new_sync_bit = OPEN_DATASYNC_FLAG;
 
5195
        }
 
5196
#endif
 
5197
        else
 
5198
                return NULL;
 
5199
 
 
5200
        if (!doit)
 
5201
                return method;
 
5202
 
 
5203
        if (sync_method != new_sync_method || open_sync_bit != new_sync_bit)
 
5204
        {
 
5205
                /*
 
5206
                 * To ensure that no blocks escape unsynced, force an fsync on the
 
5207
                 * currently open log segment (if any).  Also, if the open flag is
 
5208
                 * changing, close the log file so it will be reopened (with new
 
5209
                 * flag bit) at next use.
 
5210
                 */
 
5211
                if (openLogFile >= 0)
 
5212
                {
 
5213
                        if (pg_fsync(openLogFile) != 0)
 
5214
                                ereport(PANIC,
 
5215
                                                (errcode_for_file_access(),
 
5216
                                        errmsg("could not fsync log file %u, segment %u: %m",
 
5217
                                                   openLogId, openLogSeg)));
 
5218
                        if (open_sync_bit != new_sync_bit)
 
5219
                        {
 
5220
                                if (close(openLogFile))
 
5221
                                        ereport(PANIC,
 
5222
                                                        (errcode_for_file_access(),
 
5223
                                        errmsg("could not close log file %u, segment %u: %m",
 
5224
                                                   openLogId, openLogSeg)));
 
5225
                                openLogFile = -1;
 
5226
                        }
 
5227
                }
 
5228
                sync_method = new_sync_method;
 
5229
                open_sync_bit = new_sync_bit;
 
5230
        }
 
5231
 
 
5232
        return method;
 
5233
}
 
5234
 
 
5235
 
 
5236
/*
 
5237
 * Issue appropriate kind of fsync (if any) on the current XLOG output file
 
5238
 */
 
5239
static void
 
5240
issue_xlog_fsync(void)
 
5241
{
 
5242
        switch (sync_method)
 
5243
        {
 
5244
                case SYNC_METHOD_FSYNC:
 
5245
                        if (pg_fsync(openLogFile) != 0)
 
5246
                                ereport(PANIC,
 
5247
                                                (errcode_for_file_access(),
 
5248
                                        errmsg("could not fsync log file %u, segment %u: %m",
 
5249
                                                   openLogId, openLogSeg)));
 
5250
                        break;
 
5251
#ifdef HAVE_FDATASYNC
 
5252
                case SYNC_METHOD_FDATASYNC:
 
5253
                        if (pg_fdatasync(openLogFile) != 0)
 
5254
                                ereport(PANIC,
 
5255
                                                (errcode_for_file_access(),
 
5256
                                errmsg("could not fdatasync log file %u, segment %u: %m",
 
5257
                                           openLogId, openLogSeg)));
 
5258
                        break;
 
5259
#endif
 
5260
                case SYNC_METHOD_OPEN:
 
5261
                        /* write synced it already */
 
5262
                        break;
 
5263
                default:
 
5264
                        elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
 
5265
                        break;
 
5266
        }
 
5267
}
 
5268
 
 
5269
 
 
5270
/*
 
5271
 * pg_start_backup: set up for taking an on-line backup dump
 
5272
 *
 
5273
 * Essentially what this does is to create a backup label file in $PGDATA,
 
5274
 * where it will be archived as part of the backup dump.  The label file
 
5275
 * contains the user-supplied label string (typically this would be used
 
5276
 * to tell where the backup dump will be stored) and the starting time and
 
5277
 * starting WAL offset for the dump.
 
5278
 */
 
5279
Datum
 
5280
pg_start_backup(PG_FUNCTION_ARGS)
 
5281
{
 
5282
        text       *backupid = PG_GETARG_TEXT_P(0);
 
5283
        text       *result;
 
5284
        char       *backupidstr;
 
5285
        XLogRecPtr      checkpointloc;
 
5286
        XLogRecPtr      startpoint;
 
5287
        time_t          stamp_time;
 
5288
        char            strfbuf[128];
 
5289
        char            labelfilepath[MAXPGPATH];
 
5290
        char            xlogfilename[MAXFNAMELEN];
 
5291
        uint32          _logId;
 
5292
        uint32          _logSeg;
 
5293
        struct stat stat_buf;
 
5294
        FILE       *fp;
 
5295
 
 
5296
        if (!superuser())
 
5297
                ereport(ERROR,
 
5298
                                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 
5299
                                 (errmsg("must be superuser to run a backup"))));
 
5300
 
 
5301
        if (!XLogArchivingActive())
 
5302
                ereport(ERROR,
 
5303
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 
5304
                                 (errmsg("WAL archiving is not active"),
 
5305
                                 (errhint("archive_command must be defined before "
 
5306
                                                  "online backups can be made safely.")))));
 
5307
 
 
5308
        backupidstr = DatumGetCString(DirectFunctionCall1(textout,
 
5309
                                                                                         PointerGetDatum(backupid)));
 
5310
 
 
5311
        /*
 
5312
         * Force a CHECKPOINT.  This is not strictly necessary, but it seems
 
5313
         * like a good idea to minimize the amount of past WAL needed to use
 
5314
         * the backup.  Also, this guarantees that two successive backup runs
 
5315
         * will have different checkpoint positions and hence different
 
5316
         * history file names, even if nothing happened in between.
 
5317
         */
 
5318
        RequestCheckpoint(true);
 
5319
 
 
5320
        /*
 
5321
         * Now we need to fetch the checkpoint record location, and also its
 
5322
         * REDO pointer.  The oldest point in WAL that would be needed to
 
5323
         * restore starting from the checkpoint is precisely the REDO pointer.
 
5324
         */
 
5325
        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 
5326
        checkpointloc = ControlFile->checkPoint;
 
5327
        startpoint = ControlFile->checkPointCopy.redo;
 
5328
        LWLockRelease(ControlFileLock);
 
5329
 
 
5330
        XLByteToSeg(startpoint, _logId, _logSeg);
 
5331
        XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
 
5332
 
 
5333
        /*
 
5334
         * We deliberately use strftime/localtime not the src/timezone
 
5335
         * functions, so that backup labels will consistently be recorded in
 
5336
         * the same timezone regardless of TimeZone setting.  This matches
 
5337
         * elog.c's practice.
 
5338
         */
 
5339
        stamp_time = time(NULL);
 
5340
        strftime(strfbuf, sizeof(strfbuf),
 
5341
                         "%Y-%m-%d %H:%M:%S %Z",
 
5342
                         localtime(&stamp_time));
 
5343
 
 
5344
        /*
 
5345
         * Check for existing backup label --- implies a backup is already
 
5346
         * running
 
5347
         */
 
5348
        snprintf(labelfilepath, MAXPGPATH, "%s/backup_label", DataDir);
 
5349
        if (stat(labelfilepath, &stat_buf) != 0)
 
5350
        {
 
5351
                if (errno != ENOENT)
 
5352
                        ereport(ERROR,
 
5353
                                        (errcode_for_file_access(),
 
5354
                                         errmsg("could not stat file \"%s\": %m",
 
5355
                                                        labelfilepath)));
 
5356
        }
 
5357
        else
 
5358
                ereport(ERROR,
 
5359
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 
5360
                                 errmsg("a backup is already in progress"),
 
5361
                                 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
 
5362
                                                 labelfilepath)));
 
5363
 
 
5364
        /*
 
5365
         * Okay, write the file
 
5366
         */
 
5367
        fp = AllocateFile(labelfilepath, "w");
 
5368
        if (!fp)
 
5369
                ereport(ERROR,
 
5370
                                (errcode_for_file_access(),
 
5371
                                 errmsg("could not create file \"%s\": %m",
 
5372
                                                labelfilepath)));
 
5373
        fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
 
5374
                        startpoint.xlogid, startpoint.xrecoff, xlogfilename);
 
5375
        fprintf(fp, "CHECKPOINT LOCATION: %X/%X\n",
 
5376
                        checkpointloc.xlogid, checkpointloc.xrecoff);
 
5377
        fprintf(fp, "START TIME: %s\n", strfbuf);
 
5378
        fprintf(fp, "LABEL: %s\n", backupidstr);
 
5379
        if (fflush(fp) || ferror(fp) || FreeFile(fp))
 
5380
                ereport(ERROR,
 
5381
                                (errcode_for_file_access(),
 
5382
                                 errmsg("could not write file \"%s\": %m",
 
5383
                                                labelfilepath)));
 
5384
 
 
5385
        /*
 
5386
         * We're done.  As a convenience, return the starting WAL offset.
 
5387
         */
 
5388
        snprintf(xlogfilename, sizeof(xlogfilename), "%X/%X",
 
5389
                         startpoint.xlogid, startpoint.xrecoff);
 
5390
        result = DatumGetTextP(DirectFunctionCall1(textin,
 
5391
                                                                                 CStringGetDatum(xlogfilename)));
 
5392
        PG_RETURN_TEXT_P(result);
 
5393
}
 
5394
 
 
5395
/*
 
5396
 * pg_stop_backup: finish taking an on-line backup dump
 
5397
 *
 
5398
 * We remove the backup label file created by pg_start_backup, and instead
 
5399
 * create a backup history file in pg_xlog (whence it will immediately be
 
5400
 * archived).  The backup history file contains the same info found in
 
5401
 * the label file, plus the backup-end time and WAL offset.
 
5402
 */
 
5403
Datum
 
5404
pg_stop_backup(PG_FUNCTION_ARGS)
 
5405
{
 
5406
        text       *result;
 
5407
        XLogCtlInsert *Insert = &XLogCtl->Insert;
 
5408
        XLogRecPtr      startpoint;
 
5409
        XLogRecPtr      stoppoint;
 
5410
        time_t          stamp_time;
 
5411
        char            strfbuf[128];
 
5412
        char            labelfilepath[MAXPGPATH];
 
5413
        char            histfilepath[MAXPGPATH];
 
5414
        char            startxlogfilename[MAXFNAMELEN];
 
5415
        char            stopxlogfilename[MAXFNAMELEN];
 
5416
        uint32          _logId;
 
5417
        uint32          _logSeg;
 
5418
        FILE       *lfp;
 
5419
        FILE       *fp;
 
5420
        char            ch;
 
5421
        int                     ich;
 
5422
 
 
5423
        if (!superuser())
 
5424
                ereport(ERROR,
 
5425
                                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 
5426
                                 (errmsg("must be superuser to run a backup"))));
 
5427
 
 
5428
        /*
 
5429
         * Get the current end-of-WAL position; it will be unsafe to use this
 
5430
         * dump to restore to a point in advance of this time.
 
5431
         */
 
5432
        LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 
5433
        INSERT_RECPTR(stoppoint, Insert, Insert->curridx);
 
5434
        LWLockRelease(WALInsertLock);
 
5435
 
 
5436
        XLByteToSeg(stoppoint, _logId, _logSeg);
 
5437
        XLogFileName(stopxlogfilename, ThisTimeLineID, _logId, _logSeg);
 
5438
 
 
5439
        /*
 
5440
         * We deliberately use strftime/localtime not the src/timezone
 
5441
         * functions, so that backup labels will consistently be recorded in
 
5442
         * the same timezone regardless of TimeZone setting.  This matches
 
5443
         * elog.c's practice.
 
5444
         */
 
5445
        stamp_time = time(NULL);
 
5446
        strftime(strfbuf, sizeof(strfbuf),
 
5447
                         "%Y-%m-%d %H:%M:%S %Z",
 
5448
                         localtime(&stamp_time));
 
5449
 
 
5450
        /*
 
5451
         * Open the existing label file
 
5452
         */
 
5453
        snprintf(labelfilepath, MAXPGPATH, "%s/backup_label", DataDir);
 
5454
        lfp = AllocateFile(labelfilepath, "r");
 
5455
        if (!lfp)
 
5456
        {
 
5457
                if (errno != ENOENT)
 
5458
                        ereport(ERROR,
 
5459
                                        (errcode_for_file_access(),
 
5460
                                         errmsg("could not read file \"%s\": %m",
 
5461
                                                        labelfilepath)));
 
5462
                ereport(ERROR,
 
5463
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 
5464
                                 errmsg("a backup is not in progress")));
 
5465
        }
 
5466
 
 
5467
        /*
 
5468
         * Read and parse the START WAL LOCATION line (this code is pretty
 
5469
         * crude, but we are not expecting any variability in the file
 
5470
         * format).
 
5471
         */
 
5472
        if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %24s)%c",
 
5473
                           &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
 
5474
                           &ch) != 4 || ch != '\n')
 
5475
                ereport(ERROR,
 
5476
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 
5477
                                 errmsg("invalid data in file \"%s\"", labelfilepath)));
 
5478
 
 
5479
        /*
 
5480
         * Write the backup history file
 
5481
         */
 
5482
        XLByteToSeg(startpoint, _logId, _logSeg);
 
5483
        BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logId, _logSeg,
 
5484
                                                  startpoint.xrecoff % XLogSegSize);
 
5485
        fp = AllocateFile(histfilepath, "w");
 
5486
        if (!fp)
 
5487
                ereport(ERROR,
 
5488
                                (errcode_for_file_access(),
 
5489
                                 errmsg("could not create file \"%s\": %m",
 
5490
                                                histfilepath)));
 
5491
        fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
 
5492
                        startpoint.xlogid, startpoint.xrecoff, startxlogfilename);
 
5493
        fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
 
5494
                        stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename);
 
5495
        /* transfer remaining lines from label to history file */
 
5496
        while ((ich = fgetc(lfp)) != EOF)
 
5497
                fputc(ich, fp);
 
5498
        fprintf(fp, "STOP TIME: %s\n", strfbuf);
 
5499
        if (fflush(fp) || ferror(fp) || FreeFile(fp))
 
5500
                ereport(ERROR,
 
5501
                                (errcode_for_file_access(),
 
5502
                                 errmsg("could not write file \"%s\": %m",
 
5503
                                                histfilepath)));
 
5504
 
 
5505
        /*
 
5506
         * Close and remove the backup label file
 
5507
         */
 
5508
        if (ferror(lfp) || FreeFile(lfp))
 
5509
                ereport(ERROR,
 
5510
                                (errcode_for_file_access(),
 
5511
                                 errmsg("could not read file \"%s\": %m",
 
5512
                                                labelfilepath)));
 
5513
        if (unlink(labelfilepath) != 0)
 
5514
                ereport(ERROR,
 
5515
                                (errcode_for_file_access(),
 
5516
                                 errmsg("could not remove file \"%s\": %m",
 
5517
                                                labelfilepath)));
 
5518
 
 
5519
        /*
 
5520
         * Notify archiver that history file may be archived immediately
 
5521
         */
 
5522
        if (XLogArchivingActive())
 
5523
        {
 
5524
                BackupHistoryFileName(histfilepath, ThisTimeLineID, _logId, _logSeg,
 
5525
                                                          startpoint.xrecoff % XLogSegSize);
 
5526
                XLogArchiveNotify(histfilepath);
 
5527
        }
 
5528
 
 
5529
        /*
 
5530
         * We're done.  As a convenience, return the ending WAL offset.
 
5531
         */
 
5532
        snprintf(stopxlogfilename, sizeof(stopxlogfilename), "%X/%X",
 
5533
                         stoppoint.xlogid, stoppoint.xrecoff);
 
5534
        result = DatumGetTextP(DirectFunctionCall1(textin,
 
5535
                                                                         CStringGetDatum(stopxlogfilename)));
 
5536
        PG_RETURN_TEXT_P(result);
 
5537
}
 
5538
 
 
5539
/*
 
5540
 * read_backup_label: check to see if a backup_label file is present
 
5541
 *
 
5542
 * If we see a backup_label during recovery, we assume that we are recovering
 
5543
 * from a backup dump file, and we therefore roll forward from the checkpoint
 
5544
 * identified by the label file, NOT what pg_control says.      This avoids the
 
5545
 * problem that pg_control might have been archived one or more checkpoints
 
5546
 * later than the start of the dump, and so if we rely on it as the start
 
5547
 * point, we will fail to restore a consistent database state.
 
5548
 *
 
5549
 * We also attempt to retrieve the corresponding backup history file.
 
5550
 * If successful, set recoveryMinXlogOffset to constrain valid PITR stopping
 
5551
 * points.
 
5552
 *
 
5553
 * Returns TRUE if a backup_label was found (and fills the checkpoint
 
5554
 * location into *checkPointLoc); returns FALSE if not.
 
5555
 */
 
5556
static bool
 
5557
read_backup_label(XLogRecPtr *checkPointLoc)
 
5558
{
 
5559
        XLogRecPtr      startpoint;
 
5560
        XLogRecPtr      stoppoint;
 
5561
        char            labelfilepath[MAXPGPATH];
 
5562
        char            histfilename[MAXFNAMELEN];
 
5563
        char            histfilepath[MAXPGPATH];
 
5564
        char            startxlogfilename[MAXFNAMELEN];
 
5565
        char            stopxlogfilename[MAXFNAMELEN];
 
5566
        TimeLineID      tli;
 
5567
        uint32          _logId;
 
5568
        uint32          _logSeg;
 
5569
        FILE       *lfp;
 
5570
        FILE       *fp;
 
5571
        char            ch;
 
5572
 
 
5573
        /*
 
5574
         * See if label file is present
 
5575
         */
 
5576
        snprintf(labelfilepath, MAXPGPATH, "%s/backup_label", DataDir);
 
5577
        lfp = AllocateFile(labelfilepath, "r");
 
5578
        if (!lfp)
 
5579
        {
 
5580
                if (errno != ENOENT)
 
5581
                        ereport(FATAL,
 
5582
                                        (errcode_for_file_access(),
 
5583
                                         errmsg("could not read file \"%s\": %m",
 
5584
                                                        labelfilepath)));
 
5585
                return false;                   /* it's not there, all is fine */
 
5586
        }
 
5587
 
 
5588
        /*
 
5589
         * Read and parse the START WAL LOCATION and CHECKPOINT lines (this
 
5590
         * code is pretty crude, but we are not expecting any variability in
 
5591
         * the file format).
 
5592
         */
 
5593
        if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
 
5594
                           &startpoint.xlogid, &startpoint.xrecoff, &tli,
 
5595
                           startxlogfilename, &ch) != 5 || ch != '\n')
 
5596
                ereport(FATAL,
 
5597
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 
5598
                                 errmsg("invalid data in file \"%s\"", labelfilepath)));
 
5599
        if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
 
5600
                           &checkPointLoc->xlogid, &checkPointLoc->xrecoff,
 
5601
                           &ch) != 3 || ch != '\n')
 
5602
                ereport(FATAL,
 
5603
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 
5604
                                 errmsg("invalid data in file \"%s\"", labelfilepath)));
 
5605
        if (ferror(lfp) || FreeFile(lfp))
 
5606
                ereport(FATAL,
 
5607
                                (errcode_for_file_access(),
 
5608
                                 errmsg("could not read file \"%s\": %m",
 
5609
                                                labelfilepath)));
 
5610
 
 
5611
        /*
 
5612
         * Try to retrieve the backup history file (no error if we can't)
 
5613
         */
 
5614
        XLByteToSeg(startpoint, _logId, _logSeg);
 
5615
        BackupHistoryFileName(histfilename, tli, _logId, _logSeg,
 
5616
                                                  startpoint.xrecoff % XLogSegSize);
 
5617
 
 
5618
        if (InArchiveRecovery)
 
5619
                RestoreArchivedFile(histfilepath, histfilename, "RECOVERYHISTORY", 0);
 
5620
        else
 
5621
                BackupHistoryFilePath(histfilepath, tli, _logId, _logSeg,
 
5622
                                                          startpoint.xrecoff % XLogSegSize);
 
5623
 
 
5624
        fp = AllocateFile(histfilepath, "r");
 
5625
        if (fp)
 
5626
        {
 
5627
                /*
 
5628
                 * Parse history file to identify stop point.
 
5629
                 */
 
5630
                if (fscanf(fp, "START WAL LOCATION: %X/%X (file %24s)%c",
 
5631
                          &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
 
5632
                                   &ch) != 4 || ch != '\n')
 
5633
                        ereport(FATAL,
 
5634
                                        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 
5635
                                   errmsg("invalid data in file \"%s\"", histfilename)));
 
5636
                if (fscanf(fp, "STOP WAL LOCATION: %X/%X (file %24s)%c",
 
5637
                                 &stoppoint.xlogid, &stoppoint.xrecoff, stopxlogfilename,
 
5638
                                   &ch) != 4 || ch != '\n')
 
5639
                        ereport(FATAL,
 
5640
                                        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 
5641
                                   errmsg("invalid data in file \"%s\"", histfilename)));
 
5642
                recoveryMinXlogOffset = stoppoint;
 
5643
                if (ferror(fp) || FreeFile(fp))
 
5644
                        ereport(FATAL,
 
5645
                                        (errcode_for_file_access(),
 
5646
                                         errmsg("could not read file \"%s\": %m",
 
5647
                                                        histfilepath)));
 
5648
        }
 
5649
 
 
5650
        return true;
 
5651
}
 
5652
 
 
5653
/*
 
5654
 * remove_backup_label: remove any extant backup_label after successful
 
5655
 * recovery.  Once we have completed the end-of-recovery checkpoint there
 
5656
 * is no reason to have to replay from the start point indicated by the
 
5657
 * label (and indeed we'll probably have removed/recycled the needed WAL
 
5658
 * segments), so remove the label to prevent trouble in later crash recoveries.
 
5659
 */
 
5660
static void
 
5661
remove_backup_label(void)
 
5662
{
 
5663
        char            labelfilepath[MAXPGPATH];
 
5664
 
 
5665
        snprintf(labelfilepath, MAXPGPATH, "%s/backup_label", DataDir);
 
5666
        if (unlink(labelfilepath) != 0)
 
5667
                if (errno != ENOENT)
 
5668
                        ereport(FATAL,
 
5669
                                        (errcode_for_file_access(),
 
5670
                                         errmsg("could not remove file \"%s\": %m",
 
5671
                                                        labelfilepath)));
 
5672
}