2
* See the file LICENSE for redistribution information.
4
* Copyright (c) 1996-2002
5
* Sleepycat Software. All rights reserved.
11
static const char copyright[] =
12
"Copyright (c) 1996-2002\nSleepycat Software Inc. All rights reserved.\n";
13
static const char revid[] =
17
#ifndef NO_SYSTEM_INCLUDES
18
#include <sys/types.h>
20
#if TIME_WITH_SYS_TIME
35
#include "dbinc/db_page.h"
36
#include "dbinc/db_shash.h"
37
#include "dbinc/lock.h"
38
#include "dbinc/log.h"
39
#include "dbinc/rep.h"
40
#include "dbinc/txn.h"
41
#include "dbinc/db_am.h"
43
static int __log_backup __P((DB_ENV *, DB_LOGC *, DB_LSN *, DB_LSN *));
44
static int __log_earliest __P((DB_ENV *, DB_LOGC *, int32_t *, DB_LSN *));
45
static double __lsn_diff __P((DB_LSN *, DB_LSN *, DB_LSN *, u_int32_t, int));
49
* Perform recovery. If max_lsn is non-NULL, then we are trying
50
* to synchronize this system up with another system that has a max
51
* LSN of max_lsn, so we need to roll back sufficiently far for that
52
* to work. See __log_backup for details.
54
* PUBLIC: int __db_apprec __P((DB_ENV *, DB_LSN *, u_int32_t));
57
__db_apprec(dbenv, max_lsn, flags)
64
DB_LSN ckp_lsn, first_lsn, last_lsn, lowlsn, lsn, stop_lsn;
66
__txn_ckp_args *ckp_args;
68
int32_t log_size, low;
70
int have_rec, is_thread, progress, ret, t_ret;
71
int (**dtab) __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
73
u_int32_t hi_txn, lockid, txnid;
74
char *p, *pass, t1[60], t2[60];
77
COMPQUIET(nfiles, (double)0);
83
lockid = DB_LOCK_INVALIDID;
89
* Get the log size. No locking required because we're single-threaded
93
((LOG *)(((DB_LOG *)dbenv->lg_handle)->reginfo.primary))->log_size;
96
* Save the state of the thread flag -- we don't need it on at the
97
* moment because we're single-threaded until recovery is complete.
99
is_thread = F_ISSET(dbenv, DB_ENV_THREAD) ? 1 : 0;
100
F_CLR(dbenv, DB_ENV_THREAD);
102
/* Set in-recovery flags. */
103
F_SET((DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER);
104
region = ((DB_TXNMGR *)dbenv->tx_handle)->reginfo.primary;
105
F_SET(region, TXN_IN_RECOVERY);
107
/* Allocate a cursor for the log. */
108
if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0)
112
* If the user is specifying recovery to a particular point in time
113
* or to a particular LSN, find the point to start recovery from.
116
if (max_lsn != NULL) {
117
if ((ret = __log_backup(dbenv, logc, max_lsn, &lowlsn)) != 0)
119
} else if (dbenv->tx_timestamp != 0) {
120
if ((ret = __log_earliest(dbenv, logc, &low, &lowlsn)) != 0)
122
if ((int32_t)dbenv->tx_timestamp < low) {
123
(void)snprintf(t1, sizeof(t1),
124
"%s", ctime(&dbenv->tx_timestamp));
125
if ((p = strchr(t1, '\n')) != NULL)
128
(void)snprintf(t2, sizeof(t2), "%s", ctime(&tlow));
129
if ((p = strchr(t2, '\n')) != NULL)
132
"Invalid recovery timestamp %s; earliest time is %s",
140
* Recovery is done in three passes:
142
* We need to find the position from which we will open files.
143
* We need to open files beginning with the earlier of the
144
* most recent checkpoint LSN and a checkpoint LSN before the
145
* recovery timestamp, if specified. We need to be before the
146
* most recent checkpoint LSN because we are going to collect
147
* information about which transactions were begun before we
148
* start rolling forward. Those that were should never be undone
149
* because queue cannot use LSNs to determine what operations can
150
* safely be aborted and it cannot rollback operations in
151
* transactions for which there may be records not processed
152
* during recovery. We need to consider earlier points in time
153
* in case we are recovering to a particular timestamp.
156
* Read forward through the log from the position found in pass 0
157
* opening and closing files, and recording transactions for which
158
* we've seen their first record (the transaction's prev_lsn is
159
* 0,0). At the end of this pass, we know all transactions for
160
* which we've seen begins and we have the "current" set of files
164
* Read backward through the log undoing any uncompleted TXNs.
165
* There are four cases:
166
* 1. If doing catastrophic recovery, we read to the
167
* beginning of the log
168
* 2. If we are doing normal recovery, then we have to roll
169
* back to the most recent checkpoint LSN.
170
* 3. If we are recovering to a point in time, then we have
171
* to roll back to the checkpoint whose ckp_lsn is earlier
172
* than the specified time. __log_earliest will figure
174
* 4. If we are recovering back to a particular LSN, then
175
* we have to roll back to the checkpoint whose ckp_lsn
176
* is earlier than the max_lsn. __log_backup will figure
178
* In case 3, "uncompleted TXNs" include all those who committed
179
* after the user's specified timestamp.
182
* Read forward through the log from the LSN found in pass #2,
183
* redoing any committed TXNs (which committed after any user-
184
* specified rollback point). During this pass, checkpoint
185
* file information is ignored, and file openings and closings
188
* ckp_lsn -- lsn of the last checkpoint or the first in the log.
189
* first_lsn -- the lsn where the forward passes begin.
190
* last_lsn -- the last lsn in the log, used for feedback
191
* lowlsn -- the lsn we are rolling back to, if we are recovering
192
* to a point in time.
193
* lsn -- temporary use lsn.
194
* stop_lsn -- the point at which forward roll should stop
198
* Find out the last lsn, so that we can estimate how far along we
199
* are in recovery. This will help us determine how much log there
200
* is between the first LSN that we're going to be working with and
201
* the last one. We assume that each of the three phases takes the
202
* same amount of time (a false assumption) and then use the %-age
203
* of the amount of log traversed to figure out how much of the
204
* pass we've accomplished.
206
* If we can't find any log records, we're kind of done.
211
memset(&data, 0, sizeof(data));
212
if ((ret = logc->get(logc, &last_lsn, &data, DB_LAST)) != 0) {
213
if (ret == DB_NOTFOUND)
216
__db_err(dbenv, "Last log record not found");
221
/* txnid is after rectype, which is a u_int32. */
223
(u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid));
227
} while ((ret = logc->get(logc, &lsn, &data, DB_PREV)) == 0);
230
* There are no transactions, so there is nothing to do unless
231
* we're recovering to an LSN. If we are, we need to proceed since
232
* we'll still need to do a vtruncate based on information we haven't
235
if (ret == DB_NOTFOUND) {
247
* Find the LSN from which we begin OPENFILES.
249
* If this is a catastrophic recovery, or if no checkpoint exists
250
* in the log, the LSN is the first LSN in the log.
252
* Otherwise, it is the minimum of (1) the LSN in the last checkpoint
253
* and (2) the LSN in the checkpoint before any specified recovery
254
* timestamp or max_lsn.
257
* Get the first LSN in the log; it's an initial default
258
* even if this is not a catastrophic recovery.
260
if ((ret = logc->get(logc, &ckp_lsn, &data, DB_FIRST)) != 0) {
261
if (ret == DB_NOTFOUND)
264
__db_err(dbenv, "First log record not found");
270
if (!LF_ISSET(DB_RECOVER_FATAL)) {
271
if ((ret = __txn_getckp(dbenv, &ckp_lsn)) == 0 &&
272
(ret = logc->get(logc, &ckp_lsn, &data, DB_SET)) == 0) {
273
/* We have a recent checkpoint. This is LSN (1). */
274
if ((ret = __txn_ckp_read(dbenv,
275
data.data, &ckp_args)) != 0) {
277
"Invalid checkpoint record at [%ld][%ld]",
278
(u_long)ckp_lsn.file,
279
(u_long)ckp_lsn.offset);
282
first_lsn = ckp_args->ckp_lsn;
287
* If LSN (2) exists, use it if it's before LSN (1).
288
* (If LSN (1) doesn't exist, first_lsn is the
289
* beginning of the log, so will "win" this check.)
292
* In the recovery-to-a-timestamp case, lowlsn is chosen by
293
* __log_earliest, and is the checkpoint LSN of the
294
* *earliest* checkpoint in the unreclaimed log. I
295
* (krinsky) believe that we could optimize this by looking
296
* instead for the LSN of the *latest* checkpoint before
297
* the timestamp of interest, but I'm not sure that this
298
* is worth doing right now. (We have to look for lowlsn
299
* and low anyway, to make sure the requested timestamp is
300
* somewhere in the logs we have, and all that's required
301
* is that we pick *some* checkpoint after the beginning of
302
* the logs and before the timestamp.)
304
if ((dbenv->tx_timestamp != 0 || max_lsn != NULL) &&
305
log_compare(&lowlsn, &first_lsn) < 0) {
306
DB_ASSERT(have_rec == 0);
311
/* Get the record at first_lsn if we don't have it already. */
313
(ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0) {
314
__db_err(dbenv, "Checkpoint LSN record [%ld][%ld] not found",
315
(u_long)first_lsn.file, (u_long)first_lsn.offset);
319
if (dbenv->db_feedback != NULL) {
320
if (last_lsn.file == first_lsn.file)
322
(last_lsn.offset - first_lsn.offset) / log_size;
324
nfiles = (double)(last_lsn.file - first_lsn.file) +
325
(double)(log_size - first_lsn.offset +
326
last_lsn.offset) / log_size;
327
/* We are going to divide by nfiles; make sure it isn't 0. */
329
nfiles = (double)0.001;
332
/* Find a low txnid. */
335
/* txnid is after rectype, which is a u_int32. */
337
(u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid));
341
} while ((ret = logc->get(logc, &lsn, &data, DB_NEXT)) == 0);
344
* There are no transactions and we're not recovering to an LSN (see
345
* above), so there is nothing to do.
347
if (ret == DB_NOTFOUND) {
353
/* Reset to the first lsn. */
354
if (ret != 0 || (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0)
357
/* Initialize the transaction list. */
359
__db_txnlist_init(dbenv, txnid, hi_txn, max_lsn, &txninfo)) != 0)
364
* Run forward through the log starting at the first relevant lsn.
366
if ((ret = __env_openfiles(dbenv, logc,
367
txninfo, &data, &first_lsn, &last_lsn, nfiles, 1)) != 0)
373
* We used first_lsn to tell us how far back we need to recover,
377
if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
378
__db_err(dbenv, "Recovery starting from [%lu][%lu]",
379
(u_long)first_lsn.file, (u_long)first_lsn.offset);
382
* If we are doing client recovery, then we need to allocate
383
* the page-info lock table.
385
if (max_lsn != NULL) {
386
if ((ret = __rep_lockpgno_init(dbenv, &dtab, &dtabsize)) != 0)
388
if ((ret = dbenv->lock_id(dbenv, &lockid)) != 0)
393
for (ret = logc->get(logc, &lsn, &data, DB_LAST);
394
ret == 0 && log_compare(&lsn, &first_lsn) >= 0;
395
ret = logc->get(logc, &lsn, &data, DB_PREV)) {
396
if (dbenv->db_feedback != NULL) {
397
progress = 34 + (int)(33 * (__lsn_diff(&first_lsn,
398
&last_lsn, &lsn, log_size, 0) / nfiles));
399
dbenv->db_feedback(dbenv, DB_RECOVER, progress);
401
if (max_lsn != NULL && (ret = __rep_lockpages(dbenv,
402
dtab, dtabsize, &lsn, NULL, NULL, lockid)) != 0)
405
ret = __db_dispatch(dbenv, dbenv->recover_dtab,
406
dbenv->recover_dtab_size, &data, &lsn,
407
DB_TXN_BACKWARD_ROLL, txninfo);
409
if (ret != DB_TXN_CKP)
415
if (ret != 0 && ret != DB_NOTFOUND)
419
* Pass #3. If we are recovering to a timestamp or to an LSN,
420
* we need to make sure that we don't roll-forward beyond that
421
* point because there may be non-transactional operations (e.g.,
422
* closes that would fail). The last_lsn variable is used for
423
* feedback calculations, but use it to set an initial stopping
424
* point for the forward pass, and then reset appropriately to
425
* derive a real stop_lsn that tells how far the forward pass
430
if (max_lsn != NULL || dbenv->tx_timestamp != 0)
431
stop_lsn = ((DB_TXNHEAD *)txninfo)->maxlsn;
433
for (ret = logc->get(logc, &lsn, &data, DB_NEXT);
434
ret == 0; ret = logc->get(logc, &lsn, &data, DB_NEXT)) {
436
* If we are recovering to a timestamp or an LSN,
437
* we need to make sure that we don't try to roll
438
* forward beyond the soon-to-be end of log.
440
if (log_compare(&lsn, &stop_lsn) > 0)
443
if (dbenv->db_feedback != NULL) {
444
progress = 67 + (int)(33 * (__lsn_diff(&first_lsn,
445
&last_lsn, &lsn, log_size, 1) / nfiles));
446
dbenv->db_feedback(dbenv, DB_RECOVER, progress);
448
ret = __db_dispatch(dbenv, dbenv->recover_dtab,
449
dbenv->recover_dtab_size, &data, &lsn,
450
DB_TXN_FORWARD_ROLL, txninfo);
452
if (ret != DB_TXN_CKP)
459
if (ret != 0 && ret != DB_NOTFOUND)
463
* Process any pages that were on the limbo list and move them to
464
* the free list. Do this before checkpointing the database.
466
if ((ret = __db_do_the_limbo(dbenv, NULL, NULL, txninfo)) != 0)
470
region->last_txnid = ((DB_TXNHEAD *)txninfo)->maxid;
472
/* Take a checkpoint here to force any dirty data pages to disk. */
473
if (dbenv->tx_timestamp != 0) {
474
region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn;
475
__log_vtruncate(dbenv, &((DB_TXNHEAD *)txninfo)->maxlsn,
476
&((DB_TXNHEAD *)txninfo)->ckplsn);
479
if ((ret = dbenv->txn_checkpoint(dbenv, 0, 0, DB_FORCE)) != 0)
482
/* Close all the db files that are open. */
483
if ((ret = __dbreg_close_files(dbenv)) != 0)
486
if (max_lsn != NULL) {
487
region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn;
489
/* We are going to truncate, so we'd best close the cursor. */
490
if (logc != NULL && (ret = logc->close(logc, 0)) != 0)
492
__log_vtruncate(dbenv,
493
max_lsn, &((DB_TXNHEAD *)txninfo)->ckplsn);
496
* Now we need to open files that should be open in order for
497
* client processing to continue. However, since we've
498
* truncated the log, we need to recompute from where the
499
* openfiles pass should begin.
501
if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0)
503
if ((ret = logc->get(logc, &first_lsn, &data, DB_FIRST)) != 0) {
504
if (ret == DB_NOTFOUND)
507
__db_err(dbenv, "First log record not found");
510
if ((ret = __txn_getckp(dbenv, &first_lsn)) == 0 &&
511
(ret = logc->get(logc, &first_lsn, &data, DB_SET)) == 0) {
512
/* We have a recent checkpoint. This is LSN (1). */
513
if ((ret = __txn_ckp_read(dbenv,
514
data.data, &ckp_args)) != 0) {
516
"Invalid checkpoint record at [%ld][%ld]",
517
(u_long)first_lsn.file,
518
(u_long)first_lsn.offset);
521
first_lsn = ckp_args->ckp_lsn;
523
if ((ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0)
525
if ((ret = __env_openfiles(dbenv, logc,
526
txninfo, &data, &first_lsn, NULL, nfiles, 1)) != 0)
528
} else if (region->stat.st_nrestores == 0)
530
* If there are no prepared transactions that need resolution,
531
* we need to reset the transaction ID space and log this fact.
533
if ((ret = __txn_reset(dbenv)) != 0)
536
if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY)) {
538
__db_err(dbenv, "Recovery complete at %.24s", ctime(&now));
539
__db_err(dbenv, "%s %lx %s [%lu][%lu]",
540
"Maximum transaction ID",
541
((DB_TXNHEAD *)txninfo)->maxid,
542
"Recovery checkpoint",
543
(u_long)region->last_ckp.file,
544
(u_long)region->last_ckp.offset);
548
msgerr: __db_err(dbenv,
549
"Recovery function for LSN %lu %lu failed on %s pass",
550
(u_long)lsn.file, (u_long)lsn.offset, pass);
554
err: if (lockid != DB_LOCK_INVALIDID) {
555
if ((t_ret = __rep_unlockpages(dbenv, lockid)) != 0 && ret == 0)
559
dbenv->lock_id_free(dbenv, lockid)) != 0 && ret == 0)
563
if (logc != NULL && (t_ret = logc->close(logc, 0)) != 0 && ret == 0)
567
__db_txnlist_end(dbenv, txninfo);
570
__os_free(dbenv, dtab);
572
if (ckp_args != NULL)
573
__os_free(dbenv, ckp_args);
575
dbenv->tx_timestamp = 0;
577
/* Restore the state of the thread flag, clear in-recovery flags. */
579
F_SET(dbenv, DB_ENV_THREAD);
580
F_CLR((DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER);
581
F_CLR(region, TXN_IN_RECOVERY);
587
* Figure out how many logfiles we have processed. If we are moving
588
* forward (is_forward != 0), then we're computing current - low. If
589
* we are moving backward, we are computing high - current. max is
590
* the number of bytes per logfile.
593
__lsn_diff(low, high, current, max, is_forward)
594
DB_LSN *low, *high, *current;
601
* There are three cases in each direction. If you are in the
602
* same file, then all you need worry about is the difference in
603
* offsets. If you are in different files, then your offsets
604
* put you either more or less than the integral difference in the
605
* number of files -- we need to handle both of these.
608
if (current->file == low->file)
609
nf = (double)(current->offset - low->offset) / max;
610
else if (current->offset < low->offset)
611
nf = (double)(current->file - low->file - 1) +
612
(double)(max - low->offset + current->offset) / max;
614
nf = (double)(current->file - low->file) +
615
(double)(current->offset - low->offset) / max;
617
if (current->file == high->file)
618
nf = (double)(high->offset - current->offset) / max;
619
else if (current->offset > high->offset)
620
nf = (double)(high->file - current->file - 1) +
622
(max - current->offset + high->offset) / max;
624
nf = (double)(high->file - current->file) +
625
(double)(high->offset - current->offset) / max;
633
* This is used to find the earliest log record to process when a client
634
* is trying to sync up with a master whose max LSN is less than this
635
* client's max lsn; we want to roll back everything after that
637
* Find the latest checkpoint whose ckp_lsn is less than the max lsn.
640
__log_backup(dbenv, logc, max_lsn, start_lsn)
643
DB_LSN *max_lsn, *start_lsn;
647
__txn_ckp_args *ckp_args;
650
memset(&data, 0, sizeof(data));
654
* Follow checkpoints through the log until we find one with
655
* a ckp_lsn less than max_lsn.
657
if ((ret = __txn_getckp(dbenv, &lsn)) != 0)
659
while ((ret = logc->get(logc, &lsn, &data, DB_SET)) == 0) {
660
if ((ret = __txn_ckp_read(dbenv, data.data, &ckp_args)) != 0)
662
if (log_compare(&ckp_args->ckp_lsn, max_lsn) <= 0) {
663
*start_lsn = ckp_args->ckp_lsn;
667
lsn = ckp_args->prev_lsn;
668
if (IS_ZERO_LSN(lsn))
670
__os_free(dbenv, ckp_args);
673
if (ckp_args != NULL)
674
__os_free(dbenv, ckp_args);
675
err: if (IS_ZERO_LSN(*start_lsn) && (ret == 0 || ret == DB_NOTFOUND))
676
ret = logc->get(logc, start_lsn, &data, DB_FIRST);
683
* Return the earliest recovery point for the log files present. The
684
* earliest recovery time is the time stamp of the first checkpoint record
685
* whose checkpoint LSN is greater than the first LSN we process.
688
__log_earliest(dbenv, logc, lowtime, lowlsn)
694
DB_LSN first_lsn, lsn;
696
__txn_ckp_args *ckpargs;
700
memset(&data, 0, sizeof(data));
702
* Read forward through the log looking for the first checkpoint
703
* record whose ckp_lsn is greater than first_lsn.
706
for (ret = logc->get(logc, &first_lsn, &data, DB_FIRST);
707
ret == 0; ret = logc->get(logc, &lsn, &data, DB_NEXT)) {
708
memcpy(&rectype, data.data, sizeof(rectype));
709
if (rectype != DB___txn_ckp)
711
if ((ret = __txn_ckp_read(dbenv, data.data, &ckpargs)) == 0) {
712
cmp = log_compare(&ckpargs->ckp_lsn, &first_lsn);
713
*lowlsn = ckpargs->ckp_lsn;
714
*lowtime = ckpargs->timestamp;
716
__os_free(dbenv, ckpargs);
727
* Perform the pass of recovery that opens files. This is used
728
* both during regular recovery and an initial call to txn_recover (since
729
* we need files open in order to abort prepared, but not yet committed
732
* See the comments in db_apprec for a detailed description of the
733
* various recovery passes.
735
* If we are not doing feedback processing (i.e., we are doing txn_recover
736
* processing and in_recovery is zero), then last_lsn can be NULL.
738
* PUBLIC: int __env_openfiles __P((DB_ENV *, DB_LOGC *,
739
* PUBLIC: void *, DBT *, DB_LSN *, DB_LSN *, double, int));
742
__env_openfiles(dbenv, logc, txninfo,
743
data, open_lsn, last_lsn, nfiles, in_recovery)
748
DB_LSN *open_lsn, *last_lsn;
758
* Get the log size. No locking required because we're single-threaded
762
((LOG *)(((DB_LOG *)dbenv->lg_handle)->reginfo.primary))->log_size;
766
if (in_recovery && dbenv->db_feedback != NULL) {
767
DB_ASSERT(last_lsn != NULL);
768
progress = (int)(33 * (__lsn_diff(open_lsn,
769
last_lsn, &lsn, log_size, 1) / nfiles));
770
dbenv->db_feedback(dbenv, DB_RECOVER, progress);
772
ret = __db_dispatch(dbenv,
773
dbenv->recover_dtab, dbenv->recover_dtab_size, data, &lsn,
774
in_recovery ? DB_TXN_OPENFILES : DB_TXN_POPENFILES,
776
if (ret != 0 && ret != DB_TXN_CKP) {
778
"Recovery function for LSN %lu %lu failed",
779
(u_long)lsn.file, (u_long)lsn.offset);
782
if ((ret = logc->get(logc, &lsn, data, DB_NEXT)) != 0) {
783
if (ret == DB_NOTFOUND)