2
* See the file LICENSE for redistribution information.
4
* Copyright (c) 1996-2002
5
* Sleepycat Software. All rights reserved.
8
* Copyright (c) 1995, 1996
9
* The President and Fellows of Harvard University. All rights reserved.
11
* This code is derived from software contributed to Berkeley by
14
* Redistribution and use in source and binary forms, with or without
15
* modification, are permitted provided that the following conditions
17
* 1. Redistributions of source code must retain the above copyright
18
* notice, this list of conditions and the following disclaimer.
19
* 2. Redistributions in binary form must reproduce the above copyright
20
* notice, this list of conditions and the following disclaimer in the
21
* documentation and/or other materials provided with the distribution.
22
* 3. Neither the name of the University nor the names of its contributors
23
* may be used to endorse or promote products derived from this software
24
* without specific prior written permission.
26
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39
#include "db_config.h"
42
static const char revid[] = "$Id$";
45
#ifndef NO_SYSTEM_INCLUDES
46
#include <sys/types.h>
54
#include "dbinc/db_page.h"
55
#include "dbinc/hash.h"
56
#include "dbinc/log.h"
57
#include "dbinc/fop.h"
58
#include "dbinc/rep.h"
59
#include "dbinc/txn.h"
61
static int __db_limbo_fix __P((DB *,
62
DB_TXN *, DB_TXNLIST *, db_pgno_t *, DBMETA *));
63
static int __db_limbo_bucket __P((DB_ENV *, DB_TXN *, DB_TXNLIST *));
64
static int __db_limbo_move __P((DB_ENV *, DB_TXN *, DB_TXN *, DB_TXNLIST *));
65
static int __db_lock_move __P((DB_ENV *,
66
u_int8_t *, db_pgno_t, db_lockmode_t, DB_TXN *, DB_TXN *));
67
static int __db_default_getpgnos __P((DB_ENV *, DB_LSN *lsnp, void *));
68
static int __db_txnlist_find_internal __P((DB_ENV *, void *, db_txnlist_type,
69
u_int32_t, u_int8_t [DB_FILE_ID_LEN], DB_TXNLIST **, int));
70
static int __db_txnlist_pgnoadd __P((DB_ENV *, DB_TXNHEAD *,
71
int32_t, u_int8_t [DB_FILE_ID_LEN], char *, db_pgno_t));
76
* This is the transaction dispatch function used by the db access methods.
77
* It is designed to handle the record format used by all the access
78
* methods (the one automatically generated by the db_{h,log,read}.sh
79
* scripts in the tools directory). An application using a different
80
* recovery paradigm will supply a different dispatch function to txn_open.
82
* PUBLIC: int __db_dispatch __P((DB_ENV *,
83
* PUBLIC: int (**)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)),
84
* PUBLIC: size_t, DBT *, DB_LSN *, db_recops, void *));
87
__db_dispatch(dbenv, dtab, dtabsize, db, lsnp, redo, info)
88
DB_ENV *dbenv; /* The environment. */
89
int (**dtab)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
90
size_t dtabsize; /* Size of the dtab. */
91
DBT *db; /* The log record upon which to dispatch. */
92
DB_LSN *lsnp; /* The lsn of the record being dispatched. */
93
db_recops redo; /* Redo this op (or undo it). */
97
u_int32_t rectype, txnid;
100
memcpy(&rectype, db->data, sizeof(rectype));
101
memcpy(&txnid, (u_int8_t *)db->data + sizeof(rectype), sizeof(txnid));
104
/* If we don't have a dispatch table, it's hard to dispatch. */
105
DB_ASSERT(dtab != NULL);
108
* If we find a record that is in the user's number space and they
109
* have specified a recovery routine, let them handle it. If they
110
* didn't specify a recovery routine, then we expect that they've
111
* followed all our rules and registered new recovery functions.
119
case DB_TXN_OPENFILES:
121
* We collect all the transactions that have
122
* "begin" records, those with no previous LSN,
123
* so that we do not abort partial transactions.
124
* These are known to be undone, otherwise the
125
* log would not have been freeable.
127
memcpy(&prev_lsn, (u_int8_t *)db->data +
128
sizeof(rectype) + sizeof(txnid), sizeof(prev_lsn));
129
if (txnid != 0 && prev_lsn.file == 0 && (ret =
130
__db_txnlist_add(dbenv, info, txnid, TXN_OK, NULL)) != 0)
134
case DB_TXN_POPENFILES:
135
if (rectype == DB___dbreg_register ||
136
rectype == DB___txn_ckp || rectype == DB___txn_recycle)
137
return (dtab[rectype](dbenv, db, lsnp, redo, info));
139
case DB_TXN_BACKWARD_ROLL:
141
* Running full recovery in the backward pass. If we've
142
* seen this txnid before and added it to our commit list,
143
* then we do nothing during this pass, unless this is a child
144
* commit record, in which case we need to process it. If
145
* we've never seen it, then we call the appropriate recovery
148
* We need to always undo DB___db_noop records, so that we
149
* properly handle any aborts before the file was closed.
153
case DB___txn_recycle:
156
case DB___fop_file_remove:
161
case DB___dbreg_register:
168
if (txnid != 0 && (ret =
169
__db_txnlist_find(dbenv,
170
info, txnid)) != TXN_COMMIT && ret != TXN_IGNORE) {
172
* If not found, then this is an incomplete
175
if (ret == TXN_NOTFOUND)
176
return (__db_txnlist_add(dbenv,
177
info, txnid, TXN_IGNORE, lsnp));
180
(ret = __db_txnlist_update(dbenv,
182
rectype == DB___txn_xa_regop ?
183
TXN_PREPARE : TXN_ABORT, NULL)) != 0)
188
case DB_TXN_FORWARD_ROLL:
190
* In the forward pass, if we haven't seen the transaction,
191
* do nothing, else recover it.
193
* We need to always redo DB___db_noop records, so that we
194
* properly handle any commits after the file was closed.
197
case DB___txn_recycle:
204
if (txnid != 0 && (ret = __db_txnlist_find(dbenv,
205
info, txnid)) == TXN_COMMIT)
207
else if (ret != TXN_IGNORE &&
208
(rectype == DB___ham_metagroup ||
209
rectype == DB___ham_groupalloc ||
210
rectype == DB___db_pg_alloc)) {
212
* Because we cannot undo file extensions
213
* all allocation records must be reprocessed
214
* during rollforward in case the file was
215
* just created. It may not have been
216
* present during the backward pass.
219
redo = DB_TXN_BACKWARD_ALLOC;
220
} else if (rectype == DB___dbreg_register) {
222
* This may be a transaction dbreg_register.
223
* If it is, we only make the call on a COMMIT,
224
* which we checked above. If it's not, then we
225
* should always make the call, because we need
226
* the file open information.
233
case DB_TXN_GETPGNOS:
235
* If this is one of DB's own log records, we simply
238
if (rectype < DB_user_BEGIN) {
244
* If we're still here, this is a custom record in an
245
* application that's doing app-specific logging. Such a
246
* record doesn't have a getpgno function for the user
247
* dispatch function to call--the getpgnos functions return
248
* which pages replication needs to lock using the TXN_RECS
249
* structure, which is private and not something we want to
252
* Thus, we leave any necessary locking for the app's
253
* recovery function to do during the upcoming
254
* DB_TXN_APPLY. Fill in default getpgnos info (we need
255
* a stub entry for every log record that will get
256
* DB_TXN_APPLY'd) and return success.
258
return (__db_default_getpgnos(dbenv, lsnp, info));
260
return (__db_unknown_flag(dbenv, "__db_dispatch", redo));
263
* The switch statement uses ret to receive the return value of
264
* __db_txnlist_find, which returns a large number of different
265
* statuses, none of which we will be returning. For safety,
266
* let's reset this here in case we ever do a "return(ret)"
267
* below in the future.
272
if (rectype >= DB_user_BEGIN && dbenv->app_dispatch != NULL)
273
return (dbenv->app_dispatch(dbenv, db, lsnp, redo));
276
* The size of the dtab table argument is the same as
277
* the standard table, use the standard table's size
278
* as our sanity check.
280
if (rectype > dtabsize || dtab[rectype] == NULL) {
282
"Illegal record type %lu in log",
286
return (dtab[rectype](dbenv, db, lsnp, redo, info));
294
* __db_add_recovery --
296
* PUBLIC: int __db_add_recovery __P((DB_ENV *,
297
* PUBLIC: int (***)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), size_t *,
298
* PUBLIC: int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
301
__db_add_recovery(dbenv, dtab, dtabsize, func, ndx)
303
int (***dtab) __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
305
int (*func) __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
311
/* Check if we have to grow the table. */
312
if (ndx >= *dtabsize) {
315
__os_realloc(dbenv, nsize * sizeof((*dtab)[0]), dtab)) != 0)
317
for (i = *dtabsize; i < nsize; ++i)
327
* __db_txnlist_init --
328
* Initialize transaction linked list.
330
* PUBLIC: int __db_txnlist_init __P((DB_ENV *,
331
* PUBLIC: u_int32_t, u_int32_t, DB_LSN *, void *));
334
__db_txnlist_init(dbenv, low_txn, hi_txn, trunc_lsn, retp)
336
u_int32_t low_txn, hi_txn;
346
* If low is zero then we are being called during rollback
347
* and we need only one slot.
348
* Hi may be lower than low if we have recycled txnids.
349
* The numbers here are guesses about txn density, we can afford
350
* to look at a few entries in each slot.
355
if (hi_txn < low_txn) {
360
tmp = hi_txn - low_txn;
361
/* See if we wrapped around. */
362
if (tmp > (TXN_MAXIMUM - TXN_MINIMUM) / 2)
363
tmp = (low_txn - TXN_MINIMUM) + (TXN_MAXIMUM - hi_txn);
368
if ((ret = __os_malloc(dbenv,
369
sizeof(DB_TXNHEAD) + size * sizeof(headp->head), &headp)) != 0)
372
memset(headp, 0, sizeof(DB_TXNHEAD) + size * sizeof(headp->head));
373
headp->maxid = hi_txn;
374
headp->generation = 0;
375
headp->nslots = size;
376
headp->gen_alloc = 8;
377
if ((ret = __os_malloc(dbenv, headp->gen_alloc *
378
sizeof(headp->gen_array[0]), &headp->gen_array)) != 0) {
379
__os_free(dbenv, headp);
382
headp->gen_array[0].generation = 0;
383
headp->gen_array[0].txn_min = TXN_MINIMUM;
384
headp->gen_array[0].txn_max = TXN_MAXIMUM;
385
if (trunc_lsn != NULL)
386
headp->trunc_lsn = *trunc_lsn;
388
ZERO_LSN(headp->trunc_lsn);
389
ZERO_LSN(headp->maxlsn);
390
ZERO_LSN(headp->ckplsn);
392
*(void **)retp = headp;
397
* __db_txnlist_add --
398
* Add an element to our transaction linked list.
400
* PUBLIC: int __db_txnlist_add __P((DB_ENV *,
401
* PUBLIC: void *, u_int32_t, int32_t, DB_LSN *));
404
__db_txnlist_add(dbenv, listp, txnid, status, lsn)
415
if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
418
hp = (DB_TXNHEAD *)listp;
419
LIST_INSERT_HEAD(&hp->head[DB_TXNLIST_MASK(hp, txnid)], elp, links);
421
elp->type = TXNLIST_TXNID;
422
elp->u.t.txnid = txnid;
423
elp->u.t.status = status;
424
elp->u.t.generation = hp->generation;
425
if (txnid > hp->maxid)
427
if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
430
DB_ASSERT(lsn == NULL ||
431
status != TXN_COMMIT || log_compare(&hp->maxlsn, lsn) >= 0);
437
* __db_txnlist_remove --
438
* Remove an element from our transaction linked list.
440
* PUBLIC: int __db_txnlist_remove __P((DB_ENV *, void *, u_int32_t));
443
__db_txnlist_remove(dbenv, listp, txnid)
450
return (__db_txnlist_find_internal(dbenv,
451
listp, TXNLIST_TXNID, txnid,
452
NULL, &entry, 1) == TXN_NOTFOUND ? TXN_NOTFOUND : TXN_OK);
456
* __db_txnlist_ckp --
457
* Used to record the maximum checkpoint that will be retained
458
* after recovery. Typically this is simply the max checkpoint, but
459
* if we are doing client replication recovery or timestamp-based
460
* recovery, we are going to virtually truncate the log and we need
461
* to retain the last checkpoint before the truncation point.
463
* PUBLIC: void __db_txnlist_ckp __P((DB_ENV *, void *, DB_LSN *));
466
__db_txnlist_ckp(dbenv, listp, ckp_lsn)
473
COMPQUIET(dbenv, NULL);
475
hp = (DB_TXNHEAD *)listp;
477
if (IS_ZERO_LSN(hp->ckplsn) && !IS_ZERO_LSN(hp->maxlsn) &&
478
log_compare(&hp->maxlsn, ckp_lsn) >= 0)
479
hp->ckplsn = *ckp_lsn;
483
* __db_txnlist_end --
484
* Discard transaction linked list. Print out any error messages
487
* PUBLIC: void __db_txnlist_end __P((DB_ENV *, void *));
490
__db_txnlist_end(dbenv, listp)
498
if ((hp = (DB_TXNHEAD *)listp) == NULL)
501
for (i = 0; i < hp->nslots; i++)
502
while (hp != NULL && (p = LIST_FIRST(&hp->head[i])) != NULL) {
503
LIST_REMOVE(p, links);
506
__os_free(dbenv, p->u.l.lsn_array);
510
* Possibly an incomplete DB_TXNLIST; just
518
if (hp->gen_array != NULL)
519
__os_free(dbenv, hp->gen_array);
520
__os_free(dbenv, listp);
524
* __db_txnlist_find --
525
* Checks to see if a txnid with the current generation is in the
526
* txnid list. This returns TXN_NOTFOUND if the item isn't in the
527
* list otherwise it returns (like __db_txnlist_find_internal)
528
* the status of the transaction. A txnid of 0 means the record
529
* was generated while not in a transaction.
531
* PUBLIC: int __db_txnlist_find __P((DB_ENV *, void *, u_int32_t));
534
__db_txnlist_find(dbenv, listp, txnid)
542
return (TXN_NOTFOUND);
543
return (__db_txnlist_find_internal(dbenv, listp,
544
TXNLIST_TXNID, txnid, NULL, &entry, 0));
548
* __db_txnlist_update --
549
* Change the status of an existing transaction entry.
550
* Returns TXN_NOTFOUND if no such entry exists.
552
* PUBLIC: int __db_txnlist_update __P((DB_ENV *,
553
* PUBLIC: void *, u_int32_t, u_int32_t, DB_LSN *));
556
__db_txnlist_update(dbenv, listp, txnid, status, lsn)
568
return (TXN_NOTFOUND);
569
hp = (DB_TXNHEAD *)listp;
570
ret = __db_txnlist_find_internal(dbenv,
571
listp, TXNLIST_TXNID, txnid, NULL, &elp, 0);
573
if (ret == TXN_NOTFOUND)
575
elp->u.t.status = status;
577
if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
584
* __db_txnlist_find_internal --
585
* Find an entry on the transaction list. If the entry is not there or
586
* the list pointer is not initialized we return TXN_NOTFOUND. If the
587
* item is found, we return the status. Currently we always call this
588
* with an initialized list pointer but checking for NULL keeps it general.
591
__db_txnlist_find_internal(dbenv, listp, type, txnid, uid, txnlistp, delete)
594
db_txnlist_type type;
596
u_int8_t uid[DB_FILE_ID_LEN];
597
DB_TXNLIST **txnlistp;
604
struct __db_headlink *head;
607
if ((hp = (DB_TXNHEAD *)listp) == NULL)
608
return (TXN_NOTFOUND);
613
/* Find the most recent generation containing this ID */
614
for (i = 0; i <= hp->generation; i++)
615
/* The range may wrap around the end. */
616
if (hp->gen_array[i].txn_min <
617
hp->gen_array[i].txn_max ?
618
(txnid >= hp->gen_array[i].txn_min &&
619
txnid <= hp->gen_array[i].txn_max) :
620
(txnid >= hp->gen_array[i].txn_min ||
621
txnid <= hp->gen_array[i].txn_max))
623
DB_ASSERT(i <= hp->generation);
624
generation = hp->gen_array[i].generation;
627
memcpy(&hash, uid, sizeof(hash));
635
head = &hp->head[DB_TXNLIST_MASK(hp, hash)];
637
for (p = LIST_FIRST(head); p != NULL; p = LIST_NEXT(p, links)) {
642
if (p->u.t.txnid != txnid ||
643
generation != p->u.t.generation)
649
if (memcmp(uid, p->u.p.uid, DB_FILE_ID_LEN) != 0)
659
LIST_REMOVE(p, links);
661
} else if (p != LIST_FIRST(head)) {
662
/* Move it to head of list. */
663
LIST_REMOVE(p, links);
664
LIST_INSERT_HEAD(head, p, links);
670
return (TXN_NOTFOUND);
674
* __db_txnlist_gen --
675
* Change the current generation number.
677
* PUBLIC: int __db_txnlist_gen __P((DB_ENV *,
678
* PUBLIC: void *, int, u_int32_t, u_int32_t));
681
__db_txnlist_gen(dbenv, listp, incr, min, max)
691
* During recovery generation numbers keep track of "restart"
692
* checkpoints and recycle records. Restart checkpoints occur
693
* whenever we take a checkpoint and there are no outstanding
694
* transactions. When that happens, we can reset transaction IDs
695
* back to TXN_MINIMUM. Currently we only do the reset
696
* at the end of recovery. Recycle records occur when txnids
697
* are exhausted during runtime. A free range of ids is identified
698
* and logged. This code maintains a stack of ranges. A txnid
699
* is given the generation number of the first range it falls into
702
hp = (DB_TXNHEAD *)listp;
703
hp->generation += incr;
705
memmove(hp->gen_array, &hp->gen_array[1],
706
(hp->generation + 1) * sizeof(hp->gen_array[0]));
708
if (hp->generation >= hp->gen_alloc) {
710
if ((ret = __os_realloc(dbenv, hp->gen_alloc *
711
sizeof(hp->gen_array[0]), &hp->gen_array)) != 0)
714
memmove(&hp->gen_array[1], &hp->gen_array[0],
715
hp->generation * sizeof(hp->gen_array[0]));
716
hp->gen_array[0].generation = hp->generation;
717
hp->gen_array[0].txn_min = min;
718
hp->gen_array[0].txn_max = max;
723
#define TXN_BUBBLE(AP, MAX) { \
727
for (__j = 0; __j < MAX - 1; __j++) \
728
if (log_compare(&AP[__j], &AP[__j + 1]) < 0) { \
730
AP[__j] = AP[__j + 1]; \
731
AP[__j + 1] = __tmp; \
736
* __db_txnlist_lsnadd --
737
* Add to or re-sort the transaction list lsn entry. Note that since this
738
* is used during an abort, the __txn_undo code calls into the "recovery"
739
* subsystem explicitly, and there is only a single TXNLIST_LSN entry on
742
* PUBLIC: int __db_txnlist_lsnadd __P((DB_ENV *, void *, DB_LSN *, u_int32_t));
745
__db_txnlist_lsnadd(dbenv, listp, lsnp, flags)
755
hp = (DB_TXNHEAD *)listp;
757
for (elp = LIST_FIRST(&hp->head[0]);
758
elp != NULL; elp = LIST_NEXT(elp, links))
759
if (elp->type == TXNLIST_LSN)
763
return (DB_SURPRISE_KID);
765
if (LF_ISSET(TXNLIST_NEW)) {
766
if (elp->u.l.ntxns >= elp->u.l.maxn) {
767
if ((ret = __os_realloc(dbenv,
768
2 * elp->u.l.maxn * sizeof(DB_LSN),
769
&elp->u.l.lsn_array)) != 0)
773
elp->u.l.lsn_array[elp->u.l.ntxns++] = *lsnp;
775
/* Simply replace the 0th element. */
776
elp->u.l.lsn_array[0] = *lsnp;
779
* If we just added a new entry, there may be NULL entries, so we
780
* have to do a complete bubble sort, not just trickle a changed entry
783
for (i = 0; i < (!LF_ISSET(TXNLIST_NEW) ? 1 : elp->u.l.ntxns); i++)
784
TXN_BUBBLE(elp->u.l.lsn_array, elp->u.l.ntxns);
786
*lsnp = elp->u.l.lsn_array[0];
792
* __db_txnlist_lsninit --
793
* Initialize a transaction list with an lsn array entry.
795
* PUBLIC: int __db_txnlist_lsninit __P((DB_ENV *, DB_TXNHEAD *, DB_LSN *));
798
__db_txnlist_lsninit(dbenv, hp, lsnp)
808
if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
810
LIST_INSERT_HEAD(&hp->head[0], elp, links);
812
if ((ret = __os_malloc(dbenv,
813
12 * sizeof(DB_LSN), &elp->u.l.lsn_array)) != 0)
815
elp->type = TXNLIST_LSN;
818
elp->u.l.lsn_array[0] = *lsnp;
822
err: __db_txnlist_end(dbenv, hp);
827
* __db_add_limbo -- add pages to the limbo list.
828
* Get the file information and call pgnoadd for each page.
830
* PUBLIC: int __db_add_limbo __P((DB_ENV *,
831
* PUBLIC: void *, int32_t, db_pgno_t, int32_t));
834
__db_add_limbo(dbenv, info, fileid, pgno, count)
845
dblp = dbenv->lg_handle;
846
if ((ret = __dbreg_id_to_fname(dblp, fileid, 0, &fnp)) != 0)
851
__db_txnlist_pgnoadd(dbenv, info, fileid, fnp->ufid,
852
R_ADDR(&dblp->reginfo, fnp->name_off), pgno)) != 0)
855
} while (--count != 0);
861
* __db_do_the_limbo -- move pages from limbo to free.
863
* Limbo processing is what ensures that we correctly handle and
864
* recover from page allocations. During recovery, for each database,
865
* we process each in-question allocation, link them into the free list
866
* and then write out the new meta-data page that contains the pointer
867
* to the new beginning of the free list. On an abort, we use our
868
* standard __db_free mechanism in a compensating transaction which logs
869
* the specific modifications to the free list.
871
* If we run out of log space during an abort, then we can't write the
872
* compensating transaction, so we abandon the idea of a compensating
873
* transaction, and go back to processing how we do during recovery.
874
* The reason that this is not the norm is that it's expensive: it requires
875
* that we flush any database with an in-question allocation. Thus if
876
* a compensating transaction fails, we never try to restart it.
878
* Since files may be open and closed within transactions (in particular,
879
* the master database for subdatabases), we must be prepared to open
880
* files during this process. If there is a compensating transaction, we
881
* can open the files in that transaction. If this was an abort and there
882
* is no compensating transaction, then we've got to perform these opens
883
* in the context of the aborting transaction so that we do not deadlock.
884
* During recovery, there's no locking, so this isn't an issue.
886
* What you want to keep in mind when reading this is that there are two
887
* algorithms going on here: ctxn == NULL, then we're either in recovery
888
* or our compensating transaction has failed and we're doing the
889
* "create list and write meta-data page" algorithm. Otherwise, we're in
890
* an abort and doing the "use compensating transaction" algorithm.
892
* PUBLIC: int __db_do_the_limbo __P((DB_ENV *,
893
* PUBLIC: DB_TXN *, DB_TXN *, DB_TXNHEAD *));
896
__db_do_the_limbo(dbenv, ptxn, txn, hp)
906
* The slots correspond to hash buckets. We've hashed the
907
* fileids into hash buckets and need to pick up all affected
908
* files. (There will only be a single slot for an abort.)
910
for (h = 0; h < hp->nslots; h++) {
911
if ((elp = LIST_FIRST(&hp->head[h])) == NULL)
915
__db_limbo_move(dbenv, ptxn, txn, elp)) != 0)
917
} else if ((ret = __db_limbo_bucket(dbenv, txn, elp)) != 0)
922
__db_err(dbenv, "Fatal error in abort of an allocation");
923
ret = __db_panic(dbenv, ret);
929
/* Limbo support routines. */
933
* Move a lock from child to parent.
936
__db_lock_move(dbenv, fileid, pgno, mode, ptxn, txn)
945
DB_LOCK_ILOCK lock_obj;
949
lock_obj.pgno = pgno;
950
memcpy(lock_obj.fileid, fileid, DB_FILE_ID_LEN);
951
lock_obj.type = DB_PAGE_LOCK;
953
memset(&lock_dbt, 0, sizeof(lock_dbt));
954
lock_dbt.data = &lock_obj;
955
lock_dbt.size = sizeof(lock_obj);
957
if ((ret = dbenv->lock_get(dbenv,
958
txn->txnid, 0, &lock_dbt, mode, &lock)) == 0) {
959
memset(&req, 0, sizeof(req));
961
req.op = DB_LOCK_TRADE;
963
ret = dbenv->lock_vec(dbenv, ptxn->txnid, 0, &req, 1, NULL);
970
* Move just the metapage lock to the parent.
973
__db_limbo_move(dbenv, ptxn, txn, elp)
980
for (; elp != NULL; elp = LIST_NEXT(elp, links)) {
981
if (elp->type != TXNLIST_PGNO || elp->u.p.locked == 1)
983
if ((ret = __db_lock_move(dbenv, elp->u.p.uid,
984
PGNO_BASE_MD, DB_LOCK_WRITE, ptxn, txn)) != 0)
993
* Perform limbo processing for a single hash bucket in the txnlist.
994
* txn is the transaction aborting in the case of an abort and ctxn is the
995
* compensating transaction.
998
#define T_RESTORED(txn) ((txn) != NULL && F_ISSET(txn, TXN_RESTORED))
1000
__db_limbo_bucket(dbenv, txn, elp)
1009
db_pgno_t last_pgno, pgno;
1010
int dbp_created, in_retry, ret, t_ret;
1017
for (; elp != NULL; elp = LIST_NEXT(elp, links)) {
1018
if (elp->type != TXNLIST_PGNO)
1020
retry: dbp_created = 0;
1023
* Pick the transaction in which to potentially
1024
* log compensations.
1026
if (!in_retry && !IS_RECOVERING(dbenv) && !T_RESTORED(txn)
1027
&& (ret = __txn_compensate_begin(dbenv, &ctxn)) != 0)
1031
* Either use the compensating transaction or
1032
* the one passed in, which will be null if recovering.
1034
t = ctxn == NULL ? txn : ctxn;
1036
/* First try to get a dbp by fileid. */
1037
ret = __dbreg_id_to_db(dbenv, t, &dbp, elp->u.p.fileid, 0);
1040
* File is being destroyed. No need to worry about
1041
* dealing with recovery of allocations.
1043
if (ret == DB_DELETED ||
1044
(ret == 0 && F_ISSET(dbp, DB_AM_DISCARD)))
1048
if ((ret = db_create(&dbp, dbenv, 0)) != 0)
1052
* This tells the system not to lock, which is always
1053
* OK, whether this is an abort or recovery.
1055
F_SET(dbp, DB_AM_COMPENSATE);
1058
/* It is ok if the file is no longer there. */
1059
dbp->type = DB_UNKNOWN;
1060
ret = __db_dbopen(dbp, t, elp->u.p.fname, NULL,
1061
DB_ODDFILESIZE, __db_omode("rw----"), PGNO_BASE_MD);
1067
* Verify that we are opening the same file that we were
1068
* referring to when we wrote this log record.
1070
if (memcmp(elp->u.p.uid, dbp->fileid, DB_FILE_ID_LEN) != 0)
1074
last_pgno = PGNO_INVALID;
1077
pgno = PGNO_BASE_MD;
1079
mpf->get(mpf, &pgno, 0, (PAGE **)&meta)) != 0)
1081
last_pgno = meta->free;
1084
ret = __db_limbo_fix(dbp, ctxn, elp, &last_pgno, meta);
1086
* If we were doing compensating transactions, then we are
1087
* going to hope this error was due to running out of space.
1088
* We'll change modes (into the sync-the-file mode) and keep
1089
* trying. If we weren't doing compensating transactions,
1090
* then this is a real error and we're sunk.
1093
if (ret == DB_RUNRECOVERY || ctxn == NULL)
1100
ret = ctxn->commit(ctxn, DB_TXN_NOSYNC);
1108
* This is where we handle the case where we're explicitly
1109
* putting together a free list. We need to decide whether
1110
* we have to write the meta-data page, and if we do, then
1111
* we need to sync it as well.
1113
if (last_pgno == meta->free) {
1114
/* No change to page; just put the page back. */
1115
if ((ret = mpf->put(mpf, meta, 0)) != 0)
1120
* These changes are unlogged so we cannot have the
1121
* metapage pointing at pages that are not on disk.
1122
* Therefore, we flush the new free list, then update
1123
* the metapage. We have to put the meta-data page
1124
* first so that it isn't pinned when we try to sync.
1126
if (!IS_RECOVERING(dbenv) && !T_RESTORED(txn))
1127
__db_err(dbenv, "Flushing free list to disk");
1128
if ((ret = mpf->put(mpf, meta, 0)) != 0)
1132
pgno = PGNO_BASE_MD;
1134
mpf->get(mpf, &pgno, 0, (PAGE **)&meta)) != 0)
1136
meta->free = last_pgno;
1137
if ((ret = mpf->put(mpf, meta, DB_MPOOL_DIRTY)) != 0)
1144
* If we get here, either we have processed the list
1145
* or the db file has been deleted or could not be opened.
1148
(t_ret = ctxn->abort(ctxn)) != 0 && ret == 0)
1152
(t_ret = __db_close_i(dbp, txn, 0)) != 0 && ret == 0)
1155
__os_free(dbenv, elp->u.p.fname);
1156
__os_free(dbenv, elp->u.p.pgno_array);
1163
err: if (meta != NULL)
1164
(void)mpf->put(mpf, meta, 0);
1170
* Process a single limbo entry which describes all the page allocations
1171
* for a single file.
1174
__db_limbo_fix(dbp, ctxn, elp, lastp, meta)
1183
PAGE *freep, *pagep;
1184
db_pgno_t next, pgno;
1185
int i, put_page, ret, t_ret;
1188
* Loop through the entries for this txnlist element and
1189
* either link them into the free list or write a compensating
1197
for (i = 0; i < elp->u.p.nentries; i++) {
1198
pgno = elp->u.p.pgno_array[i];
1200
if ((ret = mpf->get(mpf, &pgno, DB_MPOOL_CREATE, &pagep)) != 0)
1204
if (IS_ZERO_LSN(LSN(pagep))) {
1207
* If this is a fatal recovery which
1208
* spans a previous crash this page may
1209
* be on the free list already.
1211
for (next = *lastp; next != 0; ) {
1214
if ((ret = mpf->get(mpf,
1215
&next, 0, &freep)) != 0)
1217
next = NEXT_PGNO(freep);
1219
mpf->put(mpf, freep, 0)) != 0)
1224
P_INIT(pagep, dbp->pgsize, pgno,
1225
PGNO_INVALID, *lastp, 0, P_INVALID);
1226
LSN(pagep) = LSN(meta);
1230
P_INIT(pagep, dbp->pgsize, pgno,
1231
PGNO_INVALID, *lastp, 0, P_INVALID);
1232
if (dbc == NULL && (ret =
1233
dbp->cursor(dbp, ctxn, &dbc, 0)) != 0)
1236
* If the dbp is compensating (because we
1237
* opened it), the dbc will automatically be
1238
* marked compensating, but in case we didn't
1239
* do the open, we have to mark it explicitly.
1241
F_SET(dbc, DBC_COMPENSATE);
1242
ret = __db_free(dbc, pagep);
1245
* On any error, we hope that the error was
1246
* caused due to running out of space, and we
1247
* switch modes, doing the processing where we
1248
* sync out files instead of doing compensating
1249
* transactions. If this was a real error and
1250
* not out of space, we assume that some other
1251
* call will fail real soon.
1254
/* Assume that this is out of space. */
1255
(void)dbc->c_close(dbc);
1262
if (put_page == 1) {
1263
ret = mpf->put(mpf, pagep, DB_MPOOL_DIRTY);
1270
err: if (put_page &&
1271
(t_ret = mpf->put(mpf, pagep, DB_MPOOL_DIRTY)) != 0 && ret == 0)
1273
if (dbc != NULL && (t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
1278
#define DB_TXNLIST_MAX_PGNO 8 /* A nice even number. */
1281
* __db_txnlist_pgnoadd --
1282
* Find the txnlist entry for a file and add this pgno, or add the list
1283
* entry for the file and then add the pgno.
1286
__db_txnlist_pgnoadd(dbenv, hp, fileid, uid, fname, pgno)
1290
u_int8_t uid[DB_FILE_ID_LEN];
1300
if (__db_txnlist_find_internal(dbenv, hp,
1301
TXNLIST_PGNO, 0, uid, &elp, 0) != 0) {
1303
__os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
1305
memcpy(&hash, uid, sizeof(hash));
1307
&hp->head[DB_TXNLIST_MASK(hp, hash)], elp, links);
1308
elp->u.p.fileid = fileid;
1309
memcpy(elp->u.p.uid, uid, DB_FILE_ID_LEN);
1311
len = (int)strlen(fname) + 1;
1312
if ((ret = __os_malloc(dbenv, len, &elp->u.p.fname)) != 0)
1314
memcpy(elp->u.p.fname, fname, len);
1316
elp->u.p.maxentry = 0;
1317
elp->u.p.locked = 0;
1318
elp->type = TXNLIST_PGNO;
1319
if ((ret = __os_malloc(dbenv,
1320
8 * sizeof(db_pgno_t), &elp->u.p.pgno_array)) != 0)
1322
elp->u.p.maxentry = DB_TXNLIST_MAX_PGNO;
1323
elp->u.p.nentries = 0;
1324
} else if (elp->u.p.nentries == elp->u.p.maxentry) {
1325
elp->u.p.maxentry <<= 1;
1326
if ((ret = __os_realloc(dbenv, elp->u.p.maxentry *
1327
sizeof(db_pgno_t), &elp->u.p.pgno_array)) != 0)
1331
elp->u.p.pgno_array[elp->u.p.nentries++] = pgno;
1335
err: __db_txnlist_end(dbenv, hp);
1340
* __db_default_getpgnos --
1341
* Fill in default getpgnos information for an application-specific
1345
__db_default_getpgnos(dbenv, lsnp, summary)
1353
t = (TXN_RECS *)summary;
1355
if ((ret = __rep_check_alloc(dbenv, t, 1)) != 0)
1358
t->array[t->npages].flags = LSN_PAGE_NOLOCK;
1359
t->array[t->npages].lsn = *lsnp;
1360
t->array[t->npages].fid = DB_LOGFILEID_INVALID;
1361
memset(&t->array[t->npages].pgdesc, 0,
1362
sizeof(t->array[t->npages].pgdesc));
1371
* __db_txnlist_print --
1372
* Print out the transaction list.
1374
* PUBLIC: void __db_txnlist_print __P((void *));
1377
__db_txnlist_print(listp)
1383
char *stats[] = { "ok", "commit", "prepare", "abort", "notfound",
1384
"ignore", "expected", "unexpected" };
1386
hp = (DB_TXNHEAD *)listp;
1388
printf("Maxid: %lu Generation: %lu\n",
1389
(u_long)hp->maxid, (u_long)hp->generation);
1390
for (i = 0; i < hp->nslots; i++)
1391
for (p = LIST_FIRST(&hp->head[i]); p != NULL; p = LIST_NEXT(p, links)) {
1394
printf("TXNID: %lx(%lu): %s\n",
1395
(u_long)p->u.t.txnid, (u_long)p->u.t.generation,
1396
stats[p->u.t.status]);
1399
printf("Unrecognized type: %d\n", p->type);