* Read a log block, make sure it's valid, and byteswap it if necessary.
* Read a log block and make sure it's valid.
zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
uint32_t aflags = ARC_WAIT;
arc_buf_t *abuf = NULL;
uint32_t aflags = ARC_WAIT;
zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
* We shouldn't be doing any scrubbing while we're doing log
* replay, it's OK to not lock.
error = arc_read_nolock(NULL, zilog->zl_spa, &blk,
arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);
if (zilog->zl_header->zh_claim_txg == 0)
zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
zio_flags |= ZIO_FLAG_SPECULATIVE;
SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
if (error == 0) {
char *data = (*abufpp)->b_data;
uint64_t blksz = BP_GET_LSIZE(bp);
zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
zio_cksum_t cksum = bp->blk_cksum;
cksum.zc_word[ZIL_ZC_SEQ]++;
if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum,
sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) ||
(ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))) {
VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid);
if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t *zilc = abuf->b_data;
char *lr = (char *)(zilc + 1);
uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
*end = (char *)dst + len;
*nbp = zilc->zc_next_blk;
char *lr = abuf->b_data;
uint64_t size = BP_GET_LSIZE(bp);
zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
(zilc->zc_nused > (size - sizeof (*zilc)))) {
bcopy(lr, dst, zilc->zc_nused);
*end = (char *)dst + zilc->zc_nused;
*nbp = zilc->zc_next_blk;
VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
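/*
 * Illustrative sketch (not part of zil.c): how a reader decides whether the
 * log chain continues.  Each log block embeds the block pointer of its
 * successor, and the successor's expected checksum is simply this block's
 * checksum with the ZIL_ZC_SEQ word incremented, so a stale or never-written
 * next block is detected before it is ever read.  The helper name below is
 * hypothetical.
 */
static boolean_t
zil_chain_continues(const blkptr_t *bp, const blkptr_t *next_bp)
{
	zio_cksum_t expect = bp->blk_cksum;

	expect.zc_word[ZIL_ZC_SEQ]++;	/* the sequence advances by one per block */

	if (BP_IS_HOLE(next_bp))
		return (B_FALSE);	/* the writer never allocated a successor */

	return (bcmp(&expect, &next_bp->blk_cksum, sizeof (expect)) == 0);
}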
* Read a TX_WRITE log data block.
zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
const blkptr_t *bp = &lr->lr_blkptr;
uint32_t aflags = ARC_WAIT;
arc_buf_t *abuf = NULL;
if (BP_IS_HOLE(bp)) {
bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
if (zilog->zl_header->zh_claim_txg == 0)
zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
(void) arc_buf_remove_ref(abuf, &abuf);
* Parse the intent log, and call parse_func for each valid record within.
* Return the highest sequence number.
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
const zil_header_t *zh = zilog->zl_header;
uint64_t claim_seq = zh->zh_claim_seq;
uint64_t max_seq = 0;
blkptr_t blk = zh->zh_log;
boolean_t claimed = !!zh->zh_claim_txg;
uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
uint64_t max_blk_seq = 0;
uint64_t max_lr_seq = 0;
uint64_t blk_count = 0;
uint64_t lr_count = 0;
blkptr_t blk, next_blk;
char *lrbuf, *lrp;
if (BP_IS_HOLE(&blk))
* Old logs didn't record the maximum zh_claim_lr_seq.
if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
claim_lr_seq = UINT64_MAX;
* Starting at the block pointed to by zh_log we read the log chain.
* If the log has been claimed, stop if we encounter a sequence
* number greater than the highest claimed sequence number.
zil_dva_tree_init(&zilog->zl_dva_tree);
seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
if (claim_seq != 0 && seq > claim_seq)
ASSERT(max_seq < seq);
error = zil_read_log_block(zilog, &blk, &abuf);
if (parse_blk_func != NULL)
parse_blk_func(zilog, &blk, arg, txg);
lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
zil_bp_tree_init(zilog);
for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
if (blk_seq > claim_blk_seq)
if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
ASSERT3U(max_blk_seq, <, blk_seq);
max_blk_seq = blk_seq;
if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
lrbuf = abuf->b_data;
ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
blk = ztp->zit_next_blk;
if (parse_lr_func == NULL) {
VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
for (lrp = lrbuf; lrp < end; lrp += reclen) {
lr_t *lr = (lr_t *)lrp;
reclen = lr->lrc_reclen;
ASSERT3U(reclen, >=, sizeof (lr_t));
parse_lr_func(zilog, lr, arg, txg);
if (lr->lrc_seq > claim_lr_seq)
if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
ASSERT3U(max_lr_seq, <, lr->lrc_seq);
max_lr_seq = lr->lrc_seq;
VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
zil_dva_tree_fini(&zilog->zl_dva_tree);
zilog->zl_parse_error = error;
zilog->zl_parse_blk_seq = max_blk_seq;
zilog->zl_parse_lr_seq = max_lr_seq;
zilog->zl_parse_blk_count = blk_count;
zilog->zl_parse_lr_count = lr_count;
ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
(max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
zil_bp_tree_fini(zilog);
zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
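/*
 * Illustrative callback pair for zil_parse() (not part of zil.c): in the
 * updated interface each callback returns 0 to keep walking and a nonzero
 * errno to terminate the walk, which zil_parse() then records in
 * zl_parse_error.  The names and the counter structure are hypothetical.
 */
typedef struct zil_count {
	uint64_t zc_blks;
	uint64_t zc_lrs;
} zil_count_t;

static int
zil_count_blk(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t txg)
{
	((zil_count_t *)arg)->zc_blks++;	/* one log block visited */
	return (0);
}

static int
zil_count_lr(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t txg)
{
	((zil_count_t *)arg)->zc_lrs++;		/* one log record visited */
	return (0);
}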
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
spa_t *spa = zilog->zl_spa;
* Claim log block if not already committed and not already claimed.
* If tx == NULL, just verify that the block is claimable.
if (bp->blk_birth >= first_txg &&
zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL,
ZIO_FLAG_MUSTSUCCEED));
if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0)
return (zio_wait(zio_claim(NULL, zilog->zl_spa,
tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
if (lrc->lrc_txtype == TX_WRITE) {
lr_write_t *lr = (lr_write_t *)lrc;
zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg);
lr_write_t *lr = (lr_write_t *)lrc;
if (lrc->lrc_txtype != TX_WRITE)
* If the block is not readable, don't claim it. This can happen
* in normal operation when a log block is written to disk before
* some of the dmu_sync() blocks it points to. In this case, the
* transaction cannot have been committed to anyone (we would have
* waited for all writes to be stable first), so it is semantically
* correct to declare this the end of the log.
if (lr->lr_blkptr.blk_birth >= first_txg &&
(error = zil_read_log_data(zilog, lr, NULL)) != 0)
return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
lr_write_t *lr = (lr_write_t *)lrc;
blkptr_t *bp = &lr->lr_blkptr;
* If we previously claimed it, we need to free it.
if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) {
lr_write_t *lr = (lr_write_t *)lrc;
blkptr_t *bp = &lr->lr_blkptr;
if (bp->blk_birth >= claim_txg &&
!zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) {
(void) arc_free(NULL, zilog->zl_spa,
dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT);
if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0)
zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
lwb->lwb_zilog = zilog;
lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
lwb->lwb_max_txg = txg;
if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
lwb->lwb_nused = sizeof (zil_chain_t);
lwb->lwb_sz = BP_GET_LSIZE(bp);
lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
mutex_enter(&zilog->zl_lock);
list_insert_tail(&zilog->zl_lwb_list, lwb);
mutex_exit(&zilog->zl_lock);
* Create an on-disk intent log.
zil_create(zilog_t *zilog)
const zil_header_t *zh = zilog->zl_header;
uint64_t txg = 0;
dmu_tx_t *tx = NULL;
list_remove(&zilog->zl_lwb_list, lwb);
if (lwb->lwb_buf != NULL)
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
kmem_cache_free(zil_lwb_cache, lwb);
(void) zil_parse(zilog, zil_free_log_block,
zil_free_log_record, tx, zh->zh_claim_txg);
} else if (!keep_first) {
(void) zil_parse(zilog, zil_free_log_block,
zil_free_log_record, tx, zh->zh_claim_txg);
mutex_exit(&zilog->zl_lock);
dmu_tx_commit(tx);
* return true if the initial log block is not valid
zil_empty(zilog_t *zilog)
const zil_header_t *zh = zilog->zl_header;
arc_buf_t *abuf = NULL;
if (BP_IS_HOLE(&zh->zh_log))
if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
zil_claim(char *osname, void *txarg)
zil_claim(const char *osname, void *txarg)
dmu_tx_t *tx = txarg;
uint64_t first_txg = dmu_tx_get_txg(tx);
* Checksum errors are ok as they indicate the end of the chain.
* Any other error (no device or read failure) returns an error.
zil_check_log_chain(char *osname, void *txarg)
zil_check_log_chain(const char *osname, void *tx)
error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
error = dmu_objset_hold(osname, FTAG, &os);
cmn_err(CE_WARN, "can't open objset for %s", osname);
zilog = dmu_objset_zil(os);
zh = zil_header_in_syncing_context(zilog);
if (BP_IS_HOLE(&blk)) {
dmu_objset_close(os);
return (0); /* no chain */
error = zil_read_log_block(zilog, &blk, &abuf);
lrbuf = abuf->b_data;
ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
blk = ztp->zit_next_blk;
VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
dmu_objset_close(os);
return (0); /* normal end of chain */
* Because tx == NULL, zil_claim_log_block() will not actually claim
* any blocks, but just determine whether it is possible to do so.
* In addition to checking the log chain, zil_claim_log_block()
* will invoke zio_claim() with a done func of spa_claim_notify(),
* which will update spa_max_claim_txg. See spa_load() for details.
error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
dmu_objset_rele(os, FTAG);
return ((error == ECKSUM || error == ENOENT) ? 0 : error);
if (lwb->lwb_zio == NULL) {
lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz,
0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb);
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
* Define a limited set of intent log block sizes.
* These must be a multiple of 4KB. Note only the amount used (again
* aligned to 4KB) actually gets written. However, we can't always just
* allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
uint64_t zil_block_buckets[] = {
4096, /* non TX_WRITE */
8192+4096, /* data base */
32*1024 + 4096, /* NFS writes */
* Use the slog as long as the logbias is 'latency' and the current commit size
* is less than the limit or the total list size is less than 2X the limit.
* Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
uint64_t zil_slog_limit = 1024 * 1024;
#define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
(((zilog)->zl_cur_used < zil_slog_limit) || \
((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
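/*
 * Usage sketch (an assumption, since the allocation call is truncated in the
 * excerpt below): the predicate would supply the final "use the separate log
 * device" argument when the next log block is allocated, e.g.
 *
 *	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
 *	    USE_SLOG(zilog));
 *
 * so log blocks stay on the slog only while the latency-oriented limits
 * above still hold.
 */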
* Start a log block write and advance to the next log block.
* Calls are serialized.
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
spa_t *spa = zilog->zl_spa;
blkptr_t *bp = &ztp->zit_next_blk;
ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
uint64_t zil_blksz, wsz;
if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
zilc = (zil_chain_t *)lwb->lwb_buf;
bp = &zilc->zc_next_blk;
zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
bp = &zilc->zc_next_blk;
ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
* Allocate the next block and save its address in this block
* before writing it in order to establish the log chain.
* Note that if the allocation of nlwb synced before we wrote
* the block that points at it (lwb), we'd leak it if we crashed.
* Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
* Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
* We dirty the dataset to ensure that zil_sync() will be called
* to clean up in the event of allocation failure or I/O failure.
txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
txg_rele_to_quiesce(&lwb->lwb_txgh);
tx = dmu_tx_create(zilog->zl_os);
VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
txg = dmu_tx_get_txg(tx);
* Pick a ZIL blocksize. We request a size that is the
* maximum of the previous used size, the current used size and
* the amount waiting in the queue.
* Log blocks are pre-allocated. Here we select the size of the next
* block, based on size used in the last block.
* - first find the smallest bucket that will fit the block from a
* limited set of block sizes. This is because it's faster to write
* blocks allocated from the same metaslab as they are adjacent or
* - next find the maximum from the new suggested size and an array of
* previous sizes. This lessens a picket fence effect of wrongly
* guessing the size if we have a stream of say 2k, 64k, 2k, 64k
* Note we only write what is used, but we can't just allocate
* the maximum block size because we can exhaust the available
zil_blksz = MAX(zilog->zl_prev_used,
zilog->zl_cur_used + sizeof (*ztp));
zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
if (zil_blksz > ZIL_MAX_BLKSZ)
zil_blksz = ZIL_MAX_BLKSZ;
zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
zil_blksz = zil_block_buckets[i];
if (zil_blksz == UINT64_MAX)
zil_blksz = SPA_MAXBLOCKSIZE;
zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
for (i = 0; i < ZIL_PREV_BLKS; i++)
zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
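/*
 * Worked example of the selection above (illustrative numbers, not from the
 * original source): a commit stream alternating roughly 2K and 30K of log
 * data first maps each request to the smallest bucket that fits it, i.e. the
 * 4K and 32K+4K buckets from the table above.  Each pick is then recorded in
 * zl_prev_blks and the maximum over the last ZIL_PREV_BLKS picks is used, so
 * once a large commit has been seen the chosen block size stays at the
 * larger bucket for a while instead of bouncing between 4K and 36K on every
 * other commit.
 */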
/* pass the old blkptr in order to spread log blocks across devs */
error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg);
dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);
* We dirty the dataset to ensure that zil_sync() will
* be called to remove this lwb from our zl_lwb_list.
* Failing to do so may leave an lwb with a NULL lwb_buf
* hanging around on the zl_lwb_list.
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
* Since we've just experienced an allocation failure, we
* terminate the current lwb and send it on its way.
ztp->zit_nused = lwb->lwb_nused;
ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
zio_nowait(lwb->lwb_zio);
* By returning NULL the caller will call txg_wait_synced()
ASSERT3U(bp->blk_birth, ==, txg);
ztp->zit_nused = lwb->lwb_nused;
ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
bp->blk_cksum = lwb->lwb_blk.blk_cksum;
bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
* Allocate a new log write buffer (lwb).
nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
nlwb->lwb_zilog = zilog;
nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
nlwb->lwb_max_txg = txg;
nlwb->lwb_zio = NULL;
* Put new lwb at the end of the log chain
mutex_enter(&zilog->zl_lock);
list_insert_tail(&zilog->zl_lwb_list, nlwb);
mutex_exit(&zilog->zl_lock);
/* Record the block for later vdev flushing */
zil_add_block(zilog, &lwb->lwb_blk);
* kick off the write for the old log block
dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
ASSERT(lwb->lwb_zio);
zio_nowait(lwb->lwb_zio);
error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
ASSERT3U(bp->blk_birth, ==, txg);
bp->blk_cksum = lwb->lwb_blk.blk_cksum;
bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
* Allocate a new log write buffer (lwb).
nlwb = zil_alloc_lwb(zilog, bp, txg);
/* Record the block for later vdev flushing */
zil_add_block(zilog, &lwb->lwb_blk);
if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
/* For Slim ZIL only write what is used. */
wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
ASSERT3U(wsz, <=, lwb->lwb_sz);
zio_shrink(lwb->lwb_zio, wsz);
zilc->zc_nused = lwb->lwb_nused;
zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
* clear unused data for security
bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */
* If there was an allocation failure then nlwb will be null which
* forces a txg_wait_synced().
mutex_exit(&zilog->zl_lock);
lwb = zil_create(zilog);
mutex_enter(&zilog->zl_lock);
lwb = list_tail(&zilog->zl_lwb_list);
ASSERT(lwb == NULL || lwb->lwb_zio == NULL);
/* Loop through in-memory log transactions filling log blocks. */
DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
* Find the next itx to push:
* Push all transactions related to specified foid and all
* other transactions except TX_WRITE, TX_TRUNCATE,
* TX_SETATTR and TX_ACL for all other files.
if (itx_next != (itx_t *)-1)
itx = list_head(&zilog->zl_itx_list);
for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) {
if (foid == 0) /* push all foids? */
if (itx->itx_sync) /* push all O_[D]SYNC */
switch (itx->itx_lr.lrc_txtype) {
/* lr_foid is same offset for these records */
if (((lr_write_t *)&itx->itx_lr)->lr_foid
continue; /* skip this record */
for (itx = list_head(&zilog->zl_itx_list); itx; itx = itx_next) {
* Save the next pointer. Even though we drop zl_lock below,
* all threads that can remove itx list entries (other writers
* and zil_itx_clean()) can't do so until they have zl_writer.
itx_next = list_next(&zilog->zl_itx_list, itx);
* Determine whether to push this itx.
* Push all transactions related to specified foid and
* all other transactions except those that can be logged
* out of order (TX_WRITE, TX_TRUNCATE, TX_SETATTR, TX_ACL)
* for all other files.
* If foid == 0 (meaning "push all foids") or
* itx->itx_sync is set (meaning O_[D]SYNC), push regardless.
if (foid != 0 && !itx->itx_sync &&
TX_OOO(itx->itx_lr.lrc_txtype) &&
((lr_ooo_t *)&itx->itx_lr)->lr_foid != foid)
continue; /* skip this record */
if ((itx->itx_lr.lrc_seq > seq) &&
((lwb == NULL) || (lwb->lwb_nused == 0) ||
(lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb)))) {
((lwb == NULL) || (LWB_EMPTY(lwb)) ||
(lwb->lwb_nused + itx->itx_sod > lwb->lwb_sz)))
* Save the next pointer. Even though we soon drop
* zl_lock, all threads that may change the list
* (another writer or zil_itx_clean) can't do so until
* they have zl_writer.
itx_next = list_next(&zilog->zl_itx_list, itx);
list_remove(&zilog->zl_itx_list, itx);
zilog->zl_itx_list_sz -= itx->itx_sod;
mutex_exit(&zilog->zl_lock);
txg = itx->itx_lr.lrc_txg;
if (txg > spa_last_synced_txg(spa) ||
txg > spa_freeze_txg(spa))
lwb = zil_lwb_commit(zilog, itx, lwb);
kmem_free(itx, offsetof(itx_t, itx_lr)
+ itx->itx_lr.lrc_reclen);
zil_itx_destroy(itx);
mutex_enter(&zilog->zl_lock);
DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
/* determine commit sequence number */
itx = list_head(&zilog->zl_itx_list);
commit_seq = itx->itx_lr.lrc_seq;
commit_seq = itx->itx_lr.lrc_seq - 1;
commit_seq = zilog->zl_itx_seq;
mutex_exit(&zilog->zl_lock);
typedef struct zil_replay_arg {
zil_replay_func_t **zr_replay;
boolean_t zr_byteswap;
} zil_replay_arg_t;
zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
char name[MAXNAMELEN];
zilog->zl_replaying_seq--; /* didn't actually replay this one */
dmu_objset_name(zilog->zl_os, name);
cmn_err(CE_WARN, "ZFS replay transaction error %d, "
"dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
(u_longlong_t)lr->lrc_seq,
(u_longlong_t)(lr->lrc_txtype & ~TX_CI),
(lr->lrc_txtype & TX_CI) ? "CI" : "");
zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
zil_replay_arg_t *zr = zra;
const zil_header_t *zh = zilog->zl_header;
uint64_t reclen = lr->lrc_reclen;
uint64_t txtype = lr->lrc_txtype;
if (!zilog->zl_replay) /* giving up */
zilog->zl_replaying_seq = lr->lrc_seq;
if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
if (lr->lrc_txg < claim_txg) /* already committed */
if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
/* Strip case-insensitive bit, still present in log record */
txtype &= ~TX_CI;
if (txtype == 0 || txtype >= TX_MAX_TYPE) {
if (txtype == 0 || txtype >= TX_MAX_TYPE)
return (zil_replay_error(zilog, lr, EINVAL));
* If this record type can be logged out of order, the object
* (lr_foid) may no longer exist. That's legitimate, not an error.
if (TX_OOO(txtype)) {
error = dmu_object_info(zilog->zl_os,
((lr_ooo_t *)lr)->lr_foid, NULL);
if (error == ENOENT || error == EEXIST)
* Make a copy of the data so we can revise and extend it.
bcopy(lr, zr->zr_lrbuf, reclen);
bcopy(lr, zr->zr_lr, reclen);
* If this is a TX_WRITE with a blkptr, suck in the data.
if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
error = zil_read_log_data(zilog, (lr_write_t *)lr,
zr->zr_lr + reclen);
return (zil_replay_error(zilog, lr, error));
* The log block containing this lr may have been byteswapped
* so that we can easily examine common fields like lrc_txtype.
* However, the log is a mix of different data types, and only the
* However, the log is a mix of different record types, and only the
* replay vectors know how to byteswap their records. Therefore, if
* the lr was byteswapped, undo it before invoking the replay vector.
if (zr->zr_byteswap)
byteswap_uint64_array(zr->zr_lrbuf, reclen);
* If this is a TX_WRITE with a blkptr, suck in the data.
if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
lr_write_t *lrw = (lr_write_t *)lr;
blkptr_t *wbp = &lrw->lr_blkptr;
uint64_t wlen = lrw->lr_length;
char *wbuf = zr->zr_lrbuf + reclen;
if (BP_IS_HOLE(wbp)) { /* compressed to a hole */
* A subsequent write may have overwritten this block,
* in which case wbp may have been freed and
* reallocated, and our read of wbp may fail with a
* checksum error. We can safely ignore this because
* the later write will provide the correct data.
zb.zb_objset = dmu_objset_id(zilog->zl_os);
zb.zb_object = lrw->lr_foid;
zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp);
(void) zio_wait(zio_read(NULL, zilog->zl_spa,
wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
(void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
byteswap_uint64_array(zr->zr_lr, reclen);
* We must now do two things atomically: replay this log record,
* and update the replay sequence number to reflect the fact that
* we did so. At the end of each replay function the sequence number
* is updated if we are in replay mode.
for (pass = 1; pass <= 2; pass++) {
zilog->zl_replaying_seq = lr->lrc_seq;
/* Only byteswap (if needed) on the 1st pass. */
error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
zr->zr_byteswap && pass == 1);
error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
* The DMU's dnode layer doesn't see removes until the txg
* commits, so a subsequent claim can spuriously fail with
* EEXIST. So if we receive any error we try syncing out
* any removes then retry the transaction.
* any removes then retry the transaction. Note that we
* specify B_FALSE for byteswap now, so we don't do it twice.
txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
return (zil_replay_error(zilog, lr, error));
name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
dmu_objset_name(zr->zr_os, name);
cmn_err(CE_WARN, "ZFS replay transaction error %d, "
"dataset %s, seq 0x%llx, txtype %llu %s\n",
error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype,
(lr->lrc_txtype & TX_CI) ? "CI" : "");
zilog->zl_replay = B_FALSE;
kmem_free(name, MAXNAMELEN);
zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
zilog->zl_replay_blks++;
ASSERT(zilog->zl_replay_blks == 0);
(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
zh->zh_claim_txg);
kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
zil_destroy(zilog, B_FALSE);
txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
zilog->zl_replay = B_FALSE;
* Report whether all transactions are committed
zil_is_committed(zilog_t *zilog)
zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
mutex_enter(&zilog->zl_lock);
while (zilog->zl_writer)
cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
/* recent unpushed intent log transactions? */
if (!list_is_empty(&zilog->zl_itx_list)) {
/* intent log never used? */
lwb = list_head(&zilog->zl_lwb_list);
* more than 1 log buffer means zil_sync() hasn't yet freed
* entries after a txg has committed
if (list_next(&zilog->zl_lwb_list, lwb)) {
ASSERT(zil_empty(zilog));
cv_broadcast(&zilog->zl_cv_writer);
mutex_exit(&zilog->zl_lock);
if (zilog->zl_replay) {
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
zilog->zl_replaying_seq;
zil_vdev_offline(char *osname, void *arg)
zil_vdev_offline(const char *osname, void *arg)
zilog_t *zilog;
error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
error = dmu_objset_hold(osname, FTAG, &os);
return (error);