/*
 * linux/fs/jbd2/transaction.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Generic filesystem transaction handling code; part of the ext2fs
 * journaling system.
 *
 * This file manages transactions (compound commits managed by the
 * journaling code) and handles (individual atomic operations by the
 * filesystem).
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/backing-dev.h>
#include <linux/bug.h>
#include <linux/module.h>

/* Forward declarations of file-local helpers that are defined later on. */
static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
37
* jbd2_get_transaction: obtain a new transaction_t object.
39
* Simply allocate and initialise a new transaction. Create it in
40
* RUNNING state and add it to the current journal (which should not
41
* have an existing running transaction: we only make a new transaction
42
* once we have started to commit the old one).
45
* The journal MUST be locked. We don't perform atomic mallocs on the
46
* new transaction and we can't block without protecting against other
47
* processes trying to touch the journal while it is in transition.
51
static transaction_t *
52
jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
54
transaction->t_journal = journal;
55
transaction->t_state = T_RUNNING;
56
transaction->t_start_time = ktime_get();
57
transaction->t_tid = journal->j_transaction_sequence++;
58
transaction->t_expires = jiffies + journal->j_commit_interval;
59
spin_lock_init(&transaction->t_handle_lock);
60
atomic_set(&transaction->t_updates, 0);
61
atomic_set(&transaction->t_outstanding_credits, 0);
62
atomic_set(&transaction->t_handle_count, 0);
63
INIT_LIST_HEAD(&transaction->t_inode_list);
64
INIT_LIST_HEAD(&transaction->t_private_list);
66
/* Set up the commit timer for the new transaction. */
67
journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
68
add_timer(&journal->j_commit_timer);
70
J_ASSERT(journal->j_running_transaction == NULL);
71
journal->j_running_transaction = transaction;
72
transaction->t_max_wait = 0;
73
transaction->t_start = jiffies;

/*
 * Handle management.
 *
 * A handle_t is an object which represents a single atomic update to a
 * filesystem, and which tracks all of the modifications which form part
 * of that one update.
 */
87
* Update transaction's maximum wait time, if debugging is enabled.
89
* In order for t_max_wait to be reliable, it must be protected by a
90
* lock. But doing so will mean that start_this_handle() can not be
91
* run in parallel on SMP systems, which limits our scalability. So
92
* unless debugging is enabled, we no longer update t_max_wait, which
93
* means that maximum wait time reported by the jbd2_run_stats
94
* tracepoint will always be zero.
96
static inline void update_t_max_wait(transaction_t *transaction,
99
#ifdef CONFIG_JBD2_DEBUG
100
if (jbd2_journal_enable_debug &&
101
time_after(transaction->t_start, ts)) {
102
ts = jbd2_time_diff(ts, transaction->t_start);
103
spin_lock(&transaction->t_handle_lock);
104
if (ts > transaction->t_max_wait)
105
transaction->t_max_wait = ts;
106
spin_unlock(&transaction->t_handle_lock);
112
* start_this_handle: Given a handle, deal with any locking or stalling
113
* needed to make sure that there is enough journal space for the handle
114
* to begin. Attach the handle to a transaction and set up the
115
* transaction's buffer credits.
118
static int start_this_handle(journal_t *journal, handle_t *handle,
121
transaction_t *transaction, *new_transaction = NULL;
123
int needed, need_to_start;
124
int nblocks = handle->h_buffer_credits;
125
unsigned long ts = jiffies;
127
if (nblocks > journal->j_max_transaction_buffers) {
128
printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
129
current->comm, nblocks,
130
journal->j_max_transaction_buffers);
135
if (!journal->j_running_transaction) {
136
new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask);
137
if (!new_transaction) {
139
* If __GFP_FS is not present, then we may be
140
* being called from inside the fs writeback
141
* layer, so we MUST NOT fail. Since
142
* __GFP_NOFAIL is going away, we will arrange
143
* to retry the allocation ourselves.
145
if ((gfp_mask & __GFP_FS) == 0) {
146
congestion_wait(BLK_RW_ASYNC, HZ/50);
147
goto alloc_transaction;
153
jbd_debug(3, "New handle %p going live.\n", handle);
156
* We need to hold j_state_lock until t_updates has been incremented,
157
* for proper journal barrier handling
160
read_lock(&journal->j_state_lock);
161
BUG_ON(journal->j_flags & JBD2_UNMOUNT);
162
if (is_journal_aborted(journal) ||
163
(journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
164
read_unlock(&journal->j_state_lock);
165
kfree(new_transaction);
169
/* Wait on the journal's transaction barrier if necessary */
170
if (journal->j_barrier_count) {
171
read_unlock(&journal->j_state_lock);
172
wait_event(journal->j_wait_transaction_locked,
173
journal->j_barrier_count == 0);
177
if (!journal->j_running_transaction) {
178
read_unlock(&journal->j_state_lock);
179
if (!new_transaction)
180
goto alloc_transaction;
181
write_lock(&journal->j_state_lock);
182
if (!journal->j_running_transaction) {
183
jbd2_get_transaction(journal, new_transaction);
184
new_transaction = NULL;
186
write_unlock(&journal->j_state_lock);
190
transaction = journal->j_running_transaction;
193
* If the current transaction is locked down for commit, wait for the
194
* lock to be released.
196
if (transaction->t_state == T_LOCKED) {
199
prepare_to_wait(&journal->j_wait_transaction_locked,
200
&wait, TASK_UNINTERRUPTIBLE);
201
read_unlock(&journal->j_state_lock);
203
finish_wait(&journal->j_wait_transaction_locked, &wait);
208
* If there is not enough space left in the log to write all potential
209
* buffers requested by this operation, we need to stall pending a log
210
* checkpoint to free some more log space.
212
needed = atomic_add_return(nblocks,
213
&transaction->t_outstanding_credits);
215
if (needed > journal->j_max_transaction_buffers) {
217
* If the current transaction is already too large, then start
218
* to commit it: we can then go back and attach this handle to
223
jbd_debug(2, "Handle %p starting new commit...\n", handle);
224
atomic_sub(nblocks, &transaction->t_outstanding_credits);
225
prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
226
TASK_UNINTERRUPTIBLE);
227
tid = transaction->t_tid;
228
need_to_start = !tid_geq(journal->j_commit_request, tid);
229
read_unlock(&journal->j_state_lock);
231
jbd2_log_start_commit(journal, tid);
233
finish_wait(&journal->j_wait_transaction_locked, &wait);
238
* The commit code assumes that it can get enough log space
239
* without forcing a checkpoint. This is *critical* for
240
* correctness: a checkpoint of a buffer which is also
241
* associated with a committing transaction creates a deadlock,
242
* so commit simply cannot force through checkpoints.
244
* We must therefore ensure the necessary space in the journal
245
* *before* starting to dirty potentially checkpointed buffers
246
* in the new transaction.
248
* The worst part is, any transaction currently committing can
249
* reduce the free space arbitrarily. Be careful to account for
250
* those buffers when checkpointing.
254
* @@@ AKPM: This seems rather over-defensive. We're giving commit
255
* a _lot_ of headroom: 1/4 of the journal plus the size of
256
* the committing transaction. Really, we only need to give it
257
* committing_transaction->t_outstanding_credits plus "enough" for
258
* the log control blocks.
259
* Also, this test is inconsistent with the matching one in
260
* jbd2_journal_extend().
262
if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
263
jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
264
atomic_sub(nblocks, &transaction->t_outstanding_credits);
265
read_unlock(&journal->j_state_lock);
266
write_lock(&journal->j_state_lock);
267
if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
268
__jbd2_log_wait_for_space(journal);
269
write_unlock(&journal->j_state_lock);
273
/* OK, account for the buffers that this operation expects to
274
* use and add the handle to the running transaction.
276
update_t_max_wait(transaction, ts);
277
handle->h_transaction = transaction;
278
atomic_inc(&transaction->t_updates);
279
atomic_inc(&transaction->t_handle_count);
280
jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
282
atomic_read(&transaction->t_outstanding_credits),
283
__jbd2_log_space_left(journal));
284
read_unlock(&journal->j_state_lock);
286
lock_map_acquire(&handle->h_lockdep_map);
287
kfree(new_transaction);
291
static struct lock_class_key jbd2_handle_key;
293
/* Allocate a new handle. This should probably be in a slab... */
294
static handle_t *new_handle(int nblocks)
296
handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
299
memset(handle, 0, sizeof(*handle));
300
handle->h_buffer_credits = nblocks;
303
lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle",
304
&jbd2_handle_key, 0);
310
* handle_t *jbd2_journal_start() - Obtain a new handle.
311
* @journal: Journal to start transaction on.
312
* @nblocks: number of block buffer we might modify
314
* We make sure that the transaction can guarantee at least nblocks of
315
* modified buffers in the log. We block until the log can guarantee
318
* This function is visible to journal users (like ext3fs), so is not
319
* called with the journal already locked.
321
* Return a pointer to a newly allocated handle, or an ERR_PTR() value
324
handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask)
326
handle_t *handle = journal_current_handle();
330
return ERR_PTR(-EROFS);
333
J_ASSERT(handle->h_transaction->t_journal == journal);
338
handle = new_handle(nblocks);
340
return ERR_PTR(-ENOMEM);
342
current->journal_info = handle;
344
err = start_this_handle(journal, handle, gfp_mask);
346
jbd2_free_handle(handle);
347
current->journal_info = NULL;
348
handle = ERR_PTR(err);
352
EXPORT_SYMBOL(jbd2__journal_start);
355
handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
357
return jbd2__journal_start(journal, nblocks, GFP_NOFS);
359
EXPORT_SYMBOL(jbd2_journal_start);
363
* int jbd2_journal_extend() - extend buffer credits.
364
* @handle: handle to 'extend'
365
* @nblocks: nr blocks to try to extend by.
367
* Some transactions, such as large extends and truncates, can be done
368
* atomically all at once or in several stages. The operation requests
369
* a credit for a number of buffer modications in advance, but can
370
* extend its credit if it needs more.
372
* jbd2_journal_extend tries to give the running handle more buffer credits.
373
* It does not guarantee that allocation - this is a best-effort only.
374
* The calling process MUST be able to deal cleanly with a failure to
377
* Return 0 on success, non-zero on failure.
379
* return code < 0 implies an error
380
* return code > 0 implies normal transaction-full status.
382
int jbd2_journal_extend(handle_t *handle, int nblocks)
384
transaction_t *transaction = handle->h_transaction;
385
journal_t *journal = transaction->t_journal;
390
if (is_handle_aborted(handle))
395
read_lock(&journal->j_state_lock);
397
/* Don't extend a locked-down transaction! */
398
if (handle->h_transaction->t_state != T_RUNNING) {
399
jbd_debug(3, "denied handle %p %d blocks: "
400
"transaction not running\n", handle, nblocks);
404
spin_lock(&transaction->t_handle_lock);
405
wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
407
if (wanted > journal->j_max_transaction_buffers) {
408
jbd_debug(3, "denied handle %p %d blocks: "
409
"transaction too large\n", handle, nblocks);
413
if (wanted > __jbd2_log_space_left(journal)) {
414
jbd_debug(3, "denied handle %p %d blocks: "
415
"insufficient log space\n", handle, nblocks);
419
handle->h_buffer_credits += nblocks;
420
atomic_add(nblocks, &transaction->t_outstanding_credits);
423
jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
425
spin_unlock(&transaction->t_handle_lock);
427
read_unlock(&journal->j_state_lock);
434
* int jbd2_journal_restart() - restart a handle .
435
* @handle: handle to restart
436
* @nblocks: nr credits requested
438
* Restart a handle for a multi-transaction filesystem
441
* If the jbd2_journal_extend() call above fails to grant new buffer credits
442
* to a running handle, a call to jbd2_journal_restart will commit the
443
* handle's transaction so far and reattach the handle to a new
444
* transaction capabable of guaranteeing the requested number of
447
int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
449
transaction_t *transaction = handle->h_transaction;
450
journal_t *journal = transaction->t_journal;
452
int need_to_start, ret;
454
/* If we've had an abort of any type, don't even think about
455
* actually doing the restart! */
456
if (is_handle_aborted(handle))
460
* First unlink the handle from its current transaction, and start the
463
J_ASSERT(atomic_read(&transaction->t_updates) > 0);
464
J_ASSERT(journal_current_handle() == handle);
466
read_lock(&journal->j_state_lock);
467
spin_lock(&transaction->t_handle_lock);
468
atomic_sub(handle->h_buffer_credits,
469
&transaction->t_outstanding_credits);
470
if (atomic_dec_and_test(&transaction->t_updates))
471
wake_up(&journal->j_wait_updates);
472
spin_unlock(&transaction->t_handle_lock);
474
jbd_debug(2, "restarting handle %p\n", handle);
475
tid = transaction->t_tid;
476
need_to_start = !tid_geq(journal->j_commit_request, tid);
477
read_unlock(&journal->j_state_lock);
479
jbd2_log_start_commit(journal, tid);
481
lock_map_release(&handle->h_lockdep_map);
482
handle->h_buffer_credits = nblocks;
483
ret = start_this_handle(journal, handle, gfp_mask);
486
EXPORT_SYMBOL(jbd2__journal_restart);
489
int jbd2_journal_restart(handle_t *handle, int nblocks)
491
return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
493
EXPORT_SYMBOL(jbd2_journal_restart);
496
* void jbd2_journal_lock_updates () - establish a transaction barrier.
497
* @journal: Journal to establish a barrier on.
499
* This locks out any further updates from being started, and blocks
500
* until all existing updates have completed, returning only once the
501
* journal is in a quiescent state with no updates running.
503
* The journal lock should not be held on entry.
505
void jbd2_journal_lock_updates(journal_t *journal)
509
write_lock(&journal->j_state_lock);
510
++journal->j_barrier_count;
512
/* Wait until there are no running updates */
514
transaction_t *transaction = journal->j_running_transaction;
519
spin_lock(&transaction->t_handle_lock);
520
if (!atomic_read(&transaction->t_updates)) {
521
spin_unlock(&transaction->t_handle_lock);
524
prepare_to_wait(&journal->j_wait_updates, &wait,
525
TASK_UNINTERRUPTIBLE);
526
spin_unlock(&transaction->t_handle_lock);
527
write_unlock(&journal->j_state_lock);
529
finish_wait(&journal->j_wait_updates, &wait);
530
write_lock(&journal->j_state_lock);
532
write_unlock(&journal->j_state_lock);
535
* We have now established a barrier against other normal updates, but
536
* we also need to barrier against other jbd2_journal_lock_updates() calls
537
* to make sure that we serialise special journal-locked operations
540
mutex_lock(&journal->j_barrier);
544
* void jbd2_journal_unlock_updates (journal_t* journal) - release barrier
545
* @journal: Journal to release the barrier on.
547
* Release a transaction barrier obtained with jbd2_journal_lock_updates().
549
* Should be called without the journal lock held.
551
void jbd2_journal_unlock_updates (journal_t *journal)
553
J_ASSERT(journal->j_barrier_count != 0);
555
mutex_unlock(&journal->j_barrier);
556
write_lock(&journal->j_state_lock);
557
--journal->j_barrier_count;
558
write_unlock(&journal->j_state_lock);
559
wake_up(&journal->j_wait_transaction_locked);
562
static void warn_dirty_buffer(struct buffer_head *bh)
564
char b[BDEVNAME_SIZE];
567
"JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
568
"There's a risk of filesystem corruption in case of system "
570
bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
574
* If the buffer is already part of the current transaction, then there
575
* is nothing we need to do. If it is already part of a prior
576
* transaction which we are still committing to disk, then we need to
577
* make sure that we do not overwrite the old copy: we do copy-out to
578
* preserve the copy going to disk. We also account the buffer against
579
* the handle's metadata buffer credits (unless the buffer is already
580
* part of the transaction, that is).
584
do_get_write_access(handle_t *handle, struct journal_head *jh,
587
struct buffer_head *bh;
588
transaction_t *transaction;
591
char *frozen_buffer = NULL;
594
if (is_handle_aborted(handle))
597
transaction = handle->h_transaction;
598
journal = transaction->t_journal;
600
jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
602
JBUFFER_TRACE(jh, "entry");
606
/* @@@ Need to check for errors here at some point. */
609
jbd_lock_bh_state(bh);
611
/* We now hold the buffer lock so it is safe to query the buffer
612
* state. Is the buffer dirty?
614
* If so, there are two possibilities. The buffer may be
615
* non-journaled, and undergoing a quite legitimate writeback.
616
* Otherwise, it is journaled, and we don't expect dirty buffers
617
* in that state (the buffers should be marked JBD_Dirty
618
* instead.) So either the IO is being done under our own
619
* control and this is a bug, or it's a third party IO such as
620
* dump(8) (which may leave the buffer scheduled for read ---
621
* ie. locked but not dirty) or tune2fs (which may actually have
622
* the buffer dirtied, ugh.) */
624
if (buffer_dirty(bh)) {
626
* First question: is this buffer already part of the current
627
* transaction or the existing committing transaction?
629
if (jh->b_transaction) {
631
jh->b_transaction == transaction ||
633
journal->j_committing_transaction);
634
if (jh->b_next_transaction)
635
J_ASSERT_JH(jh, jh->b_next_transaction ==
637
warn_dirty_buffer(bh);
640
* In any case we need to clean the dirty flag and we must
641
* do it under the buffer lock to be sure we don't race
642
* with running write-out.
644
JBUFFER_TRACE(jh, "Journalling dirty buffer");
645
clear_buffer_dirty(bh);
646
set_buffer_jbddirty(bh);
652
if (is_handle_aborted(handle)) {
653
jbd_unlock_bh_state(bh);
659
* The buffer is already part of this transaction if b_transaction or
660
* b_next_transaction points to it
662
if (jh->b_transaction == transaction ||
663
jh->b_next_transaction == transaction)
667
* this is the first time this transaction is touching this buffer,
668
* reset the modified flag
673
* If there is already a copy-out version of this buffer, then we don't
674
* need to make another one
676
if (jh->b_frozen_data) {
677
JBUFFER_TRACE(jh, "has frozen data");
678
J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
679
jh->b_next_transaction = transaction;
683
/* Is there data here we need to preserve? */
685
if (jh->b_transaction && jh->b_transaction != transaction) {
686
JBUFFER_TRACE(jh, "owned by older transaction");
687
J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
688
J_ASSERT_JH(jh, jh->b_transaction ==
689
journal->j_committing_transaction);
691
/* There is one case we have to be very careful about.
692
* If the committing transaction is currently writing
693
* this buffer out to disk and has NOT made a copy-out,
694
* then we cannot modify the buffer contents at all
695
* right now. The essence of copy-out is that it is the
696
* extra copy, not the primary copy, which gets
697
* journaled. If the primary copy is already going to
698
* disk then we cannot do copy-out here. */
700
if (jh->b_jlist == BJ_Shadow) {
701
DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
702
wait_queue_head_t *wqh;
704
wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
706
JBUFFER_TRACE(jh, "on shadow: sleep");
707
jbd_unlock_bh_state(bh);
708
/* commit wakes up all shadow buffers after IO */
710
prepare_to_wait(wqh, &wait.wait,
711
TASK_UNINTERRUPTIBLE);
712
if (jh->b_jlist != BJ_Shadow)
716
finish_wait(wqh, &wait.wait);
720
/* Only do the copy if the currently-owning transaction
721
* still needs it. If it is on the Forget list, the
722
* committing transaction is past that stage. The
723
* buffer had better remain locked during the kmalloc,
724
* but that should be true --- we hold the journal lock
725
* still and the buffer is already on the BUF_JOURNAL
726
* list so won't be flushed.
728
* Subtle point, though: if this is a get_undo_access,
729
* then we will be relying on the frozen_data to contain
730
* the new value of the committed_data record after the
731
* transaction, so we HAVE to force the frozen_data copy
734
if (jh->b_jlist != BJ_Forget || force_copy) {
735
JBUFFER_TRACE(jh, "generate frozen data");
736
if (!frozen_buffer) {
737
JBUFFER_TRACE(jh, "allocate memory for buffer");
738
jbd_unlock_bh_state(bh);
740
jbd2_alloc(jh2bh(jh)->b_size,
742
if (!frozen_buffer) {
744
"%s: OOM for frozen_buffer\n",
746
JBUFFER_TRACE(jh, "oom!");
748
jbd_lock_bh_state(bh);
753
jh->b_frozen_data = frozen_buffer;
754
frozen_buffer = NULL;
757
jh->b_next_transaction = transaction;
762
* Finally, if the buffer is not journaled right now, we need to make
763
* sure it doesn't get written to disk before the caller actually
764
* commits the new data
766
if (!jh->b_transaction) {
767
JBUFFER_TRACE(jh, "no transaction");
768
J_ASSERT_JH(jh, !jh->b_next_transaction);
769
JBUFFER_TRACE(jh, "file as BJ_Reserved");
770
spin_lock(&journal->j_list_lock);
771
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
772
spin_unlock(&journal->j_list_lock);
781
J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
782
"Possible IO failure.\n");
783
page = jh2bh(jh)->b_page;
784
offset = offset_in_page(jh2bh(jh)->b_data);
785
source = kmap_atomic(page, KM_USER0);
786
/* Fire data frozen trigger just before we copy the data */
787
jbd2_buffer_frozen_trigger(jh, source + offset,
789
memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
790
kunmap_atomic(source, KM_USER0);
793
* Now that the frozen data is saved off, we need to store
794
* any matching triggers.
796
jh->b_frozen_triggers = jh->b_triggers;
798
jbd_unlock_bh_state(bh);
801
* If we are about to journal a buffer, then any revoke pending on it is
804
jbd2_journal_cancel_revoke(handle, jh);
807
if (unlikely(frozen_buffer)) /* It's usually NULL */
808
jbd2_free(frozen_buffer, bh->b_size);
810
JBUFFER_TRACE(jh, "exit");
815
* int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
816
* @handle: transaction to add buffer modifications to
817
* @bh: bh to be used for metadata writes
819
* Returns an error code or 0 on success.
821
* In full data journalling mode the buffer may be of type BJ_AsyncData,
822
* because we're write()ing a buffer which is also part of a shared mapping.
825
int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
827
struct journal_head *jh = jbd2_journal_add_journal_head(bh);
830
/* We do not want to get caught playing with fields which the
831
* log thread also manipulates. Make sure that the buffer
832
* completes any outstanding IO before proceeding. */
833
rc = do_get_write_access(handle, jh, 0);
834
jbd2_journal_put_journal_head(jh);


/*
 * When the user wants to journal a newly created buffer_head
 * (ie. getblk() returned a new buffer and we are going to populate it
 * manually rather than reading off disk), then we need to keep the
 * buffer_head locked until it has been completely filled with new
 * data.  In this case, we should be able to make the assertion that
 * the bh is not already part of an existing transaction.
 *
 * The buffer should already be locked by the caller by this point.
 * There is no lock ranking violation: it was a newly created,
 * unlocked buffer beforehand.
 */
852
* int jbd2_journal_get_create_access () - notify intent to use newly created bh
853
* @handle: transaction to new buffer to
856
* Call this if you create a new bh.
858
int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
860
transaction_t *transaction = handle->h_transaction;
861
journal_t *journal = transaction->t_journal;
862
struct journal_head *jh = jbd2_journal_add_journal_head(bh);
865
jbd_debug(5, "journal_head %p\n", jh);
867
if (is_handle_aborted(handle))
871
JBUFFER_TRACE(jh, "entry");
873
* The buffer may already belong to this transaction due to pre-zeroing
874
* in the filesystem's new_block code. It may also be on the previous,
875
* committing transaction's lists, but it HAS to be in Forget state in
876
* that case: the transaction must have deleted the buffer for it to be
879
jbd_lock_bh_state(bh);
880
spin_lock(&journal->j_list_lock);
881
J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
882
jh->b_transaction == NULL ||
883
(jh->b_transaction == journal->j_committing_transaction &&
884
jh->b_jlist == BJ_Forget)));
886
J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
887
J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
889
if (jh->b_transaction == NULL) {
891
* Previous jbd2_journal_forget() could have left the buffer
892
* with jbddirty bit set because it was being committed. When
893
* the commit finished, we've filed the buffer for
894
* checkpointing and marked it dirty. Now we are reallocating
895
* the buffer so the transaction freeing it must have
896
* committed and so it's safe to clear the dirty bit.
898
clear_buffer_dirty(jh2bh(jh));
899
/* first access by this transaction */
902
JBUFFER_TRACE(jh, "file as BJ_Reserved");
903
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
904
} else if (jh->b_transaction == journal->j_committing_transaction) {
905
/* first access by this transaction */
908
JBUFFER_TRACE(jh, "set next transaction");
909
jh->b_next_transaction = transaction;
911
spin_unlock(&journal->j_list_lock);
912
jbd_unlock_bh_state(bh);
915
* akpm: I added this. ext3_alloc_branch can pick up new indirect
916
* blocks which contain freed but then revoked metadata. We need
917
* to cancel the revoke in case we end up freeing it yet again
918
* and the reallocating as data - this would cause a second revoke,
919
* which hits an assertion error.
921
JBUFFER_TRACE(jh, "cancelling revoke");
922
jbd2_journal_cancel_revoke(handle, jh);
924
jbd2_journal_put_journal_head(jh);
929
* int jbd2_journal_get_undo_access() - Notify intent to modify metadata with
930
* non-rewindable consequences
931
* @handle: transaction
932
* @bh: buffer to undo
934
* Sometimes there is a need to distinguish between metadata which has
935
* been committed to disk and that which has not. The ext3fs code uses
936
* this for freeing and allocating space, we have to make sure that we
937
* do not reuse freed space until the deallocation has been committed,
938
* since if we overwrote that space we would make the delete
939
* un-rewindable in case of a crash.
941
* To deal with that, jbd2_journal_get_undo_access requests write access to a
942
* buffer for parts of non-rewindable operations such as delete
943
* operations on the bitmaps. The journaling code must keep a copy of
944
* the buffer's contents prior to the undo_access call until such time
945
* as we know that the buffer has definitely been committed to disk.
947
* We never need to know which transaction the committed data is part
948
* of, buffers touched here are guaranteed to be dirtied later and so
949
* will be committed to a new transaction in due course, at which point
950
* we can discard the old committed data pointer.
952
* Returns error number or 0 on success.
954
int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
957
struct journal_head *jh = jbd2_journal_add_journal_head(bh);
958
char *committed_data = NULL;
960
JBUFFER_TRACE(jh, "entry");
963
* Do this first --- it can drop the journal lock, so we want to
964
* make sure that obtaining the committed_data is done
965
* atomically wrt. completion of any outstanding commits.
967
err = do_get_write_access(handle, jh, 1);
972
if (!jh->b_committed_data) {
973
committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
974
if (!committed_data) {
975
printk(KERN_EMERG "%s: No memory for committed data\n",
982
jbd_lock_bh_state(bh);
983
if (!jh->b_committed_data) {
984
/* Copy out the current buffer contents into the
985
* preserved, committed copy. */
986
JBUFFER_TRACE(jh, "generate b_committed data");
987
if (!committed_data) {
988
jbd_unlock_bh_state(bh);
992
jh->b_committed_data = committed_data;
993
committed_data = NULL;
994
memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
996
jbd_unlock_bh_state(bh);
998
jbd2_journal_put_journal_head(jh);
999
if (unlikely(committed_data))
1000
jbd2_free(committed_data, bh->b_size);
1005
* void jbd2_journal_set_triggers() - Add triggers for commit writeout
1006
* @bh: buffer to trigger on
1007
* @type: struct jbd2_buffer_trigger_type containing the trigger(s).
1009
* Set any triggers on this journal_head. This is always safe, because
1010
* triggers for a committing buffer will be saved off, and triggers for
1011
* a running transaction will match the buffer in that transaction.
1013
* Call with NULL to clear the triggers.
1015
void jbd2_journal_set_triggers(struct buffer_head *bh,
1016
struct jbd2_buffer_trigger_type *type)
1018
struct journal_head *jh = bh2jh(bh);
1020
jh->b_triggers = type;
1023
void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
1024
struct jbd2_buffer_trigger_type *triggers)
1026
struct buffer_head *bh = jh2bh(jh);
1028
if (!triggers || !triggers->t_frozen)
1031
triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
1034
void jbd2_buffer_abort_trigger(struct journal_head *jh,
1035
struct jbd2_buffer_trigger_type *triggers)
1037
if (!triggers || !triggers->t_abort)
1040
triggers->t_abort(triggers, jh2bh(jh));
1046
* int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
1047
* @handle: transaction to add buffer to.
1048
* @bh: buffer to mark
1050
* mark dirty metadata which needs to be journaled as part of the current
1053
* The buffer must have previously had jbd2_journal_get_write_access()
1054
* called so that it has a valid journal_head attached to the buffer
1057
* The buffer is placed on the transaction's metadata list and is marked
1058
* as belonging to the transaction.
1060
* Returns error number or 0 on success.
1062
* Special care needs to be taken if the buffer already belongs to the
1063
* current committing transaction (in which case we should have frozen
1064
* data present for that commit). In that case, we don't relink the
1065
* buffer: that only gets done when the old transaction finally
1066
* completes its commit.
1068
int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1070
transaction_t *transaction = handle->h_transaction;
1071
journal_t *journal = transaction->t_journal;
1072
struct journal_head *jh = bh2jh(bh);
1075
jbd_debug(5, "journal_head %p\n", jh);
1076
JBUFFER_TRACE(jh, "entry");
1077
if (is_handle_aborted(handle))
1079
if (!buffer_jbd(bh)) {
1084
jbd_lock_bh_state(bh);
1086
if (jh->b_modified == 0) {
1088
* This buffer's got modified and becoming part
1089
* of the transaction. This needs to be done
1090
* once a transaction -bzzz
1093
J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
1094
handle->h_buffer_credits--;
1098
* fastpath, to avoid expensive locking. If this buffer is already
1099
* on the running transaction's metadata list there is nothing to do.
1100
* Nobody can take it off again because there is a handle open.
1101
* I _think_ we're OK here with SMP barriers - a mistaken decision will
1102
* result in this test being false, so we go in and take the locks.
1104
if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1105
JBUFFER_TRACE(jh, "fastpath");
1106
if (unlikely(jh->b_transaction !=
1107
journal->j_running_transaction)) {
1108
printk(KERN_EMERG "JBD: %s: "
1109
"jh->b_transaction (%llu, %p, %u) != "
1110
"journal->j_running_transaction (%p, %u)",
1112
(unsigned long long) bh->b_blocknr,
1114
jh->b_transaction ? jh->b_transaction->t_tid : 0,
1115
journal->j_running_transaction,
1116
journal->j_running_transaction ?
1117
journal->j_running_transaction->t_tid : 0);
1123
set_buffer_jbddirty(bh);
1126
* Metadata already on the current transaction list doesn't
1127
* need to be filed. Metadata on another transaction's list must
1128
* be committing, and will be refiled once the commit completes:
1129
* leave it alone for now.
1131
if (jh->b_transaction != transaction) {
1132
JBUFFER_TRACE(jh, "already on other transaction");
1133
if (unlikely(jh->b_transaction !=
1134
journal->j_committing_transaction)) {
1135
printk(KERN_EMERG "JBD: %s: "
1136
"jh->b_transaction (%llu, %p, %u) != "
1137
"journal->j_committing_transaction (%p, %u)",
1139
(unsigned long long) bh->b_blocknr,
1141
jh->b_transaction ? jh->b_transaction->t_tid : 0,
1142
journal->j_committing_transaction,
1143
journal->j_committing_transaction ?
1144
journal->j_committing_transaction->t_tid : 0);
1147
if (unlikely(jh->b_next_transaction != transaction)) {
1148
printk(KERN_EMERG "JBD: %s: "
1149
"jh->b_next_transaction (%llu, %p, %u) != "
1150
"transaction (%p, %u)",
1152
(unsigned long long) bh->b_blocknr,
1153
jh->b_next_transaction,
1154
jh->b_next_transaction ?
1155
jh->b_next_transaction->t_tid : 0,
1156
transaction, transaction->t_tid);
1159
/* And this case is illegal: we can't reuse another
1160
* transaction's data buffer, ever. */
1164
/* That test should have eliminated the following case: */
1165
J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
1167
JBUFFER_TRACE(jh, "file as BJ_Metadata");
1168
spin_lock(&journal->j_list_lock);
1169
__jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
1170
spin_unlock(&journal->j_list_lock);
1172
jbd_unlock_bh_state(bh);
1174
JBUFFER_TRACE(jh, "exit");
1175
WARN_ON(ret); /* All errors are bugs, so dump the stack */
1180
* jbd2_journal_release_buffer: undo a get_write_access without any buffer
1181
* updates, if the update decided in the end that it didn't need access.
1185
jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
1187
BUFFER_TRACE(bh, "entry");
1191
* void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
1192
* @handle: transaction handle
1193
* @bh: bh to 'forget'
1195
* We can only do the bforget if there are no commits pending against the
1196
* buffer. If the buffer is dirty in the current running transaction we
1197
* can safely unlink it.
1199
* bh may not be a journalled buffer at all - it may be a non-JBD
1200
* buffer which came off the hashtable. Check for this.
1202
* Decrements bh->b_count by one.
1204
* Allow this call even if the handle has aborted --- it may be part of
1205
* the caller's cleanup after an abort.
1207
int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1209
transaction_t *transaction = handle->h_transaction;
1210
journal_t *journal = transaction->t_journal;
1211
struct journal_head *jh;
1212
int drop_reserve = 0;
1214
int was_modified = 0;
1216
BUFFER_TRACE(bh, "entry");
1218
jbd_lock_bh_state(bh);
1219
spin_lock(&journal->j_list_lock);
1221
if (!buffer_jbd(bh))
1225
/* Critical error: attempting to delete a bitmap buffer, maybe?
1226
* Don't do any jbd operations, and return an error. */
1227
if (!J_EXPECT_JH(jh, !jh->b_committed_data,
1228
"inconsistent data on disk")) {
1233
/* keep track of whether or not this transaction modified us */
1234
was_modified = jh->b_modified;
1237
* The buffer's going from the transaction, we must drop
1238
* all references -bzzz
1242
if (jh->b_transaction == handle->h_transaction) {
1243
J_ASSERT_JH(jh, !jh->b_frozen_data);
1245
/* If we are forgetting a buffer which is already part
1246
* of this transaction, then we can just drop it from
1247
* the transaction immediately. */
1248
clear_buffer_dirty(bh);
1249
clear_buffer_jbddirty(bh);
1251
JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
1254
* we only want to drop a reference if this transaction
1255
* modified the buffer
1261
* We are no longer going to journal this buffer.
1262
* However, the commit of this transaction is still
1263
* important to the buffer: the delete that we are now
1264
* processing might obsolete an old log entry, so by
1265
* committing, we can satisfy the buffer's checkpoint.
1267
* So, if we have a checkpoint on the buffer, we should
1268
* now refile the buffer on our BJ_Forget list so that
1269
* we know to remove the checkpoint after we commit.
1272
if (jh->b_cp_transaction) {
1273
__jbd2_journal_temp_unlink_buffer(jh);
1274
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1276
__jbd2_journal_unfile_buffer(jh);
1277
if (!buffer_jbd(bh)) {
1278
spin_unlock(&journal->j_list_lock);
1279
jbd_unlock_bh_state(bh);
1284
} else if (jh->b_transaction) {
1285
J_ASSERT_JH(jh, (jh->b_transaction ==
1286
journal->j_committing_transaction));
1287
/* However, if the buffer is still owned by a prior
1288
* (committing) transaction, we can't drop it yet... */
1289
JBUFFER_TRACE(jh, "belongs to older transaction");
1290
/* ... but we CAN drop it from the new transaction if we
1291
* have also modified it since the original commit. */
1293
if (jh->b_next_transaction) {
1294
J_ASSERT(jh->b_next_transaction == transaction);
1295
jh->b_next_transaction = NULL;
1298
* only drop a reference if this transaction modified
1307
spin_unlock(&journal->j_list_lock);
1308
jbd_unlock_bh_state(bh);
1312
/* no need to reserve log space for this block -bzzz */
1313
handle->h_buffer_credits++;
1319
* int jbd2_journal_stop() - complete a transaction
1320
* @handle: transaction to complete.
1322
* All done for a particular handle.
1324
* There is not much action needed here. We just return any remaining
1325
* buffer credits to the transaction and remove the handle. The only
1326
* complication is that we need to start a commit operation if the
1327
* filesystem is marked for synchronous update.
1329
* jbd2_journal_stop itself will not usually return an error, but it may
1330
* do so in unusual circumstances. In particular, expect it to
1331
* return -EIO if a jbd2_journal_abort has been executed since the
1332
* transaction began.
1334
int jbd2_journal_stop(handle_t *handle)
1336
transaction_t *transaction = handle->h_transaction;
1337
journal_t *journal = transaction->t_journal;
1338
int err, wait_for_commit = 0;
1342
J_ASSERT(journal_current_handle() == handle);
1344
if (is_handle_aborted(handle))
1347
J_ASSERT(atomic_read(&transaction->t_updates) > 0);
1351
if (--handle->h_ref > 0) {
1352
jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
1357
jbd_debug(4, "Handle %p going down\n", handle);
1360
* Implement synchronous transaction batching. If the handle
1361
* was synchronous, don't force a commit immediately. Let's
1362
* yield and let another thread piggyback onto this
1363
* transaction. Keep doing that while new threads continue to
1364
* arrive. It doesn't cost much - we're about to run a commit
1365
* and sleep on IO anyway. Speeds up many-threaded, many-dir
1366
* operations by 30x or more...
1368
* We try and optimize the sleep time against what the
1369
* underlying disk can do, instead of having a static sleep
1370
* time. This is useful for the case where our storage is so
1371
* fast that it is more optimal to go ahead and force a flush
1372
* and wait for the transaction to be committed than it is to
1373
* wait for an arbitrary amount of time for new writers to
1374
* join the transaction. We achieve this by measuring how
1375
* long it takes to commit a transaction, and compare it with
1376
* how long this transaction has been running, and if run time
1377
* < commit time then we sleep for the delta and commit. This
1378
* greatly helps super fast disks that would see slowdowns as
1379
* more threads started doing fsyncs.
1381
* But don't do this if this process was the most recent one
1382
* to perform a synchronous write. We do this to detect the
1383
* case where a single process is doing a stream of sync
1384
* writes. No point in waiting for joiners in that case.
1387
if (handle->h_sync && journal->j_last_sync_writer != pid) {
1388
u64 commit_time, trans_time;
1390
journal->j_last_sync_writer = pid;
1392
read_lock(&journal->j_state_lock);
1393
commit_time = journal->j_average_commit_time;
1394
read_unlock(&journal->j_state_lock);
1396
trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1397
transaction->t_start_time));
1399
commit_time = max_t(u64, commit_time,
1400
1000*journal->j_min_batch_time);
1401
commit_time = min_t(u64, commit_time,
1402
1000*journal->j_max_batch_time);
1404
if (trans_time < commit_time) {
1405
ktime_t expires = ktime_add_ns(ktime_get(),
1407
set_current_state(TASK_UNINTERRUPTIBLE);
1408
schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1413
transaction->t_synchronous_commit = 1;
1414
current->journal_info = NULL;
1415
atomic_sub(handle->h_buffer_credits,
1416
&transaction->t_outstanding_credits);
1419
* If the handle is marked SYNC, we need to set another commit
1420
* going! We also want to force a commit if the current
1421
* transaction is occupying too much of the log, or if the
1422
* transaction is too old now.
1424
if (handle->h_sync ||
1425
(atomic_read(&transaction->t_outstanding_credits) >
1426
journal->j_max_transaction_buffers) ||
1427
time_after_eq(jiffies, transaction->t_expires)) {
1428
/* Do this even for aborted journals: an abort still
1429
* completes the commit thread, it just doesn't write
1430
* anything to disk. */
1432
jbd_debug(2, "transaction too old, requesting commit for "
1433
"handle %p\n", handle);
1434
/* This is non-blocking */
1435
jbd2_log_start_commit(journal, transaction->t_tid);
1438
* Special case: JBD2_SYNC synchronous updates require us
1439
* to wait for the commit to complete.
1441
if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1442
wait_for_commit = 1;
1446
* Once we drop t_updates, if it goes to zero the transaction
1447
* could start committing on us and eventually disappear. So
1448
* once we do this, we must not dereference transaction
1451
tid = transaction->t_tid;
1452
if (atomic_dec_and_test(&transaction->t_updates)) {
1453
wake_up(&journal->j_wait_updates);
1454
if (journal->j_barrier_count)
1455
wake_up(&journal->j_wait_transaction_locked);
1458
if (wait_for_commit)
1459
err = jbd2_log_wait_commit(journal, tid);
1461
lock_map_release(&handle->h_lockdep_map);
1463
jbd2_free_handle(handle);
1468
* int jbd2_journal_force_commit() - force any uncommitted transactions
1469
* @journal: journal to force
1471
* For synchronous operations: force any uncommitted transactions
1472
* to disk. May seem kludgy, but it reuses all the handle batching
1473
* code in a very simple manner.
1475
int jbd2_journal_force_commit(journal_t *journal)
1480
handle = jbd2_journal_start(journal, 1);
1481
if (IS_ERR(handle)) {
1482
ret = PTR_ERR(handle);
1485
ret = jbd2_journal_stop(handle);
1492
* List management code snippets: various functions for manipulating the
1493
* transaction buffer lists.
1498
* Append a buffer to a transaction list, given the transaction's list head
1501
* j_list_lock is held.
1503
* jbd_lock_bh_state(jh2bh(jh)) is held.
1507
__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
1510
jh->b_tnext = jh->b_tprev = jh;
1513
/* Insert at the tail of the list to preserve order */
1514
struct journal_head *first = *list, *last = first->b_tprev;
1516
jh->b_tnext = first;
1517
last->b_tnext = first->b_tprev = jh;
1522
* Remove a buffer from a transaction list, given the transaction's list
1525
* Called with j_list_lock held, and the journal may not be locked.
1527
* jbd_lock_bh_state(jh2bh(jh)) is held.
1531
__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1534
*list = jh->b_tnext;
1538
jh->b_tprev->b_tnext = jh->b_tnext;
1539
jh->b_tnext->b_tprev = jh->b_tprev;
1543
* Remove a buffer from the appropriate transaction list.
1545
* Note that this function can *change* the value of
1546
* bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
1547
* t_log_list or t_reserved_list. If the caller is holding onto a copy of one
1548
* of these pointers, it could go bad. Generally the caller needs to re-read
1549
* the pointer from the transaction_t.
1551
* Called under j_list_lock. The journal may not be locked.
1553
void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1555
struct journal_head **list = NULL;
1556
transaction_t *transaction;
1557
struct buffer_head *bh = jh2bh(jh);
1559
J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
1560
transaction = jh->b_transaction;
1562
assert_spin_locked(&transaction->t_journal->j_list_lock);
1564
J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1565
if (jh->b_jlist != BJ_None)
1566
J_ASSERT_JH(jh, transaction != NULL);
1568
switch (jh->b_jlist) {
1572
transaction->t_nr_buffers--;
1573
J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
1574
list = &transaction->t_buffers;
1577
list = &transaction->t_forget;
1580
list = &transaction->t_iobuf_list;
1583
list = &transaction->t_shadow_list;
1586
list = &transaction->t_log_list;
1589
list = &transaction->t_reserved_list;
1593
__blist_del_buffer(list, jh);
1594
jh->b_jlist = BJ_None;
1595
if (test_clear_buffer_jbddirty(bh))
1596
mark_buffer_dirty(bh); /* Expose it to the VM */
1600
* Remove buffer from all transactions.
1602
* Called with bh_state lock and j_list_lock
1604
* jh and bh may be already freed when this function returns.
1606
static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
1608
__jbd2_journal_temp_unlink_buffer(jh);
1609
jh->b_transaction = NULL;
1610
jbd2_journal_put_journal_head(jh);
1613
void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
1615
struct buffer_head *bh = jh2bh(jh);
1617
/* Get reference so that buffer cannot be freed before we unlock it */
1619
jbd_lock_bh_state(bh);
1620
spin_lock(&journal->j_list_lock);
1621
__jbd2_journal_unfile_buffer(jh);
1622
spin_unlock(&journal->j_list_lock);
1623
jbd_unlock_bh_state(bh);
1628
* Called from jbd2_journal_try_to_free_buffers().
1630
* Called under jbd_lock_bh_state(bh)
1633
__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1635
struct journal_head *jh;
1639
if (buffer_locked(bh) || buffer_dirty(bh))
1642
if (jh->b_next_transaction != NULL)
1645
spin_lock(&journal->j_list_lock);
1646
if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1647
/* written-back checkpointed metadata buffer */
1648
if (jh->b_jlist == BJ_None) {
1649
JBUFFER_TRACE(jh, "remove from checkpoint list");
1650
__jbd2_journal_remove_checkpoint(jh);
1653
spin_unlock(&journal->j_list_lock);
1659
* int jbd2_journal_try_to_free_buffers() - try to free page buffers.
1660
* @journal: journal for operation
1661
* @page: to try and free
1662
* @gfp_mask: we use the mask to detect how hard we should try to release
1663
* buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
1664
* release the buffers.
1667
* For all the buffers on this page,
1668
* if they are fully written out ordered data, move them onto BUF_CLEAN
1669
* so try_to_free_buffers() can reap them.
1671
* This function returns non-zero if we wish try_to_free_buffers()
1672
* to be called. We do this if the page is releasable by try_to_free_buffers().
1673
* We also do it if the page has locked or dirty buffers and the caller wants
1674
* us to perform sync or async writeout.
1676
* This complicates JBD locking somewhat. We aren't protected by the
1677
* BKL here. We wish to remove the buffer from its committing or
1678
* running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
1680
* This may *change* the value of transaction_t->t_datalist, so anyone
1681
* who looks at t_datalist needs to lock against this function.
1683
* Even worse, someone may be doing a jbd2_journal_dirty_data on this
1684
* buffer. So we need to lock against that. jbd2_journal_dirty_data()
1685
* will come out of the lock with the buffer dirty, which makes it
1686
* ineligible for release here.
1688
* Who else is affected by this? hmm... Really the only contender
1689
* is do_get_write_access() - it could be looking at the buffer while
1690
* journal_try_to_free_buffer() is changing its state. But that
1691
* cannot happen because we never reallocate freed data as metadata
1692
* while the data is part of a transaction. Yes?
1694
* Return 0 on failure, 1 on success
1696
int jbd2_journal_try_to_free_buffers(journal_t *journal,
1697
struct page *page, gfp_t gfp_mask)
1699
struct buffer_head *head;
1700
struct buffer_head *bh;
1703
J_ASSERT(PageLocked(page));
1705
head = page_buffers(page);
1708
struct journal_head *jh;
1711
* We take our own ref against the journal_head here to avoid
1712
* having to add tons of locking around each instance of
1713
* jbd2_journal_put_journal_head().
1715
jh = jbd2_journal_grab_journal_head(bh);
1719
jbd_lock_bh_state(bh);
1720
__journal_try_to_free_buffer(journal, bh);
1721
jbd2_journal_put_journal_head(jh);
1722
jbd_unlock_bh_state(bh);
1725
} while ((bh = bh->b_this_page) != head);
1727
ret = try_to_free_buffers(page);
1734
* This buffer is no longer needed. If it is on an older transaction's
1735
* checkpoint list we need to record it on this transaction's forget list
1736
* to pin this buffer (and hence its checkpointing transaction) down until
1737
* this transaction commits. If the buffer isn't on a checkpoint list, we
1739
* Returns non-zero if JBD no longer has an interest in the buffer.
1741
* Called under j_list_lock.
1743
* Called under jbd_lock_bh_state(bh).
1745
static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1748
struct buffer_head *bh = jh2bh(jh);
1750
if (jh->b_cp_transaction) {
1751
JBUFFER_TRACE(jh, "on running+cp transaction");
1752
__jbd2_journal_temp_unlink_buffer(jh);
1754
* We don't want to write the buffer anymore, clear the
1755
* bit so that we don't confuse checks in
1756
* __journal_file_buffer
1758
clear_buffer_dirty(bh);
1759
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1762
JBUFFER_TRACE(jh, "on running transaction");
1763
__jbd2_journal_unfile_buffer(jh);
1769
* jbd2_journal_invalidatepage
1771
* This code is tricky. It has a number of cases to deal with.
1773
* There are two invariants which this code relies on:
1775
* i_size must be updated on disk before we start calling invalidatepage on the
1778
* This is done in ext3 by defining an ext3_setattr method which
1779
* updates i_size before truncate gets going. By maintaining this
1780
* invariant, we can be sure that it is safe to throw away any buffers
1781
* attached to the current transaction: once the transaction commits,
1782
* we know that the data will not be needed.
1784
* Note however that we can *not* throw away data belonging to the
1785
* previous, committing transaction!
1787
* Any disk blocks which *are* part of the previous, committing
1788
* transaction (and which therefore cannot be discarded immediately) are
1789
* not going to be reused in the new running transaction
1791
* The bitmap committed_data images guarantee this: any block which is
1792
* allocated in one transaction and removed in the next will be marked
1793
* as in-use in the committed_data bitmap, so cannot be reused until
1794
* the next transaction to delete the block commits. This means that
1795
* leaving committing buffers dirty is quite safe: the disk blocks
1796
* cannot be reallocated to a different file and so buffer aliasing is
1800
* The above applies mainly to ordered data mode. In writeback mode we
1801
* don't make guarantees about the order in which data hits disk --- in
1802
* particular we don't guarantee that new dirty data is flushed before
1803
* transaction commit --- so it is always safe just to discard data
1804
* immediately in that mode. --sct
1808
* The journal_unmap_buffer helper function returns zero if the buffer
1809
* concerned remains pinned as an anonymous buffer belonging to an older
1812
* We're outside-transaction here. Either or both of j_running_transaction
1813
* and j_committing_transaction may be NULL.
1815
static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1817
transaction_t *transaction;
1818
struct journal_head *jh;
1822
BUFFER_TRACE(bh, "entry");
1825
* It is safe to proceed here without the j_list_lock because the
1826
* buffers cannot be stolen by try_to_free_buffers as long as we are
1827
* holding the page lock. --sct
1830
if (!buffer_jbd(bh))
1831
goto zap_buffer_unlocked;
1833
/* OK, we have data buffer in journaled mode */
1834
write_lock(&journal->j_state_lock);
1835
jbd_lock_bh_state(bh);
1836
spin_lock(&journal->j_list_lock);
1838
jh = jbd2_journal_grab_journal_head(bh);
1840
goto zap_buffer_no_jh;
1843
* We cannot remove the buffer from checkpoint lists until the
1844
* transaction adding inode to orphan list (let's call it T)
1845
* is committed. Otherwise if the transaction changing the
1846
* buffer would be cleaned from the journal before T is
1847
* committed, a crash will cause that the correct contents of
1848
* the buffer will be lost. On the other hand we have to
1849
* clear the buffer dirty bit at latest at the moment when the
1850
* transaction marking the buffer as freed in the filesystem
1851
* structures is committed because from that moment on the
1852
* buffer can be reallocated and used by a different page.
1853
* Since the block hasn't been freed yet but the inode has
1854
* already been added to orphan list, it is safe for us to add
1855
* the buffer to BJ_Forget list of the newest transaction.
1857
transaction = jh->b_transaction;
1858
if (transaction == NULL) {
1859
/* First case: not on any transaction. If it
1860
* has no checkpoint link, then we can zap it:
1861
* it's a writeback-mode buffer so we don't care
1862
* if it hits disk safely. */
1863
if (!jh->b_cp_transaction) {
1864
JBUFFER_TRACE(jh, "not on any transaction: zap");
1868
if (!buffer_dirty(bh)) {
1869
/* bdflush has written it. We can drop it now */
1873
/* OK, it must be in the journal but still not
1874
* written fully to disk: it's metadata or
1875
* journaled data... */
1877
if (journal->j_running_transaction) {
1878
/* ... and once the current transaction has
1879
* committed, the buffer won't be needed any
1881
JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
1882
ret = __dispose_buffer(jh,
1883
journal->j_running_transaction);
1884
jbd2_journal_put_journal_head(jh);
1885
spin_unlock(&journal->j_list_lock);
1886
jbd_unlock_bh_state(bh);
1887
write_unlock(&journal->j_state_lock);
1890
/* There is no currently-running transaction. So the
1891
* orphan record which we wrote for this file must have
1892
* passed into commit. We must attach this buffer to
1893
* the committing transaction, if it exists. */
1894
if (journal->j_committing_transaction) {
1895
JBUFFER_TRACE(jh, "give to committing trans");
1896
ret = __dispose_buffer(jh,
1897
journal->j_committing_transaction);
1898
jbd2_journal_put_journal_head(jh);
1899
spin_unlock(&journal->j_list_lock);
1900
jbd_unlock_bh_state(bh);
1901
write_unlock(&journal->j_state_lock);
1904
/* The orphan record's transaction has
1905
* committed. We can cleanse this buffer */
1906
clear_buffer_jbddirty(bh);
1910
} else if (transaction == journal->j_committing_transaction) {
1911
JBUFFER_TRACE(jh, "on committing transaction");
1913
* The buffer is committing, we simply cannot touch
1914
* it. So we just set b_next_transaction to the
1915
* running transaction (if there is one) and mark
1916
* buffer as freed so that commit code knows it should
1917
* clear dirty bits when it is done with the buffer.
1919
set_buffer_freed(bh);
1920
if (journal->j_running_transaction && buffer_jbddirty(bh))
1921
jh->b_next_transaction = journal->j_running_transaction;
1922
jbd2_journal_put_journal_head(jh);
1923
spin_unlock(&journal->j_list_lock);
1924
jbd_unlock_bh_state(bh);
1925
write_unlock(&journal->j_state_lock);
1928
/* Good, the buffer belongs to the running transaction.
1929
* We are writing our own transaction's data, not any
1930
* previous one's, so it is safe to throw it away
1931
* (remember that we expect the filesystem to have set
1932
* i_size already for this truncate so recovery will not
1933
* expose the disk blocks we are discarding here.) */
1934
J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
1935
JBUFFER_TRACE(jh, "on running transaction");
1936
may_free = __dispose_buffer(jh, transaction);
1940
jbd2_journal_put_journal_head(jh);
1942
spin_unlock(&journal->j_list_lock);
1943
jbd_unlock_bh_state(bh);
1944
write_unlock(&journal->j_state_lock);
1945
zap_buffer_unlocked:
1946
clear_buffer_dirty(bh);
1947
J_ASSERT_BH(bh, !buffer_jbddirty(bh));
1948
clear_buffer_mapped(bh);
1949
clear_buffer_req(bh);
1950
clear_buffer_new(bh);
1956
* void jbd2_journal_invalidatepage()
1957
* @journal: journal to use for flush...
1958
* @page: page to flush
1959
* @offset: offset in the page from which to invalidate.
1961
* Reap page buffers containing data after offset in page.
1964
void jbd2_journal_invalidatepage(journal_t *journal,
1966
unsigned long offset)
1968
struct buffer_head *head, *bh, *next;
1969
unsigned int curr_off = 0;
1972
if (!PageLocked(page))
1974
if (!page_has_buffers(page))
1977
/* We will potentially be playing with lists other than just the
1978
* data lists (especially for journaled data mode), so be
1979
* cautious in our locking. */
1981
head = bh = page_buffers(page);
1983
unsigned int next_off = curr_off + bh->b_size;
1984
next = bh->b_this_page;
1986
if (offset <= curr_off) {
1987
/* This block is wholly outside the truncation point */
1989
may_free &= journal_unmap_buffer(journal, bh);
1992
curr_off = next_off;
1995
} while (bh != head);
1998
if (may_free && try_to_free_buffers(page))
1999
J_ASSERT(!page_has_buffers(page));
2004
* File a buffer on the given transaction list.
2006
void __jbd2_journal_file_buffer(struct journal_head *jh,
2007
transaction_t *transaction, int jlist)
2009
struct journal_head **list = NULL;
2011
struct buffer_head *bh = jh2bh(jh);
2013
J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
2014
assert_spin_locked(&transaction->t_journal->j_list_lock);
2016
J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
2017
J_ASSERT_JH(jh, jh->b_transaction == transaction ||
2018
jh->b_transaction == NULL);
2020
if (jh->b_transaction && jh->b_jlist == jlist)
2023
if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
2024
jlist == BJ_Shadow || jlist == BJ_Forget) {
2026
* For metadata buffers, we track dirty bit in buffer_jbddirty
2027
* instead of buffer_dirty. We should not see a dirty bit set
2028
* here because we clear it in do_get_write_access but e.g.
2029
* tune2fs can modify the sb and set the dirty bit at any time
2030
* so we try to gracefully handle that.
2032
if (buffer_dirty(bh))
2033
warn_dirty_buffer(bh);
2034
if (test_clear_buffer_dirty(bh) ||
2035
test_clear_buffer_jbddirty(bh))
2039
if (jh->b_transaction)
2040
__jbd2_journal_temp_unlink_buffer(jh);
2042
jbd2_journal_grab_journal_head(bh);
2043
jh->b_transaction = transaction;
2047
J_ASSERT_JH(jh, !jh->b_committed_data);
2048
J_ASSERT_JH(jh, !jh->b_frozen_data);
2051
transaction->t_nr_buffers++;
2052
list = &transaction->t_buffers;
2055
list = &transaction->t_forget;
2058
list = &transaction->t_iobuf_list;
2061
list = &transaction->t_shadow_list;
2064
list = &transaction->t_log_list;
2067
list = &transaction->t_reserved_list;
2071
__blist_add_buffer(list, jh);
2072
jh->b_jlist = jlist;
2075
set_buffer_jbddirty(bh);
2078
void jbd2_journal_file_buffer(struct journal_head *jh,
2079
transaction_t *transaction, int jlist)
2081
jbd_lock_bh_state(jh2bh(jh));
2082
spin_lock(&transaction->t_journal->j_list_lock);
2083
__jbd2_journal_file_buffer(jh, transaction, jlist);
2084
spin_unlock(&transaction->t_journal->j_list_lock);
2085
jbd_unlock_bh_state(jh2bh(jh));
2089
* Remove a buffer from its current buffer list in preparation for
2090
* dropping it from its current transaction entirely. If the buffer has
2091
* already started to be used by a subsequent transaction, refile the
2092
* buffer on that transaction's metadata list.
2094
* Called under j_list_lock
2095
* Called under jbd_lock_bh_state(jh2bh(jh))
2097
* jh and bh may be already free when this function returns
2099
void __jbd2_journal_refile_buffer(struct journal_head *jh)
2101
int was_dirty, jlist;
2102
struct buffer_head *bh = jh2bh(jh);
2104
J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
2105
if (jh->b_transaction)
2106
assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
2108
/* If the buffer is now unused, just drop it. */
2109
if (jh->b_next_transaction == NULL) {
2110
__jbd2_journal_unfile_buffer(jh);
2115
* It has been modified by a later transaction: add it to the new
2116
* transaction's metadata list.
2119
was_dirty = test_clear_buffer_jbddirty(bh);
2120
__jbd2_journal_temp_unlink_buffer(jh);
2122
* We set b_transaction here because b_next_transaction will inherit
2123
* our jh reference and thus __jbd2_journal_file_buffer() must not
2126
jh->b_transaction = jh->b_next_transaction;
2127
jh->b_next_transaction = NULL;
2128
if (buffer_freed(bh))
2130
else if (jh->b_modified)
2131
jlist = BJ_Metadata;
2133
jlist = BJ_Reserved;
2134
__jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
2135
J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2138
set_buffer_jbddirty(bh);
2142
* __jbd2_journal_refile_buffer() with necessary locking added. We take our
2143
* bh reference so that we can safely unlock bh.
2145
* The jh and bh may be freed by this call.
2147
void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2149
struct buffer_head *bh = jh2bh(jh);
2151
/* Get reference so that buffer cannot be freed before we unlock it */
2153
jbd_lock_bh_state(bh);
2154
spin_lock(&journal->j_list_lock);
2155
__jbd2_journal_refile_buffer(jh);
2156
jbd_unlock_bh_state(bh);
2157
spin_unlock(&journal->j_list_lock);
2162
* File inode in the inode list of the handle's transaction
2164
int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
2166
transaction_t *transaction = handle->h_transaction;
2167
journal_t *journal = transaction->t_journal;
2169
if (is_handle_aborted(handle))
2172
jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
2173
transaction->t_tid);
2176
* First check whether inode isn't already on the transaction's
2177
* lists without taking the lock. Note that this check is safe
2178
* without the lock as we cannot race with somebody removing inode
2179
* from the transaction. The reason is that we remove inode from the
2180
* transaction only in journal_release_jbd_inode() and when we commit
2181
* the transaction. We are guarded from the first case by holding
2182
* a reference to the inode. We are safe against the second case
2183
* because if jinode->i_transaction == transaction, commit code
2184
* cannot touch the transaction because we hold reference to it,
2185
* and if jinode->i_next_transaction == transaction, commit code
2186
* will only file the inode where we want it.
2188
if (jinode->i_transaction == transaction ||
2189
jinode->i_next_transaction == transaction)
2192
spin_lock(&journal->j_list_lock);
2194
if (jinode->i_transaction == transaction ||
2195
jinode->i_next_transaction == transaction)
2199
* We only ever set this variable to 1 so the test is safe. Since
2200
* t_need_data_flush is likely to be set, we do the test to save some
2201
* cacheline bouncing
2203
if (!transaction->t_need_data_flush)
2204
transaction->t_need_data_flush = 1;
2205
/* On some different transaction's list - should be
2206
* the committing one */
2207
if (jinode->i_transaction) {
2208
J_ASSERT(jinode->i_next_transaction == NULL);
2209
J_ASSERT(jinode->i_transaction ==
2210
journal->j_committing_transaction);
2211
jinode->i_next_transaction = transaction;
2214
/* Not on any transaction list... */
2215
J_ASSERT(!jinode->i_next_transaction);
2216
jinode->i_transaction = transaction;
2217
list_add(&jinode->i_list, &transaction->t_inode_list);
2219
spin_unlock(&journal->j_list_lock);
/*
 * File truncate and transaction commit interact with each other in a
 * non-trivial way.  If a transaction writing data block A is
 * committing, we cannot discard the data by truncate until we have
 * written it.  Otherwise if we crashed after the transaction with
 * write has committed but before the transaction with truncate has
 * committed, we could see stale data in block A.  This function is a
 * helper to solve this problem.  It starts writeout of the truncated
 * part in case it is in the committing transaction.
 *
 * Filesystem code must call this function when inode is journaled in
 * ordered mode before truncation happens and after the inode has been
 * placed on orphan list with the new inode size. The second condition
 * avoids the race that someone writes new data and we start
 * committing the transaction after this function has been called but
 * before a transaction for truncate is started (and furthermore it
 * allows us to optimize the case where the addition to orphan list
 * happens in the same transaction as write --- we don't have to write
 * any data in such case).
 */
2244
int jbd2_journal_begin_ordered_truncate(journal_t *journal,
2245
struct jbd2_inode *jinode,
2248
transaction_t *inode_trans, *commit_trans;
2251
/* This is a quick check to avoid locking if not necessary */
2252
if (!jinode->i_transaction)
2254
/* Locks are here just to force reading of recent values, it is
2255
* enough that the transaction was not committing before we started
2256
* a transaction adding the inode to orphan list */
2257
read_lock(&journal->j_state_lock);
2258
commit_trans = journal->j_committing_transaction;
2259
read_unlock(&journal->j_state_lock);
2260
spin_lock(&journal->j_list_lock);
2261
inode_trans = jinode->i_transaction;
2262
spin_unlock(&journal->j_list_lock);
2263
if (inode_trans == commit_trans) {
2264
ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
2265
new_size, LLONG_MAX);
2267
jbd2_journal_abort(journal, ret);