2
* Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
4
* The soft updates code is derived from the appendix of a University
5
* of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
6
* "Soft Updates: A Solution to the Metadata Update Problem in File
7
* Systems", CSE-TR-254-95, August 1995).
9
* Further information about soft updates can be obtained from:
11
* Marshall Kirk McKusick http://www.mckusick.com/softdep/
12
* 1614 Oxford Street mckusick@mckusick.com
13
* Berkeley, CA 94709-1608 +1-510-843-9542
16
* Redistribution and use in source and binary forms, with or without
17
* modification, are permitted provided that the following conditions
20
* 1. Redistributions of source code must retain the above copyright
21
* notice, this list of conditions and the following disclaimer.
22
* 2. Redistributions in binary form must reproduce the above copyright
23
* notice, this list of conditions and the following disclaimer in the
24
* documentation and/or other materials provided with the distribution.
26
* THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
27
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
28
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
29
* DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
30
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38
* from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
41
#include <sys/cdefs.h>
42
__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.211.2.9.2.1 2010/02/10 00:26:20 kensmith Exp $");
48
* For now we want the safety net that DEBUG flags provide.
54
#include <sys/param.h>
55
#include <sys/kernel.h>
56
#include <sys/systm.h>
60
#include <sys/kthread.h>
62
#include <sys/malloc.h>
63
#include <sys/mount.h>
64
#include <sys/mutex.h>
67
#include <sys/sysctl.h>
68
#include <sys/syslog.h>
69
#include <sys/vnode.h>
71
#include <ufs/ufs/dir.h>
72
#include <ufs/ufs/extattr.h>
73
#include <ufs/ufs/quota.h>
74
#include <ufs/ufs/inode.h>
75
#include <ufs/ufs/ufsmount.h>
76
#include <ufs/ffs/fs.h>
77
#include <ufs/ffs/softdep.h>
78
#include <ufs/ffs/ffs_extern.h>
79
#include <ufs/ufs/ufs_extern.h>
84
#include "opt_quota.h"
89
softdep_flushfiles(oldmnt, flags, td)
95
panic("softdep_flushfiles called");
99
softdep_mount(devvp, mp, fs, cred)
117
softdep_uninitialize()
124
softdep_setup_inomapdep(bp, ip, newinum)
130
panic("softdep_setup_inomapdep called");
134
softdep_setup_blkmapdep(bp, mp, newblkno)
137
ufs2_daddr_t newblkno;
140
panic("softdep_setup_blkmapdep called");
144
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
147
ufs2_daddr_t newblkno;
148
ufs2_daddr_t oldblkno;
154
panic("softdep_setup_allocdirect called");
158
softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
161
ufs2_daddr_t newblkno;
162
ufs2_daddr_t oldblkno;
168
panic("softdep_setup_allocext called");
172
softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
177
ufs2_daddr_t newblkno;
178
ufs2_daddr_t oldblkno;
182
panic("softdep_setup_allocindir_page called");
186
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
191
ufs2_daddr_t newblkno;
194
panic("softdep_setup_allocindir_meta called");
198
softdep_setup_freeblocks(ip, length, flags)
204
panic("softdep_setup_freeblocks called");
208
softdep_freefile(pvp, ino, mode)
214
panic("softdep_freefile called");
218
softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
223
struct buf *newdirbp;
227
panic("softdep_setup_directory_add called");
231
softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
239
panic("softdep_change_directoryentry_offset called");
243
softdep_setup_remove(bp, dp, ip, isrmdir)
250
panic("softdep_setup_remove called");
254
softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
262
panic("softdep_setup_directory_change called");
266
softdep_change_linkcnt(ip)
270
panic("softdep_change_linkcnt called");
274
softdep_load_inodeblock(ip)
278
panic("softdep_load_inodeblock called");
282
softdep_update_inodeblock(ip, bp, waitfor)
288
panic("softdep_update_inodeblock called");
293
struct vnode *vp; /* the "in_core" copy of the inode */
300
softdep_fsync_mountdev(vp)
308
softdep_flushworklist(oldmnt, countp, td)
309
struct mount *oldmnt;
319
softdep_sync_metadata(struct vnode *vp)
330
panic("softdep_slowdown called");
334
softdep_releasefile(ip)
335
struct inode *ip; /* inode with the zero effective link count */
338
panic("softdep_releasefile called");
342
softdep_request_cleanup(fs, vp)
351
softdep_check_suspend(struct mount *mp,
355
int secondary_writes,
356
int secondary_accwrites)
362
(void) softdep_accdeps;
364
ASSERT_VI_LOCKED(devvp, "softdep_check_suspend");
365
bo = &devvp->v_bufobj;
368
if (!MNT_ITRYLOCK(mp)) {
375
if (mp->mnt_secondary_writes != 0) {
377
msleep(&mp->mnt_secondary_writes,
379
(PUSER - 1) | PDROP, "secwr", 0);
387
* Reasons for needing more work before suspend:
388
* - Dirty buffers on devvp.
389
* - Secondary writes occurred after start of vnode sync loop
392
if (bo->bo_numoutput > 0 ||
393
bo->bo_dirty.bv_cnt > 0 ||
394
secondary_writes != 0 ||
395
mp->mnt_secondary_writes != 0 ||
396
secondary_accwrites != mp->mnt_secondary_accwrites)
403
softdep_get_depcounts(struct mount *mp,
405
int *softdepactiveaccp)
409
*softdepactiveaccp = 0;
414
* These definitions need to be adapted to the system to which
415
* this file is being ported.
418
* malloc types defined for the softdep system.
420
static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
421
static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
422
static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
423
static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
424
static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
425
static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
426
static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
427
static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
428
static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
429
static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
430
static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
431
static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
432
static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
433
static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
434
static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");
436
#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
441
#define D_BMSAFEMAP 3
442
#define D_ALLOCDIRECT 4
444
#define D_ALLOCINDIR 6
451
#define D_NEWDIRBLK 13
452
#define D_LAST D_NEWDIRBLK
455
* translate from workitem type to memory type
456
* MUST match the defines above, such that memtype[D_XXX] == M_XXX
458
static struct malloc_type *memtype[] = {
475
#define DtoM(type) (memtype[type])
478
* Names of malloc types.
480
#define TYPENAME(type) \
481
((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???")
483
* End system adaptation definitions.
487
* Forward declarations.
489
struct inodedep_hashhead;
490
struct newblk_hashhead;
491
struct pagedep_hashhead;
494
* Internal function prototypes.
496
static void softdep_error(char *, int);
497
static void drain_output(struct vnode *);
498
static struct buf *getdirtybuf(struct buf *, struct mtx *, int);
499
static void clear_remove(struct thread *);
500
static void clear_inodedeps(struct thread *);
501
static int flush_pagedep_deps(struct vnode *, struct mount *,
503
static int flush_inodedep_deps(struct mount *, ino_t);
504
static int flush_deplist(struct allocdirectlst *, int, int *);
505
static int handle_written_filepage(struct pagedep *, struct buf *);
506
static void diradd_inode_written(struct diradd *, struct inodedep *);
507
static int handle_written_inodeblock(struct inodedep *, struct buf *);
508
static void handle_allocdirect_partdone(struct allocdirect *);
509
static void handle_allocindir_partdone(struct allocindir *);
510
static void initiate_write_filepage(struct pagedep *, struct buf *);
511
static void handle_written_mkdir(struct mkdir *, int);
512
static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
513
static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
514
static void handle_workitem_freefile(struct freefile *);
515
static void handle_workitem_remove(struct dirrem *, struct vnode *);
516
static struct dirrem *newdirrem(struct buf *, struct inode *,
517
struct inode *, int, struct dirrem **);
518
static void free_diradd(struct diradd *);
519
static void free_allocindir(struct allocindir *, struct inodedep *);
520
static void free_newdirblk(struct newdirblk *);
521
static int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
523
static void deallocate_dependencies(struct buf *, struct inodedep *);
524
static void free_allocdirect(struct allocdirectlst *,
525
struct allocdirect *, int);
526
static int check_inode_unwritten(struct inodedep *);
527
static int free_inodedep(struct inodedep *);
528
static void handle_workitem_freeblocks(struct freeblks *, int);
529
static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
530
static void setup_allocindir_phase2(struct buf *, struct inode *,
531
struct allocindir *);
532
static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
534
static void handle_workitem_freefrag(struct freefrag *);
535
static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
536
static void allocdirect_merge(struct allocdirectlst *,
537
struct allocdirect *, struct allocdirect *);
538
static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *);
539
static int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t,
541
static int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
542
static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
544
static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
545
static int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
546
static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
547
struct mount *mp, int, struct pagedep **);
548
static void pause_timer(void *);
549
static int request_cleanup(struct mount *, int);
550
static int process_worklist_item(struct mount *, int);
551
static void add_to_worklist(struct worklist *);
552
static void softdep_flush(void);
553
static int softdep_speedup(void);
556
* Exported softdep operations.
558
static void softdep_disk_io_initiation(struct buf *);
559
static void softdep_disk_write_complete(struct buf *);
560
static void softdep_deallocate_dependencies(struct buf *);
561
static int softdep_count_dependencies(struct buf *bp, int);
563
static struct mtx lk;
564
MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
566
#define TRY_ACQUIRE_LOCK(lk) mtx_trylock(lk)
567
#define ACQUIRE_LOCK(lk) mtx_lock(lk)
568
#define FREE_LOCK(lk) mtx_unlock(lk)
571
* Worklist queue management.
572
* These routines require that the lock be held.
574
#ifndef /* NOT */ DEBUG
575
#define WORKLIST_INSERT(head, item) do { \
576
(item)->wk_state |= ONWORKLIST; \
577
LIST_INSERT_HEAD(head, item, wk_list); \
579
#define WORKLIST_REMOVE(item) do { \
580
(item)->wk_state &= ~ONWORKLIST; \
581
LIST_REMOVE(item, wk_list); \
584
static void worklist_insert(struct workhead *, struct worklist *);
585
static void worklist_remove(struct worklist *);
587
#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
588
#define WORKLIST_REMOVE(item) worklist_remove(item)
591
worklist_insert(head, item)
592
struct workhead *head;
593
struct worklist *item;
596
mtx_assert(&lk, MA_OWNED);
597
if (item->wk_state & ONWORKLIST)
598
panic("worklist_insert: already on list");
599
item->wk_state |= ONWORKLIST;
600
LIST_INSERT_HEAD(head, item, wk_list);
604
worklist_remove(item)
605
struct worklist *item;
608
mtx_assert(&lk, MA_OWNED);
609
if ((item->wk_state & ONWORKLIST) == 0)
610
panic("worklist_remove: not on list");
611
item->wk_state &= ~ONWORKLIST;
612
LIST_REMOVE(item, wk_list);
617
* Routines for tracking and managing workitems.
619
static void workitem_free(struct worklist *, int);
620
static void workitem_alloc(struct worklist *, int, struct mount *);
622
#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))
625
workitem_free(item, type)
626
struct worklist *item;
629
struct ufsmount *ump;
630
mtx_assert(&lk, MA_OWNED);
633
if (item->wk_state & ONWORKLIST)
634
panic("workitem_free: still on list");
635
if (item->wk_type != type)
636
panic("workitem_free: type mismatch");
638
ump = VFSTOUFS(item->wk_mp);
639
if (--ump->softdep_deps == 0 && ump->softdep_req)
640
wakeup(&ump->softdep_deps);
641
FREE(item, DtoM(type));
645
workitem_alloc(item, type, mp)
646
struct worklist *item;
650
item->wk_type = type;
654
VFSTOUFS(mp)->softdep_deps++;
655
VFSTOUFS(mp)->softdep_accdeps++;
660
* Workitem queue management
662
static int max_softdeps; /* maximum number of structs before slowdown */
663
static int maxindirdeps = 50; /* max number of indirdeps before slowdown */
664
static int tickdelay = 2; /* number of ticks to pause during slowdown */
665
static int proc_waiting; /* tracks whether we have a timeout posted */
666
static int *stat_countp; /* statistic to count in proc_waiting timeout */
667
static struct callout softdep_callout;
668
static int req_pending;
669
static int req_clear_inodedeps; /* syncer process flush some inodedeps */
670
#define FLUSH_INODES 1
671
static int req_clear_remove; /* syncer process flush some freeblks */
672
#define FLUSH_REMOVE 2
673
#define FLUSH_REMOVE_WAIT 3
674
static long num_freeblkdep; /* number of freeblks workitems allocated */
679
static int stat_worklist_push; /* number of worklist cleanups */
680
static int stat_blk_limit_push; /* number of times block limit neared */
681
static int stat_ino_limit_push; /* number of times inode limit neared */
682
static int stat_blk_limit_hit; /* number of times block slowdown imposed */
683
static int stat_ino_limit_hit; /* number of times inode slowdown imposed */
684
static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
685
static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
686
static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
687
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
688
static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
690
SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
691
SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
692
SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, "");
693
SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
694
SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
695
SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
696
SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
697
SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
698
SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
699
SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
700
SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
701
SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
702
SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
703
/* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */
705
SYSCTL_DECL(_vfs_ffs);
707
static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */
708
SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
709
&compute_summary_at_mount, 0, "Recompute summary at mount");
711
static struct proc *softdepproc;
712
static struct kproc_desc softdep_kp = {
717
SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
725
struct ufsmount *ump;
731
td->td_pflags |= TDP_NORUNNINGBUF;
734
kthread_suspend_check(softdepproc);
735
vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
738
* If requested, try removing inode or removal dependencies.
740
if (req_clear_inodedeps) {
742
req_clear_inodedeps -= 1;
743
wakeup_one(&proc_waiting);
745
if (req_clear_remove) {
747
req_clear_remove -= 1;
748
wakeup_one(&proc_waiting);
751
VFS_UNLOCK_GIANT(vfslocked);
753
mtx_lock(&mountlist_mtx);
754
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
755
nmp = TAILQ_NEXT(mp, mnt_list);
756
if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
758
if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
760
vfslocked = VFS_LOCK_GIANT(mp);
761
softdep_process_worklist(mp, 0);
763
remaining += ump->softdep_on_worklist -
764
ump->softdep_on_worklist_inprogress;
765
VFS_UNLOCK_GIANT(vfslocked);
766
mtx_lock(&mountlist_mtx);
767
nmp = TAILQ_NEXT(mp, mnt_list);
770
mtx_unlock(&mountlist_mtx);
775
msleep(&req_pending, &lk, PVM, "sdflush", hz);
782
softdep_speedup(void)
785
mtx_assert(&lk, MA_OWNED);
786
if (req_pending == 0) {
788
wakeup(&req_pending);
791
return speedup_syncer();
795
* Add an item to the end of the work queue.
796
* This routine requires that the lock be held.
797
* This is the only routine that adds items to the list.
798
* The following routine is the only one that removes items
799
* and does so in order from first to last.
805
struct ufsmount *ump;
807
mtx_assert(&lk, MA_OWNED);
808
ump = VFSTOUFS(wk->wk_mp);
809
if (wk->wk_state & ONWORKLIST)
810
panic("add_to_worklist: already on list");
811
wk->wk_state |= ONWORKLIST;
812
if (LIST_EMPTY(&ump->softdep_workitem_pending))
813
LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
815
LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
816
ump->softdep_worklist_tail = wk;
817
ump->softdep_on_worklist += 1;
821
* Process that runs once per second to handle items in the background queue.
823
* Note that we ensure that everything is done in the order in which they
824
* appear in the queue. The code below depends on this property to ensure
825
* that blocks of a file are freed before the inode itself is freed. This
826
* ordering ensures that no new <vfsid, inum, lbn> triples will be generated
827
* until all the old ones have been purged from the dependency lists.
830
softdep_process_worklist(mp, full)
834
struct thread *td = curthread;
835
int cnt, matchcnt, loopcount;
836
struct ufsmount *ump;
839
KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
841
* Record the process identifier of our caller so that we can give
842
* this process preferential treatment in request_cleanup below.
848
starttime = time_second;
849
while (ump->softdep_on_worklist > 0) {
850
if ((cnt = process_worklist_item(mp, 0)) == -1)
855
* If requested, try removing inode or removal dependencies.
857
if (req_clear_inodedeps) {
859
req_clear_inodedeps -= 1;
860
wakeup_one(&proc_waiting);
862
if (req_clear_remove) {
864
req_clear_remove -= 1;
865
wakeup_one(&proc_waiting);
868
* We do not generally want to stop for buffer space, but if
869
* we are really being a buffer hog, we will stop and wait.
871
if (loopcount++ % 128 == 0) {
878
* Never allow processing to run for more than one
879
* second. Otherwise the other mountpoints may get
880
* excessively backlogged.
882
if (!full && starttime != time_second) {
892
* Process one item on the worklist.
895
process_worklist_item(mp, flags)
899
struct worklist *wk, *wkend;
900
struct ufsmount *ump;
904
mtx_assert(&lk, MA_OWNED);
905
KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
907
* If we are being called because of a process doing a
908
* copy-on-write, then it is not safe to write as we may
909
* recurse into the copy-on-write routine.
911
if (curthread->td_pflags & TDP_COWINPROGRESS)
914
* Normally we just process each item on the worklist in order.
915
* However, if we are in a situation where we cannot lock any
916
* inodes, we have to skip over any dirrem requests whose
917
* vnodes are resident and locked.
921
LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
922
if (wk->wk_state & INPROGRESS)
924
if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
926
wk->wk_state |= INPROGRESS;
927
ump->softdep_on_worklist_inprogress++;
929
ffs_vgetf(mp, WK_DIRREM(wk)->dm_oldinum,
930
LK_NOWAIT | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ);
932
wk->wk_state &= ~INPROGRESS;
933
ump->softdep_on_worklist_inprogress--;
940
* Remove the item to be processed. If we are removing the last
941
* item on the list, we need to recalculate the tail pointer.
942
* As this happens rarely and usually when the list is short,
943
* we just run down the list to find it rather than tracking it
947
if (wk == ump->softdep_worklist_tail) {
948
LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
949
if (LIST_NEXT(wkend, wk_list) == NULL)
951
ump->softdep_worklist_tail = wkend;
953
ump->softdep_on_worklist -= 1;
955
if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
956
panic("process_worklist_item: suspended filesystem");
958
switch (wk->wk_type) {
961
/* removal of a directory entry */
962
handle_workitem_remove(WK_DIRREM(wk), vp);
966
/* releasing blocks and/or fragments from a file */
967
handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
971
/* releasing a fragment when replaced as a file grows */
972
handle_workitem_freefrag(WK_FREEFRAG(wk));
976
/* releasing an inode when its link count drops to 0 */
977
handle_workitem_freefile(WK_FREEFILE(wk));
981
panic("%s_process_worklist: Unknown type %s",
982
"softdep", TYPENAME(wk->wk_type));
985
vn_finished_secondary_write(mp);
991
* Move dependencies from one buffer to another.
994
softdep_move_dependencies(oldbp, newbp)
998
struct worklist *wk, *wktail;
1000
if (!LIST_EMPTY(&newbp->b_dep))
1001
panic("softdep_move_dependencies: need merge code");
1004
while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1005
LIST_REMOVE(wk, wk_list);
1007
LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1009
LIST_INSERT_AFTER(wktail, wk, wk_list);
1016
* Purge the work list of all items associated with a particular mount point.
1019
softdep_flushworklist(oldmnt, countp, td)
1020
struct mount *oldmnt;
1024
struct vnode *devvp;
1025
int count, error = 0;
1026
struct ufsmount *ump;
1029
* Alternately flush the block device associated with the mount
1030
* point and process any dependencies that the flushing
1031
* creates. We continue until no more worklist dependencies
1035
ump = VFSTOUFS(oldmnt);
1036
devvp = ump->um_devvp;
1037
while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1039
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
1040
error = VOP_FSYNC(devvp, MNT_WAIT, td);
1041
VOP_UNLOCK(devvp, 0, td);
1049
softdep_waitidle(struct mount *mp)
1051
struct ufsmount *ump;
1057
for (i = 0; i < 10 && ump->softdep_deps; i++) {
1058
ump->softdep_req = 1;
1059
if (ump->softdep_on_worklist)
1060
panic("softdep_waitidle: work added after flush.");
1061
msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
1063
ump->softdep_req = 0;
1068
printf("softdep_waitidle: Failed to flush worklist for %p\n",
1076
* Flush all vnodes and worklist items associated with a specified mount point.
1079
softdep_flushfiles(oldmnt, flags, td)
1080
struct mount *oldmnt;
1084
int error, depcount, loopcnt, retry_flush_count, retry;
1087
retry_flush_count = 3;
1092
* Alternately flush the vnodes associated with the mount
1093
* point and process any dependencies that the flushing
1094
* creates. In theory, this loop can happen at most twice,
1095
* but we give it a few extra just to be sure.
1097
for (; loopcnt > 0; loopcnt--) {
1099
* Do another flush in case any vnodes were brought in
1100
* as part of the cleanup operations.
1102
if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
1104
if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1109
* If we are unmounting then it is an error to fail. If we
1110
* are simply trying to downgrade to read-only, then filesystem
1111
* activity can keep us busy forever, so we just fail with EBUSY.
1114
if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
1115
panic("softdep_flushfiles: looping");
1119
error = softdep_waitidle(oldmnt);
1121
if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
1124
KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
1125
("softdep_flushfiles: !MNTK_NOINSMNTQ"));
1126
if (oldmnt->mnt_nvnodelistsize > 0) {
1127
if (--retry_flush_count > 0) {
1133
MNT_IUNLOCK(oldmnt);
1142
* Structure hashing.
1144
* There are three types of structures that can be looked up:
1145
* 1) pagedep structures identified by mount point, inode number,
1146
* and logical block.
1147
* 2) inodedep structures identified by mount point and inode number.
1148
* 3) newblk structures identified by mount point and
1149
* physical block number.
1151
* The "pagedep" and "inodedep" dependency structures are hashed
1152
* separately from the file blocks and inodes to which they correspond.
1153
* This separation helps when the in-memory copy of an inode or
1154
* file block must be replaced. It also obviates the need to access
1155
* an inode or file page when simply updating (or de-allocating)
1156
* dependency structures. Lookup of newblk structures is needed to
1157
* find newly allocated blocks when trying to associate them with
1158
* their allocdirect or allocindir structure.
1160
* The lookup routines optionally create and hash a new instance when
1161
* an existing entry is not found.
1163
#define DEPALLOC 0x0001 /* allocate structure if lookup fails */
1164
#define NODELAY 0x0002 /* cannot do background work */
1167
* Structures and routines associated with pagedep caching.
1169
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
1170
u_long pagedep_hash; /* size of hash table - 1 */
1171
#define PAGEDEP_HASH(mp, inum, lbn) \
1172
(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
1176
pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
1177
struct pagedep_hashhead *pagedephd;
1182
struct pagedep **pagedeppp;
1184
struct pagedep *pagedep;
1186
LIST_FOREACH(pagedep, pagedephd, pd_hash)
1187
if (ino == pagedep->pd_ino &&
1188
lbn == pagedep->pd_lbn &&
1189
mp == pagedep->pd_list.wk_mp)
1192
*pagedeppp = pagedep;
1193
if ((flags & DEPALLOC) != 0 &&
1194
(pagedep->pd_state & ONWORKLIST) == 0)
1202
* Look up a pagedep. Return 1 if found, 0 if not found or found
1203
* when asked to allocate but not associated with any buffer.
1204
* If not found, allocate if DEPALLOC flag is passed.
1205
* Found or allocated entry is returned in pagedeppp.
1206
* This routine must be called with splbio interrupts blocked.
1209
pagedep_lookup(ip, lbn, flags, pagedeppp)
1213
struct pagedep **pagedeppp;
1215
struct pagedep *pagedep;
1216
struct pagedep_hashhead *pagedephd;
1221
mtx_assert(&lk, MA_OWNED);
1222
mp = ITOV(ip)->v_mount;
1223
pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
1225
ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
1226
if (*pagedeppp || (flags & DEPALLOC) == 0)
1229
MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep),
1230
M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
1231
workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
1233
ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
1235
WORKITEM_FREE(pagedep, D_PAGEDEP);
1238
pagedep->pd_ino = ip->i_number;
1239
pagedep->pd_lbn = lbn;
1240
LIST_INIT(&pagedep->pd_dirremhd);
1241
LIST_INIT(&pagedep->pd_pendinghd);
1242
for (i = 0; i < DAHASHSZ; i++)
1243
LIST_INIT(&pagedep->pd_diraddhd[i]);
1244
LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
1245
*pagedeppp = pagedep;
1250
* Structures and routines associated with inodedep caching.
1252
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
1253
static u_long inodedep_hash; /* size of hash table - 1 */
1254
static long num_inodedep; /* number of inodedep allocated */
1255
#define INODEDEP_HASH(fs, inum) \
1256
(&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
1259
inodedep_find(inodedephd, fs, inum, inodedeppp)
1260
struct inodedep_hashhead *inodedephd;
1263
struct inodedep **inodedeppp;
1265
struct inodedep *inodedep;
1267
LIST_FOREACH(inodedep, inodedephd, id_hash)
1268
if (inum == inodedep->id_ino && fs == inodedep->id_fs)
1271
*inodedeppp = inodedep;
1279
* Look up an inodedep. Return 1 if found, 0 if not found.
1280
* If not found, allocate if DEPALLOC flag is passed.
1281
* Found or allocated entry is returned in inodedeppp.
1282
* This routine must be called with splbio interrupts blocked.
1285
inodedep_lookup(mp, inum, flags, inodedeppp)
1289
struct inodedep **inodedeppp;
1291
struct inodedep *inodedep;
1292
struct inodedep_hashhead *inodedephd;
1295
mtx_assert(&lk, MA_OWNED);
1296
fs = VFSTOUFS(mp)->um_fs;
1297
inodedephd = INODEDEP_HASH(fs, inum);
1299
if (inodedep_find(inodedephd, fs, inum, inodedeppp))
1301
if ((flags & DEPALLOC) == 0)
1304
* If we are over our limit, try to improve the situation.
1306
if (num_inodedep > max_softdeps && (flags & NODELAY) == 0)
1307
request_cleanup(mp, FLUSH_INODES);
1309
MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
1310
M_INODEDEP, M_SOFTDEP_FLAGS);
1311
workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
1313
if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
1314
WORKITEM_FREE(inodedep, D_INODEDEP);
1318
inodedep->id_fs = fs;
1319
inodedep->id_ino = inum;
1320
inodedep->id_state = ALLCOMPLETE;
1321
inodedep->id_nlinkdelta = 0;
1322
inodedep->id_savedino1 = NULL;
1323
inodedep->id_savedsize = -1;
1324
inodedep->id_savedextsize = -1;
1325
inodedep->id_buf = NULL;
1326
LIST_INIT(&inodedep->id_pendinghd);
1327
LIST_INIT(&inodedep->id_inowait);
1328
LIST_INIT(&inodedep->id_bufwait);
1329
TAILQ_INIT(&inodedep->id_inoupdt);
1330
TAILQ_INIT(&inodedep->id_newinoupdt);
1331
TAILQ_INIT(&inodedep->id_extupdt);
1332
TAILQ_INIT(&inodedep->id_newextupdt);
1333
LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
1334
*inodedeppp = inodedep;
1339
* Structures and routines associated with newblk caching.
1341
LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1342
u_long newblk_hash; /* size of hash table - 1 */
1343
#define NEWBLK_HASH(fs, inum) \
1344
(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1347
newblk_find(newblkhd, fs, newblkno, newblkpp)
1348
struct newblk_hashhead *newblkhd;
1350
ufs2_daddr_t newblkno;
1351
struct newblk **newblkpp;
1353
struct newblk *newblk;
1355
LIST_FOREACH(newblk, newblkhd, nb_hash)
1356
if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
1367
* Look up a newblk. Return 1 if found, 0 if not found.
1368
* If not found, allocate if DEPALLOC flag is passed.
1369
* Found or allocated entry is returned in newblkpp.
1372
newblk_lookup(fs, newblkno, flags, newblkpp)
1374
ufs2_daddr_t newblkno;
1376
struct newblk **newblkpp;
1378
struct newblk *newblk;
1379
struct newblk_hashhead *newblkhd;
1381
newblkhd = NEWBLK_HASH(fs, newblkno);
1382
if (newblk_find(newblkhd, fs, newblkno, newblkpp))
1384
if ((flags & DEPALLOC) == 0)
1387
MALLOC(newblk, struct newblk *, sizeof(struct newblk),
1388
M_NEWBLK, M_SOFTDEP_FLAGS);
1390
if (newblk_find(newblkhd, fs, newblkno, newblkpp)) {
1391
FREE(newblk, M_NEWBLK);
1394
newblk->nb_state = 0;
1396
newblk->nb_newblkno = newblkno;
1397
LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1403
* Executed during filesystem system initialization before
1404
* mounting any filesystems.
1407
softdep_initialize()
1410
LIST_INIT(&mkdirlisthd);
1411
max_softdeps = desiredvnodes * 4;
1412
pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
1414
inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
1415
newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
1417
/* initialise bioops hack */
1418
bioops.io_start = softdep_disk_io_initiation;
1419
bioops.io_complete = softdep_disk_write_complete;
1420
bioops.io_deallocate = softdep_deallocate_dependencies;
1421
bioops.io_countdeps = softdep_count_dependencies;
1423
/* Initialize the callout with an mtx. */
1424
callout_init_mtx(&softdep_callout, &lk, 0);
1428
* Executed after all filesystems have been unmounted during
1429
* filesystem module unload.
1432
softdep_uninitialize()
1435
callout_drain(&softdep_callout);
1436
hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
1437
hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
1438
hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
1442
* Called at mount time to notify the dependency code that a
1443
* filesystem wishes to use it.
1446
softdep_mount(devvp, mp, fs, cred)
1447
struct vnode *devvp;
1452
struct csum_total cstotal;
1453
struct ufsmount *ump;
1459
mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
1460
if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
1461
mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
1467
LIST_INIT(&ump->softdep_workitem_pending);
1468
ump->softdep_worklist_tail = NULL;
1469
ump->softdep_on_worklist = 0;
1470
ump->softdep_deps = 0;
1472
* When doing soft updates, the counters in the
1473
* superblock may have gotten out of sync. Recomputation
1474
* can take a long time and can be deferred for background
1475
* fsck. However, the old behavior of scanning the cylinder
1476
* groups and recalculating them at mount time is available
1477
* by setting vfs.ffs.compute_summary_at_mount to one.
1479
if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
1481
bzero(&cstotal, sizeof cstotal);
1482
for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1483
if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
1484
fs->fs_cgsize, cred, &bp)) != 0) {
1488
cgp = (struct cg *)bp->b_data;
1489
cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1490
cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1491
cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1492
cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1493
fs->fs_cs(fs, cyl) = cgp->cg_cs;
1497
if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1498
printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
1500
bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
/*
 * Protecting the freemaps (or bitmaps).
 *
 * To eliminate the need to execute fsck before mounting a filesystem
 * after a power failure, one must (conservatively) guarantee that the
 * on-disk copy of the bitmaps never indicate that a live inode or block is
 * free.  So, when a block or inode is allocated, the bitmap should be
 * updated (on disk) before any new pointers.  When a block or inode is
 * freed, the bitmap should not be updated until all pointers have been
 * reset.  The latter dependency is handled by the delayed de-allocation
 * approach described below for block and inode de-allocation.  The former
 * dependency is handled by calling the following procedure when a block or
 * inode is allocated. When an inode is allocated an "inodedep" is created
 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
 * Each "inodedep" is also inserted into the hash indexing structure so
 * that any additional link additions can be made dependent on the inode
 * allocation.
 *
 * The ufs filesystem maintains a number of free block counts (e.g., per
 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
 * in addition to the bitmaps.  These counts are used to improve efficiency
 * during allocation and therefore must be consistent with the bitmaps.
 * There is no convenient way to guarantee post-crash consistency of these
 * counts with simple update ordering, for two main reasons: (1) The counts
 * and bitmaps for a single cylinder group block are not in the same disk
 * sector.  If a disk write is interrupted (e.g., by power failure), one may
 * be written and the other not.  (2) Some of the counts are located in the
 * superblock rather than the cylinder group block. So, we focus our soft
 * updates implementation on protecting the bitmaps.  When mounting a
 * filesystem, we recompute the auxiliary counts from the bitmaps.
 */
1537
* Called just after updating the cylinder group block to allocate an inode.
1540
softdep_setup_inomapdep(bp, ip, newinum)
1541
struct buf *bp; /* buffer for cylgroup block with inode map */
1542
struct inode *ip; /* inode related to allocation */
1543
ino_t newinum; /* new inode number being allocated */
1545
struct inodedep *inodedep;
1546
struct bmsafemap *bmsafemap;
1549
* Create a dependency for the newly allocated inode.
1550
* Panic if it already exists as something is seriously wrong.
1551
* Otherwise add it to the dependency list for the buffer holding
1552
* the cylinder group map from which it was allocated.
1555
if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
1557
panic("softdep_setup_inomapdep: dependency for new inode "
1559
inodedep->id_buf = bp;
1560
inodedep->id_state &= ~DEPCOMPLETE;
1561
bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
1562
LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1567
* Called just after updating the cylinder group block to
1568
* allocate block or fragment.
1571
softdep_setup_blkmapdep(bp, mp, newblkno)
1572
struct buf *bp; /* buffer for cylgroup block with block map */
1573
struct mount *mp; /* filesystem doing allocation */
1574
ufs2_daddr_t newblkno; /* number of newly allocated block */
1576
struct newblk *newblk;
1577
struct bmsafemap *bmsafemap;
1580
fs = VFSTOUFS(mp)->um_fs;
1582
* Create a dependency for the newly allocated block.
1583
* Add it to the dependency list for the buffer holding
1584
* the cylinder group map from which it was allocated.
1587
if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1588
panic("softdep_setup_blkmapdep: found block");
1589
newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp);
1590
LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1595
* Find the bmsafemap associated with a cylinder group buffer.
1596
* If none exists, create one. The buffer must be locked when
1597
* this routine is called and this routine must be called with
1598
* splbio interrupts blocked.
1600
static struct bmsafemap *
1601
bmsafemap_lookup(mp, bp)
1605
struct bmsafemap *bmsafemap;
1606
struct worklist *wk;
1608
mtx_assert(&lk, MA_OWNED);
1609
LIST_FOREACH(wk, &bp->b_dep, wk_list)
1610
if (wk->wk_type == D_BMSAFEMAP)
1611
return (WK_BMSAFEMAP(wk));
1613
MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
1614
M_BMSAFEMAP, M_SOFTDEP_FLAGS);
1615
workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
1616
bmsafemap->sm_buf = bp;
1617
LIST_INIT(&bmsafemap->sm_allocdirecthd);
1618
LIST_INIT(&bmsafemap->sm_allocindirhd);
1619
LIST_INIT(&bmsafemap->sm_inodedephd);
1620
LIST_INIT(&bmsafemap->sm_newblkhd);
1622
WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
/*
 * Direct block allocation dependencies.
 *
 * When a new block is allocated, the corresponding disk locations must be
 * initialized (with zeros or new data) before the on-disk inode points to
 * them.  Also, the freemap from which the block was allocated must be
 * updated (on disk) before the inode's pointer. These two dependencies are
 * independent of each other and are needed for all file blocks and indirect
 * blocks that are pointed to directly by the inode.  Just before the
 * "in-core" version of the inode is updated with a newly allocated block
 * number, a procedure (below) is called to setup allocation dependency
 * structures.  These structures are removed when the corresponding
 * dependencies are satisfied or when the block allocation becomes obsolete
 * (i.e., the file is deleted, the block is de-allocated, or the block is a
 * fragment that gets upgraded).  All of these cases are handled in
 * procedures described later.
 *
 * When a file extension causes a fragment to be upgraded, either to a larger
 * fragment or to a full block, the on-disk location may change (if the
 * previous fragment could not simply be extended). In this case, the old
 * fragment must be de-allocated, but not until after the inode's pointer has
 * been updated. In most cases, this is handled by later procedures, which
 * will construct a "freefrag" structure to be added to the workitem queue
 * when the inode update is complete (or obsolete).  The main exception to
 * this is when an allocation occurs while a pending allocation dependency
 * (for the same block pointer) remains.  This case is handled in the main
 * allocation dependency setup procedure by immediately freeing the
 * unreferenced fragments.
 */
1656
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1657
struct inode *ip; /* inode to which block is being added */
1658
ufs_lbn_t lbn; /* block pointer within inode */
1659
ufs2_daddr_t newblkno; /* disk block number being added */
1660
ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */
1661
long newsize; /* size of new block */
1662
long oldsize; /* size of new block */
1663
struct buf *bp; /* bp for allocated block */
1665
struct allocdirect *adp, *oldadp;
1666
struct allocdirectlst *adphead;
1667
struct bmsafemap *bmsafemap;
1668
struct inodedep *inodedep;
1669
struct pagedep *pagedep;
1670
struct newblk *newblk;
1673
mp = UFSTOVFS(ip->i_ump);
1674
MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1675
M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
1676
workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
1678
adp->ad_newblkno = newblkno;
1679
adp->ad_oldblkno = oldblkno;
1680
adp->ad_newsize = newsize;
1681
adp->ad_oldsize = oldsize;
1682
adp->ad_state = ATTACHED;
1683
LIST_INIT(&adp->ad_newdirblk);
1684
if (newblkno == oldblkno)
1685
adp->ad_freefrag = NULL;
1687
adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1690
if (lbn >= NDADDR) {
1691
/* allocating an indirect block */
1693
panic("softdep_setup_allocdirect: non-zero indir");
1696
* Allocating a direct block.
1698
* If we are allocating a directory block, then we must
1699
* allocate an associated pagedep to track additions and
1702
if ((ip->i_mode & IFMT) == IFDIR &&
1703
pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1704
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1706
if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1707
panic("softdep_setup_allocdirect: lost block");
1708
if (newblk->nb_state == DEPCOMPLETE) {
1709
adp->ad_state |= DEPCOMPLETE;
1712
bmsafemap = newblk->nb_bmsafemap;
1713
adp->ad_buf = bmsafemap->sm_buf;
1714
LIST_REMOVE(newblk, nb_deps);
1715
LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1717
LIST_REMOVE(newblk, nb_hash);
1718
FREE(newblk, M_NEWBLK);
1720
inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1721
adp->ad_inodedep = inodedep;
1722
WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1724
* The list of allocdirects must be kept in sorted and ascending
1725
* order so that the rollback routines can quickly determine the
1726
* first uncommitted block (the size of the file stored on disk
1727
* ends at the end of the lowest committed fragment, or if there
1728
* are no fragments, at the end of the highest committed block).
1729
* Since files generally grow, the typical case is that the new
1730
* block is to be added at the end of the list. We speed this
1731
* special case by checking against the last allocdirect in the
1732
* list before laboriously traversing the list looking for the
1735
adphead = &inodedep->id_newinoupdt;
1736
oldadp = TAILQ_LAST(adphead, allocdirectlst);
1737
if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1738
/* insert at end of list */
1739
TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1740
if (oldadp != NULL && oldadp->ad_lbn == lbn)
1741
allocdirect_merge(adphead, adp, oldadp);
1745
TAILQ_FOREACH(oldadp, adphead, ad_next) {
1746
if (oldadp->ad_lbn >= lbn)
1750
panic("softdep_setup_allocdirect: lost entry");
1751
/* insert in middle of list */
1752
TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1753
if (oldadp->ad_lbn == lbn)
1754
allocdirect_merge(adphead, adp, oldadp);
1759
* Replace an old allocdirect dependency with a newer one.
1760
* This routine must be called with splbio interrupts blocked.
1763
allocdirect_merge(adphead, newadp, oldadp)
1764
struct allocdirectlst *adphead; /* head of list holding allocdirects */
1765
struct allocdirect *newadp; /* allocdirect being added */
1766
struct allocdirect *oldadp; /* existing allocdirect being checked */
1768
struct worklist *wk;
1769
struct freefrag *freefrag;
1770
struct newdirblk *newdirblk;
1772
mtx_assert(&lk, MA_OWNED);
1773
if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1774
newadp->ad_oldsize != oldadp->ad_newsize ||
1775
newadp->ad_lbn >= NDADDR)
1776
panic("%s %jd != new %jd || old size %ld != new %ld",
1777
"allocdirect_merge: old blkno",
1778
(intmax_t)newadp->ad_oldblkno,
1779
(intmax_t)oldadp->ad_newblkno,
1780
newadp->ad_oldsize, oldadp->ad_newsize);
1781
newadp->ad_oldblkno = oldadp->ad_oldblkno;
1782
newadp->ad_oldsize = oldadp->ad_oldsize;
1784
* If the old dependency had a fragment to free or had never
1785
* previously had a block allocated, then the new dependency
1786
* can immediately post its freefrag and adopt the old freefrag.
1787
* This action is done by swapping the freefrag dependencies.
1788
* The new dependency gains the old one's freefrag, and the
1789
* old one gets the new one and then immediately puts it on
1790
* the worklist when it is freed by free_allocdirect. It is
1791
* not possible to do this swap when the old dependency had a
1792
* non-zero size but no previous fragment to free. This condition
1793
* arises when the new block is an extension of the old block.
1794
* Here, the first part of the fragment allocated to the new
1795
* dependency is part of the block currently claimed on disk by
1796
* the old dependency, so cannot legitimately be freed until the
1797
* conditions for the new dependency are fulfilled.
1799
if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1800
freefrag = newadp->ad_freefrag;
1801
newadp->ad_freefrag = oldadp->ad_freefrag;
1802
oldadp->ad_freefrag = freefrag;
1805
* If we are tracking a new directory-block allocation,
1806
* move it from the old allocdirect to the new allocdirect.
1808
if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
1809
newdirblk = WK_NEWDIRBLK(wk);
1810
WORKLIST_REMOVE(&newdirblk->db_list);
1811
if (!LIST_EMPTY(&oldadp->ad_newdirblk))
1812
panic("allocdirect_merge: extra newdirblk");
1813
WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
1815
free_allocdirect(adphead, oldadp, 0);
1819
* Allocate a new freefrag structure if needed.
1821
static struct freefrag *
1822
newfreefrag(ip, blkno, size)
1827
struct freefrag *freefrag;
1833
if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1834
panic("newfreefrag: frag size");
1835
MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
1836
M_FREEFRAG, M_SOFTDEP_FLAGS);
1837
workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
1838
freefrag->ff_inum = ip->i_number;
1839
freefrag->ff_blkno = blkno;
1840
freefrag->ff_fragsize = size;
1845
* This workitem de-allocates fragments that were replaced during
1846
* file block allocation.
1849
handle_workitem_freefrag(freefrag)
1850
struct freefrag *freefrag;
1852
struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
1854
ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
1855
freefrag->ff_fragsize, freefrag->ff_inum);
1857
WORKITEM_FREE(freefrag, D_FREEFRAG);
1862
* Set up a dependency structure for an external attributes data block.
1863
* This routine follows much of the structure of softdep_setup_allocdirect.
1864
* See the description of softdep_setup_allocdirect above for details.
1867
softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1870
ufs2_daddr_t newblkno;
1871
ufs2_daddr_t oldblkno;
1876
struct allocdirect *adp, *oldadp;
1877
struct allocdirectlst *adphead;
1878
struct bmsafemap *bmsafemap;
1879
struct inodedep *inodedep;
1880
struct newblk *newblk;
1883
mp = UFSTOVFS(ip->i_ump);
1884
MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1885
M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
1886
workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
1888
adp->ad_newblkno = newblkno;
1889
adp->ad_oldblkno = oldblkno;
1890
adp->ad_newsize = newsize;
1891
adp->ad_oldsize = oldsize;
1892
adp->ad_state = ATTACHED | EXTDATA;
1893
LIST_INIT(&adp->ad_newdirblk);
1894
if (newblkno == oldblkno)
1895
adp->ad_freefrag = NULL;
1897
adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1900
if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1901
panic("softdep_setup_allocext: lost block");
1903
inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1904
adp->ad_inodedep = inodedep;
1906
if (newblk->nb_state == DEPCOMPLETE) {
1907
adp->ad_state |= DEPCOMPLETE;
1910
bmsafemap = newblk->nb_bmsafemap;
1911
adp->ad_buf = bmsafemap->sm_buf;
1912
LIST_REMOVE(newblk, nb_deps);
1913
LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1915
LIST_REMOVE(newblk, nb_hash);
1916
FREE(newblk, M_NEWBLK);
1918
WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1920
panic("softdep_setup_allocext: lbn %lld > NXADDR",
1923
* The list of allocdirects must be kept in sorted and ascending
1924
* order so that the rollback routines can quickly determine the
1925
* first uncommitted block (the size of the file stored on disk
1926
* ends at the end of the lowest committed fragment, or if there
1927
* are no fragments, at the end of the highest committed block).
1928
* Since files generally grow, the typical case is that the new
1929
* block is to be added at the end of the list. We speed this
1930
* special case by checking against the last allocdirect in the
1931
* list before laboriously traversing the list looking for the
1934
adphead = &inodedep->id_newextupdt;
1935
oldadp = TAILQ_LAST(adphead, allocdirectlst);
1936
if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1937
/* insert at end of list */
1938
TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1939
if (oldadp != NULL && oldadp->ad_lbn == lbn)
1940
allocdirect_merge(adphead, adp, oldadp);
1944
TAILQ_FOREACH(oldadp, adphead, ad_next) {
1945
if (oldadp->ad_lbn >= lbn)
1949
panic("softdep_setup_allocext: lost entry");
1950
/* insert in middle of list */
1951
TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1952
if (oldadp->ad_lbn == lbn)
1953
allocdirect_merge(adphead, adp, oldadp);
/*
 * Indirect block allocation dependencies.
 *
 * The same dependencies that exist for a direct block also exist when
 * a new block is allocated and pointed to by an entry in a block of
 * indirect pointers. The undo/redo states described above are also
 * used here. Because an indirect block contains many pointers that
 * may have dependencies, a second copy of the entire in-memory indirect
 * block is kept. The buffer cache copy is always completely up-to-date.
 * The second copy, which is used only as a source for disk writes,
 * contains only the safe pointers (i.e., those that have no remaining
 * update dependencies). The second copy is freed when all pointers
 * are safe. The cache is not allowed to replace indirect blocks with
 * pending update dependencies. If a buffer containing an indirect
 * block with dependencies is written, these routines will mark it
 * dirty again. It can only be successfully written once all the
 * dependencies are removed. The ffs_fsync routine in conjunction with
 * softdep_sync_metadata work together to get all the dependencies
 * removed so that a file can be successfully written to disk. Three
 * procedures are used when setting up indirect block pointer
 * dependencies. The division is necessary because of the organization
 * of the "balloc" routine and because of the distinction between file
 * pages and file metadata blocks.
 */
1983
* Allocate a new allocindir structure.
1985
static struct allocindir *
1986
newallocindir(ip, ptrno, newblkno, oldblkno)
1987
struct inode *ip; /* inode for file being extended */
1988
int ptrno; /* offset of pointer in indirect block */
1989
ufs2_daddr_t newblkno; /* disk block number being added */
1990
ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
1992
struct allocindir *aip;
1994
MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
1995
M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
1996
workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump));
1997
aip->ai_state = ATTACHED;
1998
aip->ai_offset = ptrno;
1999
aip->ai_newblkno = newblkno;
2000
aip->ai_oldblkno = oldblkno;
2001
aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
2006
* Called just before setting an indirect block pointer
2007
* to a newly allocated file page.
2010
softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
2011
struct inode *ip; /* inode for file being extended */
2012
ufs_lbn_t lbn; /* allocated block number within file */
2013
struct buf *bp; /* buffer with indirect blk referencing page */
2014
int ptrno; /* offset of pointer in indirect block */
2015
ufs2_daddr_t newblkno; /* disk block number being added */
2016
ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
2017
struct buf *nbp; /* buffer holding allocated page */
2019
struct allocindir *aip;
2020
struct pagedep *pagedep;
2022
ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
2023
aip = newallocindir(ip, ptrno, newblkno, oldblkno);
2026
* If we are allocating a directory page, then we must
2027
* allocate an associated pagedep to track additions and
2030
if ((ip->i_mode & IFMT) == IFDIR &&
2031
pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
2032
WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
2033
WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
2034
setup_allocindir_phase2(bp, ip, aip);
2039
* Called just before setting an indirect block pointer to a
2040
* newly allocated indirect block.
2043
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
2044
struct buf *nbp; /* newly allocated indirect block */
2045
struct inode *ip; /* inode for file being extended */
2046
struct buf *bp; /* indirect block referencing allocated block */
2047
int ptrno; /* offset of pointer in indirect block */
2048
ufs2_daddr_t newblkno; /* disk block number being added */
2050
struct allocindir *aip;
2052
ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
2053
aip = newallocindir(ip, ptrno, newblkno, 0);
2055
WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
2056
setup_allocindir_phase2(bp, ip, aip);
2061
* Called to finish the allocation of the "aip" allocated
2062
* by one of the two routines above.
2065
setup_allocindir_phase2(bp, ip, aip)
2066
struct buf *bp; /* in-memory copy of the indirect block */
2067
struct inode *ip; /* inode for file being extended */
2068
struct allocindir *aip; /* allocindir allocated by the above routines */
2070
struct worklist *wk;
2071
struct indirdep *indirdep, *newindirdep;
2072
struct bmsafemap *bmsafemap;
2073
struct allocindir *oldaip;
2074
struct freefrag *freefrag;
2075
struct newblk *newblk;
2078
mtx_assert(&lk, MA_OWNED);
2079
if (bp->b_lblkno >= 0)
2080
panic("setup_allocindir_phase2: not indir blk");
2081
for (indirdep = NULL, newindirdep = NULL; ; ) {
2082
LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2083
if (wk->wk_type != D_INDIRDEP)
2085
indirdep = WK_INDIRDEP(wk);
2088
if (indirdep == NULL && newindirdep) {
2089
indirdep = newindirdep;
2090
WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
2094
if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
2096
panic("setup_allocindir: lost block");
2097
if (newblk->nb_state == DEPCOMPLETE) {
2098
aip->ai_state |= DEPCOMPLETE;
2101
bmsafemap = newblk->nb_bmsafemap;
2102
aip->ai_buf = bmsafemap->sm_buf;
2103
LIST_REMOVE(newblk, nb_deps);
2104
LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
2107
LIST_REMOVE(newblk, nb_hash);
2108
FREE(newblk, M_NEWBLK);
2109
aip->ai_indirdep = indirdep;
2111
* Check to see if there is an existing dependency
2112
* for this block. If there is, merge the old
2113
* dependency into the new one.
2115
if (aip->ai_oldblkno == 0)
2119
LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
2120
if (oldaip->ai_offset == aip->ai_offset)
2123
if (oldaip != NULL) {
2124
if (oldaip->ai_newblkno != aip->ai_oldblkno)
2125
panic("setup_allocindir_phase2: blkno");
2126
aip->ai_oldblkno = oldaip->ai_oldblkno;
2127
freefrag = aip->ai_freefrag;
2128
aip->ai_freefrag = oldaip->ai_freefrag;
2129
oldaip->ai_freefrag = NULL;
2130
free_allocindir(oldaip, NULL);
2132
LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
2133
if (ip->i_ump->um_fstype == UFS1)
2134
((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
2135
[aip->ai_offset] = aip->ai_oldblkno;
2137
((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
2138
[aip->ai_offset] = aip->ai_oldblkno;
2140
if (freefrag != NULL)
2141
handle_workitem_freefrag(freefrag);
2145
newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
2146
brelse(newindirdep->ir_savebp);
2148
WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
2157
MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
2158
M_INDIRDEP, M_SOFTDEP_FLAGS);
2159
workitem_alloc(&newindirdep->ir_list, D_INDIRDEP,
2160
UFSTOVFS(ip->i_ump));
2161
newindirdep->ir_state = ATTACHED;
2162
if (ip->i_ump->um_fstype == UFS1)
2163
newindirdep->ir_state |= UFS1FMT;
2164
LIST_INIT(&newindirdep->ir_deplisthd);
2165
LIST_INIT(&newindirdep->ir_donehd);
2166
if (bp->b_blkno == bp->b_lblkno) {
2167
ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
2169
bp->b_blkno = blkno;
2171
newindirdep->ir_savebp =
2172
getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
2173
BUF_KERNPROC(newindirdep->ir_savebp);
2174
bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
/*
 * Block de-allocation dependencies.
 *
 * When blocks are de-allocated, the on-disk pointers must be nullified before
 * the blocks are made available for use by other files.  (The true
 * requirement is that old pointers must be nullified before new on-disk
 * pointers are set.  We chose this slightly more stringent requirement to
 * reduce complexity.) Our implementation handles this dependency by updating
 * the inode (or indirect block) appropriately but delaying the actual block
 * de-allocation (i.e., freemap and free space count manipulation) until
 * after the updated versions reach stable storage.  After the disk is
 * updated, the blocks can be safely de-allocated whenever it is convenient.
 * This implementation handles only the common case of reducing a file's
 * length to zero. Other cases are handled by the conventional synchronous
 * write approach.
 *
 * The ffs implementation with which we worked double-checks
 * the state of the block pointers and file size as it reduces
 * a file's length.  Some of this code is replicated here in our
 * soft updates implementation.  The freeblks->fb_chkcnt field is
 * used to transfer a part of this information to the procedure
 * that eventually de-allocates the blocks.
 *
 * This routine should be called from the routine that shortens
 * a file's length, before the inode's size or block pointers
 * are modified. It will save the block pointer information for
 * later release and zero the inode so that the calling routine
 * is not forced to write the inode to stable storage prior to
 * de-allocating the blocks.
 */
2209
softdep_setup_freeblocks(ip, length, flags)
2210
struct inode *ip; /* The inode whose length is to be reduced */
2211
off_t length; /* The new length for the file */
2212
int flags; /* IO_EXT and/or IO_NORMAL */
2214
struct freeblks *freeblks;
2215
struct inodedep *inodedep;
2216
struct allocdirect *adp;
2220
ufs2_daddr_t extblocks, datablocks;
2222
int i, delay, error;
2225
mp = UFSTOVFS(ip->i_ump);
2227
panic("softdep_setup_freeblocks: non-zero length");
2228
MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
2229
M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
2230
workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
2231
freeblks->fb_state = ATTACHED;
2232
freeblks->fb_uid = ip->i_uid;
2233
freeblks->fb_previousinum = ip->i_number;
2234
freeblks->fb_devvp = ip->i_devvp;
2239
if (fs->fs_magic == FS_UFS2_MAGIC)
2240
extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
2241
datablocks = DIP(ip, i_blocks) - extblocks;
2242
if ((flags & IO_NORMAL) == 0) {
2243
freeblks->fb_oldsize = 0;
2244
freeblks->fb_chkcnt = 0;
2246
freeblks->fb_oldsize = ip->i_size;
2248
DIP_SET(ip, i_size, 0);
2249
freeblks->fb_chkcnt = datablocks;
2250
for (i = 0; i < NDADDR; i++) {
2251
freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
2252
DIP_SET(ip, i_db[i], 0);
2254
for (i = 0; i < NIADDR; i++) {
2255
freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
2256
DIP_SET(ip, i_ib[i], 0);
2259
* If the file was removed, then the space being freed was
2260
* accounted for then (see softdep_releasefile()). If the
2261
* file is merely being truncated, then we account for it now.
2263
if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
2264
UFS_LOCK(ip->i_ump);
2265
fs->fs_pendingblocks += datablocks;
2266
UFS_UNLOCK(ip->i_ump);
2269
if ((flags & IO_EXT) == 0) {
2270
freeblks->fb_oldextsize = 0;
2272
freeblks->fb_oldextsize = ip->i_din2->di_extsize;
2273
ip->i_din2->di_extsize = 0;
2274
freeblks->fb_chkcnt += extblocks;
2275
for (i = 0; i < NXADDR; i++) {
2276
freeblks->fb_eblks[i] = ip->i_din2->di_extb[i];
2277
ip->i_din2->di_extb[i] = 0;
2280
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
2282
* Push the zero'ed inode to to its disk buffer so that we are free
2283
* to delete its dependencies below. Once the dependencies are gone
2284
* the buffer can be safely released.
2286
if ((error = bread(ip->i_devvp,
2287
fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
2288
(int)fs->fs_bsize, NOCRED, &bp)) != 0) {
2290
softdep_error("softdep_setup_freeblocks", error);
2292
if (ip->i_ump->um_fstype == UFS1)
2293
*((struct ufs1_dinode *)bp->b_data +
2294
ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
2296
*((struct ufs2_dinode *)bp->b_data +
2297
ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
2299
* Find and eliminate any inode dependencies.
2302
(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
2303
if ((inodedep->id_state & IOSTARTED) != 0)
2304
panic("softdep_setup_freeblocks: inode busy");
2306
* Add the freeblks structure to the list of operations that
2307
* must await the zero'ed inode being written to disk. If we
2308
* still have a bitmap dependency (delay == 0), then the inode
2309
* has never been written to disk, so we can process the
2310
* freeblks below once we have deleted the dependencies.
2312
delay = (inodedep->id_state & DEPCOMPLETE);
2314
WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
2316
* Because the file length has been truncated to zero, any
2317
* pending block allocation dependency structures associated
2318
* with this inode are obsolete and can simply be de-allocated.
2319
* We must first merge the two dependency lists to get rid of
2320
* any duplicate freefrag structures, then purge the merged list.
2321
* If we still have a bitmap dependency, then the inode has never
2322
* been written to disk, so we can free any fragments without delay.
2324
if (flags & IO_NORMAL) {
2325
merge_inode_lists(&inodedep->id_newinoupdt,
2326
&inodedep->id_inoupdt);
2327
while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
2328
free_allocdirect(&inodedep->id_inoupdt, adp, delay);
2330
if (flags & IO_EXT) {
2331
merge_inode_lists(&inodedep->id_newextupdt,
2332
&inodedep->id_extupdt);
2333
while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
2334
free_allocdirect(&inodedep->id_extupdt, adp, delay);
2339
* We must wait for any I/O in progress to finish so that
2340
* all potential buffers on the dirty list will be visible.
2341
* Once they are all there, walk the list and get rid of
2348
TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
2349
if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
2350
((flags & IO_NORMAL) == 0 &&
2351
(bp->b_xflags & BX_ALTDATA) == 0))
2353
if ((bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT)) == NULL)
2357
(void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
2358
deallocate_dependencies(bp, inodedep);
2360
bp->b_flags |= B_INVAL | B_NOCACHE;
2367
if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
2368
(void) free_inodedep(inodedep);
2371
freeblks->fb_state |= DEPCOMPLETE;
2373
* If the inode with zeroed block pointers is now on disk
2374
* we can start freeing blocks. Add freeblks to the worklist
2375
* instead of calling handle_workitem_freeblocks directly as
2376
* it is more likely that additional IO is needed to complete
2377
* the request here than in the !delay case.
2379
if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
2380
add_to_worklist(&freeblks->fb_list);
2385
* If the inode has never been written to disk (delay == 0),
2386
* then we can process the freeblks now that we have deleted
2390
handle_workitem_freeblocks(freeblks, 0);
/*
 * Reclaim any dependency structures from a buffer that is about to
 * be reallocated to a new vnode. The buffer must be locked, thus,
 * no I/O completion operations can occur while we are manipulating
 * its associated dependencies. The mutex is held so that other I/O's
 * associated with related dependencies do not occur.
 */
2401
deallocate_dependencies(bp, inodedep)
2403
struct inodedep *inodedep;
2405
struct worklist *wk;
2406
struct indirdep *indirdep;
2407
struct allocindir *aip;
2408
struct pagedep *pagedep;
2409
struct dirrem *dirrem;
2413
mtx_assert(&lk, MA_OWNED);
2414
while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2415
switch (wk->wk_type) {
2418
indirdep = WK_INDIRDEP(wk);
2420
* None of the indirect pointers will ever be visible,
2421
* so they can simply be tossed. GOINGAWAY ensures
2422
* that allocated pointers will be saved in the buffer
2423
* cache until they are freed. Note that they will
2424
* only be able to be found by their physical address
2425
* since the inode mapping the logical address will
2426
* be gone. The save buffer used for the safe copy
2427
* was allocated in setup_allocindir_phase2 using
2428
* the physical address so it could be used for this
2429
* purpose. Hence we swap the safe copy with the real
2430
* copy, allowing the safe copy to be freed and holding
2431
* on to the real copy for later use in indir_trunc.
2433
if (indirdep->ir_state & GOINGAWAY)
2434
panic("deallocate_dependencies: already gone");
2435
indirdep->ir_state |= GOINGAWAY;
2436
VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1;
2437
while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
2438
free_allocindir(aip, inodedep);
2439
if (bp->b_lblkno >= 0 ||
2440
bp->b_blkno != indirdep->ir_savebp->b_lblkno)
2441
panic("deallocate_dependencies: not indir");
2442
bcopy(bp->b_data, indirdep->ir_savebp->b_data,
2444
WORKLIST_REMOVE(wk);
2445
WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
2449
pagedep = WK_PAGEDEP(wk);
2451
* None of the directory additions will ever be
2452
* visible, so they can simply be tossed.
2454
for (i = 0; i < DAHASHSZ; i++)
2456
LIST_FIRST(&pagedep->pd_diraddhd[i])))
2458
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
2461
* Copy any directory remove dependencies to the list
2462
* to be processed after the zero'ed inode is written.
2463
* If the inode has already been written, then they
2464
* can be dumped directly onto the work list.
2466
LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
2467
LIST_REMOVE(dirrem, dm_next);
2468
dirrem->dm_dirinum = pagedep->pd_ino;
2469
if (inodedep == NULL ||
2470
(inodedep->id_state & ALLCOMPLETE) ==
2472
add_to_worklist(&dirrem->dm_list);
2474
WORKLIST_INSERT(&inodedep->id_bufwait,
2477
if ((pagedep->pd_state & NEWBLOCK) != 0) {
2478
LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
2479
if (wk->wk_type == D_NEWDIRBLK &&
2480
WK_NEWDIRBLK(wk)->db_pagedep ==
2484
WORKLIST_REMOVE(wk);
2485
free_newdirblk(WK_NEWDIRBLK(wk));
2487
panic("deallocate_dependencies: "
2490
WORKLIST_REMOVE(&pagedep->pd_list);
2491
LIST_REMOVE(pagedep, pd_hash);
2492
WORKITEM_FREE(pagedep, D_PAGEDEP);
2496
free_allocindir(WK_ALLOCINDIR(wk), inodedep);
2501
panic("deallocate_dependencies: Unexpected type %s",
2502
TYPENAME(wk->wk_type));
2506
panic("deallocate_dependencies: Unknown type %s",
2507
TYPENAME(wk->wk_type));
2514
* Free an allocdirect. Generate a new freefrag work request if appropriate.
2515
* This routine must be called with splbio interrupts blocked.
2518
free_allocdirect(adphead, adp, delay)
2519
struct allocdirectlst *adphead;
2520
struct allocdirect *adp;
2523
struct newdirblk *newdirblk;
2524
struct worklist *wk;
2526
mtx_assert(&lk, MA_OWNED);
2527
if ((adp->ad_state & DEPCOMPLETE) == 0)
2528
LIST_REMOVE(adp, ad_deps);
2529
TAILQ_REMOVE(adphead, adp, ad_next);
2530
if ((adp->ad_state & COMPLETE) == 0)
2531
WORKLIST_REMOVE(&adp->ad_list);
2532
if (adp->ad_freefrag != NULL) {
2534
WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2535
&adp->ad_freefrag->ff_list);
2537
add_to_worklist(&adp->ad_freefrag->ff_list);
2539
if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
2540
newdirblk = WK_NEWDIRBLK(wk);
2541
WORKLIST_REMOVE(&newdirblk->db_list);
2542
if (!LIST_EMPTY(&adp->ad_newdirblk))
2543
panic("free_allocdirect: extra newdirblk");
2545
WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2546
&newdirblk->db_list);
2548
free_newdirblk(newdirblk);
2550
WORKITEM_FREE(adp, D_ALLOCDIRECT);
2554
* Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
2555
* This routine must be called with splbio interrupts blocked.
2558
free_newdirblk(newdirblk)
2559
struct newdirblk *newdirblk;
2561
struct pagedep *pagedep;
2565
mtx_assert(&lk, MA_OWNED);
2567
* If the pagedep is still linked onto the directory buffer
2568
* dependency chain, then some of the entries on the
2569
* pd_pendinghd list may not be committed to disk yet. In
2570
* this case, we will simply clear the NEWBLOCK flag and
2571
* let the pd_pendinghd list be processed when the pagedep
2572
* is next written. If the pagedep is no longer on the buffer
2573
* dependency chain, then all the entries on the pd_pending
2574
* list are committed to disk and we can free them here.
2576
pagedep = newdirblk->db_pagedep;
2577
pagedep->pd_state &= ~NEWBLOCK;
2578
if ((pagedep->pd_state & ONWORKLIST) == 0)
2579
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
2582
* If no dependencies remain, the pagedep will be freed.
2584
for (i = 0; i < DAHASHSZ; i++)
2585
if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
2587
if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
2588
LIST_REMOVE(pagedep, pd_hash);
2589
WORKITEM_FREE(pagedep, D_PAGEDEP);
2591
WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2595
* Prepare an inode to be freed. The actual free operation is not
2596
* done until the zero'ed inode has been written to disk.
2599
softdep_freefile(pvp, ino, mode)
2604
struct inode *ip = VTOI(pvp);
2605
struct inodedep *inodedep;
2606
struct freefile *freefile;
2609
* This sets up the inode de-allocation dependency.
2611
MALLOC(freefile, struct freefile *, sizeof(struct freefile),
2612
M_FREEFILE, M_SOFTDEP_FLAGS);
2613
workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
2614
freefile->fx_mode = mode;
2615
freefile->fx_oldinum = ino;
2616
freefile->fx_devvp = ip->i_devvp;
2617
if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
2618
UFS_LOCK(ip->i_ump);
2619
ip->i_fs->fs_pendinginodes += 1;
2620
UFS_UNLOCK(ip->i_ump);
2624
* If the inodedep does not exist, then the zero'ed inode has
2625
* been written to disk. If the allocated inode has never been
2626
* written to disk, then the on-disk inode is zero'ed. In either
2627
* case we can free the file immediately.
2630
if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 ||
2631
check_inode_unwritten(inodedep)) {
2633
handle_workitem_freefile(freefile);
2636
WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2638
if (ip->i_number == ino)
2639
ip->i_flag |= IN_MODIFIED;
2643
* Check to see if an inode has never been written to disk. If
2644
* so free the inodedep and return success, otherwise return failure.
2645
* This routine must be called with splbio interrupts blocked.
2647
* If we still have a bitmap dependency, then the inode has never
2648
* been written to disk. Drop the dependency as it is no longer
2649
* necessary since the inode is being deallocated. We set the
2650
* ALLCOMPLETE flags since the bitmap now properly shows that the
2651
* inode is not allocated. Even if the inode is actively being
2652
* written, it has been rolled back to its zero'ed state, so we
2653
* are ensured that a zero inode is what is on the disk. For short
2654
* lived files, this change will usually result in removing all the
2655
* dependencies from the inode so that it can be freed immediately.
2658
check_inode_unwritten(inodedep)
2659
struct inodedep *inodedep;
2662
mtx_assert(&lk, MA_OWNED);
2663
if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2664
!LIST_EMPTY(&inodedep->id_pendinghd) ||
2665
!LIST_EMPTY(&inodedep->id_bufwait) ||
2666
!LIST_EMPTY(&inodedep->id_inowait) ||
2667
!TAILQ_EMPTY(&inodedep->id_inoupdt) ||
2668
!TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
2669
!TAILQ_EMPTY(&inodedep->id_extupdt) ||
2670
!TAILQ_EMPTY(&inodedep->id_newextupdt) ||
2671
inodedep->id_nlinkdelta != 0)
2675
* Another process might be in initiate_write_inodeblock_ufs[12]
2676
* trying to allocate memory without holding "Softdep Lock".
2678
if ((inodedep->id_state & IOSTARTED) != 0 &&
2679
inodedep->id_savedino1 == NULL)
2682
inodedep->id_state |= ALLCOMPLETE;
2683
LIST_REMOVE(inodedep, id_deps);
2684
inodedep->id_buf = NULL;
2685
if (inodedep->id_state & ONWORKLIST)
2686
WORKLIST_REMOVE(&inodedep->id_list);
2687
if (inodedep->id_savedino1 != NULL) {
2688
FREE(inodedep->id_savedino1, M_SAVEDINO);
2689
inodedep->id_savedino1 = NULL;
2691
if (free_inodedep(inodedep) == 0)
2692
panic("check_inode_unwritten: busy inode");
2697
* Try to free an inodedep structure. Return 1 if it could be freed.
2700
free_inodedep(inodedep)
2701
struct inodedep *inodedep;
2704
mtx_assert(&lk, MA_OWNED);
2705
if ((inodedep->id_state & ONWORKLIST) != 0 ||
2706
(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2707
!LIST_EMPTY(&inodedep->id_pendinghd) ||
2708
!LIST_EMPTY(&inodedep->id_bufwait) ||
2709
!LIST_EMPTY(&inodedep->id_inowait) ||
2710
!TAILQ_EMPTY(&inodedep->id_inoupdt) ||
2711
!TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
2712
!TAILQ_EMPTY(&inodedep->id_extupdt) ||
2713
!TAILQ_EMPTY(&inodedep->id_newextupdt) ||
2714
inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
2716
LIST_REMOVE(inodedep, id_hash);
2717
WORKITEM_FREE(inodedep, D_INODEDEP);
2723
* This workitem routine performs the block de-allocation.
2724
* The workitem is added to the pending list after the updated
2725
* inode block has been written to disk. As mentioned above,
2726
* checks regarding the number of blocks de-allocated (compared
2727
* to the number of blocks allocated for the file) are also
2728
* performed in this function.
2731
handle_workitem_freeblocks(freeblks, flags)
2732
struct freeblks *freeblks;
2738
struct ufsmount *ump;
2739
int i, nblocks, level, bsize;
2740
ufs2_daddr_t bn, blocksreleased = 0;
2741
int error, allerror = 0;
2742
ufs_lbn_t baselbns[NIADDR], tmpval;
2743
int fs_pendingblocks;
2745
ump = VFSTOUFS(freeblks->fb_list.wk_mp);
2747
fs_pendingblocks = 0;
2749
baselbns[0] = NDADDR;
2750
for (i = 1; i < NIADDR; i++) {
2751
tmpval *= NINDIR(fs);
2752
baselbns[i] = baselbns[i - 1] + tmpval;
2754
nblocks = btodb(fs->fs_bsize);
2757
* Release all extended attribute blocks or frags.
2759
if (freeblks->fb_oldextsize > 0) {
2760
for (i = (NXADDR - 1); i >= 0; i--) {
2761
if ((bn = freeblks->fb_eblks[i]) == 0)
2763
bsize = sblksize(fs, freeblks->fb_oldextsize, i);
2764
ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
2765
freeblks->fb_previousinum);
2766
blocksreleased += btodb(bsize);
2770
* Release all data blocks or frags.
2772
if (freeblks->fb_oldsize > 0) {
2774
* Indirect blocks first.
2776
for (level = (NIADDR - 1); level >= 0; level--) {
2777
if ((bn = freeblks->fb_iblks[level]) == 0)
2779
if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
2780
level, baselbns[level], &blocksreleased)) != 0)
2782
ffs_blkfree(ump, fs, freeblks->fb_devvp, bn,
2783
fs->fs_bsize, freeblks->fb_previousinum);
2784
fs_pendingblocks += nblocks;
2785
blocksreleased += nblocks;
2788
* All direct blocks or frags.
2790
for (i = (NDADDR - 1); i >= 0; i--) {
2791
if ((bn = freeblks->fb_dblks[i]) == 0)
2793
bsize = sblksize(fs, freeblks->fb_oldsize, i);
2794
ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
2795
freeblks->fb_previousinum);
2796
fs_pendingblocks += btodb(bsize);
2797
blocksreleased += btodb(bsize);
2801
fs->fs_pendingblocks -= fs_pendingblocks;
2804
* If we still have not finished background cleanup, then check
2805
* to see if the block count needs to be adjusted.
2807
if (freeblks->fb_chkcnt != blocksreleased &&
2808
(fs->fs_flags & FS_UNCLEAN) != 0 &&
2809
ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
2810
(flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ)
2813
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + \
2814
freeblks->fb_chkcnt - blocksreleased);
2815
ip->i_flag |= IN_CHANGE;
2820
if (freeblks->fb_chkcnt != blocksreleased &&
2821
((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
2822
printf("handle_workitem_freeblocks: block count\n");
2824
softdep_error("handle_workitem_freeblks", allerror);
2825
#endif /* INVARIANTS */
2828
WORKITEM_FREE(freeblks, D_FREEBLKS);
2834
* Release blocks associated with the inode ip and stored in the indirect
2835
* block dbn. If level is greater than SINGLE, the block is an indirect block
2836
* and recursive calls to indirtrunc must be used to cleanse other indirect
2840
indir_trunc(freeblks, dbn, level, lbn, countp)
2841
struct freeblks *freeblks;
2845
ufs2_daddr_t *countp;
2849
struct worklist *wk;
2850
struct indirdep *indirdep;
2851
struct ufsmount *ump;
2852
ufs1_daddr_t *bap1 = 0;
2853
ufs2_daddr_t nb, *bap2 = 0;
2855
int i, nblocks, ufs1fmt;
2856
int error, allerror = 0;
2857
int fs_pendingblocks;
2859
ump = VFSTOUFS(freeblks->fb_list.wk_mp);
2861
fs_pendingblocks = 0;
2863
for (i = level; i > 0; i--)
2864
lbnadd *= NINDIR(fs);
2866
* Get buffer of block pointers to be freed. This routine is not
2867
* called until the zero'ed inode has been written, so it is safe
2868
* to free blocks as they are encountered. Because the inode has
2869
* been zero'ed, calls to bmap on these blocks will fail. So, we
2870
* have to use the on-disk address and the block device for the
2871
* filesystem to look them up. If the file was deleted before its
2872
* indirect blocks were all written to disk, the routine that set
2873
* us up (deallocate_dependencies) will have arranged to leave
2874
* a complete copy of the indirect block in memory for our use.
2875
* Otherwise we have to read the blocks in from the disk.
2878
bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
2881
bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
2884
if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2885
if (wk->wk_type != D_INDIRDEP ||
2886
(indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2887
(indirdep->ir_state & GOINGAWAY) == 0)
2888
panic("indir_trunc: lost indirdep");
2889
WORKLIST_REMOVE(wk);
2890
WORKITEM_FREE(indirdep, D_INDIRDEP);
2891
if (!LIST_EMPTY(&bp->b_dep))
2892
panic("indir_trunc: dangling dep");
2893
ump->um_numindirdeps -= 1;
2901
error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
2909
* Recursively free indirect blocks.
2911
if (ump->um_fstype == UFS1) {
2913
bap1 = (ufs1_daddr_t *)bp->b_data;
2916
bap2 = (ufs2_daddr_t *)bp->b_data;
2918
nblocks = btodb(fs->fs_bsize);
2919
for (i = NINDIR(fs) - 1; i >= 0; i--) {
2927
if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
2928
level - 1, lbn + (i * lbnadd), countp)) != 0)
2931
ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize,
2932
freeblks->fb_previousinum);
2933
fs_pendingblocks += nblocks;
2937
fs->fs_pendingblocks -= fs_pendingblocks;
2939
bp->b_flags |= B_INVAL | B_NOCACHE;
2945
* Free an allocindir.
2946
* This routine must be called with splbio interrupts blocked.
2949
free_allocindir(aip, inodedep)
2950
struct allocindir *aip;
2951
struct inodedep *inodedep;
2953
struct freefrag *freefrag;
2955
mtx_assert(&lk, MA_OWNED);
2956
if ((aip->ai_state & DEPCOMPLETE) == 0)
2957
LIST_REMOVE(aip, ai_deps);
2958
if (aip->ai_state & ONWORKLIST)
2959
WORKLIST_REMOVE(&aip->ai_list);
2960
LIST_REMOVE(aip, ai_next);
2961
if ((freefrag = aip->ai_freefrag) != NULL) {
2962
if (inodedep == NULL)
2963
add_to_worklist(&freefrag->ff_list);
2965
WORKLIST_INSERT(&inodedep->id_bufwait,
2966
&freefrag->ff_list);
2968
WORKITEM_FREE(aip, D_ALLOCINDIR);
2972
* Directory entry addition dependencies.
2974
* When adding a new directory entry, the inode (with its incremented link
2975
* count) must be written to disk before the directory entry's pointer to it.
2976
* Also, if the inode is newly allocated, the corresponding freemap must be
2977
* updated (on disk) before the directory entry's pointer. These requirements
2978
* are met via undo/redo on the directory entry's pointer, which consists
2979
* simply of the inode number.
2981
* As directory entries are added and deleted, the free space within a
2982
* directory block can become fragmented. The ufs filesystem will compact
2983
* a fragmented directory block to make space for a new entry. When this
2984
* occurs, the offsets of previously added entries change. Any "diradd"
2985
* dependency structures corresponding to these entries must be updated with
2990
* This routine is called after the in-memory inode's link
2991
* count has been incremented, but before the directory entry's
2992
* pointer to the inode has been set.
2995
softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
2996
struct buf *bp; /* buffer containing directory block */
2997
struct inode *dp; /* inode for directory */
2998
off_t diroffset; /* offset of new entry in directory */
2999
ino_t newinum; /* inode referenced by new directory entry */
3000
struct buf *newdirbp; /* non-NULL => contents of new mkdir */
3001
int isnewblk; /* entry is in a newly allocated block */
3003
int offset; /* offset of new entry within directory block */
3004
ufs_lbn_t lbn; /* block in directory containing new entry */
3007
struct allocdirect *adp;
3008
struct pagedep *pagedep;
3009
struct inodedep *inodedep;
3010
struct newdirblk *newdirblk = 0;
3011
struct mkdir *mkdir1, *mkdir2;
3015
* Whiteouts have no dependencies.
3017
if (newinum == WINO) {
3018
if (newdirbp != NULL)
3022
mp = UFSTOVFS(dp->i_ump);
3024
lbn = lblkno(fs, diroffset);
3025
offset = blkoff(fs, diroffset);
3026
MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
3027
M_SOFTDEP_FLAGS|M_ZERO);
3028
workitem_alloc(&dap->da_list, D_DIRADD, mp);
3029
dap->da_offset = offset;
3030
dap->da_newinum = newinum;
3031
dap->da_state = ATTACHED;
3032
if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
3033
MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk),
3034
M_NEWDIRBLK, M_SOFTDEP_FLAGS);
3035
workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
3037
if (newdirbp == NULL) {
3038
dap->da_state |= DEPCOMPLETE;
3041
dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
3042
MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
3044
workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
3045
mkdir1->md_state = MKDIR_BODY;
3046
mkdir1->md_diradd = dap;
3047
MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
3049
workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
3050
mkdir2->md_state = MKDIR_PARENT;
3051
mkdir2->md_diradd = dap;
3053
* Dependency on "." and ".." being written to disk.
3055
mkdir1->md_buf = newdirbp;
3057
LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
3058
WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
3062
* Dependency on link count increase for parent directory
3065
if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0
3066
|| (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
3067
dap->da_state &= ~MKDIR_PARENT;
3068
WORKITEM_FREE(mkdir2, D_MKDIR);
3070
LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
3071
WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
3075
* Link into parent directory pagedep to await its being written.
3077
if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
3078
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
3079
dap->da_pagedep = pagedep;
3080
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
3083
* Link into its inodedep. Put it on the id_bufwait list if the inode
3084
* is not yet written. If it is written, do the post-inode write
3085
* processing to put it on the id_pendinghd list.
3087
(void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
3088
if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
3089
diradd_inode_written(dap, inodedep);
3091
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
3094
* Directories growing into indirect blocks are rare
3095
* enough and the frequency of new block allocation
3096
* in those cases even more rare, that we choose not
3097
* to bother tracking them. Rather we simply force the
3098
* new directory entry to disk.
3100
if (lbn >= NDADDR) {
3103
* We only have a new allocation when at the
3104
* beginning of a new block, not when we are
3105
* expanding into an existing block.
3107
if (blkoff(fs, diroffset) == 0)
3112
* We only have a new allocation when at the beginning
3113
* of a new fragment, not when we are expanding into an
3114
* existing fragment. Also, there is nothing to do if we
3115
* are already tracking this block.
3117
if (fragoff(fs, diroffset) != 0) {
3121
if ((pagedep->pd_state & NEWBLOCK) != 0) {
3122
WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
3127
* Find our associated allocdirect and have it track us.
3129
if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0)
3130
panic("softdep_setup_directory_add: lost inodedep");
3131
adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
3132
if (adp == NULL || adp->ad_lbn != lbn)
3133
panic("softdep_setup_directory_add: lost entry");
3134
pagedep->pd_state |= NEWBLOCK;
3135
newdirblk->db_pagedep = pagedep;
3136
WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
3143
* This procedure is called to change the offset of a directory
3144
* entry when compacting a directory block which must be owned
3145
* exclusively by the caller. Note that the actual entry movement
3146
* must be done in this procedure to ensure that no I/O completions
3147
* occur while the move is in progress.
3150
softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
3151
struct inode *dp; /* inode for directory */
3152
caddr_t base; /* address of dp->i_offset */
3153
caddr_t oldloc; /* address of old directory location */
3154
caddr_t newloc; /* address of new directory location */
3155
int entrysize; /* size of directory entry */
3157
int offset, oldoffset, newoffset;
3158
struct pagedep *pagedep;
3163
lbn = lblkno(dp->i_fs, dp->i_offset);
3164
offset = blkoff(dp->i_fs, dp->i_offset);
3165
if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
3167
oldoffset = offset + (oldloc - base);
3168
newoffset = offset + (newloc - base);
3170
LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
3171
if (dap->da_offset != oldoffset)
3173
dap->da_offset = newoffset;
3174
if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
3176
LIST_REMOVE(dap, da_pdlist);
3177
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
3183
LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
3184
if (dap->da_offset == oldoffset) {
3185
dap->da_offset = newoffset;
3191
bcopy(oldloc, newloc, entrysize);
3196
* Free a diradd dependency structure. This routine must be called
3197
* with splbio interrupts blocked.
3203
struct dirrem *dirrem;
3204
struct pagedep *pagedep;
3205
struct inodedep *inodedep;
3206
struct mkdir *mkdir, *nextmd;
3208
mtx_assert(&lk, MA_OWNED);
3209
WORKLIST_REMOVE(&dap->da_list);
3210
LIST_REMOVE(dap, da_pdlist);
3211
if ((dap->da_state & DIRCHG) == 0) {
3212
pagedep = dap->da_pagedep;
3214
dirrem = dap->da_previous;
3215
pagedep = dirrem->dm_pagedep;
3216
dirrem->dm_dirinum = pagedep->pd_ino;
3217
add_to_worklist(&dirrem->dm_list);
3219
if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
3221
(void) free_inodedep(inodedep);
3222
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
3223
for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
3224
nextmd = LIST_NEXT(mkdir, md_mkdirs);
3225
if (mkdir->md_diradd != dap)
3227
dap->da_state &= ~mkdir->md_state;
3228
WORKLIST_REMOVE(&mkdir->md_list);
3229
LIST_REMOVE(mkdir, md_mkdirs);
3230
WORKITEM_FREE(mkdir, D_MKDIR);
3232
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
3233
panic("free_diradd: unfound ref");
3235
WORKITEM_FREE(dap, D_DIRADD);
3239
* Directory entry removal dependencies.
3241
* When removing a directory entry, the entry's inode pointer must be
3242
* zero'ed on disk before the corresponding inode's link count is decremented
3243
* (possibly freeing the inode for re-use). This dependency is handled by
3244
* updating the directory entry but delaying the inode count reduction until
3245
* after the directory block has been written to disk. After this point, the
3246
* inode count can be decremented whenever it is convenient.
3250
* This routine should be called immediately after removing
3251
* a directory entry. The inode's link count should not be
3252
* decremented by the calling procedure -- the soft updates
3253
* code will do this task when it is safe.
3256
softdep_setup_remove(bp, dp, ip, isrmdir)
3257
struct buf *bp; /* buffer containing directory block */
3258
struct inode *dp; /* inode for the directory being modified */
3259
struct inode *ip; /* inode for directory entry being removed */
3260
int isrmdir; /* indicates if doing RMDIR */
3262
struct dirrem *dirrem, *prevdirrem;
3265
* Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
3267
dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
3270
* If the COMPLETE flag is clear, then there were no active
3271
* entries and we want to roll back to a zeroed entry until
3272
* the new inode is committed to disk. If the COMPLETE flag is
3273
* set then we have deleted an entry that never made it to
3274
* disk. If the entry we deleted resulted from a name change,
3275
* then the old name still resides on disk. We cannot delete
3276
* its inode (returned to us in prevdirrem) until the zeroed
3277
* directory entry gets to disk. The new inode has never been
3278
* referenced on the disk, so can be deleted immediately.
3280
if ((dirrem->dm_state & COMPLETE) == 0) {
3281
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
3285
if (prevdirrem != NULL)
3286
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
3287
prevdirrem, dm_next);
3288
dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
3290
handle_workitem_remove(dirrem, NULL);
3295
* Allocate a new dirrem if appropriate and return it along with
3296
* its associated pagedep. Called without a lock, returns with lock.
3298
static long num_dirrem; /* number of dirrem allocated */
3299
static struct dirrem *
3300
newdirrem(bp, dp, ip, isrmdir, prevdirremp)
3301
struct buf *bp; /* buffer containing directory block */
3302
struct inode *dp; /* inode for the directory being modified */
3303
struct inode *ip; /* inode for directory entry being removed */
3304
int isrmdir; /* indicates if doing RMDIR */
3305
struct dirrem **prevdirremp; /* previously referenced inode, if any */
3310
struct dirrem *dirrem;
3311
struct pagedep *pagedep;
3314
* Whiteouts have no deletion dependencies.
3317
panic("newdirrem: whiteout");
3319
* If we are over our limit, try to improve the situation.
3320
* Limiting the number of dirrem structures will also limit
3321
* the number of freefile and freeblks structures.
3324
if (!(ip->i_flags & SF_SNAPSHOT) && num_dirrem > max_softdeps / 2)
3325
(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE);
3328
MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
3329
M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
3330
workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount);
3331
dirrem->dm_state = isrmdir ? RMDIR : 0;
3332
dirrem->dm_oldinum = ip->i_number;
3333
*prevdirremp = NULL;
3336
lbn = lblkno(dp->i_fs, dp->i_offset);
3337
offset = blkoff(dp->i_fs, dp->i_offset);
3338
if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
3339
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
3340
dirrem->dm_pagedep = pagedep;
3342
* Check for a diradd dependency for the same directory entry.
3343
* If present, then both dependencies become obsolete and can
3344
* be de-allocated. Check for an entry on both the pd_dirraddhd
3345
* list and the pd_pendinghd list.
3348
LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
3349
if (dap->da_offset == offset)
3353
LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
3354
if (dap->da_offset == offset)
3360
* Must be ATTACHED at this point.
3362
if ((dap->da_state & ATTACHED) == 0)
3363
panic("newdirrem: not ATTACHED");
3364
if (dap->da_newinum != ip->i_number)
3365
panic("newdirrem: inum %d should be %d",
3366
ip->i_number, dap->da_newinum);
3368
* If we are deleting a changed name that never made it to disk,
3369
* then return the dirrem describing the previous inode (which
3370
* represents the inode currently referenced from this entry on disk).
3372
if ((dap->da_state & DIRCHG) != 0) {
3373
*prevdirremp = dap->da_previous;
3374
dap->da_state &= ~DIRCHG;
3375
dap->da_pagedep = pagedep;
3378
* We are deleting an entry that never made it to disk.
3379
* Mark it COMPLETE so we can delete its inode immediately.
3381
dirrem->dm_state |= COMPLETE;
3387
* Directory entry change dependencies.
3389
* Changing an existing directory entry requires that an add operation
3390
* be completed first followed by a deletion. The semantics for the addition
3391
* are identical to the description of adding a new entry above except
3392
* that the rollback is to the old inode number rather than zero. Once
3393
* the addition dependency is completed, the removal is done as described
3394
* in the removal routine above.
3398
* This routine should be called immediately after changing
3399
* a directory entry. The inode's link count should not be
3400
* decremented by the calling procedure -- the soft updates
3401
* code will perform this task when it is safe.
3404
softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
3405
struct buf *bp; /* buffer containing directory block */
3406
struct inode *dp; /* inode for the directory being modified */
3407
struct inode *ip; /* inode for directory entry being removed */
3408
ino_t newinum; /* new inode number for changed entry */
3409
int isrmdir; /* indicates if doing RMDIR */
3412
struct diradd *dap = NULL;
3413
struct dirrem *dirrem, *prevdirrem;
3414
struct pagedep *pagedep;
3415
struct inodedep *inodedep;
3418
offset = blkoff(dp->i_fs, dp->i_offset);
3419
mp = UFSTOVFS(dp->i_ump);
3422
* Whiteouts do not need diradd dependencies.
3424
if (newinum != WINO) {
3425
MALLOC(dap, struct diradd *, sizeof(struct diradd),
3426
M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
3427
workitem_alloc(&dap->da_list, D_DIRADD, mp);
3428
dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
3429
dap->da_offset = offset;
3430
dap->da_newinum = newinum;
3434
* Allocate a new dirrem and ACQUIRE_LOCK.
3436
dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
3437
pagedep = dirrem->dm_pagedep;
3439
* The possible values for isrmdir:
3440
* 0 - non-directory file rename
3441
* 1 - directory rename within same directory
3442
* inum - directory rename to new directory of given inode number
3443
* When renaming to a new directory, we are both deleting and
3444
* creating a new directory entry, so the link count on the new
3445
* directory should not change. Thus we do not need the followup
3446
* dirrem which is usually done in handle_workitem_remove. We set
3447
* the DIRCHG flag to tell handle_workitem_remove to skip the
3451
dirrem->dm_state |= DIRCHG;
3454
* Whiteouts have no additional dependencies,
3455
* so just put the dirrem on the correct list.
3457
if (newinum == WINO) {
3458
if ((dirrem->dm_state & COMPLETE) == 0) {
3459
LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
3462
dirrem->dm_dirinum = pagedep->pd_ino;
3463
add_to_worklist(&dirrem->dm_list);
3470
* If the COMPLETE flag is clear, then there were no active
3471
* entries and we want to roll back to the previous inode until
3472
* the new inode is committed to disk. If the COMPLETE flag is
3473
* set, then we have deleted an entry that never made it to disk.
3474
* If the entry we deleted resulted from a name change, then the old
3475
* inode reference still resides on disk. Any rollback that we do
3476
* needs to be to that old inode (returned to us in prevdirrem). If
3477
* the entry we deleted resulted from a create, then there is
3478
* no entry on the disk, so we want to roll back to zero rather
3479
* than the uncommitted inode. In either of the COMPLETE cases we
3480
* want to immediately free the unwritten and unreferenced inode.
3482
if ((dirrem->dm_state & COMPLETE) == 0) {
3483
dap->da_previous = dirrem;
3485
if (prevdirrem != NULL) {
3486
dap->da_previous = prevdirrem;
3488
dap->da_state &= ~DIRCHG;
3489
dap->da_pagedep = pagedep;
3491
dirrem->dm_dirinum = pagedep->pd_ino;
3492
add_to_worklist(&dirrem->dm_list);
3495
* Link into its inodedep. Put it on the id_bufwait list if the inode
3496
* is not yet written. If it is written, do the post-inode write
3497
* processing to put it on the id_pendinghd list.
3499
if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 ||
3500
(inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
3501
dap->da_state |= COMPLETE;
3502
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3503
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3505
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
3507
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
3513
* Called whenever the link count on an inode is changed.
3514
* It creates an inode dependency so that the new reference(s)
3515
* to the inode cannot be committed to disk until the updated
3516
* inode has been written.
3519
softdep_change_linkcnt(ip)
3520
struct inode *ip; /* the inode with the increased link count */
3522
struct inodedep *inodedep;
3525
(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
3526
DEPALLOC, &inodedep);
3527
if (ip->i_nlink < ip->i_effnlink)
3528
panic("softdep_change_linkcnt: bad delta");
3529
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3534
* Called when the effective link count and the reference count
3535
* on an inode drops to zero. At this point there are no names
3536
* referencing the file in the filesystem and no active file
3537
* references. The space associated with the file will be freed
3538
* as soon as the necessary soft dependencies are cleared.
3541
softdep_releasefile(ip)
3542
struct inode *ip; /* inode with the zero effective link count */
3544
struct inodedep *inodedep;
3548
if (ip->i_effnlink > 0)
3549
panic("softdep_releasefile: file still referenced");
3551
* We may be called several times as the on-disk link count
3552
* drops to zero. We only want to account for the space once.
3554
if (ip->i_flag & IN_SPACECOUNTED)
3557
* We have to deactivate a snapshot otherwise copyonwrites may
3558
* add blocks and the cleanup may remove blocks after we have
3559
* tried to account for them.
3561
if ((ip->i_flags & SF_SNAPSHOT) != 0)
3562
ffs_snapremove(ITOV(ip));
3564
* If we are tracking an nlinkdelta, we have to also remember
3565
* whether we accounted for the freed space yet.
3568
if ((inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep)))
3569
inodedep->id_state |= SPACECOUNTED;
3573
if (fs->fs_magic == FS_UFS2_MAGIC)
3574
extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
3575
UFS_LOCK(ip->i_ump);
3576
ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks;
3577
ip->i_fs->fs_pendinginodes += 1;
3578
UFS_UNLOCK(ip->i_ump);
3579
ip->i_flag |= IN_SPACECOUNTED;
3583
* This workitem decrements the inode's link count.
3584
* If the link count reaches zero, the file is removed.
3587
handle_workitem_remove(dirrem, xp)
3588
struct dirrem *dirrem;
3591
struct thread *td = curthread;
3592
struct inodedep *inodedep;
3598
if ((vp = xp) == NULL &&
3599
(error = ffs_vgetf(dirrem->dm_list.wk_mp,
3600
dirrem->dm_oldinum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ)) != 0) {
3601
softdep_error("handle_workitem_remove: vget", error);
3606
if ((inodedep_lookup(dirrem->dm_list.wk_mp,
3607
dirrem->dm_oldinum, 0, &inodedep)) == 0)
3608
panic("handle_workitem_remove: lost inodedep");
3610
* Normal file deletion.
3612
if ((dirrem->dm_state & RMDIR) == 0) {
3614
DIP_SET(ip, i_nlink, ip->i_nlink);
3615
ip->i_flag |= IN_CHANGE;
3616
if (ip->i_nlink < ip->i_effnlink)
3617
panic("handle_workitem_remove: bad file delta");
3618
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3620
WORKITEM_FREE(dirrem, D_DIRREM);
3626
* Directory deletion. Decrement reference count for both the
3627
* just deleted parent directory entry and the reference for ".".
3628
* Next truncate the directory to length zero. When the
3629
* truncation completes, arrange to have the reference count on
3630
* the parent decremented to account for the loss of "..".
3633
DIP_SET(ip, i_nlink, ip->i_nlink);
3634
ip->i_flag |= IN_CHANGE;
3635
if (ip->i_nlink < ip->i_effnlink)
3636
panic("handle_workitem_remove: bad dir delta");
3637
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3639
if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
3640
softdep_error("handle_workitem_remove: truncate", error);
3643
* Rename a directory to a new parent. Since, we are both deleting
3644
* and creating a new directory entry, the link count on the new
3645
* directory should not change. Thus we skip the followup dirrem.
3647
if (dirrem->dm_state & DIRCHG) {
3649
WORKITEM_FREE(dirrem, D_DIRREM);
3655
* If the inodedep does not exist, then the zero'ed inode has
3656
* been written to disk. If the allocated inode has never been
3657
* written to disk, then the on-disk inode is zero'ed. In either
3658
* case we can remove the file immediately.
3660
dirrem->dm_state = 0;
3661
oldinum = dirrem->dm_oldinum;
3662
dirrem->dm_oldinum = dirrem->dm_dirinum;
3663
if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum,
3664
0, &inodedep) == 0 || check_inode_unwritten(inodedep)) {
3666
add_to_worklist(&dirrem->dm_list);
3670
handle_workitem_remove(dirrem, NULL);
3673
WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
3675
ip->i_flag |= IN_CHANGE;
3681
* Inode de-allocation dependencies.
3683
* When an inode's link count is reduced to zero, it can be de-allocated. We
3684
* found it convenient to postpone de-allocation until after the inode is
3685
* written to disk with its new link count (zero). At this point, all of the
3686
* on-disk inode's block pointers are nullified and, with careful dependency
3687
* list ordering, all dependencies related to the inode will be satisfied and
3688
* the corresponding dependency structures de-allocated. So, if/when the
3689
* inode is reused, there will be no mixing of old dependencies with new
3690
* ones. This artificial dependency is set up by the block de-allocation
3691
* procedure above (softdep_setup_freeblocks) and completed by the
3692
* following procedure.
3695
handle_workitem_freefile(freefile)
3696
struct freefile *freefile;
3699
struct inodedep *idp;
3700
struct ufsmount *ump;
3703
ump = VFSTOUFS(freefile->fx_list.wk_mp);
3707
error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
3710
panic("handle_workitem_freefile: inodedep survived");
3713
fs->fs_pendinginodes -= 1;
3715
if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
3716
freefile->fx_oldinum, freefile->fx_mode)) != 0)
3717
softdep_error("handle_workitem_freefile", error);
3719
WORKITEM_FREE(freefile, D_FREEFILE);
3725
* Helper function which unlinks marker element from work list and returns
3726
* the next element on the list.
3728
static __inline struct worklist *
3729
markernext(struct worklist *marker)
3731
struct worklist *next;
3733
next = LIST_NEXT(marker, wk_list);
3734
LIST_REMOVE(marker, wk_list);
3741
* The dependency structures constructed above are most actively used when file
3742
* system blocks are written to disk. No constraints are placed on when a
3743
* block can be written, but unsatisfied update dependencies are made safe by
3744
* modifying (or replacing) the source memory for the duration of the disk
3745
* write. When the disk write completes, the memory block is again brought
3748
* In-core inode structure reclamation.
3750
* Because there are a finite number of "in-core" inode structures, they are
3751
* reused regularly. By transferring all inode-related dependencies to the
3752
* in-memory inode block and indexing them separately (via "inodedep"s), we
3753
* can allow "in-core" inode structures to be reused at any time and avoid
3754
* any increase in contention.
3756
* Called just before entering the device driver to initiate a new disk I/O.
3757
* The buffer must be locked, thus, no I/O completion operations can occur
3758
* while we are manipulating its associated dependencies.
3761
softdep_disk_io_initiation(bp)
3762
struct buf *bp; /* structure describing disk write to occur */
3764
struct worklist *wk;
3765
struct worklist marker;
3766
struct indirdep *indirdep;
3767
struct inodedep *inodedep;
3770
* We only care about write operations. There should never
3771
* be dependencies for reads.
3773
if (bp->b_iocmd != BIO_WRITE)
3774
panic("softdep_disk_io_initiation: not write");
3776
marker.wk_type = D_LAST + 1; /* Not a normal workitem */
3777
PHOLD(curproc); /* Don't swap out kernel stack */
3781
* Do any necessary pre-I/O processing.
3783
for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
3784
wk = markernext(&marker)) {
3785
LIST_INSERT_AFTER(wk, &marker, wk_list);
3786
switch (wk->wk_type) {
3789
initiate_write_filepage(WK_PAGEDEP(wk), bp);
3793
inodedep = WK_INODEDEP(wk);
3794
if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
3795
initiate_write_inodeblock_ufs1(inodedep, bp);
3797
initiate_write_inodeblock_ufs2(inodedep, bp);
3801
indirdep = WK_INDIRDEP(wk);
3802
if (indirdep->ir_state & GOINGAWAY)
3803
panic("disk_io_initiation: indirdep gone");
3805
* If there are no remaining dependencies, this
3806
* will be writing the real pointers, so the
3807
* dependency can be freed.
3809
if (LIST_EMPTY(&indirdep->ir_deplisthd)) {
3812
bp = indirdep->ir_savebp;
3813
bp->b_flags |= B_INVAL | B_NOCACHE;
3814
/* inline expand WORKLIST_REMOVE(wk); */
3815
wk->wk_state &= ~ONWORKLIST;
3816
LIST_REMOVE(wk, wk_list);
3817
WORKITEM_FREE(indirdep, D_INDIRDEP);
3824
* Replace up-to-date version with safe version.
3827
MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
3828
M_INDIRDEP, M_SOFTDEP_FLAGS);
3830
indirdep->ir_state &= ~ATTACHED;
3831
indirdep->ir_state |= UNDONE;
3832
bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
3833
bcopy(indirdep->ir_savebp->b_data, bp->b_data,
3844
panic("handle_disk_io_initiation: Unexpected type %s",
3845
TYPENAME(wk->wk_type));
3850
PRELE(curproc); /* Allow swapout of kernel stack */
3854
* Called from within the procedure above to deal with unsatisfied
3855
* allocation dependencies in a directory. The buffer must be locked,
3856
* thus, no I/O completion operations can occur while we are
3857
* manipulating its associated dependencies.
3860
initiate_write_filepage(pagedep, bp)
3861
struct pagedep *pagedep;
3868
if (pagedep->pd_state & IOSTARTED) {
3870
* This can only happen if there is a driver that does not
3871
* understand chaining. Here biodone will reissue the call
3872
* to strategy for the incomplete buffers.
3874
printf("initiate_write_filepage: already started\n");
3877
pagedep->pd_state |= IOSTARTED;
3878
for (i = 0; i < DAHASHSZ; i++) {
3879
LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
3880
ep = (struct direct *)
3881
((char *)bp->b_data + dap->da_offset);
3882
if (ep->d_ino != dap->da_newinum)
3883
panic("%s: dir inum %d != new %d",
3884
"initiate_write_filepage",
3885
ep->d_ino, dap->da_newinum);
3886
if (dap->da_state & DIRCHG)
3887
ep->d_ino = dap->da_previous->dm_oldinum;
3890
dap->da_state &= ~ATTACHED;
3891
dap->da_state |= UNDONE;
3897
* Version of initiate_write_inodeblock that handles UFS1 dinodes.
3898
* Note that any bug fixes made to this routine must be done in the
3899
* version found below.
3901
* Called from within the procedure above to deal with unsatisfied
3902
* allocation dependencies in an inodeblock. The buffer must be
3903
* locked, thus, no I/O completion operations can occur while we
3904
* are manipulating its associated dependencies.
3907
initiate_write_inodeblock_ufs1(inodedep, bp)
3908
struct inodedep *inodedep;
3909
struct buf *bp; /* The inode block */
3911
struct allocdirect *adp, *lastadp;
3912
struct ufs1_dinode *dp;
3913
struct ufs1_dinode *sip;
3917
ufs_lbn_t prevlbn = 0;
3921
if (inodedep->id_state & IOSTARTED)
3922
panic("initiate_write_inodeblock_ufs1: already started");
3923
inodedep->id_state |= IOSTARTED;
3924
fs = inodedep->id_fs;
3925
dp = (struct ufs1_dinode *)bp->b_data +
3926
ino_to_fsbo(fs, inodedep->id_ino);
3928
* If the bitmap is not yet written, then the allocated
3929
* inode cannot be written to disk.
3931
if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3932
if (inodedep->id_savedino1 != NULL)
3933
panic("initiate_write_inodeblock_ufs1: I/O underway");
3935
MALLOC(sip, struct ufs1_dinode *,
3936
sizeof(struct ufs1_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
3938
inodedep->id_savedino1 = sip;
3939
*inodedep->id_savedino1 = *dp;
3940
bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
3941
dp->di_gen = inodedep->id_savedino1->di_gen;
3945
* If no dependencies, then there is nothing to roll back.
3947
inodedep->id_savedsize = dp->di_size;
3948
inodedep->id_savedextsize = 0;
3949
if (TAILQ_EMPTY(&inodedep->id_inoupdt))
3952
* Set the dependencies to busy.
3954
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3955
adp = TAILQ_NEXT(adp, ad_next)) {
3957
if (deplist != 0 && prevlbn >= adp->ad_lbn)
3958
panic("softdep_write_inodeblock: lbn order");
3959
prevlbn = adp->ad_lbn;
3960
if (adp->ad_lbn < NDADDR &&
3961
dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
3962
panic("%s: direct pointer #%jd mismatch %d != %jd",
3963
"softdep_write_inodeblock",
3964
(intmax_t)adp->ad_lbn,
3965
dp->di_db[adp->ad_lbn],
3966
(intmax_t)adp->ad_newblkno);
3967
if (adp->ad_lbn >= NDADDR &&
3968
dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
3969
panic("%s: indirect pointer #%jd mismatch %d != %jd",
3970
"softdep_write_inodeblock",
3971
(intmax_t)adp->ad_lbn - NDADDR,
3972
dp->di_ib[adp->ad_lbn - NDADDR],
3973
(intmax_t)adp->ad_newblkno);
3974
deplist |= 1 << adp->ad_lbn;
3975
if ((adp->ad_state & ATTACHED) == 0)
3976
panic("softdep_write_inodeblock: Unknown state 0x%x",
3978
#endif /* INVARIANTS */
3979
adp->ad_state &= ~ATTACHED;
3980
adp->ad_state |= UNDONE;
3983
* The on-disk inode cannot claim to be any larger than the last
3984
* fragment that has been written. Otherwise, the on-disk inode
3985
* might have fragments that were not the last block in the file
3986
* which would corrupt the filesystem.
3988
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3989
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3990
if (adp->ad_lbn >= NDADDR)
3992
dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3993
/* keep going until hitting a rollback to a frag */
3994
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3996
dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3997
for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3999
if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
4000
panic("softdep_write_inodeblock: lost dep1");
4001
#endif /* INVARIANTS */
4004
for (i = 0; i < NIADDR; i++) {
4006
if (dp->di_ib[i] != 0 &&
4007
(deplist & ((1 << NDADDR) << i)) == 0)
4008
panic("softdep_write_inodeblock: lost dep2");
4009
#endif /* INVARIANTS */
4015
* If we have zero'ed out the last allocated block of the file,
4016
* roll back the size to the last currently allocated block.
4017
* We know that this last allocated block is a full-sized as
4018
* we already checked for fragments in the loop above.
4020
if (lastadp != NULL &&
4021
dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
4022
for (i = lastadp->ad_lbn; i >= 0; i--)
4023
if (dp->di_db[i] != 0)
4025
dp->di_size = (i + 1) * fs->fs_bsize;
4028
* The only dependencies are for indirect blocks.
4030
* The file size for indirect block additions is not guaranteed.
4031
* Such a guarantee would be non-trivial to achieve. The conventional
4032
* synchronous write implementation also does not make this guarantee.
4033
* Fsck should catch and fix discrepancies. Arguably, the file size
4034
* can be over-estimated without destroying integrity when the file
4035
* moves into the indirect blocks (i.e., is large). If we want to
4036
* postpone fsck, we are stuck with this argument.
4038
for (; adp; adp = TAILQ_NEXT(adp, ad_next))
4039
dp->di_ib[adp->ad_lbn - NDADDR] = 0;
4043
* Version of initiate_write_inodeblock that handles UFS2 dinodes.
4044
* Note that any bug fixes made to this routine must be done in the
4045
* version found above.
4047
* Called from within the procedure above to deal with unsatisfied
4048
* allocation dependencies in an inodeblock. The buffer must be
4049
* locked, thus, no I/O completion operations can occur while we
4050
* are manipulating its associated dependencies.
4053
initiate_write_inodeblock_ufs2(inodedep, bp)
4054
struct inodedep *inodedep;
4055
struct buf *bp; /* The inode block */
4057
struct allocdirect *adp, *lastadp;
4058
struct ufs2_dinode *dp;
4059
struct ufs2_dinode *sip;
4063
ufs_lbn_t prevlbn = 0;
4067
if (inodedep->id_state & IOSTARTED)
4068
panic("initiate_write_inodeblock_ufs2: already started");
4069
inodedep->id_state |= IOSTARTED;
4070
fs = inodedep->id_fs;
4071
dp = (struct ufs2_dinode *)bp->b_data +
4072
ino_to_fsbo(fs, inodedep->id_ino);
4074
* If the bitmap is not yet written, then the allocated
4075
* inode cannot be written to disk.
4077
if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4078
if (inodedep->id_savedino2 != NULL)
4079
panic("initiate_write_inodeblock_ufs2: I/O underway");
4081
MALLOC(sip, struct ufs2_dinode *,
4082
sizeof(struct ufs2_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
4084
inodedep->id_savedino2 = sip;
4085
*inodedep->id_savedino2 = *dp;
4086
bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
4087
dp->di_gen = inodedep->id_savedino2->di_gen;
4091
* If no dependencies, then there is nothing to roll back.
4093
inodedep->id_savedsize = dp->di_size;
4094
inodedep->id_savedextsize = dp->di_extsize;
4095
if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
4096
TAILQ_EMPTY(&inodedep->id_extupdt))
4099
* Set the ext data dependencies to busy.
4101
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
4102
adp = TAILQ_NEXT(adp, ad_next)) {
4104
if (deplist != 0 && prevlbn >= adp->ad_lbn)
4105
panic("softdep_write_inodeblock: lbn order");
4106
prevlbn = adp->ad_lbn;
4107
if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno)
4108
panic("%s: direct pointer #%jd mismatch %jd != %jd",
4109
"softdep_write_inodeblock",
4110
(intmax_t)adp->ad_lbn,
4111
(intmax_t)dp->di_extb[adp->ad_lbn],
4112
(intmax_t)adp->ad_newblkno);
4113
deplist |= 1 << adp->ad_lbn;
4114
if ((adp->ad_state & ATTACHED) == 0)
4115
panic("softdep_write_inodeblock: Unknown state 0x%x",
4117
#endif /* INVARIANTS */
4118
adp->ad_state &= ~ATTACHED;
4119
adp->ad_state |= UNDONE;
4122
* The on-disk inode cannot claim to be any larger than the last
4123
* fragment that has been written. Otherwise, the on-disk inode
4124
* might have fragments that were not the last block in the ext
4125
* data which would corrupt the filesystem.
4127
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
4128
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
4129
dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
4130
/* keep going until hitting a rollback to a frag */
4131
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
4133
dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
4134
for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
4136
if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
4137
panic("softdep_write_inodeblock: lost dep1");
4138
#endif /* INVARIANTS */
4145
* If we have zero'ed out the last allocated block of the ext
4146
* data, roll back the size to the last currently allocated block.
4147
* We know that this last allocated block is a full-sized as
4148
* we already checked for fragments in the loop above.
4150
if (lastadp != NULL &&
4151
dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
4152
for (i = lastadp->ad_lbn; i >= 0; i--)
4153
if (dp->di_extb[i] != 0)
4155
dp->di_extsize = (i + 1) * fs->fs_bsize;
4158
* Set the file data dependencies to busy.
4160
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
4161
adp = TAILQ_NEXT(adp, ad_next)) {
4163
if (deplist != 0 && prevlbn >= adp->ad_lbn)
4164
panic("softdep_write_inodeblock: lbn order");
4165
prevlbn = adp->ad_lbn;
4166
if (adp->ad_lbn < NDADDR &&
4167
dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
4168
panic("%s: direct pointer #%jd mismatch %jd != %jd",
4169
"softdep_write_inodeblock",
4170
(intmax_t)adp->ad_lbn,
4171
(intmax_t)dp->di_db[adp->ad_lbn],
4172
(intmax_t)adp->ad_newblkno);
4173
if (adp->ad_lbn >= NDADDR &&
4174
dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
4175
panic("%s indirect pointer #%jd mismatch %jd != %jd",
4176
"softdep_write_inodeblock:",
4177
(intmax_t)adp->ad_lbn - NDADDR,
4178
(intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
4179
(intmax_t)adp->ad_newblkno);
4180
deplist |= 1 << adp->ad_lbn;
4181
if ((adp->ad_state & ATTACHED) == 0)
4182
panic("softdep_write_inodeblock: Unknown state 0x%x",
4184
#endif /* INVARIANTS */
4185
adp->ad_state &= ~ATTACHED;
4186
adp->ad_state |= UNDONE;
4189
* The on-disk inode cannot claim to be any larger than the last
4190
* fragment that has been written. Otherwise, the on-disk inode
4191
* might have fragments that were not the last block in the file
4192
* which would corrupt the filesystem.
4194
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
4195
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
4196
if (adp->ad_lbn >= NDADDR)
4198
dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
4199
/* keep going until hitting a rollback to a frag */
4200
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
4202
dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
4203
for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
4205
if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
4206
panic("softdep_write_inodeblock: lost dep2");
4207
#endif /* INVARIANTS */
4210
for (i = 0; i < NIADDR; i++) {
4212
if (dp->di_ib[i] != 0 &&
4213
(deplist & ((1 << NDADDR) << i)) == 0)
4214
panic("softdep_write_inodeblock: lost dep3");
4215
#endif /* INVARIANTS */
4221
* If we have zero'ed out the last allocated block of the file,
4222
* roll back the size to the last currently allocated block.
4223
* We know that this last allocated block is a full-sized as
4224
* we already checked for fragments in the loop above.
4226
if (lastadp != NULL &&
4227
dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
4228
for (i = lastadp->ad_lbn; i >= 0; i--)
4229
if (dp->di_db[i] != 0)
4231
dp->di_size = (i + 1) * fs->fs_bsize;
4234
* The only dependencies are for indirect blocks.
4236
* The file size for indirect block additions is not guaranteed.
4237
* Such a guarantee would be non-trivial to achieve. The conventional
4238
* synchronous write implementation also does not make this guarantee.
4239
* Fsck should catch and fix discrepancies. Arguably, the file size
4240
* can be over-estimated without destroying integrity when the file
4241
* moves into the indirect blocks (i.e., is large). If we want to
4242
* postpone fsck, we are stuck with this argument.
4244
for (; adp; adp = TAILQ_NEXT(adp, ad_next))
4245
dp->di_ib[adp->ad_lbn - NDADDR] = 0;
4249
* This routine is called during the completion interrupt
4250
* service routine for a disk write (from the procedure called
4251
* by the device driver to inform the filesystem caches of
4252
* a request completion). It should be called early in this
4253
* procedure, before the block is made available to other
4254
* processes or other routines are called.
4257
softdep_disk_write_complete(bp)
4258
struct buf *bp; /* describes the completed disk write */
4260
struct worklist *wk;
4261
struct worklist *owk;
4262
struct workhead reattach;
4263
struct newblk *newblk;
4264
struct allocindir *aip;
4265
struct allocdirect *adp;
4266
struct indirdep *indirdep;
4267
struct inodedep *inodedep;
4268
struct bmsafemap *bmsafemap;
4271
* If an error occurred while doing the write, then the data
4272
* has not hit the disk and the dependencies cannot be unrolled.
4274
if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
4276
LIST_INIT(&reattach);
4278
* This lock must not be released anywhere in this code segment.
4282
while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
4283
WORKLIST_REMOVE(wk);
4285
panic("duplicate worklist: %p\n", wk);
4287
switch (wk->wk_type) {
4290
if (handle_written_filepage(WK_PAGEDEP(wk), bp))
4291
WORKLIST_INSERT(&reattach, wk);
4295
if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
4296
WORKLIST_INSERT(&reattach, wk);
4300
bmsafemap = WK_BMSAFEMAP(wk);
4301
while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
4302
newblk->nb_state |= DEPCOMPLETE;
4303
newblk->nb_bmsafemap = NULL;
4304
LIST_REMOVE(newblk, nb_deps);
4307
LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
4308
adp->ad_state |= DEPCOMPLETE;
4310
LIST_REMOVE(adp, ad_deps);
4311
handle_allocdirect_partdone(adp);
4314
LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
4315
aip->ai_state |= DEPCOMPLETE;
4317
LIST_REMOVE(aip, ai_deps);
4318
handle_allocindir_partdone(aip);
4321
LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
4322
inodedep->id_state |= DEPCOMPLETE;
4323
LIST_REMOVE(inodedep, id_deps);
4324
inodedep->id_buf = NULL;
4326
WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
4330
handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
4334
adp = WK_ALLOCDIRECT(wk);
4335
adp->ad_state |= COMPLETE;
4336
handle_allocdirect_partdone(adp);
4340
aip = WK_ALLOCINDIR(wk);
4341
aip->ai_state |= COMPLETE;
4342
handle_allocindir_partdone(aip);
4346
indirdep = WK_INDIRDEP(wk);
4347
if (indirdep->ir_state & GOINGAWAY)
4348
panic("disk_write_complete: indirdep gone");
4349
bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
4350
FREE(indirdep->ir_saveddata, M_INDIRDEP);
4351
indirdep->ir_saveddata = 0;
4352
indirdep->ir_state &= ~UNDONE;
4353
indirdep->ir_state |= ATTACHED;
4354
while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
4355
handle_allocindir_partdone(aip);
4356
if (aip == LIST_FIRST(&indirdep->ir_donehd))
4357
panic("disk_write_complete: not gone");
4359
WORKLIST_INSERT(&reattach, wk);
4360
if ((bp->b_flags & B_DELWRI) == 0)
4361
stat_indir_blk_ptrs++;
4366
panic("handle_disk_write_complete: Unknown type %s",
4367
TYPENAME(wk->wk_type));
4372
* Reattach any requests that must be redone.
4374
while ((wk = LIST_FIRST(&reattach)) != NULL) {
4375
WORKLIST_REMOVE(wk);
4376
WORKLIST_INSERT(&bp->b_dep, wk);
4382
* Called from within softdep_disk_write_complete above. Note that
4383
* this routine is always called from interrupt level with further
4384
* splbio interrupts blocked.
4387
handle_allocdirect_partdone(adp)
4388
struct allocdirect *adp; /* the completed allocdirect */
4390
struct allocdirectlst *listhead;
4391
struct allocdirect *listadp;
4392
struct inodedep *inodedep;
4395
if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
4397
if (adp->ad_buf != NULL)
4398
panic("handle_allocdirect_partdone: dangling dep");
4400
* The on-disk inode cannot claim to be any larger than the last
4401
* fragment that has been written. Otherwise, the on-disk inode
4402
* might have fragments that were not the last block in the file
4403
* which would corrupt the filesystem. Thus, we cannot free any
4404
* allocdirects after one whose ad_oldblkno claims a fragment as
4405
* these blocks must be rolled back to zero before writing the inode.
4406
* We check the currently active set of allocdirects in id_inoupdt
4407
* or id_extupdt as appropriate.
4409
inodedep = adp->ad_inodedep;
4410
bsize = inodedep->id_fs->fs_bsize;
4411
if (adp->ad_state & EXTDATA)
4412
listhead = &inodedep->id_extupdt;
4414
listhead = &inodedep->id_inoupdt;
4415
TAILQ_FOREACH(listadp, listhead, ad_next) {
4416
/* found our block */
4419
/* continue if ad_oldlbn is not a fragment */
4420
if (listadp->ad_oldsize == 0 ||
4421
listadp->ad_oldsize == bsize)
4423
/* hit a fragment */
4427
* If we have reached the end of the current list without
4428
* finding the just finished dependency, then it must be
4429
* on the future dependency list. Future dependencies cannot
4430
* be freed until they are moved to the current list.
4432
if (listadp == NULL) {
4434
if (adp->ad_state & EXTDATA)
4435
listhead = &inodedep->id_newextupdt;
4437
listhead = &inodedep->id_newinoupdt;
4438
TAILQ_FOREACH(listadp, listhead, ad_next)
4439
/* found our block */
4442
if (listadp == NULL)
4443
panic("handle_allocdirect_partdone: lost dep");
4448
* If we have found the just finished dependency, then free
4449
* it along with anything that follows it that is complete.
4450
* If the inode still has a bitmap dependency, then it has
4451
* never been written to disk, hence the on-disk inode cannot
4452
* reference the old fragment so we can free it without delay.
4454
delay = (inodedep->id_state & DEPCOMPLETE);
4455
for (; adp; adp = listadp) {
4456
listadp = TAILQ_NEXT(adp, ad_next);
4457
if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
4459
free_allocdirect(listhead, adp, delay);
4464
* Called from within softdep_disk_write_complete above. Note that
4465
* this routine is always called from interrupt level with further
4466
* splbio interrupts blocked.
4469
handle_allocindir_partdone(aip)
4470
struct allocindir *aip; /* the completed allocindir */
4472
struct indirdep *indirdep;
4474
if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
4476
if (aip->ai_buf != NULL)
4477
panic("handle_allocindir_partdone: dangling dependency");
4478
indirdep = aip->ai_indirdep;
4479
if (indirdep->ir_state & UNDONE) {
4480
LIST_REMOVE(aip, ai_next);
4481
LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
4484
if (indirdep->ir_state & UFS1FMT)
4485
((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4488
((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4490
LIST_REMOVE(aip, ai_next);
4491
if (aip->ai_freefrag != NULL)
4492
add_to_worklist(&aip->ai_freefrag->ff_list);
4493
WORKITEM_FREE(aip, D_ALLOCINDIR);
4497
* Called from within softdep_disk_write_complete above to restore
4498
* in-memory inode block contents to their most up-to-date state. Note
4499
* that this routine is always called from interrupt level with further
4500
* splbio interrupts blocked.
4503
handle_written_inodeblock(inodedep, bp)
4504
struct inodedep *inodedep;
4505
struct buf *bp; /* buffer containing the inode block */
4507
struct worklist *wk, *filefree;
4508
struct allocdirect *adp, *nextadp;
4509
struct ufs1_dinode *dp1 = NULL;
4510
struct ufs2_dinode *dp2 = NULL;
4511
int hadchanges, fstype;
4513
if ((inodedep->id_state & IOSTARTED) == 0)
4514
panic("handle_written_inodeblock: not started");
4515
inodedep->id_state &= ~IOSTARTED;
4516
if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
4518
dp1 = (struct ufs1_dinode *)bp->b_data +
4519
ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4522
dp2 = (struct ufs2_dinode *)bp->b_data +
4523
ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4526
* If we had to rollback the inode allocation because of
4527
* bitmaps being incomplete, then simply restore it.
4528
* Keep the block dirty so that it will not be reclaimed until
4529
* all associated dependencies have been cleared and the
4530
* corresponding updates written to disk.
4532
if (inodedep->id_savedino1 != NULL) {
4534
*dp1 = *inodedep->id_savedino1;
4536
*dp2 = *inodedep->id_savedino2;
4537
FREE(inodedep->id_savedino1, M_SAVEDINO);
4538
inodedep->id_savedino1 = NULL;
4539
if ((bp->b_flags & B_DELWRI) == 0)
4540
stat_inode_bitmap++;
4544
inodedep->id_state |= COMPLETE;
4546
* Roll forward anything that had to be rolled back before
4547
* the inode could be updated.
4550
for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
4551
nextadp = TAILQ_NEXT(adp, ad_next);
4552
if (adp->ad_state & ATTACHED)
4553
panic("handle_written_inodeblock: new entry");
4554
if (fstype == UFS1) {
4555
if (adp->ad_lbn < NDADDR) {
4556
if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
4557
panic("%s %s #%jd mismatch %d != %jd",
4558
"handle_written_inodeblock:",
4560
(intmax_t)adp->ad_lbn,
4561
dp1->di_db[adp->ad_lbn],
4562
(intmax_t)adp->ad_oldblkno);
4563
dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
4565
if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
4566
panic("%s: %s #%jd allocated as %d",
4567
"handle_written_inodeblock",
4569
(intmax_t)adp->ad_lbn - NDADDR,
4570
dp1->di_ib[adp->ad_lbn - NDADDR]);
4571
dp1->di_ib[adp->ad_lbn - NDADDR] =
4575
if (adp->ad_lbn < NDADDR) {
4576
if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
4577
panic("%s: %s #%jd %s %jd != %jd",
4578
"handle_written_inodeblock",
4580
(intmax_t)adp->ad_lbn, "mismatch",
4581
(intmax_t)dp2->di_db[adp->ad_lbn],
4582
(intmax_t)adp->ad_oldblkno);
4583
dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
4585
if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
4586
panic("%s: %s #%jd allocated as %jd",
4587
"handle_written_inodeblock",
4589
(intmax_t)adp->ad_lbn - NDADDR,
4591
dp2->di_ib[adp->ad_lbn - NDADDR]);
4592
dp2->di_ib[adp->ad_lbn - NDADDR] =
4596
adp->ad_state &= ~UNDONE;
4597
adp->ad_state |= ATTACHED;
4600
for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
4601
nextadp = TAILQ_NEXT(adp, ad_next);
4602
if (adp->ad_state & ATTACHED)
4603
panic("handle_written_inodeblock: new entry");
4604
if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno)
4605
panic("%s: direct pointers #%jd %s %jd != %jd",
4606
"handle_written_inodeblock",
4607
(intmax_t)adp->ad_lbn, "mismatch",
4608
(intmax_t)dp2->di_extb[adp->ad_lbn],
4609
(intmax_t)adp->ad_oldblkno);
4610
dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno;
4611
adp->ad_state &= ~UNDONE;
4612
adp->ad_state |= ATTACHED;
4615
if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
4616
stat_direct_blk_ptrs++;
4618
* Reset the file size to its most up-to-date value.
4620
if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
4621
panic("handle_written_inodeblock: bad size");
4622
if (fstype == UFS1) {
4623
if (dp1->di_size != inodedep->id_savedsize) {
4624
dp1->di_size = inodedep->id_savedsize;
4628
if (dp2->di_size != inodedep->id_savedsize) {
4629
dp2->di_size = inodedep->id_savedsize;
4632
if (dp2->di_extsize != inodedep->id_savedextsize) {
4633
dp2->di_extsize = inodedep->id_savedextsize;
4637
inodedep->id_savedsize = -1;
4638
inodedep->id_savedextsize = -1;
4640
* If there were any rollbacks in the inode block, then it must be
4641
* marked dirty so that its will eventually get written back in
4647
* Process any allocdirects that completed during the update.
4649
if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
4650
handle_allocdirect_partdone(adp);
4651
if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
4652
handle_allocdirect_partdone(adp);
4654
* Process deallocations that were held pending until the
4655
* inode had been written to disk. Freeing of the inode
4656
* is delayed until after all blocks have been freed to
4657
* avoid creation of new <vfsid, inum, lbn> triples
4658
* before the old ones have been deleted.
4661
while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
4662
WORKLIST_REMOVE(wk);
4663
switch (wk->wk_type) {
4667
* We defer adding filefree to the worklist until
4668
* all other additions have been made to ensure
4669
* that it will be done after all the old blocks
4672
if (filefree != NULL)
4673
panic("handle_written_inodeblock: filefree");
4678
handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
4682
diradd_inode_written(WK_DIRADD(wk), inodedep);
4686
wk->wk_state |= COMPLETE;
4687
if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
4689
/* -- fall through -- */
4692
add_to_worklist(wk);
4696
free_newdirblk(WK_NEWDIRBLK(wk));
4700
panic("handle_written_inodeblock: Unknown type %s",
4701
TYPENAME(wk->wk_type));
4705
if (filefree != NULL) {
4706
if (free_inodedep(inodedep) == 0)
4707
panic("handle_written_inodeblock: live inodedep");
4708
add_to_worklist(filefree);
4713
* If no outstanding dependencies, free it.
4715
if (free_inodedep(inodedep) ||
4716
(TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
4717
TAILQ_FIRST(&inodedep->id_extupdt) == 0))
4719
return (hadchanges);
4723
* Process a diradd entry after its dependent inode has been written.
4724
* This routine must be called with splbio interrupts blocked.
4727
diradd_inode_written(dap, inodedep)
4729
struct inodedep *inodedep;
4731
struct pagedep *pagedep;
4733
dap->da_state |= COMPLETE;
4734
if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4735
if (dap->da_state & DIRCHG)
4736
pagedep = dap->da_previous->dm_pagedep;
4738
pagedep = dap->da_pagedep;
4739
LIST_REMOVE(dap, da_pdlist);
4740
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4742
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
4746
* Handle the completion of a mkdir dependency.
4749
handle_written_mkdir(mkdir, type)
4750
struct mkdir *mkdir;
4754
struct pagedep *pagedep;
4756
if (mkdir->md_state != type)
4757
panic("handle_written_mkdir: bad type");
4758
dap = mkdir->md_diradd;
4759
dap->da_state &= ~type;
4760
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
4761
dap->da_state |= DEPCOMPLETE;
4762
if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4763
if (dap->da_state & DIRCHG)
4764
pagedep = dap->da_previous->dm_pagedep;
4766
pagedep = dap->da_pagedep;
4767
LIST_REMOVE(dap, da_pdlist);
4768
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4770
LIST_REMOVE(mkdir, md_mkdirs);
4771
WORKITEM_FREE(mkdir, D_MKDIR);
4775
* Called from within softdep_disk_write_complete above.
4776
* A write operation was just completed. Removed inodes can
4777
* now be freed and associated block pointers may be committed.
4778
* Note that this routine is always called from interrupt level
4779
* with further splbio interrupts blocked.
4782
handle_written_filepage(pagedep, bp)
4783
struct pagedep *pagedep;
4784
struct buf *bp; /* buffer containing the written page */
4786
struct dirrem *dirrem;
4787
struct diradd *dap, *nextdap;
4791
if ((pagedep->pd_state & IOSTARTED) == 0)
4792
panic("handle_written_filepage: not started");
4793
pagedep->pd_state &= ~IOSTARTED;
4795
* Process any directory removals that have been committed.
4797
while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
4798
LIST_REMOVE(dirrem, dm_next);
4799
dirrem->dm_dirinum = pagedep->pd_ino;
4800
add_to_worklist(&dirrem->dm_list);
4803
* Free any directory additions that have been committed.
4804
* If it is a newly allocated block, we have to wait until
4805
* the on-disk directory inode claims the new block.
4807
if ((pagedep->pd_state & NEWBLOCK) == 0)
4808
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
4811
* Uncommitted directory entries must be restored.
4813
for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
4814
for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
4816
nextdap = LIST_NEXT(dap, da_pdlist);
4817
if (dap->da_state & ATTACHED)
4818
panic("handle_written_filepage: attached");
4819
ep = (struct direct *)
4820
((char *)bp->b_data + dap->da_offset);
4821
ep->d_ino = dap->da_newinum;
4822
dap->da_state &= ~UNDONE;
4823
dap->da_state |= ATTACHED;
4826
* If the inode referenced by the directory has
4827
* been written out, then the dependency can be
4828
* moved to the pending list.
4830
if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4831
LIST_REMOVE(dap, da_pdlist);
4832
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
4838
* If there were any rollbacks in the directory, then it must be
4839
* marked dirty so that its will eventually get written back in
4843
if ((bp->b_flags & B_DELWRI) == 0)
4849
* If we are not waiting for a new directory block to be
4850
* claimed by its inode, then the pagedep will be freed.
4851
* Otherwise it will remain to track any new entries on
4852
* the page in case they are fsync'ed.
4854
if ((pagedep->pd_state & NEWBLOCK) == 0) {
4855
LIST_REMOVE(pagedep, pd_hash);
4856
WORKITEM_FREE(pagedep, D_PAGEDEP);
4862
* Writing back in-core inode structures.
4864
* The filesystem only accesses an inode's contents when it occupies an
4865
* "in-core" inode structure. These "in-core" structures are separate from
4866
* the page frames used to cache inode blocks. Only the latter are
4867
* transferred to/from the disk. So, when the updated contents of the
4868
* "in-core" inode structure are copied to the corresponding in-memory inode
4869
* block, the dependencies are also transferred. The following procedure is
4870
* called when copying a dirty "in-core" inode to a cached inode block.
4874
* Called when an inode is loaded from disk. If the effective link count
4875
* differed from the actual link count when it was last flushed, then we
4876
* need to ensure that the correct effective link count is put back.
4879
softdep_load_inodeblock(ip)
4880
struct inode *ip; /* the "in_core" copy of the inode */
4882
struct inodedep *inodedep;
4885
* Check for alternate nlink count.
4887
ip->i_effnlink = ip->i_nlink;
4889
if (inodedep_lookup(UFSTOVFS(ip->i_ump),
4890
ip->i_number, 0, &inodedep) == 0) {
4894
ip->i_effnlink -= inodedep->id_nlinkdelta;
4895
if (inodedep->id_state & SPACECOUNTED)
4896
ip->i_flag |= IN_SPACECOUNTED;
4901
* This routine is called just before the "in-core" inode
4902
* information is to be copied to the in-memory inode block.
4903
* Recall that an inode block contains several inodes. If
4904
* the force flag is set, then the dependencies will be
4905
* cleared so that the update can always be made. Note that
4906
* the buffer is locked when this routine is called, so we
4907
* will never be in the middle of writing the inode block
4911
softdep_update_inodeblock(ip, bp, waitfor)
4912
struct inode *ip; /* the "in_core" copy of the inode */
4913
struct buf *bp; /* the buffer containing the inode block */
4914
int waitfor; /* nonzero => update must be allowed */
4916
struct inodedep *inodedep;
4917
struct worklist *wk;
4923
* If the effective link count is not equal to the actual link
4924
* count, then we must track the difference in an inodedep while
4925
* the inode is (potentially) tossed out of the cache. Otherwise,
4926
* if there is no existing inodedep, then there are no dependencies
4929
mp = UFSTOVFS(ip->i_ump);
4931
if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
4933
if (ip->i_effnlink != ip->i_nlink)
4934
panic("softdep_update_inodeblock: bad link count");
4937
if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
4938
panic("softdep_update_inodeblock: bad delta");
4940
* Changes have been initiated. Anything depending on these
4941
* changes cannot occur until this inode has been written.
4943
inodedep->id_state &= ~COMPLETE;
4944
if ((inodedep->id_state & ONWORKLIST) == 0)
4945
WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
4947
* Any new dependencies associated with the incore inode must
4948
* now be moved to the list associated with the buffer holding
4949
* the in-memory copy of the inode. Once merged process any
4950
* allocdirects that are completed by the merger.
4952
merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
4953
if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
4954
handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
4955
merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
4956
if (!TAILQ_EMPTY(&inodedep->id_extupdt))
4957
handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
4959
* Now that the inode has been pushed into the buffer, the
4960
* operations dependent on the inode being written to disk
4961
* can be moved to the id_bufwait so that they will be
4962
* processed when the buffer I/O completes.
4964
while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
4965
WORKLIST_REMOVE(wk);
4966
WORKLIST_INSERT(&inodedep->id_bufwait, wk);
4969
* Newly allocated inodes cannot be written until the bitmap
4970
* that allocates them have been written (indicated by
4971
* DEPCOMPLETE being set in id_state). If we are doing a
4972
* forced sync (e.g., an fsync on a file), we force the bitmap
4973
* to be written so that the update can be done.
4980
if ((inodedep->id_state & DEPCOMPLETE) != 0) {
4984
ibp = inodedep->id_buf;
4985
ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
4988
* If ibp came back as NULL, the dependency could have been
4989
* freed while we slept. Look it up again, and check to see
4990
* that it has completed.
4992
if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
4998
if ((error = bwrite(ibp)) != 0)
4999
softdep_error("softdep_update_inodeblock: bwrite", error);
5003
* Merge the a new inode dependency list (such as id_newinoupdt) into an
5004
* old inode dependency list (such as id_inoupdt). This routine must be
5005
* called with splbio interrupts blocked.
5008
merge_inode_lists(newlisthead, oldlisthead)
5009
struct allocdirectlst *newlisthead;
5010
struct allocdirectlst *oldlisthead;
5012
struct allocdirect *listadp, *newadp;
5014
newadp = TAILQ_FIRST(newlisthead);
5015
for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
5016
if (listadp->ad_lbn < newadp->ad_lbn) {
5017
listadp = TAILQ_NEXT(listadp, ad_next);
5020
TAILQ_REMOVE(newlisthead, newadp, ad_next);
5021
TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
5022
if (listadp->ad_lbn == newadp->ad_lbn) {
5023
allocdirect_merge(oldlisthead, newadp,
5027
newadp = TAILQ_FIRST(newlisthead);
5029
while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
5030
TAILQ_REMOVE(newlisthead, newadp, ad_next);
5031
TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
5036
* If we are doing an fsync, then we must ensure that any directory
5037
* entries for the inode have been written after the inode gets to disk.
5041
struct vnode *vp; /* the "in_core" copy of the inode */
5043
struct inodedep *inodedep;
5044
struct pagedep *pagedep;
5045
struct worklist *wk;
5052
struct thread *td = curthread;
5053
int error, flushparent, pagedep_new_block;
5061
if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
5065
if (!LIST_EMPTY(&inodedep->id_inowait) ||
5066
!LIST_EMPTY(&inodedep->id_bufwait) ||
5067
!TAILQ_EMPTY(&inodedep->id_extupdt) ||
5068
!TAILQ_EMPTY(&inodedep->id_newextupdt) ||
5069
!TAILQ_EMPTY(&inodedep->id_inoupdt) ||
5070
!TAILQ_EMPTY(&inodedep->id_newinoupdt))
5071
panic("softdep_fsync: pending ops");
5072
for (error = 0, flushparent = 0; ; ) {
5073
if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
5075
if (wk->wk_type != D_DIRADD)
5076
panic("softdep_fsync: Unexpected type %s",
5077
TYPENAME(wk->wk_type));
5078
dap = WK_DIRADD(wk);
5080
* Flush our parent if this directory entry has a MKDIR_PARENT
5081
* dependency or is contained in a newly allocated block.
5083
if (dap->da_state & DIRCHG)
5084
pagedep = dap->da_previous->dm_pagedep;
5086
pagedep = dap->da_pagedep;
5087
parentino = pagedep->pd_ino;
5088
lbn = pagedep->pd_lbn;
5089
if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
5090
panic("softdep_fsync: dirty");
5091
if ((dap->da_state & MKDIR_PARENT) ||
5092
(pagedep->pd_state & NEWBLOCK))
5097
* If we are being fsync'ed as part of vgone'ing this vnode,
5098
* then we will not be able to release and recover the
5099
* vnode below, so we just have to give up on writing its
5100
* directory entry out. It will eventually be written, just
5101
* not now, but then the user was not asking to have it
5102
* written, so we are not breaking any promises.
5104
if (vp->v_iflag & VI_DOOMED)
5107
* We prevent deadlock by always fetching inodes from the
5108
* root, moving down the directory tree. Thus, when fetching
5109
* our parent directory, we first try to get the lock. If
5110
* that fails, we must unlock ourselves before requesting
5111
* the lock on our parent. See the comment in ufs_lookup
5112
* for details on possible races.
5115
if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
5117
VOP_UNLOCK(vp, 0, td);
5118
error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
5119
&pvp, FFSV_FORCEINSMQ);
5120
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
5125
* All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
5126
* that are contained in direct blocks will be resolved by
5127
* doing a ffs_update. Pagedeps contained in indirect blocks
5128
* may require a complete sync'ing of the directory. So, we
5129
* try the cheap and fast ffs_update first, and if that fails,
5130
* then we do the slower ffs_syncvnode of the directory.
5135
if ((error = ffs_update(pvp, 1)) != 0) {
5141
if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
5142
if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
5143
if (wk->wk_type != D_DIRADD)
5144
panic("softdep_fsync: Unexpected type %s",
5145
TYPENAME(wk->wk_type));
5146
dap = WK_DIRADD(wk);
5147
if (dap->da_state & DIRCHG)
5148
pagedep = dap->da_previous->dm_pagedep;
5150
pagedep = dap->da_pagedep;
5151
pagedep_new_block = pagedep->pd_state & NEWBLOCK;
5154
if (pagedep_new_block &&
5155
(error = ffs_syncvnode(pvp, MNT_WAIT))) {
5165
* Flush directory page containing the inode's name.
5167
error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
5177
if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
5185
* Flush all the dirty bitmaps associated with the block device
5186
* before flushing the rest of the dirty blocks so as to reduce
5187
* the number of dependencies that will have to be rolled back.
5190
softdep_fsync_mountdev(vp)
5193
struct buf *bp, *nbp;
5194
struct worklist *wk;
5196
if (!vn_isdisk(vp, NULL))
5197
panic("softdep_fsync_mountdev: vnode not a disk");
5201
TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
5203
* If it is already scheduled, skip to the next buffer.
5205
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
5208
if ((bp->b_flags & B_DELWRI) == 0)
5209
panic("softdep_fsync_mountdev: not dirty");
5211
* We are only interested in bitmaps with outstanding
5214
if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
5215
wk->wk_type != D_BMSAFEMAP ||
5216
(bp->b_vflags & BV_BKGRDINPROG)) {
5232
* This routine is called when we are trying to synchronously flush a
5233
* file. This routine must eliminate any filesystem metadata dependencies
5234
* so that the syncing routine can succeed by pushing the dirty blocks
5235
* associated with the file. If any I/O errors occur, they are returned.
5238
softdep_sync_metadata(struct vnode *vp)
5240
struct pagedep *pagedep;
5241
struct allocdirect *adp;
5242
struct allocindir *aip;
5243
struct buf *bp, *nbp;
5244
struct worklist *wk;
5245
int i, error, waitfor;
5247
if (!DOINGSOFTDEP(vp))
5250
* Ensure that any direct block dependencies have been cleared.
5253
if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) {
5259
* For most files, the only metadata dependencies are the
5260
* cylinder group maps that allocate their inode or blocks.
5261
* The block allocation dependencies can be found by traversing
5262
* the dependency lists for any buffers that remain on their
5263
* dirty buffer list. The inode allocation dependency will
5264
* be resolved when the inode is updated with MNT_WAIT.
5265
* This work is done in two passes. The first pass grabs most
5266
* of the buffers and begins asynchronously writing them. The
5267
* only way to wait for these asynchronous writes is to sleep
5268
* on the filesystem vnode which may stay busy for a long time
5269
* if the filesystem is active. So, instead, we make a second
5270
* pass over the dependencies blocking on each write. In the
5271
* usual case we will be blocking against a write that we
5272
* initiated, so when it is done the dependency will have been
5273
* resolved. Thus the second pass is expected to end quickly.
5275
waitfor = MNT_NOWAIT;
5279
* We must wait for any I/O in progress to finish so that
5280
* all potential buffers on the dirty list will be visible.
5284
while ((bp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd)) != NULL) {
5285
bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT);
5293
/* While syncing snapshots, we must allow recursive lookups */
5294
bp->b_lock.lk_flags |= LK_CANRECURSE;
5297
* As we hold the buffer locked, none of its dependencies
5300
LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5301
switch (wk->wk_type) {
5304
adp = WK_ALLOCDIRECT(wk);
5305
if (adp->ad_state & DEPCOMPLETE)
5308
nbp = getdirtybuf(nbp, &lk, waitfor);
5312
if (waitfor == MNT_NOWAIT) {
5314
} else if ((error = bwrite(nbp)) != 0) {
5321
aip = WK_ALLOCINDIR(wk);
5322
if (aip->ai_state & DEPCOMPLETE)
5325
nbp = getdirtybuf(nbp, &lk, waitfor);
5329
if (waitfor == MNT_NOWAIT) {
5331
} else if ((error = bwrite(nbp)) != 0) {
5340
LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
5341
if (aip->ai_state & DEPCOMPLETE)
5344
nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
5348
if ((error = bwrite(nbp)) != 0) {
5357
if ((error = flush_inodedep_deps(wk->wk_mp,
5358
WK_INODEDEP(wk)->id_ino)) != 0) {
5366
* We are trying to sync a directory that may
5367
* have dependencies on both its own metadata
5368
* and/or dependencies on the inodes of any
5369
* recently allocated files. We walk its diradd
5370
* lists pushing out the associated inode.
5372
pagedep = WK_PAGEDEP(wk);
5373
for (i = 0; i < DAHASHSZ; i++) {
5374
if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
5377
flush_pagedep_deps(vp, wk->wk_mp,
5378
&pagedep->pd_diraddhd[i]))) {
5387
* This case should never happen if the vnode has
5388
* been properly sync'ed. However, if this function
5389
* is used at a place where the vnode has not yet
5390
* been sync'ed, this dependency can show up. So,
5391
* rather than panic, just flush it.
5393
nbp = WK_MKDIR(wk)->md_buf;
5394
nbp = getdirtybuf(nbp, &lk, waitfor);
5398
if (waitfor == MNT_NOWAIT) {
5400
} else if ((error = bwrite(nbp)) != 0) {
5408
* This case should never happen if the vnode has
5409
* been properly sync'ed. However, if this function
5410
* is used at a place where the vnode has not yet
5411
* been sync'ed, this dependency can show up. So,
5412
* rather than panic, just flush it.
5414
nbp = WK_BMSAFEMAP(wk)->sm_buf;
5415
nbp = getdirtybuf(nbp, &lk, waitfor);
5419
if (waitfor == MNT_NOWAIT) {
5421
} else if ((error = bwrite(nbp)) != 0) {
5428
panic("softdep_sync_metadata: Unknown type %s",
5429
TYPENAME(wk->wk_type));
5433
/* We reach here only in error and unlocked */
5435
panic("softdep_sync_metadata: zero error");
5436
bp->b_lock.lk_flags &= ~LK_CANRECURSE;
5442
while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) {
5443
nbp = getdirtybuf(nbp, VI_MTX(vp), MNT_WAIT);
5448
bp->b_lock.lk_flags &= ~LK_CANRECURSE;
5455
* The brief unlock is to allow any pent up dependency
5456
* processing to be done. Then proceed with the second pass.
5458
if (waitfor == MNT_NOWAIT) {
5464
* If we have managed to get rid of all the dirty buffers,
5465
* then we are done. For certain directories and block
5466
* devices, we may need to do further work.
5468
* We must wait for any I/O in progress to finish so that
5469
* all potential buffers on the dirty list will be visible.
5478
* Flush the dependencies associated with an inodedep.
5479
* Called with splbio blocked.
5482
flush_inodedep_deps(mp, ino)
5486
struct inodedep *inodedep;
5490
* This work is done in two passes. The first pass grabs most
5491
* of the buffers and begins asynchronously writing them. The
5492
* only way to wait for these asynchronous writes is to sleep
5493
* on the filesystem vnode which may stay busy for a long time
5494
* if the filesystem is active. So, instead, we make a second
5495
* pass over the dependencies blocking on each write. In the
5496
* usual case we will be blocking against a write that we
5497
* initiated, so when it is done the dependency will have been
5498
* resolved. Thus the second pass is expected to end quickly.
5499
* We give a brief window at the top of the loop to allow
5500
* any pending I/O to complete.
5502
for (error = 0, waitfor = MNT_NOWAIT; ; ) {
5507
if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
5509
if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
5510
flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
5511
flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
5512
flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
5515
* If pass2, we are done, otherwise do pass 2.
5517
if (waitfor == MNT_WAIT)
5522
* Try freeing inodedep in case all dependencies have been removed.
5524
if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
5525
(void) free_inodedep(inodedep);
5530
* Flush an inode dependency list.
5531
* Called with splbio blocked.
5534
flush_deplist(listhead, waitfor, errorp)
5535
struct allocdirectlst *listhead;
5539
struct allocdirect *adp;
5542
mtx_assert(&lk, MA_OWNED);
5543
TAILQ_FOREACH(adp, listhead, ad_next) {
5544
if (adp->ad_state & DEPCOMPLETE)
5547
bp = getdirtybuf(bp, &lk, waitfor);
5549
if (waitfor == MNT_NOWAIT)
5554
if (waitfor == MNT_NOWAIT) {
5556
} else if ((*errorp = bwrite(bp)) != 0) {
5567
* Eliminate a pagedep dependency by flushing out all its diradd dependencies.
5568
* Called with splbio blocked.
5571
flush_pagedep_deps(pvp, mp, diraddhdp)
5574
struct diraddhd *diraddhdp;
5576
struct inodedep *inodedep;
5577
struct ufsmount *ump;
5583
struct worklist *wk;
5586
while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
5588
* Flush ourselves if this directory entry
5589
* has a MKDIR_PARENT dependency.
5591
if (dap->da_state & MKDIR_PARENT) {
5593
if ((error = ffs_update(pvp, 1)) != 0)
5597
* If that cleared dependencies, go on to next.
5599
if (dap != LIST_FIRST(diraddhdp))
5601
if (dap->da_state & MKDIR_PARENT)
5602
panic("flush_pagedep_deps: MKDIR_PARENT");
5605
* A newly allocated directory must have its "." and
5606
* ".." entries written out before its name can be
5607
* committed in its parent. We do not want or need
5608
* the full semantics of a synchronous ffs_syncvnode as
5609
* that may end up here again, once for each directory
5610
* level in the filesystem. Instead, we push the blocks
5611
* and wait for them to clear. We have to fsync twice
5612
* because the first call may choose to defer blocks
5613
* that still have dependencies, but deferral will
5614
* happen at most once.
5616
inum = dap->da_newinum;
5617
if (dap->da_state & MKDIR_BODY) {
5619
if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
5622
if ((error=ffs_syncvnode(vp, MNT_NOWAIT)) ||
5623
(error=ffs_syncvnode(vp, MNT_NOWAIT))) {
5630
* If first block is still dirty with a D_MKDIR
5631
* dependency then it needs to be written now.
5635
bp = gbincore(&vp->v_bufobj, 0);
5637
break; /* First block not present */
5638
error = BUF_LOCK(bp,
5644
if (error == ENOLCK)
5645
continue; /* Slept, retry */
5648
if ((bp->b_flags & B_DELWRI) == 0) {
5650
break; /* Buffer not dirty */
5652
for (wk = LIST_FIRST(&bp->b_dep);
5654
wk = LIST_NEXT(wk, wk_list))
5655
if (wk->wk_type == D_MKDIR)
5658
BUF_UNLOCK(bp); /* Dependency gone */
5661
* D_MKDIR dependency remains,
5662
* must write buffer to stable
5675
break; /* Flushing of first block failed */
5678
* If that cleared dependencies, go on to next.
5680
if (dap != LIST_FIRST(diraddhdp))
5682
if (dap->da_state & MKDIR_BODY)
5683
panic("flush_pagedep_deps: MKDIR_BODY");
5686
* Flush the inode on which the directory entry depends.
5687
* Having accounted for MKDIR_PARENT and MKDIR_BODY above,
5688
* the only remaining dependency is that the updated inode
5689
* count must get pushed to disk. The inode has already
5690
* been pushed into its inode buffer (via VOP_UPDATE) at
5691
* the time of the reference count change. So we need only
5692
* locate that buffer, ensure that there will be no rollback
5693
* caused by a bitmap dependency, then write the inode buffer.
5696
if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
5697
panic("flush_pagedep_deps: lost inode");
5699
* If the inode still has bitmap dependencies,
5700
* push them to disk.
5702
if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5703
bp = inodedep->id_buf;
5704
bp = getdirtybuf(bp, &lk, MNT_WAIT);
5708
if ((error = bwrite(bp)) != 0)
5711
if (dap != LIST_FIRST(diraddhdp))
5715
* If the inode is still sitting in a buffer waiting
5716
* to be written, push it to disk.
5719
if ((error = bread(ump->um_devvp,
5720
fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
5721
(int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
5725
if ((error = bwrite(bp)) != 0)
5729
* If we have failed to get rid of all the dependencies
5730
* then something is seriously wrong.
5732
if (dap == LIST_FIRST(diraddhdp))
5733
panic("flush_pagedep_deps: flush failed");
5741
* A large burst of file addition or deletion activity can drive the
5742
* memory load excessively high. First attempt to slow things down
5743
* using the techniques below. If that fails, this routine requests
5744
* the offending operations to fall back to running synchronously
5745
* until the memory load returns to a reasonable level.
5748
softdep_slowdown(vp)
5751
int max_softdeps_hard;
5754
max_softdeps_hard = max_softdeps * 11 / 10;
5755
if (num_dirrem < max_softdeps_hard / 2 &&
5756
num_inodedep < max_softdeps_hard &&
5757
VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps &&
5758
num_freeblkdep < max_softdeps_hard) {
5762
if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps)
5764
stat_sync_limit_hit += 1;
5770
* Called by the allocation routines when they are about to fail
5771
* in the hope that we can free up some disk space.
5773
* First check to see if the work list has anything on it. If it has,
5774
* clean up entries until we successfully free some space. Because this
5775
* process holds inodes locked, we cannot handle any remove requests
5776
* that might block on a locked inode as that could lead to deadlock.
5777
* If the worklist yields no free space, encourage the syncer daemon
5778
* to help us. In no event will we try for longer than tickdelay seconds.
5781
softdep_request_cleanup(fs, vp)
5785
struct ufsmount *ump;
5787
ufs2_daddr_t needed;
5790
ump = VTOI(vp)->i_ump;
5791
mtx_assert(UFS_MTX(ump), MA_OWNED);
5792
needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
5793
starttime = time_second + tickdelay;
5795
* If we are being called because of a process doing a
5796
* copy-on-write, then it is not safe to update the vnode
5797
* as we may recurse into the copy-on-write routine.
5799
if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
5801
error = ffs_update(vp, 1);
5806
while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
5807
if (time_second > starttime)
5811
if (ump->softdep_on_worklist > 0 &&
5812
process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
5813
stat_worklist_push += 1;
5818
request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT);
5826
* If memory utilization has gotten too high, deliberately slow things
5827
* down and speed up the I/O processing.
5829
extern struct thread *syncertd;
5831
request_cleanup(mp, resource)
5835
struct thread *td = curthread;
5836
struct ufsmount *ump;
5838
mtx_assert(&lk, MA_OWNED);
5840
* We never hold up the filesystem syncer or buf daemon.
5842
if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
5846
* First check to see if the work list has gotten backlogged.
5847
* If it has, co-opt this process to help clean up two entries.
5848
* Because this process may hold inodes locked, we cannot
5849
* handle any remove requests that might block on a locked
5850
* inode as that could lead to deadlock. We set TDP_SOFTDEP
5851
* to avoid recursively processing the worklist.
5853
if (ump->softdep_on_worklist > max_softdeps / 10) {
5854
td->td_pflags |= TDP_SOFTDEP;
5855
process_worklist_item(mp, LK_NOWAIT);
5856
process_worklist_item(mp, LK_NOWAIT);
5857
td->td_pflags &= ~TDP_SOFTDEP;
5858
stat_worklist_push += 2;
5862
* Next, we attempt to speed up the syncer process. If that
5863
* is successful, then we allow the process to continue.
5865
if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT)
5868
* If we are resource constrained on inode dependencies, try
5869
* flushing some dirty inodes. Otherwise, we are constrained
5870
* by file deletions, so try accelerating flushes of directories
5871
* with removal dependencies. We would like to do the cleanup
5872
* here, but we probably hold an inode locked at this point and
5873
* that might deadlock against one that we try to clean. So,
5874
* the best that we can do is request the syncer daemon to do
5875
* the cleanup for us.
5880
stat_ino_limit_push += 1;
5881
req_clear_inodedeps += 1;
5882
stat_countp = &stat_ino_limit_hit;
5886
case FLUSH_REMOVE_WAIT:
5887
stat_blk_limit_push += 1;
5888
req_clear_remove += 1;
5889
stat_countp = &stat_blk_limit_hit;
5893
panic("request_cleanup: unknown type");
5896
* Hopefully the syncer daemon will catch up and awaken us.
5897
* We wait at most tickdelay before proceeding in any case.
5900
if (callout_pending(&softdep_callout) == FALSE)
5901
callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
5904
msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
5910
* Awaken processes pausing in request_cleanup and clear proc_waiting
5911
* to indicate that there is no longer a timer running.
5919
* The callout_ API has acquired mtx and will hold it around this
5923
wakeup_one(&proc_waiting);
5924
if (proc_waiting > 0)
5925
callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
5930
* Flush out a directory with at least one removal dependency in an effort to
5931
* reduce the number of dirrem, freefile, and freeblks dependency structures.
5937
struct pagedep_hashhead *pagedephd;
5938
struct pagedep *pagedep;
5939
static int next = 0;
5945
mtx_assert(&lk, MA_OWNED);
5947
for (cnt = 0; cnt < pagedep_hash; cnt++) {
5948
pagedephd = &pagedep_hashtbl[next++];
5949
if (next >= pagedep_hash)
5951
LIST_FOREACH(pagedep, pagedephd, pd_hash) {
5952
if (LIST_EMPTY(&pagedep->pd_dirremhd))
5954
mp = pagedep->pd_list.wk_mp;
5955
ino = pagedep->pd_ino;
5956
if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5959
if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
5960
FFSV_FORCEINSMQ))) {
5961
softdep_error("clear_remove: vget", error);
5962
vn_finished_write(mp);
5966
if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
5967
softdep_error("clear_remove: fsync", error);
5972
vn_finished_write(mp);
5980
* Clear out a block of dirty inodes in an effort to reduce
5981
* the number of inodedep dependency structures.
5987
struct inodedep_hashhead *inodedephd;
5988
struct inodedep *inodedep;
5989
static int next = 0;
5994
ino_t firstino, lastino, ino;
5996
mtx_assert(&lk, MA_OWNED);
5998
* Pick a random inode dependency to be cleared.
5999
* We will then gather up all the inodes in its block
6000
* that have dependencies and flush them out.
6002
for (cnt = 0; cnt < inodedep_hash; cnt++) {
6003
inodedephd = &inodedep_hashtbl[next++];
6004
if (next >= inodedep_hash)
6006
if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
6009
if (inodedep == NULL)
6011
fs = inodedep->id_fs;
6012
mp = inodedep->id_list.wk_mp;
6014
* Find the last inode in the block with dependencies.
6016
firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
6017
for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
6018
if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
6021
* Asynchronously push all but the last inode with dependencies.
6022
* Synchronously push the last inode with dependencies to ensure
6023
* that the inode block gets written to free up the inodedeps.
6025
for (ino = firstino; ino <= lastino; ino++) {
6026
if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
6028
if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
6031
if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
6032
FFSV_FORCEINSMQ)) != 0) {
6033
softdep_error("clear_inodedeps: vget", error);
6034
vn_finished_write(mp);
6038
if (ino == lastino) {
6039
if ((error = ffs_syncvnode(vp, MNT_WAIT)))
6040
softdep_error("clear_inodedeps: fsync1", error);
6042
if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
6043
softdep_error("clear_inodedeps: fsync2", error);
6049
vn_finished_write(mp);
6055
* Function to determine if the buffer has outstanding dependencies
6056
* that will cause a roll-back if the buffer is written. If wantcount
6057
* is set, return number of dependencies, otherwise just yes or no.
6060
softdep_count_dependencies(bp, wantcount)
6064
struct worklist *wk;
6065
struct inodedep *inodedep;
6066
struct indirdep *indirdep;
6067
struct allocindir *aip;
6068
struct pagedep *pagedep;
6074
LIST_FOREACH(wk, &bp->b_dep, wk_list) {
6075
switch (wk->wk_type) {
6078
inodedep = WK_INODEDEP(wk);
6079
if ((inodedep->id_state & DEPCOMPLETE) == 0) {
6080
/* bitmap allocation dependency */
6085
if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
6086
/* direct block pointer dependency */
6091
if (TAILQ_FIRST(&inodedep->id_extupdt)) {
6092
/* direct block pointer dependency */
6100
indirdep = WK_INDIRDEP(wk);
6102
LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
6103
/* indirect block pointer dependency */
6111
pagedep = WK_PAGEDEP(wk);
6112
for (i = 0; i < DAHASHSZ; i++) {
6114
LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
6115
/* directory entry dependency */
6127
/* never a dependency on these blocks */
6131
panic("softdep_check_for_rollback: Unexpected type %s",
6132
TYPENAME(wk->wk_type));
6142
* Acquire exclusive access to a buffer.
6143
* Must be called with a locked mtx parameter.
6144
* Return acquired buffer or NULL on failure.
6147
getdirtybuf(bp, mtx, waitfor)
6154
mtx_assert(mtx, MA_OWNED);
6155
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
6156
if (waitfor != MNT_WAIT)
6158
error = BUF_LOCK(bp,
6159
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
6161
* Even if we sucessfully acquire bp here, we have dropped
6162
* mtx, which may violates our guarantee.
6166
else if (error != ENOLCK)
6167
panic("getdirtybuf: inconsistent lock: %d", error);
6171
if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
6172
if (mtx == &lk && waitfor == MNT_WAIT) {
6174
BO_LOCK(bp->b_bufobj);
6176
if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
6177
bp->b_vflags |= BV_BKGRDWAIT;
6178
msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
6179
PRIBIO | PDROP, "getbuf", 0);
6181
BO_UNLOCK(bp->b_bufobj);
6186
if (waitfor != MNT_WAIT)
6189
* The mtx argument must be bp->b_vp's mutex in
6192
#ifdef DEBUG_VFS_LOCKS
6193
if (bp->b_vp->v_type != VCHR)
6194
ASSERT_VI_LOCKED(bp->b_vp, "getdirtybuf");
6196
bp->b_vflags |= BV_BKGRDWAIT;
6197
msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
6200
if ((bp->b_flags & B_DELWRI) == 0) {
6210
* Check if it is safe to suspend the file system now. On entry,
6211
* the vnode interlock for devvp should be held. Return 0 with
6212
* the mount interlock held if the file system can be suspended now,
6213
* otherwise return EAGAIN with the mount interlock held.
6216
softdep_check_suspend(struct mount *mp,
6217
struct vnode *devvp,
6219
int softdep_accdeps,
6220
int secondary_writes,
6221
int secondary_accwrites)
6224
struct ufsmount *ump;
6227
ASSERT_VI_LOCKED(devvp, "softdep_check_suspend");
6229
bo = &devvp->v_bufobj;
6232
if (!TRY_ACQUIRE_LOCK(&lk)) {
6239
if (!MNT_ITRYLOCK(mp)) {
6247
if (mp->mnt_secondary_writes != 0) {
6250
msleep(&mp->mnt_secondary_writes,
6252
(PUSER - 1) | PDROP, "secwr", 0);
6260
* Reasons for needing more work before suspend:
6261
* - Dirty buffers on devvp.
6262
* - Softdep activity occurred after start of vnode sync loop
6263
* - Secondary writes occurred after start of vnode sync loop
6266
if (bo->bo_numoutput > 0 ||
6267
bo->bo_dirty.bv_cnt > 0 ||
6268
softdep_deps != 0 ||
6269
ump->softdep_deps != 0 ||
6270
softdep_accdeps != ump->softdep_accdeps ||
6271
secondary_writes != 0 ||
6272
mp->mnt_secondary_writes != 0 ||
6273
secondary_accwrites != mp->mnt_secondary_accwrites)
6282
* Get the number of dependency structures for the file system, both
6283
* the current number and the total number allocated. These will
6284
* later be used to detect that softdep processing has occurred.
6287
softdep_get_depcounts(struct mount *mp,
6289
int *softdep_accdepsp)
6291
struct ufsmount *ump;
6295
*softdep_depsp = ump->softdep_deps;
6296
*softdep_accdepsp = ump->softdep_accdeps;
6301
* Wait for pending output on a vnode to complete.
6302
* Must be called with vnode lock and interlock locked.
6304
* XXX: Should just be a call to bufobj_wwait().
6310
ASSERT_VOP_LOCKED(vp, "drain_output");
6311
ASSERT_VI_LOCKED(vp, "drain_output");
6313
while (vp->v_bufobj.bo_numoutput) {
6314
vp->v_bufobj.bo_flag |= BO_WWAIT;
6315
msleep((caddr_t)&vp->v_bufobj.bo_numoutput,
6316
VI_MTX(vp), PRIBIO + 1, "drainvp", 0);
6321
* Called whenever a buffer that is being invalidated or reallocated
6322
* contains dependencies. This should only happen if an I/O error has
6323
* occurred. The routine is called with the buffer locked.
6326
softdep_deallocate_dependencies(bp)
6330
if ((bp->b_ioflags & BIO_ERROR) == 0)
6331
panic("softdep_deallocate_dependencies: dangling deps");
6332
softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
6333
panic("softdep_deallocate_dependencies: unrecovered I/O error");
6337
/*
 * Function to handle asynchronous write errors in the filesystem.
 *
 * NOTE(review): reconstructed from a corrupted extraction; verify against
 * upstream ffs_softdep.c.
 */
static void
softdep_error(func, error)
	char *func;
	int error;
{

	/* XXX should do something better! */
	printf("%s: got error %d while accessing filesystem\n", func, error);
}
#ifdef DDB

/*
 * DDB command: dump every inodedep hash chain; with an address argument,
 * restrict the dump to inodedeps belonging to that struct fs.
 *
 * NOTE(review): the #ifdef DDB guard and loop braces were lost in a
 * corrupted extraction; reconstructed -- verify against upstream
 * ffs_softdep.c.
 */
DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
{
	struct inodedep_hashhead *inodedephd;
	struct inodedep *inodedep;
	struct fs *fs;
	int cnt;

	fs = have_addr ? (struct fs *)addr : NULL;
	for (cnt = 0; cnt < inodedep_hash; cnt++) {
		inodedephd = &inodedep_hashtbl[cnt];
		LIST_FOREACH(inodedep, inodedephd, id_hash) {
			if (fs != NULL && fs != inodedep->id_fs)
				continue;
			db_printf("%p fs %p st %x ino %jd inoblk %jd\n",
			    inodedep, inodedep->id_fs, inodedep->id_state,
			    (intmax_t)inodedep->id_ino,
			    (intmax_t)fsbtodb(inodedep->id_fs,
			    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)));
		}
	}
}

#endif /* DDB */
#endif /* SOFTUPDATES */