/*-
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#include "opt_directio.h"
#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif

static vop_fsync_t	ffs_fsync;
static vop_lock1_t	ffs_lock;
static vop_getpages_t	ffs_getpages;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;
/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
	.vop_vptofh =		ffs_vptofh,
};

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_lock1 =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};
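
/*
 * The ffs_vnodeops1/ffs_fifoops1 vectors appear to serve mounts that lack
 * the extended attribute operations, while the *2 vectors add the extattr
 * entries and ffsext_strategy; which pair a mount installs is presumably
 * selected at mount time (UFS1 vs. UFS2 superblock magic).
 */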
/*
 * Synch an open file.
 */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct bufobj *bo;
	int error;

retry:
	error = ffs_syncvnode(vp, ap->a_waitfor);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT &&
	    (vp->v_mount->mnt_flag & MNT_SOFTDEP)) {
		error = softdep_fsync(vp);
		if (error)
			return (error);

		/*
		 * The softdep_fsync() function may drop vp lock,
		 * allowing for dirty buffers to reappear on the
		 * bo_dirty list. Recheck and resync as needed.
		 */
		bo = &vp->v_bufobj;
		if (vp->v_type == VREG && (bo->bo_numoutput > 0 ||
		    bo->bo_dirty.bv_cnt > 0))
			goto retry;
	}
	return (error);
}
int
ffs_syncvnode(struct vnode *vp, int waitfor)
{
	struct inode *ip = VTOI(vp);
	int s, error, wait, passes, skipmeta;

	wait = (waitfor == MNT_WAIT);
	lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, this pass is the first time through on a
		 * synchronous flush request and the buffer being considered
		 * is metadata, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		if ((skipmeta == 1 && bp->b_lblkno < 0))
			continue;
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
			continue;
		if (!wait && !LIST_EMPTY(&bp->b_dep) &&
		    (bp->b_flags & B_DEFERRED) == 0 &&
		    buf_countdeps(bp, 0)) {
			bp->b_flags |= B_DEFERRED;
			BUF_UNLOCK(bp);
			continue;
		}
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * If this is a synchronous flush request, or it is not a
		 * file or device, start the write on this buffer immediately.
		 */
		if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {
			/*
			 * On our final pass through, do all I/O synchronously
			 * so that we can find out if our flush is failing
			 * because of write errors.
			 */
			if (passes > 0 || !wait) {
				if ((bp->b_flags & B_CLUSTEROK) && !wait) {
					(void) vfs_bio_awrite(bp);
				}
			} else {
				if ((error = bwrite(bp)) != 0)
					return (error);
			}
		} else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
			/*
			 * If the buffer is for data that has been truncated
			 * off the file, then throw it away.
			 */
			bp->b_flags |= B_INVAL | B_NOCACHE;
		}
		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
	}
	/*
	 * If we were asked to do this synchronously, then go back for
	 * another pass, this time doing the metadata.
	 */
	bufobj_wwait(bo, 3, 0);
	/*
	 * Ensure that any filesystem metadata associated
	 * with the vnode has been written.
	 */
	if ((error = softdep_sync_metadata(vp)) != 0)
		return (error);
	if (bo->bo_dirty.bv_cnt > 0) {
		/*
		 * Block devices associated with filesystems may
		 * have new I/O requests posted for them even if
		 * the vnode is locked, so no amount of trying will
		 * get them clean. Thus we give block devices a
		 * good effort, then just give up. For all other file
		 * types, go around and try again until it is clean.
		 */
		if (!vn_isdisk(vp, NULL))
			vprint("ffs_fsync: dirty", vp);
	}
	return (ffs_update(vp, wait));
}
static int
ffs_lock(ap)
	struct vop_lock1_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
		char *file;
		int line;
	} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
#ifdef DEBUG_VFS_LOCKS
			KASSERT(vp->v_holdcnt != 0,
			    ("ffs_lock %p: zero hold count", vp));
#endif
			lkp = vp->v_vnlock;
			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept. The lock currently held is not the
			 * right lock. Release it, and try to get the
			 * right lock.
			 */
			(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
			    (LK_INTERLOCK | LK_NOWAIT))
				return (EBUSY);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}
/*
 * Vnode op for reading.
 */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;
	int seqcount;
	int ioflag;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return (error);
	}
#endif
	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);

	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer. The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;
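
		/*
		 * Illustrative arithmetic (hypothetical numbers): with
		 * fs_bsize = 16384 and uio_offset = 20480, lbn = 1 and
		 * blkoffset = 4096, so xfersize starts at 16384 - 4096 =
		 * 12288 and is then clamped to the smaller of the caller's
		 * remaining resid and the bytes left in the file.
		 */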
		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid, seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for. Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer. This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing. The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it. We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it.
	 * so it must have come from a 'break' statement
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0 &&
	    (ip->i_flag & IN_ACCESS) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}
/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	struct thread *td;
	ufs_lbn_t lbn;
	off_t osize;
	int seqcount;
	int blkoffset, error, flags, ioflag, resid, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	td = uio->uio_td;
	if (vp->v_type == VREG && td != NULL) {
		PROC_LOCK(td->td_proc);
		if (uio->uio_offset + uio->uio_resid >
		    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
			psignal(td->td_proc, SIGXFSZ);
			PROC_UNLOCK(td->td_proc);
			return (EFBIG);
		}
		PROC_UNLOCK(td->td_proc);
	}

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;
	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0) {
			vnode_pager_setsize(vp, ip->i_size);
			break;
		}
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap(). XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously. Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously. Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
	    ap->a_cred) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			DIP_SET(ip, i_mode, ip->i_mode);
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC),
			    ap->a_cred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}
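
/*
 * With IO_UNIT the write is all-or-nothing from the caller's point of
 * view: on error the file is truncated back to its original size and the
 * uio offset/resid are rewound, so no partial transfer is exposed.
 */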
static int
ffs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i;
	vm_page_t mreq;
	int pcount;

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	mreq = ap->a_m[ap->a_reqpage];

	/*
	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
	 * then the entire page is valid. Since the page may be mapped,
	 * user programs might reference data beyond the actual end of file
	 * occurring within the page. We have to zero that data.
	 */
	VM_OBJECT_LOCK(mreq->object);
	if (mreq->valid) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		vm_page_lock_queues();
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage) {
				vm_page_free(ap->a_m[i]);
			}
		}
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(mreq->object);
		return VM_PAGER_OK;
	}
	VM_OBJECT_UNLOCK(mreq->object);

	return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
	    ap->a_count, ap->a_reqpage);
}
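
/*
 * A page that is already valid is satisfied above without disk I/O (stale
 * bytes beyond EOF are zeroed first); anything else falls through to the
 * generic vnode pager path, which performs the actual read.
 */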
/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");

	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer. The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer. This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing. The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it. We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it.
	 * so it must have come from a 'break' statement
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}
	return (error);
}
/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	int blkoffset, error, flags, resid, size, xfersize;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

	KASSERT(!(ip->i_flag & IN_SPACECOUNTED), ("inode %u: inode is dead",
	    ip->i_number));

	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;
	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap(). XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously. Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously. Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    xfersize + blkoffset == fs->fs_bsize ||
			    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}
/*
 * Vnode operation to retrieve a named extended attribute.
 *
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name, u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}
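
/*
 * On-disk extended attribute record layout assumed by the arithmetic
 * above (cf. <ufs/ufs/extattr.h>):
 *
 *	uint32_t length;	total record length, 8-byte aligned
 *	uint8_t  namespace;	attribute namespace
 *	uint8_t  contentpadlen;	trailing pad after the data (eapad2)
 *	uint8_t  namelength;	strlen(name)
 *	char     name[];	name, then eapad1 pad to an 8-byte boundary
 *	...data..., then contentpadlen bytes of pad
 *
 * Hence the fixed header is sizeof(uint32_t) + 3 + nlen bytes and the
 * usable data length works out to ul - ealength - eapad2.
 */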
static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	int easize, error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

	easize = dp->di_extsize;
	if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
		return (EFBIG);

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}
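
/*
 * The extra argument lets a caller reserve slack past the current EA size
 * in the allocated copy, so the area can later grow in place; the read
 * itself only fills the first easize bytes.
 */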
static void
ffs_lock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	while (ip->i_flag & IN_EA_LOCKED) {
		ip->i_flag |= IN_EA_LOCKWAIT;
		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
		    0);
	}
	ip->i_flag |= IN_EA_LOCKED;
	VI_UNLOCK(vp);
}

static void
ffs_unlock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	if (ip->i_flag & IN_EA_LOCKWAIT)
		wakeup(&ip->i_ea_refs);
	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
	VI_UNLOCK(vp);
}
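
/*
 * The EA "lock" is just a pair of inode flag bits driven under the vnode
 * interlock: waiters mark IN_EA_LOCKWAIT and sleep on i_ea_refs, and the
 * unlock side wakes them before clearing both bits.
 */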
static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);
	ffs_lock_ea(vp);
	if (ip->i_ea_area != NULL) {
		ip->i_ea_refs++;
		ffs_unlock_ea(vp);
		return (0);
	}
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error) {
		ffs_unlock_ea(vp);
		return (error);
	}
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	ip->i_ea_refs++;
	ffs_unlock_ea(vp);
	return (0);
}
/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);
	ffs_lock_ea(vp);
	if (ip->i_ea_area == NULL) {
		ffs_unlock_ea(vp);
		return (EINVAL);
	}
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred, td);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	if (--ip->i_ea_refs == 0) {
		free(ip->i_ea_area, M_TEMP);
		ip->i_ea_area = NULL;
		ip->i_ea_len = 0;
		ip->i_ea_error = 0;
	}
	ffs_unlock_ea(vp);
	return (error);
}
/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}
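
/*
 * Buffers for the external attribute area carry negative logical block
 * numbers (-1 - lbn, see ffs_extread()/ffs_extwrite()), so on UFS2 a
 * b_lblkno in [-NXADDR, 0) is routed to the regular vnode strategy even
 * on a fifo vnode; true fifo I/O takes the fifo vector instead.
 */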
/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	IN struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}
/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	IN struct vnode *a_vp;
	IN int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}
/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed there, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealength = eapad1 = ealen = eapad2 = 0;

	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &p, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	bcopy(p, &ul, sizeof ul);
	i = p - eae + ul;
	if (ul != ealength) {
		bcopy(p + ul, p + ealength, easize - i);
		easize += (ealength - ul);
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}
/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;

	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}
/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	u_char *eae, *p, *pe, *pn;
	unsigned easize;
	uint32_t ul;
	int error, ealen;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;
	pe = eae + easize;
	for(p = eae; error == 0 && p < pe; p = pn) {
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p++ != ap->a_attrnamespace)
			continue;
		p++;	/* pad2 */
		ealen = *p;
		if (ap->a_size != NULL) {
			*ap->a_size += ealen + 1;
		} else if (ap->a_uio != NULL) {
			error = uiomove(p, ealen + 1, ap->a_uio);
		}
	}
	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}
/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed there, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealen = ap->a_uio->uio_resid;
	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = 8 - (ealength % 8);
	if (eapad1 == 8)
		eapad1 = 0;
	eapad2 = 8 - (ealen % 8);
	if (eapad2 == 8)
		eapad2 = 0;
	ealength += eapad1 + ealen + eapad2;
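
	/*
	 * Example (hypothetical name "foo", 5 bytes of data): the fixed
	 * header is 4 + 3 + 3 = 10 bytes, so eapad1 = 6 rounds the name up
	 * to 16; eapad2 = 3 rounds the data to 24 bytes total, and an entry
	 * whose header or data already ends on an 8-byte boundary gets no
	 * pad at all.
	 */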
	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	bcopy(&ealength, p, sizeof(ealength));
	p += sizeof(ealength);
	*p++ = ap->a_attrnamespace;
	*p++ = eapad2;
	*p++ = strlen(ap->a_name);
	strcpy(p, ap->a_name);
	p += strlen(ap->a_name);
	p += eapad1;
	error = uiomove(p, ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}
/*
 * Vnode pointer to File handle
 */
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
	IN struct vnode *a_vp;
	IN struct fid *a_fhp;
};
*/
{
	struct inode *ip;
	struct ufid *ufhp;

	ip = VTOI(ap->a_vp);
	ufhp = (struct ufid *)ap->a_fhp;
	ufhp->ufid_len = sizeof(struct ufid);
	ufhp->ufid_ino = ip->i_number;
	ufhp->ufid_gen = ip->i_gen;
	return (0);
}