2 * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause)
4 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
7 * This software was developed for the FreeBSD Project by Marshall
8 * Kirk McKusick and Network Associates Laboratories, the Security
9 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
10 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * Copyright (c) 1982, 1986, 1989, 1993
35 * The Regents of the University of California. All rights reserved.
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * from: @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
62 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
63 * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95
66 #include <sys/cdefs.h>
67 __FBSDID("$FreeBSD$");
69 #include "opt_directio.h"
73 #include <sys/param.h>
75 #include <sys/systm.h>
78 #include <sys/extattr.h>
79 #include <sys/kernel.h>
80 #include <sys/limits.h>
81 #include <sys/malloc.h>
82 #include <sys/mount.h>
84 #include <sys/rwlock.h>
86 #include <sys/sysctl.h>
87 #include <sys/vmmeter.h>
88 #include <sys/vnode.h>
91 #include <vm/vm_param.h>
92 #include <vm/vm_extern.h>
93 #include <vm/vm_object.h>
94 #include <vm/vm_page.h>
95 #include <vm/vm_pager.h>
96 #include <vm/vnode_pager.h>
98 #include <ufs/ufs/extattr.h>
99 #include <ufs/ufs/quota.h>
100 #include <ufs/ufs/inode.h>
101 #include <ufs/ufs/ufs_extern.h>
102 #include <ufs/ufs/ufsmount.h>
103 #include <ufs/ufs/dir.h>
105 #include <ufs/ufs/dirhash.h>
108 #include <ufs/ffs/fs.h>
109 #include <ufs/ffs/ffs_extern.h>
111 #define ALIGNED_TO(ptr, s) \
112 (((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0)
115 extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
117 static vop_fdatasync_t ffs_fdatasync;
118 static vop_fsync_t ffs_fsync;
119 static vop_getpages_t ffs_getpages;
120 static vop_getpages_async_t ffs_getpages_async;
121 static vop_lock1_t ffs_lock;
123 static vop_unlock_t ffs_unlock_debug;
125 static vop_read_t ffs_read;
126 static vop_write_t ffs_write;
127 static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
128 static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
130 static vop_strategy_t ffsext_strategy;
131 static vop_closeextattr_t ffs_closeextattr;
132 static vop_deleteextattr_t ffs_deleteextattr;
133 static vop_getextattr_t ffs_getextattr;
134 static vop_listextattr_t ffs_listextattr;
135 static vop_openextattr_t ffs_openextattr;
136 static vop_setextattr_t ffs_setextattr;
137 static vop_vptofh_t ffs_vptofh;
138 static vop_vput_pair_t ffs_vput_pair;
140 /* Global vfs data structures for ufs. */
141 struct vop_vector ffs_vnodeops1 = {
142 .vop_default = &ufs_vnodeops,
143 .vop_fsync = ffs_fsync,
144 .vop_fdatasync = ffs_fdatasync,
145 .vop_getpages = ffs_getpages,
146 .vop_getpages_async = ffs_getpages_async,
147 .vop_lock1 = ffs_lock,
149 .vop_unlock = ffs_unlock_debug,
151 .vop_read = ffs_read,
152 .vop_reallocblks = ffs_reallocblks,
153 .vop_write = ffs_write,
154 .vop_vptofh = ffs_vptofh,
155 .vop_vput_pair = ffs_vput_pair,
157 VFS_VOP_VECTOR_REGISTER(ffs_vnodeops1);
159 struct vop_vector ffs_fifoops1 = {
160 .vop_default = &ufs_fifoops,
161 .vop_fsync = ffs_fsync,
162 .vop_fdatasync = ffs_fdatasync,
163 .vop_lock1 = ffs_lock,
165 .vop_unlock = ffs_unlock_debug,
167 .vop_vptofh = ffs_vptofh,
169 VFS_VOP_VECTOR_REGISTER(ffs_fifoops1);
171 /* Global vfs data structures for ufs. */
172 struct vop_vector ffs_vnodeops2 = {
173 .vop_default = &ufs_vnodeops,
174 .vop_fsync = ffs_fsync,
175 .vop_fdatasync = ffs_fdatasync,
176 .vop_getpages = ffs_getpages,
177 .vop_getpages_async = ffs_getpages_async,
178 .vop_lock1 = ffs_lock,
180 .vop_unlock = ffs_unlock_debug,
182 .vop_read = ffs_read,
183 .vop_reallocblks = ffs_reallocblks,
184 .vop_write = ffs_write,
185 .vop_closeextattr = ffs_closeextattr,
186 .vop_deleteextattr = ffs_deleteextattr,
187 .vop_getextattr = ffs_getextattr,
188 .vop_listextattr = ffs_listextattr,
189 .vop_openextattr = ffs_openextattr,
190 .vop_setextattr = ffs_setextattr,
191 .vop_vptofh = ffs_vptofh,
192 .vop_vput_pair = ffs_vput_pair,
194 VFS_VOP_VECTOR_REGISTER(ffs_vnodeops2);
196 struct vop_vector ffs_fifoops2 = {
197 .vop_default = &ufs_fifoops,
198 .vop_fsync = ffs_fsync,
199 .vop_fdatasync = ffs_fdatasync,
200 .vop_lock1 = ffs_lock,
202 .vop_unlock = ffs_unlock_debug,
204 .vop_reallocblks = ffs_reallocblks,
205 .vop_strategy = ffsext_strategy,
206 .vop_closeextattr = ffs_closeextattr,
207 .vop_deleteextattr = ffs_deleteextattr,
208 .vop_getextattr = ffs_getextattr,
209 .vop_listextattr = ffs_listextattr,
210 .vop_openextattr = ffs_openextattr,
211 .vop_setextattr = ffs_setextattr,
212 .vop_vptofh = ffs_vptofh,
214 VFS_VOP_VECTOR_REGISTER(ffs_fifoops2);
217 * Synch an open file.
221 ffs_fsync(struct vop_fsync_args *ap)
230 error = ffs_syncvnode(vp, ap->a_waitfor, 0);
233 if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
234 error = softdep_fsync(vp);
239 * The softdep_fsync() function may drop vp lock,
240 * allowing for dirty buffers to reappear on the
241 * bo_dirty list. Recheck and resync as needed.
244 if ((vp->v_type == VREG || vp->v_type == VDIR) &&
245 (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
251 if (ffs_fsfail_cleanup(VFSTOUFS(vp->v_mount), 0))
257 ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
261 struct ufsmount *ump;
262 struct buf *bp, *nbp;
264 int error, passes, wflag;
265 bool still_dirty, unlocked, wait;
269 ump = VFSTOUFS(vp->v_mount);
271 wflag = IS_SNAPSHOT(ip) ? LK_NOWITNESS : 0;
277 * When doing MNT_WAIT we must first flush all dependencies
280 if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
281 (error = softdep_sync_metadata(vp)) != 0) {
282 if (ffs_fsfail_cleanup(ump, error))
288 * Flush all dirty buffers associated with a vnode.
292 wait = false; /* Always do an async pass first. */
294 lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
297 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
298 bp->b_vflags &= ~BV_SCANNED;
299 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
301 * Reasons to skip this buffer: it has already been considered
302 * on this pass, the buffer has dependencies that will cause
303 * it to be redirtied and it has not already been deferred,
304 * or it is already being written.
306 if ((bp->b_vflags & BV_SCANNED) != 0)
308 bp->b_vflags |= BV_SCANNED;
310 * Flush indirects in order, if requested.
312 * Note that if only datasync is requested, we can
313 * skip indirect blocks when softupdates are not
314 * active. Otherwise we must flush them with data,
315 * since dependencies prevent data block writes.
317 if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR &&
318 (lbn_level(bp->b_lblkno) >= passes ||
319 ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
321 if (bp->b_lblkno > lbn)
322 panic("ffs_syncvnode: syncing truncated data.");
323 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
326 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
327 LK_INTERLOCK | wflag, BO_LOCKPTR(bo)) != 0) {
329 bp->b_vflags &= ~BV_SCANNED;
334 if ((bp->b_flags & B_DELWRI) == 0)
335 panic("ffs_fsync: not dirty");
337 * Check for dependencies and potentially complete them.
339 if (!LIST_EMPTY(&bp->b_dep) &&
340 (error = softdep_sync_buf(vp, bp,
341 wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
343 * Lock order conflict, buffer was already unlocked,
344 * and vnode possibly unlocked.
346 if (error == ERELOOKUP) {
347 if (vp->v_data == NULL)
350 if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
351 (error = softdep_sync_metadata(vp)) != 0) {
352 if (ffs_fsfail_cleanup(ump, error))
354 return (unlocked && error == 0 ?
357 /* Re-evaluate inode size */
358 lbn = lblkno(ITOFS(ip), (ip->i_size +
359 ITOFS(ip)->fs_bsize - 1));
363 if (error != EBUSY) {
367 /* If we deferred once, don't defer again. */
368 if ((bp->b_flags & B_DEFERRED) == 0) {
369 bp->b_flags |= B_DEFERRED;
377 if (ffs_fsfail_cleanup(ump, error))
381 } else if ((bp->b_flags & B_CLUSTEROK)) {
382 (void) vfs_bio_awrite(bp);
389 * Since we may have slept during the I/O, we need
390 * to start from a known point.
394 nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
396 if (waitfor != MNT_WAIT) {
398 if ((flags & NO_INO_UPDT) != 0)
399 return (unlocked ? ERELOOKUP : 0);
400 error = ffs_update(vp, 0);
401 if (error == 0 && unlocked)
405 /* Drain IO to see if we're done. */
406 bufobj_wwait(bo, 0, 0);
408 * Block devices associated with filesystems may have new I/O
409 * requests posted for them even if the vnode is locked, so no
410 * amount of trying will get them clean. We make several passes
413 * Regular files may need multiple passes to flush all dependency
414 * work as it is possible that we must write once per indirect
415 * level, once for the leaf, and once for the inode and each of
416 * these will be done with one sync and one async pass.
418 if (bo->bo_dirty.bv_cnt > 0) {
419 if ((flags & DATA_ONLY) == 0) {
423 * For data-only sync, dirty indirect buffers
427 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
428 if (bp->b_lblkno > -UFS_NDADDR) {
436 /* Write the inode after sync passes to flush deps. */
437 if (wait && DOINGSOFTDEP(vp) &&
438 (flags & NO_INO_UPDT) == 0) {
443 /* switch between sync/async. */
445 if (wait || ++passes < UFS_NIADDR + 2)
451 if ((flags & DATA_ONLY) == 0) {
452 if ((flags & NO_INO_UPDT) == 0)
453 error = ffs_update(vp, 1);
455 softdep_journal_fsync(VTOI(vp));
456 } else if ((ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA)) != 0) {
457 error = ffs_update(vp, 1);
459 if (error == 0 && unlocked)
462 ip->i_flag &= ~IN_NEEDSYNC;
467 ffs_fdatasync(struct vop_fdatasync_args *ap)
470 return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
475 struct vop_lock1_args /* {
482 #if !defined(NO_FFS_SNAPSHOT) || defined(DIAGNOSTIC)
483 struct vnode *vp = ap->a_vp;
484 #endif /* !NO_FFS_SNAPSHOT || DIAGNOSTIC */
487 #endif /* DIAGNOSTIC */
489 #ifndef NO_FFS_SNAPSHOT
494 * Adaptive spinning mixed with SU leads to trouble. Use a giant hammer
495 * and only use it when LK_NODDLKTREAT is set. Currently this means it
496 * is only used during path lookup.
498 if ((ap->a_flags & LK_NODDLKTREAT) != 0)
499 ap->a_flags |= LK_ADAPTIVE;
500 switch (ap->a_flags & LK_TYPE_MASK) {
506 #ifdef DEBUG_VFS_LOCKS
507 VNPASS(vp->v_holdcnt != 0, vp);
508 #endif /* DEBUG_VFS_LOCKS */
510 result = lockmgr_lock_flags(lkp, flags,
511 &VI_MTX(vp)->lock_object, ap->a_file, ap->a_line);
512 if (lkp == vp->v_vnlock || result != 0)
515 * Apparent success, except that the vnode
516 * mutated between snapshot file vnode and
517 * regular file vnode while this process
518 * slept. The lock currently held is not the
519 * right lock. Release it, and try to get the
523 if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
524 (LK_INTERLOCK | LK_NOWAIT))
526 if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
527 flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
528 flags &= ~LK_INTERLOCK;
531 switch (ap->a_flags & LK_TYPE_MASK) {
534 if (result == 0 && vp->v_vnlock->lk_recurse == 0) {
540 #endif /* DIAGNOSTIC */
544 if ((ap->a_flags & LK_TYPE_MASK) == LK_DOWNGRADE) {
547 ufs_unlock_tracker(ip);
549 #endif /* DIAGNOSTIC */
550 result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
553 #else /* NO_FFS_SNAPSHOT */
555 * See above for an explanation.
557 if ((ap->a_flags & LK_NODDLKTREAT) != 0)
558 ap->a_flags |= LK_ADAPTIVE;
560 if ((ap->a_flags & LK_TYPE_MASK) == LK_DOWNGRADE) {
563 ufs_unlock_tracker(ip);
565 #endif /* DIAGNOSTIC */
566 result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
567 #endif /* NO_FFS_SNAPSHOT */
569 switch (ap->a_flags & LK_TYPE_MASK) {
572 if (result == 0 && vp->v_vnlock->lk_recurse == 0) {
578 #endif /* DIAGNOSTIC */
584 ffs_unlock_debug(struct vop_unlock_args *ap)
591 if (ip->i_flag & UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE) {
592 if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
594 VNASSERT((vp->v_mflag & VMP_LAZYLIST), vp,
595 ("%s: modified vnode (%x) not on lazy list",
596 __func__, ip->i_flag));
600 KASSERT(vp->v_type != VDIR || vp->v_vnlock->lk_recurse != 0 ||
601 (ip->i_flag & IN_ENDOFF) == 0,
602 ("ufs dir vp %p ip %p flags %#x", vp, ip, ip->i_flag));
604 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && ip != NULL &&
605 vp->v_vnlock->lk_recurse == 0)
606 ufs_unlock_tracker(ip);
608 return (VOP_UNLOCK_APV(&ufs_vnodeops, ap));
613 ffs_read_hole(struct uio *uio, long xfersize, long *size)
615 ssize_t saved_resid, tlen;
618 while (xfersize > 0) {
619 tlen = min(xfersize, ZERO_REGION_SIZE);
620 saved_resid = uio->uio_resid;
621 error = vn_io_fault_uiomove(__DECONST(void *, zero_region),
625 tlen = saved_resid - uio->uio_resid;
633 * Vnode op for reading.
637 struct vop_read_args /* {
641 struct ucred *a_cred;
649 ufs_lbn_t lbn, nextlbn;
651 long size, xfersize, blkoffset;
653 int bflag, error, ioflag, seqcount;
657 ioflag = ap->a_ioflag;
658 if (ap->a_ioflag & IO_EXT)
660 return (ffs_extread(vp, uio, ioflag));
662 panic("ffs_read+IO_EXT");
665 if ((ioflag & IO_DIRECT) != 0) {
668 error = ffs_rawread(vp, uio, &workdone);
669 if (error != 0 || workdone != 0)
674 seqcount = ap->a_ioflag >> IO_SEQSHIFT;
678 if (uio->uio_rw != UIO_READ)
679 panic("ffs_read: mode");
681 if (vp->v_type == VLNK) {
682 if ((int)ip->i_size < VFSTOUFS(vp->v_mount)->um_maxsymlinklen)
683 panic("ffs_read: short symlink");
684 } else if (vp->v_type != VREG && vp->v_type != VDIR)
685 panic("ffs_read: type %d", vp->v_type);
687 orig_resid = uio->uio_resid;
688 KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
691 KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
693 if (uio->uio_offset < ip->i_size &&
694 uio->uio_offset >= fs->fs_maxfilesize)
697 bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE);
698 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
699 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
701 lbn = lblkno(fs, uio->uio_offset);
705 * size of buffer. The buffer representing the
706 * end of the file is rounded up to the size of
707 * the block type ( fragment or full block,
710 size = blksize(fs, ip, lbn);
711 blkoffset = blkoff(fs, uio->uio_offset);
714 * The amount we want to transfer in this iteration is
715 * one FS block less the amount of the data before
716 * our startpoint (duh!)
718 xfersize = fs->fs_bsize - blkoffset;
721 * But if we actually want less than the block,
722 * or the file doesn't have a whole block more of data,
723 * then use the lesser number.
725 if (uio->uio_resid < xfersize)
726 xfersize = uio->uio_resid;
727 if (bytesinfile < xfersize)
728 xfersize = bytesinfile;
730 if (lblktosize(fs, nextlbn) >= ip->i_size) {
732 * Don't do readahead if this is the end of the file.
734 error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
735 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
737 * Otherwise if we are allowed to cluster,
738 * grab as much as we can.
740 * XXX This may not be a win if we are not
741 * doing sequential access.
743 error = cluster_read(vp, ip->i_size, lbn,
744 size, NOCRED, blkoffset + uio->uio_resid,
745 seqcount, bflag, &bp);
746 } else if (seqcount > 1) {
748 * If we are NOT allowed to cluster, then
749 * if we appear to be acting sequentially,
750 * fire off a request for a readahead
751 * as well as a read. Note that the 4th and 5th
752 * arguments point to arrays of the size specified in
755 u_int nextsize = blksize(fs, ip, nextlbn);
756 error = breadn_flags(vp, lbn, lbn, size, &nextlbn,
757 &nextsize, 1, NOCRED, bflag, NULL, &bp);
760 * Failing all of the above, just read what the
761 * user asked for. Interestingly, the same as
762 * the first option above.
764 error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
766 if (error == EJUSTRETURN) {
767 error = ffs_read_hole(uio, xfersize, &size);
778 * We should only get non-zero b_resid when an I/O error
779 * has occurred, which should cause us to break above.
780 * However, if the short read did not cause an error,
781 * then we want to ensure that we do not uiomove bad
782 * or uninitialized data.
785 if (size < xfersize) {
791 if (buf_mapped(bp)) {
792 error = vn_io_fault_uiomove((char *)bp->b_data +
793 blkoffset, (int)xfersize, uio);
795 error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
801 vfs_bio_brelse(bp, ioflag);
805 * This can only happen in the case of an error
806 * because the loop above resets bp to NULL on each iteration
807 * and on normal completion has not set a new value into it,
808 * so it must have come from a 'break' statement.
811 vfs_bio_brelse(bp, ioflag);
813 if ((error == 0 || uio->uio_resid != orig_resid) &&
814 (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
815 UFS_INODE_SET_FLAG_SHARED(ip, IN_ACCESS);
820 * Vnode op for writing.
824 struct vop_write_args /* {
828 struct ucred *a_cred;
840 int blkoffset, error, flags, ioflag, size, xfersize;
844 softdep_prealloc(vp, MNT_WAIT);
845 if (vp->v_data == NULL)
849 ioflag = ap->a_ioflag;
850 if (ap->a_ioflag & IO_EXT)
852 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
854 panic("ffs_write+IO_EXT");
857 seqcount = ap->a_ioflag >> IO_SEQSHIFT;
861 if (uio->uio_rw != UIO_WRITE)
862 panic("ffs_write: mode");
865 switch (vp->v_type) {
867 if (ioflag & IO_APPEND)
868 uio->uio_offset = ip->i_size;
869 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
875 panic("ffs_write: dir write");
878 panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
879 (int)uio->uio_offset,
884 KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
885 KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
887 if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
890 * Maybe this should be above the vnode op call, but so long as
891 * file servers have no limits, I don't think it matters.
893 if (vn_rlimit_fsize(vp, uio, uio->uio_td))
896 resid = uio->uio_resid;
898 if (seqcount > BA_SEQMAX)
899 flags = BA_SEQMAX << BA_SEQSHIFT;
901 flags = seqcount << BA_SEQSHIFT;
902 if (ioflag & IO_SYNC)
904 flags |= BA_UNMAPPED;
906 for (error = 0; uio->uio_resid > 0;) {
907 lbn = lblkno(fs, uio->uio_offset);
908 blkoffset = blkoff(fs, uio->uio_offset);
909 xfersize = fs->fs_bsize - blkoffset;
910 if (uio->uio_resid < xfersize)
911 xfersize = uio->uio_resid;
912 if (uio->uio_offset + xfersize > ip->i_size)
913 vnode_pager_setsize(vp, uio->uio_offset + xfersize);
916 * We must perform a read-before-write if the transfer size
917 * does not cover the entire buffer.
919 if (fs->fs_bsize > xfersize)
923 /* XXX is uio->uio_offset the right thing here? */
924 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
925 ap->a_cred, flags, &bp);
927 vnode_pager_setsize(vp, ip->i_size);
930 if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
931 bp->b_flags |= B_NOCACHE;
933 if (uio->uio_offset + xfersize > ip->i_size) {
934 ip->i_size = uio->uio_offset + xfersize;
935 DIP_SET(ip, i_size, ip->i_size);
936 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
939 size = blksize(fs, ip, lbn) - bp->b_resid;
943 if (buf_mapped(bp)) {
944 error = vn_io_fault_uiomove((char *)bp->b_data +
945 blkoffset, (int)xfersize, uio);
947 error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
951 * If the buffer is not already filled and we encounter an
952 * error while trying to fill it, we have to clear out any
953 * garbage data from the pages instantiated for the buffer.
954 * If we do not, a failed uiomove() during a write can leave
955 * the prior contents of the pages exposed to a userland mmap.
957 * Note that we need only clear buffers with a transfer size
958 * equal to the block size because buffers with a shorter
959 * transfer size were cleared above by the call to UFS_BALLOC()
960 * with the BA_CLRBUF flag set.
962 * If the source region for uiomove identically mmaps the
963 * buffer, uiomove() performed the NOP copy, and the buffer
964 * content remains valid because the page fault handler
965 * validated the pages.
967 if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
968 fs->fs_bsize == xfersize)
971 vfs_bio_set_flags(bp, ioflag);
974 * If IO_SYNC each buffer is written synchronously. Otherwise
975 * if we have a severe page deficiency write the buffer
976 * asynchronously. Otherwise try to cluster, and if that
977 * doesn't do it then either do an async write (if O_DIRECT),
978 * or a delayed write (if not).
980 if (ioflag & IO_SYNC) {
982 } else if (vm_page_count_severe() ||
983 buf_dirty_count_severe() ||
984 (ioflag & IO_ASYNC)) {
985 bp->b_flags |= B_CLUSTEROK;
987 } else if (xfersize + blkoffset == fs->fs_bsize) {
988 if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
989 bp->b_flags |= B_CLUSTEROK;
990 cluster_write(vp, bp, ip->i_size, seqcount,
995 } else if (ioflag & IO_DIRECT) {
996 bp->b_flags |= B_CLUSTEROK;
999 bp->b_flags |= B_CLUSTEROK;
1002 if (error || xfersize == 0)
1004 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
1007 * If we successfully wrote any data, and we are not the superuser
1008 * we clear the setuid and setgid bits as a precaution against
1011 if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
1013 if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) {
1014 vn_seqc_write_begin(vp);
1015 UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID));
1016 DIP_SET(ip, i_mode, ip->i_mode);
1017 vn_seqc_write_end(vp);
1021 if (ioflag & IO_UNIT) {
1022 (void)ffs_truncate(vp, osize,
1023 IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
1024 uio->uio_offset -= resid - uio->uio_resid;
1025 uio->uio_resid = resid;
1027 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) {
1028 if (!(ioflag & IO_DATASYNC) ||
1029 (ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA)))
1030 error = ffs_update(vp, 1);
1031 if (ffs_fsfail_cleanup(VFSTOUFS(vp->v_mount), error))
1038 * Extended attribute area reading.
1041 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
1044 struct ufs2_dinode *dp;
1047 ufs_lbn_t lbn, nextlbn;
1049 long size, xfersize, blkoffset;
1058 if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
1059 panic("ffs_extread: mode");
1062 orig_resid = uio->uio_resid;
1063 KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
1064 if (orig_resid == 0)
1066 KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));
1068 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
1069 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
1071 lbn = lblkno(fs, uio->uio_offset);
1075 * size of buffer. The buffer representing the
1076 * end of the file is rounded up to the size of
1077 * the block type ( fragment or full block,
1080 size = sblksize(fs, dp->di_extsize, lbn);
1081 blkoffset = blkoff(fs, uio->uio_offset);
1084 * The amount we want to transfer in this iteration is
1085 * one FS block less the amount of the data before
1086 * our startpoint (duh!)
1088 xfersize = fs->fs_bsize - blkoffset;
1091 * But if we actually want less than the block,
1092 * or the file doesn't have a whole block more of data,
1093 * then use the lesser number.
1095 if (uio->uio_resid < xfersize)
1096 xfersize = uio->uio_resid;
1097 if (bytesinfile < xfersize)
1098 xfersize = bytesinfile;
1100 if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
1102 * Don't do readahead if this is the end of the info.
1104 error = bread(vp, -1 - lbn, size, NOCRED, &bp);
1107 * If we have a second block, then
1108 * fire off a request for a readahead
1109 * as well as a read. Note that the 4th and 5th
1110 * arguments point to arrays of the size specified in
1113 u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
1115 nextlbn = -1 - nextlbn;
1116 error = breadn(vp, -1 - lbn,
1117 size, &nextlbn, &nextsize, 1, NOCRED, &bp);
1126 * We should only get non-zero b_resid when an I/O error
1127 * has occurred, which should cause us to break above.
1128 * However, if the short read did not cause an error,
1129 * then we want to ensure that we do not uiomove bad
1130 * or uninitialized data.
1132 size -= bp->b_resid;
1133 if (size < xfersize) {
1139 error = uiomove((char *)bp->b_data + blkoffset,
1140 (int)xfersize, uio);
1143 vfs_bio_brelse(bp, ioflag);
1147 * This can only happen in the case of an error
1148 * because the loop above resets bp to NULL on each iteration
1149 * and on normal completion has not set a new value into it,
1150 * so it must have come from a 'break' statement.
1153 vfs_bio_brelse(bp, ioflag);
1158 * Extended attribute area writing.
1161 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1164 struct ufs2_dinode *dp;
1170 int blkoffset, error, flags, size, xfersize;
1177 if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1178 panic("ffs_extwrite: mode");
1181 if (ioflag & IO_APPEND)
1182 uio->uio_offset = dp->di_extsize;
1183 KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
1184 KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
1185 if ((uoff_t)uio->uio_offset + uio->uio_resid >
1186 UFS_NXADDR * fs->fs_bsize)
1189 resid = uio->uio_resid;
1190 osize = dp->di_extsize;
1192 if (ioflag & IO_SYNC)
1195 for (error = 0; uio->uio_resid > 0;) {
1196 lbn = lblkno(fs, uio->uio_offset);
1197 blkoffset = blkoff(fs, uio->uio_offset);
1198 xfersize = fs->fs_bsize - blkoffset;
1199 if (uio->uio_resid < xfersize)
1200 xfersize = uio->uio_resid;
1203 * We must perform a read-before-write if the transfer size
1204 * does not cover the entire buffer.
1206 if (fs->fs_bsize > xfersize)
1209 flags &= ~BA_CLRBUF;
1210 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1215 * If the buffer is not valid we have to clear out any
1216 * garbage data from the pages instantiated for the buffer.
1217 * If we do not, a failed uiomove() during a write can leave
1218 * the prior contents of the pages exposed to a userland
1219 * mmap(). XXX deal with uiomove() errors a better way.
1221 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1224 if (uio->uio_offset + xfersize > dp->di_extsize) {
1225 dp->di_extsize = uio->uio_offset + xfersize;
1226 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
1229 size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1230 if (size < xfersize)
1234 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1236 vfs_bio_set_flags(bp, ioflag);
1239 * If IO_SYNC each buffer is written synchronously. Otherwise
1240 * if we have a severe page deficiency write the buffer
1241 * asynchronously. Otherwise try to cluster, and if that
1242 * doesn't do it then either do an async write (if O_DIRECT),
1243 * or a delayed write (if not).
1245 if (ioflag & IO_SYNC) {
1247 } else if (vm_page_count_severe() ||
1248 buf_dirty_count_severe() ||
1249 xfersize + blkoffset == fs->fs_bsize ||
1250 (ioflag & (IO_ASYNC | IO_DIRECT)))
1254 if (error || xfersize == 0)
1256 UFS_INODE_SET_FLAG(ip, IN_CHANGE);
1259 * If we successfully wrote any data, and we are not the superuser
1260 * we clear the setuid and setgid bits as a precaution against
1263 if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
1264 if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) {
1265 vn_seqc_write_begin(vp);
1266 UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID));
1267 dp->di_mode = ip->i_mode;
1268 vn_seqc_write_end(vp);
1272 if (ioflag & IO_UNIT) {
1273 (void)ffs_truncate(vp, osize,
1274 IO_EXT | (ioflag&IO_SYNC), ucred);
1275 uio->uio_offset -= resid - uio->uio_resid;
1276 uio->uio_resid = resid;
1278 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1279 error = ffs_update(vp, 1);
1284 * Vnode operation to retrieve a named extended attribute.
1286 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1287 * the length of the EA, and possibly the pointer to the entry and to the data.
1290 ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
1291 struct extattr **eapp, u_char **eac)
1293 struct extattr *eap, *eaend;
1296 nlen = strlen(name);
1297 KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned"));
1298 eap = (struct extattr *)ptr;
1299 eaend = (struct extattr *)(ptr + length);
1300 for (; eap < eaend; eap = EXTATTR_NEXT(eap)) {
1301 KASSERT(EXTATTR_NEXT(eap) <= eaend,
1302 ("extattr next %p beyond %p", EXTATTR_NEXT(eap), eaend));
1303 if (eap->ea_namespace != nspace || eap->ea_namelength != nlen
1304 || memcmp(eap->ea_name, name, nlen) != 0)
1309 *eac = EXTATTR_CONTENT(eap);
1310 return (EXTATTR_CONTENT_SIZE(eap));
1316 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td)
1318 const struct extattr *eap, *eaend, *eapnext;
1320 struct ufs2_dinode *dp;
1323 struct iovec liovec;
1331 easize = dp->di_extsize;
1332 if ((uoff_t)easize > UFS_NXADDR * fs->fs_bsize)
1335 eae = malloc(easize, M_TEMP, M_WAITOK);
1337 liovec.iov_base = eae;
1338 liovec.iov_len = easize;
1339 luio.uio_iov = &liovec;
1340 luio.uio_iovcnt = 1;
1341 luio.uio_offset = 0;
1342 luio.uio_resid = easize;
1343 luio.uio_segflg = UIO_SYSSPACE;
1344 luio.uio_rw = UIO_READ;
1347 error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1352 /* Validate disk xattrfile contents. */
1353 for (eap = (void *)eae, eaend = (void *)(eae + easize); eap < eaend;
1355 /* Detect zeroed out tail */
1356 if (eap->ea_length < sizeof(*eap) || eap->ea_length == 0) {
1357 easize = (const u_char *)eap - eae;
1361 eapnext = EXTATTR_NEXT(eap);
1362 /* Bogusly long entry. */
1363 if (eapnext > eaend) {
1365 return (EINTEGRITY);
1368 ip->i_ea_len = easize;
/*
 * ffs_lock_ea: take the per-inode EA lock, which is kept as the
 * IN_EA_LOCKED bit in ip->i_flag.
 *
 * While the bit is already set, IN_EA_LOCKWAIT is flagged and the thread
 * sleeps on &ip->i_ea_refs, passing the vnode interlock to msleep().
 * Once the bit is clear it is set for the caller.
 */
1374 ffs_lock_ea(struct vnode *vp)
1380 while (ip->i_flag & IN_EA_LOCKED) {
1381 UFS_INODE_SET_FLAG(ip, IN_EA_LOCKWAIT);
1382 msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
1385 UFS_INODE_SET_FLAG(ip, IN_EA_LOCKED);
/*
 * ffs_unlock_ea: drop the per-inode EA lock.
 *
 * Threads sleeping on &ip->i_ea_refs are woken when IN_EA_LOCKWAIT is set,
 * then both IN_EA_LOCKED and IN_EA_LOCKWAIT are cleared.
 */
1390 ffs_unlock_ea(struct vnode *vp)
1396 if (ip->i_flag & IN_EA_LOCKWAIT)
1397 wakeup(&ip->i_ea_refs);
1398 ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
/*
 * ffs_open_ea: make the in-memory EA area of vp available to the caller.
 *
 * The area is loaded with ffs_rdextattr() into ip->i_ea_area.
 * NOTE(review): when ip->i_ea_area is already set the function presumably
 * only takes another reference and returns -- confirm.
 */
1403 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1411 if (ip->i_ea_area != NULL) {
1416 error = ffs_rdextattr(&ip->i_ea_area, vp, td);
1428 * Vnode extattr transaction commit/abort
/*
 * ffs_close_ea: finish an EA transaction on vp.
 *
 * commit != 0 asks for the in-memory area to be written back; this only
 * happens when no error is pending in ip->i_ea_error, and it requires the
 * vnode to be exclusively locked (asserted).  The write covers
 * MAX(ip->i_ea_len, dp->di_extsize) bytes: the EA data first, then zeroes
 * taken from zero_region for whatever remains.
 *
 * One reference is dropped from ip->i_ea_refs; when it reaches zero the
 * area is freed and ip->i_ea_area is reset to NULL.
 * NOTE(review): the ffs_truncate(vp, 0, IO_EXT, ...) call presumably
 * belongs to the case where the committed area is empty -- confirm.
 */
1431 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1435 struct iovec *liovec;
1436 struct ufs2_dinode *dp;
1437 size_t ea_len, tlen;
1444 if (ip->i_ea_area == NULL) {
1449 error = ip->i_ea_error;
1451 if (commit && error == 0) {
1452 ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
1454 cred = vp->v_mount->mnt_cred;
1456 ea_len = MAX(ip->i_ea_len, dp->di_extsize);
/*
 * Size the iovec array: one slot for the EA data, then the zero padding
 * is consumed in chunks of at most ZERO_REGION_SIZE bytes.
 */
1457 for (lcnt = 1, tlen = ea_len - ip->i_ea_len; tlen > 0;) {
1458 tlen -= MIN(ZERO_REGION_SIZE, tlen);
1462 liovec = __builtin_alloca(lcnt * sizeof(struct iovec));
1463 luio.uio_iovcnt = lcnt;
/* Slot 0 carries the attribute data itself. */
1465 liovec[0].iov_base = ip->i_ea_area;
1466 liovec[0].iov_len = ip->i_ea_len;
/* Remaining slots all point at zero_region and cover the padding. */
1467 for (i = 1, tlen = ea_len - ip->i_ea_len; i < lcnt; i++) {
1468 liovec[i].iov_base = __DECONST(void *, zero_region);
1469 liovec[i].iov_len = MIN(ZERO_REGION_SIZE, tlen);
1470 tlen -= liovec[i].iov_len;
1474 luio.uio_iov = liovec;
1475 luio.uio_offset = 0;
1476 luio.uio_resid = ea_len;
1477 luio.uio_segflg = UIO_SYSSPACE;
1478 luio.uio_rw = UIO_WRITE;
1480 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1481 if (error == 0 && ip->i_ea_len == 0)
/* Last reference gone: release the in-memory copy. */
1484 if (--ip->i_ea_refs == 0) {
1485 free(ip->i_ea_area, M_TEMP);
1486 ip->i_ea_area = NULL;
1493 ffs_truncate(vp, 0, IO_EXT, cred);
1498 * Vnode extattr strategy routine for fifos.
1500 * We need to check for a read or write of the external attributes.
1501 * Otherwise we just fall through and do the usual thing.
/*
 * ffsext_strategy: route a strategy request to the right vnode op vector.
 *
 * On a UFS2 inode, a buffer whose logical block number lies in
 * [-UFS_NXADDR, -1] is passed to ufs_vnodeops.  Otherwise a FIFO vnode is
 * passed to ufs_fifoops.  Any other vnode type reaching this point
 * panics.
 */
1504 ffsext_strategy(struct vop_strategy_args *ap)
1506 struct vop_strategy_args {
1507 struct vnodeop_desc *a_desc;
1517 lbn = ap->a_bp->b_lblkno;
1518 if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR)
1519 return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
1520 if (vp->v_type == VFIFO)
1521 return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
1522 panic("spec nodes went here");
1526 * Vnode extattr transaction open
/*
 * ffs_openextattr: start an EA transaction on ap->a_vp.
 *
 * Character and block device vnodes are refused with EOPNOTSUPP; for all
 * other vnodes the result of ffs_open_ea() is returned.
 */
1529 ffs_openextattr(struct vop_openextattr_args *ap)
1531 struct vop_openextattr_args {
1532 struct vnodeop_desc *a_desc;
1534 IN struct ucred *a_cred;
1535 IN struct thread *a_td;
1540 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1541 return (EOPNOTSUPP);
1543 return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1547 * Vnode extattr transaction commit/abort
/*
 * ffs_closeextattr: end an EA transaction on the vnode, committing the
 * changes when ap->a_commit is non-zero.
 *
 * Character and block device vnodes are refused with EOPNOTSUPP.  A commit
 * is additionally checked against a read-only mount.  The final result is
 * that of ffs_close_ea().
 */
1550 ffs_closeextattr(struct vop_closeextattr_args *ap)
1552 struct vop_closeextattr_args {
1553 struct vnodeop_desc *a_desc;
1556 IN struct ucred *a_cred;
1557 IN struct thread *a_td;
1564 if (vp->v_type == VCHR || vp->v_type == VBLK)
1565 return (EOPNOTSUPP);
1566 if (ap->a_commit && (vp->v_mount->mnt_flag & MNT_RDONLY) != 0)
/*
 * With journaled soft updates a commit first calls softdep_prealloc().
 * NOTE(review): the v_data check presumably catches a vnode that was
 * reclaimed during that call -- confirm.
 */
1569 if (ap->a_commit && DOINGSUJ(vp)) {
1570 ASSERT_VOP_ELOCKED(vp, "ffs_closeextattr commit");
1571 softdep_prealloc(vp, MNT_WAIT);
1572 if (vp->v_data == NULL)
1575 return (ffs_close_ea(vp, ap->a_commit, ap->a_cred, ap->a_td));
1579 * Vnode operation to remove a named attribute.
/*
 * ffs_deleteextattr: remove the attribute a_attrnamespace:a_name from the
 * vnode.
 *
 * Character and block device vnodes are refused with EOPNOTSUPP.  The name
 * is checked for being empty, the mount for being read-only, and the
 * credentials for VWRITE access via extattr_check_cred().  The vnode must
 * be exclusively locked (asserted).
 *
 * The deletion works on a private copy of the EA area: the record is
 * located with ffs_findextattr(), the records after it are moved down over
 * it, and the copy is installed as ip->i_ea_area before the transaction is
 * committed with ffs_close_ea(vp, 1, ...).
 */
1582 ffs_deleteextattr(struct vop_deleteextattr_args *ap)
1585 IN struct vnode *a_vp;
1586 IN int a_attrnamespace;
1587 IN const char *a_name;
1588 IN struct ucred *a_cred;
1589 IN struct thread *a_td;
1595 struct extattr *eap;
1597 int olen, error, i, easize;
1604 if (vp->v_type == VCHR || vp->v_type == VBLK)
1605 return (EOPNOTSUPP);
1606 if (strlen(ap->a_name) == 0)
1608 if (vp->v_mount->mnt_flag & MNT_RDONLY)
1611 error = extattr_check_cred(vp, ap->a_attrnamespace,
1612 ap->a_cred, ap->a_td, VWRITE);
1615 * ffs_lock_ea is not needed there, because the vnode
1616 * must be exclusively locked.
/* An open transaction with no error yet recorded inherits this error. */
1618 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1619 ip->i_ea_error = error;
1624 ASSERT_VOP_ELOCKED(vp, "ffs_deleteextattr");
1625 softdep_prealloc(vp, MNT_WAIT);
1626 if (vp->v_data == NULL)
1630 error = ffs_open_ea(vp, ap->a_cred, ap->a_td);
1634 /* CEM: delete could be done in-place instead */
/* Work on a scratch copy so the current area stays intact until commit. */
1635 eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
1636 bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1637 easize = ip->i_ea_len;
1639 olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1642 /* delete but nonexistent */
1644 ffs_close_ea(vp, 0, ap->a_cred, ap->a_td);
/*
 * i is the offset of the record following the victim; everything from
 * there to the end of the copy is moved down onto the victim.
 */
1647 ul = eap->ea_length;
1648 i = (u_char *)EXTATTR_NEXT(eap) - eae;
1649 bcopy(EXTATTR_NEXT(eap), eap, easize - i);
/* Swap the edited copy in; tmp keeps the previous area pointer. */
1652 tmp = ip->i_ea_area;
1653 ip->i_ea_area = eae;
1654 ip->i_ea_len = easize;
1656 error = ffs_close_ea(vp, 1, ap->a_cred, ap->a_td);
1661 * Vnode operation to retrieve a named extended attribute.
/*
 * ffs_getextattr: look up the attribute a_attrnamespace:a_name on the
 * vnode and report its size or its content.
 *
 * Character and block device vnodes are refused with EOPNOTSUPP, and the
 * credentials are checked for VREAD access.  The EA area is opened with
 * ffs_open_ea() and closed again without commit.
 *
 * When ap->a_size is not NULL only the content length is stored there;
 * otherwise, when ap->a_uio is not NULL, the content is copied out with
 * uiomove().
 */
1664 ffs_getextattr(struct vop_getextattr_args *ap)
1667 IN struct vnode *a_vp;
1668 IN int a_attrnamespace;
1669 IN const char *a_name;
1670 INOUT struct uio *a_uio;
1672 IN struct ucred *a_cred;
1673 IN struct thread *a_td;
1682 ip = VTOI(ap->a_vp);
1684 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1685 return (EOPNOTSUPP);
1687 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1688 ap->a_cred, ap->a_td, VREAD);
1692 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1696 eae = ip->i_ea_area;
1697 easize = ip->i_ea_len;
1699 ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
/* a_size takes precedence over a_uio: a size query copies no data. */
1703 if (ap->a_size != NULL)
1704 *ap->a_size = ealen;
1705 else if (ap->a_uio != NULL)
1706 error = uiomove(p, ealen, ap->a_uio);
/* Read-only use of the area, so close without commit. */
1710 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1715 * Vnode operation to retrieve extended attributes on a vnode.
/*
 * ffs_listextattr: enumerate the attributes of the vnode that belong to
 * namespace a_attrnamespace.
 *
 * Character and block device vnodes are refused with EOPNOTSUPP, and the
 * credentials are checked for VREAD access.  The EA area is opened with
 * ffs_open_ea() and closed again without commit.
 *
 * For each matching record, ea_namelength + 1 bytes are accounted for:
 * they are added to *ap->a_size when a_size is not NULL, otherwise they
 * are copied out with uiomove() starting at the ea_namelength field when
 * a_uio is not NULL.  The walk stops at the first uiomove() error.
 */
1718 ffs_listextattr(struct vop_listextattr_args *ap)
1721 IN struct vnode *a_vp;
1722 IN int a_attrnamespace;
1723 INOUT struct uio *a_uio;
1725 IN struct ucred *a_cred;
1726 IN struct thread *a_td;
1731 struct extattr *eap, *eaend;
1734 ip = VTOI(ap->a_vp);
1736 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1737 return (EOPNOTSUPP);
1739 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1740 ap->a_cred, ap->a_td, VREAD);
1744 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1749 if (ap->a_size != NULL)
1752 KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned"));
1753 eap = (struct extattr *)ip->i_ea_area;
1754 eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len);
/* Visit every record in the area until the end or an error. */
1755 for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) {
1756 KASSERT(EXTATTR_NEXT(eap) <= eaend,
1757 ("extattr next %p beyond %p", EXTATTR_NEXT(eap), eaend));
/* Namespace filter for the requested listing. */
1758 if (eap->ea_namespace != ap->a_attrnamespace)
1761 ealen = eap->ea_namelength;
1762 if (ap->a_size != NULL)
1763 *ap->a_size += ealen + 1;
1764 else if (ap->a_uio != NULL)
1765 error = uiomove(&eap->ea_namelength, ealen + 1,
/* Read-only use of the area, so close without commit. */
1769 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1774 * Vnode operation to set a named attribute.
/*
 * ffs_setextattr: create or replace the attribute a_attrnamespace:a_name
 * on the vnode with the data supplied in ap->a_uio.
 *
 * Character and block device vnodes and a NULL a_uio are refused with
 * EOPNOTSUPP.  The name is checked for being empty, the mount for being
 * read-only, the value size (uio_resid) against lblktosize(fs, UFS_NXADDR),
 * and the credentials for VWRITE access.  The vnode must be exclusively
 * locked (asserted).
 *
 * The update is built in a private copy of the EA area that has room for
 * one more record.  An existing record of the same name is located with
 * ffs_findextattr(); when its length differs from the new record length
 * the records after it are moved and easize is adjusted.  A record that
 * does not exist yet is appended at the end.  When the resulting size
 * exceeds lblktosize(fs, UFS_NXADDR) the transaction is closed without
 * commit and ENOSPC is recorded in ip->i_ea_error if the area is still
 * open.  Otherwise the record header, name, padding and value are filled
 * in, the copy becomes ip->i_ea_area, and the transaction is committed
 * with ffs_close_ea(vp, 1, ...).
 */
1777 ffs_setextattr(struct vop_setextattr_args *ap)
1780 IN struct vnode *a_vp;
1781 IN int a_attrnamespace;
1782 IN const char *a_name;
1783 INOUT struct uio *a_uio;
1784 IN struct ucred *a_cred;
1785 IN struct thread *a_td;
1792 struct extattr *eap;
1793 uint32_t ealength, ul;
1795 int olen, eapad1, eapad2, error, i, easize;
1803 if (vp->v_type == VCHR || vp->v_type == VBLK)
1804 return (EOPNOTSUPP);
1805 if (strlen(ap->a_name) == 0)
1808 /* XXX Now unsupported API to delete EAs using NULL uio. */
1809 if (ap->a_uio == NULL)
1810 return (EOPNOTSUPP);
1812 if (vp->v_mount->mnt_flag & MNT_RDONLY)
/* The value length is whatever the caller left to transfer in the uio. */
1815 ealen = ap->a_uio->uio_resid;
1816 if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR))
1819 error = extattr_check_cred(vp, ap->a_attrnamespace,
1820 ap->a_cred, ap->a_td, VWRITE);
1823 * ffs_lock_ea is not needed there, because the vnode
1824 * must be exclusively locked.
/* An open transaction with no error yet recorded inherits this error. */
1826 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1827 ip->i_ea_error = error;
/*
 * NOTE(review): the assertion message below names ffs_deleteextattr
 * although this function is ffs_setextattr; it looks like a copy-paste
 * leftover.
 */
1832 ASSERT_VOP_ELOCKED(vp, "ffs_deleteextattr");
1833 softdep_prealloc(vp, MNT_WAIT);
1834 if (vp->v_data == NULL)
1838 error = ffs_open_ea(vp, ap->a_cred, ap->a_td);
/*
 * Record length: a uint32_t plus 3 further header bytes plus the name,
 * padded by eapad1 to a multiple of 8, followed by the value padded by
 * eapad2 to a multiple of 8.
 */
1842 ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1843 eapad1 = roundup2(ealength, 8) - ealength;
1844 eapad2 = roundup2(ealen, 8) - ealen;
1845 ealength += eapad1 + ealen + eapad2;
1848 * CEM: rewrites of the same size or smaller could be done in-place
1849 * instead. (We don't acquire any fine-grained locks in here either,
1850 * so we could also do bigger writes in-place.)
1852 eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1853 bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1854 easize = ip->i_ea_len;
1856 olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1859 /* new, append at end */
1860 KASSERT(ALIGNED_TO(eae + easize, struct extattr),
1862 eap = (struct extattr *)(eae + easize);
1865 ul = eap->ea_length;
1866 i = (u_char *)EXTATTR_NEXT(eap) - eae;
1867 if (ul != ealength) {
1868 bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength,
1870 easize += (ealength - ul);
1873 if (easize > lblktosize(fs, UFS_NXADDR)) {
1875 ffs_close_ea(vp, 0, ap->a_cred, ap->a_td);
1876 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1877 ip->i_ea_error = ENOSPC;
1880 eap->ea_length = ealength;
1881 eap->ea_namespace = ap->a_attrnamespace;
1882 eap->ea_contentpadlen = eapad2;
1883 eap->ea_namelength = strlen(ap->a_name);
1884 memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name));
1885 bzero(&eap->ea_name[strlen(ap->a_name)], eapad1);
1886 error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio);
1889 ffs_close_ea(vp, 0, ap->a_cred, ap->a_td);
1890 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1891 ip->i_ea_error = error;
1894 bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2);
1896 tmp = ip->i_ea_area;
1897 ip->i_ea_area = eae;
1898 ip->i_ea_len = easize;
1900 error = ffs_close_ea(vp, 1, ap->a_cred, ap->a_td);
1905 * Vnode pointer to File handle
/*
 * ffs_vptofh: fill the file handle at ap->a_fhp, viewed as a struct ufid,
 * from the inode of ap->a_vp.
 *
 * The handle records its own size, the inode number and the inode
 * generation number.
 */
1908 ffs_vptofh(struct vop_vptofh_args *ap)
1911 IN struct vnode *a_vp;
1912 IN struct fid *a_fhp;
1919 ip = VTOI(ap->a_vp);
1920 ufhp = (struct ufid *)ap->a_fhp;
1921 ufhp->ufid_len = sizeof(struct ufid);
1922 ufhp->ufid_ino = ip->i_number;
1923 ufhp->ufid_gen = ip->i_gen;
/*
 * use_buf_pager: integer knob exported under the _vfs_ffs sysctl node with
 * CTLFLAG_RWTUN; it defaults to 1.  ffs_getpages() consults it when
 * choosing between the buffer pager and the generic vnode pager.
 */
1927 SYSCTL_DECL(_vfs_ffs);
1928 static int use_buf_pager = 1;
1929 SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
1930 "Always use buffer pager instead of bmap");
/*
 * ffs_gbp_getblkno: callback handed to vfs_bio_getpages(); converts the
 * byte offset off into a logical block number using lblkno() on the file
 * system of the mount that vp belongs to.
 */
1933 ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
1936 return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
/*
 * ffs_gbp_getblksz: callback handed to vfs_bio_getpages(); stores in *sz
 * the size that blksize() reports for logical block lbn of the inode
 * behind vp.
 */
1940 ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn, long *sz)
1943 *sz = blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn);
/*
 * ffs_getpages: page-in entry point for the vnode.
 *
 * The generic vnode pager is used only when use_buf_pager is zero and the
 * block size of the device buffer object does not exceed PAGE_SIZE.  In
 * every other case the request goes to vfs_bio_getpages() with the two
 * ffs_gbp_* callbacks.
 */
1948 ffs_getpages(struct vop_getpages_args *ap)
1951 struct ufsmount *um;
1954 um = VFSTOUFS(vp->v_mount);
1956 if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
1957 return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1958 ap->a_rbehind, ap->a_rahead, NULL, NULL));
1959 return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
1960 ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
/*
 * ffs_getpages_async: page-in entry point with a completion callback.
 *
 * When the block size of the device buffer object does not exceed
 * PAGE_SIZE the generic vnode pager is called and given ap->a_iodone and
 * ap->a_arg.  Otherwise vfs_bio_getpages() is called.  When do_iodone is
 * set and a callback was supplied, ap->a_iodone is invoked here with the
 * resulting error.
 * NOTE(review): do_iodone is presumably set only on the path where the
 * callee does not run the callback itself -- confirm.
 */
1964 ffs_getpages_async(struct vop_getpages_async_args *ap)
1967 struct ufsmount *um;
1972 um = VFSTOUFS(vp->v_mount);
1975 if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) {
1976 error = vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1977 ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg);
1981 error = vfs_bio_getpages(vp, ap->a_m, ap->a_count,
1982 ap->a_rbehind, ap->a_rahead, ffs_gbp_getblkno,
1985 if (do_iodone && ap->a_iodone != NULL)
1986 ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
1992 ffs_vput_pair(struct vop_vput_pair_args *ap)
1995 struct vnode *dvp, *vp, *vp1, **vpp;
1996 struct inode *dp, *ip;
1999 int error, vp_locked;
2004 vp = vpp != NULL ? *vpp : NULL;
2006 if ((dp->i_flag & (IN_NEEDSYNC | IN_ENDOFF)) == 0) {
2008 if (vp != NULL && ap->a_unlock_vp)
2015 if (ap->a_unlock_vp) {
2018 MPASS(vp->v_type != VNON);
2019 vp_locked = VOP_ISLOCKED(vp);
2021 ip_ino = ip->i_number;
2028 * If compaction or fsync was requested do it in ffs_vput_pair()
2029 * now that other locks are no longer held.
2031 if ((dp->i_flag & IN_ENDOFF) != 0) {
2032 VNASSERT(I_ENDOFF(dp) != 0 && I_ENDOFF(dp) < dp->i_size, dvp,
2033 ("IN_ENDOFF set but I_ENDOFF() is not"));
2034 dp->i_flag &= ~IN_ENDOFF;
2035 error = UFS_TRUNCATE(dvp, (off_t)I_ENDOFF(dp), IO_NORMAL |
2036 (DOINGASYNC(dvp) ? 0 : IO_SYNC), curthread->td_ucred);
2037 if (error != 0 && error != ERELOOKUP) {
2038 if (!ffs_fsfail_cleanup(VFSTOUFS(mp), error)) {
2040 "IN_ENDOFF: failed to truncate, "
2041 "error %d\n", error);
2044 ufsdirhash_free(dp);
2047 SET_I_ENDOFF(dp, 0);
2049 if ((dp->i_flag & IN_NEEDSYNC) != 0) {
2051 error = ffs_syncvnode(dvp, MNT_WAIT, 0);
2052 } while (error == ERELOOKUP);
2057 if (vp == NULL || ap->a_unlock_vp)
2062 * It is possible that vp is reclaimed at this point. Only
2063 * routines that call us with a_unlock_vp == false can find
2064 * that their vp has been reclaimed. There are three areas
2065 * that are affected:
2066 * 1) vn_open_cred() - later VOPs could fail, but
2067 * dead_open() returns 0 to simulate successful open.
2068 * 2) ffs_snapshot() - creation of snapshot fails with EBADF.
2069 * 3) NFS server (several places) - code is prepared to detect
2070 * and respond to dead vnodes by returning ESTALE.
2072 VOP_LOCK(vp, vp_locked | LK_RETRY);
2077 * Try harder to recover from reclaimed vp if reclaim was not
2078 * because underlying inode was cleared. We saved inode
2079 * number and inode generation, so we can try to reinstantiate
2080 * exactly same version of inode. If this fails, return
2081 * original doomed vnode and let the caller handle
2084 * Note that callers must keep write started around
2085 * VOP_VPUT_PAIR() calls, so it is safe to use mp without
2089 error = ffs_inotovp(mp, ip_ino, ip_gen, LK_EXCLUSIVE, &vp1,
2090 FFSV_REPLACE_DOOMED);
2092 VOP_LOCK(vp, vp_locked | LK_RETRY);