4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 * Copyright 2017 Nexenta Systems, Inc.
29 /* Portions Copyright 2007 Jeremy Teo */
30 /* Portions Copyright 2010 Robert Milkowski */
32 #include <sys/types.h>
33 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/sysmacros.h>
37 #include <sys/resource.h>
40 #include <sys/vnode.h>
44 #include <sys/taskq.h>
46 #include <sys/atomic.h>
47 #include <sys/namei.h>
49 #include <sys/cmn_err.h>
50 #include <sys/errno.h>
51 #include <sys/unistd.h>
52 #include <sys/zfs_dir.h>
53 #include <sys/zfs_ioctl.h>
54 #include <sys/fs/zfs.h>
56 #include <sys/dmu_objset.h>
62 #include <sys/dirent.h>
63 #include <sys/policy.h>
64 #include <sys/sunddi.h>
65 #include <sys/filio.h>
67 #include <sys/zfs_ctldir.h>
68 #include <sys/zfs_fuid.h>
69 #include <sys/zfs_sa.h>
70 #include <sys/zfs_rlock.h>
71 #include <sys/extdirent.h>
72 #include <sys/kidmap.h>
75 #include <sys/sched.h>
77 #include <sys/vmmeter.h>
78 #include <vm/vm_param.h>
84 * Each vnode op performs some logical unit of work. To do this, the ZPL must
85 * properly lock its in-core state, create a DMU transaction, do the work,
86 * record this work in the intent log (ZIL), commit the DMU transaction,
87 * and wait for the intent log to commit if it is a synchronous operation.
88 * Moreover, the vnode ops must work in both normal and log replay context.
89 * The ordering of events is important to avoid deadlocks and references
90 * to freed memory. The example below illustrates the following Big Rules:
92 * (1) A check must be made in each zfs thread for a mounted file system.
93 * This is done avoiding races using ZFS_ENTER(zfsvfs).
94 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
95 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
96 * can return EIO from the calling function.
98 * (2) VN_RELE() should always be the last thing except for zil_commit()
99 * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
100 * First, if it's the last reference, the vnode/znode
101 * can be freed, so the zp may point to freed memory. Second, the last
102 * reference will call zfs_zinactive(), which may induce a lot of work --
103 * pushing cached pages (which acquires range locks) and syncing out
104 * cached atime changes. Third, zfs_zinactive() may require a new tx,
105 * which could deadlock the system if you were already holding one.
106 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
108 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
109 * as they can span dmu_tx_assign() calls.
111 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
112 * dmu_tx_assign(). This is critical because we don't want to block
113 * while holding locks.
115 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
116 * reduces lock contention and CPU usage when we must wait (note that if
117 * throughput is constrained by the storage, nearly every transaction
120 * Note, in particular, that if a lock is sometimes acquired before
121 * the tx assigns, and sometimes after (e.g. z_lock), then failing
122 * to use a non-blocking assign can deadlock the system. The scenario:
124 * Thread A has grabbed a lock before calling dmu_tx_assign().
125 * Thread B is in an already-assigned tx, and blocks for this lock.
126 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
127 * forever, because the previous txg can't quiesce until B's tx commits.
129 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
130 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
131 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
132 * to indicate that this operation has already called dmu_tx_wait().
133 * This will ensure that we don't retry forever, waiting a short bit
136 * (5) If the operation succeeded, generate the intent log entry for it
137 * before dropping locks. This ensures that the ordering of events
138 * in the intent log matches the order in which they actually occurred.
139 * During ZIL replay the zfs_log_* functions will update the sequence
140 * number to indicate the zil transaction has replayed.
142 * (6) At the end of each vnode op, the DMU tx must always commit,
143 * regardless of whether there were any errors.
145 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
146 * to ensure that synchronous semantics are provided when necessary.
148 * In general, this is how things should be ordered in each vnode op:
150 * ZFS_ENTER(zfsvfs); // exit if unmounted
152 * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD())
153 * rw_enter(...); // grab any other locks you need
154 * tx = dmu_tx_create(...); // get DMU tx
155 * dmu_tx_hold_*(); // hold each object you might modify
156 * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
158 * rw_exit(...); // drop locks
159 * zfs_dirent_unlock(dl); // unlock directory entry
160 * VN_RELE(...); // release held vnodes
161 * if (error == ERESTART) {
167 * dmu_tx_abort(tx); // abort DMU tx
168 * ZFS_EXIT(zfsvfs); // finished in zfs
169 * return (error); // really out of space
171 * error = do_real_work(); // do whatever this VOP does
173 * zfs_log_*(...); // on success, make ZIL entry
174 * dmu_tx_commit(tx); // commit DMU tx -- error or not
175 * rw_exit(...); // drop locks
176 * zfs_dirent_unlock(dl); // unlock directory entry
177 * VN_RELE(...); // release held vnodes
178 * zil_commit(zilog, foid); // synchronous when necessary
179 * ZFS_EXIT(zfsvfs); // finished in zfs
180 * return (error); // done, report error
185 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
187 znode_t *zp = VTOZ(*vpp);
188 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
193 if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
194 ((flag & FAPPEND) == 0)) {
196 return (SET_ERROR(EPERM));
199 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
200 ZTOV(zp)->v_type == VREG &&
201 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
202 if (fs_vscan(*vpp, cr, 0) != 0) {
204 return (SET_ERROR(EACCES));
208 /* Keep a count of the synchronous opens in the znode */
209 if (flag & (FSYNC | FDSYNC))
210 atomic_inc_32(&zp->z_sync_cnt);
/*
 * zfs_close: VOP_CLOSE handler (fragmentary view).
 * Releases POSIX locks and shares held by this process on the vnode,
 * decrements the synchronous-open count taken in zfs_open(), and runs
 * the virus-scan hook (scan-on-close) for eligible regular files.
 */
218 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
219 caller_context_t *ct)
221 znode_t *zp = VTOZ(vp);
222 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
225 * Clean up any locks held by this process on the vp.
227 cleanlocks(vp, ddi_get_pid(), 0);
228 cleanshares(vp, ddi_get_pid());
233 /* Decrement the synchronous opens in the znode */
234 if ((flag & (FSYNC | FDSYNC)) && (count == 1))
235 atomic_dec_32(&zp->z_sync_cnt);
/* Scan-on-close mirrors the scan-on-open check in zfs_open(). */
237 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
238 ZTOV(zp)->v_type == VREG &&
239 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
240 VERIFY(fs_vscan(vp, cr, 1) == 0);
247 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
248 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
/* Fragmentary view; returns ENXIO when the offset is at/past EOF or no
 * matching hole/data region exists, per SEEK_HOLE/SEEK_DATA semantics. */
251 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
253 znode_t *zp = VTOZ(vp);
254 uint64_t noff = (uint64_t)*off; /* new offset */
259 file_sz = zp->z_size;
260 if (noff >= file_sz) {
261 return (SET_ERROR(ENXIO));
264 if (cmd == _FIO_SEEK_HOLE)
/* dmu_offset_next() advances noff to the next hole/data boundary. */
269 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
272 return (SET_ERROR(ENXIO));
275 * We could find a hole that begins after the logical end-of-file,
276 * because dmu_offset_next() only works on whole blocks. If the
277 * EOF falls mid-block, then indicate that the "virtual hole"
278 * at the end of the file begins at the logical EOF, rather than
279 * at the end of the last block.
281 if (noff > file_sz) {
/*
 * zfs_ioctl: VOP_IOCTL handler (fragmentary view).
 * Visible commands: the _FIO_SEEK_HOLE/_FIO_SEEK_DATA pair (delegated to
 * zfs_holey(), with the offset copied in/out of user space) and
 * _FIO_COUNT_FILLED (reports the DMU fill count after syncing dirty
 * blocks). Unknown commands return ENOTTY.
 */
294 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
295 int *rvalp, caller_context_t *ct)
299 dmu_object_info_t doi;
310 * The following two ioctls are used by bfu. Faking out,
311 * necessary to avoid bfu errors.
/* Copy the caller's offset in (kernel callers pass it directly below). */
324 if (ddi_copyin((void *)data, &off, sizeof (off), flag))
325 return (SET_ERROR(EFAULT));
327 off = *(offset_t *)data;
330 zfsvfs = zp->z_zfsvfs;
334 /* offset parameter is in/out */
335 error = zfs_holey(vp, com, &off);
340 if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
341 return (SET_ERROR(EFAULT));
343 *(offset_t *)data = off;
348 case _FIO_COUNT_FILLED:
351 * _FIO_COUNT_FILLED adds a new ioctl command which
352 * exposes the number of filled blocks in a
356 zfsvfs = zp->z_zfsvfs;
361 * Wait for all dirty blocks for this object
362 * to get synced out to disk, and the DMU info
365 error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
372 * Retrieve fill count from DMU object.
374 error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
380 ndata = doi.doi_fill_count;
383 if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
384 return (SET_ERROR(EFAULT));
389 return (SET_ERROR(ENOTTY));
/*
 * page_busy: grab and shared-busy an existing resident page at "start"
 * (fragmentary view). Write-protects the page and clears its dirty bits
 * over the DEV_BSIZE-aligned subrange about to be overwritten from the
 * DMU. Caller must hold the VM object write lock.
 */
393 page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
400 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
401 * aligned boundaries, if the range is not aligned. As a result a
402 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
403 * It may happen that all DEV_BSIZE subranges are marked clean and thus
404 * the whole page would be considered clean despite having some dirty data.
405 * For this reason we should shrink the range to DEV_BSIZE aligned
406 * boundaries before calling vm_page_clear_dirty.
408 end = rounddown2(off + nbytes, DEV_BSIZE);
409 off = roundup2(off, DEV_BSIZE);
413 zfs_vmobject_assert_wlocked(obj);
/* NOCREAT: only a pre-existing, fully valid page qualifies. */
415 vm_page_grab_valid(&pp, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT |
416 VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
418 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
419 vm_object_pip_add(obj, 1);
420 pmap_remove_write(pp);
422 vm_page_clear_dirty(pp, off, nbytes);
/* page_unbusy: release the busy/paging-in-progress state taken by
 * page_busy() (fragmentary view — the sbusy release line is missing). */
428 page_unbusy(vm_page_t pp)
432 vm_object_pip_wakeup(pp->object);
/*
 * page_wire: wire an existing valid resident page at "start" without
 * busying it (VM_ALLOC_NOCREAT | VM_ALLOC_NOBUSY); returns NULL-able
 * result via the grabbed page (fragmentary view). Caller holds the VM
 * object write lock.
 */
436 page_wire(vnode_t *vp, int64_t start)
442 zfs_vmobject_assert_wlocked(obj);
444 vm_page_grab_valid(&m, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT |
445 VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY);
/* page_unwire: drop the wiring taken by page_wire(), requeuing the page
 * on the active queue. */
450 page_unwire(vm_page_t pp)
453 vm_page_unwire(pp, PQ_ACTIVE);
457 * When a file is memory mapped, we must keep the IO data synchronized
458 * between the DMU cache and the memory mapped pages. What this means:
460 * On Write: If we find a memory mapped page, we write to *both*
461 * the page and the dmu buffer.
/*
 * update_pages (fragmentary view): after a write has landed in the DMU,
 * copy the freshly written data back into any resident page-cache pages
 * so mmap readers observe it. Pages are busied one at a time; the object
 * lock is dropped around the dmu_read()/map of each page.
 */
464 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
465 int segflg, dmu_tx_t *tx)
472 ASSERT(segflg != UIO_NOCOPY);
473 ASSERT(vp->v_mount != NULL);
477 off = start & PAGEOFFSET;
478 zfs_vmobject_wlock(obj);
479 vm_object_pip_add(obj, 1);
480 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
482 int nbytes = imin(PAGESIZE - off, len);
484 if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
485 zfs_vmobject_wunlock(obj);
487 va = zfs_map_page(pp, &sf);
/* NOTE(review): stray double semicolon below — harmless, but likely an
 * extraction/paste artifact; cannot be removed in a comment-only pass. */
488 (void) dmu_read(os, oid, start+off, nbytes,
489 va+off, DMU_READ_PREFETCH);;
492 zfs_vmobject_wlock(obj);
498 vm_object_pip_wakeup(obj);
499 zfs_vmobject_wunlock(obj);
503 * Read with UIO_NOCOPY flag means that sendfile(2) requests
504 * ZFS to populate a range of page cache pages with data.
506 * NOTE: this function could be optimized to pre-allocate
507 * all pages in advance, drain exclusive busy on all of them,
508 * map them into contiguous KVA region and populate them
509 * in one single dmu_read() call.
/* Fragmentary view: grabs/creates each page, fills invalid pages from
 * the DMU (zeroing the tail of a short final page), and advances the
 * uio by hand since no data is copied to the caller (UIO_NOCOPY). */
512 mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
514 znode_t *zp = VTOZ(vp);
515 objset_t *os = zp->z_zfsvfs->z_os;
525 ASSERT(uio->uio_segflg == UIO_NOCOPY);
526 ASSERT(vp->v_mount != NULL);
529 ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
531 zfs_vmobject_wlock(obj);
532 for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
533 int bytes = MIN(PAGESIZE, len);
535 pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
536 VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
537 if (vm_page_none_valid(pp)) {
538 zfs_vmobject_wunlock(obj);
539 va = zfs_map_page(pp, &sf);
540 error = dmu_read(os, zp->z_id, start, bytes, va,
542 if (bytes != PAGESIZE && error == 0)
543 bzero(va + bytes, PAGESIZE - bytes);
545 zfs_vmobject_wlock(obj);
549 vm_page_activate(pp);
/* NOTE(review): "!vm_page_wired(pp) == 0" reads as (wired != 0) due to
 * precedence; upstream condition is "!vm_page_wired(pp)" — presumably a
 * garbled extraction here. TODO confirm against upstream before relying
 * on this error-path page-free condition. */
553 if (error != 0 && !vm_page_wired(pp) == 0 &&
554 pp->valid == 0 && vm_page_tryxbusy(pp))
557 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
562 uio->uio_resid -= bytes;
563 uio->uio_offset += bytes;
566 zfs_vmobject_wunlock(obj);
571 * When a file is memory mapped, we must keep the IO data synchronized
572 * between the DMU cache and the memory mapped pages. What this means:
574 * On Read: We "read" preferentially from memory mapped pages,
575 * else we default from the dmu buffer.
577 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
578 * the file is memory mapped.
/* Fragmentary view: per page, if a resident cached page exists
 * (page_wire), copy from it via uiomove; otherwise fall back to
 * dmu_read_uio_dbuf(). The object lock is dropped around each copy. */
581 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
583 znode_t *zp = VTOZ(vp);
591 ASSERT(vp->v_mount != NULL);
595 start = uio->uio_loffset;
596 off = start & PAGEOFFSET;
597 zfs_vmobject_wlock(obj);
598 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
600 uint64_t bytes = MIN(PAGESIZE - off, len);
602 if (pp = page_wire(vp, start)) {
606 zfs_vmobject_wunlock(obj);
607 va = zfs_map_page(pp, &sf);
609 error = uiomove(va + off, bytes, UIO_READ, uio);
611 error = vn_io_fault_uiomove(va + off, bytes, uio);
614 zfs_vmobject_wlock(obj);
617 zfs_vmobject_wunlock(obj);
618 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
620 zfs_vmobject_wlock(obj);
627 zfs_vmobject_wunlock(obj);
/* Upper bound on the bytes handled per iteration of the zfs_read() loop. */
631 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
634 * Read bytes from specified file into supplied buffer.
636 * IN: vp - vnode of file to be read from.
637 * uio - structure supplying read location, range info,
639 * ioflag - SYNC flags; used to provide FRSYNC semantics.
640 * cr - credentials of caller.
641 * ct - caller context
643 * OUT: uio - updated offset and range, buffer filled.
645 * RETURN: 0 on success, error code on failure.
648 * vp - atime updated if byte count > 0
/* Fragmentary view: validate, take a RL_READER range lock, then copy in
 * zfs_read_chunk_size chunks via mappedread()/mappedread_sf()/DMU. */
652 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
654 znode_t *zp = VTOZ(vp);
655 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
/* Quarantined files may not be read. */
664 if (zp->z_pflags & ZFS_AV_QUARANTINED) {
666 return (SET_ERROR(EACCES));
670 * Validate file offset
672 if (uio->uio_loffset < (offset_t)0) {
674 return (SET_ERROR(EINVAL));
678 * Fasttrack empty reads
680 if (uio->uio_resid == 0) {
686 * Check for mandatory locks
688 if (MANDMODE(zp->z_mode)) {
689 if (error = chklock(vp, FREAD,
690 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
697 * If we're in FRSYNC mode, sync out this znode before reading it.
700 (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
701 zil_commit(zfsvfs->z_log, zp->z_id);
704 * Lock the range against changes.
706 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
709 * If we are reading past end-of-file we can skip
710 * to the end; but we might still need to set atime.
712 if (uio->uio_loffset >= zp->z_size) {
717 ASSERT(uio->uio_loffset < zp->z_size);
718 n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
/* Zero-copy (xuio) read path: pre-stage ARC buffers per block. */
721 if ((uio->uio_extflg == UIO_XUIO) &&
722 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
724 int blksz = zp->z_blksz;
725 uint64_t offset = uio->uio_loffset;
727 xuio = (xuio_t *)uio;
729 nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
732 ASSERT(offset + n <= blksz);
735 (void) dmu_xuio_init(xuio, nblk);
737 if (vn_has_cached_data(vp)) {
739 * For simplicity, we always allocate a full buffer
740 * even if we only expect to read a portion of a block.
742 while (--nblk >= 0) {
743 (void) dmu_xuio_add(xuio,
744 dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
/* Main copy loop: never cross a zfs_read_chunk_size boundary. */
752 nbytes = MIN(n, zfs_read_chunk_size -
753 P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
756 if (uio->uio_segflg == UIO_NOCOPY)
757 error = mappedread_sf(vp, nbytes, uio);
759 #endif /* __FreeBSD__ */
760 if (vn_has_cached_data(vp)) {
761 error = mappedread(vp, nbytes, uio);
763 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
767 /* convert checksum errors into IO errors */
769 error = SET_ERROR(EIO);
776 zfs_range_unlock(rl);
778 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
784 * Write the bytes to a file.
786 * IN: vp - vnode of file to be written to.
787 * uio - structure supplying write location, range info,
789 * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is
790 * set if in append mode.
791 * cr - credentials of caller.
792 * ct - caller context (NFS/CIFS fem monitor only)
794 * OUT: uio - updated offset and range.
796 * RETURN: 0 on success, error code on failure.
799 * vp - ctime|mtime updated if byte count > 0
/*
 * Fragmentary view. Overall shape: validate (read-only fs, immutable/
 * append-only flags, offset, mandatory locks), take an RL_APPEND or
 * RL_WRITER range lock, then loop writing chunks of at most max_blksz,
 * each in its own DMU transaction, updating size/timestamps/SUID bits
 * and logging to the ZIL; finally zil_commit() for sync semantics.
 */
804 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
806 znode_t *zp = VTOZ(vp);
807 rlim64_t limit = MAXOFFSET_T;
808 ssize_t start_resid = uio->uio_resid;
812 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
817 int max_blksz = zfsvfs->z_max_blksz;
820 iovec_t *aiov = NULL;
823 int iovcnt = uio->uio_iovcnt;
824 iovec_t *iovp = uio->uio_iov;
827 sa_bulk_attr_t bulk[4];
828 uint64_t mtime[2], ctime[2];
831 * Fasttrack empty write
837 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
/* Stage mtime/ctime/size/flags for one bulk SA update at commit. */
843 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
844 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
845 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
847 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
851 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
852 * callers might not be able to detect properly that we are read-only,
853 * so check it explicitly here.
855 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
857 return (SET_ERROR(EROFS));
861 * If immutable or not appending then return EPERM.
862 * Intentionally allow ZFS_READONLY through here.
863 * See zfs_zaccess_common()
865 if ((zp->z_pflags & ZFS_IMMUTABLE) ||
866 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
867 (uio->uio_loffset < zp->z_size))) {
869 return (SET_ERROR(EPERM));
872 zilog = zfsvfs->z_log;
875 * Validate file offset
877 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
880 return (SET_ERROR(EINVAL));
884 * Check for mandatory locks before calling zfs_range_lock()
885 * in order to prevent a deadlock with locks set via fcntl().
887 if (MANDMODE((mode_t)zp->z_mode) &&
888 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
895 * Pre-fault the pages to ensure slow (eg NFS) pages
897 * Skip this if uio contains loaned arc_buf.
899 if ((uio->uio_extflg == UIO_XUIO) &&
900 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
901 xuio = (xuio_t *)uio;
903 uio_prefaultpages(MIN(n, max_blksz), uio);
907 * If in append mode, set the io offset pointer to eof.
909 if (ioflag & FAPPEND) {
911 * Obtain an appending range lock to guarantee file append
912 * semantics. We reset the write offset once we have the lock.
914 rl = zfs_range_lock(zp, 0, n, RL_APPEND);
916 if (rl->r_len == UINT64_MAX) {
918 * We overlocked the file because this write will cause
919 * the file block size to increase.
920 * Note that zp_size cannot change with this lock held.
924 uio->uio_loffset = woff;
927 * Note that if the file block size will change as a result of
928 * this write, then this range lock will lock the entire file
929 * so that we can re-write the block safely.
931 rl = zfs_range_lock(zp, woff, n, RL_WRITER);
/* FreeBSD per-process file-size rlimit check. */
934 if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
935 zfs_range_unlock(rl);
941 zfs_range_unlock(rl);
943 return (SET_ERROR(EFBIG));
946 if ((woff + n) > limit || woff > (limit - n))
949 /* Will this write extend the file length? */
950 write_eof = (woff + n > zp->z_size);
952 end_size = MAX(zp->z_size, woff + n);
955 * Write the file in reasonable size chunks. Each chunk is written
956 * in a separate transaction; this keeps the intent log records small
957 * and allows us to do more fine-grained space accounting.
961 woff = uio->uio_loffset;
/* Enforce user/group quotas before each chunk. */
962 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
963 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
965 dmu_return_arcbuf(abuf);
966 error = SET_ERROR(EDQUOT);
/* Zero-copy path: take the pre-loaned ARC buffer for this iovec. */
970 if (xuio && abuf == NULL) {
971 ASSERT(i_iov < iovcnt);
973 abuf = dmu_xuio_arcbuf(xuio, i_iov);
974 dmu_xuio_clear(xuio, i_iov);
975 DTRACE_PROBE3(zfs_cp_write, int, i_iov,
976 iovec_t *, aiov, arc_buf_t *, abuf);
977 ASSERT((aiov->iov_base == abuf->b_data) ||
978 ((char *)aiov->iov_base - (char *)abuf->b_data +
979 aiov->iov_len == arc_buf_size(abuf)));
981 } else if (abuf == NULL && n >= max_blksz &&
982 woff >= zp->z_size &&
983 P2PHASE(woff, max_blksz) == 0 &&
984 zp->z_blksz == max_blksz) {
986 * This write covers a full block. "Borrow" a buffer
987 * from the dmu so that we can fill it before we enter
988 * a transaction. This avoids the possibility of
989 * holding up the transaction if the data copy hangs
990 * up on a pagefault (e.g., from an NFS server mapping).
994 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
996 ASSERT(abuf != NULL);
997 ASSERT(arc_buf_size(abuf) == max_blksz);
998 if (error = uiocopy(abuf->b_data, max_blksz,
999 UIO_WRITE, uio, &cbytes)) {
1000 dmu_return_arcbuf(abuf);
1003 ASSERT(cbytes == max_blksz);
1007 * Start a transaction.
1009 tx = dmu_tx_create(zfsvfs->z_os);
1010 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1011 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
1012 zfs_sa_upgrade_txholds(tx, zp);
1013 error = dmu_tx_assign(tx, TXG_WAIT);
1017 dmu_return_arcbuf(abuf);
1022 * If zfs_range_lock() over-locked we grow the blocksize
1023 * and then reduce the lock range. This will only happen
1024 * on the first iteration since zfs_range_reduce() will
1025 * shrink down r_len to the appropriate size.
1027 if (rl->r_len == UINT64_MAX) {
1030 if (zp->z_blksz > max_blksz) {
1032 * File's blocksize is already larger than the
1033 * "recordsize" property. Only let it grow to
1034 * the next power of 2.
1036 ASSERT(!ISP2(zp->z_blksz));
1037 new_blksz = MIN(end_size,
1038 1 << highbit64(zp->z_blksz));
1040 new_blksz = MIN(end_size, max_blksz);
1042 zfs_grow_blocksize(zp, new_blksz, tx);
1043 zfs_range_reduce(rl, woff, n);
1047 * XXX - should we really limit each write to z_max_blksz?
1048 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1050 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1052 if (woff + nbytes > zp->z_size)
1053 vnode_pager_setsize(vp, woff + nbytes);
/* tx_bytes = bytes actually consumed from the uio by this chunk. */
1056 tx_bytes = uio->uio_resid;
1057 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1059 tx_bytes -= uio->uio_resid;
1062 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1064 * If this is not a full block write, but we are
1065 * extending the file past EOF and this data starts
1066 * block-aligned, use assign_arcbuf(). Otherwise,
1067 * write via dmu_write().
1069 if (tx_bytes < max_blksz && (!write_eof ||
1070 aiov->iov_base != abuf->b_data)) {
1072 dmu_write(zfsvfs->z_os, zp->z_id, woff,
1073 aiov->iov_len, aiov->iov_base, tx);
1074 dmu_return_arcbuf(abuf);
1075 xuio_stat_wbuf_copied();
1077 ASSERT(xuio || tx_bytes == max_blksz);
1078 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1081 ASSERT(tx_bytes <= uio->uio_resid);
1082 uioskip(uio, tx_bytes);
/* Keep mmap'ed pages coherent with the data just written. */
1084 if (tx_bytes && vn_has_cached_data(vp)) {
1085 update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1086 zp->z_id, uio->uio_segflg, tx);
1090 * If we made no progress, we're done. If we made even
1091 * partial progress, update the znode and ZIL accordingly.
1093 if (tx_bytes == 0) {
1094 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1095 (void *)&zp->z_size, sizeof (uint64_t), tx);
1102 * Clear Set-UID/Set-GID bits on successful write if not
1103 * privileged and at least one of the execute bits is set.
1105 * It would be nice to do this after all writes have
1106 * been done, but that would still expose the ISUID/ISGID
1107 * to another app after the partial write is committed.
1109 * Note: we don't call zfs_fuid_map_id() here because
1110 * user 0 is not an ephemeral uid.
1112 mutex_enter(&zp->z_acl_lock);
1113 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1114 (S_IXUSR >> 6))) != 0 &&
1115 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1116 secpolicy_vnode_setid_retain(vp, cr,
1117 (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1119 zp->z_mode &= ~(S_ISUID | S_ISGID);
1120 newmode = zp->z_mode;
1121 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1122 (void *)&newmode, sizeof (uint64_t), tx);
1124 mutex_exit(&zp->z_acl_lock);
1126 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1130 * Update the file size (zp_size) if it has changed;
1131 * account for possible concurrent updates.
1133 while ((end_size = zp->z_size) < uio->uio_loffset) {
1134 (void) atomic_cas_64(&zp->z_size, end_size,
1139 ASSERT(error == 0 || error == EFAULT);
1143 * If we are replaying and eof is non zero then force
1144 * the file size to the specified eof. Note, there's no
1145 * concurrency during replay.
1147 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1148 zp->z_size = zfsvfs->z_replay_eof;
1151 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1153 (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1155 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1160 ASSERT(tx_bytes == nbytes);
1165 uio_prefaultpages(MIN(n, max_blksz), uio);
1169 zfs_range_unlock(rl);
1172 * If we're in replay mode, or we made no progress, return error.
1173 * Otherwise, it's at least a partial write, so it's successful.
1175 if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1182 * EFAULT means that at least one page of the source buffer was not
1183 * available. VFS will re-try remaining I/O upon this error.
1185 if (error == EFAULT) {
/* Honor O_SYNC/O_DSYNC (or sync=always) by committing the ZIL. */
1191 if (ioflag & (FSYNC | FDSYNC) ||
1192 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1193 zil_commit(zilog, zp->z_id);
/*
 * zfs_get_done: completion callback for zfs_get_data() (fragmentary
 * view). Releases the dbuf and range lock, drops the znode's vnode
 * reference asynchronously (the txg is stopped from syncing, so a
 * synchronous VN_RELE could deadlock), and frees the zgd.
 */
1201 zfs_get_done(zgd_t *zgd, int error)
1203 znode_t *zp = zgd->zgd_private;
1204 objset_t *os = zp->z_zfsvfs->z_os;
1207 dmu_buf_rele(zgd->zgd_db, zgd);
1209 zfs_range_unlock(zgd->zgd_rl);
1212 * Release the vnode asynchronously as we currently have the
1213 * txg stopped from syncing.
1215 VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1217 kmem_free(zgd, sizeof (zgd_t));
/* Debug knob: when nonzero, injects an EIO into the indirect-write path
 * of zfs_get_data() (see the EIO branch there). */
1221 static int zil_fault_io = 0;
1225 * Get data to generate a TX_WRITE intent log record.
/*
 * Fragmentary view. Called by the ZIL to obtain the data for a TX_WRITE
 * record: either copies it into "buf" (immediate write) or dmu_sync()s
 * the block and records its block pointer (indirect write). Cleanup
 * always funnels through zfs_get_done().
 */
1228 zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
1230 zfsvfs_t *zfsvfs = arg;
1231 objset_t *os = zfsvfs->z_os;
1233 uint64_t object = lr->lr_foid;
1234 uint64_t offset = lr->lr_offset;
1235 uint64_t size = lr->lr_length;
1240 ASSERT3P(lwb, !=, NULL);
1241 ASSERT3P(zio, !=, NULL);
1242 ASSERT3U(size, !=, 0);
1245 * Nothing to do if the file has been removed
1247 if (zfs_zget(zfsvfs, object, &zp) != 0)
1248 return (SET_ERROR(ENOENT));
1249 if (zp->z_unlinked) {
1251 * Release the vnode asynchronously as we currently have the
1252 * txg stopped from syncing.
1254 VN_RELE_ASYNC(ZTOV(zp),
1255 dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1256 return (SET_ERROR(ENOENT));
1259 zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1261 zgd->zgd_private = zp;
1264 * Write records come in two flavors: immediate and indirect.
1265 * For small writes it's cheaper to store the data with the
1266 * log record (immediate); for large writes it's cheaper to
1267 * sync the data and get a pointer to it (indirect) so that
1268 * we don't have to write the data twice.
1270 if (buf != NULL) { /* immediate write */
1271 zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1272 /* test for truncation needs to be done while range locked */
1273 if (offset >= zp->z_size) {
1274 error = SET_ERROR(ENOENT);
1276 error = dmu_read(os, object, offset, size, buf,
1277 DMU_READ_NO_PREFETCH);
1279 ASSERT(error == 0 || error == ENOENT);
1280 } else { /* indirect write */
1282 * Have to lock the whole block to ensure when it's
1283 * written out and its checksum is being calculated
1284 * that no one can change the data. We need to re-check
1285 * blocksize after we get the lock in case it's changed!
1290 blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1292 zgd->zgd_rl = zfs_range_lock(zp, offset, size,
/* Blocksize changed under us: retry with the new size. */
1294 if (zp->z_blksz == size)
1297 zfs_range_unlock(zgd->zgd_rl);
1299 /* test for truncation needs to be done while range locked */
1300 if (lr->lr_offset >= zp->z_size)
1301 error = SET_ERROR(ENOENT);
/* zil_fault_io debug knob injects an EIO here. */
1304 error = SET_ERROR(EIO);
1309 error = dmu_buf_hold(os, object, offset, zgd, &db,
1310 DMU_READ_NO_PREFETCH);
1313 blkptr_t *bp = &lr->lr_blkptr;
1318 ASSERT(db->db_offset == offset);
1319 ASSERT(db->db_size == size);
1321 error = dmu_sync(zio, lr->lr_common.lrc_txg,
1323 ASSERT(error || lr->lr_length <= size);
1326 * On success, we need to wait for the write I/O
1327 * initiated by dmu_sync() to complete before we can
1328 * release this dbuf. We will finish everything up
1329 * in the zfs_get_done() callback.
1334 if (error == EALREADY) {
1335 lr->lr_common.lrc_txtype = TX_WRITE2;
1337 * TX_WRITE2 relies on the data previously
1338 * written by the TX_WRITE that caused
1339 * EALREADY. We zero out the BP because
1340 * it is the old, currently-on-disk BP.
1349 zfs_get_done(zgd, error);
/*
 * zfs_access: VOP_ACCESS handler (fragmentary view). Dispatches to the
 * ACE-mask ACL check when V_ACE_MASK is set, otherwise to the classic
 * rwx permission check.
 */
1356 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1357 caller_context_t *ct)
1359 znode_t *zp = VTOZ(vp);
1360 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1366 if (flag & V_ACE_MASK)
1367 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1369 error = zfs_zaccess_rwx(zp, mode, flag, cr);
/* zfs_dd_callback: vn_vget_ino_gen() helper used by zfs_lookup_lock()'s
 * ".." path — locks the already-resolved vnode (fragmentary view). */
1376 zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
1381 error = vn_lock(*vpp, lkflags);
/*
 * zfs_lookup_lock: acquire the correct vnode lock for a lookup result
 * (fragmentary view). Handles three cases: "." (same vnode — upgrade or
 * downgrade dvp's lock to the requested type), ".." (parent lookup —
 * use vn_vget_ino_gen() to dance around the lock-order inversion), and
 * the ordinary child case (plain vn_lock on vp).
 */
1388 zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
1390 znode_t *zdp = VTOZ(dvp);
1391 zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1395 ASSERT_VOP_LOCKED(dvp, __func__);
1397 if ((zdp->z_pflags & ZFS_XATTR) == 0)
1398 VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
1401 if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
1402 ASSERT3P(dvp, ==, vp);
1404 ltype = lkflags & LK_TYPE_MASK;
1405 if (ltype != VOP_ISLOCKED(dvp)) {
1406 if (ltype == LK_EXCLUSIVE)
1407 vn_lock(dvp, LK_UPGRADE | LK_RETRY);
1408 else /* if (ltype == LK_SHARED) */
1409 vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
1412 * Relock for the "." case could leave us with
/* The upgrade may have transiently dropped the lock; check doomed. */
1415 if (dvp->v_iflag & VI_DOOMED) {
1417 return (SET_ERROR(ENOENT));
1421 } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
1423 * Note that in this case, dvp is the child vnode, and we
1424 * are looking up the parent vnode - exactly reverse from
1425 * normal operation. Unlocking dvp requires some rather
1426 * tricky unlock/relock dance to prevent mp from being freed;
1427 * use vn_vget_ino_gen() which takes care of all that.
1429 * XXX Note that there is a time window when both vnodes are
1430 * unlocked. It is possible, although highly unlikely, that
1431 * during that window the parent-child relationship between
1432 * the vnodes may change, for example, get reversed.
1433 * In that case we would have a wrong lock order for the vnodes.
1434 * All other filesystems seem to ignore this problem, so we
1436 * A potential solution could be implemented as follows:
1437 * - using LK_NOWAIT when locking the second vnode and retrying
1439 * - checking that the parent-child relationship still holds
1440 * after locking both vnodes and retrying if it doesn't
1442 error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
1445 error = vn_lock(vp, lkflags);
1453 * Lookup an entry in a directory, or an extended attribute directory.
1454 * If it exists, return a held vnode reference for it.
1456 * IN: dvp - vnode of directory to search.
1457 * nm - name of entry to lookup.
1458 * pnp - full pathname to lookup [UNUSED].
1459 * flags - LOOKUP_XATTR set if looking for an attribute.
1460 * rdir - root directory vnode [UNUSED].
1461 * cr - credentials of caller.
1462 * ct - caller context
1464 * OUT: vpp - vnode of located entry, NULL if not found.
1466 * RETURN: 0 on success, error code on failure.
/*
 * FreeBSD-style VOP_LOOKUP implementation for ZFS.  Visible stages:
 * fast-path type/handle validation, xattr-directory lookup
 * (LOOKUP_XATTR), directory ACE_EXECUTE permission check, UTF-8 name
 * validation, the ".." / ".zfs" special cases, the actual
 * zfs_dirlook() + zfs_lookup_lock() lookup, nameiop-dependent error
 * translation (ENOENT -> EJUSTRETURN for CREATE/RENAME), and name
 * cache insertion for both hits and negative entries.
 * NOTE(review): this excerpt is fragmentary — many interior lines
 * (locals, ZFS_ENTER/ZFS_EXIT, closing braces, several error paths)
 * are missing; the visible statements are not contiguous.
 */
1473 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1474 int nameiop, cred_t *cr, kthread_t *td, int flags)
1476 znode_t *zdp = VTOZ(dvp);
1478 zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1482 * Fast path lookup, however we must skip DNLC lookup
1483 * for case folding or normalizing lookups because the
1484 * DNLC code only stores the passed in name. This means
1485 * creating 'a' and removing 'A' on a case insensitive
1486 * file system would work, but DNLC still thinks 'a'
1487 * exists and won't let you create it again on the next
1488 * pass through fast path.
1490 if (!(flags & LOOKUP_XATTR)) {
1491 if (dvp->v_type != VDIR) {
1492 return (SET_ERROR(ENOTDIR));
1493 } else if (zdp->z_sa_hdl == NULL) {
1494 return (SET_ERROR(EIO));
1498 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
/* Extended-attribute directory lookup path. */
1505 if (flags & LOOKUP_XATTR) {
1508 * If the xattr property is off, refuse the lookup request.
1510 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1512 return (SET_ERROR(EINVAL));
1517 * We don't allow recursive attributes..
1518 * Maybe someday we will.
1520 if (zdp->z_pflags & ZFS_XATTR) {
1522 return (SET_ERROR(EINVAL));
1525 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1531 * Do we have permission to get into attribute directory?
1533 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1544 * Check accessibility of directory.
1546 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1551 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1552 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1554 return (SET_ERROR(EILSEQ));
1559 * First handle the special cases.
1561 if ((cnp->cn_flags & ISDOTDOT) != 0) {
1563 * If we are a snapshot mounted under .zfs, return
1564 * the vp for the snapshot directory.
1566 if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
1567 struct componentname cn;
1572 ltype = VOP_ISLOCKED(dvp);
1574 error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
1577 cn.cn_nameptr = "snapshot";
1578 cn.cn_namelen = strlen(cn.cn_nameptr);
1579 cn.cn_nameiop = cnp->cn_nameiop;
1580 cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
1581 cn.cn_lkflags = cnp->cn_lkflags;
1582 error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
1585 vn_lock(dvp, ltype | LK_RETRY);
/* Lookup of the ".zfs" control directory at the fs root. */
1589 if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
1591 if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
1592 return (SET_ERROR(ENOTSUP));
1593 error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
1598 * The loop is retry the lookup if the parent-child relationship
1599 * changes during the dot-dot locking complexities.
1604 error = zfs_dirlook(zdp, nm, &zp);
1612 error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
1615 * If we've got a locking error, then the vnode
1616 * got reclaimed because of a force unmount.
1617 * We never enter doomed vnodes into the name cache.
1623 if ((cnp->cn_flags & ISDOTDOT) == 0)
/* Re-check the parent-child relationship after the lock dance. */
1627 if (zdp->z_sa_hdl == NULL) {
1628 error = SET_ERROR(EIO);
1630 error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1631 &parent, sizeof (parent));
1638 if (zp->z_id == parent) {
1649 /* Translate errors and add SAVENAME when needed. */
1650 if (cnp->cn_flags & ISLASTCN) {
1654 if (error == ENOENT) {
1655 error = EJUSTRETURN;
1656 cnp->cn_flags |= SAVENAME;
1662 cnp->cn_flags |= SAVENAME;
1667 /* Insert name into cache (as non-existent) if appropriate. */
1668 if (zfsvfs->z_use_namecache &&
1669 error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
1670 cache_enter(dvp, NULL, cnp);
1672 /* Insert name into cache if appropriate. */
1673 if (zfsvfs->z_use_namecache &&
1674 error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1675 if (!(cnp->cn_flags & ISLASTCN) ||
1676 (nameiop != DELETE && nameiop != RENAME)) {
1677 cache_enter(dvp, *vpp, cnp);
1685 * Attempt to create a new entry in a directory. If the entry
1686 * already exists, truncate the file if permissible, else return
1687 * an error. Return the vp of the created or trunc'd file.
1689 * IN: dvp - vnode of directory to put new file entry in.
1690 * name - name of new file entry.
1691 * vap - attributes of new file.
1692 * excl - flag indicating exclusive or non-exclusive mode.
1693 * mode - mode to open file with.
1694 * cr - credentials of caller.
1695 * flag - large file flag [UNUSED].
1696 * ct - caller context
1697 * vsecp - ACL to be set
1699 * OUT: vpp - vnode of created or trunc'd entry.
1701 * RETURN: 0 on success, error code on failure.
1704 * dvp - ctime|mtime updated if new entry created
1705 * vp - ctime|mtime always, atime if new
/*
 * Create a regular file entry in directory dvp.  Visible stages:
 * FUID/version validation, UTF-8 name check, xvattr policy check,
 * sticky-bit policy, existence check (zfs_dirent_lookup w/ ZNEW),
 * ACE_ADD_FILE permission, ACL id creation + quota check, DMU
 * transaction setup (SA create, ZAP, parent SA, possibly ACL write),
 * zfs_mknode(), zfs_link_create(), ZIL logging, and a final
 * zil_commit() when sync=always.
 * NOTE(review): this excerpt is fragmentary — interior lines
 * (declarations of error/tx/txtype/os/vsecp/ksid, ZFS_ENTER/ZFS_EXIT,
 * goto-style cleanup, closing braces) are missing; the visible
 * statements are not contiguous.
 */
1710 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1711 vnode_t **vpp, cred_t *cr, kthread_t *td)
1713 znode_t *zp, *dzp = VTOZ(dvp);
1714 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1721 gid_t gid = crgetgid(cr);
1722 zfs_acl_ids_t acl_ids;
1723 boolean_t fuid_dirtied;
1729 * If we have an ephemeral id, ACL, or XVATTR then
1730 * make sure file system is at proper version
1733 ksid = crgetsid(cr, KSID_OWNER);
1735 uid = ksid_getid(ksid);
1739 if (zfsvfs->z_use_fuids == B_FALSE &&
1740 (vsecp || (vap->va_mask & AT_XVATTR) ||
1741 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1742 return (SET_ERROR(EINVAL));
1747 zilog = zfsvfs->z_log;
1749 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1750 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1752 return (SET_ERROR(EILSEQ));
1754 if (vap->va_mask & AT_XVATTR) {
1856 * Remove an entry from a directory.
1858 * IN: dvp - vnode of directory to remove entry from.
1859 * name - name of entry to remove.
1860 * cr - credentials of caller.
1861 * ct - caller context
1862 * flags - case flags
1864 * RETURN: 0 on success, error code on failure.
1868 * vp - ctime (if nlink > 0)
/*
 * Remove the named entry for vp from directory dvp (unlink).  Visible
 * stages: delete-permission check, EPERM for directories (rmdir must
 * be used), xattr-directory lookup on the victim, DMU transaction
 * holds covering both "delete now" and "move to unlinked set" cases,
 * zfs_link_destroy(), unlinked-set handling, ZIL logging, and
 * zil_commit() when sync=always.
 * NOTE(review): this excerpt is fragmentary — interior lines (locals
 * such as error/tx/xzp/obj/txtype/ct, ZFS_ENTER/ZFS_EXIT, cleanup
 * labels, closing braces) are missing; visible statements are not
 * contiguous.
 */
1873 zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
1875 znode_t *dzp = VTOZ(dvp);
1876 znode_t *zp = VTOZ(vp);
1878 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1880 uint64_t acl_obj, xattr_obj;
1883 boolean_t unlinked, toobig = FALSE;
1890 zilog = zfsvfs->z_log;
1896 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1901 * Need to use rmdir for removing directories.
1903 if (vp->v_type == VDIR) {
1904 error = SET_ERROR(EPERM);
1908 vnevent_remove(vp, dvp, name, ct);
1912 /* are there any extended attributes? */
1913 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1914 &xattr_obj, sizeof (xattr_obj));
1915 if (error == 0 && xattr_obj) {
1916 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1921 * We may delete the znode now, or we may put it in the unlinked set;
1922 * it depends on whether we're the last link, and on whether there are
1923 * other holds on the vnode. So we dmu_tx_hold() the right things to
1924 * allow for either case.
1926 tx = dmu_tx_create(zfsvfs->z_os);
1927 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1928 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1929 zfs_sa_upgrade_txholds(tx, zp);
1930 zfs_sa_upgrade_txholds(tx, dzp);
1933 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1934 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1937 /* charge as an update -- would be nice not to charge at all */
1938 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1941 * Mark this transaction as typically resulting in a net free of space
1943 dmu_tx_mark_netfree(tx);
1945 error = dmu_tx_assign(tx, TXG_WAIT);
1953 * Remove the directory entry.
1955 error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
/*
 * The victim still has links or holds; defer destruction via the
 * unlinked set and suppress syncing of its dirty pages.
 */
1963 zfs_unlinked_add(zp, tx);
1964 vp->v_vflag |= VV_NOSYNC;
1968 zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1976 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1977 zil_commit(zilog, 0);
1984 * Create a new directory and insert it into dvp using the name
1985 * provided. Return a pointer to the inserted directory.
1987 * IN: dvp - vnode of directory to add subdir to.
1988 * dirname - name of new directory.
1989 * vap - attributes of new directory.
1990 * cr - credentials of caller.
1991 * ct - caller context
1992 * flags - case flags
1993 * vsecp - ACL to be set
1995 * OUT: vpp - vnode of created directory.
1997 * RETURN: 0 on success, error code on failure.
2000 * dvp - ctime|mtime updated
2001 * vp - ctime|mtime|atime updated
/*
 * Create a subdirectory named dirname in dvp.  Visible stages:
 * FUID/version validation, EINVAL for mkdir inside an xattr dir,
 * UTF-8 name check, xvattr policy check, ACL id creation, existence
 * check before the permission check (so EEXIST wins over EACCES),
 * ACE_ADD_SUBDIRECTORY permission, quota check, DMU transaction setup,
 * zfs_mknode(), zfs_link_create(), ZIL logging, and zil_commit() when
 * sync=always.
 * NOTE(review): this excerpt is fragmentary — interior lines (locals
 * such as error/tx/txtype/ksid/uid, ZFS_ENTER/ZFS_EXIT, cleanup
 * paths, closing braces) are missing; visible statements are not
 * contiguous.
 */
2005 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
2007 znode_t *zp, *dzp = VTOZ(dvp);
2008 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
2015 gid_t gid = crgetgid(cr);
2016 zfs_acl_ids_t acl_ids;
2017 boolean_t fuid_dirtied;
2019 ASSERT(vap->va_type == VDIR);
2022 * If we have an ephemeral id, ACL, or XVATTR then
2023 * make sure file system is at proper version
2026 ksid = crgetsid(cr, KSID_OWNER);
2028 uid = ksid_getid(ksid);
2031 if (zfsvfs->z_use_fuids == B_FALSE &&
2032 ((vap->va_mask & AT_XVATTR) ||
2033 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2034 return (SET_ERROR(EINVAL));
2038 zilog = zfsvfs->z_log;
2040 if (dzp->z_pflags & ZFS_XATTR) {
2042 return (SET_ERROR(EINVAL));
2045 if (zfsvfs->z_utf8 && u8_validate(dirname,
2046 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2048 return (SET_ERROR(EILSEQ));
2051 if (vap->va_mask & AT_XVATTR) {
2052 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2053 crgetuid(cr), cr, vap->va_type)) != 0) {
2059 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2060 NULL, &acl_ids)) != 0) {
2066 * First make sure the new directory doesn't exist.
2068 * Existence is checked first to make sure we don't return
2069 * EACCES instead of EEXIST which can cause some applications
2074 if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
2075 zfs_acl_ids_free(&acl_ids);
2079 ASSERT3P(zp, ==, NULL);
2081 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2082 zfs_acl_ids_free(&acl_ids);
2087 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2088 zfs_acl_ids_free(&acl_ids);
2090 return (SET_ERROR(EDQUOT));
2094 * Add a new entry to the directory.
2096 getnewvnode_reserve(1);
2097 tx = dmu_tx_create(zfsvfs->z_os);
2098 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2099 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2100 fuid_dirtied = zfsvfs->z_fuid_dirty;
2102 zfs_fuid_txhold(zfsvfs, tx);
2103 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2104 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2105 acl_ids.z_aclp->z_acl_bytes);
2108 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2109 ZFS_SA_BASE_ATTR_SIZE);
2111 error = dmu_tx_assign(tx, TXG_WAIT);
2113 zfs_acl_ids_free(&acl_ids);
2115 getnewvnode_drop_reserve();
/* Create the new directory znode. */
2123 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2126 zfs_fuid_sync(zfsvfs, tx);
2129 * Now put new name in parent dir.
2131 (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
2135 txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
2136 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
2137 acl_ids.z_fuidp, vap);
2139 zfs_acl_ids_free(&acl_ids);
2143 getnewvnode_drop_reserve();
2145 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2146 zil_commit(zilog, 0);
2153 * Remove a directory subdir entry. If the current working
2154 * directory is the same as the subdir to be removed, the
2157 * IN: dvp - vnode of directory to remove from.
2158 * name - name of directory to be removed.
2159 * cwd - vnode of current working directory.
2160 * cr - credentials of caller.
2161 * ct - caller context
2162 * flags - case flags
2164 * RETURN: 0 on success, error code on failure.
2167 * dvp - ctime|mtime updated
/*
 * Remove the directory entry for vp from dvp.  Visible stages:
 * delete-permission check, ENOTDIR for non-directories, DMU
 * transaction holds (parent ZAP, victim SA, unlinked-set ZAP),
 * zfs_link_destroy(), TX_RMDIR ZIL logging, and zil_commit() when
 * sync=always.
 * NOTE(review): this excerpt is fragmentary — interior lines (locals
 * such as error/tx/ct, ZFS_ENTER/ZFS_EXIT, the tx-assign error path,
 * closing braces) are missing; visible statements are not contiguous.
 */
2171 zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2173 znode_t *dzp = VTOZ(dvp);
2174 znode_t *zp = VTOZ(vp);
2175 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
2183 zilog = zfsvfs->z_log;
2186 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2190 if (vp->v_type != VDIR) {
2191 error = SET_ERROR(ENOTDIR);
2195 vnevent_rmdir(vp, dvp, name, ct);
2197 tx = dmu_tx_create(zfsvfs->z_os);
2198 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2199 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2200 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2201 zfs_sa_upgrade_txholds(tx, zp);
2202 zfs_sa_upgrade_txholds(tx, dzp);
2203 dmu_tx_mark_netfree(tx);
2204 error = dmu_tx_assign(tx, TXG_WAIT);
2213 error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
2216 uint64_t txtype = TX_RMDIR;
2217 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2224 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2225 zil_commit(zilog, 0);
2232 * Read as many directory entries as will fit into the provided
2233 * buffer from the given directory cursor position (specified in
2234 * the uio structure).
2236 * IN: vp - vnode of directory to read.
2237 * uio - structure supplying read location, range info,
2238 * and return buffer.
2239 * cr - credentials of caller.
2240 * ct - caller context
2241 * flags - case flags
2243 * OUT: uio - updated offset and range, buffer filled.
2244 * eofp - set to true if end-of-file detected.
2246 * RETURN: 0 on success, error code on failure.
2249 * vp - atime updated
2251 * Note that the low 4 bits of the cookie returned by zap is always zero.
2252 * This allows us to use the low range for "special" directory entries:
2253 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
2254 * we use the offset 2 for the '.zfs' directory.
/*
 * Read directory entries into the caller's uio, FreeBSD flavor (also
 * fills an optional NFS cookies array).  Visible stages: parent SA
 * lookup, iov/unlinked validation, ZAP cursor init (fresh or from a
 * serialized offset), buffer selection (bounce buffer vs. direct
 * write into a single-iov UIO_SYSSPACE request), the main entry loop
 * that synthesizes ".", ".." and ".zfs" before walking the ZAP,
 * per-entry formatting as dirent64 or edirent, prefetch, cursor
 * advance, cookie bookkeeping, uiomove of the bounce buffer, cursor
 * fini, atime stamp, and cookie cleanup on error.
 * NOTE(review): this excerpt is fragmentary — interior lines (locals
 * such as os/error/outcount/iovp/odp/eodp/outbuf/objnum/reclen/type/
 * ezp/flags, several braces and error-path statements) are missing;
 * visible statements are not contiguous.
 */
2258 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2260 znode_t *zp = VTOZ(vp);
2264 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2269 zap_attribute_t zap;
2270 uint_t bytes_wanted;
2271 uint64_t offset; /* must be unsigned; checks for < 1 */
2277 boolean_t check_sysattrs;
2280 u_long *cooks = NULL;
2286 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2287 &parent, sizeof (parent))) != 0) {
2293 * If we are not given an eof variable,
2300 * Check for valid iov_len.
2302 if (uio->uio_iov->iov_len <= 0) {
2304 return (SET_ERROR(EINVAL));
2308 * Quit if directory has been removed (posix)
2310 if ((*eofp = zp->z_unlinked) != 0) {
2317 offset = uio->uio_loffset;
2318 prefetch = zp->z_zn_prefetch;
2321 * Initialize the iterator cursor.
2325 * Start iteration from the beginning of the directory.
2327 zap_cursor_init(&zc, os, zp->z_id);
2330 * The offset is a serialized cursor.
2332 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2336 * Get space to change directory entries into fs independent format.
2338 iovp = uio->uio_iov;
2339 bytes_wanted = iovp->iov_len;
2340 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2341 bufsize = bytes_wanted;
2342 outbuf = kmem_alloc(bufsize, KM_SLEEP);
2343 odp = (struct dirent64 *)outbuf;
2345 bufsize = bytes_wanted;
2347 odp = (struct dirent64 *)iovp->iov_base;
2349 eodp = (struct edirent *)odp;
2351 if (ncookies != NULL) {
2353 * Minimum entry size is dirent size and 1 byte for a file name.
2355 ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2356 cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2361 * If this VFS supports the system attribute view interface; and
2362 * we're looking at an extended attribute directory; and we care
2363 * about normalization conflicts on this vfs; then we must check
2364 * for normalization conflicts with the sysattr name space.
2367 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2368 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2369 (flags & V_RDDIR_ENTFLAGS);
2375 * Transform to file-system independent format
2378 while (outcount < bytes_wanted) {
2381 off64_t *next = NULL;
2384 * Special case `.', `..', and `.zfs'.
2387 (void) strcpy(zap.za_name, ".");
2388 zap.za_normalization_conflict = 0;
2391 } else if (offset == 1) {
2392 (void) strcpy(zap.za_name, "..");
2393 zap.za_normalization_conflict = 0;
2396 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2397 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2398 zap.za_normalization_conflict = 0;
2399 objnum = ZFSCTL_INO_ROOT;
2405 if (error = zap_cursor_retrieve(&zc, &zap)) {
2406 if ((*eofp = (error == ENOENT)) != 0)
2412 if (zap.za_integer_length != 8 ||
2413 zap.za_num_integers != 1) {
2414 cmn_err(CE_WARN, "zap_readdir: bad directory "
2415 "entry, obj = %lld, offset = %lld\n",
2416 (u_longlong_t)zp->z_id,
2417 (u_longlong_t)offset);
2418 error = SET_ERROR(ENXIO);
2422 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2424 * MacOS X can extract the object type here such as:
2425 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2427 type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2429 if (check_sysattrs && !zap.za_normalization_conflict) {
2431 zap.za_normalization_conflict =
2432 xattr_sysattr_casechk(zap.za_name);
2434 panic("%s:%u: TODO", __func__, __LINE__);
2439 if (flags & V_RDDIR_ACCFILTER) {
2441 * If we have no access at all, don't include
2442 * this entry in the returned information
2445 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2447 if (!zfs_has_access(ezp, cr)) {
2454 if (flags & V_RDDIR_ENTFLAGS)
2455 reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2457 reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2460 * Will this entry fit in the buffer?
2462 if (outcount + reclen > bufsize) {
2464 * Did we manage to fit anything in the buffer?
2467 error = SET_ERROR(EINVAL);
2472 if (flags & V_RDDIR_ENTFLAGS) {
2474 * Add extended flag entry:
2476 eodp->ed_ino = objnum;
2477 eodp->ed_reclen = reclen;
2478 /* NOTE: ed_off is the offset for the *next* entry. */
2479 next = &eodp->ed_off;
2480 eodp->ed_eflags = zap.za_normalization_conflict ?
2481 ED_CASE_CONFLICT : 0;
2482 (void) strncpy(eodp->ed_name, zap.za_name,
2483 EDIRENT_NAMELEN(reclen));
2484 eodp = (edirent_t *)((intptr_t)eodp + reclen);
2489 odp->d_ino = objnum;
2490 odp->d_reclen = reclen;
2491 odp->d_namlen = strlen(zap.za_name);
2492 /* NOTE: d_off is the offset for the *next* entry. */
2494 (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2496 dirent_terminate(odp);
2497 odp = (dirent64_t *)((intptr_t)odp + reclen);
2501 ASSERT(outcount <= bufsize);
2503 /* Prefetch znode */
2505 dmu_prefetch(os, objnum, 0, 0, 0,
2506 ZIO_PRIORITY_SYNC_READ);
2510 * Move to the next entry, fill in the previous offset.
2512 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2513 zap_cursor_advance(&zc);
2514 offset = zap_cursor_serialize(&zc);
2519 /* Fill the offset right after advancing the cursor. */
2522 if (cooks != NULL) {
2525 KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2528 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2530 /* Subtract unused cookies */
2531 if (ncookies != NULL)
2532 *ncookies -= ncooks;
2534 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2535 iovp->iov_base += outcount;
2536 iovp->iov_len -= outcount;
2537 uio->uio_resid -= outcount;
2538 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2540 * Reset the pointer.
2542 offset = uio->uio_loffset;
2546 zap_cursor_fini(&zc);
2547 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2548 kmem_free(outbuf, bufsize);
2550 if (error == ENOENT)
2553 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2555 uio->uio_loffset = offset;
2557 if (error != 0 && cookies != NULL) {
2558 free(*cookies, M_TEMP);
/*
 * Value stored into the zfs_fsyncer_key TSD slot by zfs_fsync() below.
 * NOTE(review): the consumer of this TSD value is not visible in this
 * excerpt — confirm its semantics against the code that reads
 * zfs_fsyncer_key before tuning.
 */
2565 ulong_t zfs_fsync_sync_cnt = 4;
/*
 * Flush the file's pending ZIL records to stable storage.  Records the
 * fsync hint (zfs_fsync_sync_cnt) in thread-specific data, then commits
 * the intent log for this object unless sync is disabled.
 * NOTE(review): this excerpt is fragmentary — the return statement,
 * ZFS_ENTER/ZFS_EXIT and closing braces are not visible here.
 */
2568 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2570 znode_t *zp = VTOZ(vp);
2571 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2573 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2575 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2578 zil_commit(zfsvfs->z_log, zp->z_id);
2586 * Get the requested file attributes and place them in the provided
2589 * IN: vp - vnode of file.
2590 * vap - va_mask identifies requested attributes.
2591 * If AT_XVATTR set, then optional attrs are requested
2592 * flags - ATTR_NOACLCHECK (CIFS server context)
2593 * cr - credentials of caller.
2594 * ct - caller context
2596 * OUT: vap - attribute values.
2598 * RETURN: 0 (always succeeds).
/*
 * Fill *vap with the file's attributes.  Visible stages: FUID -> uid/
 * gid mapping, SA bulk lookup of mtime/ctime/crtime (and rdev for
 * devices), an ACE_READ_ATTRIBUTES check for non-trivial ACLs when the
 * caller is not the owner, population of all basic vattr fields, the
 * optional-xattr (xvattr) block copying ZFS z_pflags bits into xoa_*
 * fields with XVA_SET_RTN bookkeeping, timestamp decoding, and block
 * size/usage reporting.
 * NOTE(review): this excerpt is fragmentary — interior lines (locals
 * such as error/blksize/count, ZFS_ENTER/ZFS_EXIT, several assignment
 * left-hand sides in the xoa block, closing braces and the final
 * return) are missing; visible statements are not contiguous.
 */
2602 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2603 caller_context_t *ct)
2605 znode_t *zp = VTOZ(vp);
2606 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2609 u_longlong_t nblocks;
2610 uint64_t mtime[2], ctime[2], crtime[2], rdev;
2611 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2612 xoptattr_t *xoap = NULL;
2613 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2614 sa_bulk_attr_t bulk[4];
2620 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2622 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2623 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2624 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
2625 if (vp->v_type == VBLK || vp->v_type == VCHR)
2626 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2629 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2635 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2636 * Also, if we are the owner don't bother, since owner should
2637 * always be allowed to read basic attributes of file.
2639 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2640 (vap->va_uid != crgetuid(cr))) {
2641 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2649 * Return all attributes. It's cheaper to provide the answer
2650 * than to determine whether we were asked the question.
2653 vap->va_type = IFTOVT(zp->z_mode);
2654 vap->va_mode = zp->z_mode & ~S_IFMT;
2656 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2660 vap->va_nodeid = zp->z_id;
2661 vap->va_nlink = zp->z_links;
2662 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
2663 zp->z_links < ZFS_LINK_MAX)
2665 vap->va_size = zp->z_size;
2667 vap->va_rdev = vp->v_rdev;
2669 if (vp->v_type == VBLK || vp->v_type == VCHR)
2670 vap->va_rdev = zfs_cmpldev(rdev);
2672 vap->va_seq = zp->z_seq;
2673 vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
2674 vap->va_filerev = zp->z_seq;
2677 * Add in any requested optional attributes and the create time.
2678 * Also set the corresponding bits in the returned attribute bitmap.
2680 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2681 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2683 ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2684 XVA_SET_RTN(xvap, XAT_ARCHIVE);
2687 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2688 xoap->xoa_readonly =
2689 ((zp->z_pflags & ZFS_READONLY) != 0);
2690 XVA_SET_RTN(xvap, XAT_READONLY);
2693 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2695 ((zp->z_pflags & ZFS_SYSTEM) != 0);
2696 XVA_SET_RTN(xvap, XAT_SYSTEM);
2699 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2701 ((zp->z_pflags & ZFS_HIDDEN) != 0);
2702 XVA_SET_RTN(xvap, XAT_HIDDEN);
2705 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2706 xoap->xoa_nounlink =
2707 ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2708 XVA_SET_RTN(xvap, XAT_NOUNLINK);
2711 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2712 xoap->xoa_immutable =
2713 ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2714 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2717 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2718 xoap->xoa_appendonly =
2719 ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2720 XVA_SET_RTN(xvap, XAT_APPENDONLY);
2723 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2725 ((zp->z_pflags & ZFS_NODUMP) != 0);
2726 XVA_SET_RTN(xvap, XAT_NODUMP);
2729 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2731 ((zp->z_pflags & ZFS_OPAQUE) != 0);
2732 XVA_SET_RTN(xvap, XAT_OPAQUE);
2735 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2736 xoap->xoa_av_quarantined =
2737 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2738 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2741 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2742 xoap->xoa_av_modified =
2743 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2744 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2747 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2748 vp->v_type == VREG) {
2749 zfs_sa_get_scanstamp(zp, xvap);
2752 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2753 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2754 XVA_SET_RTN(xvap, XAT_REPARSE);
2756 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2757 xoap->xoa_generation = zp->z_gen;
2758 XVA_SET_RTN(xvap, XAT_GEN);
2761 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2763 ((zp->z_pflags & ZFS_OFFLINE) != 0);
2764 XVA_SET_RTN(xvap, XAT_OFFLINE);
2767 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2769 ((zp->z_pflags & ZFS_SPARSE) != 0);
2770 XVA_SET_RTN(xvap, XAT_SPARSE);
2774 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2775 ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2776 ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2777 ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2780 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2781 vap->va_blksize = blksize;
2782 vap->va_bytes = nblocks << 9; /* nblocks * 512 */
2784 if (zp->z_blksz == 0) {
2786 * Block size hasn't been set; suggest maximal I/O transfers.
2788 vap->va_blksize = zfsvfs->z_max_blksz;
2796 * Set the file attributes to the values contained in the
2799 * IN: vp - vnode of file to be modified.
2800 * vap - new attribute values.
2801 * If AT_XVATTR set, then optional attrs are being set
2802 * flags - ATTR_UTIME set if non-default time values provided.
2803 * - ATTR_NOACLCHECK (CIFS context only).
2804 * cr - credentials of caller.
2805 * ct - caller context
2807 * RETURN: 0 on success, error code on failure.
2810 * vp - ctime updated, mtime updated if size changed.
2814 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2815 caller_context_t *ct)
2817 znode_t *zp = VTOZ(vp);
2818 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2823 uint_t mask = vap->va_mask;
2824 uint_t saved_mask = 0;
2825 uint64_t saved_mode;
2828 uint64_t new_uid, new_gid;
2830 uint64_t mtime[2], ctime[2];
2832 int need_policy = FALSE;
2834 zfs_fuid_info_t *fuidp = NULL;
2835 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2838 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2839 boolean_t fuid_dirtied = B_FALSE;
2840 sa_bulk_attr_t bulk[7], xattr_bulk[7];
2841 int count = 0, xattr_count = 0;
2846 if (mask & AT_NOSET)
2847 return (SET_ERROR(EINVAL));
2852 zilog = zfsvfs->z_log;
2855 * Make sure that if we have ephemeral uid/gid or xvattr specified
2856 * that file system is at proper version level
2859 if (zfsvfs->z_use_fuids == B_FALSE &&
2860 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2861 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2862 (mask & AT_XVATTR))) {
2864 return (SET_ERROR(EINVAL));
2867 if (mask & AT_SIZE && vp->v_type == VDIR) {
2869 return (SET_ERROR(EISDIR));
2872 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2874 return (SET_ERROR(EINVAL));
2878 * If this is an xvattr_t, then get a pointer to the structure of
2879 * optional attributes. If this is NULL, then we have a vattr_t.
2881 xoap = xva_getxoptattr(xvap);
2883 xva_init(&tmpxvattr);
2886 * Immutable files can only alter immutable bit and atime
2888 if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2889 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2890 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2892 return (SET_ERROR(EPERM));
2896 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
2900 * Verify timestamps doesn't overflow 32 bits.
2901 * ZFS can handle large timestamps, but 32bit syscalls can't
2902 * handle times greater than 2039. This check should be removed
2903 * once large timestamps are fully supported.
2905 if (mask & (AT_ATIME | AT_MTIME)) {
2906 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2907 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2909 return (SET_ERROR(EOVERFLOW));
2912 if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
2913 TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
2915 return (SET_ERROR(EOVERFLOW));
2921 /* Can this be moved to before the top label? */
2922 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2924 return (SET_ERROR(EROFS));
2928 * First validate permissions
2931 if (mask & AT_SIZE) {
2933 * XXX - Note, we are not providing any open
2934 * mode flags here (like FNDELAY), so we may
2935 * block if there are locks present... this
2936 * should be addressed in openat().
2938 /* XXX - would it be OK to generate a log record here? */
2939 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2946 if (mask & (AT_ATIME|AT_MTIME) ||
2947 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2948 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2949 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2950 XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2951 XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2952 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2953 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2954 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2958 if (mask & (AT_UID|AT_GID)) {
2959 int idmask = (mask & (AT_UID|AT_GID));
2964 * NOTE: even if a new mode is being set,
2965 * we may clear S_ISUID/S_ISGID bits.
2968 if (!(mask & AT_MODE))
2969 vap->va_mode = zp->z_mode;
2972 * Take ownership or chgrp to group we are a member of
2975 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2976 take_group = (mask & AT_GID) &&
2977 zfs_groupmember(zfsvfs, vap->va_gid, cr);
2980 * If both AT_UID and AT_GID are set then take_owner and
2981 * take_group must both be set in order to allow taking
2984 * Otherwise, send the check through secpolicy_vnode_setattr()
2988 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2989 ((idmask == AT_UID) && take_owner) ||
2990 ((idmask == AT_GID) && take_group)) {
2991 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2992 skipaclchk, cr) == 0) {
2994 * Remove setuid/setgid for non-privileged users
2996 secpolicy_setid_clear(vap, vp, cr);
2997 trim_mask = (mask & (AT_UID|AT_GID));
3006 oldva.va_mode = zp->z_mode;
3007 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3008 if (mask & AT_XVATTR) {
3010 * Update xvattr mask to include only those attributes
3011 * that are actually changing.
3013 * the bits will be restored prior to actually setting
3014 * the attributes so the caller thinks they were set.
3016 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3017 if (xoap->xoa_appendonly !=
3018 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3021 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3022 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3026 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3027 if (xoap->xoa_nounlink !=
3028 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3031 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3032 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3036 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3037 if (xoap->xoa_immutable !=
3038 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3041 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3042 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3046 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3047 if (xoap->xoa_nodump !=
3048 ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3051 XVA_CLR_REQ(xvap, XAT_NODUMP);
3052 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3056 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3057 if (xoap->xoa_av_modified !=
3058 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3061 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3062 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3066 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3067 if ((vp->v_type != VREG &&
3068 xoap->xoa_av_quarantined) ||
3069 xoap->xoa_av_quarantined !=
3070 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3073 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3074 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3078 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3080 return (SET_ERROR(EPERM));
3083 if (need_policy == FALSE &&
3084 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3085 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3090 if (mask & AT_MODE) {
3091 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3092 err = secpolicy_setid_setsticky_clear(vp, vap,
3098 trim_mask |= AT_MODE;
3106 * If trim_mask is set then take ownership
3107 * has been granted or write_acl is present and user
3108 * has the ability to modify mode. In that case remove
3109 * UID|GID and or MODE from mask so that
3110 * secpolicy_vnode_setattr() doesn't revoke it.
3114 saved_mask = vap->va_mask;
3115 vap->va_mask &= ~trim_mask;
3116 if (trim_mask & AT_MODE) {
3118 * Save the mode, as secpolicy_vnode_setattr()
3119 * will overwrite it with ova.va_mode.
3121 saved_mode = vap->va_mode;
3124 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3125 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3132 vap->va_mask |= saved_mask;
3133 if (trim_mask & AT_MODE) {
3135 * Recover the mode after
3136 * secpolicy_vnode_setattr().
3138 vap->va_mode = saved_mode;
3144 * secpolicy_vnode_setattr, or take ownership may have
3147 mask = vap->va_mask;
3149 if ((mask & (AT_UID | AT_GID))) {
3150 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3151 &xattr_obj, sizeof (xattr_obj));
3153 if (err == 0 && xattr_obj) {
3154 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3156 err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
3158 vrele(ZTOV(attrzp));
3163 if (mask & AT_UID) {
3164 new_uid = zfs_fuid_create(zfsvfs,
3165 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3166 if (new_uid != zp->z_uid &&
3167 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3170 err = SET_ERROR(EDQUOT);
3175 if (mask & AT_GID) {
3176 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3177 cr, ZFS_GROUP, &fuidp);
3178 if (new_gid != zp->z_gid &&
3179 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3182 err = SET_ERROR(EDQUOT);
3187 tx = dmu_tx_create(zfsvfs->z_os);
3189 if (mask & AT_MODE) {
3190 uint64_t pmode = zp->z_mode;
3192 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3194 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3195 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3196 err = SET_ERROR(EPERM);
3200 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3203 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3205 * Are we upgrading ACL from old V0 format
3208 if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3209 zfs_znode_acl_version(zp) ==
3210 ZFS_ACL_VERSION_INITIAL) {
3211 dmu_tx_hold_free(tx, acl_obj, 0,
3213 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3214 0, aclp->z_acl_bytes);
3216 dmu_tx_hold_write(tx, acl_obj, 0,
3219 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3220 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3221 0, aclp->z_acl_bytes);
3223 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3225 if ((mask & AT_XVATTR) &&
3226 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3227 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3229 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3233 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3236 fuid_dirtied = zfsvfs->z_fuid_dirty;
3238 zfs_fuid_txhold(zfsvfs, tx);
3240 zfs_sa_upgrade_txholds(tx, zp);
3242 err = dmu_tx_assign(tx, TXG_WAIT);
3248 * Set each attribute requested.
3249 * We group settings according to the locks they need to acquire.
3251 * Note: you cannot set ctime directly, although it will be
3252 * updated as a side-effect of calling this function.
3255 if (mask & (AT_UID|AT_GID|AT_MODE))
3256 mutex_enter(&zp->z_acl_lock);
3258 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3259 &zp->z_pflags, sizeof (zp->z_pflags));
3262 if (mask & (AT_UID|AT_GID|AT_MODE))
3263 mutex_enter(&attrzp->z_acl_lock);
3264 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3265 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3266 sizeof (attrzp->z_pflags));
3269 if (mask & (AT_UID|AT_GID)) {
3271 if (mask & AT_UID) {
3272 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3273 &new_uid, sizeof (new_uid));
3274 zp->z_uid = new_uid;
3276 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3277 SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3279 attrzp->z_uid = new_uid;
3283 if (mask & AT_GID) {
3284 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3285 NULL, &new_gid, sizeof (new_gid));
3286 zp->z_gid = new_gid;
3288 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3289 SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3291 attrzp->z_gid = new_gid;
3294 if (!(mask & AT_MODE)) {
3295 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3296 NULL, &new_mode, sizeof (new_mode));
3297 new_mode = zp->z_mode;
3299 err = zfs_acl_chown_setattr(zp);
3302 err = zfs_acl_chown_setattr(attrzp);
3307 if (mask & AT_MODE) {
3308 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3309 &new_mode, sizeof (new_mode));
3310 zp->z_mode = new_mode;
3311 ASSERT3U((uintptr_t)aclp, !=, 0);
3312 err = zfs_aclset_common(zp, aclp, cr, tx);
3314 if (zp->z_acl_cached)
3315 zfs_acl_free(zp->z_acl_cached);
3316 zp->z_acl_cached = aclp;
3321 if (mask & AT_ATIME) {
3322 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3323 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3324 &zp->z_atime, sizeof (zp->z_atime));
3327 if (mask & AT_MTIME) {
3328 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3329 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3330 mtime, sizeof (mtime));
3333 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3334 if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3335 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3336 NULL, mtime, sizeof (mtime));
3337 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3338 &ctime, sizeof (ctime));
3339 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3341 } else if (mask != 0) {
3342 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3343 &ctime, sizeof (ctime));
3344 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3347 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3348 SA_ZPL_CTIME(zfsvfs), NULL,
3349 &ctime, sizeof (ctime));
3350 zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3351 mtime, ctime, B_TRUE);
3355 * Do this after setting timestamps to prevent timestamp
3356 * update from toggling bit
3359 if (xoap && (mask & AT_XVATTR)) {
3361 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
3362 xoap->xoa_createtime = vap->va_birthtime;
3364 * restore trimmed off masks
3365 * so that return masks can be set for caller.
3368 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3369 XVA_SET_REQ(xvap, XAT_APPENDONLY);
3371 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3372 XVA_SET_REQ(xvap, XAT_NOUNLINK);
3374 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3375 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3377 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3378 XVA_SET_REQ(xvap, XAT_NODUMP);
3380 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3381 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3383 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3384 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3387 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3388 ASSERT(vp->v_type == VREG);
3390 zfs_xvattr_set(zp, xvap, tx);
3394 zfs_fuid_sync(zfsvfs, tx);
3397 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3399 if (mask & (AT_UID|AT_GID|AT_MODE))
3400 mutex_exit(&zp->z_acl_lock);
3403 if (mask & (AT_UID|AT_GID|AT_MODE))
3404 mutex_exit(&attrzp->z_acl_lock);
3407 if (err == 0 && attrzp) {
3408 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3420 zfs_fuid_info_free(fuidp);
3427 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3432 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3433 zil_commit(zilog, 0);
3440 * We acquire all but fdvp locks using non-blocking acquisitions. If we
3441 * fail to acquire any lock in the path we will drop all held locks,
3442 * acquire the new lock in a blocking fashion, and then release it and
3443 * restart the rename. This acquire/release step ensures that we do not
3444 * spin on a lock waiting for release. On error release all vnode locks
3445 * and decrement references the way tmpfs_rename() would do.
/*
 * Re-acquire, in a deadlock-free way, the four vnode locks that rename
 * needs: source directory (sdvp), source vnode (*svpp), target directory
 * (tdvp) and optional target vnode (*tvpp).  Only sdvp is taken blocking;
 * the rest use LK_NOWAIT, and on contention all held locks are dropped,
 * the contended lock is taken blocking once (to wait out the holder),
 * released, and the whole sequence restarts — see the comment above the
 * function.  After relocking, the znodes are revalidated because a
 * rollback/receive can invalidate them without the vnode lock held.
 *
 * NOTE(review): this extraction elides interleaved lines (retry labels,
 * braces, error paths); do not rely on the exact control flow shown here
 * without consulting the complete file.
 */
3448 zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
3449 struct vnode *tdvp, struct vnode **tvpp,
3450 const struct componentname *scnp, const struct componentname *tcnp)
3453 struct vnode *nvp, *svp, *tvp;
3454 znode_t *sdzp, *tdzp, *szp, *tzp;
3455 const char *snm = scnp->cn_nameptr;
3456 const char *tnm = tcnp->cn_nameptr;
/* Drop the target-side locks before the blocking acquisition of sdvp. */
3459 VOP_UNLOCK(tdvp, 0);
3460 if (*tvpp != NULL && *tvpp != tdvp)
3461 VOP_UNLOCK(*tvpp, 0);
3464 error = vn_lock(sdvp, LK_EXCLUSIVE);
/* Try tdvp non-blocking; on EBUSY wait for it blocking, then retry all. */
3469 error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
3471 VOP_UNLOCK(sdvp, 0);
3474 error = vn_lock(tdvp, LK_EXCLUSIVE);
3477 VOP_UNLOCK(tdvp, 0);
3483 * Before using sdzp and tdzp we must ensure that they are live.
3484 * As a porting legacy from illumos we have two things to worry
3485 * about. One is typical for FreeBSD and it is that the vnode is
3486 * not reclaimed (doomed). The other is that the znode is live.
3487 * The current code can invalidate the znode without acquiring the
3488 * corresponding vnode lock if the object represented by the znode
3489 * and vnode is no longer valid after a rollback or receive operation.
3490 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
3491 * that protects the znodes from the invalidation.
3493 zfsvfs = sdzp->z_zfsvfs;
3494 ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
3498 * We can not use ZFS_VERIFY_ZP() here because it could directly return
3499 * bypassing the cleanup code in the case of an error.
3501 if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3503 VOP_UNLOCK(sdvp, 0);
3504 VOP_UNLOCK(tdvp, 0);
3505 error = SET_ERROR(EIO);
3510 * Re-resolve svp to be certain it still exists and fetch the
3513 error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
3515 /* Source entry invalid or not there. */
3517 VOP_UNLOCK(sdvp, 0);
3518 VOP_UNLOCK(tdvp, 0);
/* "." and ".." may not be renamed; report EINVAL instead of the lookup error. */
3519 if ((scnp->cn_flags & ISDOTDOT) != 0 ||
3520 (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
3521 error = SET_ERROR(EINVAL);
3527 * Re-resolve tvp, if it disappeared we just carry on.
3529 error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
3532 VOP_UNLOCK(sdvp, 0);
3533 VOP_UNLOCK(tdvp, 0);
3535 if ((tcnp->cn_flags & ISDOTDOT) != 0)
3536 error = SET_ERROR(EINVAL);
3545 * At present the vnode locks must be acquired before z_teardown_lock,
3546 * although it would be more logical to use the opposite order.
3551 * Now try acquire locks on svp and tvp.
3554 error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
3556 VOP_UNLOCK(sdvp, 0);
3557 VOP_UNLOCK(tdvp, 0);
3560 if (error != EBUSY) {
/* Wait for the contended lock, then release and restart the sequence. */
3564 error = vn_lock(nvp, LK_EXCLUSIVE);
3571 * Concurrent rename race.
3576 error = SET_ERROR(EINVAL);
3591 error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
3593 VOP_UNLOCK(sdvp, 0);
3594 VOP_UNLOCK(tdvp, 0);
3595 VOP_UNLOCK(*svpp, 0);
3596 if (error != EBUSY) {
3600 error = vn_lock(nvp, LK_EXCLUSIVE);
3618 * Note that we must use VRELE_ASYNC in this function as it walks
3619 * up the directory tree and vrele may need to acquire an exclusive
3620 * lock if a last reference to a vnode is dropped.
/*
 * Validate that moving szp from sdzp into tdzp does not create a cycle:
 * walk from the target directory up through SA_ZPL_PARENT links toward
 * the filesystem root, failing with EINVAL if szp is encountered on the
 * way (i.e. tdzp is inside the subtree rooted at szp).  Uses
 * VN_RELE_ASYNC for the intermediate vnodes — see the comment above —
 * because dropping a last reference here may need an exclusive lock.
 *
 * NOTE(review): loop framing and some early-exit lines are elided in
 * this extraction.
 */
3623 zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
3630 zfsvfs = tdzp->z_zfsvfs;
3632 return (SET_ERROR(EINVAL));
/* Renaming directly into the root directory cannot form a cycle. */
3635 if (tdzp->z_id == zfsvfs->z_root)
3639 ASSERT(!zp->z_unlinked);
3640 if ((error = sa_lookup(zp->z_sa_hdl,
3641 SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
/* Found szp on the ancestor chain: the move would orphan a subtree. */
3644 if (parent == szp->z_id) {
3645 error = SET_ERROR(EINVAL);
3648 if (parent == zfsvfs->z_root)
3650 if (parent == sdzp->z_id)
3653 error = zfs_zget(zfsvfs, parent, &zp1);
3658 VN_RELE_ASYNC(ZTOV(zp),
3659 dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3663 if (error == ENOTDIR)
3664 panic("checkpath: .. not a directory\n");
3666 VN_RELE_ASYNC(ZTOV(zp),
3667 dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3672 * Move an entry from the provided source directory to the target
3673 * directory. Change the entry name as indicated.
3675 * IN: sdvp - Source directory containing the "old entry".
3676 * snm - Old entry name.
3677 * tdvp - Target directory to contain the "new entry".
3678 * tnm - New entry name.
3679 * cr - credentials of caller.
3680 * ct - caller context
3681 * flags - case flags
3683 * RETURN: 0 on success, error code on failure.
3686 * sdvp,tdvp - ctime|mtime updated
/*
 * Rename entry snm in sdvp to tnm in tdvp (contract documented in the
 * block comment above).  Sequence: reject cross-filesystem and .zfs
 * targets; take all four vnode locks via zfs_rename_relock(); revalidate
 * znodes; run access/validity checks; build and assign a DMU transaction
 * covering both directories' ZAPs and all affected SA handles; then
 * destroy the old target (if any), create the new link, and destroy the
 * source link, logging TX_RENAME on success.
 *
 * NOTE(review): many lines (declarations, braces, goto targets between
 * the checks) are elided by this extraction; the unlockout/out labels at
 * the bottom mark the two cleanup tiers.
 */
3690 zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3691 vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3695 znode_t *sdzp, *tdzp, *szp, *tzp;
3696 zilog_t *zilog = NULL;
3698 char *snm = scnp->cn_nameptr;
3699 char *tnm = tcnp->cn_nameptr;
3702 /* Reject renames across filesystems. */
3703 if ((*svpp)->v_mount != tdvp->v_mount ||
3704 ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
3705 error = SET_ERROR(EXDEV);
/* Renaming into the .zfs control directory is also a cross-device move. */
3709 if (zfsctl_is_node(tdvp)) {
3710 error = SET_ERROR(EXDEV);
3715 * Lock all four vnodes to ensure safety and semantics of renaming.
3717 error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
3719 /* no vnodes are locked in the case of error here */
3725 zfsvfs = tdzp->z_zfsvfs;
3726 zilog = zfsvfs->z_log;
3729 * After we re-enter ZFS_ENTER() we will have to revalidate all
3734 if (zfsvfs->z_utf8 && u8_validate(tnm,
3735 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3736 error = SET_ERROR(EILSEQ);
3740 /* If source and target are the same file, there is nothing to do. */
3741 if ((*svpp) == (*tvpp)) {
/* Refuse to move a mount point or clobber one. */
3746 if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
3747 ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
3748 (*tvpp)->v_mountedhere != NULL)) {
3749 error = SET_ERROR(EXDEV);
3754 * We can not use ZFS_VERIFY_ZP() here because it could directly return
3755 * bypassing the cleanup code in the case of an error.
3757 if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3758 error = SET_ERROR(EIO);
3763 tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
3764 if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
3765 error = SET_ERROR(EIO);
3770 * This is to prevent the creation of links into attribute space
3771 * by renaming a linked file into/outof an attribute directory.
3772 * See the comment in zfs_link() for why this is considered bad.
3774 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3775 error = SET_ERROR(EINVAL);
3780 * Must have write access at the source to remove the old entry
3781 * and write access at the target to create the new entry.
3782 * Note that if target and source are the same, this can be
3783 * done in a single check.
3785 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3788 if ((*svpp)->v_type == VDIR) {
3790 * Avoid ".", "..", and aliases of "." for obvious reasons.
3792 if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
3794 (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
3800 * Check to make sure rename is valid.
3801 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3803 if (error = zfs_rename_check(szp, sdzp, tdzp))
3808 * Does target exist?
3812 * Source and target must be the same type.
3814 if ((*svpp)->v_type == VDIR) {
3815 if ((*tvpp)->v_type != VDIR) {
3816 error = SET_ERROR(ENOTDIR);
3824 if ((*tvpp)->v_type == VDIR) {
3825 error = SET_ERROR(EISDIR);
3831 vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
3833 vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
3836 * notify the target directory if it is not the same
3837 * as source directory.
3840 vnevent_rename_dest_dir(tdvp, ct);
/* Hold every object the rename touches before assigning the tx. */
3843 tx = dmu_tx_create(zfsvfs->z_os);
3844 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3845 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3846 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3847 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3849 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3850 zfs_sa_upgrade_txholds(tx, tdzp);
3853 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3854 zfs_sa_upgrade_txholds(tx, tzp);
3857 zfs_sa_upgrade_txholds(tx, szp);
3858 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3859 error = dmu_tx_assign(tx, TXG_WAIT);
3866 if (tzp) /* Attempt to remove the existing target */
3867 error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
3870 error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
3872 szp->z_pflags |= ZFS_AV_MODIFIED;
3874 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3875 (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3878 error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
3881 zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
3882 snm, tdzp, tnm, szp);
3885 * Update path information for the target vnode
3887 vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
3890 * At this point, we have successfully created
3891 * the target name, but have failed to remove
3892 * the source name. Since the create was done
3893 * with the ZRENAMING flag, there are
3894 * complications; for one, the link count is
3895 * wrong. The easiest way to deal with this
3896 * is to remove the newly created target, and
3897 * return the original error. This must
3898 * succeed; fortunately, it is very unlikely to
3899 * fail, since we just created it.
3901 VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
3902 ZRENAMING, NULL), ==, 0);
/* The new name may shadow cached negative entries in the target dir. */
3909 cache_purge_negative(tdvp);
3915 unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */
3917 VOP_UNLOCK(*svpp, 0);
3918 VOP_UNLOCK(sdvp, 0);
3920 out: /* original two vnodes are locked */
3921 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3922 zil_commit(zilog, 0);
3925 VOP_UNLOCK(*tvpp, 0);
3927 VOP_UNLOCK(tdvp, 0);
3932 * Insert the indicated symbolic reference entry into the directory.
3934 * IN: dvp - Directory to contain new symbolic link.
3935 * link - Name for new symlink entry.
3936 * vap - Attributes of new entry.
3937 * cr - credentials of caller.
3938 * ct - caller context
3939 * flags - case flags
3941 * RETURN: 0 on success, error code on failure.
3944 * dvp - ctime|mtime updated
/*
 * Create symlink `name` in directory dvp whose target path is `link`
 * (contract documented in the block comment above).  Validates UTF-8
 * and path length, creates ACL ids, checks for an existing entry and
 * ADD_FILE permission, reserves a vnode, builds and assigns the DMU
 * transaction, creates the znode, stores the link text (as an SA
 * attribute or via zfs_sa_symlink for older datasets), inserts the
 * directory entry, and logs TX_SYMLINK.
 *
 * NOTE(review): error-path braces and some cleanup lines are elided by
 * this extraction.
 */
3948 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
3949 cred_t *cr, kthread_t *td)
3951 znode_t *zp, *dzp = VTOZ(dvp);
3953 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3955 uint64_t len = strlen(link);
3957 zfs_acl_ids_t acl_ids;
3958 boolean_t fuid_dirtied;
3959 uint64_t txtype = TX_SYMLINK;
3962 ASSERT(vap->va_type == VLNK);
3966 zilog = zfsvfs->z_log;
3968 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3969 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3971 return (SET_ERROR(EILSEQ));
3974 if (len > MAXPATHLEN) {
3976 return (SET_ERROR(ENAMETOOLONG));
3979 if ((error = zfs_acl_ids_create(dzp, 0,
3980 vap, cr, NULL, &acl_ids)) != 0) {
3986 * Attempt to lock directory; fail if entry already exists.
3988 error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
3990 zfs_acl_ids_free(&acl_ids);
3995 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3996 zfs_acl_ids_free(&acl_ids);
4001 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4002 zfs_acl_ids_free(&acl_ids);
4004 return (SET_ERROR(EDQUOT));
/* Reserve a vnode up front so allocation cannot fail inside the tx. */
4007 getnewvnode_reserve(1);
4008 tx = dmu_tx_create(zfsvfs->z_os);
4009 fuid_dirtied = zfsvfs->z_fuid_dirty;
4010 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4011 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4012 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4013 ZFS_SA_BASE_ATTR_SIZE + len);
4014 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4015 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4016 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4017 acl_ids.z_aclp->z_acl_bytes);
4020 zfs_fuid_txhold(zfsvfs, tx);
4021 error = dmu_tx_assign(tx, TXG_WAIT);
4023 zfs_acl_ids_free(&acl_ids);
4025 getnewvnode_drop_reserve();
4031 * Create a new object for the symlink.
4032 * for version 4 ZPL datsets the symlink will be an SA attribute
4034 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4037 zfs_fuid_sync(zfsvfs, tx);
4040 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4043 zfs_sa_symlink(zp, link, len, tx);
4046 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4047 &zp->z_size, sizeof (zp->z_size), tx);
4049 * Insert the new object into the directory.
4051 (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
4053 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4056 zfs_acl_ids_free(&acl_ids);
4060 getnewvnode_drop_reserve();
4062 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4063 zil_commit(zilog, 0);
4070 * Return, in the buffer contained in the provided uio structure,
4071 * the symbolic path referred to by vp.
4073 * IN: vp - vnode of symbolic link.
4074 * uio - structure to contain the link path.
4075 * cr - credentials of caller.
4076 * ct - caller context
4078 * OUT: uio - structure containing the link path.
4080 * RETURN: 0 on success, error code on failure.
4083 * vp - atime updated
/*
 * Copy the symlink target of vp into the caller's uio (contract in the
 * block comment above).  The link text lives either in the SA_ZPL_SYMLINK
 * system attribute (newer ZPL versions) or in the object data, handled by
 * zfs_sa_readlink; updates atime on the way out.
 */
4087 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4089 znode_t *zp = VTOZ(vp);
4090 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4097 error = sa_lookup_uio(zp->z_sa_hdl,
4098 SA_ZPL_SYMLINK(zfsvfs), uio);
/* Fallback path for links stored outside the SA bonus area. */
4100 error = zfs_sa_readlink(zp, uio);
4102 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4109 * Insert a new entry into directory tdvp referencing svp.
4111 * IN: tdvp - Directory to contain new entry.
4112 * svp - vnode of new entry.
4113 * name - name of new entry.
4114 * cr - credentials of caller.
4115 * ct - caller context
4117 * RETURN: 0 on success, error code on failure.
4120 * tdvp - ctime|mtime updated
4121 * svp - ctime updated
/*
 * Create hard link `name` in directory tdvp referencing svp (contract in
 * the block comment above).  Rejects directories (EPERM per POSIX),
 * append-only/immutable/read-only sources, links to .zfs/shares files,
 * cross-namespace links between attribute and normal space, and callers
 * who neither own the file nor hold link privilege; then creates the
 * entry inside a DMU transaction and logs TX_LINK.
 *
 * NOTE(review): some error-path braces/ZFS_EXIT lines are elided by this
 * extraction.
 */
4125 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4126 caller_context_t *ct, int flags)
4128 znode_t *dzp = VTOZ(tdvp);
4130 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
4137 ASSERT(tdvp->v_type == VDIR);
4141 zilog = zfsvfs->z_log;
4144 * POSIX dictates that we return EPERM here.
4145 * Better choices include ENOTSUP or EISDIR.
4147 if (svp->v_type == VDIR) {
4149 return (SET_ERROR(EPERM));
4155 if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
4157 return (SET_ERROR(EPERM));
4160 /* Prevent links to .zfs/shares files */
4162 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4163 &parent, sizeof (uint64_t))) != 0) {
4167 if (parent == zfsvfs->z_shares_dir) {
4169 return (SET_ERROR(EPERM));
4172 if (zfsvfs->z_utf8 && u8_validate(name,
4173 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4175 return (SET_ERROR(EILSEQ));
4179 * We do not support links between attributes and non-attributes
4180 * because of the potential security risk of creating links
4181 * into "normal" file space in order to circumvent restrictions
4182 * imposed in attribute space.
4184 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4186 return (SET_ERROR(EINVAL));
4190 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4191 if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4193 return (SET_ERROR(EPERM));
4196 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4202 * Attempt to lock directory; fail if entry already exists.
4204 error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
4210 tx = dmu_tx_create(zfsvfs->z_os);
4211 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4212 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4213 zfs_sa_upgrade_txholds(tx, szp);
4214 zfs_sa_upgrade_txholds(tx, dzp);
4215 error = dmu_tx_assign(tx, TXG_WAIT);
4222 error = zfs_link_create(dzp, name, szp, tx, 0);
4225 uint64_t txtype = TX_LINK;
4226 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4232 vnevent_link(svp, ct);
4235 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4236 zil_commit(zilog, 0);
/*
 * Last-close (inactive) hook for a ZFS vnode.  Under the teardown
 * read lock: bail out if the filesystem was unmounted or the file no
 * longer exists, recycle immediately if the file is unlinked, and
 * otherwise flush a dirty cached atime back to the SA_ZPL_ATIME
 * attribute in its own transaction.
 */
4245 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4247 znode_t *zp = VTOZ(vp);
4248 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4251 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4252 if (zp->z_sa_hdl == NULL) {
4254 * The fs has been unmounted, or we did a
4255 * suspend/resume and this file no longer exists.
4257 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4262 if (zp->z_unlinked) {
4264 * Fast path to recycle a vnode of a removed file.
4266 rw_exit(&zfsvfs->z_teardown_inactive_lock);
/* Write back a cached-but-dirty atime before the vnode goes inactive. */
4271 if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4272 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4274 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4275 zfs_sa_upgrade_txholds(tx, zp);
4276 error = dmu_tx_assign(tx, TXG_WAIT);
4280 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4281 (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4282 zp->z_atime_dirty = 0;
4286 rw_exit(&zfsvfs->z_teardown_inactive_lock);
/* Compile-time checks: both ZFS file-ID layouts must fit in a generic fid. */
4290 CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
4291 CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
/*
 * Encode a file identifier for vp into *fidp: object number plus
 * generation (fetched from SA_ZPL_GEN), byte-serialized little-endian.
 * Snapshots (z_parent != zfsvfs) get the long form which also embeds the
 * objset id; otherwise the short form is used.  Returns ENOSPC (with the
 * required length stored) if the caller's fid is too small.
 */
4295 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4297 znode_t *zp = VTOZ(vp);
4298 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4301 uint64_t object = zp->z_id;
4308 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4309 &gen64, sizeof (uint64_t))) != 0) {
4314 gen = (uint32_t)gen64;
4316 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4319 if (fidp->fid_len < size) {
4320 fidp->fid_len = size;
4322 return (SET_ERROR(ENOSPC));
4325 fidp->fid_len = size;
4328 zfid = (zfid_short_t *)fidp;
4330 zfid->zf_len = size;
/* Serialize the object number one byte at a time (endian-independent). */
4332 for (i = 0; i < sizeof (zfid->zf_object); i++)
4333 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4335 /* Must have a non-zero generation number to distinguish from .zfs */
4338 for (i = 0; i < sizeof (zfid->zf_gen); i++)
4339 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4341 if (size == LONG_FID_LEN) {
4342 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
4345 zlfid = (zfid_long_t *)fidp;
4347 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4348 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4350 /* XXX - this should be the generation number for the objset */
4351 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4352 zlfid->zf_setgen[i] = 0;
/*
 * pathconf(2)-style query dispatcher for ZFS vnodes: returns per-name
 * filesystem limits/capabilities in *valp.  Unsupported names return
 * EOPNOTSUPP.  Several cases here are conditional on illumos-only
 * features (see the #endif below).
 *
 * NOTE(review): switch framing, some case labels, and several break
 * statements are elided by this extraction.
 */
4360 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4361 caller_context_t *ct)
4369 *valp = MIN(LONG_MAX, ZFS_LINK_MAX);
4372 case _PC_FILESIZEBITS:
4376 case _PC_XATTR_EXISTS:
4378 zfsvfs = zp->z_zfsvfs;
4382 error = zfs_dirent_lookup(zp, "", &xzp,
4383 ZXATTR | ZEXISTS | ZSHARED);
4385 if (!zfs_dirempty(xzp))
4388 } else if (error == ENOENT) {
4390 * If there aren't extended attributes, it's the
4391 * same as having zero of them.
4398 case _PC_SATTR_ENABLED:
4399 case _PC_SATTR_EXISTS:
4400 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4401 (vp->v_type == VREG || vp->v_type == VDIR);
4404 case _PC_ACCESS_FILTERING:
4405 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4409 case _PC_ACL_ENABLED:
4410 *valp = _ACL_ACE_ENABLED;
4412 #endif /* illumos */
4413 case _PC_MIN_HOLE_SIZE:
4414 *valp = (int)SPA_MINBLOCKSIZE;
4417 case _PC_TIMESTAMP_RESOLUTION:
4418 /* nanosecond timestamp resolution */
4422 case _PC_ACL_EXTENDED:
4430 case _PC_ACL_PATH_MAX:
4431 *valp = ACL_MAX_ENTRIES;
4435 return (EOPNOTSUPP);
/*
 * Fetch the ACL of vp into *vsecp via zfs_getacl.  ATTR_NOACLCHECK in
 * `flag` skips the ACL permission check (used by NFSv4 ACL operations).
 */
4441 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4442 caller_context_t *ct)
4444 znode_t *zp = VTOZ(vp);
4445 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4447 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4451 error = zfs_getacl(zp, vsecp, skipaclchk, cr);
/*
 * Replace the ACL of vp from *vsecp via zfs_setacl, honoring
 * ATTR_NOACLCHECK like zfs_getsecattr; commits the ZIL immediately when
 * the dataset runs with sync=always.
 */
4459 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4460 caller_context_t *ct)
4462 znode_t *zp = VTOZ(vp);
4463 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4465 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4466 zilog_t *zilog = zfsvfs->z_log;
4471 error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4473 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4474 zil_commit(zilog, 0);
/*
 * VM pager "get pages" path: fill the busy pages in ma[] (plus up to
 * *rbehind read-behind and *rahead read-ahead pages) from the DMU.
 * Takes a ZFS range lock rounded to the file's block size — retrying if
 * the block size grew between reads — clamps the request to the vnode
 * pager size, and delegates the actual I/O to dmu_read_pages().
 * Returns a zfs_vm_pagerret_* status.
 */
4481 zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
4484 znode_t *zp = VTOZ(vp);
4485 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4486 objset_t *os = zp->z_zfsvfs->z_os;
4489 off_t start, end, obj_size;
4491 int pgsin_b, pgsin_a;
4497 start = IDX_TO_OFF(ma[0]->pindex);
4498 end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
4501 * Lock a range covering all required and optional pages.
4502 * Note that we need to handle the case of the block size growing.
4505 blksz = zp->z_blksz;
4506 rl = zfs_range_lock(zp, rounddown(start, blksz),
4507 roundup(end, blksz) - rounddown(start, blksz), RL_READER);
/* If z_blksz changed while acquiring the lock, the rounding is stale. */
4508 if (blksz == zp->z_blksz)
4510 zfs_range_unlock(rl);
4513 object = ma[0]->object;
4514 zfs_vmobject_wlock(object);
4515 obj_size = object->un_pager.vnp.vnp_size;
4516 zfs_vmobject_wunlock(object);
4517 if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
4518 zfs_range_unlock(rl);
4520 return (zfs_vm_pagerret_bad);
4524 if (rbehind != NULL) {
4525 pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
4526 pgsin_b = MIN(*rbehind, pgsin_b);
4530 if (rahead != NULL) {
4531 pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
4532 if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
4533 pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
4534 pgsin_a = MIN(*rahead, pgsin_a);
4538 * NB: we need to pass the exact byte size of the data that we expect
4539 * to read after accounting for the file size. This is required because
4540 * ZFS will panic if we request DMU to read beyond the end of the last
4543 error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
4544 MIN(end, obj_size) - (end - PAGE_SIZE));
4546 zfs_range_unlock(rl);
4547 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4551 return (zfs_vm_pagerret_error);
4553 VM_CNT_INC(v_vnodein);
4554 VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
4555 if (rbehind != NULL)
4559 return (zfs_vm_pagerret_ok);
/* VOP_GETPAGES entry point: unpack the args struct and call zfs_getpages. */
4563 zfs_freebsd_getpages(ap)
4564 struct vop_getpages_args /* {
4573 return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
/*
 * VM pager "put pages" path: write the dirty pages in ma[] back to the
 * DMU.  Takes a writer range lock rounded to the block size, trims the
 * request to the vnode pager size (clearing the dirty bits past EOF on
 * the final partial page), checks quota, and writes either page-by-page
 * via zfs_map_page/dmu_write (block size < PAGE_SIZE) or in bulk via
 * dmu_write_pages.  On success it updates mtime/ctime/flags, logs
 * TX_WRITE, marks the pages clean, and fills rtvals[] with per-page
 * zfs_vm_pagerret_* statuses.  The ZIL is committed for sync/invalidate
 * putpages or sync=always datasets.
 *
 * NOTE(review): tx-commit/abort lines and some braces are elided by this
 * extraction.
 */
4578 zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
4581 znode_t *zp = VTOZ(vp);
4582 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4591 vm_ooffset_t lo_off;
4602 object = vp->v_object;
4606 KASSERT(ma[0]->object == object, ("mismatching object"));
4607 KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
/* Default every page to error; successes are overwritten below. */
4609 for (i = 0; i < pcount; i++)
4610 rtvals[i] = zfs_vm_pagerret_error;
4612 off = IDX_TO_OFF(ma[0]->pindex);
4613 blksz = zp->z_blksz;
4614 lo_off = rounddown(off, blksz);
4615 lo_len = roundup(len + (off - lo_off), blksz);
4616 rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);
4618 zfs_vmobject_wlock(object);
4619 if (len + off > object->un_pager.vnp.vnp_size) {
4620 if (object->un_pager.vnp.vnp_size > off) {
4623 len = object->un_pager.vnp.vnp_size - off;
4625 if ((pgoff = (int)len & PAGE_MASK) != 0) {
4627 * If the object is locked and the following
4628 * conditions hold, then the page's dirty
4629 * field cannot be concurrently changed by a
4633 vm_page_assert_sbusied(m);
4634 KASSERT(!pmap_page_is_write_mapped(m),
4635 ("zfs_putpages: page %p is not read-only", m));
4636 vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
4643 if (ncount < pcount) {
4644 for (i = ncount; i < pcount; i++) {
4645 rtvals[i] = zfs_vm_pagerret_bad;
4649 zfs_vmobject_wunlock(object);
4654 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4655 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4659 tx = dmu_tx_create(zfsvfs->z_os);
4660 dmu_tx_hold_write(tx, zp->z_id, off, len);
4662 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4663 zfs_sa_upgrade_txholds(tx, zp);
4664 err = dmu_tx_assign(tx, TXG_WAIT);
/* Sub-page blocks must be copied through a mapped window, one page at a time. */
4670 if (zp->z_blksz < PAGE_SIZE) {
4671 for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
4672 tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
4673 va = zfs_map_page(ma[i], &sf);
4674 dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
4678 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
4682 uint64_t mtime[2], ctime[2];
4683 sa_bulk_attr_t bulk[3];
4686 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4688 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4690 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4692 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4694 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4696 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4698 zfs_vmobject_wlock(object);
4699 for (i = 0; i < ncount; i++) {
4700 rtvals[i] = zfs_vm_pagerret_ok;
4701 vm_page_undirty(ma[i]);
4703 zfs_vmobject_wunlock(object);
4704 VM_CNT_INC(v_vnodeout);
4705 VM_CNT_ADD(v_vnodepgsout, ncount);
4710 zfs_range_unlock(rl);
4711 if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
4712 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4713 zil_commit(zfsvfs->z_log, zp->z_id);
/*
 * VOP_PUTPAGES entry point: delegate page-out to zfs_putpages().
 */
4719 zfs_freebsd_putpages(ap)
4720 struct vop_putpages_args /* {
4729 return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
/*
 * VOP_BMAP: ZFS has no meaningful logical-to-physical block mapping for
 * the buffer cache, so report an identity mapping with no read-ahead /
 * read-behind runs.
 */
4734 zfs_freebsd_bmap(ap)
4735 struct vop_bmap_args /* {
4738 struct bufobj **a_bop;
4745 if (ap->a_bop != NULL)
4746 *ap->a_bop = &ap->a_vp->v_bufobj;
4747 if (ap->a_bnp != NULL)
4748 *ap->a_bnp = ap->a_bn;
4749 if (ap->a_runp != NULL)
4751 if (ap->a_runb != NULL)
/*
 * VOP_OPEN: call zfs_open() and, on success, ensure the vnode has a VM
 * object sized to the file so mmap/pager operations work.
 */
4758 zfs_freebsd_open(ap)
4759 struct vop_open_args /* {
4762 struct ucred *a_cred;
4763 struct thread *a_td;
4766 vnode_t *vp = ap->a_vp;
4767 znode_t *zp = VTOZ(vp);
4770 error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
4772 vnode_create_vobject(vp, zp->z_size, ap->a_td);
/*
 * VOP_CLOSE: delegate to zfs_close() (count 1, offset 0).
 */
4777 zfs_freebsd_close(ap)
4778 struct vop_close_args /* {
4781 struct ucred *a_cred;
4782 struct thread *a_td;
4786 return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
/*
 * VOP_IOCTL: delegate to zfs_ioctl(), passing the data pointer as intptr_t.
 */
4790 zfs_freebsd_ioctl(ap)
4791 struct vop_ioctl_args /* {
4801 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4802 ap->a_fflag, ap->a_cred, NULL, NULL));
/*
 * Translate FreeBSD IO_* flags into the Solaris-style F* flags that the
 * common zfs_read()/zfs_write() code expects.
 */
4806 ioflags(int ioflags)
4810 if (ioflags & IO_APPEND)
4812 if (ioflags & IO_NDELAY)
4814 if (ioflags & IO_SYNC)
4815 flags |= (FSYNC | FDSYNC | FRSYNC);
/*
 * VOP_READ: delegate to zfs_read() with translated ioflags.
 */
4821 zfs_freebsd_read(ap)
4822 struct vop_read_args /* {
4826 struct ucred *a_cred;
4830 return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
/*
 * VOP_WRITE: delegate to zfs_write() with translated ioflags.
 */
4835 zfs_freebsd_write(ap)
4836 struct vop_write_args /* {
4840 struct ucred *a_cred;
4844 return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
/*
 * VOP_ACCESS: split the access check between ZFS and the generic VFS.
 * ZFS handles VREAD/VWRITE/VEXEC/VAPPEND via zfs_access(); any remaining
 * bits (notably VADMIN) go through vaccess() using the cached POSIX
 * mode/uid/gid. Finally, emulate the UNIX requirement that executing a
 * regular file needs at least one execute bit even for the superuser.
 */
4849 zfs_freebsd_access(ap)
4850 struct vop_access_args /* {
4852 accmode_t a_accmode;
4853 struct ucred *a_cred;
4854 struct thread *a_td;
4857 vnode_t *vp = ap->a_vp;
4858 znode_t *zp = VTOZ(vp);
4863 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
4865 accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4867 error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
4870 * VADMIN has to be handled by vaccess().
4873 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4875 error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
4876 zp->z_gid, accmode, ap->a_cred, NULL);
4881 * For VEXEC, ensure that at least one execute bit is set for
4884 if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
4885 (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
/*
 * VOP_LOOKUP (uncached path): copy the component name into a
 * NUL-terminated stack buffer (cn_nameptr is not NUL-terminated at
 * cn_namelen) and delegate to zfs_lookup().
 */
4893 zfs_freebsd_lookup(ap)
4894 struct vop_lookup_args /* {
4895 struct vnode *a_dvp;
4896 struct vnode **a_vpp;
4897 struct componentname *a_cnp;
4900 struct componentname *cnp = ap->a_cnp;
4901 char nm[NAME_MAX + 1];
4903 ASSERT(cnp->cn_namelen < sizeof(nm));
4904 strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
4906 return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4907 cnp->cn_cred, cnp->cn_thread, 0));
/*
 * VOP_LOOKUP dispatcher: use the VFS name cache when the filesystem has
 * it enabled (z_use_namecache), otherwise fall through to the direct
 * zfs_freebsd_lookup() path.
 */
4911 zfs_cache_lookup(ap)
4912 struct vop_lookup_args /* {
4913 struct vnode *a_dvp;
4914 struct vnode **a_vpp;
4915 struct componentname *a_cnp;
4920 zfsvfs = ap->a_dvp->v_mount->mnt_data;
4921 if (zfsvfs->z_use_namecache)
4922 return (vfs_cache_lookup(ap));
4924 return (zfs_freebsd_lookup(ap));
/*
 * VOP_CREATE (also used for VOP_MKNOD): normalize the vattr mask, create
 * the file via zfs_create(), and enter the result into the name cache
 * when namecache use is enabled and the caller asked for MAKEENTRY.
 */
4928 zfs_freebsd_create(ap)
4929 struct vop_create_args /* {
4930 struct vnode *a_dvp;
4931 struct vnode **a_vpp;
4932 struct componentname *a_cnp;
4933 struct vattr *a_vap;
4937 struct componentname *cnp = ap->a_cnp;
4938 vattr_t *vap = ap->a_vap;
4941 ASSERT(cnp->cn_flags & SAVENAME);
4943 vattr_init_mask(vap);
4944 mode = vap->va_mode & ALLPERMS;
4945 zfsvfs = ap->a_dvp->v_mount->mnt_data;
4947 error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
4948 ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
4949 if (zfsvfs->z_use_namecache &&
4950 error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
4951 cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
/*
 * VOP_REMOVE: delegate to zfs_remove().
 */
4956 zfs_freebsd_remove(ap)
4957 struct vop_remove_args /* {
4958 struct vnode *a_dvp;
4960 struct componentname *a_cnp;
4964 ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4966 return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
4967 ap->a_cnp->cn_cred));
/*
 * VOP_MKDIR: normalize the vattr mask and delegate to zfs_mkdir().
 */
4971 zfs_freebsd_mkdir(ap)
4972 struct vop_mkdir_args /* {
4973 struct vnode *a_dvp;
4974 struct vnode **a_vpp;
4975 struct componentname *a_cnp;
4976 struct vattr *a_vap;
4979 vattr_t *vap = ap->a_vap;
4981 ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4983 vattr_init_mask(vap);
4985 return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
4986 ap->a_cnp->cn_cred));
/*
 * VOP_RMDIR: delegate to zfs_rmdir().
 */
4990 zfs_freebsd_rmdir(ap)
4991 struct vop_rmdir_args /* {
4992 struct vnode *a_dvp;
4994 struct componentname *a_cnp;
4997 struct componentname *cnp = ap->a_cnp;
4999 ASSERT(cnp->cn_flags & SAVENAME);
5001 return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
/*
 * VOP_READDIR: delegate to zfs_readdir(), passing through eofflag and
 * the NFS cookie arguments.
 */
5005 zfs_freebsd_readdir(ap)
5006 struct vop_readdir_args /* {
5009 struct ucred *a_cred;
5016 return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5017 ap->a_ncookies, ap->a_cookies));
/*
 * VOP_FSYNC: delegate to zfs_fsync() with the calling thread's credentials.
 */
5021 zfs_freebsd_fsync(ap)
5022 struct vop_fsync_args /* {
5025 struct thread *a_td;
5030 return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
/*
 * VOP_GETATTR: fetch attributes via zfs_getattr() using an xvattr so the
 * ZFS extended (DOS/system) attributes are returned as well, then map
 * each ZFS xoptattr onto the corresponding FreeBSD chflags(2) flag and
 * store the result in va_flags.
 */
5034 zfs_freebsd_getattr(ap)
5035 struct vop_getattr_args /* {
5037 struct vattr *a_vap;
5038 struct ucred *a_cred;
5041 vattr_t *vap = ap->a_vap;
5047 xvap.xva_vattr = *vap;
5048 xvap.xva_vattr.va_mask |= AT_XVATTR;
5050 /* Convert chflags into ZFS-type flags. */
5051 /* XXX: what about SF_SETTABLE?. */
5052 XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
5053 XVA_SET_REQ(&xvap, XAT_APPENDONLY);
5054 XVA_SET_REQ(&xvap, XAT_NOUNLINK);
5055 XVA_SET_REQ(&xvap, XAT_NODUMP);
5056 XVA_SET_REQ(&xvap, XAT_READONLY);
5057 XVA_SET_REQ(&xvap, XAT_ARCHIVE);
5058 XVA_SET_REQ(&xvap, XAT_SYSTEM);
5059 XVA_SET_REQ(&xvap, XAT_HIDDEN);
5060 XVA_SET_REQ(&xvap, XAT_REPARSE);
5061 XVA_SET_REQ(&xvap, XAT_OFFLINE);
5062 XVA_SET_REQ(&xvap, XAT_SPARSE);
5064 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
5068 /* Convert ZFS xattr into chflags. */
5069 #define FLAG_CHECK(fflag, xflag, xfield) do { \
5070 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \
5071 fflags |= (fflag); \
5073 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
5074 xvap.xva_xoptattrs.xoa_immutable);
5075 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
5076 xvap.xva_xoptattrs.xoa_appendonly);
5077 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
5078 xvap.xva_xoptattrs.xoa_nounlink);
5079 FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
5080 xvap.xva_xoptattrs.xoa_archive);
5081 FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
5082 xvap.xva_xoptattrs.xoa_nodump);
5083 FLAG_CHECK(UF_READONLY, XAT_READONLY,
5084 xvap.xva_xoptattrs.xoa_readonly);
5085 FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
5086 xvap.xva_xoptattrs.xoa_system);
5087 FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
5088 xvap.xva_xoptattrs.xoa_hidden);
5089 FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
5090 xvap.xva_xoptattrs.xoa_reparse);
5091 FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
5092 xvap.xva_xoptattrs.xoa_offline);
5093 FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
5094 xvap.xva_xoptattrs.xoa_sparse);
5097 *vap = xvap.xva_vattr;
5098 vap->va_flags = fflags;
/*
 * VOP_SETATTR: translate a FreeBSD vattr into a ZFS xvattr and apply it
 * via zfs_setattr().
 *
 * The bulk of the code handles va_flags (chflags(2)): it validates that
 * only supported flag bits are present, enforces the system-flag privilege
 * policy (securelevel / jail rules for SF_* flags), requires VADMIN on the
 * vnode, and converts each changed flag into the corresponding ZFS
 * xoptattr request. va_birthtime is mapped to XAT_CREATETIME.
 */
5103 zfs_freebsd_setattr(ap)
5104 struct vop_setattr_args /* {
5106 struct vattr *a_vap;
5107 struct ucred *a_cred;
5110 vnode_t *vp = ap->a_vp;
5111 vattr_t *vap = ap->a_vap;
5112 cred_t *cred = ap->a_cred;
5117 vattr_init_mask(vap);
5118 vap->va_mask &= ~AT_NOSET;
5121 xvap.xva_vattr = *vap;
5123 zflags = VTOZ(vp)->z_pflags;
5125 if (vap->va_flags != VNOVAL) {
5126 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
/* Extended flags require FUID-capable (modern) pool/dataset support. */
5129 if (zfsvfs->z_use_fuids == B_FALSE)
5130 return (EOPNOTSUPP);
5132 fflags = vap->va_flags;
5135 * We need to figure out whether it makes sense to allow
5136 * UF_REPARSE through, since we don't really have other
5137 * facilities to handle reparse points and zfs_setattr()
5138 * doesn't currently allow setting that attribute anyway.
5140 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
5141 UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
5142 UF_OFFLINE|UF_SPARSE)) != 0)
5143 return (EOPNOTSUPP);
5145 * Unprivileged processes are not permitted to unset system
5146 * flags, or modify flags if any system flags are set.
5147 * Privileged non-jail processes may not modify system flags
5148 * if securelevel > 0 and any existing system flags are set.
5149 * Privileged jail processes behave like privileged non-jail
5150 * processes if the PR_ALLOW_CHFLAGS permission bit is set;
5151 * otherwise, they behave like unprivileged processes.
5153 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
5154 priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) {
5156 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5157 error = securelevel_gt(cred, 0);
5163 * Callers may only modify the file flags on objects they
5164 * have VADMIN rights for.
5166 if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
5169 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5173 (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
/* Request an xoptattr update only for flags whose state changes. */
5178 #define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \
5179 if (((fflags & (fflag)) && !(zflags & (zflag))) || \
5180 ((zflags & (zflag)) && !(fflags & (fflag)))) { \
5181 XVA_SET_REQ(&xvap, (xflag)); \
5182 (xfield) = ((fflags & (fflag)) != 0); \
5185 /* Convert chflags into ZFS-type flags. */
5186 /* XXX: what about SF_SETTABLE?. */
5187 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
5188 xvap.xva_xoptattrs.xoa_immutable);
5189 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5190 xvap.xva_xoptattrs.xoa_appendonly);
5191 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5192 xvap.xva_xoptattrs.xoa_nounlink);
5193 FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
5194 xvap.xva_xoptattrs.xoa_archive);
5195 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5196 xvap.xva_xoptattrs.xoa_nodump);
5197 FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
5198 xvap.xva_xoptattrs.xoa_readonly);
5199 FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
5200 xvap.xva_xoptattrs.xoa_system);
5201 FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
5202 xvap.xva_xoptattrs.xoa_hidden);
5203 FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
5204 xvap.xva_xoptattrs.xoa_reparse);
5205 FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
5206 xvap.xva_xoptattrs.xoa_offline);
5207 FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
5208 xvap.xva_xoptattrs.xoa_sparse);
5211 if (vap->va_birthtime.tv_sec != VNOVAL) {
5212 xvap.xva_vattr.va_mask |= AT_XVATTR;
5213 XVA_SET_REQ(&xvap, XAT_CREATETIME);
5215 return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
/*
 * VOP_RENAME: delegate to zfs_rename(), which takes over the vnode
 * references passed in by the VFS.
 */
5219 zfs_freebsd_rename(ap)
5220 struct vop_rename_args /* {
5221 struct vnode *a_fdvp;
5222 struct vnode *a_fvp;
5223 struct componentname *a_fcnp;
5224 struct vnode *a_tdvp;
5225 struct vnode *a_tvp;
5226 struct componentname *a_tcnp;
5229 vnode_t *fdvp = ap->a_fdvp;
5230 vnode_t *fvp = ap->a_fvp;
5231 vnode_t *tdvp = ap->a_tdvp;
5232 vnode_t *tvp = ap->a_tvp;
5235 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
5236 ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
5238 error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
5239 ap->a_tcnp, ap->a_fcnp->cn_cred);
/*
 * VOP_SYMLINK: force va_type to VLNK (the syscall only sets va_mode),
 * normalize the vattr mask, and delegate to zfs_symlink().
 */
5251 zfs_freebsd_symlink(ap)
5252 struct vop_symlink_args /* {
5253 struct vnode *a_dvp;
5254 struct vnode **a_vpp;
5255 struct componentname *a_cnp;
5256 struct vattr *a_vap;
5260 struct componentname *cnp = ap->a_cnp;
5261 vattr_t *vap = ap->a_vap;
5263 ASSERT(cnp->cn_flags & SAVENAME);
5265 vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */
5266 vattr_init_mask(vap);
5268 return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
5269 __DECONST(char *, ap->a_target), cnp->cn_cred, cnp->cn_thread));
/*
 * VOP_READLINK: delegate to zfs_readlink().
 */
5273 zfs_freebsd_readlink(ap)
5274 struct vop_readlink_args /* {
5277 struct ucred *a_cred;
5281 return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
/*
 * VOP_LINK: reject cross-mount links, then delegate to zfs_link().
 */
5285 zfs_freebsd_link(ap)
5286 struct vop_link_args /* {
5287 struct vnode *a_tdvp;
5289 struct componentname *a_cnp;
5292 struct componentname *cnp = ap->a_cnp;
5293 vnode_t *vp = ap->a_vp;
5294 vnode_t *tdvp = ap->a_tdvp;
5296 if (tdvp->v_mount != vp->v_mount)
5299 ASSERT(cnp->cn_flags & SAVENAME);
5301 return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
/*
 * VOP_INACTIVE: delegate to zfs_inactive() with the thread's credentials.
 */
5305 zfs_freebsd_inactive(ap)
5306 struct vop_inactive_args /* {
5308 struct thread *a_td;
5311 vnode_t *vp = ap->a_vp;
5313 zfs_inactive(vp, ap->a_td->td_ucred, NULL);
/*
 * VOP_RECLAIM: destroy the znode backing the vnode. The
 * z_teardown_inactive_lock (reader side) guards against a concurrent
 * zfs_znode_dmu_fini() during filesystem teardown; a NULL z_sa_hdl means
 * the SA handle was already torn down.
 */
5318 zfs_freebsd_reclaim(ap)
5319 struct vop_reclaim_args /* {
5321 struct thread *a_td;
5324 vnode_t *vp = ap->a_vp;
5325 znode_t *zp = VTOZ(vp);
5326 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5331 * z_teardown_inactive_lock protects from a race with
5332 * zfs_znode_dmu_fini in zfsvfs_teardown during
5335 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
5336 if (zp->z_sa_hdl == NULL)
5340 rw_exit(&zfsvfs->z_teardown_inactive_lock);
/*
 * VOP_FID: delegate to zfs_fid() to fill in the file handle.
 */
5348 struct vop_fid_args /* {
5354 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
/*
 * VOP_PATHCONF: ask zfs_pathconf() first; for names it does not support
 * (EOPNOTSUPP), answer _PC_NAME_MAX and _PC_PIPE_BUF locally and fall
 * back to vop_stdpathconf() for the rest.
 */
5358 zfs_freebsd_pathconf(ap)
5359 struct vop_pathconf_args /* {
5362 register_t *a_retval;
5368 error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
5370 *ap->a_retval = val;
5373 if (error != EOPNOTSUPP)
5376 switch (ap->a_name) {
5378 *ap->a_retval = NAME_MAX;
5381 if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
5382 *ap->a_retval = PIPE_BUF;
5387 return (vop_stdpathconf(ap));
5392 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
5393 * extended attribute name:
5396 * system freebsd:system:
5397 * user (none, can be used to access ZFS fsattr(5) attributes
5398 * created on Solaris)
/*
 * Build the internal ZFS attribute-file name for a FreeBSD extended
 * attribute: "<prefix><namespace><suffix><name>" written into attrname
 * (size bytes). Rejects names containing '/' and names starting with
 * the reserved "freebsd:" prefix; returns ENAMETOOLONG on overflow.
 */
5401 zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5404 const char *namespace, *prefix, *suffix;
5406 /* We don't allow '/' character in attribute name. */
5407 if (strchr(name, '/') != NULL)
5409 /* We don't allow attribute names that start with "freebsd:" string. */
5410 if (strncmp(name, "freebsd:", 8) == 0)
5413 bzero(attrname, size);
5415 switch (attrnamespace) {
5416 case EXTATTR_NAMESPACE_USER:
5418 prefix = "freebsd:";
5419 namespace = EXTATTR_NAMESPACE_USER_STRING;
5423 * This is the default namespace by which we can access all
5424 * attributes created on Solaris.
5426 prefix = namespace = suffix = "";
5429 case EXTATTR_NAMESPACE_SYSTEM:
5430 prefix = "freebsd:";
5431 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5434 case EXTATTR_NAMESPACE_EMPTY:
5438 if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5440 return (ENAMETOOLONG);
5446 * Vnode operation to retrieve a named extended attribute.
/*
 * VOP_GETEXTATTR: check credentials, build the internal attribute name,
 * look up the hidden extended-attribute directory (LOOKUP_XATTR), open
 * the attribute file read-only, and either report its size (a_size) or
 * read its contents into a_uio. ENOENT from the open is mapped to the
 * "attribute not found" case.
 */
5449 zfs_getextattr(struct vop_getextattr_args *ap)
5452 IN struct vnode *a_vp;
5453 IN int a_attrnamespace;
5454 IN const char *a_name;
5455 INOUT struct uio *a_uio;
5457 IN struct ucred *a_cred;
5458 IN struct thread *a_td;
5462 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5463 struct thread *td = ap->a_td;
5464 struct nameidata nd;
5467 vnode_t *xvp = NULL, *vp;
5470 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5471 ap->a_cred, ap->a_td, VREAD);
5475 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5482 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5490 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5492 error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
5494 NDFREE(&nd, NDF_ONLY_PNBUF);
5497 if (error == ENOENT)
5502 if (ap->a_size != NULL) {
5503 error = VOP_GETATTR(vp, &va, ap->a_cred);
5505 *ap->a_size = (size_t)va.va_size;
5506 } else if (ap->a_uio != NULL)
5507 error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5510 vn_close(vp, flags, ap->a_cred, td);
5517 * Vnode operation to remove a named attribute.
/*
 * VOP_DELETEEXTATTR: check write credentials, build the internal
 * attribute name, look up the hidden extended-attribute directory, then
 * locate and VOP_REMOVE() the attribute file. ENOENT from the lookup is
 * mapped to the "attribute not found" case.
 */
5520 zfs_deleteextattr(struct vop_deleteextattr_args *ap)
5523 IN struct vnode *a_vp;
5524 IN int a_attrnamespace;
5525 IN const char *a_name;
5526 IN struct ucred *a_cred;
5527 IN struct thread *a_td;
5531 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5532 struct thread *td = ap->a_td;
5533 struct nameidata nd;
5536 vnode_t *xvp = NULL, *vp;
5539 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5540 ap->a_cred, ap->a_td, VWRITE);
5544 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5551 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5558 NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
5559 UIO_SYSSPACE, attrname, xvp, td);
5564 NDFREE(&nd, NDF_ONLY_PNBUF);
5565 if (error == ENOENT)
5570 error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
5571 NDFREE(&nd, NDF_ONLY_PNBUF);
5574 if (vp == nd.ni_dvp)
5584 * Vnode operation to set a named attribute.
5587 zfs_setextattr(struct vop_setextattr_args *ap)
5590 IN struct vnode *a_vp;
5591 IN int a_attrnamespace;
5592 IN const char *a_name;
5593 INOUT struct uio *a_uio;
5594 IN struct ucred *a_cred;
5595 IN struct thread *a_td;
5599 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5600 struct thread *td = ap->a_td;
5601 struct nameidata nd;
5604 vnode_t *xvp = NULL, *vp;
5607 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5608 ap->a_cred, ap->a_td, VWRITE);
5612 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5619 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5620 LOOKUP_XATTR | CREATE_XATTR_DIR);
5626 flags = FFLAGS(O_WRONLY | O_CREAT);
5627 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5629 error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
5631 NDFREE(&nd, NDF_ONLY_PNBUF);
5639 error = VOP_SETATTR(vp, &va, ap->a_cred);
5641 VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5644 vn_close(vp, flags, ap->a_cred, td);
5651 * Vnode operation to retrieve extended attributes on a vnode.
/*
 * VOP_LISTEXTATTR: enumerate the hidden extended-attribute directory and
 * emit, for every regular entry whose name starts with the namespace
 * prefix, either its name length (when a_size is requested) or a
 * one-byte-length-prefixed name into a_uio. ENOATTR from the directory
 * lookup means no attributes exist and is not an error.
 */
5654 zfs_listextattr(struct vop_listextattr_args *ap)
5657 IN struct vnode *a_vp;
5658 IN int a_attrnamespace;
5659 INOUT struct uio *a_uio;
5661 IN struct ucred *a_cred;
5662 IN struct thread *a_td;
5666 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5667 struct thread *td = ap->a_td;
5668 struct nameidata nd;
5669 char attrprefix[16];
5670 u_char dirbuf[sizeof(struct dirent)];
5673 struct uio auio, *uio = ap->a_uio;
5674 size_t *sizep = ap->a_size;
5676 vnode_t *xvp = NULL, *vp;
5677 int done, error, eof, pos;
5679 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5680 ap->a_cred, ap->a_td, VREAD);
5684 error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5685 sizeof(attrprefix));
5688 plen = strlen(attrprefix);
5695 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5700 * ENOATTR means that the EA directory does not yet exist,
5701 * i.e. there are no extended attributes there.
5703 if (error == ENOATTR)
5708 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
5709 UIO_SYSSPACE, ".", xvp, td);
5712 NDFREE(&nd, NDF_ONLY_PNBUF);
/* Read the EA directory one dirent buffer at a time until EOF. */
5718 auio.uio_iov = &aiov;
5719 auio.uio_iovcnt = 1;
5720 auio.uio_segflg = UIO_SYSSPACE;
5722 auio.uio_rw = UIO_READ;
5723 auio.uio_offset = 0;
5728 aiov.iov_base = (void *)dirbuf;
5729 aiov.iov_len = sizeof(dirbuf);
5730 auio.uio_resid = sizeof(dirbuf);
5731 error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
5732 done = sizeof(dirbuf) - auio.uio_resid;
5735 for (pos = 0; pos < done;) {
5736 dp = (struct dirent *)(dirbuf + pos);
5737 pos += dp->d_reclen;
5739 * XXX: Temporarily we also accept DT_UNKNOWN, as this
5740 * is what we get when attribute was created on Solaris.
5742 if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
5744 if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
5746 else if (strncmp(dp->d_name, attrprefix, plen) != 0)
5748 nlen = dp->d_namlen - plen;
5751 else if (uio != NULL) {
5753 * Format of extattr name entry is one byte for
5754 * length and the rest for name.
5756 error = uiomove(&nlen, 1, uio->uio_rw, uio);
5758 error = uiomove(dp->d_name + plen, nlen,
5765 } while (!eof && error == 0);
/*
 * VOP_GETACL: fetch the NFSv4 ACL via zfs_getsecattr(), convert it to a
 * FreeBSD acl with acl_from_aces(), and free the ACE buffer. Only
 * ACL_TYPE_NFS4 is supported.
 */
5774 zfs_freebsd_getacl(ap)
5775 struct vop_getacl_args /* {
5784 vsecattr_t vsecattr;
5786 if (ap->a_type != ACL_TYPE_NFS4)
5789 vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
/* Parenthesized assignment per style(9); avoids -Wparentheses. */
5790 if ((error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) != 0)
5793 error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
5794 if (vsecattr.vsa_aclentp != NULL)
5795 kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
/*
 * VOP_SETACL: validate an NFSv4 ACL (entry count bounds, room for
 * chmod(2)'s canonical-six expansion, acl_nfs4_check()), convert it to
 * ZFS aces, and apply it via zfs_setsecattr(). Only ACL_TYPE_NFS4 is
 * supported.
 */
5801 zfs_freebsd_setacl(ap)
5802 struct vop_setacl_args /* {
5811 vsecattr_t vsecattr;
5812 int aclbsize; /* size of acl list in bytes */
5815 if (ap->a_type != ACL_TYPE_NFS4)
5818 if (ap->a_aclp == NULL)
5821 if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5825 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
5826 * splitting every entry into two and appending "canonical six"
5827 * entries at the end. Don't allow for setting an ACL that would
5828 * cause chmod(2) to run out of ACL entries.
5830 if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5833 error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5837 vsecattr.vsa_mask = VSA_ACE;
5838 aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
5839 vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5840 aaclp = vsecattr.vsa_aclentp;
5841 vsecattr.vsa_aclentsz = aclbsize;
5843 aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5844 error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
5845 kmem_free(aaclp, aclbsize);
/*
 * VOP_ACLCHECK: not supported by ZFS.
 */
5851 zfs_freebsd_aclcheck(ap)
5852 struct vop_aclcheck_args /* {
5861 return (EOPNOTSUPP);
/*
 * VOP_VPTOCNP: translate a vnode into its parent vnode and name
 * component. For ordinary znodes, look up the parent and name via
 * zfs_znode_parent_and_name() and copy the name into the caller's
 * buffer (ENOMEM if it does not fit). For a snapshot root mounted
 * under .zfs, rerun the operation on the covered vnode instead,
 * temporarily dropping and re-taking this vnode's lock.
 */
5865 zfs_vptocnp(struct vop_vptocnp_args *ap)
5867 vnode_t *covered_vp;
5868 vnode_t *vp = ap->a_vp;
5869 zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
5870 znode_t *zp = VTOZ(vp);
5879 * If we are a snapshot mounted under .zfs, run the operation
5880 * on the covered vnode.
5882 if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
5883 char name[MAXNAMLEN + 1];
5887 error = zfs_znode_parent_and_name(zp, &dzp, name);
5890 if (*ap->a_buflen < len)
5891 error = SET_ERROR(ENOMEM);
5894 *ap->a_buflen -= len;
5895 bcopy(name, ap->a_buf + *ap->a_buflen, len);
5896 *ap->a_vpp = ZTOV(dzp);
5903 covered_vp = vp->v_mount->mnt_vnodecovered;
5904 vs = vget_prep(covered_vp);
5905 ltype = VOP_ISLOCKED(vp);
5907 error = vget_finish(covered_vp, LK_SHARED, vs);
5909 error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
5910 ap->a_buf, ap->a_buflen);
5913 vn_lock(vp, ltype | LK_RETRY);
/* The vnode may have been doomed while unlocked. */
5914 if ((vp->v_iflag & VI_DOOMED) != 0)
5915 error = SET_ERROR(ENOENT);
/*
 * VOP_LOCK1: standard vnode lock, plus a debugging check that the
 * filesystem-wide teardown lock is not held while locking a regular
 * (non-xattr) vnode on a live mount.
 */
5922 struct vop_lock1_args /* {
5933 err = vop_stdlock(ap);
5934 if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) {
5937 if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 &&
5938 zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
5939 VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
/* Forward declarations of the vnode operation vectors defined below. */
5945 struct vop_vector zfs_vnodeops;
5946 struct vop_vector zfs_fifoops;
5947 struct vop_vector zfs_shareops;
/*
 * Vnode operations vector for regular ZFS files and directories.
 */
5949 struct vop_vector zfs_vnodeops = {
5950 .vop_default = &default_vnodeops,
5951 .vop_inactive = zfs_freebsd_inactive,
5952 .vop_reclaim = zfs_freebsd_reclaim,
5953 .vop_access = zfs_freebsd_access,
5954 .vop_allocate = VOP_EINVAL,
5955 .vop_lookup = zfs_cache_lookup,
5956 .vop_cachedlookup = zfs_freebsd_lookup,
5957 .vop_getattr = zfs_freebsd_getattr,
5958 .vop_setattr = zfs_freebsd_setattr,
5959 .vop_create = zfs_freebsd_create,
5960 .vop_mknod = zfs_freebsd_create,
5961 .vop_mkdir = zfs_freebsd_mkdir,
5962 .vop_readdir = zfs_freebsd_readdir,
5963 .vop_fsync = zfs_freebsd_fsync,
5964 .vop_open = zfs_freebsd_open,
5965 .vop_close = zfs_freebsd_close,
5966 .vop_rmdir = zfs_freebsd_rmdir,
5967 .vop_ioctl = zfs_freebsd_ioctl,
5968 .vop_link = zfs_freebsd_link,
5969 .vop_symlink = zfs_freebsd_symlink,
5970 .vop_readlink = zfs_freebsd_readlink,
5971 .vop_read = zfs_freebsd_read,
5972 .vop_write = zfs_freebsd_write,
5973 .vop_remove = zfs_freebsd_remove,
5974 .vop_rename = zfs_freebsd_rename,
5975 .vop_pathconf = zfs_freebsd_pathconf,
5976 .vop_bmap = zfs_freebsd_bmap,
5977 .vop_fid = zfs_freebsd_fid,
5978 .vop_getextattr = zfs_getextattr,
5979 .vop_deleteextattr = zfs_deleteextattr,
5980 .vop_setextattr = zfs_setextattr,
5981 .vop_listextattr = zfs_listextattr,
5982 .vop_getacl = zfs_freebsd_getacl,
5983 .vop_setacl = zfs_freebsd_setacl,
5984 .vop_aclcheck = zfs_freebsd_aclcheck,
5985 .vop_getpages = zfs_freebsd_getpages,
5986 .vop_putpages = zfs_freebsd_putpages,
5987 .vop_vptocnp = zfs_vptocnp,
5989 .vop_lock1 = zfs_lock,
/*
 * Vnode operations vector for FIFOs on ZFS: data I/O goes through the
 * generic fifo code (read/write must never reach ZFS — VOP_PANIC),
 * while metadata operations stay with ZFS.
 */
5993 struct vop_vector zfs_fifoops = {
5994 .vop_default = &fifo_specops,
5995 .vop_fsync = zfs_freebsd_fsync,
5996 .vop_access = zfs_freebsd_access,
5997 .vop_getattr = zfs_freebsd_getattr,
5998 .vop_inactive = zfs_freebsd_inactive,
5999 .vop_read = VOP_PANIC,
6000 .vop_reclaim = zfs_freebsd_reclaim,
6001 .vop_setattr = zfs_freebsd_setattr,
6002 .vop_write = VOP_PANIC,
6003 .vop_pathconf = zfs_freebsd_pathconf,
6004 .vop_fid = zfs_freebsd_fid,
6005 .vop_getacl = zfs_freebsd_getacl,
6006 .vop_setacl = zfs_freebsd_setacl,
6007 .vop_aclcheck = zfs_freebsd_aclcheck,
6011 * special share hidden files vnode operations template
6013 struct vop_vector zfs_shareops = {
6014 .vop_default = &default_vnodeops,
6015 .vop_access = zfs_freebsd_access,
6016 .vop_inactive = zfs_freebsd_inactive,
6017 .vop_reclaim = zfs_freebsd_reclaim,
6018 .vop_fid = zfs_freebsd_fid,
6019 .vop_pathconf = zfs_freebsd_pathconf,