4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 /* Portions Copyright 2007 Jeremy Teo */
29 #include <sys/types.h>
30 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/sysmacros.h>
34 #include <sys/resource.h>
35 #include <sys/mntent.h>
36 #include <sys/u8_textprep.h>
37 #include <sys/dsl_dataset.h>
39 #include <sys/vnode.h>
42 #include <sys/errno.h>
43 #include <sys/unistd.h>
44 #include <sys/atomic.h>
45 #include <sys/zfs_dir.h>
46 #include <sys/zfs_acl.h>
47 #include <sys/zfs_ioctl.h>
48 #include <sys/zfs_rlock.h>
49 #include <sys/zfs_fuid.h>
50 #include <sys/fs/zfs.h>
51 #include <sys/kidmap.h>
55 #include <sys/refcount.h>
58 #include <sys/zfs_znode.h>
59 #include <sys/refcount.h>
63 /* Used by fstat(1). */
64 SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t),
68 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
69 * turned on when DEBUG is also defined.
76 #define ZNODE_STAT_ADD(stat) ((stat)++)
78 #define ZNODE_STAT_ADD(stat) /* nothing */
79 #endif /* ZNODE_STATS */
81 #define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3))
82 #define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
85 * Functions needed for userland (ie: libzpool) are not put under
86 * #ifdef _KERNEL; the rest of the functions have dependencies
87 * (such as VFS logic) that will not compile easily in userland.
90 static kmem_cache_t *znode_cache = NULL;
94 znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
96 #if 1 /* XXXPJD: From OpenSolaris. */
98 * We should never drop all dbuf refs without first clearing
99 * the eviction callback.
101 panic("evicting znode %p\n", user_ptr);
103 znode_t *zp = user_ptr;
106 mutex_enter(&zp->z_lock);
110 mutex_exit(&zp->z_lock);
112 } else if (vp->v_count == 0) {
115 mutex_exit(&zp->z_lock);
116 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
117 vrecycle(vp, curthread);
122 mutex_exit(&zp->z_lock);
127 extern struct vop_vector zfs_vnodeops;
128 extern struct vop_vector zfs_fifoops;
129 extern struct vop_vector zfs_shareops;
132 * XXX: We cannot use this function as a cache constructor, because
133 * there is one global cache for all file systems and we need
134 * to pass vfsp here, which is not possible, because argument
135 * 'cdrarg' is defined at kmem_cache_create() time.
138 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
145 POINTER_INVALIDATE(&zp->z_zfsvfs);
146 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
149 error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp);
150 if (error != 0 && (kmflags & KM_NOSLEEP))
153 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
155 vp->v_data = (caddr_t)zp;
161 list_link_init(&zp->z_link_node);
163 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
164 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
165 rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
166 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
168 mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
169 avl_create(&zp->z_range_avl, zfs_range_compare,
170 sizeof (rl_t), offsetof(rl_t, r_node));
173 zp->z_dirlocks = NULL;
179 zfs_znode_cache_destructor(void *buf, void *arg)
183 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
184 ASSERT(ZTOV(zp) == NULL);
186 ASSERT(!list_link_active(&zp->z_link_node));
187 mutex_destroy(&zp->z_lock);
188 rw_destroy(&zp->z_parent_lock);
189 rw_destroy(&zp->z_name_lock);
190 mutex_destroy(&zp->z_acl_lock);
191 avl_destroy(&zp->z_range_avl);
192 mutex_destroy(&zp->z_range_lock);
194 ASSERT(zp->z_dbuf == NULL);
195 ASSERT(zp->z_dirlocks == NULL);
200 uint64_t zms_zfsvfs_invalid;
201 uint64_t zms_zfsvfs_unmounted;
202 uint64_t zms_zfsvfs_recheck_invalid;
203 uint64_t zms_obj_held;
204 uint64_t zms_vnode_locked;
205 uint64_t zms_not_only_dnlc;
207 #endif /* ZNODE_STATS */
211 zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
216 nzp->z_zfsvfs = ozp->z_zfsvfs;
220 nzp->z_vnode = ozp->z_vnode;
221 ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
222 ZTOV(ozp)->v_data = ozp;
223 ZTOV(nzp)->v_data = nzp;
225 nzp->z_id = ozp->z_id;
226 ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
227 ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
228 nzp->z_unlinked = ozp->z_unlinked;
229 nzp->z_atime_dirty = ozp->z_atime_dirty;
230 nzp->z_zn_prefetch = ozp->z_zn_prefetch;
231 nzp->z_blksz = ozp->z_blksz;
232 nzp->z_seq = ozp->z_seq;
233 nzp->z_mapcnt = ozp->z_mapcnt;
234 nzp->z_last_itx = ozp->z_last_itx;
235 nzp->z_gen = ozp->z_gen;
236 nzp->z_sync_cnt = ozp->z_sync_cnt;
237 nzp->z_phys = ozp->z_phys;
238 nzp->z_dbuf = ozp->z_dbuf;
240 /* Update back pointers. */
241 (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
245 * Invalidate the original znode by clearing fields that provide a
246 * pointer back to the znode. Set the low bit of the vfs pointer to
247 * ensure that zfs_znode_move() recognizes the znode as invalid in any
248 * subsequent callback.
251 POINTER_INVALIDATE(&ozp->z_zfsvfs);
256 zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
258 znode_t *ozp = buf, *nzp = newbuf;
263 * The znode is on the file system's list of known znodes if the vfs
264 * pointer is valid. We set the low bit of the vfs pointer when freeing
265 * the znode to invalidate it, and the memory patterns written by kmem
266 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
267 * created znode sets the vfs pointer last of all to indicate that the
268 * znode is known and in a valid state to be moved by this function.
270 zfsvfs = ozp->z_zfsvfs;
271 if (!POINTER_IS_VALID(zfsvfs)) {
272 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
273 return (KMEM_CBRC_DONT_KNOW);
277 * Ensure that the filesystem is not unmounted during the move.
278 * This is the equivalent to ZFS_ENTER().
280 rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
281 if (zfsvfs->z_unmounted) {
283 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
284 return (KMEM_CBRC_DONT_KNOW);
287 mutex_enter(&zfsvfs->z_znodes_lock);
289 * Recheck the vfs pointer in case the znode was removed just before
290 * acquiring the lock.
292 if (zfsvfs != ozp->z_zfsvfs) {
293 mutex_exit(&zfsvfs->z_znodes_lock);
295 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid);
296 return (KMEM_CBRC_DONT_KNOW);
300 * At this point we know that as long as we hold z_znodes_lock, the
301 * znode cannot be freed and fields within the znode can be safely
302 * accessed. Now, prevent a race with zfs_zget().
304 if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
305 mutex_exit(&zfsvfs->z_znodes_lock);
307 ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
308 return (KMEM_CBRC_LATER);
312 if (mutex_tryenter(&vp->v_lock) == 0) {
313 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
314 mutex_exit(&zfsvfs->z_znodes_lock);
316 ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
317 return (KMEM_CBRC_LATER);
320 /* Only move znodes that are referenced _only_ by the DNLC. */
321 if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
322 mutex_exit(&vp->v_lock);
323 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
324 mutex_exit(&zfsvfs->z_znodes_lock);
326 ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
327 return (KMEM_CBRC_LATER);
331 * The znode is known and in a valid state to move. We're holding the
332 * locks needed to execute the critical section.
334 zfs_znode_move_impl(ozp, nzp);
335 mutex_exit(&vp->v_lock);
336 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
338 list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
339 mutex_exit(&zfsvfs->z_znodes_lock);
342 return (KMEM_CBRC_YES);
352 ASSERT(znode_cache == NULL);
353 znode_cache = kmem_cache_create("zfs_znode_cache",
354 sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
355 zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
357 kmem_cache_set_move(znode_cache, zfs_znode_move);
368 kmem_cache_destroy(znode_cache);
373 zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
375 zfs_acl_ids_t acl_ids;
382 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
383 vattr.va_type = VDIR;
384 vattr.va_mode = S_IFDIR|0555;
385 vattr.va_uid = crgetuid(kcred);
386 vattr.va_gid = crgetgid(kcred);
388 sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
389 zfs_znode_cache_constructor(sharezp, zfsvfs->z_parent->z_vfs, 0);
390 sharezp->z_unlinked = 0;
391 sharezp->z_atime_dirty = 0;
392 sharezp->z_zfsvfs = zfsvfs;
394 sharezp->z_vnode = &vnode;
395 vnode.v_data = sharezp;
400 VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
401 kcred, NULL, &acl_ids));
402 zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
404 ASSERT3P(zp, ==, sharezp);
405 POINTER_INVALIDATE(&sharezp->z_zfsvfs);
406 error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
407 ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
408 zfsvfs->z_shares_dir = sharezp->z_id;
410 zfs_acl_ids_free(&acl_ids);
411 ZTOV(sharezp)->v_data = NULL;
412 ZTOV(sharezp)->v_count = 0;
413 ZTOV(sharezp)->v_holdcnt = 0;
415 sharezp->z_vnode = NULL;
416 dmu_buf_rele(sharezp->z_dbuf, NULL);
417 sharezp->z_dbuf = NULL;
418 kmem_cache_free(znode_cache, sharezp);
424 * define a couple of values we need available
425 * for both 64 and 32 bit environments.
428 #define NBITSMINOR64 32
431 #define MAXMAJ64 0xffffffffUL
434 #define MAXMIN64 0xffffffffUL
438 * Create special expldev for ZFS private use.
439 * Can't use standard expldev since it doesn't do
440 * what we want. The standard expldev() takes a
441 * dev32_t in LP64 and expands it to a long dev_t.
442 * We need an interface that takes a dev32_t in ILP32
443 * and expands it to a long dev_t.
446 zfs_expldev(dev_t dev)
448 return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
451 * Special cmpldev for ZFS private use.
452 * Can't use standard cmpldev since it takes
453 * a long dev_t and compresses it to dev32_t in
454 * LP64. We need to do a compaction of a long dev_t
455 * to a dev32_t in ILP32.
458 zfs_cmpldev(uint64_t dev)
460 return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
464 zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
468 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
469 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
471 mutex_enter(&zp->z_lock);
473 ASSERT(zp->z_dbuf == NULL);
475 nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);
479 * concurrent zgets on this object.
482 panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);
485 * Slap on VROOT if we are the root znode
487 if (zp->z_id == zfsvfs->z_root)
488 ZTOV(zp)->v_flag |= VROOT;
490 mutex_exit(&zp->z_lock);
495 zfs_znode_dmu_fini(znode_t *zp)
497 dmu_buf_t *db = zp->z_dbuf;
498 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
500 RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
501 ASSERT(zp->z_dbuf != NULL);
503 VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
504 dmu_buf_rele(db, NULL);
508 * Construct a new znode/vnode and initialize.
510 * This does not do a call to dmu_set_user(); that is
511 * up to the caller to do, in case you don't want to
515 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
520 zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
521 zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0);
523 ASSERT(zp->z_dirlocks == NULL);
524 ASSERT(zp->z_dbuf == NULL);
525 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
528 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
529 * the zfs_znode_move() callback.
533 zp->z_atime_dirty = 0;
536 zp->z_id = db->db_object;
538 zp->z_seq = 0x7A4653;
546 zfs_znode_dmu_init(zfsvfs, zp, db);
548 zp->z_gen = zp->z_phys->zp_gen;
555 vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
556 switch (vp->v_type) {
558 zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
561 vp->v_op = &zfs_fifoops;
564 if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir) {
565 vp->v_op = &zfs_shareops;
569 if (vp->v_type != VFIFO)
572 mutex_enter(&zfsvfs->z_znodes_lock);
573 list_insert_tail(&zfsvfs->z_all_znodes, zp);
576 * Everything else must be valid before assigning z_zfsvfs makes the
577 * znode eligible for zfs_znode_move().
579 zp->z_zfsvfs = zfsvfs;
580 mutex_exit(&zfsvfs->z_znodes_lock);
582 VFS_HOLD(zfsvfs->z_vfs);
587 * Create a new DMU object to hold a zfs znode.
589 * IN: dzp - parent directory for new znode
590 * vap - file attributes for new znode
591 * tx - dmu transaction id for zap operations
592 * cr - credentials of caller
594 * IS_ROOT_NODE - new object will be root
595 * IS_XATTR - new object is an attribute
596 * bonuslen - length of bonus buffer
597 * setaclp - File/Dir initial ACL
598 * fuidp - Tracks fuid allocation.
600 * OUT: zpp - allocated znode
604 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
605 uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids)
609 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
614 ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
616 if (zfsvfs->z_replay) {
617 obj = vap->va_nodeid;
618 now = vap->va_ctime; /* see zfs_replay_create() */
619 gen = vap->va_nblocks; /* ditto */
623 gen = dmu_tx_get_txg(tx);
627 * Create a new DMU object.
630 * There's currently no mechanism for pre-reading the blocks that will
631 * be needed to allocate a new object, so we accept the small chance
632 * that there will be an i/o error and we will fail one of the
635 if (vap->va_type == VDIR) {
636 if (zfsvfs->z_replay) {
637 err = zap_create_claim_norm(zfsvfs->z_os, obj,
638 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
639 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
640 ASSERT3U(err, ==, 0);
642 obj = zap_create_norm(zfsvfs->z_os,
643 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
644 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
647 if (zfsvfs->z_replay) {
648 err = dmu_object_claim(zfsvfs->z_os, obj,
649 DMU_OT_PLAIN_FILE_CONTENTS, 0,
650 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
651 ASSERT3U(err, ==, 0);
653 obj = dmu_object_alloc(zfsvfs->z_os,
654 DMU_OT_PLAIN_FILE_CONTENTS, 0,
655 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
659 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
660 VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
661 dmu_buf_will_dirty(db, tx);
664 * Initialize the znode physical data to zero.
666 ASSERT(db->db_size >= sizeof (znode_phys_t));
667 bzero(db->db_data, db->db_size);
671 * If this is the root, fix up the half-initialized parent pointer
672 * to reference the just-allocated physical data area.
674 if (flag & IS_ROOT_NODE) {
681 * If parent is an xattr, so am I.
683 if (dzp->z_phys->zp_flags & ZFS_XATTR)
686 if (vap->va_type == VBLK || vap->va_type == VCHR) {
687 pzp->zp_rdev = zfs_expldev(vap->va_rdev);
690 if (zfsvfs->z_use_fuids)
691 pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
693 if (vap->va_type == VDIR) {
694 pzp->zp_size = 2; /* contents ("." and "..") */
695 pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
698 pzp->zp_parent = dzp->z_id;
700 pzp->zp_flags |= ZFS_XATTR;
704 ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
705 ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
707 if (vap->va_mask & AT_ATIME) {
708 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
710 ZFS_TIME_ENCODE(&now, pzp->zp_atime);
713 if (vap->va_mask & AT_MTIME) {
714 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
716 ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
719 pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
720 if (!(flag & IS_ROOT_NODE)) {
721 *zpp = zfs_znode_alloc(zfsvfs, db, 0);
724 * If we are creating the root node, the "parent" we
725 * passed in is the znode for the root.
729 pzp->zp_uid = acl_ids->z_fuid;
730 pzp->zp_gid = acl_ids->z_fgid;
731 pzp->zp_mode = acl_ids->z_mode;
732 VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
733 if (vap->va_mask & AT_XVATTR)
734 zfs_xvattr_set(*zpp, (xvattr_t *)vap);
735 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
736 if (!(flag & IS_ROOT_NODE)) {
740 vp->v_vflag |= VV_FORCEINSMQ;
741 err = insmntque(vp, zfsvfs->z_vfs);
742 vp->v_vflag &= ~VV_FORCEINSMQ;
743 KASSERT(err == 0, ("insmntque() failed: error %d", err));
748 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
752 xoap = xva_getxoptattr(xvap);
755 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
756 ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
757 XVA_SET_RTN(xvap, XAT_CREATETIME);
759 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
760 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
761 XVA_SET_RTN(xvap, XAT_READONLY);
763 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
764 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
765 XVA_SET_RTN(xvap, XAT_HIDDEN);
767 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
768 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
769 XVA_SET_RTN(xvap, XAT_SYSTEM);
771 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
772 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
773 XVA_SET_RTN(xvap, XAT_ARCHIVE);
775 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
776 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
777 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
779 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
780 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
781 XVA_SET_RTN(xvap, XAT_NOUNLINK);
783 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
784 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
785 XVA_SET_RTN(xvap, XAT_APPENDONLY);
787 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
788 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
789 XVA_SET_RTN(xvap, XAT_NODUMP);
791 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
792 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
793 XVA_SET_RTN(xvap, XAT_OPAQUE);
795 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
796 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
797 xoap->xoa_av_quarantined);
798 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
800 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
801 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
802 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
804 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
805 (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
806 sizeof (xoap->xoa_av_scanstamp));
807 zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
808 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
813 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
815 dmu_object_info_t doi;
823 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
825 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
827 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
831 dmu_object_info_from_db(db, &doi);
832 if (doi.doi_bonus_type != DMU_OT_ZNODE ||
833 doi.doi_bonus_size < sizeof (znode_phys_t)) {
834 dmu_buf_rele(db, NULL);
835 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
839 zp = dmu_buf_get_user(db);
841 mutex_enter(&zp->z_lock);
844 * Since we do immediate eviction of the z_dbuf, we
845 * should never find a dbuf with a znode that doesn't
846 * know about the dbuf.
848 ASSERT3P(zp->z_dbuf, ==, db);
849 ASSERT3U(zp->z_id, ==, obj_num);
850 if (zp->z_unlinked) {
860 if ((vp->v_iflag & VI_DOOMED) != 0) {
863 * Don't VN_RELE() vnode here, because
864 * it can call vn_lock() which creates
865 * LOR between vnode lock and znode
866 * lock. We will VN_RELE() the vnode
867 * after dropping znode lock.
873 ZFS_LOG(1, "dying znode detected (zp=%p)", zp);
877 * znode is dying so we can't reuse it, we must
878 * wait until destruction is completed.
880 dmu_buf_rele(db, NULL);
881 mutex_exit(&zp->z_lock);
882 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
885 tsleep(zp, 0, "zcollide", 1);
891 dmu_buf_rele(db, NULL);
892 mutex_exit(&zp->z_lock);
893 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
898 * Not found create new znode/vnode
899 * but only if file exists.
901 * There is a small window where zfs_vget() could
902 * find this object while a file create is still in
903 * progress. Since a gen number can never be zero
904 * we will check that to determine if it's an allocated
908 if (((znode_phys_t *)db->db_data)->zp_gen != 0) {
909 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
912 vp->v_vflag |= VV_FORCEINSMQ;
913 err = insmntque(vp, zfsvfs->z_vfs);
914 vp->v_vflag &= ~VV_FORCEINSMQ;
915 KASSERT(err == 0, ("insmntque() failed: error %d", err));
919 dmu_buf_rele(db, NULL);
922 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
927 zfs_rezget(znode_t *zp)
929 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
930 dmu_object_info_t doi;
932 uint64_t obj_num = zp->z_id;
935 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
937 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
939 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
943 dmu_object_info_from_db(db, &doi);
944 if (doi.doi_bonus_type != DMU_OT_ZNODE ||
945 doi.doi_bonus_size < sizeof (znode_phys_t)) {
946 dmu_buf_rele(db, NULL);
947 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
951 if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
952 dmu_buf_rele(db, NULL);
953 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
957 zfs_znode_dmu_init(zfsvfs, zp, db);
958 zp->z_unlinked = (zp->z_phys->zp_links == 0);
959 zp->z_blksz = doi.doi_data_block_size;
961 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
967 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
969 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
970 objset_t *os = zfsvfs->z_os;
971 uint64_t obj = zp->z_id;
972 uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
974 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
976 VERIFY(0 == dmu_object_free(os, acl_obj, tx));
977 VERIFY(0 == dmu_object_free(os, obj, tx));
978 zfs_znode_dmu_fini(zp);
979 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
984 zfs_zinactive(znode_t *zp)
986 vnode_t *vp = ZTOV(zp);
987 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
988 uint64_t z_id = zp->z_id;
991 ASSERT(zp->z_dbuf && zp->z_phys);
994 * Don't allow a zfs_zget() while we're trying to release this znode
996 ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
998 mutex_enter(&zp->z_lock);
1000 if (vp->v_count > 0) {
1002 * If the hold count is greater than zero, somebody has
1003 * obtained a new reference on this znode while we were
1004 * processing it here, so we are done.
1007 mutex_exit(&zp->z_lock);
1008 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1014 * If this was the last reference to a file with no links,
1015 * remove the file from the file system.
1017 if (zp->z_unlinked) {
1018 mutex_exit(&zp->z_lock);
1019 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1020 ASSERT(vp->v_count == 0);
1021 vrecycle(vp, curthread);
1022 vfslocked = VFS_LOCK_GIANT(zfsvfs->z_vfs);
1024 VFS_UNLOCK_GIANT(vfslocked);
1027 mutex_exit(&zp->z_lock);
1028 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1032 zfs_znode_free(znode_t *zp)
1034 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1036 ASSERT(ZTOV(zp) == NULL);
1037 mutex_enter(&zfsvfs->z_znodes_lock);
1038 POINTER_INVALIDATE(&zp->z_zfsvfs);
1039 list_remove(&zfsvfs->z_all_znodes, zp);
1040 mutex_exit(&zfsvfs->z_znodes_lock);
1042 kmem_cache_free(znode_cache, zp);
1044 VFS_RELE(zfsvfs->z_vfs);
1048 zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
1052 ASSERT(MUTEX_HELD(&zp->z_lock));
1057 dmu_buf_will_dirty(zp->z_dbuf, tx);
1058 zp->z_atime_dirty = 0;
1061 zp->z_atime_dirty = 1;
1064 if (flag & AT_ATIME)
1065 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
1067 if (flag & AT_MTIME) {
1068 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
1069 if (zp->z_zfsvfs->z_use_fuids)
1070 zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
1073 if (flag & AT_CTIME) {
1074 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
1075 if (zp->z_zfsvfs->z_use_fuids)
1076 zp->z_phys->zp_flags |= ZFS_ARCHIVE;
1081 * Update the requested znode timestamps with the current time.
1082 * If we are in a transaction, then go ahead and mark the znode
1083 * dirty in the transaction so the timestamps will go to disk.
1084 * Otherwise, we will get pushed next time the znode is updated
1085 * in a transaction, or when this znode eventually goes inactive.
1088 * 1 - Only the ACCESS time is ever updated outside of a transaction.
1089 * 2 - Multiple consecutive updates will be collapsed into a single
1090 * znode update by the transaction grouping semantics of the DMU.
1093 zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
1095 mutex_enter(&zp->z_lock);
1096 zfs_time_stamper_locked(zp, flag, tx);
1097 mutex_exit(&zp->z_lock);
1101 * Grow the block size for a file.
1103 * IN: zp - znode of file whose block size is to be grown.
1104 * size - requested block size
1105 * tx - open transaction.
1107 * NOTE: this function assumes that the znode is write locked.
1110 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1115 if (size <= zp->z_blksz)
1118 * If the file size is already greater than the current blocksize,
1119 * we will not grow. If there is more than one block in a file,
1120 * the blocksize cannot change.
1122 if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
1125 error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
1127 if (error == ENOTSUP)
1129 ASSERT3U(error, ==, 0);
1131 /* What blocksize did we actually get? */
1132 dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
1136 * Increase the file length
1138 * IN: zp - znode of file to extend.
1139 * end - new end-of-file
1141 * RETURN: 0 if success
1142 * error code if failure
1145 zfs_extend(znode_t *zp, uint64_t end)
1147 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1154 * We will change zp_size, lock the whole file.
1156 rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1159 * Nothing to do if file already at desired length.
1161 if (end <= zp->z_phys->zp_size) {
1162 zfs_range_unlock(rl);
1166 tx = dmu_tx_create(zfsvfs->z_os);
1167 dmu_tx_hold_bonus(tx, zp->z_id);
1168 if (end > zp->z_blksz &&
1169 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1171 * We are growing the file past the current block size.
1173 if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
1174 ASSERT(!ISP2(zp->z_blksz));
1175 newblksz = MIN(end, SPA_MAXBLOCKSIZE);
1177 newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
1179 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1184 error = dmu_tx_assign(tx, TXG_NOWAIT);
1186 if (error == ERESTART) {
1192 zfs_range_unlock(rl);
1195 dmu_buf_will_dirty(zp->z_dbuf, tx);
1198 zfs_grow_blocksize(zp, newblksz, tx);
1200 zp->z_phys->zp_size = end;
1202 zfs_range_unlock(rl);
1206 vnode_pager_setsize(ZTOV(zp), end);
1212 * Free space in a file.
1214 * IN: zp - znode of file to free data in.
1215 * off - start of section to free.
1216 * len - length of section to free.
1218 * RETURN: 0 if success
1219 * error code if failure
1222 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1224 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1229 * Lock the range being freed.
1231 rl = zfs_range_lock(zp, off, len, RL_WRITER);
1234 * Nothing to do if file already at desired length.
1236 if (off >= zp->z_phys->zp_size) {
1237 zfs_range_unlock(rl);
1241 if (off + len > zp->z_phys->zp_size)
1242 len = zp->z_phys->zp_size - off;
1244 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1248 * In FreeBSD we cannot free block in the middle of a file,
1249 * but only at the end of a file.
1251 vnode_pager_setsize(ZTOV(zp), off);
1254 zfs_range_unlock(rl);
1262 * IN: zp - znode of file to free data in.
1263 * end - new end-of-file.
1265 * RETURN: 0 if success
1266 * error code if failure
1269 zfs_trunc(znode_t *zp, uint64_t end)
1271 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1272 vnode_t *vp = ZTOV(zp);
1278 * We will change zp_size, lock the whole file.
1280 rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1283 * Nothing to do if file already at desired length.
1285 if (end >= zp->z_phys->zp_size) {
1286 zfs_range_unlock(rl);
1290 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1);
1292 zfs_range_unlock(rl);
1296 tx = dmu_tx_create(zfsvfs->z_os);
1297 dmu_tx_hold_bonus(tx, zp->z_id);
1298 error = dmu_tx_assign(tx, TXG_NOWAIT);
1300 if (error == ERESTART) {
1306 zfs_range_unlock(rl);
1309 dmu_buf_will_dirty(zp->z_dbuf, tx);
1311 zp->z_phys->zp_size = end;
1316 * Clear any mapped pages in the truncated region. This has to
1317 * happen outside of the transaction to avoid the possibility of
1318 * a deadlock with someone trying to push a page that we are
1319 * about to invalidate.
1321 vnode_pager_setsize(vp, end);
1323 zfs_range_unlock(rl);
1329 * Free space in a file
1331 * IN: zp - znode of file to free data in.
1332 * off - start of range
1333 * len - length of range (0 => EOF)
1334 * flag - current file open mode flags.
1335 * log - TRUE if this action should be logged
1337 * RETURN: 0 if success
1338 * error code if failure
1341 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1343 vnode_t *vp = ZTOV(zp);
1345 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1346 zilog_t *zilog = zfsvfs->z_log;
1349 if (off > zp->z_phys->zp_size) {
1350 error = zfs_extend(zp, off+len);
1351 if (error == 0 && log)
1358 error = zfs_trunc(zp, off);
1360 if ((error = zfs_free_range(zp, off, len)) == 0 &&
1361 off + len > zp->z_phys->zp_size)
1362 error = zfs_extend(zp, off+len);
1367 tx = dmu_tx_create(zfsvfs->z_os);
1368 dmu_tx_hold_bonus(tx, zp->z_id);
1369 error = dmu_tx_assign(tx, TXG_NOWAIT);
1371 if (error == ERESTART) {
1380 zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
1381 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1388 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1391 uint64_t moid, obj, version;
1392 uint64_t sense = ZFS_CASE_SENSITIVE;
1397 znode_t *rootzp = NULL;
1401 zfs_acl_ids_t acl_ids;
1404 * First attempt to create master node.
1407 * In an empty objset, there are no blocks to read and thus
1408 * there can be no i/o errors (which we assert below).
1410 moid = MASTER_NODE_OBJ;
1411 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1412 DMU_OT_NONE, 0, tx);
1416 * Set starting attributes.
1418 if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE)
1419 version = ZPL_VERSION;
1420 else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
1421 version = ZPL_VERSION_USERSPACE - 1;
1423 version = ZPL_VERSION_FUID - 1;
1425 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1426 /* For the moment we expect all zpl props to be uint64_ts */
1430 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1431 VERIFY(nvpair_value_uint64(elem, &val) == 0);
1432 name = nvpair_name(elem);
1433 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
1437 error = zap_update(os, moid, name, 8, 1, &val, tx);
1440 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1442 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1445 ASSERT(version != 0);
1446 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
1449 * Create a delete queue.
1451 obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
1453 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
1457 * Create root znode. Create minimal znode/vnode/zfsvfs
1458 * to allow zfs_mknode to work.
1461 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
1462 vattr.va_type = VDIR;
1463 vattr.va_mode = S_IFDIR|0755;
1464 vattr.va_uid = crgetuid(cr);
1465 vattr.va_gid = crgetgid(cr);
1467 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
1468 zfs_znode_cache_constructor(rootzp, NULL, 0);
1469 rootzp->z_unlinked = 0;
1470 rootzp->z_atime_dirty = 0;
1472 vnode.v_type = VDIR;
1473 vnode.v_data = rootzp;
1474 rootzp->z_vnode = &vnode;
1476 bzero(&zfsvfs, sizeof (zfsvfs_t));
1479 zfsvfs.z_parent = &zfsvfs;
1480 zfsvfs.z_version = version;
1481 zfsvfs.z_use_fuids = USE_FUIDS(version, os);
1482 zfsvfs.z_norm = norm;
1484 * Fold case on file systems that are always or sometimes case
1487 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
1488 zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;
1490 mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1491 list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
1492 offsetof(znode_t, z_link_node));
1494 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1495 mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1497 ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
1498 rootzp->z_zfsvfs = &zfsvfs;
1499 VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
1500 cr, NULL, &acl_ids));
1501 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids);
1502 ASSERT3P(zp, ==, rootzp);
1503 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1505 zfs_acl_ids_free(&acl_ids);
1506 POINTER_INVALIDATE(&rootzp->z_zfsvfs);
1508 dmu_buf_rele(rootzp->z_dbuf, NULL);
1509 rootzp->z_dbuf = NULL;
1510 rootzp->z_vnode = NULL;
1511 kmem_cache_free(znode_cache, rootzp);
1514 * Create shares directory
1517 error = zfs_create_share_dir(&zfsvfs, tx);
1521 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1522 mutex_destroy(&zfsvfs.z_hold_mtx[i]);
1525 #endif /* _KERNEL */
1527 * Given an object number, return its parent object number and whether
1528 * or not the object is an extended attribute directory.
1531 zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
1534 dmu_object_info_t doi;
1538 if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
1541 dmu_object_info_from_db(db, &doi);
1542 if (doi.doi_bonus_type != DMU_OT_ZNODE ||
1543 doi.doi_bonus_size < sizeof (znode_phys_t)) {
1544 dmu_buf_rele(db, FTAG);
1549 *pobjp = zp->zp_parent;
1550 *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
1551 S_ISDIR(zp->zp_mode);
1552 dmu_buf_rele(db, FTAG);
1558 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
1560 char *path = buf + len - 1;
1567 char component[MAXNAMELEN + 2];
1571 if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
1572 &is_xattrdir)) != 0)
1583 (void) sprintf(component + 1, "<xattrdir>");
1585 error = zap_value_search(osp, pobj, obj,
1586 ZFS_DIRENT_OBJ(-1ULL), component + 1);
1591 complen = strlen(component);
1593 ASSERT(path >= buf);
1594 bcopy(component, path, complen);
1599 (void) memmove(buf, path, buf + len - path);