4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 /* Portions Copyright 2007 Jeremy Teo */
28 #pragma ident "%Z%%M% %I% %E% SMI"
31 #include <sys/types.h>
32 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/sysmacros.h>
36 #include <sys/resource.h>
37 #include <sys/mntent.h>
39 #include <sys/vnode.h>
42 #include <sys/cmn_err.h>
43 #include <sys/errno.h>
44 #include <sys/unistd.h>
45 #include <sys/atomic.h>
46 #include <sys/zfs_dir.h>
47 #include <sys/zfs_acl.h>
48 #include <sys/zfs_ioctl.h>
49 #include <sys/zfs_rlock.h>
50 #include <sys/fs/zfs.h>
54 #include <sys/refcount.h>
57 #include <sys/zfs_znode.h>
58 #include <sys/refcount.h>
61 * Functions needed for userland (ie: libzpool) are not put under
62 * #ifdef _KERNEL; the rest of the functions have dependencies
63 * (such as VFS logic) that will not compile easily in userland.
66 struct kmem_cache *znode_cache = NULL;
/*
 * DMU eviction ("pageout") callback for a znode: invoked when the znode's
 * backing dmu buffer user data is being torn down.  Under z_lock, it checks
 * the vnode hold count and, when no holds remain, locks the vnode and hands
 * it to the VFS for recycling.
 * NOTE(review): interior lines (the vp initialization and the first branch
 * condition) are not visible in this excerpt — confirm against full source.
 */
70 znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
72 znode_t *zp = user_ptr;
75 mutex_enter(&zp->z_lock);
78 mutex_exit(&zp->z_lock);
/* No holds on the vnode: lock it exclusively and let the VFS recycle it. */
80 } else if (vp->v_count == 0) {
83 mutex_exit(&zp->z_lock);
84 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
85 vrecycle(vp, curthread);
86 VOP_UNLOCK(vp, 0, curthread);
90 /* signal force unmount that this znode can be freed */
92 mutex_exit(&zp->z_lock);
96 extern struct vop_vector zfs_vnodeops;
97 extern struct vop_vector zfs_fifoops;
100 * XXX: We cannot use this function as a cache constructor, because
101 * there is one global cache for all file systems and we need
102 * to pass vfsp here, which is not possible, because argument
103 * 'cdrarg' is defined at kmem_cache_create() time.
/*
 * Manually-invoked constructor for a znode_t taken from znode_cache:
 * allocates and configures the associated vnode (when a vfsp is supplied)
 * and initializes the znode's locks and the range-lock AVL tree.
 * NOTE(review): a few interior lines (zp assignment, error check on
 * getnewvnode) are elided in this excerpt.
 */
106 zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
110 vfs_t *vfsp = cdrarg;
/* cdrarg == NULL means no vfs yet (e.g. the bootstrap root znode in
 * zfs_create_fs); in that case no vnode is attached here. */
113 if (cdrarg != NULL) {
114 error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp);
116 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
118 vp->v_data = (caddr_t)zp;
/* Allow recursive locking and disallow shared locks on this vnode lock. */
119 vp->v_vnlock->lk_flags |= LK_CANRECURSE;
120 vp->v_vnlock->lk_flags &= ~LK_NOSHARE;
124 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
125 rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
126 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
127 rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
128 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
/* Range-lock state: mutex plus AVL tree of outstanding rl_t records. */
130 mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
131 avl_create(&zp->z_range_avl, zfs_range_compare,
132 sizeof (rl_t), offsetof(rl_t, r_node));
/*
 * kmem cache destructor for znode_t: tears down everything the
 * constructor set up (locks, rwlocks, range-lock AVL tree).  Asserts the
 * znode holds no directory locks and no dbuf before being returned to
 * the cache.
 */
141 zfs_znode_cache_destructor(void *buf, void *cdarg)
145 ASSERT(zp->z_dirlocks == 0);
146 mutex_destroy(&zp->z_lock);
147 rw_destroy(&zp->z_map_lock);
148 rw_destroy(&zp->z_parent_lock);
149 rw_destroy(&zp->z_name_lock);
150 mutex_destroy(&zp->z_acl_lock);
151 mutex_destroy(&zp->z_range_lock);
152 avl_destroy(&zp->z_range_avl);
154 ASSERT(zp->z_dbuf_held == 0);
163 ASSERT(znode_cache == NULL);
164 znode_cache = kmem_cache_create("zfs_znode_cache",
165 sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
166 zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
176 kmem_cache_destroy(znode_cache);
181 * zfs_init_fs - Initialize the zfsvfs struct and the file system
182 * incore "master" object. Verify version compatibility.
185 zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
187 objset_t *os = zfsvfs->z_os;
188 uint64_t version = ZPL_VERSION;
190 dmu_object_info_t doi;
196 * XXX - hack to auto-create the pool root filesystem at
197 * the first attempted mount.
199 if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
200 dmu_tx_t *tx = dmu_tx_create(os);
/* Reserve tx space for the master node, unlinked set, and root znode. */
202 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */
203 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */
204 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
205 error = dmu_tx_assign(tx, TXG_WAIT);
206 ASSERT3U(error, ==, 0);
207 zfs_create_fs(os, cr, tx);
/* Read the on-disk ZPL version and reject a mismatched filesystem. */
211 error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_OBJ, 8, 1,
215 } else if (version != ZPL_VERSION) {
216 (void) printf("Mismatched versions: File system "
217 "is version %lld on-disk format, which is "
218 "incompatible with this software version %lld!",
219 (u_longlong_t)version, ZPL_VERSION);
224 * The fsid is 64 bits, composed of an 8-bit fs type, which
225 * separates our fsid from any other filesystem types, and a
226 * 56-bit objset unique ID. The objset unique ID is unique to
227 * all objsets open on this system, provided by unique_create().
228 * The 8-bit fs type must be put in the low bits of fsid[1]
229 * because that's where other Solaris filesystems put it.
231 fsid_guid = dmu_objset_fsid_guid(os);
232 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
233 zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;
234 zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
235 zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF;
/* Look up the root object number from the master node. */
237 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
241 ASSERT(zfsvfs->z_root != 0);
244 * Create the per mount vop tables.
248 * Initialize zget mutex's
250 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
251 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
/* Instantiate the root znode and return it to the caller via *zpp. */
253 error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
256 ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
258 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
259 &zfsvfs->z_unlinkedobj);
267 * define a couple of values we need available
268 * for both 64 and 32 bit environments.
271 #define NBITSMINOR64 32
274 #define MAXMAJ64 0xffffffffUL
277 #define MAXMIN64 0xffffffffUL
280 #define major(x) ((int)(((u_int)(x) >> 8)&0xff)) /* major number */
283 #define minor(x) ((int)((x)&0xffff00ff)) /* minor number */
287 * Create special expldev for ZFS private use.
288 * Can't use standard expldev since it doesn't do
289 * what we want. The standard expldev() takes a
290 * dev32_t in LP64 and expands it to a long dev_t.
291 * We need an interface that takes a dev32_t in ILP32
292 * and expands it to a long dev_t.
/* Pack major into the high 32 bits (NBITSMINOR64) and minor into the low. */
295 zfs_expldev(dev_t dev)
297 return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
300 * Special cmpldev for ZFS private use.
301 * Can't use standard cmpldev since it takes
302 * a long dev_t and compresses it to dev32_t in
303 * LP64. We need to do a compaction of a long dev_t
304 * to a dev32_t in ILP32.
/* Inverse of zfs_expldev(): split the 64-bit encoding back into a dev_t. */
307 zfs_cmpldev(uint64_t dev)
309 return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
313 * Construct a new znode/vnode and initialize.
315 * This does not do a call to dmu_set_user() that is
316 * up to the caller to do, in case you don't want to
320 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
326 zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
/* Constructor is called manually here (see comment above the function)
 * so the vnode can be bound to this filesystem's vfsp. */
327 zfs_znode_cache_constructor(zp, zfsvfs->z_vfs, 0);
329 ASSERT(zp->z_dirlocks == NULL);
/* z_phys aliases the dbuf's bonus data; valid while the dbuf is held. */
331 zp->z_phys = db->db_data;
332 zp->z_zfsvfs = zfsvfs;
334 zp->z_atime_dirty = 0;
341 zp->z_seq = 0x7A4653;
/* Track this znode on the per-filesystem list of all znodes. */
344 mutex_enter(&zfsvfs->z_znodes_lock);
345 list_insert_tail(&zfsvfs->z_all_znodes, zp);
346 mutex_exit(&zfsvfs->z_znodes_lock);
/* Insert the vnode on the mount queue; forced even during unmount. */
352 vp->v_vflag |= VV_FORCEINSMQ;
353 error = insmntque(vp, zfsvfs->z_vfs);
354 vp->v_vflag &= ~VV_FORCEINSMQ;
355 KASSERT(error == 0, ("insmntque() failed: error %d", error));
/* Derive the vnode type from the on-disk mode bits. */
357 vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
358 switch (vp->v_type) {
360 zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
/* FIFOs get their own vop vector. */
363 vp->v_op = &zfs_fifoops;
/*
 * Attach the znode to its dmu buffer as the buffer's user data, so the
 * DMU will call znode_pageout_func() on eviction.  Also flags the root
 * vnode and takes a VFS hold for the lifetime of the association.
 */
371 zfs_znode_dmu_init(znode_t *zp)
374 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
375 dmu_buf_t *db = zp->z_dbuf;
377 mutex_enter(&zp->z_lock);
/* Register as the dbuf's user; nzp != NULL would mean someone raced us. */
379 nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_pageout_func);
383 * concurrent zgets on this object.
385 ASSERT3P(nzp, ==, NULL);
388 * Slap on VROOT if we are the root znode
390 if (zp->z_id == zfsvfs->z_root) {
391 ZTOV(zp)->v_flag |= VROOT;
394 ASSERT(zp->z_dbuf_held == 0);
/* Hold the vfs so it cannot go away while this znode is live. */
396 VFS_HOLD(zfsvfs->z_vfs);
397 mutex_exit(&zp->z_lock);
401 * Create a new DMU object to hold a zfs znode.
403 * IN: dzp - parent directory for new znode
404 * vap - file attributes for new znode
405 * tx - dmu transaction id for zap operations
406 * cr - credentials of caller
408 * IS_ROOT_NODE - new object will be root
409 * IS_XATTR - new object is an attribute
410 * IS_REPLAY - intent log replay
412 * OUT: oid - ID of created object
416 zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
417 uint_t flag, znode_t **zpp, int bonuslen)
422 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
427 ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
/* During ZIL replay the object id, times, and generation come from the
 * replayed vattr rather than being freshly generated. */
429 if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */
430 *oid = vap->va_nodeid;
432 now = vap->va_ctime; /* see zfs_replay_create() */
433 gen = vap->va_nblocks; /* ditto */
437 gen = dmu_tx_get_txg(tx);
441 * Create a new DMU object.
444 * There's currently no mechanism for pre-reading the blocks that will
445 * be needed to allocate a new object, so we accept the small chance
446 * that there will be an i/o error and we will fail one of the
/* Directories are ZAP objects; replay claims a specific object id,
 * otherwise a new one is allocated. */
449 if (vap->va_type == VDIR) {
450 if (flag & IS_REPLAY) {
451 err = zap_create_claim(zfsvfs->z_os, *oid,
452 DMU_OT_DIRECTORY_CONTENTS,
453 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
454 ASSERT3U(err, ==, 0);
456 *oid = zap_create(zfsvfs->z_os,
457 DMU_OT_DIRECTORY_CONTENTS,
458 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
461 if (flag & IS_REPLAY) {
462 err = dmu_object_claim(zfsvfs->z_os, *oid,
463 DMU_OT_PLAIN_FILE_CONTENTS, 0,
464 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
465 ASSERT3U(err, ==, 0);
467 *oid = dmu_object_alloc(zfsvfs->z_os,
468 DMU_OT_PLAIN_FILE_CONTENTS, 0,
469 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
/* Hold the bonus buffer (the znode_phys_t) and mark it dirty in tx. */
472 VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp));
473 dmu_buf_will_dirty(dbp, tx);
476 * Initialize the znode physical data to zero.
478 ASSERT(dbp->db_size >= sizeof (znode_phys_t));
479 bzero(dbp->db_data, dbp->db_size);
483 * If this is the root, fix up the half-initialized parent pointer
484 * to reference the just-allocated physical data area.
486 if (flag & IS_ROOT_NODE) {
492 * If parent is an xattr, so am I.
494 if (dzp->z_phys->zp_flags & ZFS_XATTR)
/* Device nodes store the expanded dev_t in the znode. */
497 if (vap->va_type == VBLK || vap->va_type == VCHR) {
498 pzp->zp_rdev = zfs_expldev(vap->va_rdev);
501 if (vap->va_type == VDIR) {
502 pzp->zp_size = 2; /* contents ("." and "..") */
503 pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
506 pzp->zp_parent = dzp->z_id;
508 pzp->zp_flags |= ZFS_XATTR;
512 ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
513 ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
/* Honor caller-supplied atime/mtime (e.g. from replay); else use now. */
515 if (vap->va_mask & AT_ATIME) {
516 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
518 ZFS_TIME_ENCODE(&now, pzp->zp_atime);
521 if (vap->va_mask & AT_MTIME) {
522 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
524 ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
527 pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
528 zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0);
530 zfs_perm_init(zp, dzp, flag, vap, tx, cr);
/* Attach the znode to the dbuf under the object-hash mutex so a
 * concurrent zfs_zget() on the same object serializes against us. */
533 kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp);
535 mutex_enter(hash_mtx);
536 zfs_znode_dmu_init(zp);
537 mutex_exit(hash_mtx);
/* Caller passed zpp == NULL (e.g. root creation): drop our references. */
541 if (ZTOV(zp) != NULL) {
542 ZTOV(zp)->v_count = 0;
543 VOP_UNLOCK(ZTOV(zp), 0, curthread);
545 dmu_buf_rele(dbp, NULL);
/*
 * Look up (or construct) the in-core znode for object obj_num, returning
 * it in *zpp.  Serializes with concurrent zgets/zfree via the per-object
 * hash mutex.  If the znode already exists it may need a fresh vnode
 * allocated for it; otherwise a new znode/vnode pair is built from the
 * object's bonus buffer.
 * NOTE(review): several interior lines (error returns, zp NULL checks)
 * are elided in this excerpt.
 */
551 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
553 dmu_object_info_t doi;
561 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
563 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
565 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
/* Sanity-check that the object really is a znode with a full phys blob. */
569 dmu_object_info_from_db(db, &doi);
570 if (doi.doi_bonus_type != DMU_OT_ZNODE ||
571 doi.doi_bonus_size < sizeof (znode_phys_t)) {
572 dmu_buf_rele(db, NULL);
573 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
577 ASSERT(db->db_object == obj_num);
578 ASSERT(db->db_offset == -1);
579 ASSERT(db->db_data != NULL);
/* An existing in-core znode is registered as the dbuf's user data. */
581 zp = dmu_buf_get_user(db);
584 mutex_enter(&zp->z_lock);
586 ASSERT3U(zp->z_id, ==, obj_num);
/* Unlinked znodes are being destroyed; do not hand them out. */
587 if (zp->z_unlinked) {
588 dmu_buf_rele(db, NULL);
589 mutex_exit(&zp->z_lock);
590 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
592 } else if (zp->z_dbuf_held) {
593 dmu_buf_rele(db, NULL);
596 VFS_HOLD(zfsvfs->z_vfs);
/* Existing znode without a vnode: allocate and wire up a new one. */
599 if (ZTOV(zp) != NULL)
602 err = getnewvnode("zfs", zfsvfs->z_vfs, &zfs_vnodeops,
606 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
607 vp->v_data = (caddr_t)zp;
608 vp->v_vnlock->lk_flags |= LK_CANRECURSE;
609 vp->v_vnlock->lk_flags &= ~LK_NOSHARE;
610 vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
611 if (vp->v_type == VDIR)
612 zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
613 vp->v_vflag |= VV_FORCEINSMQ;
614 err = insmntque(vp, zfsvfs->z_vfs);
615 vp->v_vflag &= ~VV_FORCEINSMQ;
616 KASSERT(err == 0, ("insmntque() failed: error %d", err));
617 VOP_UNLOCK(vp, 0, curthread);
619 mutex_exit(&zp->z_lock);
620 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
626 * Not found create new znode/vnode
628 zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
629 ASSERT3U(zp->z_id, ==, obj_num);
630 zfs_znode_dmu_init(zp);
631 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
633 if ((vp = ZTOV(zp)) != NULL)
634 VOP_UNLOCK(vp, 0, curthread);
/*
 * Delete the on-disk object backing znode zp (and its external ACL
 * object, if any) within transaction tx, then release the znode's dbuf.
 * Runs under the per-object hash mutex to exclude concurrent zgets.
 */
639 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
641 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
644 ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
/* Free the external ACL object first, if this znode has one. */
645 if (zp->z_phys->zp_acl.z_acl_extern_obj) {
646 error = dmu_object_free(zfsvfs->z_os,
647 zp->z_phys->zp_acl.z_acl_extern_obj, tx);
648 ASSERT3U(error, ==, 0);
650 error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
651 ASSERT3U(error, ==, 0);
653 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
654 dmu_buf_rele(zp->z_dbuf, NULL);
/*
 * Called when the last vnode reference to zp is dropped (VOP_INACTIVE
 * path).  If the znode was unlinked, recycle the vnode so the file's
 * storage can be reclaimed; otherwise just release the transient state.
 * NOTE(review): a few interior lines (e.g. the actual release of the
 * dbuf on the normal path) are elided in this excerpt.
 */
658 zfs_zinactive(znode_t *zp)
660 vnode_t *vp = ZTOV(zp);
661 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
662 uint64_t z_id = zp->z_id;
664 ASSERT(zp->z_dbuf_held && zp->z_phys);
667 * Don't allow a zfs_zget() while were trying to release this znode
669 ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
671 mutex_enter(&zp->z_lock);
673 if (vp->v_count > 0) {
675 * If the hold count is greater than zero, somebody has
676 * obtained a new reference on this znode while we were
677 * processing it here, so we are done.
680 mutex_exit(&zp->z_lock);
681 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
687 * If this was the last reference to a file with no links,
688 * remove the file from the file system.
690 if (zp->z_unlinked) {
692 mutex_exit(&zp->z_lock);
693 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
694 ASSERT(vp->v_count == 0);
695 vrecycle(vp, curthread);
/* Drop the VFS hold taken in zfs_znode_dmu_init(). */
697 VFS_RELE(zfsvfs->z_vfs);
701 ASSERT(zp->z_dbuf_held);
702 mutex_exit(&zp->z_lock);
703 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
/*
 * Final teardown of an in-core znode: remove it from the filesystem's
 * list of all znodes and return it to the kmem cache.
 */
707 zfs_znode_free(znode_t *zp)
709 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
711 mutex_enter(&zfsvfs->z_znodes_lock);
712 list_remove(&zfsvfs->z_all_znodes, zp);
713 mutex_exit(&zfsvfs->z_znodes_lock);
715 kmem_cache_free(znode_cache, zp);
/*
 * Encode the requested timestamps (atime/mtime/ctime, selected by flag)
 * into the znode's phys data.  Caller must hold z_lock.  If a tx is
 * supplied, the dbuf is dirtied so the update reaches disk; otherwise
 * only atime is updated and marked dirty for a later push.
 * NOTE(review): the flag tests surrounding each ZFS_TIME_ENCODE are
 * elided in this excerpt.
 */
719 zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
723 ASSERT(MUTEX_HELD(&zp->z_lock));
728 dmu_buf_will_dirty(zp->z_dbuf, tx);
729 zp->z_atime_dirty = 0;
732 zp->z_atime_dirty = 1;
736 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
739 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
742 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
746 * Update the requested znode timestamps with the current time.
747 * If we are in a transaction, then go ahead and mark the znode
748 * dirty in the transaction so the timestamps will go to disk.
749 * Otherwise, we will get pushed next time the znode is updated
750 * in a transaction, or when this znode eventually goes inactive.
753 * 1 - Only the ACCESS time is ever updated outside of a transaction.
754 * 2 - Multiple consecutive updates will be collapsed into a single
755 * znode update by the transaction grouping semantics of the DMU.
/* Thin locking wrapper around zfs_time_stamper_locked(). */
758 zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
760 mutex_enter(&zp->z_lock);
761 zfs_time_stamper_locked(zp, flag, tx);
762 mutex_exit(&zp->z_lock);
766 * Grow the block size for a file.
768 * IN: zp - znode of file to free data in.
769 * size - requested block size
770 * tx - open transaction.
772 * NOTE: this function assumes that the znode is write locked.
775 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
/* Nothing to do if the file's blocksize is already at least 'size'. */
780 if (size <= zp->z_blksz)
783 * If the file size is already greater than the current blocksize,
784 * we will not grow. If there is more than one block in a file,
785 * the blocksize cannot change.
787 if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
790 error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
/* ENOTSUP means the blocksize cannot change (e.g. multi-block file). */
792 if (error == ENOTSUP)
794 ASSERT3U(error, ==, 0);
796 /* What blocksize did we actually get? */
797 dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
801 * Free space in a file.
803 * IN: zp - znode of file to free data in.
804 * off - start of section to free.
805 * len - length of section to free (0 => to EOF).
806 * flag - current file open mode flags.
808 * RETURN: 0 if success
809 * error code if failure
812 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
814 vnode_t *vp = ZTOV(zp);
816 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
817 zilog_t *zilog = zfsvfs->z_log;
819 uint64_t end = off + len;
820 uint64_t size, new_blksz;
/* FIFOs have no backing file data to free. */
823 if (ZTOV(zp)->v_type == VFIFO)
827 * If we will change zp_size then lock the whole file,
828 * otherwise just lock the range being freed.
830 if (len == 0 || off + len > zp->z_phys->zp_size) {
831 rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
833 rl = zfs_range_lock(zp, off, len, RL_WRITER);
834 /* recheck, in case zp_size changed */
835 if (off + len > zp->z_phys->zp_size) {
836 /* lost race: file size changed, lock whole file */
837 zfs_range_unlock(rl);
838 rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
843 * Nothing to do if file already at desired length.
845 size = zp->z_phys->zp_size;
846 if (len == 0 && size == off && off != 0) {
847 zfs_range_unlock(rl);
851 tx = dmu_tx_create(zfsvfs->z_os);
852 dmu_tx_hold_bonus(tx, zp->z_id);
/* Growing past the current blocksize: compute the new blocksize. */
855 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
857 * We are growing the file past the current block size.
859 if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
860 ASSERT(!ISP2(zp->z_blksz));
861 new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
863 new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
865 dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz));
866 } else if (off < size) {
868 * If len == 0, we are truncating the file.
870 dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END);
873 error = dmu_tx_assign(tx, zfsvfs->z_assign);
/* TXG_NOWAIT assignment may need to be retried after a txg wait. */
875 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
878 zfs_range_unlock(rl);
883 zfs_grow_blocksize(zp, new_blksz, tx);
885 if (end > size || len == 0)
886 zp->z_phys->zp_size = end;
889 objset_t *os = zfsvfs->z_os;
896 VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx));
900 zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
901 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
904 zfs_range_unlock(rl);
909 * Clear any mapped pages in the truncated region. This has to
910 * happen outside of the transaction to avoid the possibility of
911 * a deadlock with someone trying to push a page that we are
912 * about to invalidate.
914 rw_enter(&zp->z_map_lock, RW_WRITER);
916 vnode_pager_setsize(vp, end);
919 error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE);
921 error = vinvalbuf(vp, V_SAVE, curthread, 0, 0);
922 vnode_pager_setsize(vp, end);
925 rw_exit(&zp->z_map_lock);
/*
 * Bootstrap a brand-new ZPL filesystem inside objset 'os': create the
 * master node, record the ZPL version, create the unlinked (delete)
 * set, and create the root directory znode — all within transaction tx.
 * Uses a minimal stack-local zfsvfs and a vnode-less znode (constructor
 * called with cdrarg == NULL) since no real mount exists yet.
 */
931 zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
934 uint64_t moid, doid, roid = 0;
935 uint64_t version = ZPL_VERSION;
937 znode_t *rootzp = NULL;
941 * First attempt to create master node.
944 * In an empty objset, there are no blocks to read and thus
945 * there can be no i/o errors (which we assert below).
947 moid = MASTER_NODE_OBJ;
948 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
953 * Set starting attributes.
956 error = zap_update(os, moid, ZPL_VERSION_OBJ, 8, 1, &version, tx);
960 * Create a delete queue.
962 doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
964 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
968 * Create root znode. Create minimal znode/vnode/zfsvfs
969 * to allow zfs_mknode to work.
971 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
972 vattr.va_type = VDIR;
973 vattr.va_mode = S_IFDIR|0755;
974 vattr.va_uid = UID_ROOT;
975 vattr.va_gid = GID_WHEEL;
977 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
/* NULL cdrarg: construct the znode without attaching a vnode. */
978 zfs_znode_cache_constructor(rootzp, NULL, 0);
979 rootzp->z_zfsvfs = &zfsvfs;
980 rootzp->z_unlinked = 0;
981 rootzp->z_atime_dirty = 0;
982 rootzp->z_dbuf_held = 0;
984 bzero(&zfsvfs, sizeof (zfsvfs_t));
987 zfsvfs.z_assign = TXG_NOWAIT;
988 zfsvfs.z_parent = &zfsvfs;
990 mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
991 list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
992 offsetof(znode_t, z_link_node));
/* The root directory is its own parent; record its id in the master node. */
994 zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0);
995 ASSERT3U(rootzp->z_id, ==, roid);
996 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx);
999 mutex_destroy(&zfsvfs.z_znodes_lock);
1000 kmem_cache_free(znode_cache, rootzp);
1002 #endif /* _KERNEL */
1005 * Given an object number, return its parent object number and whether
1006 * or not the object is an extended attribute directory.
1009 zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
1012 dmu_object_info_t doi;
1016 if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
/* Validate that the object carries a full znode_phys_t in its bonus. */
1019 dmu_object_info_from_db(db, &doi);
1020 if (doi.doi_bonus_type != DMU_OT_ZNODE ||
1021 doi.doi_bonus_size < sizeof (znode_phys_t)) {
1022 dmu_buf_rele(db, FTAG);
/* Read the parent id and xattr-dir flag straight from the phys data. */
1027 *pobjp = zp->zp_parent;
1028 *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
1029 S_ISDIR(zp->zp_mode);
1030 dmu_buf_rele(db, FTAG);
1036 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
1038 char *path = buf + len - 1;
1045 char component[MAXNAMELEN + 2];
1049 if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
1050 &is_xattrdir)) != 0)
1061 (void) sprintf(component + 1, "<xattrdir>");
1063 error = zap_value_search(osp, pobj, obj, component + 1);
1068 complen = strlen(component);
1070 ASSERT(path >= buf);
1071 bcopy(component, path, complen);
1076 (void) memmove(buf, path, buf + len - path);