sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 #include <sys/types.h>
  27 #include <sys/param.h>
  28 #include <sys/systm.h>
  29 #include <sys/kernel.h>
  30 #include <sys/sysmacros.h>
  31 #include <sys/kmem.h>
  32 #include <sys/acl.h>
  33 #include <sys/vnode.h>
  34 #include <sys/vfs.h>
  35 #include <sys/mntent.h>
  36 #include <sys/mount.h>
  37 #include <sys/cmn_err.h>
  38 #include <sys/zfs_znode.h>
  39 #include <sys/zfs_dir.h>
  40 #include <sys/zil.h>
  41 #include <sys/fs/zfs.h>
  42 #include <sys/dmu.h>
  43 #include <sys/dsl_prop.h>
  44 #include <sys/dsl_dataset.h>
  45 #include <sys/dsl_deleg.h>
  46 #include <sys/spa.h>
  47 #include <sys/zap.h>
  48 #include <sys/varargs.h>
  49 #include <sys/policy.h>
  50 #include <sys/atomic.h>
  51 #include <sys/zfs_ioctl.h>
  52 #include <sys/zfs_ctldir.h>
  53 #include <sys/zfs_fuid.h>
  54 #include <sys/sunddi.h>
  55 #include <sys/dnlc.h>
  56 #include <sys/dmu_objset.h>
  57 #include <sys/spa_boot.h>
  58 #include <sys/vdev_impl.h>      /* VDEV_BOOT_VERSION */
  59
  60 struct mtx zfs_debug_mtx;
  61 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
  62
  63 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
  64
  65 int zfs_super_owner = 0;
  66 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
  67     "File system owner can perform privileged operation on his file systems");
  68
  69 int zfs_debug_level = 0;
  70 TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level);
  71 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
  72     "Debug level");
  73
  74 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
  75 static int zfs_version_acl = ZFS_ACL_VERSION;
  76 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
  77     "ZFS_ACL_VERSION");
  78 static int zfs_version_dmu_backup_header = DMU_BACKUP_HEADER_VERSION;
  79 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_header, CTLFLAG_RD,
  80     &zfs_version_dmu_backup_header, 0, "DMU_BACKUP_HEADER_VERSION");
  81 static int zfs_version_dmu_backup_stream = DMU_BACKUP_STREAM_VERSION;
  82 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_stream, CTLFLAG_RD,
  83     &zfs_version_dmu_backup_stream, 0, "DMU_BACKUP_STREAM_VERSION");
  84 static int zfs_version_spa = SPA_VERSION;
  85 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
  86     "SPA_VERSION");
  87 static int zfs_version_vdev_boot = VDEV_BOOT_VERSION;
  88 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, vdev_boot, CTLFLAG_RD,
  89     &zfs_version_vdev_boot, 0, "VDEV_BOOT_VERSION");
  90 static int zfs_version_zpl = ZPL_VERSION;
  91 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
  92     "ZPL_VERSION");
  93
  94 static int zfs_mount(vfs_t *vfsp);
  95 static int zfs_umount(vfs_t *vfsp, int fflag);
  96 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
  97 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
  98 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
  99 static int zfs_sync(vfs_t *vfsp, int waitfor);
 100 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
 101 static void zfs_objset_close(zfsvfs_t *zfsvfs);
 102 static void zfs_freevfs(vfs_t *vfsp);
 103
 104 static struct vfsops zfs_vfsops = {
 105         .vfs_mount =            zfs_mount,
 106         .vfs_unmount =          zfs_umount,
 107         .vfs_root =             zfs_root,
 108         .vfs_statfs =           zfs_statfs,
 109         .vfs_vget =             zfs_vget,
 110         .vfs_sync =             zfs_sync,
 111         .vfs_fhtovp =           zfs_fhtovp,
 112 };
 113
 114 VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
 115
 116 /*
 117  * We need to keep a count of active fs's.
 118  * This is necessary to prevent our module
 119  * from being unloaded after a umount -f
 120  */
 121 static uint32_t zfs_active_fs_count = 0;
 122
 123 /*ARGSUSED*/
 124 static int
 125 zfs_sync(vfs_t *vfsp, int waitfor)
 126 {
 127
 128         /*
 129          * Data integrity is job one.  We don't want a compromised kernel
 130          * writing to the storage pool, so we never sync during panic.
 131          */
 132         if (panicstr)
 133                 return (0);
 134
 135         if (vfsp != NULL) {
 136                 /*
 137                  * Sync a specific filesystem.
 138                  */
 139                 zfsvfs_t *zfsvfs = vfsp->vfs_data;
 140                 int error;
 141
 142                 error = vfs_stdsync(vfsp, waitfor);
 143                 if (error != 0)
 144                         return (error);
 145
 146                 ZFS_ENTER(zfsvfs);
 147                 if (zfsvfs->z_log != NULL)
 148                         zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
 149                 else
 150                         txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
 151                 ZFS_EXIT(zfsvfs);
 152         } else {
 153                 /*
 154                  * Sync all ZFS filesystems.  This is what happens when you
 155                  * run sync(1M).  Unlike other filesystems, ZFS honors the
 156                  * request by waiting for all pools to commit all dirty data.
 157                  */
 158                 spa_sync_allpools();
 159         }
 160
 161         return (0);
 162 }
 163
 164 static void
 165 atime_changed_cb(void *arg, uint64_t newval)
 166 {
 167         zfsvfs_t *zfsvfs = arg;
 168
 169         if (newval == TRUE) {
 170                 zfsvfs->z_atime = TRUE;
 171                 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
 172                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
 173                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
 174         } else {
 175                 zfsvfs->z_atime = FALSE;
 176                 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
 177                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
 178                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
 179         }
 180 }
 181
 182 static void
 183 xattr_changed_cb(void *arg, uint64_t newval)
 184 {
 185         zfsvfs_t *zfsvfs = arg;
 186
 187         if (newval == TRUE) {
 188                 /* XXX locking on vfs_flag? */
 189 #ifdef TODO
 190                 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
 191 #endif
 192                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
 193                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
 194         } else {
 195                 /* XXX locking on vfs_flag? */
 196 #ifdef TODO
 197                 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
 198 #endif
 199                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
 200                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
 201         }
 202 }
 203
 204 static void
 205 blksz_changed_cb(void *arg, uint64_t newval)
 206 {
 207         zfsvfs_t *zfsvfs = arg;
 208
 209         if (newval < SPA_MINBLOCKSIZE ||
 210             newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
 211                 newval = SPA_MAXBLOCKSIZE;
 212
 213         zfsvfs->z_max_blksz = newval;
 214         zfsvfs->z_vfs->vfs_bsize = newval;
 215 }
 216
 217 static void
 218 readonly_changed_cb(void *arg, uint64_t newval)
 219 {
 220         zfsvfs_t *zfsvfs = arg;
 221
 222         if (newval) {
 223                 /* XXX locking on vfs_flag? */
 224                 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
 225                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
 226                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
 227         } else {
 228                 /* XXX locking on vfs_flag? */
 229                 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 230                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
 231                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
 232         }
 233 }
 234
 235 static void
 236 setuid_changed_cb(void *arg, uint64_t newval)
 237 {
 238         zfsvfs_t *zfsvfs = arg;
 239
 240         if (newval == FALSE) {
 241                 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
 242                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
 243                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
 244         } else {
 245                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
 246                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
 247                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
 248         }
 249 }
 250
 251 static void
 252 exec_changed_cb(void *arg, uint64_t newval)
 253 {
 254         zfsvfs_t *zfsvfs = arg;
 255
 256         if (newval == FALSE) {
 257                 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
 258                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
 259                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
 260         } else {
 261                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
 262                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
 263                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
 264         }
 265 }
 266
 267 /*
 268  * The nbmand mount option can be changed at mount time.
 269  * We can't allow it to be toggled on live file systems or incorrect
 270  * behavior may be seen from cifs clients
 271  *
 272  * This property isn't registered via dsl_prop_register(), but this callback
 273  * will be called when a file system is first mounted
 274  */
 275 static void
 276 nbmand_changed_cb(void *arg, uint64_t newval)
 277 {
 278         zfsvfs_t *zfsvfs = arg;
 279         if (newval == FALSE) {
 280                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
 281                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
 282         } else {
 283                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
 284                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
 285         }
 286 }
 287
 288 static void
 289 snapdir_changed_cb(void *arg, uint64_t newval)
 290 {
 291         zfsvfs_t *zfsvfs = arg;
 292
 293         zfsvfs->z_show_ctldir = newval;
 294 }
 295
 296 static void
 297 vscan_changed_cb(void *arg, uint64_t newval)
 298 {
 299         zfsvfs_t *zfsvfs = arg;
 300
 301         zfsvfs->z_vscan = newval;
 302 }
 303
 304 static void
 305 acl_mode_changed_cb(void *arg, uint64_t newval)
 306 {
 307         zfsvfs_t *zfsvfs = arg;
 308
 309         zfsvfs->z_acl_mode = newval;
 310 }
 311
 312 static void
 313 acl_inherit_changed_cb(void *arg, uint64_t newval)
 314 {
 315         zfsvfs_t *zfsvfs = arg;
 316
 317         zfsvfs->z_acl_inherit = newval;
 318 }
 319
 320 static int
 321 zfs_register_callbacks(vfs_t *vfsp)
 322 {
 323         struct dsl_dataset *ds = NULL;
 324         objset_t *os = NULL;
 325         zfsvfs_t *zfsvfs = NULL;
 326         uint64_t nbmand;
 327         int readonly, do_readonly = FALSE;
 328         int setuid, do_setuid = FALSE;
 329         int exec, do_exec = FALSE;
 330         int xattr, do_xattr = FALSE;
 331         int atime, do_atime = FALSE;
 332         int error = 0;
 333
 334         ASSERT(vfsp);
 335         zfsvfs = vfsp->vfs_data;
 336         ASSERT(zfsvfs);
 337         os = zfsvfs->z_os;
 338
 339         /*
 340          * The act of registering our callbacks will destroy any mount
 341          * options we may have.  In order to enable temporary overrides
 342          * of mount options, we stash away the current values and
 343          * restore them after we register the callbacks.
 344          */
 345         if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
 346                 readonly = B_TRUE;
 347                 do_readonly = B_TRUE;
 348         } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
 349                 readonly = B_FALSE;
 350                 do_readonly = B_TRUE;
 351         }
 352         if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 353                 setuid = B_FALSE;
 354                 do_setuid = B_TRUE;
 355         } else {
 356                 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 357                         setuid = B_FALSE;
 358                         do_setuid = B_TRUE;
 359                 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
 360                         setuid = B_TRUE;
 361                         do_setuid = B_TRUE;
 362                 }
 363         }
 364         if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 365                 exec = B_FALSE;
 366                 do_exec = B_TRUE;
 367         } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 368                 exec = B_TRUE;
 369                 do_exec = B_TRUE;
 370         }
 371         if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 372                 xattr = B_FALSE;
 373                 do_xattr = B_TRUE;
 374         } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 375                 xattr = B_TRUE;
 376                 do_xattr = B_TRUE;
 377         }
 378         if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
 379                 atime = B_FALSE;
 380                 do_atime = B_TRUE;
 381         } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
 382                 atime = B_TRUE;
 383                 do_atime = B_TRUE;
 384         }
 385
 386         /*
 387          * nbmand is a special property.  It can only be changed at
 388          * mount time.
 389          *
 390          * This is weird, but it is documented to only be changeable
 391          * at mount time.
 392          */
 393         if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
 394                 nbmand = B_FALSE;
 395         } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
 396                 nbmand = B_TRUE;
 397         } else {
 398                 char osname[MAXNAMELEN];
 399
 400                 dmu_objset_name(os, osname);
 401                 if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
 402                     NULL)) {
 403                         return (error);
 404                 }
 405         }
 406
 407         /*
 408          * Register property callbacks.
 409          *
 410          * It would probably be fine to just check for i/o error from
 411          * the first prop_register(), but I guess I like to go
 412          * overboard...
 413          */
 414         ds = dmu_objset_ds(os);
 415         error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
 416         error = error ? error : dsl_prop_register(ds,
 417             "xattr", xattr_changed_cb, zfsvfs);
 418         error = error ? error : dsl_prop_register(ds,
 419             "recordsize", blksz_changed_cb, zfsvfs);
 420         error = error ? error : dsl_prop_register(ds,
 421             "readonly", readonly_changed_cb, zfsvfs);
 422         error = error ? error : dsl_prop_register(ds,
 423             "setuid", setuid_changed_cb, zfsvfs);
 424         error = error ? error : dsl_prop_register(ds,
 425             "exec", exec_changed_cb, zfsvfs);
 426         error = error ? error : dsl_prop_register(ds,
 427             "snapdir", snapdir_changed_cb, zfsvfs);
 428         error = error ? error : dsl_prop_register(ds,
 429             "aclmode", acl_mode_changed_cb, zfsvfs);
 430         error = error ? error : dsl_prop_register(ds,
 431             "aclinherit", acl_inherit_changed_cb, zfsvfs);
 432         error = error ? error : dsl_prop_register(ds,
 433             "vscan", vscan_changed_cb, zfsvfs);
 434         if (error)
 435                 goto unregister;
 436
 437         /*
 438          * Invoke our callbacks to restore temporary mount options.
 439          */
 440         if (do_readonly)
 441                 readonly_changed_cb(zfsvfs, readonly);
 442         if (do_setuid)
 443                 setuid_changed_cb(zfsvfs, setuid);
 444         if (do_exec)
 445                 exec_changed_cb(zfsvfs, exec);
 446         if (do_xattr)
 447                 xattr_changed_cb(zfsvfs, xattr);
 448         if (do_atime)
 449                 atime_changed_cb(zfsvfs, atime);
 450
 451         nbmand_changed_cb(zfsvfs, nbmand);
 452
 453         return (0);
 454
 455 unregister:
 456         /*
 457          * We may attempt to unregister some callbacks that are not
 458          * registered, but this is OK; it will simply return ENOMSG,
 459          * which we will ignore.
 460          */
 461         (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
 462         (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
 463         (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
 464         (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
 465         (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
 466         (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
 467         (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
 468         (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
 469         (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
 470             zfsvfs);
 471         (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
 472         return (error);
 473
 474 }
 475
 476 static int
 477 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 478 {
 479         int error;
 480
 481         error = zfs_register_callbacks(zfsvfs->z_vfs);
 482         if (error)
 483                 return (error);
 484
 485         /*
 486          * Set the objset user_ptr to track its zfsvfs.
 487          */
 488         mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
 489         dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
 490         mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
 491
 492         /*
 493          * If we are not mounting (ie: online recv), then we don't
 494          * have to worry about replaying the log as we blocked all
 495          * operations out since we closed the ZIL.
 496          */
 497         if (mounting) {
 498                 boolean_t readonly;
 499
 500                 /*
 501                  * During replay we remove the read only flag to
 502                  * allow replays to succeed.
 503                  */
 504                 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
 505                 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 506
 507                 /*
 508                  * Parse and replay the intent log.
 509                  */
 510                 zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
 511                     zfs_replay_vector, zfs_unlinked_drain);
 512
 513                 zfs_unlinked_drain(zfsvfs);
 514                 zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
 515         }
 516
 517         if (!zil_disable)
 518                 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
 519
 520         return (0);
 521 }
 522
 523 static void
 524 zfs_freezfsvfs(zfsvfs_t *zfsvfs)
 525 {
 526         mutex_destroy(&zfsvfs->z_znodes_lock);
 527         mutex_destroy(&zfsvfs->z_online_recv_lock);
 528         list_destroy(&zfsvfs->z_all_znodes);
 529         rrw_destroy(&zfsvfs->z_teardown_lock);
 530         rw_destroy(&zfsvfs->z_teardown_inactive_lock);
 531         rw_destroy(&zfsvfs->z_fuid_lock);
 532         kmem_free(zfsvfs, sizeof (zfsvfs_t));
 533 }
 534
 535 static int
 536 zfs_domount(vfs_t *vfsp, char *osname)
 537 {
 538         uint64_t recordsize, readonly;
 539         int error = 0;
 540         int mode;
 541         zfsvfs_t *zfsvfs;
 542         znode_t *zp = NULL;
 543
 544         ASSERT(vfsp);
 545         ASSERT(osname);
 546
 547         /*
 548          * Initialize the zfs-specific filesystem structure.
 549          * Should probably make this a kmem cache, shuffle fields,
 550          * and just bzero up to z_hold_mtx[].
 551          */
 552         zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 553         zfsvfs->z_vfs = vfsp;
 554         zfsvfs->z_parent = zfsvfs;
 555         zfsvfs->z_assign = TXG_NOWAIT;
 556         zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
 557         zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 558
 559         mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 560         mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
 561         list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 562             offsetof(znode_t, z_link_node));
 563         rrw_init(&zfsvfs->z_teardown_lock);
 564         rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
 565         rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
 566
 567         if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
 568             NULL))
 569                 goto out;
 570         zfsvfs->z_vfs->vfs_bsize = recordsize;
 571
 572         vfsp->vfs_data = zfsvfs;
 573         vfsp->mnt_flag |= MNT_LOCAL;
 574         vfsp->mnt_kern_flag |= MNTK_MPSAFE;
 575         vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
 576         vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
 577
 578         if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
 579                 goto out;
 580
 581         mode = DS_MODE_OWNER;
 582         if (readonly)
 583                 mode |= DS_MODE_READONLY;
 584
 585         error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
 586         if (error == EROFS) {
 587                 mode = DS_MODE_OWNER | DS_MODE_READONLY;
 588                 error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
 589                     &zfsvfs->z_os);
 590         }
 591
 592         if (error)
 593                 goto out;
 594
 595         if (error = zfs_init_fs(zfsvfs, &zp))
 596                 goto out;
 597
 598         /*
 599          * Set features for file system.
 600          */
 601         zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 602         if (zfsvfs->z_use_fuids) {
 603                 vfs_set_feature(vfsp, VFSFT_XVATTR);
 604                 vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
 605                 vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS);
 606                 vfs_set_feature(vfsp, VFSFT_ACLONCREATE);
 607         }
 608         if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
 609                 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
 610                 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
 611                 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
 612         } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
 613                 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
 614                 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
 615         }
 616
 617         if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
 618                 uint64_t pval;
 619
 620                 ASSERT(mode & DS_MODE_READONLY);
 621                 atime_changed_cb(zfsvfs, B_FALSE);
 622                 readonly_changed_cb(zfsvfs, B_TRUE);
 623                 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
 624                         goto out;
 625                 xattr_changed_cb(zfsvfs, pval);
 626                 zfsvfs->z_issnap = B_TRUE;
 627         } else {
 628                 error = zfsvfs_setup(zfsvfs, B_TRUE);
 629         }
 630
 631         vfs_mountedfrom(vfsp, osname);
 632
 633         if (!zfsvfs->z_issnap)
 634                 zfsctl_create(zfsvfs);
 635 out:
 636         if (error) {
 637                 if (zfsvfs->z_os)
 638                         dmu_objset_close(zfsvfs->z_os);
 639                 zfs_freezfsvfs(zfsvfs);
 640         } else {
 641                 atomic_add_32(&zfs_active_fs_count, 1);
 642         }
 643
 644         return (error);
 645 }
 646
 647 void
 648 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
 649 {
 650         objset_t *os = zfsvfs->z_os;
 651         struct dsl_dataset *ds;
 652
 653         /*
 654          * Unregister properties.
 655          */
 656         if (!dmu_objset_is_snapshot(os)) {
 657                 ds = dmu_objset_ds(os);
 658                 VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
 659                     zfsvfs) == 0);
 660
 661                 VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
 662                     zfsvfs) == 0);
 663
 664                 VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
 665                     zfsvfs) == 0);
 666
 667                 VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
 668                     zfsvfs) == 0);
 669
 670                 VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
 671                     zfsvfs) == 0);
 672
 673                 VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
 674                     zfsvfs) == 0);
 675
 676                 VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
 677                     zfsvfs) == 0);
 678
 679                 VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
 680                     zfsvfs) == 0);
 681
 682                 VERIFY(dsl_prop_unregister(ds, "aclinherit",
 683                     acl_inherit_changed_cb, zfsvfs) == 0);
 684
 685                 VERIFY(dsl_prop_unregister(ds, "vscan",
 686                     vscan_changed_cb, zfsvfs) == 0);
 687         }
 688 }
 689
 690 /*ARGSUSED*/
 691 static int
 692 zfs_mount(vfs_t *vfsp)
 693 {
 694         kthread_t       *td = curthread;
 695         vnode_t         *mvp = vfsp->mnt_vnodecovered;
 696         cred_t          *cr = td->td_ucred;
 697         char            *osname;
 698         int             error = 0;
 699         int             canwrite;
 700
 701         if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
 702                 return (EINVAL);
 703
 704         /*
 705          * If full-owner-access is enabled and delegated administration is
 706          * turned on, we must set nosuid.
 707          */
 708         if (zfs_super_owner &&
 709             dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
 710                 secpolicy_fs_mount_clearopts(cr, vfsp);
 711         }
 712
 713         /*
 714          * Check for mount privilege?
 715          *
 716          * If we don't have privilege then see if
 717          * we have local permission to allow it
 718          */
 719         error = secpolicy_fs_mount(cr, mvp, vfsp);
 720         if (error) {
 721                 error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
 722                 if (error == 0) {
 723                         vattr_t         vattr;
 724
 725                         /*
 726                          * Make sure user is the owner of the mount point
 727                          * or has sufficient privileges.
 728                          */
 729
 730                         vattr.va_mask = AT_UID;
 731
 732                         if (error = VOP_GETATTR(mvp, &vattr, cr)) {
 733                                 goto out;
 734                         }
 735
 736 #if 0 /* CHECK THIS! Is probably needed for zfs_suser. */
 737                         if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
 738                             VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
 739                                 error = EPERM;
 740                                 goto out;
 741                         }
 742 #else
 743                         if (error = secpolicy_vnode_owner(mvp, cr, vattr.va_uid)) {
 744                                 goto out;
 745                         }
 746
 747                         if (error = VOP_ACCESS(mvp, VWRITE, cr, td)) {
 748                                 goto out;
 749                         }
 750 #endif
 751
 752                         secpolicy_fs_mount_clearopts(cr, vfsp);
 753                 } else {
 754                         goto out;
 755                 }
 756         }
 757
 758         /*
 759          * Refuse to mount a filesystem if we are in a local zone and the
 760          * dataset is not visible.
 761          */
 762         if (!INGLOBALZONE(curthread) &&
 763             (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
 764                 error = EPERM;
 765                 goto out;
 766         }
 767
 768         /*
 769          * When doing a remount, we simply refresh our temporary properties
 770          * according to those options set in the current VFS options.
 771          */
 772         if (vfsp->vfs_flag & MS_REMOUNT) {
 773                 /* refresh mount options */
 774                 zfs_unregister_callbacks(vfsp->vfs_data);
 775                 error = zfs_register_callbacks(vfsp);
 776                 goto out;
 777         }
 778
 779         DROP_GIANT();
 780         error = zfs_domount(vfsp, osname);
 781         PICKUP_GIANT();
 782 out:
 783         return (error);
 784 }
 785
 786 static int
 787 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
 788 {
 789         zfsvfs_t *zfsvfs = vfsp->vfs_data;
 790         uint64_t refdbytes, availbytes, usedobjs, availobjs;
 791
 792         statp->f_version = STATFS_VERSION;
 793
 794         ZFS_ENTER(zfsvfs);
 795
 796         dmu_objset_space(zfsvfs->z_os,
 797             &refdbytes, &availbytes, &usedobjs, &availobjs);
 798
 799         /*
 800          * The underlying storage pool actually uses multiple block sizes.
 801          * We report the fragsize as the smallest block size we support,
 802          * and we report our blocksize as the filesystem's maximum blocksize.
 803          */
 804         statp->f_bsize = zfsvfs->z_vfs->vfs_bsize;
 805         statp->f_iosize = zfsvfs->z_vfs->vfs_bsize;
 806
 807         /*
 808          * The following report "total" blocks of various kinds in the
 809          * file system, but reported in terms of f_frsize - the
 810          * "fragment" size.
 811          */
 812
 813         statp->f_blocks = (refdbytes + availbytes) / statp->f_bsize;
 814         statp->f_bfree = availbytes / statp->f_bsize;
 815         statp->f_bavail = statp->f_bfree; /* no root reservation */
 816
 817         /*
 818          * statvfs() should really be called statufs(), because it assumes
 819          * static metadata.  ZFS doesn't preallocate files, so the best
 820          * we can do is report the max that could possibly fit in f_files,
 821          * and that minus the number actually used in f_ffree.
 822          * For f_ffree, report the smaller of the number of object available
 823          * and the number of blocks (each object will take at least a block).
 824          */
 825         statp->f_ffree = MIN(availobjs, statp->f_bfree);
 826         statp->f_files = statp->f_ffree + usedobjs;
 827
 828         /*
 829          * We're a zfs filesystem.
 830          */
 831         (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
 832
 833         strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
 834             sizeof(statp->f_mntfromname));
 835         strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
 836             sizeof(statp->f_mntonname));
 837
 838         statp->f_namemax = ZFS_MAXNAMELEN;
 839
 840         ZFS_EXIT(zfsvfs);
 841         return (0);
 842 }
 843
 844 static int
 845 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
 846 {
 847         zfsvfs_t *zfsvfs = vfsp->vfs_data;
 848         znode_t *rootzp;
 849         int error;
 850
 851         ZFS_ENTER(zfsvfs);
 852
 853         error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
 854         if (error == 0) {
 855                 *vpp = ZTOV(rootzp);
 856                 error = vn_lock(*vpp, flags);
 857                 (*vpp)->v_vflag |= VV_ROOT;
 858         }
 859
 860         ZFS_EXIT(zfsvfs);
 861         return (error);
 862 }
 863
 864 /*
 865  * Teardown the zfsvfs::z_os.
 866  *
 867  * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock'
 868  * and 'z_teardown_inactive_lock' held.
 869  */
 870 static int
 871 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
 872 {
 873         znode_t *zp;
 874
 875         rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 876
 877         if (!unmounting) {
 878                 /*
 879                  * We purge the parent filesystem's vfsp as the parent
 880                  * filesystem and all of its snapshots have their vnode's
 881                  * v_vfsp set to the parent's filesystem's vfsp.  Note,
 882                  * 'z_parent' is self referential for non-snapshots.
 883                  */
 884                 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
 885         }
 886
 887         /*
 888          * Close the zil. NB: Can't close the zil while zfs_inactive
 889          * threads are blocked as zil_close can call zfs_inactive.
 890          */
 891         if (zfsvfs->z_log) {
 892                 zil_close(zfsvfs->z_log);
 893                 zfsvfs->z_log = NULL;
 894         }
 895
 896         rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
 897
 898         /*
 899          * If we are not unmounting (ie: online recv) and someone already
 900          * unmounted this file system while we were doing the switcheroo,
 901          * or a reopen of z_os failed then just bail out now.
 902          */
 903         if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
 904                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
 905                 rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
 906                 return (EIO);
 907         }
 908
 909         /*
 910          * At this point there are no vops active, and any new vops will
 911          * fail with EIO since we have z_teardown_lock for writer (only
 912          * relavent for forced unmount).
 913          *
 914          * Release all holds on dbufs.
 915          */
 916         mutex_enter(&zfsvfs->z_znodes_lock);
 917         for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
 918             zp = list_next(&zfsvfs->z_all_znodes, zp))
 919                 if (zp->z_dbuf) {
 920                         ASSERT(ZTOV(zp)->v_count > 0);
 921                         zfs_znode_dmu_fini(zp);
 922                 }
 923         mutex_exit(&zfsvfs->z_znodes_lock);
 924
 925         /*
 926          * If we are unmounting, set the unmounted flag and let new vops
 927          * unblock.  zfs_inactive will have the unmounted behavior, and all
 928          * other vops will fail with EIO.
 929          */
 930         if (unmounting) {
 931                 zfsvfs->z_unmounted = B_TRUE;
 932                 rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
 933                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
 934         }
 935
 936         /*
 937          * z_os will be NULL if there was an error in attempting to reopen
 938          * zfsvfs, so just return as the properties had already been
 939          * unregistered and cached data had been evicted before.
 940          */
 941         if (zfsvfs->z_os == NULL)
 942                 return (0);
 943
 944         /*
 945          * Unregister properties.
 946          */
 947         zfs_unregister_callbacks(zfsvfs);
 948
 949         /*
 950          * Evict cached data
 951          */
 952         if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
 953                 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
 954                 (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
 955         }
 956
 957         return (0);
 958 }
 959
 960 /*ARGSUSED*/
 961 static int
 962 zfs_umount(vfs_t *vfsp, int fflag)
 963 {
 964         zfsvfs_t *zfsvfs = vfsp->vfs_data;
 965         objset_t *os;
 966         cred_t *cr = curthread->td_ucred;
 967         int ret;
 968
 969         if (fflag & MS_FORCE) {
 970                 /* TODO: Force unmount is not well implemented yet, so deny it. */
 971                 ZFS_LOG(0, "Force unmount is experimental - report any problems.");
 972         }
 973
 974         ret = secpolicy_fs_unmount(cr, vfsp);
 975         if (ret) {
 976                 ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
 977                     ZFS_DELEG_PERM_MOUNT, cr);
 978                 if (ret)
 979                         return (ret);
 980         }
 981         /*
 982          * We purge the parent filesystem's vfsp as the parent filesystem
 983          * and all of its snapshots have their vnode's v_vfsp set to the
 984          * parent's filesystem's vfsp.  Note, 'z_parent' is self
 985          * referential for non-snapshots.
 986          */
 987         (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
 988
 989         /*
 990          * Unmount any snapshots mounted under .zfs before unmounting the
 991          * dataset itself.
 992          */
 993         if (zfsvfs->z_ctldir != NULL) {
 994                 if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
 995                         return (ret);
 996                 ret = vflush(vfsp, 0, 0, curthread);
 997                 ASSERT(ret == EBUSY);
 998                 if (!(fflag & MS_FORCE)) {
 999                         if (zfsvfs->z_ctldir->v_count > 1)
1000                                 return (EBUSY);
1001                         ASSERT(zfsvfs->z_ctldir->v_count == 1);
1002                 }
1003                 zfsctl_destroy(zfsvfs);
1004                 ASSERT(zfsvfs->z_ctldir == NULL);
1005         }
1006
1007         /*
1008          * Flush all the files.
1009          */
1010         ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, curthread);
1011         if (ret != 0) {
1012                 if (!zfsvfs->z_issnap) {
1013                         zfsctl_create(zfsvfs);
1014                         ASSERT(zfsvfs->z_ctldir != NULL);
1015                 }
1016                 return (ret);
1017         }
1018
1019         if (!(fflag & MS_FORCE)) {
1020                 /*
1021                  * Check the number of active vnodes in the file system.
1022                  * Our count is maintained in the vfs structure, but the
1023                  * number is off by 1 to indicate a hold on the vfs
1024                  * structure itself.
1025                  *
1026                  * The '.zfs' directory maintains a reference of its
1027                  * own, and any active references underneath are
1028                  * reflected in the vnode count.
1029                  */
1030                 if (zfsvfs->z_ctldir == NULL) {
1031                         if (vfsp->vfs_count > 1)
1032                                 return (EBUSY);
1033                 } else {
1034                         if (vfsp->vfs_count > 2 ||
1035                             zfsvfs->z_ctldir->v_count > 1)
1036                                 return (EBUSY);
1037                 }
1038         } else {
1039                 MNT_ILOCK(vfsp);
1040                 vfsp->mnt_kern_flag |= MNTK_UNMOUNTF;
1041                 MNT_IUNLOCK(vfsp);
1042         }
1043
1044         VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
1045         os = zfsvfs->z_os;
1046
1047         /*
1048          * z_os will be NULL if there was an error in
1049          * attempting to reopen zfsvfs.
1050          */
1051         if (os != NULL) {
1052                 /*
1053                  * Unset the objset user_ptr.
1054                  */
1055                 mutex_enter(&os->os->os_user_ptr_lock);
1056                 dmu_objset_set_user(os, NULL);
1057                 mutex_exit(&os->os->os_user_ptr_lock);
1058
1059                 /*
1060                  * Finally release the objset
1061                  */
1062                 dmu_objset_close(os);
1063         }
1064
1065         /*
1066          * We can now safely destroy the '.zfs' directory node.
1067          */
1068         if (zfsvfs->z_ctldir != NULL)
1069                 zfsctl_destroy(zfsvfs);
1070         if (zfsvfs->z_issnap) {
1071                 vnode_t *svp = vfsp->mnt_vnodecovered;
1072
1073                 ASSERT(svp->v_count == 2 || svp->v_count == 1);
1074                 if (svp->v_count == 2)
1075                         VN_RELE(svp);
1076         }
1077         zfs_freevfs(vfsp);
1078
1079         return (0);
1080 }
1081
1082 static int
1083 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
1084 {
1085         zfsvfs_t        *zfsvfs = vfsp->vfs_data;
1086         znode_t         *zp;
1087         int             err;
1088
1089         ZFS_ENTER(zfsvfs);
1090         err = zfs_zget(zfsvfs, ino, &zp);
1091         if (err == 0 && zp->z_unlinked) {
1092                 VN_RELE(ZTOV(zp));
1093                 err = EINVAL;
1094         }
1095         if (err != 0)
1096                 *vpp = NULL;
1097         else {
1098                 *vpp = ZTOV(zp);
1099                 vn_lock(*vpp, flags);
1100         }
1101         ZFS_EXIT(zfsvfs);
1102         return (err);
1103 }
1104
1105 static int
1106 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
1107 {
1108         zfsvfs_t        *zfsvfs = vfsp->vfs_data;
1109         znode_t         *zp;
1110         uint64_t        object = 0;
1111         uint64_t        fid_gen = 0;
1112         uint64_t        gen_mask;
1113         uint64_t        zp_gen;
1114         int             i, err;
1115
1116         *vpp = NULL;
1117
1118         ZFS_ENTER(zfsvfs);
1119
1120         if (fidp->fid_len == LONG_FID_LEN) {
1121                 zfid_long_t     *zlfid = (zfid_long_t *)fidp;
1122                 uint64_t        objsetid = 0;
1123                 uint64_t        setgen = 0;
1124
1125                 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1126                         objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1127
1128                 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1129                         setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1130
1131                 ZFS_EXIT(zfsvfs);
1132
1133                 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1134                 if (err)
1135                         return (EINVAL);
1136                 ZFS_ENTER(zfsvfs);
1137         }
1138
1139         if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1140                 zfid_short_t    *zfid = (zfid_short_t *)fidp;
1141
1142                 for (i = 0; i < sizeof (zfid->zf_object); i++)
1143                         object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1144
1145                 for (i = 0; i < sizeof (zfid->zf_gen); i++)
1146                         fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1147         } else {
1148                 ZFS_EXIT(zfsvfs);
1149                 return (EINVAL);
1150         }
1151
1152         /* A zero fid_gen means we are in the .zfs control directories */
1153         if (fid_gen == 0 &&
1154             (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
1155                 *vpp = zfsvfs->z_ctldir;
1156                 ASSERT(*vpp != NULL);
1157                 if (object == ZFSCTL_INO_SNAPDIR) {
1158                         VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
1159                             0, NULL, NULL, NULL, NULL, NULL) == 0);
1160                 } else {
1161                         VN_HOLD(*vpp);
1162                 }
1163                 ZFS_EXIT(zfsvfs);
1164                 /* XXX: LK_RETRY? */
1165                 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
1166                 return (0);
1167         }
1168
1169         gen_mask = -1ULL >> (64 - 8 * i);
1170
1171         dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
1172         if (err = zfs_zget(zfsvfs, object, &zp)) {
1173                 ZFS_EXIT(zfsvfs);
1174                 return (err);
1175         }
1176         zp_gen = zp->z_phys->zp_gen & gen_mask;
1177         if (zp_gen == 0)
1178                 zp_gen = 1;
1179         if (zp->z_unlinked || zp_gen != fid_gen) {
1180                 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
1181                 VN_RELE(ZTOV(zp));
1182                 ZFS_EXIT(zfsvfs);
1183                 return (EINVAL);
1184         }
1185
1186         *vpp = ZTOV(zp);
1187         /* XXX: LK_RETRY? */
1188         vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
1189         vnode_create_vobject(*vpp, zp->z_phys->zp_size, curthread);
1190         ZFS_EXIT(zfsvfs);
1191         return (0);
1192 }
1193
1194 /*
1195  * Block out VOPs and close zfsvfs_t::z_os
1196  *
1197  * Note, if successful, then we return with the 'z_teardown_lock' and
1198  * 'z_teardown_inactive_lock' write held.
1199  */
1200 int
1201 zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode)
1202 {
1203         int error;
1204
1205         if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1206                 return (error);
1207
1208         *mode = zfsvfs->z_os->os_mode;
1209         dmu_objset_name(zfsvfs->z_os, name);
1210         dmu_objset_close(zfsvfs->z_os);
1211
1212         return (0);
1213 }
1214
1215 /*
1216  * Reopen zfsvfs_t::z_os and release VOPs.
1217  */
1218 int
1219 zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
1220 {
1221         int err;
1222
1223         ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
1224         ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
1225
1226         err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
1227         if (err) {
1228                 zfsvfs->z_os = NULL;
1229         } else {
1230                 znode_t *zp;
1231
1232                 VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
1233
1234                 /*
1235                  * Attempt to re-establish all the active znodes with
1236                  * their dbufs.  If a zfs_rezget() fails, then we'll let
1237                  * any potential callers discover that via ZFS_ENTER_VERIFY_VP
1238                  * when they try to use their znode.
1239                  */
1240                 mutex_enter(&zfsvfs->z_znodes_lock);
1241                 for (zp = list_head(&zfsvfs->z_all_znodes); zp;
1242                     zp = list_next(&zfsvfs->z_all_znodes, zp)) {
1243                         (void) zfs_rezget(zp);
1244                 }
1245                 mutex_exit(&zfsvfs->z_znodes_lock);
1246
1247         }
1248
1249         /* release the VOPs */
1250         rw_exit(&zfsvfs->z_teardown_inactive_lock);
1251         rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1252
1253         if (err) {
1254                 /*
1255                  * Since we couldn't reopen zfsvfs::z_os, force
1256                  * unmount this file system.
1257                  */
1258                 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
1259                         (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
1260         }
1261         return (err);
1262 }
1263
1264 static void
1265 zfs_freevfs(vfs_t *vfsp)
1266 {
1267         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1268         int i;
1269
1270         for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1271                 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1272
1273         zfs_fuid_destroy(zfsvfs);
1274         zfs_freezfsvfs(zfsvfs);
1275
1276         atomic_add_32(&zfs_active_fs_count, -1);
1277 }
1278
1279 #ifdef __i386__
1280 static int desiredvnodes_backup;
1281 #endif
1282
1283 static void
1284 zfs_vnodes_adjust(void)
1285 {
1286 #ifdef __i386__
1287         int newdesiredvnodes;
1288
1289         desiredvnodes_backup = desiredvnodes;
1290
1291         /*
1292          * We calculate newdesiredvnodes the same way it is done in
1293          * vntblinit(). If it is equal to desiredvnodes, it means that
1294          * it wasn't tuned by the administrator and we can tune it down.
1295          */
1296         newdesiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 *
1297             vm_kmem_size / (5 * (sizeof(struct vm_object) +
1298             sizeof(struct vnode))));
1299         if (newdesiredvnodes == desiredvnodes)
1300                 desiredvnodes = (3 * newdesiredvnodes) / 4;
1301 #endif
1302 }
1303
1304 static void
1305 zfs_vnodes_adjust_back(void)
1306 {
1307
1308 #ifdef __i386__
1309         desiredvnodes = desiredvnodes_backup;
1310 #endif
1311 }
1312
1313 void
1314 zfs_init(void)
1315 {
1316
1317         printf("ZFS filesystem version " SPA_VERSION_STRING "\n");
1318
1319         /*
1320          * Initialize znode cache, vnode ops, etc...
1321          */
1322         zfs_znode_init();
1323
1324         /*
1325          * Initialize .zfs directory structures
1326          */
1327         zfsctl_init();
1328
1329         /*
1330          * Reduce number of vnode. Originally number of vnodes is calculated
1331          * with UFS inode in mind. We reduce it here, because it's too big for
1332          * ZFS/i386.
1333          */
1334         zfs_vnodes_adjust();
1335 }
1336
1337 void
1338 zfs_fini(void)
1339 {
1340         zfsctl_fini();
1341         zfs_znode_fini();
1342         zfs_vnodes_adjust_back();
1343 }
1344
1345 int
1346 zfs_busy(void)
1347 {
1348         return (zfs_active_fs_count != 0);
1349 }
1350
1351 int
1352 zfs_set_version(const char *name, uint64_t newvers)
1353 {
1354         int error;
1355         objset_t *os;
1356         dmu_tx_t *tx;
1357         uint64_t curvers;
1358
1359         /*
1360          * XXX for now, require that the filesystem be unmounted.  Would
1361          * be nice to find the zfsvfs_t and just update that if
1362          * possible.
1363          */
1364
1365         if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
1366                 return (EINVAL);
1367
1368         error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os);
1369         if (error)
1370                 return (error);
1371
1372         error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
1373             8, 1, &curvers);
1374         if (error)
1375                 goto out;
1376         if (newvers < curvers) {
1377                 error = EINVAL;
1378                 goto out;
1379         }
1380
1381         tx = dmu_tx_create(os);
1382         dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR);
1383         error = dmu_tx_assign(tx, TXG_WAIT);
1384         if (error) {
1385                 dmu_tx_abort(tx);
1386                 goto out;
1387         }
1388         error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1,
1389             &newvers, tx);
1390
1391         spa_history_internal_log(LOG_DS_UPGRADE,
1392             dmu_objset_spa(os), tx, CRED(),
1393             "oldver=%llu newver=%llu dataset = %llu", curvers, newvers,
1394             dmu_objset_id(os));
1395         dmu_tx_commit(tx);
1396
1397 out:
1398         dmu_objset_close(os);
1399         return (error);
1400 }
1401 /*
1402  * Read a property stored within the master node.
1403  */
1404 int
1405 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
1406 {
1407         const char *pname;
1408         int error = ENOENT;
1409
1410         /*
1411          * Look up the file system's value for the property.  For the
1412          * version property, we look up a slightly different string.
1413          */
1414         if (prop == ZFS_PROP_VERSION)
1415                 pname = ZPL_VERSION_STR;
1416         else
1417                 pname = zfs_prop_to_name(prop);
1418
1419         if (os != NULL)
1420                 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
1421
1422         if (error == ENOENT) {
1423                 /* No value set, use the default value */
1424                 switch (prop) {
1425                 case ZFS_PROP_VERSION:
1426                         *value = ZPL_VERSION;
1427                         break;
1428                 case ZFS_PROP_NORMALIZE:
1429                 case ZFS_PROP_UTF8ONLY:
1430                         *value = 0;
1431                         break;
1432                 case ZFS_PROP_CASE:
1433                         *value = ZFS_CASE_SENSITIVE;
1434                         break;
1435                 default:
1436                         return (error);
1437                 }
1438                 error = 0;
1439         }
1440         return (error);
1441 }