sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27
  28 #include <sys/types.h>
  29 #include <sys/param.h>
  30 #include <sys/systm.h>
  31 #include <sys/kernel.h>
  32 #include <sys/sysmacros.h>
  33 #include <sys/kmem.h>
  34 #include <sys/acl.h>
  35 #include <sys/vnode.h>
  36 #include <sys/vfs.h>
  37 #include <sys/mntent.h>
  38 #include <sys/mount.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/zfs_znode.h>
  41 #include <sys/zfs_dir.h>
  42 #include <sys/zil.h>
  43 #include <sys/fs/zfs.h>
  44 #include <sys/dmu.h>
  45 #include <sys/dsl_prop.h>
  46 #include <sys/dsl_dataset.h>
  47 #include <sys/spa.h>
  48 #include <sys/zap.h>
  49 #include <sys/varargs.h>
  50 #include <sys/atomic.h>
  51 #include <sys/zfs_ioctl.h>
  52 #include <sys/zfs_ctldir.h>
  53 #include <sys/dnlc.h>
  54
  55 struct mtx atomic_mtx;
  56 MTX_SYSINIT(atomic, &atomic_mtx, "atomic", MTX_DEF);
  57
  58 struct mtx zfs_debug_mtx;
  59 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
  60 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
  61 int zfs_debug_level = 0;
  62 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
  63     "Debug level");
  64
  65 static int zfs_mount(vfs_t *vfsp, kthread_t *td);
  66 static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td);
  67 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td);
  68 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td);
  69 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
  70 static int zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td);
  71 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
  72 static void zfs_objset_close(zfsvfs_t *zfsvfs);
  73 static void zfs_freevfs(vfs_t *vfsp);
  74
  75 static struct vfsops zfs_vfsops = {
  76         .vfs_mount =            zfs_mount,
  77         .vfs_unmount =          zfs_umount,
  78         .vfs_root =             zfs_root,
  79         .vfs_statfs =           zfs_statfs,
  80         .vfs_vget =             zfs_vget,
  81         .vfs_sync =             zfs_sync,
  82         .vfs_fhtovp =           zfs_fhtovp,
  83 };
  84
  85 VFS_SET(zfs_vfsops, zfs, VFCF_JAIL);
  86
  87 /*
  88  * We need to keep a count of active fs's.
  89  * This is necessary to prevent our module
  90  * from being unloaded after a umount -f
  91  */
  92 static uint32_t zfs_active_fs_count = 0;
  93
  94 /*ARGSUSED*/
  95 static int
  96 zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td)
  97 {
  98
  99         /*
 100          * Data integrity is job one.  We don't want a compromised kernel
 101          * writing to the storage pool, so we never sync during panic.
 102          */
 103         if (panicstr)
 104                 return (0);
 105
 106         if (vfsp != NULL) {
 107                 /*
 108                  * Sync a specific filesystem.
 109                  */
 110                 zfsvfs_t *zfsvfs = vfsp->vfs_data;
 111                 int error;
 112
 113                 error = vfs_stdsync(vfsp, waitfor, td);
 114                 if (error != 0)
 115                         return (error);
 116
 117                 ZFS_ENTER(zfsvfs);
 118                 if (zfsvfs->z_log != NULL)
 119                         zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
 120                 else
 121                         txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
 122                 ZFS_EXIT(zfsvfs);
 123         } else {
 124                 /*
 125                  * Sync all ZFS filesystems.  This is what happens when you
 126                  * run sync(1M).  Unlike other filesystems, ZFS honors the
 127                  * request by waiting for all pools to commit all dirty data.
 128                  */
 129                 spa_sync_allpools();
 130         }
 131
 132         return (0);
 133 }
 134
 135 static void
 136 atime_changed_cb(void *arg, uint64_t newval)
 137 {
 138         zfsvfs_t *zfsvfs = arg;
 139
 140         if (newval == TRUE) {
 141                 zfsvfs->z_atime = TRUE;
 142                 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
 143                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
 144                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
 145         } else {
 146                 zfsvfs->z_atime = FALSE;
 147                 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
 148                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
 149                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
 150         }
 151 }
 152
 153 static void
 154 xattr_changed_cb(void *arg, uint64_t newval)
 155 {
 156         zfsvfs_t *zfsvfs = arg;
 157
 158         if (newval == TRUE) {
 159                 /* XXX locking on vfs_flag? */
 160 #ifdef TODO
 161                 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
 162 #endif
 163                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
 164                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
 165         } else {
 166                 /* XXX locking on vfs_flag? */
 167 #ifdef TODO
 168                 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
 169 #endif
 170                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
 171                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
 172         }
 173 }
 174
 175 static void
 176 blksz_changed_cb(void *arg, uint64_t newval)
 177 {
 178         zfsvfs_t *zfsvfs = arg;
 179
 180         if (newval < SPA_MINBLOCKSIZE ||
 181             newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
 182                 newval = SPA_MAXBLOCKSIZE;
 183
 184         zfsvfs->z_max_blksz = newval;
 185         zfsvfs->z_vfs->vfs_bsize = newval;
 186 }
 187
 188 static void
 189 readonly_changed_cb(void *arg, uint64_t newval)
 190 {
 191         zfsvfs_t *zfsvfs = arg;
 192
 193         if (newval) {
 194                 /* XXX locking on vfs_flag? */
 195                 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
 196                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
 197                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
 198         } else {
 199                 /* XXX locking on vfs_flag? */
 200                 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 201                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
 202                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
 203         }
 204 }
 205
 206 static void
 207 setuid_changed_cb(void *arg, uint64_t newval)
 208 {
 209         zfsvfs_t *zfsvfs = arg;
 210
 211         if (newval == FALSE) {
 212                 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
 213                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
 214                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
 215         } else {
 216                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
 217                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
 218                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
 219         }
 220 }
 221
 222 static void
 223 exec_changed_cb(void *arg, uint64_t newval)
 224 {
 225         zfsvfs_t *zfsvfs = arg;
 226
 227         if (newval == FALSE) {
 228                 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
 229                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
 230                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
 231         } else {
 232                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
 233                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
 234                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
 235         }
 236 }
 237
 238 static void
 239 snapdir_changed_cb(void *arg, uint64_t newval)
 240 {
 241         zfsvfs_t *zfsvfs = arg;
 242
 243         zfsvfs->z_show_ctldir = newval;
 244 }
 245
 246 static void
 247 acl_mode_changed_cb(void *arg, uint64_t newval)
 248 {
 249         zfsvfs_t *zfsvfs = arg;
 250
 251         zfsvfs->z_acl_mode = newval;
 252 }
 253
 254 static void
 255 acl_inherit_changed_cb(void *arg, uint64_t newval)
 256 {
 257         zfsvfs_t *zfsvfs = arg;
 258
 259         zfsvfs->z_acl_inherit = newval;
 260 }
 261
 262 static int
 263 zfs_refresh_properties(vfs_t *vfsp)
 264 {
 265         zfsvfs_t *zfsvfs = vfsp->vfs_data;
 266
 267         /*
 268          * Remount operations default to "rw" unless "ro" is explicitly
 269          * specified.
 270          */
 271         if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
 272                 readonly_changed_cb(zfsvfs, B_TRUE);
 273         } else {
 274                 if (!dmu_objset_is_snapshot(zfsvfs->z_os))
 275                         readonly_changed_cb(zfsvfs, B_FALSE);
 276                 else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
 277                         return (EROFS);
 278         }
 279
 280         if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 281                 setuid_changed_cb(zfsvfs, B_FALSE);
 282         } else {
 283                 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
 284                         setuid_changed_cb(zfsvfs, B_FALSE);
 285                 else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
 286                         setuid_changed_cb(zfsvfs, B_TRUE);
 287         }
 288
 289         if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
 290                 exec_changed_cb(zfsvfs, B_FALSE);
 291         else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
 292                 exec_changed_cb(zfsvfs, B_TRUE);
 293
 294         if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
 295                 atime_changed_cb(zfsvfs, B_TRUE);
 296         else if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
 297                 atime_changed_cb(zfsvfs, B_FALSE);
 298
 299         if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
 300                 xattr_changed_cb(zfsvfs, B_TRUE);
 301         else if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL))
 302                 xattr_changed_cb(zfsvfs, B_FALSE);
 303
 304         return (0);
 305 }
 306
 307 static int
 308 zfs_register_callbacks(vfs_t *vfsp)
 309 {
 310         struct dsl_dataset *ds = NULL;
 311         objset_t *os = NULL;
 312         zfsvfs_t *zfsvfs = NULL;
 313         int readonly, do_readonly = FALSE;
 314         int setuid, do_setuid = FALSE;
 315         int exec, do_exec = FALSE;
 316         int xattr, do_xattr = FALSE;
 317         int error = 0;
 318
 319         ASSERT(vfsp);
 320         zfsvfs = vfsp->vfs_data;
 321         ASSERT(zfsvfs);
 322         os = zfsvfs->z_os;
 323
 324         /*
 325          * The act of registering our callbacks will destroy any mount
 326          * options we may have.  In order to enable temporary overrides
 327          * of mount options, we stash away the current values and
 328          * restore them after we register the callbacks.
 329          */
 330         if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
 331                 readonly = B_TRUE;
 332                 do_readonly = B_TRUE;
 333         } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
 334                 readonly = B_FALSE;
 335                 do_readonly = B_TRUE;
 336         }
 337         if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 338                 setuid = B_FALSE;
 339                 do_setuid = B_TRUE;
 340         } else {
 341                 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 342                         setuid = B_FALSE;
 343                         do_setuid = B_TRUE;
 344                 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
 345                         setuid = B_TRUE;
 346                         do_setuid = B_TRUE;
 347                 }
 348         }
 349         if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 350                 exec = B_FALSE;
 351                 do_exec = B_TRUE;
 352         } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 353                 exec = B_TRUE;
 354                 do_exec = B_TRUE;
 355         }
 356         if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 357                 xattr = B_FALSE;
 358                 do_xattr = B_TRUE;
 359         } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 360                 xattr = B_TRUE;
 361                 do_xattr = B_TRUE;
 362         }
 363
 364         /*
 365          * Register property callbacks.
 366          *
 367          * It would probably be fine to just check for i/o error from
 368          * the first prop_register(), but I guess I like to go
 369          * overboard...
 370          */
 371         ds = dmu_objset_ds(os);
 372         error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
 373         error = error ? error : dsl_prop_register(ds,
 374             "xattr", xattr_changed_cb, zfsvfs);
 375         error = error ? error : dsl_prop_register(ds,
 376             "recordsize", blksz_changed_cb, zfsvfs);
 377         error = error ? error : dsl_prop_register(ds,
 378             "readonly", readonly_changed_cb, zfsvfs);
 379         error = error ? error : dsl_prop_register(ds,
 380             "setuid", setuid_changed_cb, zfsvfs);
 381         error = error ? error : dsl_prop_register(ds,
 382             "exec", exec_changed_cb, zfsvfs);
 383         error = error ? error : dsl_prop_register(ds,
 384             "snapdir", snapdir_changed_cb, zfsvfs);
 385         error = error ? error : dsl_prop_register(ds,
 386             "aclmode", acl_mode_changed_cb, zfsvfs);
 387         error = error ? error : dsl_prop_register(ds,
 388             "aclinherit", acl_inherit_changed_cb, zfsvfs);
 389         if (error)
 390                 goto unregister;
 391
 392         /*
 393          * Invoke our callbacks to restore temporary mount options.
 394          */
 395         if (do_readonly)
 396                 readonly_changed_cb(zfsvfs, readonly);
 397         if (do_setuid)
 398                 setuid_changed_cb(zfsvfs, setuid);
 399         if (do_exec)
 400                 exec_changed_cb(zfsvfs, exec);
 401         if (do_xattr)
 402                 xattr_changed_cb(zfsvfs, xattr);
 403
 404         return (0);
 405
 406 unregister:
 407         /*
 408          * We may attempt to unregister some callbacks that are not
 409          * registered, but this is OK; it will simply return ENOMSG,
 410          * which we will ignore.
 411          */
 412         (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
 413         (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
 414         (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
 415         (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
 416         (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
 417         (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
 418         (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
 419         (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
 420         (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
 421             zfsvfs);
 422         return (error);
 423
 424 }
 425
 426 static int
 427 zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td)
 428 {
 429         cred_t *cr = td->td_ucred;
 430         uint64_t recordsize, readonly;
 431         int error = 0;
 432         int mode;
 433         zfsvfs_t *zfsvfs;
 434         znode_t *zp = NULL;
 435
 436         ASSERT(vfsp);
 437         ASSERT(osname);
 438
 439         /*
 440          * Initialize the zfs-specific filesystem structure.
 441          * Should probably make this a kmem cache, shuffle fields,
 442          * and just bzero up to z_hold_mtx[].
 443          */
 444         zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 445         zfsvfs->z_vfs = vfsp;
 446         zfsvfs->z_parent = zfsvfs;
 447         zfsvfs->z_assign = TXG_NOWAIT;
 448         zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
 449         zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 450
 451         mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 452         list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 453             offsetof(znode_t, z_link_node));
 454         rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL);
 455
 456         if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
 457             NULL))
 458                 goto out;
 459         zfsvfs->z_vfs->vfs_bsize = recordsize;
 460
 461         vfsp->vfs_data = zfsvfs;
 462         vfsp->mnt_flag |= MNT_LOCAL;
 463         vfsp->mnt_kern_flag |= MNTK_MPSAFE;
 464         vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
 465
 466         if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
 467                 goto out;
 468
 469         if (readonly)
 470                 mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
 471         else
 472                 mode = DS_MODE_PRIMARY;
 473
 474         error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
 475         if (error == EROFS) {
 476                 /*
 477                  * FreeBSD: In Solaris there is DS_MODE_PRIMARY instead of
 478                  * DS_MODE_STANDARD, but it doesn't work on FreeBSD and
 479                  * I don't know why. It looks like the dataset is opened
 480                  * on mount DS_MODE_PRIMARY mode and snapshot cannot open
 481                  * the same dataset in DS_MODE_PRIMARY mode again.
 482                  */
 483                 mode = DS_MODE_STANDARD | DS_MODE_READONLY;
 484                 error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
 485                     &zfsvfs->z_os);
 486         }
 487
 488         if (error)
 489                 goto out;
 490
 491         if (error = zfs_init_fs(zfsvfs, &zp, cr))
 492                 goto out;
 493
 494         if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
 495                 uint64_t xattr;
 496
 497                 ASSERT(mode & DS_MODE_READONLY);
 498                 atime_changed_cb(zfsvfs, B_FALSE);
 499                 readonly_changed_cb(zfsvfs, B_TRUE);
 500                 if (error = dsl_prop_get_integer(osname, "xattr", &xattr, NULL))
 501                         goto out;
 502                 xattr_changed_cb(zfsvfs, xattr);
 503                 zfsvfs->z_issnap = B_TRUE;
 504         } else {
 505                 error = zfs_register_callbacks(vfsp);
 506                 if (error)
 507                         goto out;
 508
 509                 zfs_unlinked_drain(zfsvfs);
 510
 511                 /*
 512                  * Parse and replay the intent log.
 513                  */
 514                 zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
 515                     zfs_replay_vector);
 516
 517                 if (!zil_disable)
 518                         zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
 519         }
 520
 521         vfs_mountedfrom(vfsp, osname);
 522
 523         if (!zfsvfs->z_issnap)
 524                 zfsctl_create(zfsvfs);
 525 out:
 526         if (error) {
 527                 if (zfsvfs->z_os)
 528                         dmu_objset_close(zfsvfs->z_os);
 529                 rw_destroy(&zfsvfs->z_um_lock);
 530                 mutex_destroy(&zfsvfs->z_znodes_lock);
 531                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
 532         } else {
 533                 atomic_add_32(&zfs_active_fs_count, 1);
 534         }
 535
 536         return (error);
 537
 538 }
 539
 540 void
 541 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
 542 {
 543         objset_t *os = zfsvfs->z_os;
 544         struct dsl_dataset *ds;
 545
 546         /*
 547          * Unregister properties.
 548          */
 549         if (!dmu_objset_is_snapshot(os)) {
 550                 ds = dmu_objset_ds(os);
 551                 VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
 552                     zfsvfs) == 0);
 553
 554                 VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
 555                     zfsvfs) == 0);
 556
 557                 VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
 558                     zfsvfs) == 0);
 559
 560                 VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
 561                     zfsvfs) == 0);
 562
 563                 VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
 564                     zfsvfs) == 0);
 565
 566                 VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
 567                     zfsvfs) == 0);
 568
 569                 VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
 570                     zfsvfs) == 0);
 571
 572                 VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
 573                     zfsvfs) == 0);
 574
 575                 VERIFY(dsl_prop_unregister(ds, "aclinherit",
 576                     acl_inherit_changed_cb, zfsvfs) == 0);
 577         }
 578 }
 579
 580 /*ARGSUSED*/
 581 static int
 582 zfs_mount(vfs_t *vfsp, kthread_t *td)
 583 {
 584         char *from;
 585         int error;
 586
 587         /* TODO: For now deny user mounts. */
 588         if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
 589                 return (error);
 590
 591         /*
 592          * When doing a remount, we simply refresh our temporary properties
 593          * according to those options set in the current VFS options.
 594          */
 595         if (vfsp->vfs_flag & MS_REMOUNT)
 596                 return (zfs_refresh_properties(vfsp));
 597
 598         if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&from, NULL))
 599                 return (EINVAL);
 600
 601         DROP_GIANT();
 602         error = zfs_domount(vfsp, from, td);
 603         PICKUP_GIANT();
 604         return (error);
 605 }
 606
 607 static int
 608 zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td)
 609 {
 610         zfsvfs_t *zfsvfs = vfsp->vfs_data;
 611         uint64_t refdbytes, availbytes, usedobjs, availobjs;
 612
 613         statp->f_version = STATFS_VERSION;
 614
 615         ZFS_ENTER(zfsvfs);
 616
 617         dmu_objset_space(zfsvfs->z_os,
 618             &refdbytes, &availbytes, &usedobjs, &availobjs);
 619
 620         /*
 621          * The underlying storage pool actually uses multiple block sizes.
 622          * We report the fragsize as the smallest block size we support,
 623          * and we report our blocksize as the filesystem's maximum blocksize.
 624          */
 625         statp->f_bsize = zfsvfs->z_vfs->vfs_bsize;
 626         statp->f_iosize = zfsvfs->z_vfs->vfs_bsize;
 627
 628         /*
 629          * The following report "total" blocks of various kinds in the
 630          * file system, but reported in terms of f_frsize - the
 631          * "fragment" size.
 632          */
 633
 634         statp->f_blocks = (refdbytes + availbytes) / statp->f_bsize;
 635         statp->f_bfree = availbytes / statp->f_bsize;
 636         statp->f_bavail = statp->f_bfree; /* no root reservation */
 637
 638         /*
 639          * statvfs() should really be called statufs(), because it assumes
 640          * static metadata.  ZFS doesn't preallocate files, so the best
 641          * we can do is report the max that could possibly fit in f_files,
 642          * and that minus the number actually used in f_ffree.
 643          * For f_ffree, report the smaller of the number of object available
 644          * and the number of blocks (each object will take at least a block).
 645          */
 646         statp->f_ffree = MIN(availobjs, statp->f_bfree);
 647         statp->f_files = statp->f_ffree + usedobjs;
 648
 649         /*
 650          * We're a zfs filesystem.
 651          */
 652         (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
 653
 654         strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
 655             sizeof(statp->f_mntfromname));
 656         strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
 657             sizeof(statp->f_mntonname));
 658
 659         statp->f_namemax = ZFS_MAXNAMELEN;
 660
 661         ZFS_EXIT(zfsvfs);
 662         return (0);
 663 }
 664
 665 static int
 666 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td)
 667 {
 668         zfsvfs_t *zfsvfs = vfsp->vfs_data;
 669         znode_t *rootzp;
 670         int error;
 671
 672         ZFS_ENTER(zfsvfs);
 673
 674         error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
 675         if (error == 0) {
 676                 *vpp = ZTOV(rootzp);
 677                 error = vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
 678                 (*vpp)->v_vflag |= VV_ROOT;
 679         }
 680
 681         ZFS_EXIT(zfsvfs);
 682         return (error);
 683 }
 684
 685 /*ARGSUSED*/
 686 static int
 687 zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td)
 688 {
 689         zfsvfs_t *zfsvfs = vfsp->vfs_data;
 690         cred_t *cr = td->td_ucred;
 691         int ret;
 692
 693         if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0)
 694                 return (ret);
 695
 696         (void) dnlc_purge_vfsp(vfsp, 0);
 697
 698         /*
 699          * Unmount any snapshots mounted under .zfs before unmounting the
 700          * dataset itself.
 701          */
 702         if (zfsvfs->z_ctldir != NULL) {
 703                 if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
 704                         return (ret);
 705                 ret = vflush(vfsp, 0, 0, td);
 706                 ASSERT(ret == EBUSY);
 707                 if (!(fflag & MS_FORCE)) {
 708                         if (zfsvfs->z_ctldir->v_count > 1)
 709                                 return (EBUSY);
 710                         ASSERT(zfsvfs->z_ctldir->v_count == 1);
 711                 }
 712                 zfsctl_destroy(zfsvfs);
 713                 ASSERT(zfsvfs->z_ctldir == NULL);
 714         }
 715
 716         /*
 717          * Flush all the files.
 718          */
 719         ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
 720         if (ret != 0) {
 721                 if (!zfsvfs->z_issnap) {
 722                         zfsctl_create(zfsvfs);
 723                         ASSERT(zfsvfs->z_ctldir != NULL);
 724                 }
 725                 return (ret);
 726         }
 727
 728         if (fflag & MS_FORCE) {
 729                 MNT_ILOCK(vfsp);
 730                 vfsp->mnt_kern_flag |= MNTK_UNMOUNTF;
 731                 MNT_IUNLOCK(vfsp);
 732                 zfsvfs->z_unmounted1 = B_TRUE;
 733
 734                 /*
 735                  * Wait for all zfs threads to leave zfs.
 736                  * Grabbing a rwlock as reader in all vops and
 737                  * as writer here doesn't work because it too easy to get
 738                  * multiple reader enters as zfs can re-enter itself.
 739                  * This can lead to deadlock if there is an intervening
 740                  * rw_enter as writer.
 741                  * So a file system threads ref count (z_op_cnt) is used.
 742                  * A polling loop on z_op_cnt may seem inefficient, but
 743                  * - this saves all threads on exit from having to grab a
 744                  *   mutex in order to cv_signal
 745                  * - only occurs on forced unmount in the rare case when
 746                  *   there are outstanding threads within the file system.
 747                  */
 748                 while (zfsvfs->z_op_cnt) {
 749                         delay(1);
 750                 }
 751         }
 752
 753         zfs_objset_close(zfsvfs);
 754         VFS_RELE(vfsp);
 755         zfs_freevfs(vfsp);
 756
 757         return (0);
 758 }
 759
 760 static int
 761 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
 762 {
 763         zfsvfs_t        *zfsvfs = vfsp->vfs_data;
 764         znode_t         *zp;
 765         int             err;
 766
 767         ZFS_ENTER(zfsvfs);
 768         err = zfs_zget(zfsvfs, ino, &zp);
 769         if (err == 0 && zp->z_unlinked) {
 770                 VN_RELE(ZTOV(zp));
 771                 err = EINVAL;
 772         }
 773         if (err != 0)
 774                 *vpp = NULL;
 775         else {
 776                 *vpp = ZTOV(zp);
 777                 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
 778         }
 779         ZFS_EXIT(zfsvfs);
 780         return (0);
 781 }
 782
 783 static int
 784 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
 785 {
 786         kthread_t       *td = curthread;
 787         zfsvfs_t        *zfsvfs = vfsp->vfs_data;
 788         znode_t         *zp;
 789         uint64_t        object = 0;
 790         uint64_t        fid_gen = 0;
 791         uint64_t        gen_mask;
 792         uint64_t        zp_gen;
 793         int             i, err;
 794
 795         *vpp = NULL;
 796
 797         ZFS_ENTER(zfsvfs);
 798
 799         if (fidp->fid_len == LONG_FID_LEN) {
 800                 zfid_long_t     *zlfid = (zfid_long_t *)fidp;
 801                 uint64_t        objsetid = 0;
 802                 uint64_t        setgen = 0;
 803
 804                 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
 805                         objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
 806
 807                 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
 808                         setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
 809
 810                 ZFS_EXIT(zfsvfs);
 811
 812                 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
 813                 if (err)
 814                         return (EINVAL);
 815                 ZFS_ENTER(zfsvfs);
 816         }
 817
 818         if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
 819                 zfid_short_t    *zfid = (zfid_short_t *)fidp;
 820
 821                 for (i = 0; i < sizeof (zfid->zf_object); i++)
 822                         object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
 823
 824                 for (i = 0; i < sizeof (zfid->zf_gen); i++)
 825                         fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
 826         } else {
 827                 ZFS_EXIT(zfsvfs);
 828                 return (EINVAL);
 829         }
 830
 831         /* A zero fid_gen means we are in the .zfs control directories */
 832         if (fid_gen == 0 &&
 833             (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
 834                 *vpp = zfsvfs->z_ctldir;
 835                 ASSERT(*vpp != NULL);
 836                 if (object == ZFSCTL_INO_SNAPDIR) {
 837                         VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
 838                             0, NULL, NULL) == 0);
 839                 } else {
 840                         VN_HOLD(*vpp);
 841                 }
 842                 ZFS_EXIT(zfsvfs);
 843                 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
 844                 return (0);
 845         }
 846
 847         gen_mask = -1ULL >> (64 - 8 * i);
 848
 849         dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
 850         if (err = zfs_zget(zfsvfs, object, &zp)) {
 851                 ZFS_EXIT(zfsvfs);
 852                 return (err);
 853         }
 854         zp_gen = zp->z_phys->zp_gen & gen_mask;
 855         if (zp_gen == 0)
 856                 zp_gen = 1;
 857         if (zp->z_unlinked || zp_gen != fid_gen) {
 858                 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
 859                 VN_RELE(ZTOV(zp));
 860                 ZFS_EXIT(zfsvfs);
 861                 return (EINVAL);
 862         }
 863
 864         *vpp = ZTOV(zp);
 865         vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
 866         vnode_create_vobject(*vpp, zp->z_phys->zp_size, td);
 867         ZFS_EXIT(zfsvfs);
 868         return (0);
 869 }
 870
 871 static void
 872 zfs_objset_close(zfsvfs_t *zfsvfs)
 873 {
 874         znode_t         *zp, *nextzp;
 875         objset_t        *os = zfsvfs->z_os;
 876
 877         /*
 878          * For forced unmount, at this point all vops except zfs_inactive
 879          * are erroring EIO. We need to now suspend zfs_inactive threads
 880          * while we are freeing dbufs before switching zfs_inactive
 881          * to use behaviour without a objset.
 882          */
 883         rw_enter(&zfsvfs->z_um_lock, RW_WRITER);
 884
 885         /*
 886          * Release all holds on dbufs
 887          * Note, although we have stopped all other vop threads and
 888          * zfs_inactive(), the dmu can callback via znode_pageout_func()
 889          * which can zfs_znode_free() the znode.
 890          * So we lock z_all_znodes; search the list for a held
 891          * dbuf; drop the lock (we know zp can't disappear if we hold
 892          * a dbuf lock; then regrab the lock and restart.
 893          */
 894         mutex_enter(&zfsvfs->z_znodes_lock);
 895         for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) {
 896                 nextzp = list_next(&zfsvfs->z_all_znodes, zp);
 897                 if (zp->z_dbuf_held) {
 898                         /* dbufs should only be held when force unmounting */
 899                         zp->z_dbuf_held = 0;
 900                         mutex_exit(&zfsvfs->z_znodes_lock);
 901                         dmu_buf_rele(zp->z_dbuf, NULL);
 902                         /* Start again */
 903                         mutex_enter(&zfsvfs->z_znodes_lock);
 904                         nextzp = list_head(&zfsvfs->z_all_znodes);
 905                 }
 906         }
 907         mutex_exit(&zfsvfs->z_znodes_lock);
 908
 909         /*
 910          * Unregister properties.
 911          */
 912         if (!dmu_objset_is_snapshot(os))
 913                 zfs_unregister_callbacks(zfsvfs);
 914
 915         /*
 916          * Switch zfs_inactive to behaviour without an objset.
 917          * It just tosses cached pages and frees the znode & vnode.
 918          * Then re-enable zfs_inactive threads in that new behaviour.
 919          */
 920         zfsvfs->z_unmounted2 = B_TRUE;
 921         rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */
 922
 923         /*
 924          * Close the zil. Can't close the zil while zfs_inactive
 925          * threads are blocked as zil_close can call zfs_inactive.
 926          */
 927         if (zfsvfs->z_log) {
 928                 zil_close(zfsvfs->z_log);
 929                 zfsvfs->z_log = NULL;
 930         }
 931
 932         /*
 933          * Evict all dbufs so that cached znodes will be freed
 934          */
 935         if (dmu_objset_evict_dbufs(os, 1)) {
 936                 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
 937                 (void) dmu_objset_evict_dbufs(os, 0);
 938         }
 939
 940         /*
 941          * Finally close the objset
 942          */
 943         dmu_objset_close(os);
 944 }
 945
 946 static void
 947 zfs_freevfs(vfs_t *vfsp)
 948 {
 949         zfsvfs_t *zfsvfs = vfsp->vfs_data;
 950         int i;
 951
 952         for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 953                 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
 954         rw_destroy(&zfsvfs->z_um_lock);
 955         mutex_destroy(&zfsvfs->z_znodes_lock);
 956         kmem_free(zfsvfs, sizeof (zfsvfs_t));
 957
 958         atomic_add_32(&zfs_active_fs_count, -1);
 959 }
 960
 961 void
 962 zfs_init(void)
 963 {
 964
 965         printf("ZFS filesystem version " ZFS_VERSION_STRING "\n");
 966
 967         /*
 968          * Initialize .zfs directory structures
 969          */
 970         zfsctl_init();
 971
 972         /*
 973          * Initialize znode cache, vnode ops, etc...
 974          */
 975         zfs_znode_init();
 976 }
 977
 978 void
 979 zfs_fini(void)
 980 {
 981         zfsctl_fini();
 982         zfs_znode_fini();
 983 }
 984
 985 int
 986 zfs_busy(void)
 987 {
 988         return (zfs_active_fs_count != 0);
 989 }