sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27
  28 #include <sys/types.h>
  29 #include <sys/param.h>
  30 #include <sys/systm.h>
  31 #include <sys/kernel.h>
  32 #include <sys/sysmacros.h>
  33 #include <sys/kmem.h>
  34 #include <sys/acl.h>
  35 #include <sys/vnode.h>
  36 #include <sys/vfs.h>
  37 #include <sys/mntent.h>
  38 #include <sys/mount.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/zfs_znode.h>
  41 #include <sys/zfs_dir.h>
  42 #include <sys/zil.h>
  43 #include <sys/fs/zfs.h>
  44 #include <sys/dmu.h>
  45 #include <sys/dsl_prop.h>
  46 #include <sys/dsl_dataset.h>
  47 #include <sys/spa.h>
  48 #include <sys/zap.h>
  49 #include <sys/varargs.h>
  50 #include <sys/policy.h>
  51 #include <sys/atomic.h>
  52 #include <sys/zfs_ioctl.h>
  53 #include <sys/zfs_ctldir.h>
  54 #include <sys/sunddi.h>
  55 #include <sys/dnlc.h>
  56
  57 struct mtx zfs_debug_mtx;
  58 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
  59 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
  60 int zfs_debug_level = 0;
  61 TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level);
  62 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
  63     "Debug level");
  64
  65 static int zfs_mount(vfs_t *vfsp, kthread_t *td);
  66 static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td);
  67 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td);
  68 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td);
  69 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
  70 static int zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td);
  71 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
  72 static void zfs_objset_close(zfsvfs_t *zfsvfs);
  73 static void zfs_freevfs(vfs_t *vfsp);
  74
  75 static struct vfsops zfs_vfsops = {
  76         .vfs_mount =            zfs_mount,
  77         .vfs_unmount =          zfs_umount,
  78         .vfs_root =             zfs_root,
  79         .vfs_statfs =           zfs_statfs,
  80         .vfs_vget =             zfs_vget,
  81         .vfs_sync =             zfs_sync,
  82         .vfs_fhtovp =           zfs_fhtovp,
  83 };
  84
  85 VFS_SET(zfs_vfsops, zfs, VFCF_JAIL);
  86
  87 /*
  88  * We need to keep a count of active fs's.
  89  * This is necessary to prevent our module
  90  * from being unloaded after a umount -f
  91  */
  92 static uint32_t zfs_active_fs_count = 0;
  93
  94 /*ARGSUSED*/
  95 static int
  96 zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td)
  97 {
  98
  99         /*
 100          * Data integrity is job one.  We don't want a compromised kernel
 101          * writing to the storage pool, so we never sync during panic.
 102          */
 103         if (panicstr)
 104                 return (0);
 105
 106         if (vfsp != NULL) {
 107                 /*
 108                  * Sync a specific filesystem.
 109                  */
 110                 zfsvfs_t *zfsvfs = vfsp->vfs_data;
 111                 int error;
 112
 113                 error = vfs_stdsync(vfsp, waitfor, td);
 114                 if (error != 0)
 115                         return (error);
 116
 117                 ZFS_ENTER(zfsvfs);
 118                 if (zfsvfs->z_log != NULL)
 119                         zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
 120                 else
 121                         txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
 122                 ZFS_EXIT(zfsvfs);
 123         } else {
 124                 /*
 125                  * Sync all ZFS filesystems.  This is what happens when you
 126                  * run sync(1M).  Unlike other filesystems, ZFS honors the
 127                  * request by waiting for all pools to commit all dirty data.
 128                  */
 129                 spa_sync_allpools();
 130         }
 131
 132         return (0);
 133 }
 134
 135 static void
 136 atime_changed_cb(void *arg, uint64_t newval)
 137 {
 138         zfsvfs_t *zfsvfs = arg;
 139
 140         if (newval == TRUE) {
 141                 zfsvfs->z_atime = TRUE;
 142                 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
 143                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
 144                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
 145         } else {
 146                 zfsvfs->z_atime = FALSE;
 147                 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
 148                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
 149                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
 150         }
 151 }
 152
 153 static void
 154 xattr_changed_cb(void *arg, uint64_t newval)
 155 {
 156         zfsvfs_t *zfsvfs = arg;
 157
 158         if (newval == TRUE) {
 159                 /* XXX locking on vfs_flag? */
 160 #ifdef TODO
 161                 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
 162 #endif
 163                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
 164                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
 165         } else {
 166                 /* XXX locking on vfs_flag? */
 167 #ifdef TODO
 168                 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
 169 #endif
 170                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
 171                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
 172         }
 173 }
 174
 175 static void
 176 blksz_changed_cb(void *arg, uint64_t newval)
 177 {
 178         zfsvfs_t *zfsvfs = arg;
 179
 180         if (newval < SPA_MINBLOCKSIZE ||
 181             newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
 182                 newval = SPA_MAXBLOCKSIZE;
 183
 184         zfsvfs->z_max_blksz = newval;
 185         zfsvfs->z_vfs->vfs_bsize = newval;
 186 }
 187
 188 static void
 189 readonly_changed_cb(void *arg, uint64_t newval)
 190 {
 191         zfsvfs_t *zfsvfs = arg;
 192
 193         if (newval) {
 194                 /* XXX locking on vfs_flag? */
 195                 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
 196                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
 197                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
 198         } else {
 199                 /* XXX locking on vfs_flag? */
 200                 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 201                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
 202                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
 203         }
 204 }
 205
 206 static void
 207 setuid_changed_cb(void *arg, uint64_t newval)
 208 {
 209         zfsvfs_t *zfsvfs = arg;
 210
 211         if (newval == FALSE) {
 212                 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
 213                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
 214                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
 215         } else {
 216                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
 217                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
 218                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
 219         }
 220 }
 221
 222 static void
 223 exec_changed_cb(void *arg, uint64_t newval)
 224 {
 225         zfsvfs_t *zfsvfs = arg;
 226
 227         if (newval == FALSE) {
 228                 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
 229                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
 230                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
 231         } else {
 232                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
 233                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
 234                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
 235         }
 236 }
 237
 238 static void
 239 snapdir_changed_cb(void *arg, uint64_t newval)
 240 {
 241         zfsvfs_t *zfsvfs = arg;
 242
 243         zfsvfs->z_show_ctldir = newval;
 244 }
 245
 246 static void
 247 acl_mode_changed_cb(void *arg, uint64_t newval)
 248 {
 249         zfsvfs_t *zfsvfs = arg;
 250
 251         zfsvfs->z_acl_mode = newval;
 252 }
 253
 254 static void
 255 acl_inherit_changed_cb(void *arg, uint64_t newval)
 256 {
 257         zfsvfs_t *zfsvfs = arg;
 258
 259         zfsvfs->z_acl_inherit = newval;
 260 }
 261
 262 static int
 263 zfs_refresh_properties(vfs_t *vfsp)
 264 {
 265         zfsvfs_t *zfsvfs = vfsp->vfs_data;
 266
 267         /*
 268          * Remount operations default to "rw" unless "ro" is explicitly
 269          * specified.
 270          */
 271         if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
 272                 readonly_changed_cb(zfsvfs, B_TRUE);
 273         } else {
 274                 if (!dmu_objset_is_snapshot(zfsvfs->z_os))
 275                         readonly_changed_cb(zfsvfs, B_FALSE);
 276                 else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
 277                         return (EROFS);
 278         }
 279
 280         if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 281                 setuid_changed_cb(zfsvfs, B_FALSE);
 282         } else {
 283                 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
 284                         setuid_changed_cb(zfsvfs, B_FALSE);
 285                 else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
 286                         setuid_changed_cb(zfsvfs, B_TRUE);
 287         }
 288
 289         if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
 290                 exec_changed_cb(zfsvfs, B_FALSE);
 291         else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
 292                 exec_changed_cb(zfsvfs, B_TRUE);
 293
 294         if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
 295                 atime_changed_cb(zfsvfs, B_TRUE);
 296         else if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
 297                 atime_changed_cb(zfsvfs, B_FALSE);
 298
 299         if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
 300                 xattr_changed_cb(zfsvfs, B_TRUE);
 301         else if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL))
 302                 xattr_changed_cb(zfsvfs, B_FALSE);
 303
 304         return (0);
 305 }
 306
 307 static int
 308 zfs_register_callbacks(vfs_t *vfsp)
 309 {
 310         struct dsl_dataset *ds = NULL;
 311         objset_t *os = NULL;
 312         zfsvfs_t *zfsvfs = NULL;
 313         int readonly, do_readonly = FALSE;
 314         int setuid, do_setuid = FALSE;
 315         int exec, do_exec = FALSE;
 316         int xattr, do_xattr = FALSE;
 317         int error = 0;
 318
 319         ASSERT(vfsp);
 320         zfsvfs = vfsp->vfs_data;
 321         ASSERT(zfsvfs);
 322         os = zfsvfs->z_os;
 323
 324         /*
 325          * The act of registering our callbacks will destroy any mount
 326          * options we may have.  In order to enable temporary overrides
 327          * of mount options, we stash away the current values and
 328          * restore them after we register the callbacks.
 329          */
 330         if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
 331                 readonly = B_TRUE;
 332                 do_readonly = B_TRUE;
 333         } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
 334                 readonly = B_FALSE;
 335                 do_readonly = B_TRUE;
 336         }
 337         if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 338                 setuid = B_FALSE;
 339                 do_setuid = B_TRUE;
 340         } else {
 341                 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 342                         setuid = B_FALSE;
 343                         do_setuid = B_TRUE;
 344                 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
 345                         setuid = B_TRUE;
 346                         do_setuid = B_TRUE;
 347                 }
 348         }
 349         if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 350                 exec = B_FALSE;
 351                 do_exec = B_TRUE;
 352         } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 353                 exec = B_TRUE;
 354                 do_exec = B_TRUE;
 355         }
 356         if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 357                 xattr = B_FALSE;
 358                 do_xattr = B_TRUE;
 359         } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 360                 xattr = B_TRUE;
 361                 do_xattr = B_TRUE;
 362         }
 363
 364         /*
 365          * Register property callbacks.
 366          *
 367          * It would probably be fine to just check for i/o error from
 368          * the first prop_register(), but I guess I like to go
 369          * overboard...
 370          */
 371         ds = dmu_objset_ds(os);
 372         error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
 373         error = error ? error : dsl_prop_register(ds,
 374             "xattr", xattr_changed_cb, zfsvfs);
 375         error = error ? error : dsl_prop_register(ds,
 376             "recordsize", blksz_changed_cb, zfsvfs);
 377         error = error ? error : dsl_prop_register(ds,
 378             "readonly", readonly_changed_cb, zfsvfs);
 379         error = error ? error : dsl_prop_register(ds,
 380             "setuid", setuid_changed_cb, zfsvfs);
 381         error = error ? error : dsl_prop_register(ds,
 382             "exec", exec_changed_cb, zfsvfs);
 383         error = error ? error : dsl_prop_register(ds,
 384             "snapdir", snapdir_changed_cb, zfsvfs);
 385         error = error ? error : dsl_prop_register(ds,
 386             "aclmode", acl_mode_changed_cb, zfsvfs);
 387         error = error ? error : dsl_prop_register(ds,
 388             "aclinherit", acl_inherit_changed_cb, zfsvfs);
 389         if (error)
 390                 goto unregister;
 391
 392         /*
 393          * Invoke our callbacks to restore temporary mount options.
 394          */
 395         if (do_readonly)
 396                 readonly_changed_cb(zfsvfs, readonly);
 397         if (do_setuid)
 398                 setuid_changed_cb(zfsvfs, setuid);
 399         if (do_exec)
 400                 exec_changed_cb(zfsvfs, exec);
 401         if (do_xattr)
 402                 xattr_changed_cb(zfsvfs, xattr);
 403
 404         return (0);
 405
 406 unregister:
 407         /*
 408          * We may attempt to unregister some callbacks that are not
 409          * registered, but this is OK; it will simply return ENOMSG,
 410          * which we will ignore.
 411          */
 412         (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
 413         (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
 414         (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
 415         (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
 416         (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
 417         (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
 418         (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
 419         (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
 420         (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
 421             zfsvfs);
 422         return (error);
 423
 424 }
 425
 426 static int
 427 zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td)
 428 {
 429         cred_t *cr = td->td_ucred;
 430         uint64_t recordsize, readonly;
 431         int error = 0;
 432         int mode;
 433         zfsvfs_t *zfsvfs;
 434         znode_t *zp = NULL;
 435
 436         ASSERT(vfsp);
 437         ASSERT(osname);
 438
 439         /*
 440          * Initialize the zfs-specific filesystem structure.
 441          * Should probably make this a kmem cache, shuffle fields,
 442          * and just bzero up to z_hold_mtx[].
 443          */
 444         zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 445         zfsvfs->z_vfs = vfsp;
 446         zfsvfs->z_parent = zfsvfs;
 447         zfsvfs->z_assign = TXG_NOWAIT;
 448         zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
 449         zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 450
 451         mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 452         list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 453             offsetof(znode_t, z_link_node));
 454         rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL);
 455
 456         if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
 457             NULL))
 458                 goto out;
 459         zfsvfs->z_vfs->vfs_bsize = recordsize;
 460
 461         vfsp->vfs_data = zfsvfs;
 462         vfsp->mnt_flag |= MNT_LOCAL;
 463         vfsp->mnt_kern_flag |= MNTK_MPSAFE;
 464         vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
 465
 466         if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
 467                 goto out;
 468
 469         if (readonly)
 470                 mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
 471         else
 472                 mode = DS_MODE_PRIMARY;
 473
 474         error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
 475         if (error == EROFS) {
 476                 mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
 477                 error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
 478                     &zfsvfs->z_os);
 479         }
 480
 481         if (error)
 482                 goto out;
 483
 484         if (error = zfs_init_fs(zfsvfs, &zp, cr))
 485                 goto out;
 486
 487         if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
 488                 uint64_t xattr;
 489
 490                 ASSERT(mode & DS_MODE_READONLY);
 491                 atime_changed_cb(zfsvfs, B_FALSE);
 492                 readonly_changed_cb(zfsvfs, B_TRUE);
 493                 if (error = dsl_prop_get_integer(osname, "xattr", &xattr, NULL))
 494                         goto out;
 495                 xattr_changed_cb(zfsvfs, xattr);
 496                 zfsvfs->z_issnap = B_TRUE;
 497         } else {
 498                 error = zfs_register_callbacks(vfsp);
 499                 if (error)
 500                         goto out;
 501
 502                 zfs_unlinked_drain(zfsvfs);
 503
 504                 /*
 505                  * Parse and replay the intent log.
 506                  */
 507                 zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
 508                     zfs_replay_vector);
 509
 510                 if (!zil_disable)
 511                         zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
 512         }
 513
 514         vfs_mountedfrom(vfsp, osname);
 515
 516         if (!zfsvfs->z_issnap)
 517                 zfsctl_create(zfsvfs);
 518 out:
 519         if (error) {
 520                 if (zfsvfs->z_os)
 521                         dmu_objset_close(zfsvfs->z_os);
 522                 rw_destroy(&zfsvfs->z_um_lock);
 523                 mutex_destroy(&zfsvfs->z_znodes_lock);
 524                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
 525         } else {
 526                 atomic_add_32(&zfs_active_fs_count, 1);
 527         }
 528
 529         return (error);
 530
 531 }
 532
 533 void
 534 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
 535 {
 536         objset_t *os = zfsvfs->z_os;
 537         struct dsl_dataset *ds;
 538
 539         /*
 540          * Unregister properties.
 541          */
 542         if (!dmu_objset_is_snapshot(os)) {
 543                 ds = dmu_objset_ds(os);
 544                 VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
 545                     zfsvfs) == 0);
 546
 547                 VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
 548                     zfsvfs) == 0);
 549
 550                 VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
 551                     zfsvfs) == 0);
 552
 553                 VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
 554                     zfsvfs) == 0);
 555
 556                 VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
 557                     zfsvfs) == 0);
 558
 559                 VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
 560                     zfsvfs) == 0);
 561
 562                 VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
 563                     zfsvfs) == 0);
 564
 565                 VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
 566                     zfsvfs) == 0);
 567
 568                 VERIFY(dsl_prop_unregister(ds, "aclinherit",
 569                     acl_inherit_changed_cb, zfsvfs) == 0);
 570         }
 571 }
 572
 573 /*ARGSUSED*/
 574 static int
 575 zfs_mount(vfs_t *vfsp, kthread_t *td)
 576 {
 577         char *from;
 578         int error;
 579
 580         /*
 581          * When doing a remount, we simply refresh our temporary properties
 582          * according to those options set in the current VFS options.
 583          */
 584         if (vfsp->vfs_flag & MS_REMOUNT)
 585                 return (zfs_refresh_properties(vfsp));
 586
 587         if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&from, NULL))
 588                 return (EINVAL);
 589
 590         DROP_GIANT();
 591         error = zfs_domount(vfsp, from, td);
 592         PICKUP_GIANT();
 593         return (error);
 594 }
 595
 596 static int
 597 zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td)
 598 {
 599         zfsvfs_t *zfsvfs = vfsp->vfs_data;
 600         uint64_t refdbytes, availbytes, usedobjs, availobjs;
 601
 602         statp->f_version = STATFS_VERSION;
 603
 604         ZFS_ENTER(zfsvfs);
 605
 606         dmu_objset_space(zfsvfs->z_os,
 607             &refdbytes, &availbytes, &usedobjs, &availobjs);
 608
 609         /*
 610          * The underlying storage pool actually uses multiple block sizes.
 611          * We report the fragsize as the smallest block size we support,
 612          * and we report our blocksize as the filesystem's maximum blocksize.
 613          */
 614         statp->f_bsize = zfsvfs->z_vfs->vfs_bsize;
 615         statp->f_iosize = zfsvfs->z_vfs->vfs_bsize;
 616
 617         /*
 618          * The following report "total" blocks of various kinds in the
 619          * file system, but reported in terms of f_frsize - the
 620          * "fragment" size.
 621          */
 622
 623         statp->f_blocks = (refdbytes + availbytes) / statp->f_bsize;
 624         statp->f_bfree = availbytes / statp->f_bsize;
 625         statp->f_bavail = statp->f_bfree; /* no root reservation */
 626
 627         /*
 628          * statvfs() should really be called statufs(), because it assumes
 629          * static metadata.  ZFS doesn't preallocate files, so the best
 630          * we can do is report the max that could possibly fit in f_files,
 631          * and that minus the number actually used in f_ffree.
 632          * For f_ffree, report the smaller of the number of object available
 633          * and the number of blocks (each object will take at least a block).
 634          */
 635         statp->f_ffree = MIN(availobjs, statp->f_bfree);
 636         statp->f_files = statp->f_ffree + usedobjs;
 637
 638         /*
 639          * We're a zfs filesystem.
 640          */
 641         (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
 642
 643         strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
 644             sizeof(statp->f_mntfromname));
 645         strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
 646             sizeof(statp->f_mntonname));
 647
 648         statp->f_namemax = ZFS_MAXNAMELEN;
 649
 650         ZFS_EXIT(zfsvfs);
 651         return (0);
 652 }
 653
 654 static int
 655 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td)
 656 {
 657         zfsvfs_t *zfsvfs = vfsp->vfs_data;
 658         znode_t *rootzp;
 659         int error;
 660
 661         ZFS_ENTER(zfsvfs);
 662
 663         error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
 664         if (error == 0) {
 665                 *vpp = ZTOV(rootzp);
 666                 error = vn_lock(*vpp, flags, td);
 667                 (*vpp)->v_vflag |= VV_ROOT;
 668         }
 669
 670         ZFS_EXIT(zfsvfs);
 671         return (error);
 672 }
 673
 674 /*ARGSUSED*/
 675 static int
 676 zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td)
 677 {
 678         zfsvfs_t *zfsvfs = vfsp->vfs_data;
 679         cred_t *cr = td->td_ucred;
 680         int ret;
 681
 682         if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0)
 683                 return (ret);
 684
 685         (void) dnlc_purge_vfsp(vfsp, 0);
 686
 687         /*
 688          * Unmount any snapshots mounted under .zfs before unmounting the
 689          * dataset itself.
 690          */
 691         if (zfsvfs->z_ctldir != NULL) {
 692                 if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
 693                         return (ret);
 694                 ret = vflush(vfsp, 0, 0, td);
 695                 ASSERT(ret == EBUSY);
 696                 if (!(fflag & MS_FORCE)) {
 697                         if (zfsvfs->z_ctldir->v_count > 1)
 698                                 return (EBUSY);
 699                         ASSERT(zfsvfs->z_ctldir->v_count == 1);
 700                 }
 701                 zfsctl_destroy(zfsvfs);
 702                 ASSERT(zfsvfs->z_ctldir == NULL);
 703         }
 704
 705         /*
 706          * Flush all the files.
 707          */
 708         ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
 709         if (ret != 0) {
 710                 if (!zfsvfs->z_issnap) {
 711                         zfsctl_create(zfsvfs);
 712                         ASSERT(zfsvfs->z_ctldir != NULL);
 713                 }
 714                 return (ret);
 715         }
 716
 717         if (fflag & MS_FORCE) {
 718                 MNT_ILOCK(vfsp);
 719                 vfsp->mnt_kern_flag |= MNTK_UNMOUNTF;
 720                 MNT_IUNLOCK(vfsp);
 721                 zfsvfs->z_unmounted1 = B_TRUE;
 722
 723                 /*
 724                  * Wait for all zfs threads to leave zfs.
 725                  * Grabbing a rwlock as reader in all vops and
 726                  * as writer here doesn't work because it too easy to get
 727                  * multiple reader enters as zfs can re-enter itself.
 728                  * This can lead to deadlock if there is an intervening
 729                  * rw_enter as writer.
 730                  * So a file system threads ref count (z_op_cnt) is used.
 731                  * A polling loop on z_op_cnt may seem inefficient, but
 732                  * - this saves all threads on exit from having to grab a
 733                  *   mutex in order to cv_signal
 734                  * - only occurs on forced unmount in the rare case when
 735                  *   there are outstanding threads within the file system.
 736                  */
 737                 while (zfsvfs->z_op_cnt) {
 738                         delay(1);
 739                 }
 740         }
 741
 742         zfs_objset_close(zfsvfs);
 743         VFS_RELE(vfsp);
 744         zfs_freevfs(vfsp);
 745
 746         return (0);
 747 }
 748
 749 static int
 750 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
 751 {
 752         zfsvfs_t        *zfsvfs = vfsp->vfs_data;
 753         znode_t         *zp;
 754         int             err;
 755
 756         ZFS_ENTER(zfsvfs);
 757         err = zfs_zget(zfsvfs, ino, &zp);
 758         if (err == 0 && zp->z_unlinked) {
 759                 VN_RELE(ZTOV(zp));
 760                 err = EINVAL;
 761         }
 762         if (err != 0)
 763                 *vpp = NULL;
 764         else {
 765                 *vpp = ZTOV(zp);
 766                 vn_lock(*vpp, flags, curthread);
 767         }
 768         ZFS_EXIT(zfsvfs);
 769         return (err);
 770 }
 771
 772 static int
 773 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
 774 {
 775         kthread_t       *td = curthread;
 776         zfsvfs_t        *zfsvfs = vfsp->vfs_data;
 777         znode_t         *zp;
 778         uint64_t        object = 0;
 779         uint64_t        fid_gen = 0;
 780         uint64_t        gen_mask;
 781         uint64_t        zp_gen;
 782         int             i, err;
 783
 784         *vpp = NULL;
 785
 786         ZFS_ENTER(zfsvfs);
 787
 788         if (fidp->fid_len == LONG_FID_LEN) {
 789                 zfid_long_t     *zlfid = (zfid_long_t *)fidp;
 790                 uint64_t        objsetid = 0;
 791                 uint64_t        setgen = 0;
 792
 793                 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
 794                         objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
 795
 796                 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
 797                         setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
 798
 799                 ZFS_EXIT(zfsvfs);
 800
 801                 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
 802                 if (err)
 803                         return (EINVAL);
 804                 ZFS_ENTER(zfsvfs);
 805         }
 806
 807         if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
 808                 zfid_short_t    *zfid = (zfid_short_t *)fidp;
 809
 810                 for (i = 0; i < sizeof (zfid->zf_object); i++)
 811                         object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
 812
 813                 for (i = 0; i < sizeof (zfid->zf_gen); i++)
 814                         fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
 815         } else {
 816                 ZFS_EXIT(zfsvfs);
 817                 return (EINVAL);
 818         }
 819
 820         /* A zero fid_gen means we are in the .zfs control directories */
 821         if (fid_gen == 0 &&
 822             (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
 823                 *vpp = zfsvfs->z_ctldir;
 824                 ASSERT(*vpp != NULL);
 825                 if (object == ZFSCTL_INO_SNAPDIR) {
 826                         VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
 827                             0, NULL, NULL) == 0);
 828                 } else {
 829                         VN_HOLD(*vpp);
 830                 }
 831                 ZFS_EXIT(zfsvfs);
 832                 /* XXX: LK_RETRY? */
 833                 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
 834                 return (0);
 835         }
 836
 837         gen_mask = -1ULL >> (64 - 8 * i);
 838
 839         dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
 840         if (err = zfs_zget(zfsvfs, object, &zp)) {
 841                 ZFS_EXIT(zfsvfs);
 842                 return (err);
 843         }
 844         zp_gen = zp->z_phys->zp_gen & gen_mask;
 845         if (zp_gen == 0)
 846                 zp_gen = 1;
 847         if (zp->z_unlinked || zp_gen != fid_gen) {
 848                 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
 849                 VN_RELE(ZTOV(zp));
 850                 ZFS_EXIT(zfsvfs);
 851                 return (EINVAL);
 852         }
 853
 854         *vpp = ZTOV(zp);
 855         /* XXX: LK_RETRY? */
 856         vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
 857         vnode_create_vobject(*vpp, zp->z_phys->zp_size, td);
 858         ZFS_EXIT(zfsvfs);
 859         return (0);
 860 }
 861
 862 static void
 863 zfs_objset_close(zfsvfs_t *zfsvfs)
 864 {
 865         znode_t         *zp, *nextzp;
 866         objset_t        *os = zfsvfs->z_os;
 867
 868         /*
 869          * For forced unmount, at this point all vops except zfs_inactive
 870          * are erroring EIO. We need to now suspend zfs_inactive threads
 871          * while we are freeing dbufs before switching zfs_inactive
 872          * to use behaviour without a objset.
 873          */
 874         rw_enter(&zfsvfs->z_um_lock, RW_WRITER);
 875
 876         /*
 877          * Release all holds on dbufs
 878          * Note, although we have stopped all other vop threads and
 879          * zfs_inactive(), the dmu can callback via znode_pageout_func()
 880          * which can zfs_znode_free() the znode.
 881          * So we lock z_all_znodes; search the list for a held
 882          * dbuf; drop the lock (we know zp can't disappear if we hold
 883          * a dbuf lock; then regrab the lock and restart.
 884          */
 885         mutex_enter(&zfsvfs->z_znodes_lock);
 886         for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) {
 887                 nextzp = list_next(&zfsvfs->z_all_znodes, zp);
 888                 if (zp->z_dbuf_held) {
 889                         /* dbufs should only be held when force unmounting */
 890                         zp->z_dbuf_held = 0;
 891                         mutex_exit(&zfsvfs->z_znodes_lock);
 892                         dmu_buf_rele(zp->z_dbuf, NULL);
 893                         /* Start again */
 894                         mutex_enter(&zfsvfs->z_znodes_lock);
 895                         nextzp = list_head(&zfsvfs->z_all_znodes);
 896                 }
 897         }
 898         mutex_exit(&zfsvfs->z_znodes_lock);
 899
 900         /*
 901          * Unregister properties.
 902          */
 903         if (!dmu_objset_is_snapshot(os))
 904                 zfs_unregister_callbacks(zfsvfs);
 905
 906         /*
 907          * Switch zfs_inactive to behaviour without an objset.
 908          * It just tosses cached pages and frees the znode & vnode.
 909          * Then re-enable zfs_inactive threads in that new behaviour.
 910          */
 911         zfsvfs->z_unmounted2 = B_TRUE;
 912         rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */
 913
 914         /*
 915          * Close the zil. Can't close the zil while zfs_inactive
 916          * threads are blocked as zil_close can call zfs_inactive.
 917          */
 918         if (zfsvfs->z_log) {
 919                 zil_close(zfsvfs->z_log);
 920                 zfsvfs->z_log = NULL;
 921         }
 922
 923         /*
 924          * Evict all dbufs so that cached znodes will be freed
 925          */
 926         if (dmu_objset_evict_dbufs(os, 1)) {
 927                 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
 928                 (void) dmu_objset_evict_dbufs(os, 0);
 929         }
 930
 931         /*
 932          * Finally close the objset
 933          */
 934         dmu_objset_close(os);
 935 }
 936
 937 static void
 938 zfs_freevfs(vfs_t *vfsp)
 939 {
 940         zfsvfs_t *zfsvfs = vfsp->vfs_data;
 941         int i;
 942
 943         for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 944                 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
 945         rw_destroy(&zfsvfs->z_um_lock);
 946         mutex_destroy(&zfsvfs->z_znodes_lock);
 947         kmem_free(zfsvfs, sizeof (zfsvfs_t));
 948
 949         atomic_add_32(&zfs_active_fs_count, -1);
 950 }
 951
#ifdef __i386__
/*
 * Copy of desiredvnodes taken by zfs_vnodes_adjust() and written back by
 * zfs_vnodes_adjust_back().
 */
static int desiredvnodes_backup;
#endif
 955
 956 static void
 957 zfs_vnodes_adjust(void)
 958 {
 959 #ifdef __i386__
 960         int val;
 961
 962         desiredvnodes_backup = desiredvnodes;
 963
 964         /*
 965          * We calculate newdesiredvnodes the same way it is done in
 966          * vntblinit(). If it is equal to desiredvnodes, it means that
 967          * it wasn't tuned by the administrator and we can tune it down.
 968          */
 969         val = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
 970             (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
 971         if (desiredvnodes == val)
 972                 desiredvnodes = (3 * desiredvnodes) / 4;
 973 #endif
 974 }
 975
 976 static void
 977 zfs_vnodes_adjust_back(void)
 978 {
 979
 980 #ifdef __i386__
 981         desiredvnodes = desiredvnodes_backup;
 982 #endif
 983 }
 984
 985 void
 986 zfs_init(void)
 987 {
 988
 989         printf("ZFS filesystem version " ZFS_VERSION_STRING "\n");
 990
 991         /*
 992          * Initialize .zfs directory structures
 993          */
 994         zfsctl_init();
 995
 996         /*
 997          * Initialize znode cache, vnode ops, etc...
 998          */
 999         zfs_znode_init();
1000
1001         /*
1002          * Reduce number of vnodes. Originally number of vnodes is calculated
1003          * with UFS inode in mind. We reduce it here, because it's too big for
1004          * ZFS/i386.
1005          */
1006         zfs_vnodes_adjust();
1007 }
1008
/*
 * Undo zfs_init(): tear down the .zfs control directory and znode
 * subsystems, then restore the vnode limit.
 */
void
zfs_fini(void)
{
        zfsctl_fini();
        zfs_znode_fini();

        /* Put desiredvnodes back to the value saved at init time. */
        zfs_vnodes_adjust_back();
}
1016
1017 int
1018 zfs_busy(void)
1019 {
1020         return (zfs_active_fs_count != 0);
1021 }