sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
  24  * All rights reserved.
  25  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  26  * Copyright (c) 2014 Integros [integros.com]
  27  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
  28  */
  29
  30 /* Portions Copyright 2010 Robert Milkowski */
  31
  32 #include <sys/types.h>
  33 #include <sys/param.h>
  34 #include <sys/systm.h>
  35 #include <sys/kernel.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/kmem.h>
  38 #include <sys/acl.h>
  39 #include <sys/vnode.h>
  40 #include <sys/vfs.h>
  41 #include <sys/mntent.h>
  42 #include <sys/mount.h>
  43 #include <sys/cmn_err.h>
  44 #include <sys/zfs_znode.h>
  45 #include <sys/zfs_vnops.h>
  46 #include <sys/zfs_dir.h>
  47 #include <sys/zil.h>
  48 #include <sys/fs/zfs.h>
  49 #include <sys/dmu.h>
  50 #include <sys/dsl_prop.h>
  51 #include <sys/dsl_dataset.h>
  52 #include <sys/dsl_deleg.h>
  53 #include <sys/spa.h>
  54 #include <sys/zap.h>
  55 #include <sys/sa.h>
  56 #include <sys/sa_impl.h>
  57 #include <sys/policy.h>
  58 #include <sys/atomic.h>
  59 #include <sys/zfs_ioctl.h>
  60 #include <sys/zfs_ctldir.h>
  61 #include <sys/zfs_fuid.h>
  62 #include <sys/sunddi.h>
  63 #include <sys/dmu_objset.h>
  64 #include <sys/dsl_dir.h>
  65 #include <sys/spa_boot.h>
  66 #include <sys/jail.h>
  67 #include <ufs/ufs/quota.h>
  68 #include <sys/zfs_quota.h>
  69
  70 #include "zfs_comutil.h"
  71
  72 #ifndef MNTK_VMSETSIZE_BUG
  73 #define MNTK_VMSETSIZE_BUG      0
  74 #endif
  75 #ifndef MNTK_NOMSYNC
  76 #define MNTK_NOMSYNC    8
  77 #endif
  78
  79 /* BEGIN CSTYLED */
  80 struct mtx zfs_debug_mtx;
  81 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
  82
  83 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
  84
  85 int zfs_super_owner;
  86 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
  87     "File system owner can perform privileged operation on his file systems");
  88
  89 int zfs_debug_level;
  90 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
  91         "Debug level");
  92
  93 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
  94 static int zfs_version_acl = ZFS_ACL_VERSION;
  95 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
  96     "ZFS_ACL_VERSION");
  97 static int zfs_version_spa = SPA_VERSION;
  98 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
  99     "SPA_VERSION");
 100 static int zfs_version_zpl = ZPL_VERSION;
 101 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
 102     "ZPL_VERSION");
 103 /* END CSTYLED */
 104
 105 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
 106 static int zfs_mount(vfs_t *vfsp);
 107 static int zfs_umount(vfs_t *vfsp, int fflag);
 108 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
 109 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
 110 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
 111 static int zfs_sync(vfs_t *vfsp, int waitfor);
 112 #if __FreeBSD_version >= 1300098
 113 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
 114     struct ucred **credanonp, int *numsecflavors, int *secflavors);
 115 #else
 116 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
 117     struct ucred **credanonp, int *numsecflavors, int **secflavors);
 118 #endif
 119 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
 120 static void zfs_freevfs(vfs_t *vfsp);
 121
 122 struct vfsops zfs_vfsops = {
 123         .vfs_mount =            zfs_mount,
 124         .vfs_unmount =          zfs_umount,
 125 #if __FreeBSD_version >= 1300049
 126         .vfs_root =             vfs_cache_root,
 127         .vfs_cachedroot = zfs_root,
 128 #else
 129         .vfs_root =             zfs_root,
 130 #endif
 131         .vfs_statfs =           zfs_statfs,
 132         .vfs_vget =             zfs_vget,
 133         .vfs_sync =             zfs_sync,
 134         .vfs_checkexp =         zfs_checkexp,
 135         .vfs_fhtovp =           zfs_fhtovp,
 136         .vfs_quotactl =         zfs_quotactl,
 137 };
 138
 139 VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
 140
 141 /*
 142  * We need to keep a count of active fs's.
 143  * This is necessary to prevent our module
 144  * from being unloaded after a umount -f
 145  */
 146 static uint32_t zfs_active_fs_count = 0;
 147
 148 int
 149 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
 150     char *setpoint)
 151 {
 152         int error;
 153         zfsvfs_t *zfvp;
 154         vfs_t *vfsp;
 155         objset_t *os;
 156         uint64_t tmp = *val;
 157
 158         error = dmu_objset_from_ds(ds, &os);
 159         if (error != 0)
 160                 return (error);
 161
 162         error = getzfsvfs_impl(os, &zfvp);
 163         if (error != 0)
 164                 return (error);
 165         if (zfvp == NULL)
 166                 return (ENOENT);
 167         vfsp = zfvp->z_vfs;
 168         switch (zfs_prop) {
 169         case ZFS_PROP_ATIME:
 170                 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
 171                         tmp = 0;
 172                 if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
 173                         tmp = 1;
 174                 break;
 175         case ZFS_PROP_DEVICES:
 176                 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
 177                         tmp = 0;
 178                 if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
 179                         tmp = 1;
 180                 break;
 181         case ZFS_PROP_EXEC:
 182                 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
 183                         tmp = 0;
 184                 if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
 185                         tmp = 1;
 186                 break;
 187         case ZFS_PROP_SETUID:
 188                 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
 189                         tmp = 0;
 190                 if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
 191                         tmp = 1;
 192                 break;
 193         case ZFS_PROP_READONLY:
 194                 if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
 195                         tmp = 0;
 196                 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
 197                         tmp = 1;
 198                 break;
 199         case ZFS_PROP_XATTR:
 200                 if (zfvp->z_flags & ZSB_XATTR)
 201                         tmp = zfvp->z_xattr;
 202                 break;
 203         case ZFS_PROP_NBMAND:
 204                 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
 205                         tmp = 0;
 206                 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
 207                         tmp = 1;
 208                 break;
 209         default:
 210                 vfs_unbusy(vfsp);
 211                 return (ENOENT);
 212         }
 213
 214         vfs_unbusy(vfsp);
 215         if (tmp != *val) {
 216                 (void) strcpy(setpoint, "temporary");
 217                 *val = tmp;
 218         }
 219         return (0);
 220 }
 221
 222 static int
 223 zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
 224 {
 225         int error = 0;
 226         char buf[32];
 227         uint64_t usedobj, quotaobj;
 228         uint64_t quota, used = 0;
 229         timespec_t now;
 230
 231         usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
 232         quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 233
 234         if (quotaobj == 0 || zfsvfs->z_replay) {
 235                 error = ENOENT;
 236                 goto done;
 237         }
 238         (void) sprintf(buf, "%llx", (longlong_t)id);
 239         if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
 240             buf, sizeof (quota), 1, &quota)) != 0) {
 241                 dprintf("%s(%d): quotaobj lookup failed\n",
 242                     __FUNCTION__, __LINE__);
 243                 goto done;
 244         }
 245         /*
 246          * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
 247          * So we set them to be the same.
 248          */
 249         dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
 250         error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
 251         if (error && error != ENOENT) {
 252                 dprintf("%s(%d):  usedobj failed; %d\n",
 253                     __FUNCTION__, __LINE__, error);
 254                 goto done;
 255         }
 256         dqp->dqb_curblocks = btodb(used);
 257         dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
 258         vfs_timestamp(&now);
 259         /*
 260          * Setting this to 0 causes FreeBSD quota(8) to print
 261          * the number of days since the epoch, which isn't
 262          * particularly useful.
 263          */
 264         dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
 265 done:
 266         return (error);
 267 }
 268
 269 static int
 270 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
 271 {
 272         zfsvfs_t *zfsvfs = vfsp->vfs_data;
 273         struct thread *td;
 274         int cmd, type, error = 0;
 275         int bitsize;
 276         zfs_userquota_prop_t quota_type;
 277         struct dqblk64 dqblk = { 0 };
 278
 279         td = curthread;
 280         cmd = cmds >> SUBCMDSHIFT;
 281         type = cmds & SUBCMDMASK;
 282
 283         ZFS_ENTER(zfsvfs);
 284         if (id == -1) {
 285                 switch (type) {
 286                 case USRQUOTA:
 287                         id = td->td_ucred->cr_ruid;
 288                         break;
 289                 case GRPQUOTA:
 290                         id = td->td_ucred->cr_rgid;
 291                         break;
 292                 default:
 293                         error = EINVAL;
 294                         if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
 295                                 vfs_unbusy(vfsp);
 296                         goto done;
 297                 }
 298         }
 299         /*
 300          * Map BSD type to:
 301          * ZFS_PROP_USERUSED,
 302          * ZFS_PROP_USERQUOTA,
 303          * ZFS_PROP_GROUPUSED,
 304          * ZFS_PROP_GROUPQUOTA
 305          */
 306         switch (cmd) {
 307         case Q_SETQUOTA:
 308         case Q_SETQUOTA32:
 309                 if (type == USRQUOTA)
 310                         quota_type = ZFS_PROP_USERQUOTA;
 311                 else if (type == GRPQUOTA)
 312                         quota_type = ZFS_PROP_GROUPQUOTA;
 313                 else
 314                         error = EINVAL;
 315                 break;
 316         case Q_GETQUOTA:
 317         case Q_GETQUOTA32:
 318                 if (type == USRQUOTA)
 319                         quota_type = ZFS_PROP_USERUSED;
 320                 else if (type == GRPQUOTA)
 321                         quota_type = ZFS_PROP_GROUPUSED;
 322                 else
 323                         error = EINVAL;
 324                 break;
 325         }
 326
 327         /*
 328          * Depending on the cmd, we may need to get
 329          * the ruid and domain (see fuidstr_to_sid?),
 330          * the fuid (how?), or other information.
 331          * Create fuid using zfs_fuid_create(zfsvfs, id,
 332          * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
 333          * I think I can use just the id?
 334          *
 335          * Look at zfs_id_overquota() to look up a quota.
 336          * zap_lookup(something, quotaobj, fuidstring,
 337          *     sizeof (long long), 1, &quota)
 338          *
 339          * See zfs_set_userquota() to set a quota.
 340          */
 341         if ((uint32_t)type >= MAXQUOTAS) {
 342                 error = EINVAL;
 343                 goto done;
 344         }
 345
 346         switch (cmd) {
 347         case Q_GETQUOTASIZE:
 348                 bitsize = 64;
 349                 error = copyout(&bitsize, arg, sizeof (int));
 350                 break;
 351         case Q_QUOTAON:
 352                 // As far as I can tell, you can't turn quotas on or off on zfs
 353                 error = 0;
 354                 vfs_unbusy(vfsp);
 355                 break;
 356         case Q_QUOTAOFF:
 357                 error = ENOTSUP;
 358                 vfs_unbusy(vfsp);
 359                 break;
 360         case Q_SETQUOTA:
 361                 error = copyin(arg, &dqblk, sizeof (dqblk));
 362                 if (error == 0)
 363                         error = zfs_set_userquota(zfsvfs, quota_type,
 364                             "", id, dbtob(dqblk.dqb_bhardlimit));
 365                 break;
 366         case Q_GETQUOTA:
 367                 error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
 368                 if (error == 0)
 369                         error = copyout(&dqblk, arg, sizeof (dqblk));
 370                 break;
 371         default:
 372                 error = EINVAL;
 373                 break;
 374         }
 375 done:
 376         ZFS_EXIT(zfsvfs);
 377         return (error);
 378 }
 379
 380
 381 boolean_t
 382 zfs_is_readonly(zfsvfs_t *zfsvfs)
 383 {
 384         return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
 385 }
 386
 387 /*ARGSUSED*/
 388 static int
 389 zfs_sync(vfs_t *vfsp, int waitfor)
 390 {
 391
 392         /*
 393          * Data integrity is job one.  We don't want a compromised kernel
 394          * writing to the storage pool, so we never sync during panic.
 395          */
 396         if (panicstr)
 397                 return (0);
 398
 399         /*
 400          * Ignore the system syncher.  ZFS already commits async data
 401          * at zfs_txg_timeout intervals.
 402          */
 403         if (waitfor == MNT_LAZY)
 404                 return (0);
 405
 406         if (vfsp != NULL) {
 407                 /*
 408                  * Sync a specific filesystem.
 409                  */
 410                 zfsvfs_t *zfsvfs = vfsp->vfs_data;
 411                 dsl_pool_t *dp;
 412                 int error;
 413
 414                 error = vfs_stdsync(vfsp, waitfor);
 415                 if (error != 0)
 416                         return (error);
 417
 418                 ZFS_ENTER(zfsvfs);
 419                 dp = dmu_objset_pool(zfsvfs->z_os);
 420
 421                 /*
 422                  * If the system is shutting down, then skip any
 423                  * filesystems which may exist on a suspended pool.
 424                  */
 425                 if (rebooting && spa_suspended(dp->dp_spa)) {
 426                         ZFS_EXIT(zfsvfs);
 427                         return (0);
 428                 }
 429
 430                 if (zfsvfs->z_log != NULL)
 431                         zil_commit(zfsvfs->z_log, 0);
 432
 433                 ZFS_EXIT(zfsvfs);
 434         } else {
 435                 /*
 436                  * Sync all ZFS filesystems.  This is what happens when you
 437                  * run sync(8).  Unlike other filesystems, ZFS honors the
 438                  * request by waiting for all pools to commit all dirty data.
 439                  */
 440                 spa_sync_allpools();
 441         }
 442
 443         return (0);
 444 }
 445
 446 static void
 447 atime_changed_cb(void *arg, uint64_t newval)
 448 {
 449         zfsvfs_t *zfsvfs = arg;
 450
 451         if (newval == TRUE) {
 452                 zfsvfs->z_atime = TRUE;
 453                 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
 454                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
 455                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
 456         } else {
 457                 zfsvfs->z_atime = FALSE;
 458                 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
 459                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
 460                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
 461         }
 462 }
 463
 464 static void
 465 xattr_changed_cb(void *arg, uint64_t newval)
 466 {
 467         zfsvfs_t *zfsvfs = arg;
 468
 469         if (newval == ZFS_XATTR_OFF) {
 470                 zfsvfs->z_flags &= ~ZSB_XATTR;
 471         } else {
 472                 zfsvfs->z_flags |= ZSB_XATTR;
 473
 474                 if (newval == ZFS_XATTR_SA)
 475                         zfsvfs->z_xattr_sa = B_TRUE;
 476                 else
 477                         zfsvfs->z_xattr_sa = B_FALSE;
 478         }
 479 }
 480
 481 static void
 482 blksz_changed_cb(void *arg, uint64_t newval)
 483 {
 484         zfsvfs_t *zfsvfs = arg;
 485         ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
 486         ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
 487         ASSERT(ISP2(newval));
 488
 489         zfsvfs->z_max_blksz = newval;
 490         zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
 491 }
 492
 493 static void
 494 readonly_changed_cb(void *arg, uint64_t newval)
 495 {
 496         zfsvfs_t *zfsvfs = arg;
 497
 498         if (newval) {
 499                 /* XXX locking on vfs_flag? */
 500                 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
 501                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
 502                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
 503         } else {
 504                 /* XXX locking on vfs_flag? */
 505                 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 506                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
 507                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
 508         }
 509 }
 510
 511 static void
 512 setuid_changed_cb(void *arg, uint64_t newval)
 513 {
 514         zfsvfs_t *zfsvfs = arg;
 515
 516         if (newval == FALSE) {
 517                 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
 518                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
 519                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
 520         } else {
 521                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
 522                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
 523                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
 524         }
 525 }
 526
 527 static void
 528 exec_changed_cb(void *arg, uint64_t newval)
 529 {
 530         zfsvfs_t *zfsvfs = arg;
 531
 532         if (newval == FALSE) {
 533                 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
 534                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
 535                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
 536         } else {
 537                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
 538                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
 539                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
 540         }
 541 }
 542
 543 /*
 544  * The nbmand mount option can be changed at mount time.
 545  * We can't allow it to be toggled on live file systems or incorrect
 546  * behavior may be seen from cifs clients
 547  *
 548  * This property isn't registered via dsl_prop_register(), but this callback
 549  * will be called when a file system is first mounted
 550  */
 551 static void
 552 nbmand_changed_cb(void *arg, uint64_t newval)
 553 {
 554         zfsvfs_t *zfsvfs = arg;
 555         if (newval == FALSE) {
 556                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
 557                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
 558         } else {
 559                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
 560                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
 561         }
 562 }
 563
 564 static void
 565 snapdir_changed_cb(void *arg, uint64_t newval)
 566 {
 567         zfsvfs_t *zfsvfs = arg;
 568
 569         zfsvfs->z_show_ctldir = newval;
 570 }
 571
 572 static void
 573 vscan_changed_cb(void *arg, uint64_t newval)
 574 {
 575         zfsvfs_t *zfsvfs = arg;
 576
 577         zfsvfs->z_vscan = newval;
 578 }
 579
 580 static void
 581 acl_mode_changed_cb(void *arg, uint64_t newval)
 582 {
 583         zfsvfs_t *zfsvfs = arg;
 584
 585         zfsvfs->z_acl_mode = newval;
 586 }
 587
 588 static void
 589 acl_inherit_changed_cb(void *arg, uint64_t newval)
 590 {
 591         zfsvfs_t *zfsvfs = arg;
 592
 593         zfsvfs->z_acl_inherit = newval;
 594 }
 595
 596 static void
 597 acl_type_changed_cb(void *arg, uint64_t newval)
 598 {
 599         zfsvfs_t *zfsvfs = arg;
 600
 601         zfsvfs->z_acl_type = newval;
 602 }
 603
 604 static int
 605 zfs_register_callbacks(vfs_t *vfsp)
 606 {
 607         struct dsl_dataset *ds = NULL;
 608         objset_t *os = NULL;
 609         zfsvfs_t *zfsvfs = NULL;
 610         uint64_t nbmand;
 611         boolean_t readonly = B_FALSE;
 612         boolean_t do_readonly = B_FALSE;
 613         boolean_t setuid = B_FALSE;
 614         boolean_t do_setuid = B_FALSE;
 615         boolean_t exec = B_FALSE;
 616         boolean_t do_exec = B_FALSE;
 617         boolean_t xattr = B_FALSE;
 618         boolean_t atime = B_FALSE;
 619         boolean_t do_atime = B_FALSE;
 620         boolean_t do_xattr = B_FALSE;
 621         int error = 0;
 622
 623         ASSERT(vfsp);
 624         zfsvfs = vfsp->vfs_data;
 625         ASSERT(zfsvfs);
 626         os = zfsvfs->z_os;
 627
 628         /*
 629          * This function can be called for a snapshot when we update snapshot's
 630          * mount point, which isn't really supported.
 631          */
 632         if (dmu_objset_is_snapshot(os))
 633                 return (EOPNOTSUPP);
 634
 635         /*
 636          * The act of registering our callbacks will destroy any mount
 637          * options we may have.  In order to enable temporary overrides
 638          * of mount options, we stash away the current values and
 639          * restore them after we register the callbacks.
 640          */
 641         if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
 642             !spa_writeable(dmu_objset_spa(os))) {
 643                 readonly = B_TRUE;
 644                 do_readonly = B_TRUE;
 645         } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
 646                 readonly = B_FALSE;
 647                 do_readonly = B_TRUE;
 648         }
 649         if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 650                 setuid = B_FALSE;
 651                 do_setuid = B_TRUE;
 652         } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
 653                 setuid = B_TRUE;
 654                 do_setuid = B_TRUE;
 655         }
 656         if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 657                 exec = B_FALSE;
 658                 do_exec = B_TRUE;
 659         } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 660                 exec = B_TRUE;
 661                 do_exec = B_TRUE;
 662         }
 663         if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 664                 zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
 665                 do_xattr = B_TRUE;
 666         } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 667                 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
 668                 do_xattr = B_TRUE;
 669         } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
 670                 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
 671                 do_xattr = B_TRUE;
 672         } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
 673                 zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
 674                 do_xattr = B_TRUE;
 675         }
 676         if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
 677                 atime = B_FALSE;
 678                 do_atime = B_TRUE;
 679         } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
 680                 atime = B_TRUE;
 681                 do_atime = B_TRUE;
 682         }
 683
 684         /*
 685          * We need to enter pool configuration here, so that we can use
 686          * dsl_prop_get_int_ds() to handle the special nbmand property below.
 687          * dsl_prop_get_integer() can not be used, because it has to acquire
 688          * spa_namespace_lock and we can not do that because we already hold
 689          * z_teardown_lock.  The problem is that spa_write_cachefile() is called
 690          * with spa_namespace_lock held and the function calls ZFS vnode
 691          * operations to write the cache file and thus z_teardown_lock is
 692          * acquired after spa_namespace_lock.
 693          */
 694         ds = dmu_objset_ds(os);
 695         dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 696
 697         /*
 698          * nbmand is a special property.  It can only be changed at
 699          * mount time.
 700          *
 701          * This is weird, but it is documented to only be changeable
 702          * at mount time.
 703          */
 704         if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
 705                 nbmand = B_FALSE;
 706         } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
 707                 nbmand = B_TRUE;
 708         } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) {
 709                 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 710                 return (error);
 711         }
 712
 713         /*
 714          * Register property callbacks.
 715          *
 716          * It would probably be fine to just check for i/o error from
 717          * the first prop_register(), but I guess I like to go
 718          * overboard...
 719          */
 720         error = dsl_prop_register(ds,
 721             zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
 722         error = error ? error : dsl_prop_register(ds,
 723             zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
 724         error = error ? error : dsl_prop_register(ds,
 725             zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
 726         error = error ? error : dsl_prop_register(ds,
 727             zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
 728         error = error ? error : dsl_prop_register(ds,
 729             zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
 730         error = error ? error : dsl_prop_register(ds,
 731             zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
 732         error = error ? error : dsl_prop_register(ds,
 733             zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
 734         error = error ? error : dsl_prop_register(ds,
 735             zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs);
 736         error = error ? error : dsl_prop_register(ds,
 737             zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
 738         error = error ? error : dsl_prop_register(ds,
 739             zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
 740             zfsvfs);
 741         error = error ? error : dsl_prop_register(ds,
 742             zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
 743         dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 744         if (error)
 745                 goto unregister;
 746
 747         /*
 748          * Invoke our callbacks to restore temporary mount options.
 749          */
 750         if (do_readonly)
 751                 readonly_changed_cb(zfsvfs, readonly);
 752         if (do_setuid)
 753                 setuid_changed_cb(zfsvfs, setuid);
 754         if (do_exec)
 755                 exec_changed_cb(zfsvfs, exec);
 756         if (do_xattr)
 757                 xattr_changed_cb(zfsvfs, xattr);
 758         if (do_atime)
 759                 atime_changed_cb(zfsvfs, atime);
 760
 761         nbmand_changed_cb(zfsvfs, nbmand);
 762
 763         return (0);
 764
 765 unregister:
 766         dsl_prop_unregister_all(ds, zfsvfs);
 767         return (error);
 768 }
 769
 770 /*
 771  * Associate this zfsvfs with the given objset, which must be owned.
 772  * This will cache a bunch of on-disk state from the objset in the
 773  * zfsvfs.
 774  */
 775 static int
 776 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
 777 {
 778         int error;
 779         uint64_t val;
 780
 781         zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
 782         zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 783         zfsvfs->z_os = os;
 784
 785         error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
 786         if (error != 0)
 787                 return (error);
 788         if (zfsvfs->z_version >
 789             zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
 790                 (void) printf("Can't mount a version %lld file system "
 791                     "on a version %lld pool\n. Pool must be upgraded to mount "
 792                     "this file system.", (u_longlong_t)zfsvfs->z_version,
 793                     (u_longlong_t)spa_version(dmu_objset_spa(os)));
 794                 return (SET_ERROR(ENOTSUP));
 795         }
 796         error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
 797         if (error != 0)
 798                 return (error);
 799         zfsvfs->z_norm = (int)val;
 800
 801         error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
 802         if (error != 0)
 803                 return (error);
 804         zfsvfs->z_utf8 = (val != 0);
 805
 806         error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
 807         if (error != 0)
 808                 return (error);
 809         zfsvfs->z_case = (uint_t)val;
 810
 811         error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val);
 812         if (error != 0)
 813                 return (error);
 814         zfsvfs->z_acl_type = (uint_t)val;
 815
 816         /*
 817          * Fold case on file systems that are always or sometimes case
 818          * insensitive.
 819          */
 820         if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 821             zfsvfs->z_case == ZFS_CASE_MIXED)
 822                 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 823
 824         zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 825         zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 826
 827         uint64_t sa_obj = 0;
 828         if (zfsvfs->z_use_sa) {
 829                 /* should either have both of these objects or none */
 830                 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
 831                     &sa_obj);
 832                 if (error != 0)
 833                         return (error);
 834         }
 835
 836         error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 837             &zfsvfs->z_attr_table);
 838         if (error != 0)
 839                 return (error);
 840
 841         if (zfsvfs->z_version >= ZPL_VERSION_SA)
 842                 sa_register_update_callback(os, zfs_sa_upgrade);
 843
 844         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 845             &zfsvfs->z_root);
 846         if (error != 0)
 847                 return (error);
 848         ASSERT(zfsvfs->z_root != 0);
 849
 850         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 851             &zfsvfs->z_unlinkedobj);
 852         if (error != 0)
 853                 return (error);
 854
 855         error = zap_lookup(os, MASTER_NODE_OBJ,
 856             zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
 857             8, 1, &zfsvfs->z_userquota_obj);
 858         if (error == ENOENT)
 859                 zfsvfs->z_userquota_obj = 0;
 860         else if (error != 0)
 861                 return (error);
 862
 863         error = zap_lookup(os, MASTER_NODE_OBJ,
 864             zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
 865             8, 1, &zfsvfs->z_groupquota_obj);
 866         if (error == ENOENT)
 867                 zfsvfs->z_groupquota_obj = 0;
 868         else if (error != 0)
 869                 return (error);
 870
 871         error = zap_lookup(os, MASTER_NODE_OBJ,
 872             zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
 873             8, 1, &zfsvfs->z_projectquota_obj);
 874         if (error == ENOENT)
 875                 zfsvfs->z_projectquota_obj = 0;
 876         else if (error != 0)
 877                 return (error);
 878
 879         error = zap_lookup(os, MASTER_NODE_OBJ,
 880             zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
 881             8, 1, &zfsvfs->z_userobjquota_obj);
 882         if (error == ENOENT)
 883                 zfsvfs->z_userobjquota_obj = 0;
 884         else if (error != 0)
 885                 return (error);
 886
 887         error = zap_lookup(os, MASTER_NODE_OBJ,
 888             zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
 889             8, 1, &zfsvfs->z_groupobjquota_obj);
 890         if (error == ENOENT)
 891                 zfsvfs->z_groupobjquota_obj = 0;
 892         else if (error != 0)
 893                 return (error);
 894
 895         error = zap_lookup(os, MASTER_NODE_OBJ,
 896             zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
 897             8, 1, &zfsvfs->z_projectobjquota_obj);
 898         if (error == ENOENT)
 899                 zfsvfs->z_projectobjquota_obj = 0;
 900         else if (error != 0)
 901                 return (error);
 902
 903         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
 904             &zfsvfs->z_fuid_obj);
 905         if (error == ENOENT)
 906                 zfsvfs->z_fuid_obj = 0;
 907         else if (error != 0)
 908                 return (error);
 909
 910         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
 911             &zfsvfs->z_shares_dir);
 912         if (error == ENOENT)
 913                 zfsvfs->z_shares_dir = 0;
 914         else if (error != 0)
 915                 return (error);
 916
 917         /*
 918          * Only use the name cache if we are looking for a
 919          * name on a file system that does not require normalization
 920          * or case folding.  We can also look there if we happen to be
 921          * on a non-normalizing, mixed sensitivity file system IF we
 922          * are looking for the exact name (which is always the case on
 923          * FreeBSD).
 924          */
 925         zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
 926             ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
 927             !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
 928
 929         return (0);
 930 }
 931
/*
 * Task queue associated with the per-filesystem z_unlinked_drain_task;
 * zfs_umount() cancels and drains that task on this queue before tearing
 * the file system down.
 */
taskq_t *zfsvfs_taskq;
 933
 934 static void
 935 zfsvfs_task_unlinked_drain(void *context, int pending __unused)
 936 {
 937
 938         zfs_unlinked_drain((zfsvfs_t *)context);
 939 }
 940
 941 int
 942 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
 943 {
 944         objset_t *os;
 945         zfsvfs_t *zfsvfs;
 946         int error;
 947         boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
 948
 949         /*
 950          * XXX: Fix struct statfs so this isn't necessary!
 951          *
 952          * The 'osname' is used as the filesystem's special node, which means
 953          * it must fit in statfs.f_mntfromname, or else it can't be
 954          * enumerated, so libzfs_mnttab_find() returns NULL, which causes
 955          * 'zfs unmount' to think it's not mounted when it is.
 956          */
 957         if (strlen(osname) >= MNAMELEN)
 958                 return (SET_ERROR(ENAMETOOLONG));
 959
 960         zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 961
 962         error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
 963             &os);
 964         if (error != 0) {
 965                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
 966                 return (error);
 967         }
 968
 969         error = zfsvfs_create_impl(zfvp, zfsvfs, os);
 970
 971         return (error);
 972 }
 973
 974
 975 int
 976 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
 977 {
 978         int error;
 979
 980         zfsvfs->z_vfs = NULL;
 981         zfsvfs->z_parent = zfsvfs;
 982
 983         mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 984         mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
 985         list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 986             offsetof(znode_t, z_link_node));
 987         TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
 988             zfsvfs_task_unlinked_drain, zfsvfs);
 989         ZFS_TEARDOWN_INIT(zfsvfs);
 990         ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs);
 991         rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
 992         for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 993                 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 994
 995         error = zfsvfs_init(zfsvfs, os);
 996         if (error != 0) {
 997                 dmu_objset_disown(os, B_TRUE, zfsvfs);
 998                 *zfvp = NULL;
 999                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1000                 return (error);
1001         }
1002
1003         *zfvp = zfsvfs;
1004         return (0);
1005 }
1006
1007 static int
1008 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
1009 {
1010         int error;
1011
1012         /*
1013          * Check for a bad on-disk format version now since we
1014          * lied about owning the dataset readonly before.
1015          */
1016         if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
1017             dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
1018                 return (SET_ERROR(EROFS));
1019
1020         error = zfs_register_callbacks(zfsvfs->z_vfs);
1021         if (error)
1022                 return (error);
1023
1024         zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
1025
1026         /*
1027          * If we are not mounting (ie: online recv), then we don't
1028          * have to worry about replaying the log as we blocked all
1029          * operations out since we closed the ZIL.
1030          */
1031         if (mounting) {
1032                 boolean_t readonly;
1033
1034                 ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
1035                 dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
1036
1037                 /*
1038                  * During replay we remove the read only flag to
1039                  * allow replays to succeed.
1040                  */
1041                 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
1042                 if (readonly != 0) {
1043                         zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
1044                 } else {
1045                         dsl_dir_t *dd;
1046                         zap_stats_t zs;
1047
1048                         if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
1049                             &zs) == 0) {
1050                                 dataset_kstats_update_nunlinks_kstat(
1051                                     &zfsvfs->z_kstat, zs.zs_num_entries);
1052                                 dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
1053                                     "num_entries in unlinked set: %llu",
1054                                     zs.zs_num_entries);
1055                         }
1056
1057                         zfs_unlinked_drain(zfsvfs);
1058                         dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
1059                         dd->dd_activity_cancelled = B_FALSE;
1060                 }
1061
1062                 /*
1063                  * Parse and replay the intent log.
1064                  *
1065                  * Because of ziltest, this must be done after
1066                  * zfs_unlinked_drain().  (Further note: ziltest
1067                  * doesn't use readonly mounts, where
1068                  * zfs_unlinked_drain() isn't called.)  This is because
1069                  * ziltest causes spa_sync() to think it's committed,
1070                  * but actually it is not, so the intent log contains
1071                  * many txg's worth of changes.
1072                  *
1073                  * In particular, if object N is in the unlinked set in
1074                  * the last txg to actually sync, then it could be
1075                  * actually freed in a later txg and then reallocated
1076                  * in a yet later txg.  This would write a "create
1077                  * object N" record to the intent log.  Normally, this
1078                  * would be fine because the spa_sync() would have
1079                  * written out the fact that object N is free, before
1080                  * we could write the "create object N" intent log
1081                  * record.
1082                  *
1083                  * But when we are in ziltest mode, we advance the "open
1084                  * txg" without actually spa_sync()-ing the changes to
1085                  * disk.  So we would see that object N is still
1086                  * allocated and in the unlinked set, and there is an
1087                  * intent log record saying to allocate it.
1088                  */
1089                 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
1090                         if (zil_replay_disable) {
1091                                 zil_destroy(zfsvfs->z_log, B_FALSE);
1092                         } else {
1093                                 boolean_t use_nc = zfsvfs->z_use_namecache;
1094                                 zfsvfs->z_use_namecache = B_FALSE;
1095                                 zfsvfs->z_replay = B_TRUE;
1096                                 zil_replay(zfsvfs->z_os, zfsvfs,
1097                                     zfs_replay_vector);
1098                                 zfsvfs->z_replay = B_FALSE;
1099                                 zfsvfs->z_use_namecache = use_nc;
1100                         }
1101                 }
1102
1103                 /* restore readonly bit */
1104                 if (readonly != 0)
1105                         zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
1106         }
1107
1108         /*
1109          * Set the objset user_ptr to track its zfsvfs.
1110          */
1111         mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1112         dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1113         mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1114
1115         return (0);
1116 }
1117
1118 void
1119 zfsvfs_free(zfsvfs_t *zfsvfs)
1120 {
1121         int i;
1122
1123         zfs_fuid_destroy(zfsvfs);
1124
1125         mutex_destroy(&zfsvfs->z_znodes_lock);
1126         mutex_destroy(&zfsvfs->z_lock);
1127         ASSERT(zfsvfs->z_nr_znodes == 0);
1128         list_destroy(&zfsvfs->z_all_znodes);
1129         ZFS_TEARDOWN_DESTROY(zfsvfs);
1130         ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
1131         rw_destroy(&zfsvfs->z_fuid_lock);
1132         for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1133                 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1134         dataset_kstats_destroy(&zfsvfs->z_kstat);
1135         kmem_free(zfsvfs, sizeof (zfsvfs_t));
1136 }
1137
1138 static void
1139 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1140 {
1141         zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1142         if (zfsvfs->z_vfs) {
1143                 if (zfsvfs->z_use_fuids) {
1144                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1145                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1146                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1147                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1148                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1149                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1150                 } else {
1151                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1152                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1153                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1154                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1155                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1156                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1157                 }
1158         }
1159         zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1160 }
1161
/*
 * Mount the dataset 'osname' on 'vfsp'.
 *
 * Creates the zfsvfs for the dataset, wires it to the VFS mount, sets the
 * mount flags, fsid and feature flags, and then either applies the
 * snapshot-specific settings or runs zfsvfs_setup() for a regular file
 * system.
 *
 * Returns 0 on success, in which case zfs_active_fs_count is incremented.
 * If any step after zfsvfs_create() fails, the objset is disowned, the
 * zfsvfs is freed and the error is returned.
 */
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
        uint64_t recordsize, fsid_guid;
        int error = 0;
        zfsvfs_t *zfsvfs;

        ASSERT(vfsp);
        ASSERT(osname);

        /* The objset is owned read-only when the mount is MNT_RDONLY. */
        error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
        if (error)
                return (error);
        zfsvfs->z_vfs = vfsp;

        /* The dataset's recordsize is reported as the mount's f_iosize. */
        if ((error = dsl_prop_get_integer(osname,
            "recordsize", &recordsize, NULL)))
                goto out;
        zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
        zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;

        vfsp->vfs_data = zfsvfs;
        vfsp->mnt_flag |= MNT_LOCAL;
        vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
        vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
        vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
        /*
         * This can cause a loss of coherence between ARC and page cache
         * on ZoF - unclear if the problem is in FreeBSD or ZoF
         */
        vfsp->mnt_kern_flag |= MNTK_NO_IOPF;    /* vn_io_fault can be used */
        vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
        vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;

#if defined(_KERNEL) && !defined(KMEM_DEBUG)
        vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
#endif
        /*
         * The fsid is 64 bits, composed of an 8-bit fs type, which
         * separates our fsid from any other filesystem types, and a
         * 56-bit objset unique ID.  The objset unique ID is unique to
         * all objsets open on this system, provided by unique_create().
         * The 8-bit fs type must be put in the low bits of fsid[1]
         * because that's where other Solaris filesystems put it.
         */
        fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
        ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
        vfsp->vfs_fsid.val[0] = fsid_guid;
        vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
            (vfsp->mnt_vfc->vfc_typenum & 0xFF);

        /*
         * Set features for file system.
         */
        zfs_set_fuid_feature(zfsvfs);
        if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
                vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
                vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
                vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
        } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
                vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
                vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
        }
        vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);

        if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
                uint64_t pval;

                /*
                 * Snapshots do not go through zfsvfs_setup(): the property
                 * callbacks are invoked directly here (atime off, readonly
                 * on, xattr and acltype from the dataset properties).
                 */
                atime_changed_cb(zfsvfs, B_FALSE);
                readonly_changed_cb(zfsvfs, B_TRUE);
                if ((error = dsl_prop_get_integer(osname,
                    "xattr", &pval, NULL)))
                        goto out;
                xattr_changed_cb(zfsvfs, pval);
                if ((error = dsl_prop_get_integer(osname,
                    "acltype", &pval, NULL)))
                        goto out;
                acl_type_changed_cb(zfsvfs, pval);
                zfsvfs->z_issnap = B_TRUE;
                zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;

                /* Set the objset user pointer to track its zfsvfs. */
                mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
                dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
                mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
        } else {
                if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
                        goto out;
        }

        vfs_mountedfrom(vfsp, osname);

        /* The control directory is only created for non-snapshot mounts. */
        if (!zfsvfs->z_issnap)
                zfsctl_create(zfsvfs);
out:
        /* Shared exit: undo zfsvfs_create() on failure, count on success. */
        if (error) {
                dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
                zfsvfs_free(zfsvfs);
        } else {
                atomic_inc_32(&zfs_active_fs_count);
        }

        return (error);
}
1265
1266 static void
1267 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1268 {
1269         objset_t *os = zfsvfs->z_os;
1270
1271         if (!dmu_objset_is_snapshot(os))
1272                 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1273 }
1274
/*
 * Copy the pool component of the dataset name 'osname' -- everything up to
 * the first '/', or the whole name if there is none -- into 'poolname' as
 * a NUL-terminated string.
 *
 * 'poolname' must have room for MAXNAMELEN bytes.  Returns 0 on success or
 * ENAMETOOLONG if the pool component would not fit.
 */
static int
getpoolname(const char *osname, char *poolname)
{
        const char *slash = strchr(osname, '/');
        size_t len;

        if (slash != NULL)
                len = (size_t)(slash - osname);
        else
                len = strlen(osname);

        if (len >= MAXNAMELEN)
                return (ENAMETOOLONG);

        memcpy(poolname, osname, len);
        poolname[len] = '\0';
        return (0);
}
1293
/*
 * Parse the option prefix of a dataset name.  A leading '!' sets
 * *checkpointrewind to true and is stripped from 'name' in place; otherwise
 * *checkpointrewind is set to false and 'name' is left untouched.
 */
static void
fetch_osname_options(char *name, bool *checkpointrewind)
{
        const bool has_prefix = (name[0] == '!');

        *checkpointrewind = has_prefix;
        if (!has_prefix)
                return;

        /* Shift the remainder, including its terminating NUL, down by one. */
        memmove(name, name + 1, strlen(name + 1) + 1);
}
1305
/*
 * VFS mount entry point.
 *
 * The dataset name is taken from the "from" mount option (EINVAL if it is
 * missing); a leading '!' requests a checkpoint rewind and is stripped by
 * fetch_osname_options().  After the privilege and zone checks the call is
 * handled in one of two ways:
 *   - MS_REMOUNT: the property callbacks are unregistered and registered
 *     again under the teardown write lock;
 *   - otherwise: for an initial root mount the pool is imported first, and
 *     then zfs_domount() performs the mount.
 *
 * Returns 0 on success or an errno value.
 */
/*ARGSUSED*/
static int
zfs_mount(vfs_t *vfsp)
{
        kthread_t       *td = curthread;
        vnode_t         *mvp = vfsp->mnt_vnodecovered;
        cred_t          *cr = td->td_ucred;
        char            *osname;
        int             error = 0;
        int             canwrite;
        bool            checkpointrewind;

        if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
                return (SET_ERROR(EINVAL));

        /*
         * If full-owner-access is enabled and delegated administration is
         * turned on, we must set nosuid.
         */
        if (zfs_super_owner &&
            dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
                secpolicy_fs_mount_clearopts(cr, vfsp);
        }

        fetch_osname_options(osname, &checkpointrewind);

        /*
         * Check for mount privilege?
         *
         * If we don't have privilege then see if
         * we have local permission to allow it
         */
        error = secpolicy_fs_mount(cr, mvp, vfsp);
        if (error) {
                /*
                 * Every 'goto out' inside this block returns the error
                 * from secpolicy_fs_mount() above.
                 */
                if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
                        goto out;

                if (!(vfsp->vfs_flag & MS_REMOUNT)) {
                        vattr_t         vattr;

                        /*
                         * Make sure user is the owner of the mount point
                         * or has sufficient privileges.
                         */

                        vattr.va_mask = AT_UID;

                        vn_lock(mvp, LK_SHARED | LK_RETRY);
                        if (VOP_GETATTR(mvp, &vattr, cr)) {
                                VOP_UNLOCK1(mvp);
                                goto out;
                        }

                        if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
                            VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
                                VOP_UNLOCK1(mvp);
                                goto out;
                        }
                        VOP_UNLOCK1(mvp);
                }

                secpolicy_fs_mount_clearopts(cr, vfsp);
        }

        /*
         * Refuse to mount a filesystem if we are in a local zone and the
         * dataset is not visible.
         */
        if (!INGLOBALZONE(curproc) &&
            (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
                error = SET_ERROR(EPERM);
                goto out;
        }

        vfsp->vfs_flag |= MNT_NFS4ACLS;

        /*
         * When doing a remount, we simply refresh our temporary properties
         * according to those options set in the current VFS options.
         */
        if (vfsp->vfs_flag & MS_REMOUNT) {
                zfsvfs_t *zfsvfs = vfsp->vfs_data;

                /*
                 * Refresh mount options with z_teardown_lock blocking I/O while
                 * the filesystem is in an inconsistent state.
                 * The lock also serializes this code with filesystem
                 * manipulations between entry to zfs_suspend_fs() and return
                 * from zfs_resume_fs().
                 */
                ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
                zfs_unregister_callbacks(zfsvfs);
                error = zfs_register_callbacks(vfsp);
                ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
                goto out;
        }

        /* Initial root mount: try hard to import the requested root pool. */
        if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
            (vfsp->vfs_flag & MNT_UPDATE) == 0) {
                char pname[MAXNAMELEN];

                error = getpoolname(osname, pname);
                if (error == 0)
                        error = spa_import_rootpool(pname, checkpointrewind);
                if (error)
                        goto out;
        }
        /* Giant is dropped for the duration of the actual mount. */
        DROP_GIANT();
        error = zfs_domount(vfsp, osname);
        PICKUP_GIANT();

out:
        return (error);
}
1421
1422 static int
1423 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
1424 {
1425         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1426         uint64_t refdbytes, availbytes, usedobjs, availobjs;
1427
1428         statp->f_version = STATFS_VERSION;
1429
1430         ZFS_ENTER(zfsvfs);
1431
1432         dmu_objset_space(zfsvfs->z_os,
1433             &refdbytes, &availbytes, &usedobjs, &availobjs);
1434
1435         /*
1436          * The underlying storage pool actually uses multiple block sizes.
1437          * We report the fragsize as the smallest block size we support,
1438          * and we report our blocksize as the filesystem's maximum blocksize.
1439          */
1440         statp->f_bsize = SPA_MINBLOCKSIZE;
1441         statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
1442
1443         /*
1444          * The following report "total" blocks of various kinds in the
1445          * file system, but reported in terms of f_frsize - the
1446          * "fragment" size.
1447          */
1448
1449         statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1450         statp->f_bfree = availbytes / statp->f_bsize;
1451         statp->f_bavail = statp->f_bfree; /* no root reservation */
1452
1453         /*
1454          * statvfs() should really be called statufs(), because it assumes
1455          * static metadata.  ZFS doesn't preallocate files, so the best
1456          * we can do is report the max that could possibly fit in f_files,
1457          * and that minus the number actually used in f_ffree.
1458          * For f_ffree, report the smaller of the number of object available
1459          * and the number of blocks (each object will take at least a block).
1460          */
1461         statp->f_ffree = MIN(availobjs, statp->f_bfree);
1462         statp->f_files = statp->f_ffree + usedobjs;
1463
1464         /*
1465          * We're a zfs filesystem.
1466          */
1467         strlcpy(statp->f_fstypename, "zfs",
1468             sizeof (statp->f_fstypename));
1469
1470         strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
1471             sizeof (statp->f_mntfromname));
1472         strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
1473             sizeof (statp->f_mntonname));
1474
1475         statp->f_namemax = MAXNAMELEN - 1;
1476
1477         ZFS_EXIT(zfsvfs);
1478         return (0);
1479 }
1480
1481 static int
1482 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
1483 {
1484         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1485         znode_t *rootzp;
1486         int error;
1487
1488         ZFS_ENTER(zfsvfs);
1489
1490         error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1491         if (error == 0)
1492                 *vpp = ZTOV(rootzp);
1493
1494         ZFS_EXIT(zfsvfs);
1495
1496         if (error == 0) {
1497                 error = vn_lock(*vpp, flags);
1498                 if (error != 0) {
1499                         VN_RELE(*vpp);
1500                         *vpp = NULL;
1501                 }
1502         }
1503         return (error);
1504 }
1505
/*
 * Teardown the zfsvfs::z_os.
 *
 * Waits for outstanding zrele tasks, takes the teardown locks for writing,
 * closes the ZIL, releases the dbuf holds of every znode and, if z_os is
 * still set, unregisters the property callbacks and evicts cached data.
 *
 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
 * and 'z_teardown_inactive_lock' held.
 *
 * Returns 0 on success.  Returns EIO (with both locks released) if we are
 * not unmounting and the file system has already been unmounted or z_os
 * is NULL.
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
        znode_t *zp;
        dsl_dir_t *dd;

        /*
         * If someone has not already unmounted this file system,
         * drain the zrele_taskq to ensure all active references to the
         * zfsvfs_t have been handled; only then can it be safely destroyed.
         */
        if (zfsvfs->z_os) {
                /*
                 * If we're unmounting we have to wait for the list to
                 * drain completely.
                 *
                 * If we're not unmounting there's no guarantee the list
                 * will drain completely, but zreles run from the taskq
                 * may add the parents of dir-based xattrs to the taskq
                 * so we want to wait for these.
                 *
                 * We can safely read z_nr_znodes without locking because the
                 * VFS has already blocked operations which add to the
                 * z_all_znodes list and thus increment z_nr_znodes.
                 */
                int round = 0;
                while (zfsvfs->z_nr_znodes > 0) {
                        taskq_wait_outstanding(dsl_pool_zrele_taskq(
                            dmu_objset_pool(zfsvfs->z_os)), 0);
                        /* When not unmounting, give up after two rounds. */
                        if (++round > 1 && !unmounting)
                                break;
                }
        }
        ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);

        if (!unmounting) {
                /*
                 * We purge the parent filesystem's vfsp as the parent
                 * filesystem and all of its snapshots have their vnode's
                 * v_vfsp set to the parent's filesystem's vfsp.  Note,
                 * 'z_parent' is self referential for non-snapshots.
                 */
#ifdef FREEBSD_NAMECACHE
#if __FreeBSD_version >= 1300117
                cache_purgevfs(zfsvfs->z_parent->z_vfs);
#else
                cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
#endif
#endif
        }

        /*
         * Close the zil. NB: Can't close the zil while zfs_inactive
         * threads are blocked as zil_close can call zfs_inactive.
         */
        if (zfsvfs->z_log) {
                zil_close(zfsvfs->z_log);
                zfsvfs->z_log = NULL;
        }

        ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);

        /*
         * If we are not unmounting (ie: online recv) and someone already
         * unmounted this file system while we were doing the switcheroo,
         * or a reopen of z_os failed then just bail out now.
         */
        if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
                ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
                ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
                return (SET_ERROR(EIO));
        }

        /*
         * At this point there are no vops active, and any new vops will
         * fail with EIO since we have z_teardown_lock for writer (only
         * relevant for forced unmount).
         *
         * Release all holds on dbufs.
         */
        mutex_enter(&zfsvfs->z_znodes_lock);
        for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
            zp = list_next(&zfsvfs->z_all_znodes, zp))
                if (zp->z_sa_hdl) {
                        ASSERT(ZTOV(zp)->v_count >= 0);
                        zfs_znode_dmu_fini(zp);
                }
        mutex_exit(&zfsvfs->z_znodes_lock);

        /*
         * If we are unmounting, set the unmounted flag and let new vops
         * unblock.  zfs_inactive will have the unmounted behavior, and all
         * other vops will fail with EIO.
         */
        if (unmounting) {
                zfsvfs->z_unmounted = B_TRUE;
                ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
                ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
        }

        /*
         * z_os will be NULL if there was an error in attempting to reopen
         * zfsvfs, so just return as the properties had already been
         * unregistered and cached data had been evicted before.
         */
        if (zfsvfs->z_os == NULL)
                return (0);

        /*
         * Unregister properties.
         */
        zfs_unregister_callbacks(zfsvfs);

        /*
         * Evict cached data.  For a writable file system, first wait for
         * the pool to sync.
         */
        if (!zfs_is_readonly(zfsvfs))
                txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
        dmu_objset_evict_dbufs(zfsvfs->z_os);
        dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
        dsl_dir_cancel_waiters(dd);

        return (0);
}
1636
1637 /*ARGSUSED*/
1638 static int
1639 zfs_umount(vfs_t *vfsp, int fflag)
1640 {
1641         kthread_t *td = curthread;
1642         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1643         objset_t *os;
1644         cred_t *cr = td->td_ucred;
1645         int ret;
1646
1647         ret = secpolicy_fs_unmount(cr, vfsp);
1648         if (ret) {
1649                 if (dsl_deleg_access((char *)vfsp->vfs_resource,
1650                     ZFS_DELEG_PERM_MOUNT, cr))
1651                         return (ret);
1652         }
1653
1654         /*
1655          * Unmount any snapshots mounted under .zfs before unmounting the
1656          * dataset itself.
1657          */
1658         if (zfsvfs->z_ctldir != NULL) {
1659                 if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
1660                         return (ret);
1661         }
1662
1663         if (fflag & MS_FORCE) {
1664                 /*
1665                  * Mark file system as unmounted before calling
1666                  * vflush(FORCECLOSE). This way we ensure no future vnops
1667                  * will be called and risk operating on DOOMED vnodes.
1668                  */
1669                 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
1670                 zfsvfs->z_unmounted = B_TRUE;
1671                 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1672         }
1673
1674         /*
1675          * Flush all the files.
1676          */
1677         ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
1678         if (ret != 0)
1679                 return (ret);
1680         while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
1681             &zfsvfs->z_unlinked_drain_task, NULL) != 0)
1682                 taskqueue_drain(zfsvfs_taskq->tq_queue,
1683                     &zfsvfs->z_unlinked_drain_task);
1684
1685         VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
1686         os = zfsvfs->z_os;
1687
1688         /*
1689          * z_os will be NULL if there was an error in
1690          * attempting to reopen zfsvfs.
1691          */
1692         if (os != NULL) {
1693                 /*
1694                  * Unset the objset user_ptr.
1695                  */
1696                 mutex_enter(&os->os_user_ptr_lock);
1697                 dmu_objset_set_user(os, NULL);
1698                 mutex_exit(&os->os_user_ptr_lock);
1699
1700                 /*
1701                  * Finally release the objset
1702                  */
1703                 dmu_objset_disown(os, B_TRUE, zfsvfs);
1704         }
1705
1706         /*
1707          * We can now safely destroy the '.zfs' directory node.
1708          */
1709         if (zfsvfs->z_ctldir != NULL)
1710                 zfsctl_destroy(zfsvfs);
1711         zfs_freevfs(vfsp);
1712
1713         return (0);
1714 }
1715
1716 static int
1717 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
1718 {
1719         zfsvfs_t        *zfsvfs = vfsp->vfs_data;
1720         znode_t         *zp;
1721         int             err;
1722
1723         /*
1724          * zfs_zget() can't operate on virtual entries like .zfs/ or
1725          * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
1726          * This will make NFS to switch to LOOKUP instead of using VGET.
1727          */
1728         if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
1729             (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
1730                 return (EOPNOTSUPP);
1731
1732         ZFS_ENTER(zfsvfs);
1733         err = zfs_zget(zfsvfs, ino, &zp);
1734         if (err == 0 && zp->z_unlinked) {
1735                 vrele(ZTOV(zp));
1736                 err = EINVAL;
1737         }
1738         if (err == 0)
1739                 *vpp = ZTOV(zp);
1740         ZFS_EXIT(zfsvfs);
1741         if (err == 0) {
1742                 err = vn_lock(*vpp, flags);
1743                 if (err != 0)
1744                         vrele(*vpp);
1745         }
1746         if (err != 0)
1747                 *vpp = NULL;
1748         return (err);
1749 }
1750
1751 static int
1752 #if __FreeBSD_version >= 1300098
1753 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
1754     struct ucred **credanonp, int *numsecflavors, int *secflavors)
1755 #else
1756 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
1757     struct ucred **credanonp, int *numsecflavors, int **secflavors)
1758 #endif
1759 {
1760         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1761
1762         /*
1763          * If this is regular file system vfsp is the same as
1764          * zfsvfs->z_parent->z_vfs, but if it is snapshot,
1765          * zfsvfs->z_parent->z_vfs represents parent file system
1766          * which we have to use here, because only this file system
1767          * has mnt_export configured.
1768          */
1769         return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
1770             credanonp, numsecflavors, secflavors));
1771 }
1772
1773 CTASSERT(SHORT_FID_LEN <= sizeof (struct fid));
1774 CTASSERT(LONG_FID_LEN <= sizeof (struct fid));
1775
1776 static int
1777 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
1778 {
1779         struct componentname cn;
1780         zfsvfs_t        *zfsvfs = vfsp->vfs_data;
1781         znode_t         *zp;
1782         vnode_t         *dvp;
1783         uint64_t        object = 0;
1784         uint64_t        fid_gen = 0;
1785         uint64_t        gen_mask;
1786         uint64_t        zp_gen;
1787         int             i, err;
1788
1789         *vpp = NULL;
1790
1791         ZFS_ENTER(zfsvfs);
1792
1793         /*
1794          * On FreeBSD we can get snapshot's mount point or its parent file
1795          * system mount point depending if snapshot is already mounted or not.
1796          */
1797         if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
1798                 zfid_long_t     *zlfid = (zfid_long_t *)fidp;
1799                 uint64_t        objsetid = 0;
1800                 uint64_t        setgen = 0;
1801
1802                 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1803                         objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1804
1805                 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1806                         setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1807
1808                 ZFS_EXIT(zfsvfs);
1809
1810                 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1811                 if (err)
1812                         return (SET_ERROR(EINVAL));
1813                 ZFS_ENTER(zfsvfs);
1814         }
1815
1816         if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1817                 zfid_short_t    *zfid = (zfid_short_t *)fidp;
1818
1819                 for (i = 0; i < sizeof (zfid->zf_object); i++)
1820                         object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1821
1822                 for (i = 0; i < sizeof (zfid->zf_gen); i++)
1823                         fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1824         } else {
1825                 ZFS_EXIT(zfsvfs);
1826                 return (SET_ERROR(EINVAL));
1827         }
1828
1829         /*
1830          * A zero fid_gen means we are in .zfs or the .zfs/snapshot
1831          * directory tree. If the object == zfsvfs->z_shares_dir, then
1832          * we are in the .zfs/shares directory tree.
1833          */
1834         if ((fid_gen == 0 &&
1835             (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
1836             (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
1837                 ZFS_EXIT(zfsvfs);
1838                 VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
1839                 if (object == ZFSCTL_INO_SNAPDIR) {
1840                         cn.cn_nameptr = "snapshot";
1841                         cn.cn_namelen = strlen(cn.cn_nameptr);
1842                         cn.cn_nameiop = LOOKUP;
1843                         cn.cn_flags = ISLASTCN | LOCKLEAF;
1844                         cn.cn_lkflags = flags;
1845                         VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1846                         vput(dvp);
1847                 } else if (object == zfsvfs->z_shares_dir) {
1848                         /*
1849                          * XXX This branch must not be taken,
1850                          * if it is, then the lookup below will
1851                          * explode.
1852                          */
1853                         cn.cn_nameptr = "shares";
1854                         cn.cn_namelen = strlen(cn.cn_nameptr);
1855                         cn.cn_nameiop = LOOKUP;
1856                         cn.cn_flags = ISLASTCN;
1857                         cn.cn_lkflags = flags;
1858                         VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1859                         vput(dvp);
1860                 } else {
1861                         *vpp = dvp;
1862                 }
1863                 return (err);
1864         }
1865
1866         gen_mask = -1ULL >> (64 - 8 * i);
1867
1868         dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
1869         if ((err = zfs_zget(zfsvfs, object, &zp))) {
1870                 ZFS_EXIT(zfsvfs);
1871                 return (err);
1872         }
1873         (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
1874             sizeof (uint64_t));
1875         zp_gen = zp_gen & gen_mask;
1876         if (zp_gen == 0)
1877                 zp_gen = 1;
1878         if (zp->z_unlinked || zp_gen != fid_gen) {
1879                 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
1880                 vrele(ZTOV(zp));
1881                 ZFS_EXIT(zfsvfs);
1882                 return (SET_ERROR(EINVAL));
1883         }
1884
1885         *vpp = ZTOV(zp);
1886         ZFS_EXIT(zfsvfs);
1887         err = vn_lock(*vpp, flags);
1888         if (err == 0)
1889                 vnode_create_vobject(*vpp, zp->z_size, curthread);
1890         else
1891                 *vpp = NULL;
1892         return (err);
1893 }
1894
1895 /*
1896  * Block out VOPs and close zfsvfs_t::z_os
1897  *
1898  * Note, if successful, then we return with the 'z_teardown_lock' and
1899  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
1900  * dataset and objset intact so that they can be atomically handed off during
1901  * a subsequent rollback or recv operation and the resume thereafter.
1902  */
1903 int
1904 zfs_suspend_fs(zfsvfs_t *zfsvfs)
1905 {
1906         int error;
1907
1908         if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1909                 return (error);
1910
1911         return (0);
1912 }
1913
1914 /*
1915  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
1916  * is an invariant across any of the operations that can be performed while the
1917  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
1918  * are the same: the relevant objset and associated dataset are owned by
1919  * zfsvfs, held, and long held on entry.
1920  */
1921 int
1922 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
1923 {
1924         int err;
1925         znode_t *zp;
1926
1927         ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
1928         ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
1929
1930         /*
1931          * We already own this, so just update the objset_t, as the one we
1932          * had before may have been evicted.
1933          */
1934         objset_t *os;
1935         VERIFY3P(ds->ds_owner, ==, zfsvfs);
1936         VERIFY(dsl_dataset_long_held(ds));
1937         dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
1938         dsl_pool_config_enter(dp, FTAG);
1939         VERIFY0(dmu_objset_from_ds(ds, &os));
1940         dsl_pool_config_exit(dp, FTAG);
1941
1942         err = zfsvfs_init(zfsvfs, os);
1943         if (err != 0)
1944                 goto bail;
1945
1946         ds->ds_dir->dd_activity_cancelled = B_FALSE;
1947         VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
1948
1949         zfs_set_fuid_feature(zfsvfs);
1950
1951         /*
1952          * Attempt to re-establish all the active znodes with
1953          * their dbufs.  If a zfs_rezget() fails, then we'll let
1954          * any potential callers discover that via ZFS_ENTER_VERIFY_VP
1955          * when they try to use their znode.
1956          */
1957         mutex_enter(&zfsvfs->z_znodes_lock);
1958         for (zp = list_head(&zfsvfs->z_all_znodes); zp;
1959             zp = list_next(&zfsvfs->z_all_znodes, zp)) {
1960                 (void) zfs_rezget(zp);
1961         }
1962         mutex_exit(&zfsvfs->z_znodes_lock);
1963
1964 bail:
1965         /* release the VOPs */
1966         ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
1967         ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1968
1969         if (err) {
1970                 /*
1971                  * Since we couldn't setup the sa framework, try to force
1972                  * unmount this file system.
1973                  */
1974                 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
1975                         vfs_ref(zfsvfs->z_vfs);
1976                         (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
1977                 }
1978         }
1979         return (err);
1980 }
1981
1982 static void
1983 zfs_freevfs(vfs_t *vfsp)
1984 {
1985         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1986
1987         zfsvfs_free(zfsvfs);
1988
1989         atomic_dec_32(&zfs_active_fs_count);
1990 }
1991
#if defined(__i386__)
#include <sys/vmmeter.h>

#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>

/* Value of desiredvnodes recorded by zfs_vnodes_adjust(). */
static int desiredvnodes_backup;
#endif

/*
 * On i386, lower desiredvnodes to three quarters of its computed default,
 * but only if it still equals that default.  The current value is saved
 * in desiredvnodes_backup first.  Does nothing on other architectures.
 */
static void
zfs_vnodes_adjust(void)
{
#if defined(__i386__)
	int untuned;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * Recompute the value the same way vntblinit() does.  If
	 * desiredvnodes still matches it, the administrator has not tuned
	 * it and it is fine to tune it down.
	 */
	untuned = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof (struct vm_object) +
	    sizeof (struct vnode))));
	if (desiredvnodes == untuned)
		desiredvnodes = (3 * untuned) / 4;
#endif
}
2023
/*
 * On i386, put desiredvnodes back to the value held in
 * desiredvnodes_backup.  Does nothing on other architectures.
 */
static void
zfs_vnodes_adjust_back(void)
{
#if defined(__i386__)
	desiredvnodes = desiredvnodes_backup;
#endif
}
2032
2033 void
2034 zfs_init(void)
2035 {
2036
2037         printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
2038
2039         /*
2040          * Initialize .zfs directory structures
2041          */
2042         zfsctl_init();
2043
2044         /*
2045          * Initialize znode cache, vnode ops, etc...
2046          */
2047         zfs_znode_init();
2048
2049         /*
2050          * Reduce number of vnodes. Originally number of vnodes is calculated
2051          * with UFS inode in mind. We reduce it here, because it's too big for
2052          * ZFS/i386.
2053          */
2054         zfs_vnodes_adjust();
2055
2056         dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
2057
2058         zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
2059 }
2060
2061 void
2062 zfs_fini(void)
2063 {
2064         taskq_destroy(zfsvfs_taskq);
2065         zfsctl_fini();
2066         zfs_znode_fini();
2067         zfs_vnodes_adjust_back();
2068 }
2069
2070 int
2071 zfs_busy(void)
2072 {
2073         return (zfs_active_fs_count != 0);
2074 }
2075
2076 /*
2077  * Release VOPs and unmount a suspended filesystem.
2078  */
2079 int
2080 zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2081 {
2082         ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
2083         ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
2084
2085         /*
2086          * We already own this, so just hold and rele it to update the
2087          * objset_t, as the one we had before may have been evicted.
2088          */
2089         objset_t *os;
2090         VERIFY3P(ds->ds_owner, ==, zfsvfs);
2091         VERIFY(dsl_dataset_long_held(ds));
2092         dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
2093         dsl_pool_config_enter(dp, FTAG);
2094         VERIFY0(dmu_objset_from_ds(ds, &os));
2095         dsl_pool_config_exit(dp, FTAG);
2096         zfsvfs->z_os = os;
2097
2098         /* release the VOPs */
2099         ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
2100         ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
2101
2102         /*
2103          * Try to force unmount this file system.
2104          */
2105         (void) zfs_umount(zfsvfs->z_vfs, 0);
2106         zfsvfs->z_unmounted = B_TRUE;
2107         return (0);
2108 }
2109
2110 int
2111 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2112 {
2113         int error;
2114         objset_t *os = zfsvfs->z_os;
2115         dmu_tx_t *tx;
2116
2117         if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2118                 return (SET_ERROR(EINVAL));
2119
2120         if (newvers < zfsvfs->z_version)
2121                 return (SET_ERROR(EINVAL));
2122
2123         if (zfs_spa_version_map(newvers) >
2124             spa_version(dmu_objset_spa(zfsvfs->z_os)))
2125                 return (SET_ERROR(ENOTSUP));
2126
2127         tx = dmu_tx_create(os);
2128         dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2129         if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2130                 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2131                     ZFS_SA_ATTRS);
2132                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2133         }
2134         error = dmu_tx_assign(tx, TXG_WAIT);
2135         if (error) {
2136                 dmu_tx_abort(tx);
2137                 return (error);
2138         }
2139
2140         error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2141             8, 1, &newvers, tx);
2142
2143         if (error) {
2144                 dmu_tx_commit(tx);
2145                 return (error);
2146         }
2147
2148         if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2149                 uint64_t sa_obj;
2150
2151                 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2152                     SPA_VERSION_SA);
2153                 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2154                     DMU_OT_NONE, 0, tx);
2155
2156                 error = zap_add(os, MASTER_NODE_OBJ,
2157                     ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2158                 ASSERT0(error);
2159
2160                 VERIFY(0 == sa_set_sa_object(os, sa_obj));
2161                 sa_register_update_callback(os, zfs_sa_upgrade);
2162         }
2163
2164         spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2165             "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
2166             (uintmax_t)newvers);
2167         dmu_tx_commit(tx);
2168
2169         zfsvfs->z_version = newvers;
2170         os->os_version = newvers;
2171
2172         zfs_set_fuid_feature(zfsvfs);
2173
2174         return (0);
2175 }
2176
2177 /*
2178  * Read a property stored within the master node.
2179  */
2180 int
2181 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2182 {
2183         uint64_t *cached_copy = NULL;
2184
2185         /*
2186          * Figure out where in the objset_t the cached copy would live, if it
2187          * is available for the requested property.
2188          */
2189         if (os != NULL) {
2190                 switch (prop) {
2191                 case ZFS_PROP_VERSION:
2192                         cached_copy = &os->os_version;
2193                         break;
2194                 case ZFS_PROP_NORMALIZE:
2195                         cached_copy = &os->os_normalization;
2196                         break;
2197                 case ZFS_PROP_UTF8ONLY:
2198                         cached_copy = &os->os_utf8only;
2199                         break;
2200                 case ZFS_PROP_CASE:
2201                         cached_copy = &os->os_casesensitivity;
2202                         break;
2203                 default:
2204                         break;
2205                 }
2206         }
2207         if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
2208                 *value = *cached_copy;
2209                 return (0);
2210         }
2211
2212         /*
2213          * If the property wasn't cached, look up the file system's value for
2214          * the property. For the version property, we look up a slightly
2215          * different string.
2216          */
2217         const char *pname;
2218         int error = ENOENT;
2219         if (prop == ZFS_PROP_VERSION) {
2220                 pname = ZPL_VERSION_STR;
2221         } else {
2222                 pname = zfs_prop_to_name(prop);
2223         }
2224
2225         if (os != NULL) {
2226                 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
2227                 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2228         }
2229
2230         if (error == ENOENT) {
2231                 /* No value set, use the default value */
2232                 switch (prop) {
2233                 case ZFS_PROP_VERSION:
2234                         *value = ZPL_VERSION;
2235                         break;
2236                 case ZFS_PROP_NORMALIZE:
2237                 case ZFS_PROP_UTF8ONLY:
2238                         *value = 0;
2239                         break;
2240                 case ZFS_PROP_CASE:
2241                         *value = ZFS_CASE_SENSITIVE;
2242                         break;
2243                 case ZFS_PROP_ACLTYPE:
2244                         *value = ZFS_ACLTYPE_NFSV4;
2245                         break;
2246                 default:
2247                         return (error);
2248                 }
2249                 error = 0;
2250         }
2251
2252         /*
2253          * If one of the methods for getting the property value above worked,
2254          * copy it into the objset_t's cache.
2255          */
2256         if (error == 0 && cached_copy != NULL) {
2257                 *cached_copy = *value;
2258         }
2259
2260         return (error);
2261 }
2262
2263 /*
2264  * Return true if the corresponding vfs's unmounted flag is set.
2265  * Otherwise return false.
2266  * If this function returns true we know VFS unmount has been initiated.
2267  */
2268 boolean_t
2269 zfs_get_vfs_flag_unmounted(objset_t *os)
2270 {
2271         zfsvfs_t *zfvp;
2272         boolean_t unmounted = B_FALSE;
2273
2274         ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
2275
2276         mutex_enter(&os->os_user_ptr_lock);
2277         zfvp = dmu_objset_get_user(os);
2278         if (zfvp != NULL && zfvp->z_vfs != NULL &&
2279             (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
2280                 unmounted = B_TRUE;
2281         mutex_exit(&os->os_user_ptr_lock);
2282
2283         return (unmounted);
2284 }
2285
#ifdef _KERNEL
/*
 * Rewrite f_mntfromname of every mount affected by a rename.
 *
 *	oldname	- name before the rename
 *	newname	- name after the rename
 *
 * A mount whose f_mntfromname equals oldname gets newname.  A mount whose
 * f_mntfromname starts with oldname followed by '/' or '@' has just that
 * prefix replaced.  All other mounts are left alone.  The mount list is
 * walked with mountlist_mtx held.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	char renamed[MAXPATHLEN];
	const size_t prefixlen = strlen(oldname);
	struct mount *mp;

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		char *from = mp->mnt_stat.f_mntfromname;
		const size_t fromsz = sizeof (mp->mnt_stat.f_mntfromname);

		if (strcmp(from, oldname) == 0) {
			/* Exact match. */
			(void) strlcpy(from, newname, fromsz);
		} else if (strncmp(from, oldname, prefixlen) == 0 &&
		    (from[prefixlen] == '/' || from[prefixlen] == '@')) {
			/*
			 * Prefix match.  The new name is assembled in a
			 * scratch buffer because its tail is read from
			 * the very string that is about to be replaced.
			 */
			(void) snprintf(renamed, sizeof (renamed), "%s%s",
			    newname, from + prefixlen);
			(void) strlcpy(from, renamed, fromsz);
		}
	}
	mtx_unlock(&mountlist_mtx);
}
#endif