sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
  24  * All rights reserved.
  25  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  26  * Copyright (c) 2014 Integros [integros.com]
  27  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
  28  */
  29
  30 /* Portions Copyright 2010 Robert Milkowski */
  31
  32 #include <sys/types.h>
  33 #include <sys/param.h>
  34 #include <sys/systm.h>
  35 #include <sys/kernel.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/kmem.h>
  38 #include <sys/acl.h>
  39 #include <sys/vnode.h>
  40 #include <sys/vfs.h>
  41 #include <sys/mntent.h>
  42 #include <sys/mount.h>
  43 #include <sys/cmn_err.h>
  44 #include <sys/zfs_znode.h>
  45 #include <sys/zfs_dir.h>
  46 #include <sys/zil.h>
  47 #include <sys/fs/zfs.h>
  48 #include <sys/dmu.h>
  49 #include <sys/dsl_prop.h>
  50 #include <sys/dsl_dataset.h>
  51 #include <sys/dsl_deleg.h>
  52 #include <sys/spa.h>
  53 #include <sys/zap.h>
  54 #include <sys/sa.h>
  55 #include <sys/sa_impl.h>
  56 #include <sys/varargs.h>
  57 #include <sys/policy.h>
  58 #include <sys/atomic.h>
  59 #include <sys/zfs_ioctl.h>
  60 #include <sys/zfs_ctldir.h>
  61 #include <sys/zfs_fuid.h>
  62 #include <sys/sunddi.h>
  63 #include <sys/dnlc.h>
  64 #include <sys/dmu_objset.h>
  65 #include <sys/spa_boot.h>
  66 #include <sys/jail.h>
  67 #include <ufs/ufs/quota.h>
  68
  69 #include "zfs_comutil.h"
  70
/*
 * Debug mutex.  MTX_SYSINIT sets it up as a default (MTX_DEF) mutex
 * named "zfs_debug".
 */
  71 struct mtx zfs_debug_mtx;
  72 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
  73
/* Parent sysctl node "zfs" under _vfs; the knobs below hang off it. */
  74 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
  75
/* Read/write knob; see the description string for its meaning. */
  76 int zfs_super_owner;
  77 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
  78     "File system owner can perform privileged operation on his file systems");
  79
/* Debug level, exported with CTLFLAG_RWTUN. */
  80 int zfs_debug_level;
  81 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
  82     "Debug level");
  83
/*
 * Read-only "version" sub-node exporting the compile-time ACL, SPA and
 * ZPL version constants.
 */
  84 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
  85 static int zfs_version_acl = ZFS_ACL_VERSION;
  86 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
  87     "ZFS_ACL_VERSION");
  88 static int zfs_version_spa = SPA_VERSION;
  89 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
  90     "SPA_VERSION");
  91 static int zfs_version_zpl = ZPL_VERSION;
  92 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
  93     "ZPL_VERSION");
  94
/* Forward declarations for this file's static functions. */
  95 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
  96 static int zfs_mount(vfs_t *vfsp);
  97 static int zfs_umount(vfs_t *vfsp, int fflag);
  98 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
  99 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
 100 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
 101 static int zfs_sync(vfs_t *vfsp, int waitfor);
 102 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
 103     struct ucred **credanonp, int *numsecflavors, int **secflavors);
 104 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
 105 static void zfs_objset_close(zfsvfs_t *zfsvfs);
 106 static void zfs_freevfs(vfs_t *vfsp);
 107
/*
 * VFS operations vector for ZFS.  Note that .vfs_root is the generic
 * vfs_cache_root while zfs_root is installed as .vfs_cachedroot.
 */
 108 struct vfsops zfs_vfsops = {
 109         .vfs_mount =            zfs_mount,
 110         .vfs_unmount =          zfs_umount,
 111         .vfs_root =             vfs_cache_root,
 112         .vfs_cachedroot =       zfs_root,
 113         .vfs_statfs =           zfs_statfs,
 114         .vfs_vget =             zfs_vget,
 115         .vfs_sync =             zfs_sync,
 116         .vfs_checkexp =         zfs_checkexp,
 117         .vfs_fhtovp =           zfs_fhtovp,
 118         .vfs_quotactl =         zfs_quotactl,
 119 };
 120
/* Declare the "zfs" file system type with the JAIL and DELEGADMIN flags. */
 121 VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
 122
 123 /*
 124  * We need to keep a count of active fs's.
 125  * This is necessary to prevent our module
 126  * from being unloaded after a umount -f
 127  */
 128 static uint32_t zfs_active_fs_count = 0;
 129
 130 static int
 131 zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
 132 {
 133         int error = 0;
 134         char buf[32];
 135         int err;
 136         uint64_t usedobj, quotaobj;
 137         uint64_t quota, used = 0;
 138         timespec_t now;
 139
 140         usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
 141         quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 142
 143         if (quotaobj == 0 || zfsvfs->z_replay) {
 144                 error = EINVAL;
 145                 goto done;
 146         }
 147         (void)sprintf(buf, "%llx", (longlong_t)id);
 148         if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
 149                                 buf, sizeof(quota), 1, &quota)) != 0) {
 150                 dprintf("%s(%d): quotaobj lookup failed\n", __FUNCTION__, __LINE__);
 151                 goto done;
 152         }
 153         /*
 154          * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
 155          * So we set them to be the same.
 156          */
 157         dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
 158         error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof(used), 1, &used);
 159         if (error && error != ENOENT) {
 160                 dprintf("%s(%d):  usedobj failed; %d\n", __FUNCTION__, __LINE__, error);
 161                 goto done;
 162         }
 163         dqp->dqb_curblocks = btodb(used);
 164         dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
 165         vfs_timestamp(&now);
 166         /*
 167          * Setting this to 0 causes FreeBSD quota(8) to print
 168          * the number of days since the epoch, which isn't
 169          * particularly useful.
 170          */
 171         dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
 172 done:
 173         return (error);
 174 }
 175
 176 static int
 177 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
 178 {
 179         zfsvfs_t *zfsvfs = vfsp->vfs_data;
 180         struct thread *td;
 181         int cmd, type, error = 0;
 182         int bitsize;
 183         uint64_t fuid;
 184         zfs_userquota_prop_t quota_type;
 185         struct dqblk64 dqblk = { 0 };
 186
 187         td = curthread;
 188         cmd = cmds >> SUBCMDSHIFT;
 189         type = cmds & SUBCMDMASK;
 190
 191         ZFS_ENTER(zfsvfs);
 192         if (id == -1) {
 193                 switch (type) {
 194                 case USRQUOTA:
 195                         id = td->td_ucred->cr_ruid;
 196                         break;
 197                 case GRPQUOTA:
 198                         id = td->td_ucred->cr_rgid;
 199                         break;
 200                 default:
 201                         error = EINVAL;
 202                         if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
 203                                 vfs_unbusy(vfsp);
 204                         goto done;
 205                 }
 206         }
 207         /*
 208          * Map BSD type to:
 209          * ZFS_PROP_USERUSED,
 210          * ZFS_PROP_USERQUOTA,
 211          * ZFS_PROP_GROUPUSED,
 212          * ZFS_PROP_GROUPQUOTA
 213          */
 214         switch (cmd) {
 215         case Q_SETQUOTA:
 216         case Q_SETQUOTA32:
 217                 if (type == USRQUOTA)
 218                         quota_type = ZFS_PROP_USERQUOTA;
 219                 else if (type == GRPQUOTA)
 220                         quota_type = ZFS_PROP_GROUPQUOTA;
 221                 else
 222                         error = EINVAL;
 223                 break;
 224         case Q_GETQUOTA:
 225         case Q_GETQUOTA32:
 226                 if (type == USRQUOTA)
 227                         quota_type = ZFS_PROP_USERUSED;
 228                 else if (type == GRPQUOTA)
 229                         quota_type = ZFS_PROP_GROUPUSED;
 230                 else
 231                         error = EINVAL;
 232                 break;
 233         }
 234
 235         /*
 236          * Depending on the cmd, we may need to get
 237          * the ruid and domain (see fuidstr_to_sid?),
 238          * the fuid (how?), or other information.
 239          * Create fuid using zfs_fuid_create(zfsvfs, id,
 240          * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
 241          * I think I can use just the id?
 242          *
 243          * Look at zfs_fuid_overquota() to look up a quota.
 244          * zap_lookup(something, quotaobj, fuidstring, sizeof(long long), 1, &quota)
 245          *
 246          * See zfs_set_userquota() to set a quota.
 247          */
 248         if ((u_int)type >= MAXQUOTAS) {
 249                 error = EINVAL;
 250                 goto done;
 251         }
 252
 253         switch (cmd) {
 254         case Q_GETQUOTASIZE:
 255                 bitsize = 64;
 256                 error = copyout(&bitsize, arg, sizeof(int));
 257                 break;
 258         case Q_QUOTAON:
 259                 // As far as I can tell, you can't turn quotas on or off on zfs
 260                 error = 0;
 261                 vfs_unbusy(vfsp);
 262                 break;
 263         case Q_QUOTAOFF:
 264                 error = ENOTSUP;
 265                 vfs_unbusy(vfsp);
 266                 break;
 267         case Q_SETQUOTA:
 268                 error = copyin(&dqblk, arg, sizeof(dqblk));
 269                 if (error == 0)
 270                         error = zfs_set_userquota(zfsvfs, quota_type,
 271                                                   "", id, dbtob(dqblk.dqb_bhardlimit));
 272                 break;
 273         case Q_GETQUOTA:
 274                 error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
 275                 if (error == 0)
 276                         error = copyout(&dqblk, arg, sizeof(dqblk));
 277                 break;
 278         default:
 279                 error = EINVAL;
 280                 break;
 281         }
 282 done:
 283         ZFS_EXIT(zfsvfs);
 284         return (error);
 285 }
 286
 287 /*ARGSUSED*/
 288 static int
 289 zfs_sync(vfs_t *vfsp, int waitfor)
 290 {
 291
 292         /*
 293          * Data integrity is job one.  We don't want a compromised kernel
 294          * writing to the storage pool, so we never sync during panic.
 295          */
 296         if (panicstr)
 297                 return (0);
 298
 299         /*
 300          * Ignore the system syncher.  ZFS already commits async data
 301          * at zfs_txg_timeout intervals.
 302          */
 303         if (waitfor == MNT_LAZY)
 304                 return (0);
 305
 306         if (vfsp != NULL) {
 307                 /*
 308                  * Sync a specific filesystem.
 309                  */
 310                 zfsvfs_t *zfsvfs = vfsp->vfs_data;
 311                 dsl_pool_t *dp;
 312                 int error;
 313
 314                 error = vfs_stdsync(vfsp, waitfor);
 315                 if (error != 0)
 316                         return (error);
 317
 318                 ZFS_ENTER(zfsvfs);
 319                 dp = dmu_objset_pool(zfsvfs->z_os);
 320
 321                 /*
 322                  * If the system is shutting down, then skip any
 323                  * filesystems which may exist on a suspended pool.
 324                  */
 325                 if (sys_shutdown && spa_suspended(dp->dp_spa)) {
 326                         ZFS_EXIT(zfsvfs);
 327                         return (0);
 328                 }
 329
 330                 if (zfsvfs->z_log != NULL)
 331                         zil_commit(zfsvfs->z_log, 0);
 332
 333                 ZFS_EXIT(zfsvfs);
 334         } else {
 335                 /*
 336                  * Sync all ZFS filesystems.  This is what happens when you
 337                  * run sync(1M).  Unlike other filesystems, ZFS honors the
 338                  * request by waiting for all pools to commit all dirty data.
 339                  */
 340                 spa_sync_allpools();
 341         }
 342
 343         return (0);
 344 }
 345
/*
 * Compiled only when __FreeBSD_kernel__ is not defined.
 *
 * Pick a (zfs_major, zfs_minor) pair that is not currently mounted and
 * store it in *dev.  Minors are tried one after another, wrapping at
 * MAXMIN32; when a whole sweep finds nothing free, a new major number is
 * requested with getudev() and the search restarts.
 *
 * Returns 0 on success, -1 if getudev() cannot supply a new major.
 */
 346 #ifndef __FreeBSD_kernel__
 347 static int
 348 zfs_create_unique_device(dev_t *dev)
 349 {
 350         major_t new_major;
 351
 352         do {
 353                 ASSERT3U(zfs_minor, <=, MAXMIN32);
                /* Remember where this sweep began to detect a full wrap. */
                /* NOTE(review): zfs_minor is read here and in the loop */
                /* conditions without zfs_dev_mtx held -- confirm intent. */
 354                 minor_t start = zfs_minor;
 355                 do {
 356                         mutex_enter(&zfs_dev_mtx);
 357                         if (zfs_minor >= MAXMIN32) {
 358                                 /*
 359                                  * If we're still using the real major
 360                                  * keep out of /dev/zfs and /dev/zvol minor
 361                                  * number space.  If we're using a getudev()'ed
 362                                  * major number, we can use all of its minors.
 363                                  */
 364                                 if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
 365                                         zfs_minor = ZFS_MIN_MINOR;
 366                                 else
 367                                         zfs_minor = 0;
 368                         } else {
 369                                 zfs_minor++;
 370                         }
 371                         *dev = makedevice(zfs_major, zfs_minor);
 372                         mutex_exit(&zfs_dev_mtx);
                        /* Keep going while the candidate is in use and */
                        /* we have not come back around to the start. */
 373                 } while (vfs_devismounted(*dev) && zfs_minor != start);
 374                 if (zfs_minor == start) {
 375                         /*
 376                          * We are using all ~262,000 minor numbers for the
 377                          * current major number.  Create a new major number.
 378                          */
 379                         if ((new_major = getudev()) == (major_t)-1) {
 380                                 cmn_err(CE_WARN,
 381                                     "zfs_mount: Can't get unique major "
 382                                     "device number.");
 383                                 return (-1);
 384                         }
 385                         mutex_enter(&zfs_dev_mtx);
 386                         zfs_major = new_major;
 387                         zfs_minor = 0;
 388
 389                         mutex_exit(&zfs_dev_mtx);
 390                 } else {
                        /* Found an unmounted device number; done. */
 391                         break;
 392                 }
 393                 /* CONSTANTCONDITION */
 394         } while (1);
 395
 396         return (0);
 397 }
 398 #endif  /* !__FreeBSD_kernel__ */
 399
 400 static void
 401 atime_changed_cb(void *arg, uint64_t newval)
 402 {
 403         zfsvfs_t *zfsvfs = arg;
 404
 405         if (newval == TRUE) {
 406                 zfsvfs->z_atime = TRUE;
 407                 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
 408                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
 409                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
 410         } else {
 411                 zfsvfs->z_atime = FALSE;
 412                 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
 413                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
 414                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
 415         }
 416 }
 417
 418 static void
 419 xattr_changed_cb(void *arg, uint64_t newval)
 420 {
 421         zfsvfs_t *zfsvfs = arg;
 422
 423         if (newval == TRUE) {
 424                 /* XXX locking on vfs_flag? */
 425 #ifdef TODO
 426                 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
 427 #endif
 428                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
 429                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
 430         } else {
 431                 /* XXX locking on vfs_flag? */
 432 #ifdef TODO
 433                 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
 434 #endif
 435                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
 436                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
 437         }
 438 }
 439
 440 static void
 441 blksz_changed_cb(void *arg, uint64_t newval)
 442 {
 443         zfsvfs_t *zfsvfs = arg;
 444         ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
 445         ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
 446         ASSERT(ISP2(newval));
 447
 448         zfsvfs->z_max_blksz = newval;
 449         zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
 450 }
 451
 452 static void
 453 readonly_changed_cb(void *arg, uint64_t newval)
 454 {
 455         zfsvfs_t *zfsvfs = arg;
 456
 457         if (newval) {
 458                 /* XXX locking on vfs_flag? */
 459                 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
 460                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
 461                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
 462         } else {
 463                 /* XXX locking on vfs_flag? */
 464                 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 465                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
 466                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
 467         }
 468 }
 469
 470 static void
 471 setuid_changed_cb(void *arg, uint64_t newval)
 472 {
 473         zfsvfs_t *zfsvfs = arg;
 474
 475         if (newval == FALSE) {
 476                 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
 477                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
 478                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
 479         } else {
 480                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
 481                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
 482                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
 483         }
 484 }
 485
 486 static void
 487 exec_changed_cb(void *arg, uint64_t newval)
 488 {
 489         zfsvfs_t *zfsvfs = arg;
 490
 491         if (newval == FALSE) {
 492                 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
 493                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
 494                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
 495         } else {
 496                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
 497                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
 498                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
 499         }
 500 }
 501
 502 /*
 503  * The nbmand mount option can be changed at mount time.
 504  * We can't allow it to be toggled on live file systems or incorrect
 505  * behavior may be seen from cifs clients
 506  *
 507  * This property isn't registered via dsl_prop_register(), but this callback
 508  * will be called when a file system is first mounted
 509  */
 510 static void
 511 nbmand_changed_cb(void *arg, uint64_t newval)
 512 {
 513         zfsvfs_t *zfsvfs = arg;
 514         if (newval == FALSE) {
 515                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
 516                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
 517         } else {
 518                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
 519                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
 520         }
 521 }
 522
 523 static void
 524 snapdir_changed_cb(void *arg, uint64_t newval)
 525 {
 526         zfsvfs_t *zfsvfs = arg;
 527
 528         zfsvfs->z_show_ctldir = newval;
 529 }
 530
 531 static void
 532 vscan_changed_cb(void *arg, uint64_t newval)
 533 {
 534         zfsvfs_t *zfsvfs = arg;
 535
 536         zfsvfs->z_vscan = newval;
 537 }
 538
 539 static void
 540 acl_mode_changed_cb(void *arg, uint64_t newval)
 541 {
 542         zfsvfs_t *zfsvfs = arg;
 543
 544         zfsvfs->z_acl_mode = newval;
 545 }
 546
 547 static void
 548 acl_inherit_changed_cb(void *arg, uint64_t newval)
 549 {
 550         zfsvfs_t *zfsvfs = arg;
 551
 552         zfsvfs->z_acl_inherit = newval;
 553 }
 554
 555 static int
 556 zfs_register_callbacks(vfs_t *vfsp)
 557 {
 558         struct dsl_dataset *ds = NULL;
 559         objset_t *os = NULL;
 560         zfsvfs_t *zfsvfs = NULL;
 561         uint64_t nbmand;
 562         boolean_t readonly = B_FALSE;
 563         boolean_t do_readonly = B_FALSE;
 564         boolean_t setuid = B_FALSE;
 565         boolean_t do_setuid = B_FALSE;
 566         boolean_t exec = B_FALSE;
 567         boolean_t do_exec = B_FALSE;
 568 #ifdef illumos
 569         boolean_t devices = B_FALSE;
 570         boolean_t do_devices = B_FALSE;
 571 #endif
 572         boolean_t xattr = B_FALSE;
 573         boolean_t do_xattr = B_FALSE;
 574         boolean_t atime = B_FALSE;
 575         boolean_t do_atime = B_FALSE;
 576         int error = 0;
 577
 578         ASSERT(vfsp);
 579         zfsvfs = vfsp->vfs_data;
 580         ASSERT(zfsvfs);
 581         os = zfsvfs->z_os;
 582
 583         /*
 584          * This function can be called for a snapshot when we update snapshot's
 585          * mount point, which isn't really supported.
 586          */
 587         if (dmu_objset_is_snapshot(os))
 588                 return (EOPNOTSUPP);
 589
 590         /*
 591          * The act of registering our callbacks will destroy any mount
 592          * options we may have.  In order to enable temporary overrides
 593          * of mount options, we stash away the current values and
 594          * restore them after we register the callbacks.
 595          */
 596         if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
 597             !spa_writeable(dmu_objset_spa(os))) {
 598                 readonly = B_TRUE;
 599                 do_readonly = B_TRUE;
 600         } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
 601                 readonly = B_FALSE;
 602                 do_readonly = B_TRUE;
 603         }
 604         if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 605                 setuid = B_FALSE;
 606                 do_setuid = B_TRUE;
 607         } else {
 608                 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 609                         setuid = B_FALSE;
 610                         do_setuid = B_TRUE;
 611                 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
 612                         setuid = B_TRUE;
 613                         do_setuid = B_TRUE;
 614                 }
 615         }
 616         if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 617                 exec = B_FALSE;
 618                 do_exec = B_TRUE;
 619         } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 620                 exec = B_TRUE;
 621                 do_exec = B_TRUE;
 622         }
 623         if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 624                 xattr = B_FALSE;
 625                 do_xattr = B_TRUE;
 626         } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 627                 xattr = B_TRUE;
 628                 do_xattr = B_TRUE;
 629         }
 630         if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
 631                 atime = B_FALSE;
 632                 do_atime = B_TRUE;
 633         } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
 634                 atime = B_TRUE;
 635                 do_atime = B_TRUE;
 636         }
 637
 638         /*
 639          * We need to enter pool configuration here, so that we can use
 640          * dsl_prop_get_int_ds() to handle the special nbmand property below.
 641          * dsl_prop_get_integer() can not be used, because it has to acquire
 642          * spa_namespace_lock and we can not do that because we already hold
 643          * z_teardown_lock.  The problem is that spa_write_cachefile() is called
 644          * with spa_namespace_lock held and the function calls ZFS vnode
 645          * operations to write the cache file and thus z_teardown_lock is
 646          * acquired after spa_namespace_lock.
 647          */
 648         ds = dmu_objset_ds(os);
 649         dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 650
 651         /*
 652          * nbmand is a special property.  It can only be changed at
 653          * mount time.
 654          *
 655          * This is weird, but it is documented to only be changeable
 656          * at mount time.
 657          */
 658         if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
 659                 nbmand = B_FALSE;
 660         } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
 661                 nbmand = B_TRUE;
 662         } else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) {
 663                 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 664                 return (error);
 665         }
 666
 667         /*
 668          * Register property callbacks.
 669          *
 670          * It would probably be fine to just check for i/o error from
 671          * the first prop_register(), but I guess I like to go
 672          * overboard...
 673          */
 674         error = dsl_prop_register(ds,
 675             zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
 676         error = error ? error : dsl_prop_register(ds,
 677             zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
 678         error = error ? error : dsl_prop_register(ds,
 679             zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
 680         error = error ? error : dsl_prop_register(ds,
 681             zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
 682 #ifdef illumos
 683         error = error ? error : dsl_prop_register(ds,
 684             zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
 685 #endif
 686         error = error ? error : dsl_prop_register(ds,
 687             zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
 688         error = error ? error : dsl_prop_register(ds,
 689             zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
 690         error = error ? error : dsl_prop_register(ds,
 691             zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
 692         error = error ? error : dsl_prop_register(ds,
 693             zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
 694         error = error ? error : dsl_prop_register(ds,
 695             zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
 696             zfsvfs);
 697         error = error ? error : dsl_prop_register(ds,
 698             zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
 699         dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 700         if (error)
 701                 goto unregister;
 702
 703         /*
 704          * Invoke our callbacks to restore temporary mount options.
 705          */
 706         if (do_readonly)
 707                 readonly_changed_cb(zfsvfs, readonly);
 708         if (do_setuid)
 709                 setuid_changed_cb(zfsvfs, setuid);
 710         if (do_exec)
 711                 exec_changed_cb(zfsvfs, exec);
 712         if (do_xattr)
 713                 xattr_changed_cb(zfsvfs, xattr);
 714         if (do_atime)
 715                 atime_changed_cb(zfsvfs, atime);
 716
 717         nbmand_changed_cb(zfsvfs, nbmand);
 718
 719         return (0);
 720
 721 unregister:
 722         dsl_prop_unregister_all(ds, zfsvfs);
 723         return (error);
 724 }
 725
 726 static int
 727 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
 728     uint64_t *userp, uint64_t *groupp)
 729 {
 730         /*
 731          * Is it a valid type of object to track?
 732          */
 733         if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
 734                 return (SET_ERROR(ENOENT));
 735
 736         /*
 737          * If we have a NULL data pointer
 738          * then assume the id's aren't changing and
 739          * return EEXIST to the dmu to let it know to
 740          * use the same ids
 741          */
 742         if (data == NULL)
 743                 return (SET_ERROR(EEXIST));
 744
 745         if (bonustype == DMU_OT_ZNODE) {
 746                 znode_phys_t *znp = data;
 747                 *userp = znp->zp_uid;
 748                 *groupp = znp->zp_gid;
 749         } else {
 750                 int hdrsize;
 751                 sa_hdr_phys_t *sap = data;
 752                 sa_hdr_phys_t sa = *sap;
 753                 boolean_t swap = B_FALSE;
 754
 755                 ASSERT(bonustype == DMU_OT_SA);
 756
 757                 if (sa.sa_magic == 0) {
 758                         /*
 759                          * This should only happen for newly created
 760                          * files that haven't had the znode data filled
 761                          * in yet.
 762                          */
 763                         *userp = 0;
 764                         *groupp = 0;
 765                         return (0);
 766                 }
 767                 if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
 768                         sa.sa_magic = SA_MAGIC;
 769                         sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
 770                         swap = B_TRUE;
 771                 } else {
 772                         VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
 773                 }
 774
 775                 hdrsize = sa_hdrsize(&sa);
 776                 VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
 777                 *userp = *((uint64_t *)((uintptr_t)data + hdrsize +
 778                     SA_UID_OFFSET));
 779                 *groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
 780                     SA_GID_OFFSET));
 781                 if (swap) {
 782                         *userp = BSWAP_64(*userp);
 783                         *groupp = BSWAP_64(*groupp);
 784                 }
 785         }
 786         return (0);
 787 }
 788
 789 static void
 790 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
 791     char *domainbuf, int buflen, uid_t *ridp)
 792 {
 793         uint64_t fuid;
 794         const char *domain;
 795
 796         fuid = zfs_strtonum(fuidstr, NULL);
 797
 798         domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
 799         if (domain)
 800                 (void) strlcpy(domainbuf, domain, buflen);
 801         else
 802                 domainbuf[0] = '\0';
 803         *ridp = FUID_RID(fuid);
 804 }
 805
 806 static uint64_t
 807 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
 808 {
 809         switch (type) {
 810         case ZFS_PROP_USERUSED:
 811                 return (DMU_USERUSED_OBJECT);
 812         case ZFS_PROP_GROUPUSED:
 813                 return (DMU_GROUPUSED_OBJECT);
 814         case ZFS_PROP_USERQUOTA:
 815                 return (zfsvfs->z_userquota_obj);
 816         case ZFS_PROP_GROUPQUOTA:
 817                 return (zfsvfs->z_groupquota_obj);
 818         }
 819         return (0);
 820 }
 821
 822 int
 823 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 824     uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
 825 {
 826         int error;
 827         zap_cursor_t zc;
 828         zap_attribute_t za;
 829         zfs_useracct_t *buf = vbuf;
 830         uint64_t obj;
 831
 832         if (!dmu_objset_userspace_present(zfsvfs->z_os))
 833                 return (SET_ERROR(ENOTSUP));
 834
 835         obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 836         if (obj == 0) {
 837                 *bufsizep = 0;
 838                 return (0);
 839         }
 840
 841         for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
 842             (error = zap_cursor_retrieve(&zc, &za)) == 0;
 843             zap_cursor_advance(&zc)) {
 844                 if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
 845                     *bufsizep)
 846                         break;
 847
 848                 fuidstr_to_sid(zfsvfs, za.za_name,
 849                     buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
 850
 851                 buf->zu_space = za.za_first_integer;
 852                 buf++;
 853         }
 854         if (error == ENOENT)
 855                 error = 0;
 856
 857         ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
 858         *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
 859         *cookiep = zap_cursor_serialize(&zc);
 860         zap_cursor_fini(&zc);
 861         return (error);
 862 }
 863
 864 /*
 865  * buf must be big enough (eg, 32 bytes)
 866  */
 867 static int
 868 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
 869     char *buf, boolean_t addok)
 870 {
 871         uint64_t fuid;
 872         int domainid = 0;
 873
 874         if (domain && domain[0]) {
 875                 domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
 876                 if (domainid == -1)
 877                         return (SET_ERROR(ENOENT));
 878         }
 879         fuid = FUID_ENCODE(domainid, rid);
 880         (void) sprintf(buf, "%llx", (longlong_t)fuid);
 881         return (0);
 882 }
 883
 884 int
 885 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 886     const char *domain, uint64_t rid, uint64_t *valp)
 887 {
 888         char buf[32];
 889         int err;
 890         uint64_t obj;
 891
 892         *valp = 0;
 893
 894         if (!dmu_objset_userspace_present(zfsvfs->z_os))
 895                 return (SET_ERROR(ENOTSUP));
 896
 897         obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 898         if (obj == 0)
 899                 return (0);
 900
 901         err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
 902         if (err)
 903                 return (err);
 904
 905         err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
 906         if (err == ENOENT)
 907                 err = 0;
 908         return (err);
 909 }
 910
 911 int
 912 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 913     const char *domain, uint64_t rid, uint64_t quota)
 914 {
 915         char buf[32];
 916         int err;
 917         dmu_tx_t *tx;
 918         uint64_t *objp;
 919         boolean_t fuid_dirtied;
 920
 921         if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
 922                 return (SET_ERROR(EINVAL));
 923
 924         if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
 925                 return (SET_ERROR(ENOTSUP));
 926
 927         objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
 928             &zfsvfs->z_groupquota_obj;
 929
 930         err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
 931         if (err)
 932                 return (err);
 933         fuid_dirtied = zfsvfs->z_fuid_dirty;
 934
 935         tx = dmu_tx_create(zfsvfs->z_os);
 936         dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
 937         if (*objp == 0) {
 938                 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
 939                     zfs_userquota_prop_prefixes[type]);
 940         }
 941         if (fuid_dirtied)
 942                 zfs_fuid_txhold(zfsvfs, tx);
 943         err = dmu_tx_assign(tx, TXG_WAIT);
 944         if (err) {
 945                 dmu_tx_abort(tx);
 946                 return (err);
 947         }
 948
 949         mutex_enter(&zfsvfs->z_lock);
 950         if (*objp == 0) {
 951                 *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
 952                     DMU_OT_NONE, 0, tx);
 953                 VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
 954                     zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
 955         }
 956         mutex_exit(&zfsvfs->z_lock);
 957
 958         if (quota == 0) {
 959                 err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
 960                 if (err == ENOENT)
 961                         err = 0;
 962         } else {
 963                 err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
 964         }
 965         ASSERT(err == 0);
 966         if (fuid_dirtied)
 967                 zfs_fuid_sync(zfsvfs, tx);
 968         dmu_tx_commit(tx);
 969         return (err);
 970 }
 971
 972 boolean_t
 973 zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
 974 {
 975         char buf[32];
 976         uint64_t used, quota, usedobj, quotaobj;
 977         int err;
 978
 979         usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
 980         quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 981
 982         if (quotaobj == 0 || zfsvfs->z_replay)
 983                 return (B_FALSE);
 984
 985         (void) sprintf(buf, "%llx", (longlong_t)fuid);
 986         err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
 987         if (err != 0)
 988                 return (B_FALSE);
 989
 990         err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
 991         if (err != 0)
 992                 return (B_FALSE);
 993         return (used >= quota);
 994 }
 995
 996 boolean_t
 997 zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
 998 {
 999         uint64_t fuid;
1000         uint64_t quotaobj;
1001
1002         quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
1003
1004         fuid = isgroup ? zp->z_gid : zp->z_uid;
1005
1006         if (quotaobj == 0 || zfsvfs->z_replay)
1007                 return (B_FALSE);
1008
1009         return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
1010 }
1011
1012 /*
1013  * Associate this zfsvfs with the given objset, which must be owned.
1014  * This will cache a bunch of on-disk state from the objset in the
1015  * zfsvfs.
1016  */
1017 static int
1018 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
1019 {
1020         int error;
1021         uint64_t val;
1022
1023         zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
1024         zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
1025         zfsvfs->z_os = os;
1026
1027         error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
1028         if (error != 0)
1029                 return (error);
1030         if (zfsvfs->z_version >
1031             zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
1032                 (void) printf("Can't mount a version %lld file system "
1033                     "on a version %lld pool\n. Pool must be upgraded to mount "
1034                     "this file system.", (u_longlong_t)zfsvfs->z_version,
1035                     (u_longlong_t)spa_version(dmu_objset_spa(os)));
1036                 return (SET_ERROR(ENOTSUP));
1037         }
1038         error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
1039         if (error != 0)
1040                 return (error);
1041         zfsvfs->z_norm = (int)val;
1042
1043         error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
1044         if (error != 0)
1045                 return (error);
1046         zfsvfs->z_utf8 = (val != 0);
1047
1048         error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
1049         if (error != 0)
1050                 return (error);
1051         zfsvfs->z_case = (uint_t)val;
1052
1053         /*
1054          * Fold case on file systems that are always or sometimes case
1055          * insensitive.
1056          */
1057         if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
1058             zfsvfs->z_case == ZFS_CASE_MIXED)
1059                 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
1060
1061         zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1062         zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1063
1064         uint64_t sa_obj = 0;
1065         if (zfsvfs->z_use_sa) {
1066                 /* should either have both of these objects or none */
1067                 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
1068                     &sa_obj);
1069                 if (error != 0)
1070                         return (error);
1071         }
1072
1073         error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
1074             &zfsvfs->z_attr_table);
1075         if (error != 0)
1076                 return (error);
1077
1078         if (zfsvfs->z_version >= ZPL_VERSION_SA)
1079                 sa_register_update_callback(os, zfs_sa_upgrade);
1080
1081         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
1082             &zfsvfs->z_root);
1083         if (error != 0)
1084                 return (error);
1085         ASSERT(zfsvfs->z_root != 0);
1086
1087         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
1088             &zfsvfs->z_unlinkedobj);
1089         if (error != 0)
1090                 return (error);
1091
1092         error = zap_lookup(os, MASTER_NODE_OBJ,
1093             zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
1094             8, 1, &zfsvfs->z_userquota_obj);
1095         if (error == ENOENT)
1096                 zfsvfs->z_userquota_obj = 0;
1097         else if (error != 0)
1098                 return (error);
1099
1100         error = zap_lookup(os, MASTER_NODE_OBJ,
1101             zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
1102             8, 1, &zfsvfs->z_groupquota_obj);
1103         if (error == ENOENT)
1104                 zfsvfs->z_groupquota_obj = 0;
1105         else if (error != 0)
1106                 return (error);
1107
1108         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
1109             &zfsvfs->z_fuid_obj);
1110         if (error == ENOENT)
1111                 zfsvfs->z_fuid_obj = 0;
1112         else if (error != 0)
1113                 return (error);
1114
1115         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
1116             &zfsvfs->z_shares_dir);
1117         if (error == ENOENT)
1118                 zfsvfs->z_shares_dir = 0;
1119         else if (error != 0)
1120                 return (error);
1121
1122         /*
1123          * Only use the name cache if we are looking for a
1124          * name on a file system that does not require normalization
1125          * or case folding.  We can also look there if we happen to be
1126          * on a non-normalizing, mixed sensitivity file system IF we
1127          * are looking for the exact name (which is always the case on
1128          * FreeBSD).
1129          */
1130         zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
1131             ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
1132             !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
1133
1134         return (0);
1135 }
1136
#if defined(__FreeBSD__)
taskq_t *zfsvfs_taskq;

/*
 * Task handler for z_unlinked_drain_task: 'context' is the zfsvfs whose
 * unlinked set should be drained.  The pending count is not used.
 */
static void
zfsvfs_task_unlinked_drain(void *context, int pending __unused)
{
	zfsvfs_t *zfsvfs = context;

	zfs_unlinked_drain(zfsvfs);
}
#endif
1147
1148 int
1149 zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
1150 {
1151         objset_t *os;
1152         zfsvfs_t *zfsvfs;
1153         int error;
1154
1155         /*
1156          * XXX: Fix struct statfs so this isn't necessary!
1157          *
1158          * The 'osname' is used as the filesystem's special node, which means
1159          * it must fit in statfs.f_mntfromname, or else it can't be
1160          * enumerated, so libzfs_mnttab_find() returns NULL, which causes
1161          * 'zfs unmount' to think it's not mounted when it is.
1162          */
1163         if (strlen(osname) >= MNAMELEN)
1164                 return (SET_ERROR(ENAMETOOLONG));
1165
1166         zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1167
1168         /*
1169          * We claim to always be readonly so we can open snapshots;
1170          * other ZPL code will prevent us from writing to snapshots.
1171          */
1172
1173         error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
1174         if (error != 0) {
1175                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1176                 return (error);
1177         }
1178
1179         error = zfsvfs_create_impl(zfvp, zfsvfs, os);
1180         if (error != 0) {
1181                 dmu_objset_disown(os, zfsvfs);
1182         }
1183         return (error);
1184 }
1185
1186
1187 int
1188 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
1189 {
1190         int error;
1191
1192         zfsvfs->z_vfs = NULL;
1193         zfsvfs->z_parent = zfsvfs;
1194
1195         mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1196         mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
1197         list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1198             offsetof(znode_t, z_link_node));
1199 #if defined(__FreeBSD__)
1200         TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
1201             zfsvfs_task_unlinked_drain, zfsvfs);
1202 #endif
1203 #ifdef DIAGNOSTIC
1204         rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
1205 #else
1206         rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
1207 #endif
1208         rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
1209         rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1210         for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1211                 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1212
1213         error = zfsvfs_init(zfsvfs, os);
1214         if (error != 0) {
1215                 *zfvp = NULL;
1216                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1217                 return (error);
1218         }
1219
1220         *zfvp = zfsvfs;
1221         return (0);
1222 }
1223
1224 static int
1225 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
1226 {
1227         int error;
1228
1229         error = zfs_register_callbacks(zfsvfs->z_vfs);
1230         if (error)
1231                 return (error);
1232
1233         zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
1234
1235         /*
1236          * If we are not mounting (ie: online recv), then we don't
1237          * have to worry about replaying the log as we blocked all
1238          * operations out since we closed the ZIL.
1239          */
1240         if (mounting) {
1241                 boolean_t readonly;
1242
1243                 /*
1244                  * During replay we remove the read only flag to
1245                  * allow replays to succeed.
1246                  */
1247                 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
1248                 if (readonly != 0)
1249                         zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
1250                 else
1251                         zfs_unlinked_drain(zfsvfs);
1252
1253                 /*
1254                  * Parse and replay the intent log.
1255                  *
1256                  * Because of ziltest, this must be done after
1257                  * zfs_unlinked_drain().  (Further note: ziltest
1258                  * doesn't use readonly mounts, where
1259                  * zfs_unlinked_drain() isn't called.)  This is because
1260                  * ziltest causes spa_sync() to think it's committed,
1261                  * but actually it is not, so the intent log contains
1262                  * many txg's worth of changes.
1263                  *
1264                  * In particular, if object N is in the unlinked set in
1265                  * the last txg to actually sync, then it could be
1266                  * actually freed in a later txg and then reallocated
1267                  * in a yet later txg.  This would write a "create
1268                  * object N" record to the intent log.  Normally, this
1269                  * would be fine because the spa_sync() would have
1270                  * written out the fact that object N is free, before
1271                  * we could write the "create object N" intent log
1272                  * record.
1273                  *
1274                  * But when we are in ziltest mode, we advance the "open
1275                  * txg" without actually spa_sync()-ing the changes to
1276                  * disk.  So we would see that object N is still
1277                  * allocated and in the unlinked set, and there is an
1278                  * intent log record saying to allocate it.
1279                  */
1280                 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
1281                         if (zil_replay_disable) {
1282                                 zil_destroy(zfsvfs->z_log, B_FALSE);
1283                         } else {
1284                                 zfsvfs->z_replay = B_TRUE;
1285                                 zil_replay(zfsvfs->z_os, zfsvfs,
1286                                     zfs_replay_vector);
1287                                 zfsvfs->z_replay = B_FALSE;
1288                         }
1289                 }
1290                 zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
1291         }
1292
1293         /*
1294          * Set the objset user_ptr to track its zfsvfs.
1295          */
1296         mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1297         dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1298         mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1299
1300         return (0);
1301 }
1302
1303 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
1304
1305 void
1306 zfsvfs_free(zfsvfs_t *zfsvfs)
1307 {
1308         int i;
1309
1310         /*
1311          * This is a barrier to prevent the filesystem from going away in
1312          * zfs_znode_move() until we can safely ensure that the filesystem is
1313          * not unmounted. We consider the filesystem valid before the barrier
1314          * and invalid after the barrier.
1315          */
1316         rw_enter(&zfsvfs_lock, RW_READER);
1317         rw_exit(&zfsvfs_lock);
1318
1319         zfs_fuid_destroy(zfsvfs);
1320
1321         mutex_destroy(&zfsvfs->z_znodes_lock);
1322         mutex_destroy(&zfsvfs->z_lock);
1323         list_destroy(&zfsvfs->z_all_znodes);
1324         rrm_destroy(&zfsvfs->z_teardown_lock);
1325         rw_destroy(&zfsvfs->z_teardown_inactive_lock);
1326         rw_destroy(&zfsvfs->z_fuid_lock);
1327         for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1328                 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1329         kmem_free(zfsvfs, sizeof (zfsvfs_t));
1330 }
1331
1332 static void
1333 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1334 {
1335         zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1336         if (zfsvfs->z_vfs) {
1337                 if (zfsvfs->z_use_fuids) {
1338                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1339                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1340                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1341                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1342                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1343                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1344                 } else {
1345                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1346                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1347                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1348                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1349                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1350                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1351                 }
1352         }
1353         zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1354 }
1355
1356 static int
1357 zfs_domount(vfs_t *vfsp, char *osname)
1358 {
1359         uint64_t recordsize, fsid_guid;
1360         int error = 0;
1361         zfsvfs_t *zfsvfs;
1362         vnode_t *vp;
1363
1364         ASSERT(vfsp);
1365         ASSERT(osname);
1366
1367         error = zfsvfs_create(osname, &zfsvfs);
1368         if (error)
1369                 return (error);
1370         zfsvfs->z_vfs = vfsp;
1371
1372 #ifdef illumos
1373         /* Initialize the generic filesystem structure. */
1374         vfsp->vfs_bcount = 0;
1375         vfsp->vfs_data = NULL;
1376
1377         if (zfs_create_unique_device(&mount_dev) == -1) {
1378                 error = SET_ERROR(ENODEV);
1379                 goto out;
1380         }
1381         ASSERT(vfs_devismounted(mount_dev) == 0);
1382 #endif
1383
1384         if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
1385             NULL))
1386                 goto out;
1387         zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
1388         zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
1389
1390         vfsp->vfs_data = zfsvfs;
1391         vfsp->mnt_flag |= MNT_LOCAL;
1392         vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
1393         vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
1394         vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
1395         vfsp->mnt_kern_flag |= MNTK_NO_IOPF;    /* vn_io_fault can be used */
1396
1397         /*
1398          * The fsid is 64 bits, composed of an 8-bit fs type, which
1399          * separates our fsid from any other filesystem types, and a
1400          * 56-bit objset unique ID.  The objset unique ID is unique to
1401          * all objsets open on this system, provided by unique_create().
1402          * The 8-bit fs type must be put in the low bits of fsid[1]
1403          * because that's where other Solaris filesystems put it.
1404          */
1405         fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1406         ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
1407         vfsp->vfs_fsid.val[0] = fsid_guid;
1408         vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
1409             vfsp->mnt_vfc->vfc_typenum & 0xFF;
1410
1411         /*
1412          * Set features for file system.
1413          */
1414         zfs_set_fuid_feature(zfsvfs);
1415         if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
1416                 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1417                 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1418                 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
1419         } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
1420                 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1421                 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1422         }
1423         vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
1424
1425         if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
1426                 uint64_t pval;
1427
1428                 atime_changed_cb(zfsvfs, B_FALSE);
1429                 readonly_changed_cb(zfsvfs, B_TRUE);
1430                 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
1431                         goto out;
1432                 xattr_changed_cb(zfsvfs, pval);
1433                 zfsvfs->z_issnap = B_TRUE;
1434                 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
1435
1436                 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1437                 dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1438                 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1439         } else {
1440                 error = zfsvfs_setup(zfsvfs, B_TRUE);
1441         }
1442
1443         vfs_mountedfrom(vfsp, osname);
1444
1445         if (!zfsvfs->z_issnap)
1446                 zfsctl_create(zfsvfs);
1447 out:
1448         if (error) {
1449                 dmu_objset_disown(zfsvfs->z_os, zfsvfs);
1450                 zfsvfs_free(zfsvfs);
1451         } else {
1452                 atomic_inc_32(&zfs_active_fs_count);
1453         }
1454
1455         return (error);
1456 }
1457
1458 void
1459 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1460 {
1461         objset_t *os = zfsvfs->z_os;
1462
1463         if (!dmu_objset_is_snapshot(os))
1464                 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1465 }
1466
1467 #ifdef SECLABEL
/*
 * Convert a decimal digit string to a uint64_t integer.
 *
 * On success the value is stored in '*objnum' and 0 is returned; an empty
 * string converts to 0.  Returns EINVAL if the string contains anything
 * other than the digits '0'-'9', and EOVERFLOW if the value does not fit
 * in 64 bits.  '*objnum' is left untouched on error.
 */
static int
str_to_uint64(char *str, uint64_t *objnum)
{
	uint64_t num = 0;

	for (; *str != '\0'; str++) {
		uint64_t digit;

		if (*str < '0' || *str > '9')
			return (SET_ERROR(EINVAL));
		digit = (uint64_t)(*str - '0');

		/*
		 * Refuse to wrap around: a silently truncated value would
		 * name a different object than the one that was asked for.
		 */
		if (num > (UINT64_MAX - digit) / 10)
			return (SET_ERROR(EOVERFLOW));

		num = num * 10 + digit;
	}

	*objnum = num;
	return (0);
}
1486
/*
 * The boot path passed from the boot loader is in the form of
 * "rootpool-name/root-filesystem-object-number".  Convert this
 * string to a dataset name: "rootpool-name/root-filesystem-name".
 *
 * 'bpath' is first copied to 'outpath' unchanged.  If it has no '/' or
 * the part after the first '/' is not a decimal number, that copy is the
 * result.  Otherwise the number is resolved with dsl_dsobj_to_dsname(),
 * whose error is returned.  'bpath' is modified temporarily but restored
 * before returning.
 *
 * Returns EINVAL if 'bpath' is empty or starts with '/'.
 */
static int
zfs_parse_bootfs(char *bpath, char *outpath)
{
	uint64_t dsobj;
	char *sep;
	int error;

	if (bpath[0] == '\0' || bpath[0] == '/')
		return (SET_ERROR(EINVAL));

	(void) strcpy(outpath, bpath);

	/*
	 * Without a '/' this is just the pool name; with a non-numeric
	 * tail it is already a dataset name.  Either way the copy stands.
	 */
	sep = strchr(bpath, '/');
	if (sep == NULL || str_to_uint64(sep + 1, &dsobj) != 0)
		return (0);

	/* Cut the path at the '/' so that bpath names only the pool. */
	*sep = '\0';
	error = dsl_dsobj_to_dsname(bpath, dsobj, outpath);
	*sep = '/';

	return (error);
}
1522
1523 /*
1524  * Check that the hex label string is appropriate for the dataset being
1525  * mounted into the global_zone proper.
1526  *
1527  * Return an error if the hex label string is not default or
1528  * admin_low/admin_high.  For admin_low labels, the corresponding
1529  * dataset must be readonly.
1530  */
1531 int
1532 zfs_check_global_label(const char *dsname, const char *hexsl)
1533 {
1534         if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1535                 return (0);
1536         if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
1537                 return (0);
1538         if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
1539                 /* must be readonly */
1540                 uint64_t rdonly;
1541
1542                 if (dsl_prop_get_integer(dsname,
1543                     zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
1544                         return (SET_ERROR(EACCES));
1545                 return (rdonly ? 0 : EACCES);
1546         }
1547         return (SET_ERROR(EACCES));
1548 }
1549
1550 /*
1551  * Determine whether the mount is allowed according to MAC check.
1552  * by comparing (where appropriate) label of the dataset against
1553  * the label of the zone being mounted into.  If the dataset has
1554  * no label, create one.
1555  *
1556  * Returns 0 if access allowed, error otherwise (e.g. EACCES)
1557  */
1558 static int
1559 zfs_mount_label_policy(vfs_t *vfsp, char *osname)
1560 {
1561         int             error, retv;
1562         zone_t          *mntzone = NULL;
1563         ts_label_t      *mnt_tsl;
1564         bslabel_t       *mnt_sl;
1565         bslabel_t       ds_sl;
1566         char            ds_hexsl[MAXNAMELEN];
1567
1568         retv = EACCES;                          /* assume the worst */
1569
1570         /*
1571          * Start by getting the dataset label if it exists.
1572          */
1573         error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1574             1, sizeof (ds_hexsl), &ds_hexsl, NULL);
1575         if (error)
1576                 return (SET_ERROR(EACCES));
1577
1578         /*
1579          * If labeling is NOT enabled, then disallow the mount of datasets
1580          * which have a non-default label already.  No other label checks
1581          * are needed.
1582          */
1583         if (!is_system_labeled()) {
1584                 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1585                         return (0);
1586                 return (SET_ERROR(EACCES));
1587         }
1588
1589         /*
1590          * Get the label of the mountpoint.  If mounting into the global
1591          * zone (i.e. mountpoint is not within an active zone and the
1592          * zoned property is off), the label must be default or
1593          * admin_low/admin_high only; no other checks are needed.
1594          */
1595         mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
1596         if (mntzone->zone_id == GLOBAL_ZONEID) {
1597                 uint64_t zoned;
1598
1599                 zone_rele(mntzone);
1600
1601                 if (dsl_prop_get_integer(osname,
1602                     zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
1603                         return (SET_ERROR(EACCES));
1604                 if (!zoned)
1605                         return (zfs_check_global_label(osname, ds_hexsl));
1606                 else
1607                         /*
1608                          * This is the case of a zone dataset being mounted
1609                          * initially, before the zone has been fully created;
1610                          * allow this mount into global zone.
1611                          */
1612                         return (0);
1613         }
1614
1615         mnt_tsl = mntzone->zone_slabel;
1616         ASSERT(mnt_tsl != NULL);
1617         label_hold(mnt_tsl);
1618         mnt_sl = label2bslabel(mnt_tsl);
1619
1620         if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
1621                 /*
1622                  * The dataset doesn't have a real label, so fabricate one.
1623                  */
1624                 char *str = NULL;
1625
1626                 if (l_to_str_internal(mnt_sl, &str) == 0 &&
1627                     dsl_prop_set_string(osname,
1628                     zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1629                     ZPROP_SRC_LOCAL, str) == 0)
1630                         retv = 0;
1631                 if (str != NULL)
1632                         kmem_free(str, strlen(str) + 1);
1633         } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
1634                 /*
1635                  * Now compare labels to complete the MAC check.  If the
1636                  * labels are equal then allow access.  If the mountpoint
1637                  * label dominates the dataset label, allow readonly access.
1638                  * Otherwise, access is denied.
1639                  */
1640                 if (blequal(mnt_sl, &ds_sl))
1641                         retv = 0;
1642                 else if (bldominates(mnt_sl, &ds_sl)) {
1643                         vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1644                         retv = 0;
1645                 }
1646         }
1647
1648         label_rele(mnt_tsl);
1649         zone_rele(mntzone);
1650         return (retv);
1651 }
1652 #endif  /* SECLABEL */
1653
1654 #ifdef OPENSOLARIS_MOUNTROOT
1655 static int
1656 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
1657 {
1658         int error = 0;
1659         static int zfsrootdone = 0;
1660         zfsvfs_t *zfsvfs = NULL;
1661         znode_t *zp = NULL;
1662         vnode_t *vp = NULL;
1663         char *zfs_bootfs;
1664         char *zfs_devid;
1665
1666         ASSERT(vfsp);
1667
1668         /*
1669          * The filesystem that we mount as root is defined in the
1670          * boot property "zfs-bootfs" with a format of
1671          * "poolname/root-dataset-objnum".
1672          */
1673         if (why == ROOT_INIT) {
1674                 if (zfsrootdone++)
1675                         return (SET_ERROR(EBUSY));
1676                 /*
1677                  * the process of doing a spa_load will require the
1678                  * clock to be set before we could (for example) do
1679                  * something better by looking at the timestamp on
1680                  * an uberblock, so just set it to -1.
1681                  */
1682                 clkset(-1);
1683
1684                 if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
1685                         cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
1686                             "bootfs name");
1687                         return (SET_ERROR(EINVAL));
1688                 }
1689                 zfs_devid = spa_get_bootprop("diskdevid");
1690                 error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
1691                 if (zfs_devid)
1692                         spa_free_bootprop(zfs_devid);
1693                 if (error) {
1694                         spa_free_bootprop(zfs_bootfs);
1695                         cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
1696                             error);
1697                         return (error);
1698                 }
1699                 if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
1700                         spa_free_bootprop(zfs_bootfs);
1701                         cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
1702                             error);
1703                         return (error);
1704                 }
1705
1706                 spa_free_bootprop(zfs_bootfs);
1707
1708                 if (error = vfs_lock(vfsp))
1709                         return (error);
1710
1711                 if (error = zfs_domount(vfsp, rootfs.bo_name)) {
1712                         cmn_err(CE_NOTE, "zfs_domount: error %d", error);
1713                         goto out;
1714                 }
1715
1716                 zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
1717                 ASSERT(zfsvfs);
1718                 if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
1719                         cmn_err(CE_NOTE, "zfs_zget: error %d", error);
1720                         goto out;
1721                 }
1722
1723                 vp = ZTOV(zp);
1724                 mutex_enter(&vp->v_lock);
1725                 vp->v_flag |= VROOT;
1726                 mutex_exit(&vp->v_lock);
1727                 rootvp = vp;
1728
1729                 /*
1730                  * Leave rootvp held.  The root file system is never unmounted.
1731                  */
1732
1733                 vfs_add((struct vnode *)0, vfsp,
1734                     (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
1735 out:
1736                 vfs_unlock(vfsp);
1737                 return (error);
1738         } else if (why == ROOT_REMOUNT) {
1739                 readonly_changed_cb(vfsp->vfs_data, B_FALSE);
1740                 vfsp->vfs_flag |= VFS_REMOUNT;
1741
1742                 /* refresh mount options */
1743                 zfs_unregister_callbacks(vfsp->vfs_data);
1744                 return (zfs_register_callbacks(vfsp));
1745
1746         } else if (why == ROOT_UNMOUNT) {
1747                 zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
1748                 (void) zfs_sync(vfsp, 0, 0);
1749                 return (0);
1750         }
1751
1752         /*
1753          * if "why" is equal to anything else other than ROOT_INIT,
1754          * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
1755          */
1756         return (SET_ERROR(ENOTSUP));
1757 }
1758 #endif  /* OPENSOLARIS_MOUNTROOT */
1759
/*
 * Copy the pool component of the dataset name 'osname' into 'poolname'.
 *
 * The pool name is everything before the first '/', or the whole string
 * when there is no '/'.  'poolname' must have room for MAXNAMELEN bytes.
 * Returns 0 on success, or ENAMETOOLONG when the pool component (plus its
 * terminating NUL) would not fit in MAXNAMELEN bytes; 'poolname' is left
 * untouched in that case.
 */
static int
getpoolname(const char *osname, char *poolname)
{
        const char *slash = strchr(osname, '/');
        size_t len;

        /* Length of the pool component, excluding the terminator. */
        if (slash != NULL)
                len = (size_t)(slash - osname);
        else
                len = strlen(osname);

        if (len >= MAXNAMELEN)
                return (ENAMETOOLONG);

        (void) strncpy(poolname, osname, len);
        poolname[len] = '\0';
        return (0);
}
1778
1779 /*ARGSUSED*/
1780 static int
1781 zfs_mount(vfs_t *vfsp)
1782 {
1783         kthread_t       *td = curthread;
1784         vnode_t         *mvp = vfsp->mnt_vnodecovered;
1785         cred_t          *cr = td->td_ucred;
1786         char            *osname;
1787         int             error = 0;
1788         int             canwrite;
1789
1790 #ifdef illumos
1791         if (mvp->v_type != VDIR)
1792                 return (SET_ERROR(ENOTDIR));
1793
1794         mutex_enter(&mvp->v_lock);
1795         if ((uap->flags & MS_REMOUNT) == 0 &&
1796             (uap->flags & MS_OVERLAY) == 0 &&
1797             (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
1798                 mutex_exit(&mvp->v_lock);
1799                 return (SET_ERROR(EBUSY));
1800         }
1801         mutex_exit(&mvp->v_lock);
1802
1803         /*
1804          * ZFS does not support passing unparsed data in via MS_DATA.
1805          * Users should use the MS_OPTIONSTR interface; this means
1806          * that all option parsing is already done and the options struct
1807          * can be interrogated.
1808          */
1809         if ((uap->flags & MS_DATA) && uap->datalen > 0)
1810                 return (SET_ERROR(EINVAL));
1811
1812         /*
1813          * Get the objset name (the "special" mount argument).
1814          */
1815         if (error = pn_get(uap->spec, fromspace, &spn))
1816                 return (error);
1817
1818         osname = spn.pn_path;
1819 #else   /* !illumos */
1820         if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
1821                 return (SET_ERROR(EINVAL));
1822
1823         /*
1824          * If full-owner-access is enabled and delegated administration is
1825          * turned on, we must set nosuid.
1826          */
1827         if (zfs_super_owner &&
1828             dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
1829                 secpolicy_fs_mount_clearopts(cr, vfsp);
1830         }
1831 #endif  /* illumos */
1832
1833         /*
1834          * Check for mount privilege?
1835          *
1836          * If we don't have privilege then see if
1837          * we have local permission to allow it
1838          */
1839         error = secpolicy_fs_mount(cr, mvp, vfsp);
1840         if (error) {
1841                 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
1842                         goto out;
1843
1844                 if (!(vfsp->vfs_flag & MS_REMOUNT)) {
1845                         vattr_t         vattr;
1846
1847                         /*
1848                          * Make sure user is the owner of the mount point
1849                          * or has sufficient privileges.
1850                          */
1851
1852                         vattr.va_mask = AT_UID;
1853
1854                         vn_lock(mvp, LK_SHARED | LK_RETRY);
1855                         if (VOP_GETATTR(mvp, &vattr, cr)) {
1856                                 VOP_UNLOCK(mvp, 0);
1857                                 goto out;
1858                         }
1859
1860                         if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
1861                             VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
1862                                 VOP_UNLOCK(mvp, 0);
1863                                 goto out;
1864                         }
1865                         VOP_UNLOCK(mvp, 0);
1866                 }
1867
1868                 secpolicy_fs_mount_clearopts(cr, vfsp);
1869         }
1870
1871         /*
1872          * Refuse to mount a filesystem if we are in a local zone and the
1873          * dataset is not visible.
1874          */
1875         if (!INGLOBALZONE(curthread) &&
1876             (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1877                 error = SET_ERROR(EPERM);
1878                 goto out;
1879         }
1880
1881 #ifdef SECLABEL
1882         error = zfs_mount_label_policy(vfsp, osname);
1883         if (error)
1884                 goto out;
1885 #endif
1886
1887         vfsp->vfs_flag |= MNT_NFS4ACLS;
1888
1889         /*
1890          * When doing a remount, we simply refresh our temporary properties
1891          * according to those options set in the current VFS options.
1892          */
1893         if (vfsp->vfs_flag & MS_REMOUNT) {
1894                 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1895
1896                 /*
1897                  * Refresh mount options with z_teardown_lock blocking I/O while
1898                  * the filesystem is in an inconsistent state.
1899                  * The lock also serializes this code with filesystem
1900                  * manipulations between entry to zfs_suspend_fs() and return
1901                  * from zfs_resume_fs().
1902                  */
1903                 rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1904                 zfs_unregister_callbacks(zfsvfs);
1905                 error = zfs_register_callbacks(vfsp);
1906                 rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
1907                 goto out;
1908         }
1909
1910         /* Initial root mount: try hard to import the requested root pool. */
1911         if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
1912             (vfsp->vfs_flag & MNT_UPDATE) == 0) {
1913                 char pname[MAXNAMELEN];
1914
1915                 error = getpoolname(osname, pname);
1916                 if (error == 0)
1917                         error = spa_import_rootpool(pname);
1918                 if (error)
1919                         goto out;
1920         }
1921         DROP_GIANT();
1922         error = zfs_domount(vfsp, osname);
1923         PICKUP_GIANT();
1924
1925 #ifdef illumos
1926         /*
1927          * Add an extra VFS_HOLD on our parent vfs so that it can't
1928          * disappear due to a forced unmount.
1929          */
1930         if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
1931                 VFS_HOLD(mvp->v_vfsp);
1932 #endif
1933
1934 out:
1935         return (error);
1936 }
1937
1938 static int
1939 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
1940 {
1941         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1942         uint64_t refdbytes, availbytes, usedobjs, availobjs;
1943
1944         statp->f_version = STATFS_VERSION;
1945
1946         ZFS_ENTER(zfsvfs);
1947
1948         dmu_objset_space(zfsvfs->z_os,
1949             &refdbytes, &availbytes, &usedobjs, &availobjs);
1950
1951         /*
1952          * The underlying storage pool actually uses multiple block sizes.
1953          * We report the fragsize as the smallest block size we support,
1954          * and we report our blocksize as the filesystem's maximum blocksize.
1955          */
1956         statp->f_bsize = SPA_MINBLOCKSIZE;
1957         statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
1958
1959         /*
1960          * The following report "total" blocks of various kinds in the
1961          * file system, but reported in terms of f_frsize - the
1962          * "fragment" size.
1963          */
1964
1965         statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1966         statp->f_bfree = availbytes / statp->f_bsize;
1967         statp->f_bavail = statp->f_bfree; /* no root reservation */
1968
1969         /*
1970          * statvfs() should really be called statufs(), because it assumes
1971          * static metadata.  ZFS doesn't preallocate files, so the best
1972          * we can do is report the max that could possibly fit in f_files,
1973          * and that minus the number actually used in f_ffree.
1974          * For f_ffree, report the smaller of the number of object available
1975          * and the number of blocks (each object will take at least a block).
1976          */
1977         statp->f_ffree = MIN(availobjs, statp->f_bfree);
1978         statp->f_files = statp->f_ffree + usedobjs;
1979
1980         /*
1981          * We're a zfs filesystem.
1982          */
1983         (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
1984
1985         strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
1986             sizeof(statp->f_mntfromname));
1987         strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
1988             sizeof(statp->f_mntonname));
1989
1990         statp->f_namemax = MAXNAMELEN - 1;
1991
1992         ZFS_EXIT(zfsvfs);
1993         return (0);
1994 }
1995
1996 static int
1997 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
1998 {
1999         zfsvfs_t *zfsvfs = vfsp->vfs_data;
2000         znode_t *rootzp;
2001         int error;
2002
2003         ZFS_ENTER(zfsvfs);
2004
2005         error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
2006         if (error == 0)
2007                 *vpp = ZTOV(rootzp);
2008
2009         ZFS_EXIT(zfsvfs);
2010
2011         if (error == 0) {
2012                 error = vn_lock(*vpp, flags);
2013                 if (error != 0) {
2014                         VN_RELE(*vpp);
2015                         *vpp = NULL;
2016                 }
2017         }
2018         return (error);
2019 }
2020
2021 /*
2022  * Teardown the zfsvfs::z_os.
2023  *
2024  * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
2025  * and 'z_teardown_inactive_lock' held.
2026  */
2027 static int
2028 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
2029 {
2030         znode_t *zp;
2031
2032         rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
2033
2034         if (!unmounting) {
2035                 /*
2036                  * We purge the parent filesystem's vfsp as the parent
2037                  * filesystem and all of its snapshots have their vnode's
2038                  * v_vfsp set to the parent's filesystem's vfsp.  Note,
2039                  * 'z_parent' is self referential for non-snapshots.
2040                  */
2041                 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
2042 #ifdef FREEBSD_NAMECACHE
2043                 cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
2044 #endif
2045         }
2046
2047         /*
2048          * Close the zil. NB: Can't close the zil while zfs_inactive
2049          * threads are blocked as zil_close can call zfs_inactive.
2050          */
2051         if (zfsvfs->z_log) {
2052                 zil_close(zfsvfs->z_log);
2053                 zfsvfs->z_log = NULL;
2054         }
2055
2056         rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
2057
2058         /*
2059          * If we are not unmounting (ie: online recv) and someone already
2060          * unmounted this file system while we were doing the switcheroo,
2061          * or a reopen of z_os failed then just bail out now.
2062          */
2063         if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
2064                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
2065                 rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2066                 return (SET_ERROR(EIO));
2067         }
2068
2069         /*
2070          * At this point there are no vops active, and any new vops will
2071          * fail with EIO since we have z_teardown_lock for writer (only
2072          * relavent for forced unmount).
2073          *
2074          * Release all holds on dbufs.
2075          */
2076         mutex_enter(&zfsvfs->z_znodes_lock);
2077         for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
2078             zp = list_next(&zfsvfs->z_all_znodes, zp))
2079                 if (zp->z_sa_hdl) {
2080                         ASSERT(ZTOV(zp)->v_count >= 0);
2081                         zfs_znode_dmu_fini(zp);
2082                 }
2083         mutex_exit(&zfsvfs->z_znodes_lock);
2084
2085         /*
2086          * If we are unmounting, set the unmounted flag and let new vops
2087          * unblock.  zfs_inactive will have the unmounted behavior, and all
2088          * other vops will fail with EIO.
2089          */
2090         if (unmounting) {
2091                 zfsvfs->z_unmounted = B_TRUE;
2092                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
2093                 rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2094         }
2095
2096         /*
2097          * z_os will be NULL if there was an error in attempting to reopen
2098          * zfsvfs, so just return as the properties had already been
2099          * unregistered and cached data had been evicted before.
2100          */
2101         if (zfsvfs->z_os == NULL)
2102                 return (0);
2103
2104         /*
2105          * Unregister properties.
2106          */
2107         zfs_unregister_callbacks(zfsvfs);
2108
2109         /*
2110          * Evict cached data
2111          */
2112         if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
2113             !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
2114                 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
2115         dmu_objset_evict_dbufs(zfsvfs->z_os);
2116
2117         return (0);
2118 }
2119
2120 /*ARGSUSED*/
2121 static int
2122 zfs_umount(vfs_t *vfsp, int fflag)
2123 {
2124         kthread_t *td = curthread;
2125         zfsvfs_t *zfsvfs = vfsp->vfs_data;
2126         objset_t *os;
2127         cred_t *cr = td->td_ucred;
2128         int ret;
2129
2130         ret = secpolicy_fs_unmount(cr, vfsp);
2131         if (ret) {
2132                 if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
2133                     ZFS_DELEG_PERM_MOUNT, cr))
2134                         return (ret);
2135         }
2136
2137         /*
2138          * We purge the parent filesystem's vfsp as the parent filesystem
2139          * and all of its snapshots have their vnode's v_vfsp set to the
2140          * parent's filesystem's vfsp.  Note, 'z_parent' is self
2141          * referential for non-snapshots.
2142          */
2143         (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
2144
2145         /*
2146          * Unmount any snapshots mounted under .zfs before unmounting the
2147          * dataset itself.
2148          */
2149         if (zfsvfs->z_ctldir != NULL) {
2150                 if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
2151                         return (ret);
2152         }
2153
2154         if (fflag & MS_FORCE) {
2155                 /*
2156                  * Mark file system as unmounted before calling
2157                  * vflush(FORCECLOSE). This way we ensure no future vnops
2158                  * will be called and risk operating on DOOMED vnodes.
2159                  */
2160                 rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
2161                 zfsvfs->z_unmounted = B_TRUE;
2162                 rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2163         }
2164
2165         /*
2166          * Flush all the files.
2167          */
2168         ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
2169         if (ret != 0)
2170                 return (ret);
2171
2172 #ifdef illumos
2173         if (!(fflag & MS_FORCE)) {
2174                 /*
2175                  * Check the number of active vnodes in the file system.
2176                  * Our count is maintained in the vfs structure, but the
2177                  * number is off by 1 to indicate a hold on the vfs
2178                  * structure itself.
2179                  *
2180                  * The '.zfs' directory maintains a reference of its
2181                  * own, and any active references underneath are
2182                  * reflected in the vnode count.
2183                  */
2184                 if (zfsvfs->z_ctldir == NULL) {
2185                         if (vfsp->vfs_count > 1)
2186                                 return (SET_ERROR(EBUSY));
2187                 } else {
2188                         if (vfsp->vfs_count > 2 ||
2189                             zfsvfs->z_ctldir->v_count > 1)
2190                                 return (SET_ERROR(EBUSY));
2191                 }
2192         }
2193 #endif
2194
2195         while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
2196             &zfsvfs->z_unlinked_drain_task, NULL) != 0)
2197                 taskqueue_drain(zfsvfs_taskq->tq_queue,
2198                     &zfsvfs->z_unlinked_drain_task);
2199
2200         VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
2201         os = zfsvfs->z_os;
2202
2203         /*
2204          * z_os will be NULL if there was an error in
2205          * attempting to reopen zfsvfs.
2206          */
2207         if (os != NULL) {
2208                 /*
2209                  * Unset the objset user_ptr.
2210                  */
2211                 mutex_enter(&os->os_user_ptr_lock);
2212                 dmu_objset_set_user(os, NULL);
2213                 mutex_exit(&os->os_user_ptr_lock);
2214
2215                 /*
2216                  * Finally release the objset
2217                  */
2218                 dmu_objset_disown(os, zfsvfs);
2219         }
2220
2221         /*
2222          * We can now safely destroy the '.zfs' directory node.
2223          */
2224         if (zfsvfs->z_ctldir != NULL)
2225                 zfsctl_destroy(zfsvfs);
2226         zfs_freevfs(vfsp);
2227
2228         return (0);
2229 }
2230
2231 static int
2232 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
2233 {
2234         zfsvfs_t        *zfsvfs = vfsp->vfs_data;
2235         znode_t         *zp;
2236         int             err;
2237
2238         /*
2239          * zfs_zget() can't operate on virtual entries like .zfs/ or
2240          * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
2241          * This will make NFS to switch to LOOKUP instead of using VGET.
2242          */
2243         if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
2244             (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
2245                 return (EOPNOTSUPP);
2246
2247         ZFS_ENTER(zfsvfs);
2248         err = zfs_zget(zfsvfs, ino, &zp);
2249         if (err == 0 && zp->z_unlinked) {
2250                 vrele(ZTOV(zp));
2251                 err = EINVAL;
2252         }
2253         if (err == 0)
2254                 *vpp = ZTOV(zp);
2255         ZFS_EXIT(zfsvfs);
2256         if (err == 0) {
2257                 err = vn_lock(*vpp, flags);
2258                 if (err != 0)
2259                         vrele(*vpp);
2260         }
2261         if (err != 0)
2262                 *vpp = NULL;
2263         return (err);
2264 }
2265
2266 static int
2267 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
2268     struct ucred **credanonp, int *numsecflavors, int **secflavors)
2269 {
2270         zfsvfs_t *zfsvfs = vfsp->vfs_data;
2271
2272         /*
2273          * If this is regular file system vfsp is the same as
2274          * zfsvfs->z_parent->z_vfs, but if it is snapshot,
2275          * zfsvfs->z_parent->z_vfs represents parent file system
2276          * which we have to use here, because only this file system
2277          * has mnt_export configured.
2278          */
2279         return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
2280             credanonp, numsecflavors, secflavors));
2281 }
2282
2283 CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
2284 CTASSERT(LONG_FID_LEN <= sizeof(struct fid));
2285
2286 static int
2287 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
2288 {
2289         struct componentname cn;
2290         zfsvfs_t        *zfsvfs = vfsp->vfs_data;
2291         znode_t         *zp;
2292         vnode_t         *dvp;
2293         uint64_t        object = 0;
2294         uint64_t        fid_gen = 0;
2295         uint64_t        gen_mask;
2296         uint64_t        zp_gen;
2297         int             i, err;
2298
2299         *vpp = NULL;
2300
2301         ZFS_ENTER(zfsvfs);
2302
2303         /*
2304          * On FreeBSD we can get snapshot's mount point or its parent file
2305          * system mount point depending if snapshot is already mounted or not.
2306          */
2307         if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
2308                 zfid_long_t     *zlfid = (zfid_long_t *)fidp;
2309                 uint64_t        objsetid = 0;
2310                 uint64_t        setgen = 0;
2311
2312                 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
2313                         objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
2314
2315                 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
2316                         setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
2317
2318                 ZFS_EXIT(zfsvfs);
2319
2320                 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
2321                 if (err)
2322                         return (SET_ERROR(EINVAL));
2323                 ZFS_ENTER(zfsvfs);
2324         }
2325
2326         if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
2327                 zfid_short_t    *zfid = (zfid_short_t *)fidp;
2328
2329                 for (i = 0; i < sizeof (zfid->zf_object); i++)
2330                         object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
2331
2332                 for (i = 0; i < sizeof (zfid->zf_gen); i++)
2333                         fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
2334         } else {
2335                 ZFS_EXIT(zfsvfs);
2336                 return (SET_ERROR(EINVAL));
2337         }
2338
2339         /*
2340          * A zero fid_gen means we are in .zfs or the .zfs/snapshot
2341          * directory tree. If the object == zfsvfs->z_shares_dir, then
2342          * we are in the .zfs/shares directory tree.
2343          */
2344         if ((fid_gen == 0 &&
2345              (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
2346             (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
2347                 ZFS_EXIT(zfsvfs);
2348                 VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
2349                 if (object == ZFSCTL_INO_SNAPDIR) {
2350                         cn.cn_nameptr = "snapshot";
2351                         cn.cn_namelen = strlen(cn.cn_nameptr);
2352                         cn.cn_nameiop = LOOKUP;
2353                         cn.cn_flags = ISLASTCN | LOCKLEAF;
2354                         cn.cn_lkflags = flags;
2355                         VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
2356                         vput(dvp);
2357                 } else if (object == zfsvfs->z_shares_dir) {
2358                         /*
2359                          * XXX This branch must not be taken,
2360                          * if it is, then the lookup below will
2361                          * explode.
2362                          */
2363                         cn.cn_nameptr = "shares";
2364                         cn.cn_namelen = strlen(cn.cn_nameptr);
2365                         cn.cn_nameiop = LOOKUP;
2366                         cn.cn_flags = ISLASTCN;
2367                         cn.cn_lkflags = flags;
2368                         VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
2369                         vput(dvp);
2370                 } else {
2371                         *vpp = dvp;
2372                 }
2373                 return (err);
2374         }
2375
2376         gen_mask = -1ULL >> (64 - 8 * i);
2377
2378         dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
2379         if (err = zfs_zget(zfsvfs, object, &zp)) {
2380                 ZFS_EXIT(zfsvfs);
2381                 return (err);
2382         }
2383         (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
2384             sizeof (uint64_t));
2385         zp_gen = zp_gen & gen_mask;
2386         if (zp_gen == 0)
2387                 zp_gen = 1;
2388         if (zp->z_unlinked || zp_gen != fid_gen) {
2389                 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
2390                 vrele(ZTOV(zp));
2391                 ZFS_EXIT(zfsvfs);
2392                 return (SET_ERROR(EINVAL));
2393         }
2394
2395         *vpp = ZTOV(zp);
2396         ZFS_EXIT(zfsvfs);
2397         err = vn_lock(*vpp, flags);
2398         if (err == 0)
2399                 vnode_create_vobject(*vpp, zp->z_size, curthread);
2400         else
2401                 *vpp = NULL;
2402         return (err);
2403 }
2404
2405 /*
2406  * Block out VOPs and close zfsvfs_t::z_os
2407  *
2408  * Note, if successful, then we return with the 'z_teardown_lock' and
2409  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
2410  * dataset and objset intact so that they can be atomically handed off during
2411  * a subsequent rollback or recv operation and the resume thereafter.
2412  */
2413 int
2414 zfs_suspend_fs(zfsvfs_t *zfsvfs)
2415 {
2416         int error;
2417
2418         if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
2419                 return (error);
2420
2421         return (0);
2422 }
2423
2424 /*
2425  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
2426  * is an invariant across any of the operations that can be performed while the
2427  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
2428  * are the same: the relevant objset and associated dataset are owned by
2429  * zfsvfs, held, and long held on entry.
2430  */
2431 int
2432 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2433 {
2434         int err;
2435         znode_t *zp;
2436
2437         ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
2438         ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
2439
2440         /*
2441          * We already own this, so just update the objset_t, as the one we
2442          * had before may have been evicted.
2443          */
2444         objset_t *os;
2445         VERIFY3P(ds->ds_owner, ==, zfsvfs);
2446         VERIFY(dsl_dataset_long_held(ds));
2447         VERIFY0(dmu_objset_from_ds(ds, &os));
2448
2449         err = zfsvfs_init(zfsvfs, os);
2450         if (err != 0)
2451                 goto bail;
2452
2453         VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
2454
2455         zfs_set_fuid_feature(zfsvfs);
2456
2457         /*
2458          * Attempt to re-establish all the active znodes with
2459          * their dbufs.  If a zfs_rezget() fails, then we'll let
2460          * any potential callers discover that via ZFS_ENTER_VERIFY_VP
2461          * when they try to use their znode.
2462          */
2463         mutex_enter(&zfsvfs->z_znodes_lock);
2464         for (zp = list_head(&zfsvfs->z_all_znodes); zp;
2465             zp = list_next(&zfsvfs->z_all_znodes, zp)) {
2466                 (void) zfs_rezget(zp);
2467         }
2468         mutex_exit(&zfsvfs->z_znodes_lock);
2469
2470 bail:
2471         /* release the VOPs */
2472         rw_exit(&zfsvfs->z_teardown_inactive_lock);
2473         rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2474
2475         if (err) {
2476                 /*
2477                  * Since we couldn't setup the sa framework, try to force
2478                  * unmount this file system.
2479                  */
2480                 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
2481                         vfs_ref(zfsvfs->z_vfs);
2482                         (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
2483                 }
2484         }
2485         return (err);
2486 }
2487
2488 static void
2489 zfs_freevfs(vfs_t *vfsp)
2490 {
2491         zfsvfs_t *zfsvfs = vfsp->vfs_data;
2492
2493 #ifdef illumos
2494         /*
2495          * If this is a snapshot, we have an extra VFS_HOLD on our parent
2496          * from zfs_mount().  Release it here.  If we came through
2497          * zfs_mountroot() instead, we didn't grab an extra hold, so
2498          * skip the VFS_RELE for rootvfs.
2499          */
2500         if (zfsvfs->z_issnap && (vfsp != rootvfs))
2501                 VFS_RELE(zfsvfs->z_parent->z_vfs);
2502 #endif
2503
2504         zfsvfs_free(zfsvfs);
2505
2506         atomic_dec_32(&zfs_active_fs_count);
2507 }
2508
#ifdef __i386__
/* desiredvnodes as it was before zfs_vnodes_adjust() touched it. */
static int desiredvnodes_backup;
#endif

/*
 * On i386, save the current desiredvnodes and, if it still equals the
 * computed default, lower it to three quarters of that default.  Compiles
 * to an empty function on other architectures.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
        int computed;

        desiredvnodes_backup = desiredvnodes;

        /*
         * Recompute the default the same way vntblinit() does.  A match
         * with desiredvnodes means the administrator has not tuned it, so
         * it is safe to tune it down.
         */
        computed = min(maxproc + vm_cnt.v_page_count / 4,
            2 * vm_kmem_size /
            (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
        if (computed != desiredvnodes)
                return;
        desiredvnodes = (3 * computed) / 4;
#endif
}
2533
/*
 * On i386, put desiredvnodes back to the value saved in
 * desiredvnodes_backup.  Does nothing on other architectures.
 */
static void
zfs_vnodes_adjust_back(void)
{
#ifdef __i386__
        desiredvnodes = desiredvnodes_backup;
#endif
}
2542
2543 void
2544 zfs_init(void)
2545 {
2546
2547         printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
2548
2549         /*
2550          * Initialize .zfs directory structures
2551          */
2552         zfsctl_init();
2553
2554         /*
2555          * Initialize znode cache, vnode ops, etc...
2556          */
2557         zfs_znode_init();
2558
2559         /*
2560          * Reduce number of vnodes. Originally number of vnodes is calculated
2561          * with UFS inode in mind. We reduce it here, because it's too big for
2562          * ZFS/i386.
2563          */
2564         zfs_vnodes_adjust();
2565
2566         dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
2567 #if defined(__FreeBSD__)
2568         zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
2569 #endif
2570 }
2571
/*
 * Undo zfs_init(): destroy the zfsvfs taskq (FreeBSD only), shut down the
 * .zfs control directory and znode layers, and restore the vnode limit.
 */
void
zfs_fini(void)
{
#ifdef __FreeBSD__
        taskq_destroy(zfsvfs_taskq);
#endif

        zfsctl_fini();
        zfs_znode_fini();
        zfs_vnodes_adjust_back();
}
2582
2583 int
2584 zfs_busy(void)
2585 {
2586         return (zfs_active_fs_count != 0);
2587 }
2588
2589 int
2590 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2591 {
2592         int error;
2593         objset_t *os = zfsvfs->z_os;
2594         dmu_tx_t *tx;
2595
2596         if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2597                 return (SET_ERROR(EINVAL));
2598
2599         if (newvers < zfsvfs->z_version)
2600                 return (SET_ERROR(EINVAL));
2601
2602         if (zfs_spa_version_map(newvers) >
2603             spa_version(dmu_objset_spa(zfsvfs->z_os)))
2604                 return (SET_ERROR(ENOTSUP));
2605
2606         tx = dmu_tx_create(os);
2607         dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2608         if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2609                 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2610                     ZFS_SA_ATTRS);
2611                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2612         }
2613         error = dmu_tx_assign(tx, TXG_WAIT);
2614         if (error) {
2615                 dmu_tx_abort(tx);
2616                 return (error);
2617         }
2618
2619         error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2620             8, 1, &newvers, tx);
2621
2622         if (error) {
2623                 dmu_tx_commit(tx);
2624                 return (error);
2625         }
2626
2627         if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2628                 uint64_t sa_obj;
2629
2630                 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2631                     SPA_VERSION_SA);
2632                 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2633                     DMU_OT_NONE, 0, tx);
2634
2635                 error = zap_add(os, MASTER_NODE_OBJ,
2636                     ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2637                 ASSERT0(error);
2638
2639                 VERIFY(0 == sa_set_sa_object(os, sa_obj));
2640                 sa_register_update_callback(os, zfs_sa_upgrade);
2641         }
2642
2643         spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2644             "from %llu to %llu", zfsvfs->z_version, newvers);
2645
2646         dmu_tx_commit(tx);
2647
2648         zfsvfs->z_version = newvers;
2649         os->os_version = newvers;
2650
2651         zfs_set_fuid_feature(zfsvfs);
2652
2653         return (0);
2654 }
2655
2656 /*
2657  * Read a property stored within the master node.
2658  */
2659 int
2660 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2661 {
2662         uint64_t *cached_copy = NULL;
2663
2664         /*
2665          * Figure out where in the objset_t the cached copy would live, if it
2666          * is available for the requested property.
2667          */
2668         if (os != NULL) {
2669                 switch (prop) {
2670                 case ZFS_PROP_VERSION:
2671                         cached_copy = &os->os_version;
2672                         break;
2673                 case ZFS_PROP_NORMALIZE:
2674                         cached_copy = &os->os_normalization;
2675                         break;
2676                 case ZFS_PROP_UTF8ONLY:
2677                         cached_copy = &os->os_utf8only;
2678                         break;
2679                 case ZFS_PROP_CASE:
2680                         cached_copy = &os->os_casesensitivity;
2681                         break;
2682                 default:
2683                         break;
2684                 }
2685         }
2686         if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
2687                 *value = *cached_copy;
2688                 return (0);
2689         }
2690
2691         /*
2692          * If the property wasn't cached, look up the file system's value for
2693          * the property. For the version property, we look up a slightly
2694          * different string.
2695          */
2696         const char *pname;
2697         int error = ENOENT;
2698         if (prop == ZFS_PROP_VERSION) {
2699                 pname = ZPL_VERSION_STR;
2700         } else {
2701                 pname = zfs_prop_to_name(prop);
2702         }
2703
2704         if (os != NULL) {
2705                 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
2706                 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2707         }
2708
2709         if (error == ENOENT) {
2710                 /* No value set, use the default value */
2711                 switch (prop) {
2712                 case ZFS_PROP_VERSION:
2713                         *value = ZPL_VERSION;
2714                         break;
2715                 case ZFS_PROP_NORMALIZE:
2716                 case ZFS_PROP_UTF8ONLY:
2717                         *value = 0;
2718                         break;
2719                 case ZFS_PROP_CASE:
2720                         *value = ZFS_CASE_SENSITIVE;
2721                         break;
2722                 default:
2723                         return (error);
2724                 }
2725                 error = 0;
2726         }
2727
2728         /*
2729          * If one of the methods for getting the property value above worked,
2730          * copy it into the objset_t's cache.
2731          */
2732         if (error == 0 && cached_copy != NULL) {
2733                 *cached_copy = *value;
2734         }
2735
2736         return (error);
2737 }
2738
2739 /*
2740  * Return true if the coresponding vfs's unmounted flag is set.
2741  * Otherwise return false.
2742  * If this function returns true we know VFS unmount has been initiated.
2743  */
2744 boolean_t
2745 zfs_get_vfs_flag_unmounted(objset_t *os)
2746 {
2747         zfsvfs_t *zfvp;
2748         boolean_t unmounted = B_FALSE;
2749
2750         ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
2751
2752         mutex_enter(&os->os_user_ptr_lock);
2753         zfvp = dmu_objset_get_user(os);
2754         if (zfvp != NULL && zfvp->z_vfs != NULL &&
2755             (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
2756                 unmounted = B_TRUE;
2757         mutex_exit(&os->os_user_ptr_lock);
2758
2759         return (unmounted);
2760 }
2761
#ifdef _KERNEL
/*
 * Rewrite f_mntfromname for every entry on the mount list after a rename
 * from oldname to newname.  An entry whose name equals oldname gets
 * newname; an entry whose name is oldname followed by '/' or '@' gets its
 * oldname prefix replaced by newname.  The list is walked with
 * mountlist_mtx held.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
        char scratch[MAXPATHLEN];
        struct mount *mp;
        char *from;
        size_t prefixlen = strlen(oldname);

        mtx_lock(&mountlist_mtx);
        TAILQ_FOREACH(mp, &mountlist, mnt_list) {
                from = mp->mnt_stat.f_mntfromname;
                if (strcmp(from, oldname) == 0) {
                        /* Exact match: take the new name as is. */
                        (void)strlcpy(from, newname,
                            sizeof(mp->mnt_stat.f_mntfromname));
                } else if (strncmp(from, oldname, prefixlen) == 0 &&
                    (from[prefixlen] == '/' || from[prefixlen] == '@')) {
                        /*
                         * Prefix match: build the result in a scratch
                         * buffer first, because the suffix being kept
                         * lives in the destination itself.
                         */
                        (void)snprintf(scratch, sizeof(scratch), "%s%s",
                            newname, from + prefixlen);
                        (void)strlcpy(from, scratch,
                            sizeof(mp->mnt_stat.f_mntfromname));
                }
        }
        mtx_unlock(&mountlist_mtx);
}
#endif