sys/kern/vfs_mount.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1999-2004 Poul-Henning Kamp
   5  * Copyright (c) 1999 Michael Smith
   6  * Copyright (c) 1989, 1993
   7  *      The Regents of the University of California.  All rights reserved.
   8  * (c) UNIX System Laboratories, Inc.
   9  * All or some portions of this file are derived from material licensed
  10  * to the University of California by American Telephone and Telegraph
  11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  12  * the permission of UNIX System Laboratories, Inc.
  13  *
  14  * Redistribution and use in source and binary forms, with or without
  15  * modification, are permitted provided that the following conditions
  16  * are met:
  17  * 1. Redistributions of source code must retain the above copyright
  18  *    notice, this list of conditions and the following disclaimer.
  19  * 2. Redistributions in binary form must reproduce the above copyright
  20  *    notice, this list of conditions and the following disclaimer in the
  21  *    documentation and/or other materials provided with the distribution.
  22  * 3. Neither the name of the University nor the names of its contributors
  23  *    may be used to endorse or promote products derived from this software
  24  *    without specific prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  */
  38
  39 #include <sys/param.h>
  40 #include <sys/conf.h>
  41 #include <sys/smp.h>
  42 #include <sys/devctl.h>
  43 #include <sys/eventhandler.h>
  44 #include <sys/fcntl.h>
  45 #include <sys/jail.h>
  46 #include <sys/kernel.h>
  47 #include <sys/ktr.h>
  48 #include <sys/libkern.h>
  49 #include <sys/limits.h>
  50 #include <sys/malloc.h>
  51 #include <sys/mount.h>
  52 #include <sys/mutex.h>
  53 #include <sys/namei.h>
  54 #include <sys/priv.h>
  55 #include <sys/proc.h>
  56 #include <sys/filedesc.h>
  57 #include <sys/reboot.h>
  58 #include <sys/sbuf.h>
  59 #include <sys/syscallsubr.h>
  60 #include <sys/sysproto.h>
  61 #include <sys/sx.h>
  62 #include <sys/sysctl.h>
  63 #include <sys/systm.h>
  64 #include <sys/taskqueue.h>
  65 #include <sys/vnode.h>
  66 #include <vm/uma.h>
  67
  68 #include <geom/geom.h>
  69
  70 #include <machine/stdarg.h>
  71
  72 #include <security/audit/audit.h>
  73 #include <security/mac/mac_framework.h>
  74
/*
 * Upper bound, in bytes, used by vfs_buildopts() when validating a mount
 * request: it caps each option name length, each option value length and
 * the running total of memory accounted to the option list.
 */
#define VFS_MOUNTARG_SIZE_MAX   (1024 * 64)

/* Prototypes for static helpers of this file. */
static int      vfs_domount(struct thread *td, const char *fstype, char *fspath,
                    uint64_t fsflags, bool jail_export,
                    struct vfsoptlist **optlist);
static void     free_mntarg(struct mntarg *ma);

/* vfs.usermount: read/write knob, off (0) by default. */
static int      usermount = 0;
SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
    "Unprivileged users may mount and unmount file systems");

/*
 * vfs.default_autoro: read/write knob, off by default.  vfs_donmount()
 * seeds its local "autoro" setting from this value.
 */
static bool     default_autoro = false;
SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0,
    "Retry failed r/w mount as r/o if no explicit ro/rw option is specified");

/* vfs.recursive_forced_unmount: read/write knob, off by default. */
static bool     recursive_forced_unmount = false;
SYSCTL_BOOL(_vfs, OID_AUTO, recursive_forced_unmount, CTLFLAG_RW,
    &recursive_forced_unmount, 0, "Recursively unmount stacked upper mounts"
    " when a file system is forcibly unmounted");

/* Parent node vfs.deferred_unmount for the three knobs below. */
static SYSCTL_NODE(_vfs, OID_AUTO, deferred_unmount,
    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "deferred unmount controls");

/* vfs.deferred_unmount.retry_limit: read/write, defaults to 10. */
static unsigned int     deferred_unmount_retry_limit = 10;
SYSCTL_UINT(_vfs_deferred_unmount, OID_AUTO, retry_limit, CTLFLAG_RW,
    &deferred_unmount_retry_limit, 0,
    "Maximum number of retries for deferred unmount failure");

/*
 * vfs.deferred_unmount.retry_delay_hz: read/write.  No static initializer;
 * vfs_mount_init() sets it to hz at boot.
 */
static int      deferred_unmount_retry_delay_hz;
SYSCTL_INT(_vfs_deferred_unmount, OID_AUTO, retry_delay_hz, CTLFLAG_RW,
    &deferred_unmount_retry_delay_hz, 0,
    "Delay in units of [1/kern.hz]s when retrying a failed deferred unmount");

/* vfs.deferred_unmount.total_retries: read-only counter, starts at 0. */
static int      deferred_unmount_total_retries = 0;
SYSCTL_INT(_vfs_deferred_unmount, OID_AUTO, total_retries, CTLFLAG_RD,
    &deferred_unmount_total_retries, 0,
    "Total number of retried deferred unmounts");
 112
/*
 * Malloc types.  M_MOUNT is the type used in this file for mount option
 * lists and their name/value buffers.
 */
MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure");
/* UMA zone backing struct mount; created in vfs_mount_init(). */
static uma_zone_t mount_zone;

/* List of mounted filesystems. */
struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);

/*
 * For any iteration/modification of mountlist.
 * Initialized in vfs_mount_init().
 */
struct mtx_padalign __exclusive_cache_line mountlist_mtx;

/* Event handler lists for the vfs_mounted / vfs_unmounted events. */
EVENTHANDLER_LIST_DEFINE(vfs_mounted);
EVENTHANDLER_LIST_DEFINE(vfs_unmounted);

/*
 * Deferred unmount machinery.  deferred_unmount_task is bound to
 * vfs_deferred_unmount() and to the "deferred_unmount" taskqueue in
 * vfs_mount_init().
 * NOTE(review): deferred_unmount_lock presumably protects
 * deferred_unmount_list; the users are outside this part of the file.
 */
static void vfs_deferred_unmount(void *arg, int pending);
static struct timeout_task deferred_unmount_task;
static struct mtx deferred_unmount_lock;
MTX_SYSINIT(deferred_unmount, &deferred_unmount_lock, "deferred_unmount",
    MTX_DEF);
static STAILQ_HEAD(, mount) deferred_unmount_list =
    STAILQ_HEAD_INITIALIZER(deferred_unmount_list);
TASKQUEUE_DEFINE_THREAD(deferred_unmount);

/* Prototype for a static helper of this file. */
static void mount_devctl_event(const char *type, struct mount *mp, bool donew);
 136
/*
 * Global opts, taken by all filesystems.
 *
 * NULL-terminated array of option names.
 * NOTE(review): the consumer of this table is outside this part of the
 * file; presumably it is used when filtering per-filesystem option lists.
 */
static const char *global_opts[] = {
        "errmsg",
        "fstype",
        "fspath",
        "ro",
        "rw",
        "nosuid",
        "noexec",
        NULL
};
 150
 151 static int
 152 mount_init(void *mem, int size, int flags)
 153 {
 154         struct mount *mp;
 155
 156         mp = (struct mount *)mem;
 157         mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
 158         mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF);
 159         lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
 160         mp->mnt_pcpu = uma_zalloc_pcpu(pcpu_zone_16, M_WAITOK | M_ZERO);
 161         mp->mnt_ref = 0;
 162         mp->mnt_vfs_ops = 1;
 163         mp->mnt_rootvnode = NULL;
 164         return (0);
 165 }
 166
 167 static void
 168 mount_fini(void *mem, int size)
 169 {
 170         struct mount *mp;
 171
 172         mp = (struct mount *)mem;
 173         uma_zfree_pcpu(pcpu_zone_16, mp->mnt_pcpu);
 174         lockdestroy(&mp->mnt_explock);
 175         mtx_destroy(&mp->mnt_listmtx);
 176         mtx_destroy(&mp->mnt_mtx);
 177 }
 178
 179 static void
 180 vfs_mount_init(void *dummy __unused)
 181 {
 182         TIMEOUT_TASK_INIT(taskqueue_deferred_unmount, &deferred_unmount_task,
 183             0, vfs_deferred_unmount, NULL);
 184         deferred_unmount_retry_delay_hz = hz;
 185         mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL,
 186             NULL, mount_init, mount_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
 187         mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
 188 }
 189 SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL);
 190
 191 /*
 192  * ---------------------------------------------------------------------
 193  * Functions for building and sanitizing the mount options
 194  */
 195
 196 /* Remove one mount option. */
 197 static void
 198 vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
 199 {
 200
 201         TAILQ_REMOVE(opts, opt, link);
 202         free(opt->name, M_MOUNT);
 203         if (opt->value != NULL)
 204                 free(opt->value, M_MOUNT);
 205         free(opt, M_MOUNT);
 206 }
 207
 208 /* Release all resources related to the mount options. */
 209 void
 210 vfs_freeopts(struct vfsoptlist *opts)
 211 {
 212         struct vfsopt *opt;
 213
 214         while (!TAILQ_EMPTY(opts)) {
 215                 opt = TAILQ_FIRST(opts);
 216                 vfs_freeopt(opts, opt);
 217         }
 218         free(opts, M_MOUNT);
 219 }
 220
 221 void
 222 vfs_deleteopt(struct vfsoptlist *opts, const char *name)
 223 {
 224         struct vfsopt *opt, *temp;
 225
 226         if (opts == NULL)
 227                 return;
 228         TAILQ_FOREACH_SAFE(opt, opts, link, temp)  {
 229                 if (strcmp(opt->name, name) == 0)
 230                         vfs_freeopt(opts, opt);
 231         }
 232 }
 233
 234 static int
 235 vfs_isopt_ro(const char *opt)
 236 {
 237
 238         if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 ||
 239             strcmp(opt, "norw") == 0)
 240                 return (1);
 241         return (0);
 242 }
 243
 244 static int
 245 vfs_isopt_rw(const char *opt)
 246 {
 247
 248         if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0)
 249                 return (1);
 250         return (0);
 251 }
 252
 253 /*
 254  * Check if options are equal (with or without the "no" prefix).
 255  */
 256 static int
 257 vfs_equalopts(const char *opt1, const char *opt2)
 258 {
 259         char *p;
 260
 261         /* "opt" vs. "opt" or "noopt" vs. "noopt" */
 262         if (strcmp(opt1, opt2) == 0)
 263                 return (1);
 264         /* "noopt" vs. "opt" */
 265         if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
 266                 return (1);
 267         /* "opt" vs. "noopt" */
 268         if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
 269                 return (1);
 270         while ((p = strchr(opt1, '.')) != NULL &&
 271             !strncmp(opt1, opt2, ++p - opt1)) {
 272                 opt2 += p - opt1;
 273                 opt1 = p;
 274                 /* "foo.noopt" vs. "foo.opt" */
 275                 if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
 276                         return (1);
 277                 /* "foo.opt" vs. "foo.noopt" */
 278                 if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
 279                         return (1);
 280         }
 281         /* "ro" / "rdonly" / "norw" / "rw" / "noro" */
 282         if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) &&
 283             (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2)))
 284                 return (1);
 285         return (0);
 286 }
 287
 288 /*
 289  * If a mount option is specified several times,
 290  * (with or without the "no" prefix) only keep
 291  * the last occurrence of it.
 292  */
 293 static void
 294 vfs_sanitizeopts(struct vfsoptlist *opts)
 295 {
 296         struct vfsopt *opt, *opt2, *tmp;
 297
 298         TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) {
 299                 opt2 = TAILQ_PREV(opt, vfsoptlist, link);
 300                 while (opt2 != NULL) {
 301                         if (vfs_equalopts(opt->name, opt2->name)) {
 302                                 tmp = TAILQ_PREV(opt2, vfsoptlist, link);
 303                                 vfs_freeopt(opts, opt2);
 304                                 opt2 = tmp;
 305                         } else {
 306                                 opt2 = TAILQ_PREV(opt2, vfsoptlist, link);
 307                         }
 308                 }
 309         }
 310 }
 311
 312 /*
 313  * Build a linked list of mount options from a struct uio.
 314  */
 315 int
 316 vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
 317 {
 318         struct vfsoptlist *opts;
 319         struct vfsopt *opt;
 320         size_t memused, namelen, optlen;
 321         unsigned int i, iovcnt;
 322         int error;
 323
 324         opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
 325         TAILQ_INIT(opts);
 326         memused = 0;
 327         iovcnt = auio->uio_iovcnt;
 328         for (i = 0; i < iovcnt; i += 2) {
 329                 namelen = auio->uio_iov[i].iov_len;
 330                 optlen = auio->uio_iov[i + 1].iov_len;
 331                 memused += sizeof(struct vfsopt) + optlen + namelen;
 332                 /*
 333                  * Avoid consuming too much memory, and attempts to overflow
 334                  * memused.
 335                  */
 336                 if (memused > VFS_MOUNTARG_SIZE_MAX ||
 337                     optlen > VFS_MOUNTARG_SIZE_MAX ||
 338                     namelen > VFS_MOUNTARG_SIZE_MAX) {
 339                         error = EINVAL;
 340                         goto bad;
 341                 }
 342
 343                 opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
 344                 opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
 345                 opt->value = NULL;
 346                 opt->len = 0;
 347                 opt->pos = i / 2;
 348                 opt->seen = 0;
 349
 350                 /*
 351                  * Do this early, so jumps to "bad" will free the current
 352                  * option.
 353                  */
 354                 TAILQ_INSERT_TAIL(opts, opt, link);
 355
 356                 if (auio->uio_segflg == UIO_SYSSPACE) {
 357                         bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
 358                 } else {
 359                         error = copyin(auio->uio_iov[i].iov_base, opt->name,
 360                             namelen);
 361                         if (error)
 362                                 goto bad;
 363                 }
 364                 /* Ensure names are null-terminated strings. */
 365                 if (namelen == 0 || opt->name[namelen - 1] != '\0') {
 366                         error = EINVAL;
 367                         goto bad;
 368                 }
 369                 if (optlen != 0) {
 370                         opt->len = optlen;
 371                         opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
 372                         if (auio->uio_segflg == UIO_SYSSPACE) {
 373                                 bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
 374                                     optlen);
 375                         } else {
 376                                 error = copyin(auio->uio_iov[i + 1].iov_base,
 377                                     opt->value, optlen);
 378                                 if (error)
 379                                         goto bad;
 380                         }
 381                 }
 382         }
 383         vfs_sanitizeopts(opts);
 384         *options = opts;
 385         return (0);
 386 bad:
 387         vfs_freeopts(opts);
 388         return (error);
 389 }
 390
 391 /*
 392  * Merge the old mount options with the new ones passed
 393  * in the MNT_UPDATE case.
 394  *
 395  * XXX: This function will keep a "nofoo" option in the new
 396  * options.  E.g, if the option's canonical name is "foo",
 397  * "nofoo" ends up in the mount point's active options.
 398  */
 399 static void
 400 vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts)
 401 {
 402         struct vfsopt *opt, *new;
 403
 404         TAILQ_FOREACH(opt, oldopts, link) {
 405                 new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
 406                 new->name = strdup(opt->name, M_MOUNT);
 407                 if (opt->len != 0) {
 408                         new->value = malloc(opt->len, M_MOUNT, M_WAITOK);
 409                         bcopy(opt->value, new->value, opt->len);
 410                 } else
 411                         new->value = NULL;
 412                 new->len = opt->len;
 413                 new->seen = opt->seen;
 414                 TAILQ_INSERT_HEAD(toopts, new, link);
 415         }
 416         vfs_sanitizeopts(toopts);
 417 }
 418
 419 /*
 420  * Mount a filesystem.
 421  */
 422 #ifndef _SYS_SYSPROTO_H_
 423 struct nmount_args {
 424         struct iovec *iovp;
 425         unsigned int iovcnt;
 426         int flags;
 427 };
 428 #endif
 429 int
 430 sys_nmount(struct thread *td, struct nmount_args *uap)
 431 {
 432         struct uio *auio;
 433         int error;
 434         u_int iovcnt;
 435         uint64_t flags;
 436
 437         /*
 438          * Mount flags are now 64-bits. On 32-bit archtectures only
 439          * 32-bits are passed in, but from here on everything handles
 440          * 64-bit flags correctly.
 441          */
 442         flags = uap->flags;
 443
 444         AUDIT_ARG_FFLAGS(flags);
 445         CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__,
 446             uap->iovp, uap->iovcnt, flags);
 447
 448         /*
 449          * Filter out MNT_ROOTFS.  We do not want clients of nmount() in
 450          * userspace to set this flag, but we must filter it out if we want
 451          * MNT_UPDATE on the root file system to work.
 452          * MNT_ROOTFS should only be set by the kernel when mounting its
 453          * root file system.
 454          */
 455         flags &= ~MNT_ROOTFS;
 456
 457         iovcnt = uap->iovcnt;
 458         /*
 459          * Check that we have an even number of iovec's
 460          * and that we have at least two options.
 461          */
 462         if ((iovcnt & 1) || (iovcnt < 4)) {
 463                 CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__,
 464                     uap->iovcnt);
 465                 return (EINVAL);
 466         }
 467
 468         error = copyinuio(uap->iovp, iovcnt, &auio);
 469         if (error) {
 470                 CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno",
 471                     __func__, error);
 472                 return (error);
 473         }
 474         error = vfs_donmount(td, flags, auio);
 475
 476         free(auio, M_IOV);
 477         return (error);
 478 }
 479
 480 /*
 481  * ---------------------------------------------------------------------
 482  * Various utility functions
 483  */
 484
 485 /*
 486  * Get a reference on a mount point from a vnode.
 487  *
 488  * The vnode is allowed to be passed unlocked and race against dooming. Note in
 489  * such case there are no guarantees the referenced mount point will still be
 490  * associated with it after the function returns.
 491  */
 492 struct mount *
 493 vfs_ref_from_vp(struct vnode *vp)
 494 {
 495         struct mount *mp;
 496         struct mount_pcpu *mpcpu;
 497
 498         mp = atomic_load_ptr(&vp->v_mount);
 499         if (__predict_false(mp == NULL)) {
 500                 return (mp);
 501         }
 502         if (vfs_op_thread_enter(mp, mpcpu)) {
 503                 if (__predict_true(mp == vp->v_mount)) {
 504                         vfs_mp_count_add_pcpu(mpcpu, ref, 1);
 505                         vfs_op_thread_exit(mp, mpcpu);
 506                 } else {
 507                         vfs_op_thread_exit(mp, mpcpu);
 508                         mp = NULL;
 509                 }
 510         } else {
 511                 MNT_ILOCK(mp);
 512                 if (mp == vp->v_mount) {
 513                         MNT_REF(mp);
 514                         MNT_IUNLOCK(mp);
 515                 } else {
 516                         MNT_IUNLOCK(mp);
 517                         mp = NULL;
 518                 }
 519         }
 520         return (mp);
 521 }
 522
 523 void
 524 vfs_ref(struct mount *mp)
 525 {
 526         struct mount_pcpu *mpcpu;
 527
 528         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 529         if (vfs_op_thread_enter(mp, mpcpu)) {
 530                 vfs_mp_count_add_pcpu(mpcpu, ref, 1);
 531                 vfs_op_thread_exit(mp, mpcpu);
 532                 return;
 533         }
 534
 535         MNT_ILOCK(mp);
 536         MNT_REF(mp);
 537         MNT_IUNLOCK(mp);
 538 }
 539
 540 /*
 541  * Register ump as an upper mount of the mount associated with
 542  * vnode vp.  This registration will be tracked through
 543  * mount_upper_node upper, which should be allocated by the
 544  * caller and stored in per-mount data associated with mp.
 545  *
 546  * If successful, this function will return the mount associated
 547  * with vp, and will ensure that it cannot be unmounted until
 548  * ump has been unregistered as one of its upper mounts.
 549  *
 550  * Upon failure this function will return NULL.
 551  */
 552 struct mount *
 553 vfs_register_upper_from_vp(struct vnode *vp, struct mount *ump,
 554     struct mount_upper_node *upper)
 555 {
 556         struct mount *mp;
 557
 558         mp = atomic_load_ptr(&vp->v_mount);
 559         if (mp == NULL)
 560                 return (NULL);
 561         MNT_ILOCK(mp);
 562         if (mp != vp->v_mount ||
 563             ((mp->mnt_kern_flag & (MNTK_UNMOUNT | MNTK_RECURSE)) != 0)) {
 564                 MNT_IUNLOCK(mp);
 565                 return (NULL);
 566         }
 567         KASSERT(ump != mp, ("upper and lower mounts are identical"));
 568         upper->mp = ump;
 569         MNT_REF(mp);
 570         TAILQ_INSERT_TAIL(&mp->mnt_uppers, upper, mnt_upper_link);
 571         MNT_IUNLOCK(mp);
 572         return (mp);
 573 }
 574
 575 /*
 576  * Register upper mount ump to receive vnode unlink/reclaim
 577  * notifications from lower mount mp. This registration will
 578  * be tracked through mount_upper_node upper, which should be
 579  * allocated by the caller and stored in per-mount data
 580  * associated with mp.
 581  *
 582  * ump must already be registered as an upper mount of mp
 583  * through a call to vfs_register_upper_from_vp().
 584  */
 585 void
 586 vfs_register_for_notification(struct mount *mp, struct mount *ump,
 587     struct mount_upper_node *upper)
 588 {
 589         upper->mp = ump;
 590         MNT_ILOCK(mp);
 591         TAILQ_INSERT_TAIL(&mp->mnt_notify, upper, mnt_upper_link);
 592         MNT_IUNLOCK(mp);
 593 }
 594
 595 static void
 596 vfs_drain_upper_locked(struct mount *mp)
 597 {
 598         mtx_assert(MNT_MTX(mp), MA_OWNED);
 599         while (mp->mnt_upper_pending != 0) {
 600                 mp->mnt_kern_flag |= MNTK_UPPER_WAITER;
 601                 msleep(&mp->mnt_uppers, MNT_MTX(mp), 0, "mntupw", 0);
 602         }
 603 }
 604
 605 /*
 606  * Undo a previous call to vfs_register_for_notification().
 607  * The mount represented by upper must be currently registered
 608  * as an upper mount for mp.
 609  */
 610 void
 611 vfs_unregister_for_notification(struct mount *mp,
 612     struct mount_upper_node *upper)
 613 {
 614         MNT_ILOCK(mp);
 615         vfs_drain_upper_locked(mp);
 616         TAILQ_REMOVE(&mp->mnt_notify, upper, mnt_upper_link);
 617         MNT_IUNLOCK(mp);
 618 }
 619
 620 /*
 621  * Undo a previous call to vfs_register_upper_from_vp().
 622  * This must be done before mp can be unmounted.
 623  */
 624 void
 625 vfs_unregister_upper(struct mount *mp, struct mount_upper_node *upper)
 626 {
 627         MNT_ILOCK(mp);
 628         KASSERT((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0,
 629             ("registered upper with pending unmount"));
 630         vfs_drain_upper_locked(mp);
 631         TAILQ_REMOVE(&mp->mnt_uppers, upper, mnt_upper_link);
 632         if ((mp->mnt_kern_flag & MNTK_TASKQUEUE_WAITER) != 0 &&
 633             TAILQ_EMPTY(&mp->mnt_uppers)) {
 634                 mp->mnt_kern_flag &= ~MNTK_TASKQUEUE_WAITER;
 635                 wakeup(&mp->mnt_taskqueue_link);
 636         }
 637         MNT_REL(mp);
 638         MNT_IUNLOCK(mp);
 639 }
 640
 641 void
 642 vfs_rel(struct mount *mp)
 643 {
 644         struct mount_pcpu *mpcpu;
 645
 646         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 647         if (vfs_op_thread_enter(mp, mpcpu)) {
 648                 vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
 649                 vfs_op_thread_exit(mp, mpcpu);
 650                 return;
 651         }
 652
 653         MNT_ILOCK(mp);
 654         MNT_REL(mp);
 655         MNT_IUNLOCK(mp);
 656 }
 657
/*
 * Allocate and initialize the mount point struct.
 *
 * vp      vnode the new mount will cover; stored in mnt_vnodecovered
 *         (vfs_mount_destroy() vrele()s it when non-NULL).
 * vfsp    filesystem type; its vfc_refcount is incremented and its ops
 *         vector, type number and name are copied into the mount.
 * fspath  mount point path, copied into mnt_stat.f_mntonname (truncated
 *         to MNAMELEN by strlcpy()).
 * cred    credentials of the mounter; duplicated into mnt_cred, and its
 *         cr_uid becomes mnt_stat.f_owner.
 *
 * Returns the new mount, on which vfs_busy() has been called.  The
 * allocation uses M_WAITOK.
 */
struct mount *
vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
    struct ucred *cred)
{
        struct mount *mp;

        mp = uma_zalloc(mount_zone, M_WAITOK);
        /* Clear only the mnt_startzero..mnt_endzero range of the item. */
        bzero(&mp->mnt_startzero,
            __rangeof(struct mount, mnt_startzero, mnt_endzero));
        mp->mnt_kern_flag = 0;
        mp->mnt_flag = 0;
        mp->mnt_rootvnode = NULL;
        mp->mnt_vnodecovered = NULL;
        mp->mnt_op = NULL;
        mp->mnt_vfc = NULL;
        TAILQ_INIT(&mp->mnt_nvnodelist);
        mp->mnt_nvnodelistsize = 0;
        TAILQ_INIT(&mp->mnt_lazyvnodelist);
        mp->mnt_lazyvnodelistsize = 0;
        /*
         * The counters must still hold the values established by
         * mount_init() (mnt_ref 0, mnt_vfs_ops 1).
         */
        MPPASS(mp->mnt_ref == 0 && mp->mnt_lockref == 0 &&
            mp->mnt_writeopcount == 0, mp);
        MPASSERT(mp->mnt_vfs_ops == 1, mp,
            ("vfs_ops should be 1 but %d found", mp->mnt_vfs_ops));
        /* Return value deliberately ignored. */
        (void) vfs_busy(mp, MBF_NOWAIT);
        atomic_add_acq_int(&vfsp->vfc_refcount, 1);
        mp->mnt_op = vfsp->vfc_vfsops;
        mp->mnt_vfc = vfsp;
        mp->mnt_stat.f_type = vfsp->vfc_typenum;
        /*
         * NOTE(review): mnt_gen is incremented rather than assigned, so it
         * presumably lies outside the zeroed range and persists across
         * reuse of the zone item -- confirm against struct mount.
         */
        mp->mnt_gen++;
        strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
        mp->mnt_vnodecovered = vp;
        mp->mnt_cred = crdup(cred);
        mp->mnt_stat.f_owner = cred->cr_uid;
        strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
        mp->mnt_iosize_max = DFLTPHYS;
#ifdef MAC
        mac_mount_init(mp);
        mac_mount_create(cred, mp);
#endif
        /* Random per-mount hash seed. */
        arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
        mp->mnt_upper_pending = 0;
        TAILQ_INIT(&mp->mnt_uppers);
        TAILQ_INIT(&mp->mnt_notify);
        mp->mnt_taskqueue_flags = 0;
        mp->mnt_unmount_retries = 0;
        return (mp);
}
 708
/*
 * Destroy the mount struct previously allocated by vfs_mount_alloc().
 *
 * Sets MNTK_REFEXPIRE, wakes any MNTK_MWAIT sleeper and then sleeps until
 * the reference count has dropped to zero.  After that it drops the
 * filesystem type's refcount, checks that no vnodes or upper mounts are
 * still attached (panicking on a dangling vnode), releases the covered
 * vnode, MAC label, option list, export state and credentials, and
 * returns the structure to mount_zone.
 */
void
vfs_mount_destroy(struct mount *mp)
{

        MPPASS(mp->mnt_vfs_ops != 0, mp);

        vfs_assert_mount_counters(mp);

        MNT_ILOCK(mp);
        mp->mnt_kern_flag |= MNTK_REFEXPIRE;
        if (mp->mnt_kern_flag & MNTK_MWAIT) {
                mp->mnt_kern_flag &= ~MNTK_MWAIT;
                wakeup(mp);
        }
        /* Wait for the remaining references to go away. */
        while (mp->mnt_ref)
                msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0);
        KASSERT(mp->mnt_ref == 0,
            ("%s: invalid refcount in the drain path @ %s:%d", __func__,
            __FILE__, __LINE__));
        MPPASS(mp->mnt_writeopcount == 0, mp);
        MPPASS(mp->mnt_secondary_writes == 0, mp);
        /* Pairs with the vfc_refcount increment in vfs_mount_alloc(). */
        atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1);
        if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
                struct vnode *vp;

                /* Print every leftover vnode before panicking. */
                TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
                        vn_printf(vp, "dangling vnode ");
                panic("unmount: dangling vnode");
        }
        KASSERT(mp->mnt_upper_pending == 0, ("mnt_upper_pending"));
        KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers"));
        KASSERT(TAILQ_EMPTY(&mp->mnt_notify), ("mnt_notify"));
        MPPASS(mp->mnt_nvnodelistsize == 0, mp);
        MPPASS(mp->mnt_lazyvnodelistsize == 0, mp);
        MPPASS(mp->mnt_lockref == 0, mp);
        MNT_IUNLOCK(mp);

        /* The item must go back to the zone in its mount_init() state. */
        MPASSERT(mp->mnt_vfs_ops == 1, mp,
            ("vfs_ops should be 1 but %d found", mp->mnt_vfs_ops));

        MPASSERT(mp->mnt_rootvnode == NULL, mp,
            ("mount point still has a root vnode %p", mp->mnt_rootvnode));

        if (mp->mnt_vnodecovered != NULL)
                vrele(mp->mnt_vnodecovered);
#ifdef MAC
        mac_mount_destroy(mp);
#endif
        if (mp->mnt_opt != NULL)
                vfs_freeopts(mp->mnt_opt);
        if (mp->mnt_exjail != NULL) {
                /* Drop the prison's export count along with the cred. */
                atomic_subtract_int(&mp->mnt_exjail->cr_prison->pr_exportcnt,
                    1);
                crfree(mp->mnt_exjail);
        }
        if (mp->mnt_export != NULL) {
                vfs_free_addrlist(mp->mnt_export);
                free(mp->mnt_export, M_MOUNT);
        }
        /* Releases the credentials duplicated in vfs_mount_alloc(). */
        crfree(mp->mnt_cred);
        uma_zfree(mount_zone, mp);
}
 774
 775 static bool
 776 vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int error)
 777 {
 778         /* This is an upgrade of an exisiting mount. */
 779         if ((fsflags & MNT_UPDATE) != 0)
 780                 return (false);
 781         /* This is already an R/O mount. */
 782         if ((fsflags & MNT_RDONLY) != 0)
 783                 return (false);
 784
 785         switch (error) {
 786         case ENODEV:    /* generic, geom, ... */
 787         case EACCES:    /* cam/scsi, ... */
 788         case EROFS:     /* md, mmcsd, ... */
 789                 /*
 790                  * These errors can be returned by the storage layer to signal
 791                  * that the media is read-only.  No harm in the R/O mount
 792                  * attempt if the error was returned for some other reason.
 793                  */
 794                 return (true);
 795         default:
 796                 return (false);
 797         }
 798 }
 799
 800 int
 801 vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions)
 802 {
 803         struct vfsoptlist *optlist;
 804         struct vfsopt *opt, *tmp_opt;
 805         char *fstype, *fspath, *errmsg;
 806         int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
 807         bool autoro, has_nonexport, jail_export;
 808
 809         errmsg = fspath = NULL;
 810         errmsg_len = fspathlen = 0;
 811         errmsg_pos = -1;
 812         autoro = default_autoro;
 813
 814         error = vfs_buildopts(fsoptions, &optlist);
 815         if (error)
 816                 return (error);
 817
 818         if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
 819                 errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
 820
 821         /*
 822          * We need these two options before the others,
 823          * and they are mandatory for any filesystem.
 824          * Ensure they are NUL terminated as well.
 825          */
 826         fstypelen = 0;
 827         error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
 828         if (error || fstypelen <= 0 || fstype[fstypelen - 1] != '\0') {
 829                 error = EINVAL;
 830                 if (errmsg != NULL)
 831                         strncpy(errmsg, "Invalid fstype", errmsg_len);
 832                 goto bail;
 833         }
 834         fspathlen = 0;
 835         error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
 836         if (error || fspathlen <= 0 || fspath[fspathlen - 1] != '\0') {
 837                 error = EINVAL;
 838                 if (errmsg != NULL)
 839                         strncpy(errmsg, "Invalid fspath", errmsg_len);
 840                 goto bail;
 841         }
 842
 843         /*
 844          * Check to see that "export" is only used with the "update", "fstype",
 845          * "fspath", "from" and "errmsg" options when in a vnet jail.
 846          * These are the ones used to set/update exports by mountd(8).
 847          * If only the above options are set in a jail that can run mountd(8),
 848          * then the jail_export argument of vfs_domount() will be true.
 849          * When jail_export is true, the vfs_suser() check does not cause
 850          * failure, but limits the update to exports only.
 851          * This allows mountd(8) running within the vnet jail
 852          * to export file systems visible within the jail, but
 853          * mounted outside of the jail.
 854          */
 855         /*
 856          * We need to see if we have the "update" option
 857          * before we call vfs_domount(), since vfs_domount() has special
 858          * logic based on MNT_UPDATE.  This is very important
 859          * when we want to update the root filesystem.
 860          */
 861         has_nonexport = false;
 862         jail_export = false;
 863         TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) {
 864                 int do_freeopt = 0;
 865
 866                 if (jailed(td->td_ucred) &&
 867                     strcmp(opt->name, "export") != 0 &&
 868                     strcmp(opt->name, "update") != 0 &&
 869                     strcmp(opt->name, "fstype") != 0 &&
 870                     strcmp(opt->name, "fspath") != 0 &&
 871                     strcmp(opt->name, "from") != 0 &&
 872                     strcmp(opt->name, "errmsg") != 0)
 873                         has_nonexport = true;
 874                 if (strcmp(opt->name, "update") == 0) {
 875                         fsflags |= MNT_UPDATE;
 876                         do_freeopt = 1;
 877                 }
 878                 else if (strcmp(opt->name, "async") == 0)
 879                         fsflags |= MNT_ASYNC;
 880                 else if (strcmp(opt->name, "force") == 0) {
 881                         fsflags |= MNT_FORCE;
 882                         do_freeopt = 1;
 883                 }
 884                 else if (strcmp(opt->name, "reload") == 0) {
 885                         fsflags |= MNT_RELOAD;
 886                         do_freeopt = 1;
 887                 }
 888                 else if (strcmp(opt->name, "multilabel") == 0)
 889                         fsflags |= MNT_MULTILABEL;
 890                 else if (strcmp(opt->name, "noasync") == 0)
 891                         fsflags &= ~MNT_ASYNC;
 892                 else if (strcmp(opt->name, "noatime") == 0)
 893                         fsflags |= MNT_NOATIME;
 894                 else if (strcmp(opt->name, "atime") == 0) {
 895                         free(opt->name, M_MOUNT);
 896                         opt->name = strdup("nonoatime", M_MOUNT);
 897                 }
 898                 else if (strcmp(opt->name, "noclusterr") == 0)
 899                         fsflags |= MNT_NOCLUSTERR;
 900                 else if (strcmp(opt->name, "clusterr") == 0) {
 901                         free(opt->name, M_MOUNT);
 902                         opt->name = strdup("nonoclusterr", M_MOUNT);
 903                 }
 904                 else if (strcmp(opt->name, "noclusterw") == 0)
 905                         fsflags |= MNT_NOCLUSTERW;
 906                 else if (strcmp(opt->name, "clusterw") == 0) {
 907                         free(opt->name, M_MOUNT);
 908                         opt->name = strdup("nonoclusterw", M_MOUNT);
 909                 }
 910                 else if (strcmp(opt->name, "noexec") == 0)
 911                         fsflags |= MNT_NOEXEC;
 912                 else if (strcmp(opt->name, "exec") == 0) {
 913                         free(opt->name, M_MOUNT);
 914                         opt->name = strdup("nonoexec", M_MOUNT);
 915                 }
 916                 else if (strcmp(opt->name, "nosuid") == 0)
 917                         fsflags |= MNT_NOSUID;
 918                 else if (strcmp(opt->name, "suid") == 0) {
 919                         free(opt->name, M_MOUNT);
 920                         opt->name = strdup("nonosuid", M_MOUNT);
 921                 }
 922                 else if (strcmp(opt->name, "nosymfollow") == 0)
 923                         fsflags |= MNT_NOSYMFOLLOW;
 924                 else if (strcmp(opt->name, "symfollow") == 0) {
 925                         free(opt->name, M_MOUNT);
 926                         opt->name = strdup("nonosymfollow", M_MOUNT);
 927                 }
 928                 else if (strcmp(opt->name, "noro") == 0) {
 929                         fsflags &= ~MNT_RDONLY;
 930                         autoro = false;
 931                 }
 932                 else if (strcmp(opt->name, "rw") == 0) {
 933                         fsflags &= ~MNT_RDONLY;
 934                         autoro = false;
 935                 }
 936                 else if (strcmp(opt->name, "ro") == 0) {
 937                         fsflags |= MNT_RDONLY;
 938                         autoro = false;
 939                 }
 940                 else if (strcmp(opt->name, "rdonly") == 0) {
 941                         free(opt->name, M_MOUNT);
 942                         opt->name = strdup("ro", M_MOUNT);
 943                         fsflags |= MNT_RDONLY;
 944                         autoro = false;
 945                 }
 946                 else if (strcmp(opt->name, "autoro") == 0) {
 947                         do_freeopt = 1;
 948                         autoro = true;
 949                 }
 950                 else if (strcmp(opt->name, "suiddir") == 0)
 951                         fsflags |= MNT_SUIDDIR;
 952                 else if (strcmp(opt->name, "sync") == 0)
 953                         fsflags |= MNT_SYNCHRONOUS;
 954                 else if (strcmp(opt->name, "union") == 0)
 955                         fsflags |= MNT_UNION;
 956                 else if (strcmp(opt->name, "export") == 0) {
 957                         fsflags |= MNT_EXPORTED;
 958                         jail_export = true;
 959                 } else if (strcmp(opt->name, "automounted") == 0) {
 960                         fsflags |= MNT_AUTOMOUNTED;
 961                         do_freeopt = 1;
 962                 } else if (strcmp(opt->name, "nocover") == 0) {
 963                         fsflags |= MNT_NOCOVER;
 964                         do_freeopt = 1;
 965                 } else if (strcmp(opt->name, "cover") == 0) {
 966                         fsflags &= ~MNT_NOCOVER;
 967                         do_freeopt = 1;
 968                 } else if (strcmp(opt->name, "emptydir") == 0) {
 969                         fsflags |= MNT_EMPTYDIR;
 970                         do_freeopt = 1;
 971                 } else if (strcmp(opt->name, "noemptydir") == 0) {
 972                         fsflags &= ~MNT_EMPTYDIR;
 973                         do_freeopt = 1;
 974                 }
 975                 if (do_freeopt)
 976                         vfs_freeopt(optlist, opt);
 977         }
 978
 979         /*
 980          * Be ultra-paranoid about making sure the type and fspath
 981          * variables will fit in our mp buffers, including the
 982          * terminating NUL.
 983          */
 984         if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) {
 985                 error = ENAMETOOLONG;
 986                 goto bail;
 987         }
 988
 989         /*
 990          * If has_nonexport is true or the caller is not running within a
 991          * vnet prison that can run mountd(8), set jail_export false.
 992          */
 993         if (has_nonexport || !jailed(td->td_ucred) ||
 994             !prison_check_nfsd(td->td_ucred))
 995                 jail_export = false;
 996
 997         error = vfs_domount(td, fstype, fspath, fsflags, jail_export, &optlist);
 998         if (error == ENODEV) {
 999                 error = EINVAL;
1000                 if (errmsg != NULL)
1001                         strncpy(errmsg, "Invalid fstype", errmsg_len);
1002                 goto bail;
1003         }
1004
1005         /*
1006          * See if we can mount in the read-only mode if the error code suggests
1007          * that it could be possible and the mount options allow for that.
1008          * Never try it if "[no]{ro|rw}" has been explicitly requested and not
1009          * overridden by "autoro".
1010          */
1011         if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) {
1012                 printf("%s: R/W mount failed, possibly R/O media,"
1013                     " trying R/O mount\n", __func__);
1014                 fsflags |= MNT_RDONLY;
1015                 error = vfs_domount(td, fstype, fspath, fsflags, jail_export,
1016                     &optlist);
1017         }
1018 bail:
1019         /* copyout the errmsg */
1020         if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
1021             && errmsg_len > 0 && errmsg != NULL) {
1022                 if (fsoptions->uio_segflg == UIO_SYSSPACE) {
1023                         bcopy(errmsg,
1024                             fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
1025                             fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
1026                 } else {
1027                         copyout(errmsg,
1028                             fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
1029                             fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
1030                 }
1031         }
1032
1033         if (optlist != NULL)
1034                 vfs_freeopts(optlist);
1035         return (error);
1036 }
1037
1038 /*
1039  * Old mount API.
1040  */
#ifndef _SYS_SYSPROTO_H_
/*
 * Argument block consumed by sys_mount().  Declared here only when
 * _SYS_SYSPROTO_H_ is not defined (presumably <sys/sysproto.h> supplies
 * the declaration otherwise).
 */
struct mount_args {
        char    *type;          /* File system type name; copied in with
                                   copyinstr(), at most MFSNAMELEN bytes. */
        char    *path;          /* Mount point path; handed to mount_argsu()
                                   as "fspath", bounded by MNAMELEN. */
        int     flags;          /* MNT_* flags; widened to 64 bits by
                                   sys_mount(). */
        caddr_t data;           /* Passed unchanged to the file system's
                                   vfs_cmount() handler. */
};
#endif
1049 /* ARGSUSED */
1050 int
1051 sys_mount(struct thread *td, struct mount_args *uap)
1052 {
1053         char *fstype;
1054         struct vfsconf *vfsp = NULL;
1055         struct mntarg *ma = NULL;
1056         uint64_t flags;
1057         int error;
1058
1059         /*
1060          * Mount flags are now 64-bits. On 32-bit architectures only
1061          * 32-bits are passed in, but from here on everything handles
1062          * 64-bit flags correctly.
1063          */
1064         flags = uap->flags;
1065
1066         AUDIT_ARG_FFLAGS(flags);
1067
1068         /*
1069          * Filter out MNT_ROOTFS.  We do not want clients of mount() in
1070          * userspace to set this flag, but we must filter it out if we want
1071          * MNT_UPDATE on the root file system to work.
1072          * MNT_ROOTFS should only be set by the kernel when mounting its
1073          * root file system.
1074          */
1075         flags &= ~MNT_ROOTFS;
1076
1077         fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
1078         error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
1079         if (error) {
1080                 free(fstype, M_TEMP);
1081                 return (error);
1082         }
1083
1084         AUDIT_ARG_TEXT(fstype);
1085         vfsp = vfs_byname_kld(fstype, td, &error);
1086         free(fstype, M_TEMP);
1087         if (vfsp == NULL)
1088                 return (EINVAL);
1089         if (((vfsp->vfc_flags & VFCF_SBDRY) != 0 &&
1090             vfsp->vfc_vfsops_sd->vfs_cmount == NULL) ||
1091             ((vfsp->vfc_flags & VFCF_SBDRY) == 0 &&
1092             vfsp->vfc_vfsops->vfs_cmount == NULL))
1093                 return (EOPNOTSUPP);
1094
1095         ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN);
1096         ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
1097         ma = mount_argb(ma, flags & MNT_RDONLY, "noro");
1098         ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid");
1099         ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec");
1100
1101         if ((vfsp->vfc_flags & VFCF_SBDRY) != 0)
1102                 return (vfsp->vfc_vfsops_sd->vfs_cmount(ma, uap->data, flags));
1103         return (vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags));
1104 }
1105
/*
 * vfs_domount_first(): first file system mount (not update)
 *
 * Mounts a new instance of file system type vfsp on top of vnode vp.
 * The caller must hold vp exclusively locked (asserted below) and must
 * not pass MNT_UPDATE in fsflags.
 *
 * On every return path the function has disposed of vp itself: the early
 * failures use vput(), the later paths unlock it and use vrele().
 *
 * On success 0 is returned, the option list is handed over to the mount
 * (mp->mnt_opt) and *optlist is set to NULL.  On failure an errno value
 * is returned and *optlist is left untouched.
 */
static int
vfs_domount_first(
        struct thread *td,              /* Calling thread. */
        struct vfsconf *vfsp,           /* File system type. */
        char *fspath,                   /* Mount path. */
        struct vnode *vp,               /* Vnode to be covered. */
        uint64_t fsflags,               /* Flags common to all filesystems. */
        struct vfsoptlist **optlist     /* Options local to the filesystem. */
        )
{
        struct vattr va;
        struct mount *mp;
        struct vnode *newdp, *rootvp;
        int error, error1;
        bool unmounted;

        ASSERT_VOP_ELOCKED(vp, __func__);
        KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here"));

        /*
         * If the jail of the calling thread lacks permission for this type of
         * file system, or is trying to cover its own root, deny immediately.
         */
        if (jailed(td->td_ucred) && (!prison_allow(td->td_ucred,
            vfsp->vfc_prison_flag) || vp == td->td_ucred->cr_prison->pr_root)) {
                vput(vp);
                return (EPERM);
        }

        /*
         * If the user is not root, ensure that they own the directory
         * onto which we are attempting to mount.
         */
        error = VOP_GETATTR(vp, &va, td->td_ucred);
        if (error == 0 && va.va_uid != td->td_ucred->cr_uid)
                error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN);
        if (error == 0)
                error = vinvalbuf(vp, V_SAVE, 0, 0);
        /*
         * File systems flagged VFCF_FILEMOUNT may cover a directory or a
         * regular file; all others require a directory.
         */
        if (vfsp->vfc_flags & VFCF_FILEMOUNT) {
                if (error == 0 && vp->v_type != VDIR && vp->v_type != VREG)
                        error = EINVAL;
                /*
                 * For file mounts, ensure that there is only one hardlink to the file.
                 */
                if (error == 0 && vp->v_type == VREG && va.va_nlink != 1)
                        error = EINVAL;
        } else {
                if (error == 0 && vp->v_type != VDIR)
                        error = ENOTDIR;
        }
        if (error == 0 && (fsflags & MNT_EMPTYDIR) != 0)
                error = vn_dir_check_empty(vp);
        if (error == 0) {
                /*
                 * Claim the vnode for this mount by setting VI_MOUNT.  If
                 * the flag is already set or something is already mounted
                 * here, fail with EBUSY.
                 */
                VI_LOCK(vp);
                if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
                        vp->v_iflag |= VI_MOUNT;
                else
                        error = EBUSY;
                VI_UNLOCK(vp);
        }
        if (error != 0) {
                vput(vp);
                return (error);
        }
        /* From here on vp is unlocked but still referenced. */
        vn_seqc_write_begin(vp);
        VOP_UNLOCK(vp);

        /* Allocate and initialize the filesystem. */
        mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
        /* XXXMAC: pass to vfs_mount_alloc? */
        mp->mnt_optnew = *optlist;
        /* Set the mount level flags. */
        mp->mnt_flag = (fsflags &
            (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY | MNT_FORCE));

        /*
         * Mount the filesystem.
         * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
         * get.  No freeing of cn_pnbuf.
         */
        /*
         * "error" records a VFS_MOUNT() failure; "error1" records a failure
         * of the follow-up VFS_STATFS()/VFS_ROOT() calls, which only run
         * once VFS_MOUNT() has succeeded.
         */
        error1 = 0;
        unmounted = true;
        if ((error = VFS_MOUNT(mp)) != 0 ||
            (error1 = VFS_STATFS(mp, &mp->mnt_stat)) != 0 ||
            (error1 = VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) != 0) {
                rootvp = NULL;
                if (error1 != 0) {
                        /*
                         * The file system mounted but a later step failed:
                         * roll back with a forced unmount.
                         */
                        MPASS(error == 0);
                        rootvp = vfs_cache_root_clear(mp);
                        if (rootvp != NULL) {
                                /*
                                 * Swap the reference for a hold; the hold is
                                 * dropped with vdrop() further down.
                                 */
                                vhold(rootvp);
                                vrele(rootvp);
                        }
                        (void)vn_start_write(NULL, &mp, V_WAIT);
                        MNT_ILOCK(mp);
                        mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_UNMOUNTF;
                        MNT_IUNLOCK(mp);
                        VFS_PURGE(mp);
                        error = VFS_UNMOUNT(mp, 0);
                        vn_finished_write(mp);
                        if (error != 0) {
                                printf(
                    "failed post-mount (%d): rollback unmount returned %d\n",
                                    error1, error);
                                /* mp cannot be destroyed in this case. */
                                unmounted = false;
                        }
                        /* Report the original failure, not the rollback's. */
                        error = error1;
                }
                vfs_unbusy(mp);
                mp->mnt_vnodecovered = NULL;
                if (unmounted) {
                        /* XXXKIB wait for mnt_lockref drain? */
                        vfs_mount_destroy(mp);
                }
                /* Release the claim taken on the covered vnode above. */
                VI_LOCK(vp);
                vp->v_iflag &= ~VI_MOUNT;
                VI_UNLOCK(vp);
                if (rootvp != NULL) {
                        vn_seqc_write_end(rootvp);
                        vdrop(rootvp);
                }
                vn_seqc_write_end(vp);
                vrele(vp);
                return (error);
        }
        /* VFS_ROOT() returned newdp exclusively locked (LK_EXCLUSIVE). */
        vn_seqc_write_begin(newdp);
        VOP_UNLOCK(newdp);

        /* The new options become the mount's options; the caller loses them. */
        if (mp->mnt_opt != NULL)
                vfs_freeopts(mp->mnt_opt);
        mp->mnt_opt = mp->mnt_optnew;
        *optlist = NULL;

        /*
         * Prevent external consumers of mount options from reading mnt_optnew.
         */
        mp->mnt_optnew = NULL;

        /* MNTK_ASYNC mirrors MNT_ASYNC unless MNTK_NOASYNC forbids it. */
        MNT_ILOCK(mp);
        if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
            (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
                mp->mnt_kern_flag |= MNTK_ASYNC;
        else
                mp->mnt_kern_flag &= ~MNTK_ASYNC;
        MNT_IUNLOCK(mp);

        /*
         * VIRF_MOUNTPOINT and v_mountedhere need to be set under the
         * vp lock to satisfy vfs_lookup() requirements.
         */
        VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY);
        VI_LOCK(vp);
        vn_irflag_set_locked(vp, VIRF_MOUNTPOINT);
        vp->v_mountedhere = mp;
        VI_UNLOCK(vp);
        VOP_UNLOCK(vp);
        cache_purge(vp);

        /*
         * We need to lock both vnodes.
         *
         * Use vn_lock_pair to avoid establishing an ordering between vnodes
         * from different filesystems.
         */
        vn_lock_pair(vp, false, LK_EXCLUSIVE, newdp, false, LK_EXCLUSIVE);

        /* The mount is in place; drop the in-progress marker. */
        VI_LOCK(vp);
        vp->v_iflag &= ~VI_MOUNT;
        VI_UNLOCK(vp);
        /* Place the new filesystem at the end of the mount list. */
        mtx_lock(&mountlist_mtx);
        TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
        mtx_unlock(&mountlist_mtx);
        vfs_event_signal(NULL, VQ_MOUNT, 0);
        VOP_UNLOCK(vp);
        /* The vfs_mounted handlers run with newdp still locked. */
        EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td);
        VOP_UNLOCK(newdp);
        mount_devctl_event("MOUNT", mp, false);
        mountcheckdirs(vp, newdp);
        vn_seqc_write_end(vp);
        vn_seqc_write_end(newdp);
        vrele(newdp);
        /* Only mounts that are not read-only get a syncer vnode. */
        if ((mp->mnt_flag & MNT_RDONLY) == 0)
                vfs_allocate_syncvnode(mp);
        vfs_op_exit(mp);
        vfs_unbusy(mp);
        return (0);
}
1297
1298 /*
1299  * vfs_domount_update(): update of mounted file system
1300  */
1301 static int
1302 vfs_domount_update(
1303         struct thread *td,              /* Calling thread. */
1304         struct vnode *vp,               /* Mount point vnode. */
1305         uint64_t fsflags,               /* Flags common to all filesystems. */
1306         bool jail_export,               /* Got export option in vnet prison. */
1307         struct vfsoptlist **optlist     /* Options local to the filesystem. */
1308         )
1309 {
1310         struct export_args export;
1311         struct o2export_args o2export;
1312         struct vnode *rootvp;
1313         void *bufp;
1314         struct mount *mp;
1315         int error, export_error, i, len, fsid_up_len;
1316         uint64_t flag;
1317         gid_t *grps;
1318         fsid_t *fsid_up;
1319         bool vfs_suser_failed;
1320
1321         ASSERT_VOP_ELOCKED(vp, __func__);
1322         KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here"));
1323         mp = vp->v_mount;
1324
1325         if ((vp->v_vflag & VV_ROOT) == 0) {
1326                 if (vfs_copyopt(*optlist, "export", &export, sizeof(export))
1327                     == 0)
1328                         error = EXDEV;
1329                 else
1330                         error = EINVAL;
1331                 vput(vp);
1332                 return (error);
1333         }
1334
1335         /*
1336          * We only allow the filesystem to be reloaded if it
1337          * is currently mounted read-only.
1338          */
1339         flag = mp->mnt_flag;
1340         if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) {
1341                 vput(vp);
1342                 return (EOPNOTSUPP);    /* Needs translation */
1343         }
1344         /*
1345          * Only privileged root, or (if MNT_USER is set) the user that
1346          * did the original mount is permitted to update it.
1347          */
1348         /*
1349          * For the case of mountd(8) doing exports in a jail, the vfs_suser()
1350          * call does not cause failure.  vfs_domount() has already checked
1351          * that "root" is doing this and vfs_suser() will fail when
1352          * the file system has been mounted outside the jail.
1353          * jail_export set true indicates that "export" is not mixed
1354          * with other options that change mount behaviour.
1355          */
1356         vfs_suser_failed = false;
1357         error = vfs_suser(mp, td);
1358         if (jail_export && error != 0) {
1359                 error = 0;
1360                 vfs_suser_failed = true;
1361         }
1362         if (error != 0) {
1363                 vput(vp);
1364                 return (error);
1365         }
1366         if (vfs_busy(mp, MBF_NOWAIT)) {
1367                 vput(vp);
1368                 return (EBUSY);
1369         }
1370         VI_LOCK(vp);
1371         if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) {
1372                 VI_UNLOCK(vp);
1373                 vfs_unbusy(mp);
1374                 vput(vp);
1375                 return (EBUSY);
1376         }
1377         vp->v_iflag |= VI_MOUNT;
1378         VI_UNLOCK(vp);
1379         VOP_UNLOCK(vp);
1380
1381         rootvp = NULL;
1382         vfs_op_enter(mp);
1383         vn_seqc_write_begin(vp);
1384
1385         if (vfs_getopt(*optlist, "fsid", (void **)&fsid_up,
1386             &fsid_up_len) == 0) {
1387                 if (fsid_up_len != sizeof(*fsid_up)) {
1388                         error = EINVAL;
1389                         goto end;
1390                 }
1391                 if (fsidcmp(&fsid_up, &mp->mnt_stat.f_fsid) != 0) {
1392                         error = ENOENT;
1393                         goto end;
1394                 }
1395                 vfs_deleteopt(*optlist, "fsid");
1396         }
1397
1398         MNT_ILOCK(mp);
1399         if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
1400                 MNT_IUNLOCK(mp);
1401                 error = EBUSY;
1402                 goto end;
1403         }
1404         if (vfs_suser_failed) {
1405                 KASSERT((fsflags & (MNT_EXPORTED | MNT_UPDATE)) ==
1406                     (MNT_EXPORTED | MNT_UPDATE),
1407                     ("%s: jailed export did not set expected fsflags",
1408                      __func__));
1409                 /*
1410                  * For this case, only MNT_UPDATE and
1411                  * MNT_EXPORTED have been set in fsflags
1412                  * by the options.  Only set MNT_UPDATE,
1413                  * since that is the one that would be set
1414                  * when set in fsflags, below.
1415                  */
1416                 mp->mnt_flag |= MNT_UPDATE;
1417         } else {
1418                 mp->mnt_flag &= ~MNT_UPDATEMASK;
1419                 mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE |
1420                     MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY);
1421                 if ((mp->mnt_flag & MNT_ASYNC) == 0)
1422                         mp->mnt_kern_flag &= ~MNTK_ASYNC;
1423         }
1424         rootvp = vfs_cache_root_clear(mp);
1425         MNT_IUNLOCK(mp);
1426         mp->mnt_optnew = *optlist;
1427         vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
1428
1429         /*
1430          * Mount the filesystem.
1431          * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
1432          * get.  No freeing of cn_pnbuf.
1433          */
1434         /*
1435          * For the case of mountd(8) doing exports from within a vnet jail,
1436          * "from" is typically not set correctly such that VFS_MOUNT() will
1437          * return ENOENT. It is not obvious that VFS_MOUNT() ever needs to be
1438          * called when mountd is doing exports, but this check only applies to
1439          * the specific case where it is running inside a vnet jail, to
1440          * avoid any POLA violation.
1441          */
1442         error = 0;
1443         if (!jail_export)
1444                 error = VFS_MOUNT(mp);
1445
1446         export_error = 0;
1447         /* Process the export option. */
1448         if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp,
1449             &len) == 0) {
1450                 /* Assume that there is only 1 ABI for each length. */
1451                 switch (len) {
1452                 case (sizeof(struct oexport_args)):
1453                         bzero(&o2export, sizeof(o2export));
1454                         /* FALLTHROUGH */
1455                 case (sizeof(o2export)):
1456                         bcopy(bufp, &o2export, len);
1457                         export.ex_flags = (uint64_t)o2export.ex_flags;
1458                         export.ex_root = o2export.ex_root;
1459                         export.ex_uid = o2export.ex_anon.cr_uid;
1460                         export.ex_groups = NULL;
1461                         export.ex_ngroups = o2export.ex_anon.cr_ngroups;
1462                         if (export.ex_ngroups > 0) {
1463                                 if (export.ex_ngroups <= XU_NGROUPS) {
1464                                         export.ex_groups = malloc(
1465                                             export.ex_ngroups * sizeof(gid_t),
1466                                             M_TEMP, M_WAITOK);
1467                                         for (i = 0; i < export.ex_ngroups; i++)
1468                                                 export.ex_groups[i] =
1469                                                   o2export.ex_anon.cr_groups[i];
1470                                 } else
1471                                         export_error = EINVAL;
1472                         } else if (export.ex_ngroups < 0)
1473                                 export_error = EINVAL;
1474                         export.ex_addr = o2export.ex_addr;
1475                         export.ex_addrlen = o2export.ex_addrlen;
1476                         export.ex_mask = o2export.ex_mask;
1477                         export.ex_masklen = o2export.ex_masklen;
1478                         export.ex_indexfile = o2export.ex_indexfile;
1479                         export.ex_numsecflavors = o2export.ex_numsecflavors;
1480                         if (export.ex_numsecflavors < MAXSECFLAVORS) {
1481                                 for (i = 0; i < export.ex_numsecflavors; i++)
1482                                         export.ex_secflavors[i] =
1483                                             o2export.ex_secflavors[i];
1484                         } else
1485                                 export_error = EINVAL;
1486                         if (export_error == 0)
1487                                 export_error = vfs_export(mp, &export, true);
1488                         free(export.ex_groups, M_TEMP);
1489                         break;
1490                 case (sizeof(export)):
1491                         bcopy(bufp, &export, len);
1492                         grps = NULL;
1493                         if (export.ex_ngroups > 0) {
1494                                 if (export.ex_ngroups <= NGROUPS_MAX) {
1495                                         grps = malloc(export.ex_ngroups *
1496                                             sizeof(gid_t), M_TEMP, M_WAITOK);
1497                                         export_error = copyin(export.ex_groups,
1498                                             grps, export.ex_ngroups *
1499                                             sizeof(gid_t));
1500                                         if (export_error == 0)
1501                                                 export.ex_groups = grps;
1502                                 } else
1503                                         export_error = EINVAL;
1504                         } else if (export.ex_ngroups == 0)
1505                                 export.ex_groups = NULL;
1506                         else
1507                                 export_error = EINVAL;
1508                         if (export_error == 0)
1509                                 export_error = vfs_export(mp, &export, true);
1510                         free(grps, M_TEMP);
1511                         break;
1512                 default:
1513                         export_error = EINVAL;
1514                         break;
1515                 }
1516         }
1517
1518         MNT_ILOCK(mp);
1519         if (error == 0) {
1520                 mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE |
1521                     MNT_SNAPSHOT);
1522         } else {
1523                 /*
1524                  * If we fail, restore old mount flags. MNT_QUOTA is special,
1525                  * because it is not part of MNT_UPDATEMASK, but it could have
1526                  * changed in the meantime if quotactl(2) was called.
1527                  * All in all we want current value of MNT_QUOTA, not the old
1528                  * one.
1529                  */
1530                 mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
1531         }
1532         if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
1533             (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
1534                 mp->mnt_kern_flag |= MNTK_ASYNC;
1535         else
1536                 mp->mnt_kern_flag &= ~MNTK_ASYNC;
1537         MNT_IUNLOCK(mp);
1538
1539         if (error != 0)
1540                 goto end;
1541
1542         mount_devctl_event("REMOUNT", mp, true);
1543         if (mp->mnt_opt != NULL)
1544                 vfs_freeopts(mp->mnt_opt);
1545         mp->mnt_opt = mp->mnt_optnew;
1546         *optlist = NULL;
1547         (void)VFS_STATFS(mp, &mp->mnt_stat);
1548         /*
1549          * Prevent external consumers of mount options from reading
1550          * mnt_optnew.
1551          */
1552         mp->mnt_optnew = NULL;
1553
1554         if ((mp->mnt_flag & MNT_RDONLY) == 0)
1555                 vfs_allocate_syncvnode(mp);
1556         else
1557                 vfs_deallocate_syncvnode(mp);
1558 end:
1559         vfs_op_exit(mp);
1560         if (rootvp != NULL) {
1561                 vn_seqc_write_end(rootvp);
1562                 vrele(rootvp);
1563         }
1564         vn_seqc_write_end(vp);
1565         vfs_unbusy(mp);
1566         VI_LOCK(vp);
1567         vp->v_iflag &= ~VI_MOUNT;
1568         VI_UNLOCK(vp);
1569         vrele(vp);
1570         return (error != 0 ? error : export_error);
1571 }
1572
1573 /*
1574  * vfs_domount(): actually attempt a filesystem mount.
1575  */
1576 static int
1577 vfs_domount(
1578         struct thread *td,              /* Calling thread. */
1579         const char *fstype,             /* Filesystem type. */
1580         char *fspath,                   /* Mount path. */
1581         uint64_t fsflags,               /* Flags common to all filesystems. */
1582         bool jail_export,               /* Got export option in vnet prison. */
1583         struct vfsoptlist **optlist     /* Options local to the filesystem. */
1584         )
1585 {
1586         struct vfsconf *vfsp;
1587         struct nameidata nd;
1588         struct vnode *vp;
1589         char *pathbuf;
1590         int error;
1591
1592         /*
1593          * Be ultra-paranoid about making sure the type and fspath
1594          * variables will fit in our mp buffers, including the
1595          * terminating NUL.
1596          */
1597         if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
1598                 return (ENAMETOOLONG);
1599
1600         if (jail_export) {
1601                 error = priv_check(td, PRIV_NFS_DAEMON);
1602                 if (error)
1603                         return (error);
1604         } else if (jailed(td->td_ucred) || usermount == 0) {
1605                 if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
1606                         return (error);
1607         }
1608
1609         /*
1610          * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
1611          */
1612         if (fsflags & MNT_EXPORTED) {
1613                 error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
1614                 if (error)
1615                         return (error);
1616         }
1617         if (fsflags & MNT_SUIDDIR) {
1618                 error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
1619                 if (error)
1620                         return (error);
1621         }
1622         /*
1623          * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
1624          */
1625         if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
1626                 if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
1627                         fsflags |= MNT_NOSUID | MNT_USER;
1628         }
1629
1630         /* Load KLDs before we lock the covered vnode to avoid reversals. */
1631         vfsp = NULL;
1632         if ((fsflags & MNT_UPDATE) == 0) {
1633                 /* Don't try to load KLDs if we're mounting the root. */
1634                 if (fsflags & MNT_ROOTFS) {
1635                         if ((vfsp = vfs_byname(fstype)) == NULL)
1636                                 return (ENODEV);
1637                 } else {
1638                         if ((vfsp = vfs_byname_kld(fstype, td, &error)) == NULL)
1639                                 return (error);
1640                 }
1641         }
1642
1643         /*
1644          * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE.
1645          */
1646         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1 | WANTPARENT,
1647             UIO_SYSSPACE, fspath);
1648         error = namei(&nd);
1649         if (error != 0)
1650                 return (error);
1651         vp = nd.ni_vp;
1652         /*
1653          * Don't allow stacking file mounts to work around problems with the way
1654          * that namei sets nd.ni_dvp to vp_crossmp for these.
1655          */
1656         if (vp->v_type == VREG)
1657                 fsflags |= MNT_NOCOVER;
1658         if ((fsflags & MNT_UPDATE) == 0) {
1659                 if ((vp->v_vflag & VV_ROOT) != 0 &&
1660                     (fsflags & MNT_NOCOVER) != 0) {
1661                         vput(vp);
1662                         error = EBUSY;
1663                         goto out;
1664                 }
1665                 pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
1666                 strcpy(pathbuf, fspath);
1667                 /*
1668                  * Note: we allow any vnode type here. If the path sanity check
1669                  * succeeds, the type will be validated in vfs_domount_first
1670                  * above.
1671                  */
1672                 if (vp->v_type == VDIR)
1673                         error = vn_path_to_global_path(td, vp, pathbuf,
1674                             MNAMELEN);
1675                 else
1676                         error = vn_path_to_global_path_hardlink(td, vp,
1677                             nd.ni_dvp, pathbuf, MNAMELEN,
1678                             nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen);
1679                 if (error == 0) {
1680                         error = vfs_domount_first(td, vfsp, pathbuf, vp,
1681                             fsflags, optlist);
1682                 }
1683                 free(pathbuf, M_TEMP);
1684         } else
1685                 error = vfs_domount_update(td, vp, fsflags, jail_export,
1686                     optlist);
1687
1688 out:
1689         NDFREE_PNBUF(&nd);
1690         vrele(nd.ni_dvp);
1691
1692         return (error);
1693 }
1694
/*
 * Unmount a filesystem.
 *
 * Thin wrapper that forwards the caller's arguments to kern_unmount()
 * and returns its result.
 *
 * Note: unmount takes a path to the vnode mounted on as argument, not
 * special file (as before).
 */
#ifndef _SYS_SYSPROTO_H_
struct unmount_args {
	char	*path;		/* Handed to kern_unmount(), which copies it in. */
	int	flags;		/* Unmount flags, forwarded unchanged. */
};
#endif
/* ARGSUSED */
int
sys_unmount(struct thread *td, struct unmount_args *uap)
{
	int error;

	error = kern_unmount(td, uap->path, uap->flags);
	return (error);
}
1714
1715 int
1716 kern_unmount(struct thread *td, const char *path, int flags)
1717 {
1718         struct nameidata nd;
1719         struct mount *mp;
1720         char *fsidbuf, *pathbuf;
1721         fsid_t fsid;
1722         int error;
1723
1724         AUDIT_ARG_VALUE(flags);
1725         if (jailed(td->td_ucred) || usermount == 0) {
1726                 error = priv_check(td, PRIV_VFS_UNMOUNT);
1727                 if (error)
1728                         return (error);
1729         }
1730
1731         if (flags & MNT_BYFSID) {
1732                 fsidbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
1733                 error = copyinstr(path, fsidbuf, MNAMELEN, NULL);
1734                 if (error) {
1735                         free(fsidbuf, M_TEMP);
1736                         return (error);
1737                 }
1738
1739                 AUDIT_ARG_TEXT(fsidbuf);
1740                 /* Decode the filesystem ID. */
1741                 if (sscanf(fsidbuf, "FSID:%d:%d", &fsid.val[0], &fsid.val[1]) != 2) {
1742                         free(fsidbuf, M_TEMP);
1743                         return (EINVAL);
1744                 }
1745
1746                 mp = vfs_getvfs(&fsid);
1747                 free(fsidbuf, M_TEMP);
1748                 if (mp == NULL) {
1749                         return (ENOENT);
1750                 }
1751         } else {
1752                 pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
1753                 error = copyinstr(path, pathbuf, MNAMELEN, NULL);
1754                 if (error) {
1755                         free(pathbuf, M_TEMP);
1756                         return (error);
1757                 }
1758
1759                 /*
1760                  * Try to find global path for path argument.
1761                  */
1762                 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
1763                     UIO_SYSSPACE, pathbuf);
1764                 if (namei(&nd) == 0) {
1765                         NDFREE_PNBUF(&nd);
1766                         error = vn_path_to_global_path(td, nd.ni_vp, pathbuf,
1767                             MNAMELEN);
1768                         if (error == 0)
1769                                 vput(nd.ni_vp);
1770                 }
1771                 mtx_lock(&mountlist_mtx);
1772                 TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
1773                         if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) {
1774                                 vfs_ref(mp);
1775                                 break;
1776                         }
1777                 }
1778                 mtx_unlock(&mountlist_mtx);
1779                 free(pathbuf, M_TEMP);
1780                 if (mp == NULL) {
1781                         /*
1782                          * Previously we returned ENOENT for a nonexistent path and
1783                          * EINVAL for a non-mountpoint.  We cannot tell these apart
1784                          * now, so in the !MNT_BYFSID case return the more likely
1785                          * EINVAL for compatibility.
1786                          */
1787                         return (EINVAL);
1788                 }
1789         }
1790
1791         /*
1792          * Don't allow unmounting the root filesystem.
1793          */
1794         if (mp->mnt_flag & MNT_ROOTFS) {
1795                 vfs_rel(mp);
1796                 return (EINVAL);
1797         }
1798         error = dounmount(mp, flags, td);
1799         return (error);
1800 }
1801
1802 /*
1803  * Return error if any of the vnodes, ignoring the root vnode
1804  * and the syncer vnode, have non-zero usecount.
1805  *
1806  * This function is purely advisory - it can return false positives
1807  * and negatives.
1808  */
1809 static int
1810 vfs_check_usecounts(struct mount *mp)
1811 {
1812         struct vnode *vp, *mvp;
1813
1814         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
1815                 if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON &&
1816                     vp->v_usecount != 0) {
1817                         VI_UNLOCK(vp);
1818                         MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
1819                         return (EBUSY);
1820                 }
1821                 VI_UNLOCK(vp);
1822         }
1823
1824         return (0);
1825 }
1826
1827 static void
1828 dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags)
1829 {
1830
1831         mtx_assert(MNT_MTX(mp), MA_OWNED);
1832         mp->mnt_kern_flag &= ~mntkflags;
1833         if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) {
1834                 mp->mnt_kern_flag &= ~MNTK_MWAIT;
1835                 wakeup(mp);
1836         }
1837         vfs_op_exit_locked(mp);
1838         MNT_IUNLOCK(mp);
1839         if (coveredvp != NULL) {
1840                 VOP_UNLOCK(coveredvp);
1841                 vdrop(coveredvp);
1842         }
1843         vn_finished_write(mp);
1844         vfs_rel(mp);
1845 }
1846
1847 /*
1848  * There are various reference counters associated with the mount point.
1849  * Normally it is permitted to modify them without taking the mnt ilock,
1850  * but this behavior can be temporarily disabled if stable value is needed
1851  * or callers are expected to block (e.g. to not allow new users during
1852  * forced unmount).
1853  */
1854 void
1855 vfs_op_enter(struct mount *mp)
1856 {
1857         struct mount_pcpu *mpcpu;
1858         int cpu;
1859
1860         MNT_ILOCK(mp);
1861         mp->mnt_vfs_ops++;
1862         if (mp->mnt_vfs_ops > 1) {
1863                 MNT_IUNLOCK(mp);
1864                 return;
1865         }
1866         vfs_op_barrier_wait(mp);
1867         CPU_FOREACH(cpu) {
1868                 mpcpu = vfs_mount_pcpu_remote(mp, cpu);
1869
1870                 mp->mnt_ref += mpcpu->mntp_ref;
1871                 mpcpu->mntp_ref = 0;
1872
1873                 mp->mnt_lockref += mpcpu->mntp_lockref;
1874                 mpcpu->mntp_lockref = 0;
1875
1876                 mp->mnt_writeopcount += mpcpu->mntp_writeopcount;
1877                 mpcpu->mntp_writeopcount = 0;
1878         }
1879         MPASSERT(mp->mnt_ref > 0 && mp->mnt_lockref >= 0 &&
1880             mp->mnt_writeopcount >= 0, mp,
1881             ("invalid count(s): ref %d lockref %d writeopcount %d",
1882             mp->mnt_ref, mp->mnt_lockref, mp->mnt_writeopcount));
1883         MNT_IUNLOCK(mp);
1884         vfs_assert_mount_counters(mp);
1885 }
1886
1887 void
1888 vfs_op_exit_locked(struct mount *mp)
1889 {
1890
1891         mtx_assert(MNT_MTX(mp), MA_OWNED);
1892
1893         MPASSERT(mp->mnt_vfs_ops > 0, mp,
1894             ("invalid vfs_ops count %d", mp->mnt_vfs_ops));
1895         MPASSERT(mp->mnt_vfs_ops > 1 ||
1896             (mp->mnt_kern_flag & (MNTK_UNMOUNT | MNTK_SUSPEND)) == 0, mp,
1897             ("vfs_ops too low %d in unmount or suspend", mp->mnt_vfs_ops));
1898         mp->mnt_vfs_ops--;
1899 }
1900
/*
 * Unlocked variant of vfs_op_exit_locked(): takes the mount interlock,
 * drops one vfs_ops count and releases the interlock again.
 */
void
vfs_op_exit(struct mount *mp)
{

	MNT_ILOCK(mp);
	vfs_op_exit_locked(mp);
	MNT_IUNLOCK(mp);
}
1909
/*
 * Argument bundle for the callbacks used by vfs_op_barrier_wait().  A
 * pointer to the embedded 'srcra' member is what gets passed to
 * smp_rendezvous_cpus_retry(); the callbacks recover the enclosing
 * structure from it with __containerof().
 */
1910 struct vfs_op_barrier_ipi {
             /* Mount the barrier is run for; set by vfs_op_barrier_wait(). */
1911         struct mount *mp;
             /* Handle passed to smp_rendezvous_cpus_retry() and its callbacks. */
1912         struct smp_rendezvous_cpus_retry_arg srcra;
1913 };
1914
1915 static void
1916 vfs_op_action_func(void *arg)
1917 {
1918         struct vfs_op_barrier_ipi *vfsopipi;
1919         struct mount *mp;
1920
1921         vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra);
1922         mp = vfsopipi->mp;
1923
1924         if (!vfs_op_thread_entered(mp))
1925                 smp_rendezvous_cpus_done(arg);
1926 }
1927
1928 static void
1929 vfs_op_wait_func(void *arg, int cpu)
1930 {
1931         struct vfs_op_barrier_ipi *vfsopipi;
1932         struct mount *mp;
1933         struct mount_pcpu *mpcpu;
1934
1935         vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra);
1936         mp = vfsopipi->mp;
1937
1938         mpcpu = vfs_mount_pcpu_remote(mp, cpu);
1939         while (atomic_load_int(&mpcpu->mntp_thread_in_ops))
1940                 cpu_spinwait();
1941 }
1942
1943 void
1944 vfs_op_barrier_wait(struct mount *mp)
1945 {
1946         struct vfs_op_barrier_ipi vfsopipi;
1947
1948         vfsopipi.mp = mp;
1949
1950         smp_rendezvous_cpus_retry(all_cpus,
1951             smp_no_rendezvous_barrier,
1952             vfs_op_action_func,
1953             smp_no_rendezvous_barrier,
1954             vfs_op_wait_func,
1955             &vfsopipi.srcra);
1956 }
1957
1958 #ifdef DIAGNOSTIC
1959 void
1960 vfs_assert_mount_counters(struct mount *mp)
1961 {
1962         struct mount_pcpu *mpcpu;
1963         int cpu;
1964
1965         if (mp->mnt_vfs_ops == 0)
1966                 return;
1967
1968         CPU_FOREACH(cpu) {
1969                 mpcpu = vfs_mount_pcpu_remote(mp, cpu);
1970                 if (mpcpu->mntp_ref != 0 ||
1971                     mpcpu->mntp_lockref != 0 ||
1972                     mpcpu->mntp_writeopcount != 0)
1973                         vfs_dump_mount_counters(mp);
1974         }
1975 }
1976
1977 void
1978 vfs_dump_mount_counters(struct mount *mp)
1979 {
1980         struct mount_pcpu *mpcpu;
1981         int ref, lockref, writeopcount;
1982         int cpu;
1983
1984         printf("%s: mp %p vfs_ops %d\n", __func__, mp, mp->mnt_vfs_ops);
1985
1986         printf("        ref : ");
1987         ref = mp->mnt_ref;
1988         CPU_FOREACH(cpu) {
1989                 mpcpu = vfs_mount_pcpu_remote(mp, cpu);
1990                 printf("%d ", mpcpu->mntp_ref);
1991                 ref += mpcpu->mntp_ref;
1992         }
1993         printf("\n");
1994         printf("    lockref : ");
1995         lockref = mp->mnt_lockref;
1996         CPU_FOREACH(cpu) {
1997                 mpcpu = vfs_mount_pcpu_remote(mp, cpu);
1998                 printf("%d ", mpcpu->mntp_lockref);
1999                 lockref += mpcpu->mntp_lockref;
2000         }
2001         printf("\n");
2002         printf("writeopcount: ");
2003         writeopcount = mp->mnt_writeopcount;
2004         CPU_FOREACH(cpu) {
2005                 mpcpu = vfs_mount_pcpu_remote(mp, cpu);
2006                 printf("%d ", mpcpu->mntp_writeopcount);
2007                 writeopcount += mpcpu->mntp_writeopcount;
2008         }
2009         printf("\n");
2010
2011         printf("counter       struct total\n");
2012         printf("ref             %-5d  %-5d\n", mp->mnt_ref, ref);
2013         printf("lockref         %-5d  %-5d\n", mp->mnt_lockref, lockref);
2014         printf("writeopcount    %-5d  %-5d\n", mp->mnt_writeopcount, writeopcount);
2015
2016         panic("invalid counts on struct mount");
2017 }
2018 #endif
2019
2020 int
2021 vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which)
2022 {
2023         struct mount_pcpu *mpcpu;
2024         int cpu, sum;
2025
2026         switch (which) {
2027         case MNT_COUNT_REF:
2028                 sum = mp->mnt_ref;
2029                 break;
2030         case MNT_COUNT_LOCKREF:
2031                 sum = mp->mnt_lockref;
2032                 break;
2033         case MNT_COUNT_WRITEOPCOUNT:
2034                 sum = mp->mnt_writeopcount;
2035                 break;
2036         }
2037
2038         CPU_FOREACH(cpu) {
2039                 mpcpu = vfs_mount_pcpu_remote(mp, cpu);
2040                 switch (which) {
2041                 case MNT_COUNT_REF:
2042                         sum += mpcpu->mntp_ref;
2043                         break;
2044                 case MNT_COUNT_LOCKREF:
2045                         sum += mpcpu->mntp_lockref;
2046                         break;
2047                 case MNT_COUNT_WRITEOPCOUNT:
2048                         sum += mpcpu->mntp_writeopcount;
2049                         break;
2050                 }
2051         }
2052         return (sum);
2053 }
2054
2055 static bool
2056 deferred_unmount_enqueue(struct mount *mp, uint64_t flags, bool requeue,
2057     int timeout_ticks)
2058 {
2059         bool enqueued;
2060
2061         enqueued = false;
2062         mtx_lock(&deferred_unmount_lock);
2063         if ((mp->mnt_taskqueue_flags & MNT_DEFERRED) == 0 || requeue) {
2064                 mp->mnt_taskqueue_flags = flags | MNT_DEFERRED;
2065                 STAILQ_INSERT_TAIL(&deferred_unmount_list, mp,
2066                     mnt_taskqueue_link);
2067                 enqueued = true;
2068         }
2069         mtx_unlock(&deferred_unmount_lock);
2070
2071         if (enqueued) {
2072                 taskqueue_enqueue_timeout(taskqueue_deferred_unmount,
2073                     &deferred_unmount_task, timeout_ticks);
2074         }
2075
2076         return (enqueued);
2077 }
2078
2079 /*
2080  * Taskqueue handler for processing async/recursive unmounts
2081  */
2082 static void
2083 vfs_deferred_unmount(void *argi __unused, int pending __unused)
2084 {
2085         STAILQ_HEAD(, mount) local_unmounts;
2086         uint64_t flags;
2087         struct mount *mp, *tmp;
2088         int error;
2089         unsigned int retries;
2090         bool unmounted;
2091
2092         STAILQ_INIT(&local_unmounts);
2093         mtx_lock(&deferred_unmount_lock);
2094         STAILQ_CONCAT(&local_unmounts, &deferred_unmount_list);
2095         mtx_unlock(&deferred_unmount_lock);
2096
2097         STAILQ_FOREACH_SAFE(mp, &local_unmounts, mnt_taskqueue_link, tmp) {
2098                 flags = mp->mnt_taskqueue_flags;
2099                 KASSERT((flags & MNT_DEFERRED) != 0,
2100                     ("taskqueue unmount without MNT_DEFERRED"));
2101                 error = dounmount(mp, flags, curthread);
2102                 if (error != 0) {
2103                         MNT_ILOCK(mp);
2104                         unmounted = ((mp->mnt_kern_flag & MNTK_REFEXPIRE) != 0);
2105                         MNT_IUNLOCK(mp);
2106
2107                         /*
2108                          * The deferred unmount thread is the only thread that
2109                          * modifies the retry counts, so locking/atomics aren't
2110                          * needed here.
2111                          */
2112                         retries = (mp->mnt_unmount_retries)++;
2113                         deferred_unmount_total_retries++;
2114                         if (!unmounted && retries < deferred_unmount_retry_limit) {
2115                                 deferred_unmount_enqueue(mp, flags, true,
2116                                     -deferred_unmount_retry_delay_hz);
2117                         } else {
2118                                 if (retries >= deferred_unmount_retry_limit) {
2119                                         printf("giving up on deferred unmount "
2120                                             "of %s after %d retries, error %d\n",
2121                                             mp->mnt_stat.f_mntonname, retries, error);
2122                                 }
2123                                 vfs_rel(mp);
2124                         }
2125                 }
2126         }
2127 }
2128
2129 /*
2130  * Do the actual filesystem unmount.
2131  */
2132 int
2133 dounmount(struct mount *mp, uint64_t flags, struct thread *td)
2134 {
2135         struct mount_upper_node *upper;
2136         struct vnode *coveredvp, *rootvp;
2137         int error;
2138         uint64_t async_flag;
2139         int mnt_gen_r;
2140         unsigned int retries;
2141
2142         KASSERT((flags & MNT_DEFERRED) == 0 ||
2143             (flags & (MNT_RECURSE | MNT_FORCE)) == (MNT_RECURSE | MNT_FORCE),
2144             ("MNT_DEFERRED requires MNT_RECURSE | MNT_FORCE"));
2145
2146         /*
2147          * If the caller has explicitly requested the unmount to be handled by
2148          * the taskqueue and we're not already in taskqueue context, queue
2149          * up the unmount request and exit.  This is done prior to any
2150          * credential checks; MNT_DEFERRED should be used only for kernel-
2151          * initiated unmounts and will therefore be processed with the
2152          * (kernel) credentials of the taskqueue thread.  Still, callers
2153          * should be sure this is the behavior they want.
2154          */
2155         if ((flags & MNT_DEFERRED) != 0 &&
2156             taskqueue_member(taskqueue_deferred_unmount, curthread) == 0) {
2157                 if (!deferred_unmount_enqueue(mp, flags, false, 0))
2158                         vfs_rel(mp);
2159                 return (EINPROGRESS);
2160         }
2161
2162         /*
2163          * Only privileged root, or (if MNT_USER is set) the user that did the
2164          * original mount is permitted to unmount this filesystem.
2165          * This check should be made prior to queueing up any recursive
2166          * unmounts of upper filesystems.  Those unmounts will be executed
2167          * with kernel thread credentials and are expected to succeed, so
2168          * we must at least ensure the originating context has sufficient
2169          * privilege to unmount the base filesystem before proceeding with
2170          * the uppers.
2171          */
2172         error = vfs_suser(mp, td);
2173         if (error != 0) {
2174                 KASSERT((flags & MNT_DEFERRED) == 0,
2175                     ("taskqueue unmount with insufficient privilege"));
2176                 vfs_rel(mp);
2177                 return (error);
2178         }
2179
2180         if (recursive_forced_unmount && ((flags & MNT_FORCE) != 0))
2181                 flags |= MNT_RECURSE;
2182
2183         if ((flags & MNT_RECURSE) != 0) {
2184                 KASSERT((flags & MNT_FORCE) != 0,
2185                     ("MNT_RECURSE requires MNT_FORCE"));
2186
2187                 MNT_ILOCK(mp);
2188                 /*
2189                  * Set MNTK_RECURSE to prevent new upper mounts from being
2190                  * added, and note that an operation on the uppers list is in
2191                  * progress.  This will ensure that unregistration from the
2192                  * uppers list, and therefore any pending unmount of the upper
2193                  * FS, can't complete until after we finish walking the list.
2194                  */
2195                 mp->mnt_kern_flag |= MNTK_RECURSE;
2196                 mp->mnt_upper_pending++;
2197                 TAILQ_FOREACH(upper, &mp->mnt_uppers, mnt_upper_link) {
2198                         retries = upper->mp->mnt_unmount_retries;
2199                         if (retries > deferred_unmount_retry_limit) {
2200                                 error = EBUSY;
2201                                 continue;
2202                         }
2203                         MNT_IUNLOCK(mp);
2204
2205                         vfs_ref(upper->mp);
2206                         if (!deferred_unmount_enqueue(upper->mp, flags,
2207                             false, 0))
2208                                 vfs_rel(upper->mp);
2209                         MNT_ILOCK(mp);
2210                 }
2211                 mp->mnt_upper_pending--;
2212                 if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 &&
2213                     mp->mnt_upper_pending == 0) {
2214                         mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER;
2215                         wakeup(&mp->mnt_uppers);
2216                 }
2217
2218                 /*
2219                  * If we're not on the taskqueue, wait until the uppers list
2220                  * is drained before proceeding with unmount.  Otherwise, if
2221                  * we are on the taskqueue and there are still pending uppers,
2222                  * just re-enqueue on the end of the taskqueue.
2223                  */
2224                 if ((flags & MNT_DEFERRED) == 0) {
2225                         while (error == 0 && !TAILQ_EMPTY(&mp->mnt_uppers)) {
2226                                 mp->mnt_kern_flag |= MNTK_TASKQUEUE_WAITER;
2227                                 error = msleep(&mp->mnt_taskqueue_link,
2228                                     MNT_MTX(mp), PCATCH, "umntqw", 0);
2229                         }
2230                         if (error != 0) {
2231                                 MNT_REL(mp);
2232                                 MNT_IUNLOCK(mp);
2233                                 return (error);
2234                         }
2235                 } else if (!TAILQ_EMPTY(&mp->mnt_uppers)) {
2236                         MNT_IUNLOCK(mp);
2237                         if (error == 0)
2238                                 deferred_unmount_enqueue(mp, flags, true, 0);
2239                         return (error);
2240                 }
2241                 MNT_IUNLOCK(mp);
2242                 KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers not empty"));
2243         }
2244
2245         /* Allow the taskqueue to safely re-enqueue on failure */
2246         if ((flags & MNT_DEFERRED) != 0)
2247                 vfs_ref(mp);
2248
2249         if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
2250                 mnt_gen_r = mp->mnt_gen;
2251                 VI_LOCK(coveredvp);
2252                 vholdl(coveredvp);
2253                 vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
2254                 /*
2255                  * Check for mp being unmounted while waiting for the
2256                  * covered vnode lock.
2257                  */
2258                 if (coveredvp->v_mountedhere != mp ||
2259                     coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
2260                         VOP_UNLOCK(coveredvp);
2261                         vdrop(coveredvp);
2262                         vfs_rel(mp);
2263                         return (EBUSY);
2264                 }
2265         }
2266
2267         vfs_op_enter(mp);
2268
2269         vn_start_write(NULL, &mp, V_WAIT);
2270         MNT_ILOCK(mp);
2271         if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 ||
2272             (mp->mnt_flag & MNT_UPDATE) != 0 ||
2273             !TAILQ_EMPTY(&mp->mnt_uppers)) {
2274                 dounmount_cleanup(mp, coveredvp, 0);
2275                 return (EBUSY);
2276         }
2277         mp->mnt_kern_flag |= MNTK_UNMOUNT;
2278         rootvp = vfs_cache_root_clear(mp);
2279         if (coveredvp != NULL)
2280                 vn_seqc_write_begin(coveredvp);
2281         if (flags & MNT_NONBUSY) {
2282                 MNT_IUNLOCK(mp);
2283                 error = vfs_check_usecounts(mp);
2284                 MNT_ILOCK(mp);
2285                 if (error != 0) {
2286                         vn_seqc_write_end(coveredvp);
2287                         dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT);
2288                         if (rootvp != NULL) {
2289                                 vn_seqc_write_end(rootvp);
2290                                 vrele(rootvp);
2291                         }
2292                         return (error);
2293                 }
2294         }
2295         /* Allow filesystems to detect that a forced unmount is in progress. */
2296         if (flags & MNT_FORCE) {
2297                 mp->mnt_kern_flag |= MNTK_UNMOUNTF;
2298                 MNT_IUNLOCK(mp);
2299                 /*
2300                  * Must be done after setting MNTK_UNMOUNTF and before
2301                  * waiting for mnt_lockref to become 0.
2302                  */
2303                 VFS_PURGE(mp);
2304                 MNT_ILOCK(mp);
2305         }
2306         error = 0;
2307         if (mp->mnt_lockref) {
2308                 mp->mnt_kern_flag |= MNTK_DRAINING;
2309                 error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
2310                     "mount drain", 0);
2311         }
2312         MNT_IUNLOCK(mp);
2313         KASSERT(mp->mnt_lockref == 0,
2314             ("%s: invalid lock refcount in the drain path @ %s:%d",
2315             __func__, __FILE__, __LINE__));
2316         KASSERT(error == 0,
2317             ("%s: invalid return value for msleep in the drain path @ %s:%d",
2318             __func__, __FILE__, __LINE__));
2319
2320         /*
2321          * We want to keep the vnode around so that we can vn_seqc_write_end
2322          * after we are done with unmount. Downgrade our reference to a mere
2323          * hold count so that we don't interefere with anything.
2324          */
2325         if (rootvp != NULL) {
2326                 vhold(rootvp);
2327                 vrele(rootvp);
2328         }
2329
2330         if (mp->mnt_flag & MNT_EXPUBLIC)
2331                 vfs_setpublicfs(NULL, NULL, NULL);
2332
2333         vfs_periodic(mp, MNT_WAIT);
2334         MNT_ILOCK(mp);
2335         async_flag = mp->mnt_flag & MNT_ASYNC;
2336         mp->mnt_flag &= ~MNT_ASYNC;
2337         mp->mnt_kern_flag &= ~MNTK_ASYNC;
2338         MNT_IUNLOCK(mp);
2339         vfs_deallocate_syncvnode(mp);
2340         error = VFS_UNMOUNT(mp, flags);
2341         vn_finished_write(mp);
2342         vfs_rel(mp);
2343         /*
2344          * If we failed to flush the dirty blocks for this mount point,
2345          * undo all the cdir/rdir and rootvnode changes we made above.
2346          * Unless we failed to do so because the device is reporting that
2347          * it doesn't exist anymore.
2348          */
2349         if (error && error != ENXIO) {
2350                 MNT_ILOCK(mp);
2351                 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2352                         MNT_IUNLOCK(mp);
2353                         vfs_allocate_syncvnode(mp);
2354                         MNT_ILOCK(mp);
2355                 }
2356                 mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
2357                 mp->mnt_flag |= async_flag;
2358                 if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
2359                     (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
2360                         mp->mnt_kern_flag |= MNTK_ASYNC;
2361                 if (mp->mnt_kern_flag & MNTK_MWAIT) {
2362                         mp->mnt_kern_flag &= ~MNTK_MWAIT;
2363                         wakeup(mp);
2364                 }
2365                 vfs_op_exit_locked(mp);
2366                 MNT_IUNLOCK(mp);
2367                 if (coveredvp) {
2368                         vn_seqc_write_end(coveredvp);
2369                         VOP_UNLOCK(coveredvp);
2370                         vdrop(coveredvp);
2371                 }
2372                 if (rootvp != NULL) {
2373                         vn_seqc_write_end(rootvp);
2374                         vdrop(rootvp);
2375                 }
2376                 return (error);
2377         }
2378
2379         mtx_lock(&mountlist_mtx);
2380         TAILQ_REMOVE(&mountlist, mp, mnt_list);
2381         mtx_unlock(&mountlist_mtx);
2382         EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td);
2383         if (coveredvp != NULL) {
2384                 VI_LOCK(coveredvp);
2385                 vn_irflag_unset_locked(coveredvp, VIRF_MOUNTPOINT);
2386                 coveredvp->v_mountedhere = NULL;
2387                 vn_seqc_write_end_locked(coveredvp);
2388                 VI_UNLOCK(coveredvp);
2389                 VOP_UNLOCK(coveredvp);
2390                 vdrop(coveredvp);
2391         }
2392         mount_devctl_event("UNMOUNT", mp, false);
2393         if (rootvp != NULL) {
2394                 vn_seqc_write_end(rootvp);
2395                 vdrop(rootvp);
2396         }
2397         vfs_event_signal(NULL, VQ_UNMOUNT, 0);
2398         if (rootvnode != NULL && mp == rootvnode->v_mount) {
2399                 vrele(rootvnode);
2400                 rootvnode = NULL;
2401         }
2402         if (mp == rootdevmp)
2403                 rootdevmp = NULL;
2404         if ((flags & MNT_DEFERRED) != 0)
2405                 vfs_rel(mp);
2406         vfs_mount_destroy(mp);
2407         return (0);
2408 }
2409
2410 /*
2411  * Report errors during filesystem mounting.
2412  */
2413 void
2414 vfs_mount_error(struct mount *mp, const char *fmt, ...)
2415 {
2416         struct vfsoptlist *moptlist = mp->mnt_optnew;
2417         va_list ap;
2418         int error, len;
2419         char *errmsg;
2420
2421         error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
2422         if (error || errmsg == NULL || len <= 0)
2423                 return;
2424
2425         va_start(ap, fmt);
2426         vsnprintf(errmsg, (size_t)len, fmt, ap);
2427         va_end(ap);
2428 }
2429
2430 void
2431 vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...)
2432 {
2433         va_list ap;
2434         int error, len;
2435         char *errmsg;
2436
2437         error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len);
2438         if (error || errmsg == NULL || len <= 0)
2439                 return;
2440
2441         va_start(ap, fmt);
2442         vsnprintf(errmsg, (size_t)len, fmt, ap);
2443         va_end(ap);
2444 }
2445
2446 /*
2447  * ---------------------------------------------------------------------
2448  * Functions for querying mount options/arguments from filesystems.
2449  */
2450
2451 /*
2452  * Check that no unknown options are given
2453  */
2454 int
2455 vfs_filteropt(struct vfsoptlist *opts, const char **legal)
2456 {
2457         struct vfsopt *opt;
2458         char errmsg[255];
2459         const char **t, *p, *q;
2460         int ret = 0;
2461
2462         TAILQ_FOREACH(opt, opts, link) {
2463                 p = opt->name;
2464                 q = NULL;
2465                 if (p[0] == 'n' && p[1] == 'o')
2466                         q = p + 2;
2467                 for(t = global_opts; *t != NULL; t++) {
2468                         if (strcmp(*t, p) == 0)
2469                                 break;
2470                         if (q != NULL) {
2471                                 if (strcmp(*t, q) == 0)
2472                                         break;
2473                         }
2474                 }
2475                 if (*t != NULL)
2476                         continue;
2477                 for(t = legal; *t != NULL; t++) {
2478                         if (strcmp(*t, p) == 0)
2479                                 break;
2480                         if (q != NULL) {
2481                                 if (strcmp(*t, q) == 0)
2482                                         break;
2483                         }
2484                 }
2485                 if (*t != NULL)
2486                         continue;
2487                 snprintf(errmsg, sizeof(errmsg),
2488                     "mount option <%s> is unknown", p);
2489                 ret = EINVAL;
2490         }
2491         if (ret != 0) {
2492                 TAILQ_FOREACH(opt, opts, link) {
2493                         if (strcmp(opt->name, "errmsg") == 0) {
2494                                 strncpy((char *)opt->value, errmsg, opt->len);
2495                                 break;
2496                         }
2497                 }
2498                 if (opt == NULL)
2499                         printf("%s\n", errmsg);
2500         }
2501         return (ret);
2502 }
2503
2504 /*
2505  * Get a mount option by its name.
2506  *
2507  * Return 0 if the option was found, ENOENT otherwise.
2508  * If len is non-NULL it will be filled with the length
2509  * of the option. If buf is non-NULL, it will be filled
2510  * with the address of the option.
2511  */
2512 int
2513 vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len)
2514 {
2515         struct vfsopt *opt;
2516
2517         KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
2518
2519         TAILQ_FOREACH(opt, opts, link) {
2520                 if (strcmp(name, opt->name) == 0) {
2521                         opt->seen = 1;
2522                         if (len != NULL)
2523                                 *len = opt->len;
2524                         if (buf != NULL)
2525                                 *buf = opt->value;
2526                         return (0);
2527                 }
2528         }
2529         return (ENOENT);
2530 }
2531
2532 int
2533 vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
2534 {
2535         struct vfsopt *opt;
2536
2537         if (opts == NULL)
2538                 return (-1);
2539
2540         TAILQ_FOREACH(opt, opts, link) {
2541                 if (strcmp(name, opt->name) == 0) {
2542                         opt->seen = 1;
2543                         return (opt->pos);
2544                 }
2545         }
2546         return (-1);
2547 }
2548
2549 int
2550 vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value)
2551 {
2552         char *opt_value, *vtp;
2553         quad_t iv;
2554         int error, opt_len;
2555
2556         error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len);
2557         if (error != 0)
2558                 return (error);
2559         if (opt_len == 0 || opt_value == NULL)
2560                 return (EINVAL);
2561         if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0')
2562                 return (EINVAL);
2563         iv = strtoq(opt_value, &vtp, 0);
2564         if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0'))
2565                 return (EINVAL);
2566         if (iv < 0)
2567                 return (EINVAL);
2568         switch (vtp[0]) {
2569         case 't': case 'T':
2570                 iv *= 1024;
2571                 /* FALLTHROUGH */
2572         case 'g': case 'G':
2573                 iv *= 1024;
2574                 /* FALLTHROUGH */
2575         case 'm': case 'M':
2576                 iv *= 1024;
2577                 /* FALLTHROUGH */
2578         case 'k': case 'K':
2579                 iv *= 1024;
2580         case '\0':
2581                 break;
2582         default:
2583                 return (EINVAL);
2584         }
2585         *value = iv;
2586
2587         return (0);
2588 }
2589
2590 char *
2591 vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
2592 {
2593         struct vfsopt *opt;
2594
2595         *error = 0;
2596         TAILQ_FOREACH(opt, opts, link) {
2597                 if (strcmp(name, opt->name) != 0)
2598                         continue;
2599                 opt->seen = 1;
2600                 if (opt->len == 0 ||
2601                     ((char *)opt->value)[opt->len - 1] != '\0') {
2602                         *error = EINVAL;
2603                         return (NULL);
2604                 }
2605                 return (opt->value);
2606         }
2607         *error = ENOENT;
2608         return (NULL);
2609 }
2610
2611 int
2612 vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w,
2613         uint64_t val)
2614 {
2615         struct vfsopt *opt;
2616
2617         TAILQ_FOREACH(opt, opts, link) {
2618                 if (strcmp(name, opt->name) == 0) {
2619                         opt->seen = 1;
2620                         if (w != NULL)
2621                                 *w |= val;
2622                         return (1);
2623                 }
2624         }
2625         if (w != NULL)
2626                 *w &= ~val;
2627         return (0);
2628 }
2629
2630 int
2631 vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
2632 {
2633         va_list ap;
2634         struct vfsopt *opt;
2635         int ret;
2636
2637         KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
2638
2639         TAILQ_FOREACH(opt, opts, link) {
2640                 if (strcmp(name, opt->name) != 0)
2641                         continue;
2642                 opt->seen = 1;
2643                 if (opt->len == 0 || opt->value == NULL)
2644                         return (0);
2645                 if (((char *)opt->value)[opt->len - 1] != '\0')
2646                         return (0);
2647                 va_start(ap, fmt);
2648                 ret = vsscanf(opt->value, fmt, ap);
2649                 va_end(ap);
2650                 return (ret);
2651         }
2652         return (0);
2653 }
2654
2655 int
2656 vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len)
2657 {
2658         struct vfsopt *opt;
2659
2660         TAILQ_FOREACH(opt, opts, link) {
2661                 if (strcmp(name, opt->name) != 0)
2662                         continue;
2663                 opt->seen = 1;
2664                 if (opt->value == NULL)
2665                         opt->len = len;
2666                 else {
2667                         if (opt->len != len)
2668                                 return (EINVAL);
2669                         bcopy(value, opt->value, len);
2670                 }
2671                 return (0);
2672         }
2673         return (ENOENT);
2674 }
2675
2676 int
2677 vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len)
2678 {
2679         struct vfsopt *opt;
2680
2681         TAILQ_FOREACH(opt, opts, link) {
2682                 if (strcmp(name, opt->name) != 0)
2683                         continue;
2684                 opt->seen = 1;
2685                 if (opt->value == NULL)
2686                         opt->len = len;
2687                 else {
2688                         if (opt->len < len)
2689                                 return (EINVAL);
2690                         opt->len = len;
2691                         bcopy(value, opt->value, len);
2692                 }
2693                 return (0);
2694         }
2695         return (ENOENT);
2696 }
2697
2698 int
2699 vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value)
2700 {
2701         struct vfsopt *opt;
2702
2703         TAILQ_FOREACH(opt, opts, link) {
2704                 if (strcmp(name, opt->name) != 0)
2705                         continue;
2706                 opt->seen = 1;
2707                 if (opt->value == NULL)
2708                         opt->len = strlen(value) + 1;
2709                 else if (strlcpy(opt->value, value, opt->len) >= opt->len)
2710                         return (EINVAL);
2711                 return (0);
2712         }
2713         return (ENOENT);
2714 }
2715
2716 /*
2717  * Find and copy a mount option.
2718  *
2719  * The size of the buffer has to be specified
2720  * in len, if it is not the same length as the
2721  * mount option, EINVAL is returned.
2722  * Returns ENOENT if the option is not found.
2723  */
2724 int
2725 vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len)
2726 {
2727         struct vfsopt *opt;
2728
2729         KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
2730
2731         TAILQ_FOREACH(opt, opts, link) {
2732                 if (strcmp(name, opt->name) == 0) {
2733                         opt->seen = 1;
2734                         if (len != opt->len)
2735                                 return (EINVAL);
2736                         bcopy(opt->value, dest, opt->len);
2737                         return (0);
2738                 }
2739         }
2740         return (ENOENT);
2741 }
2742
2743 int
2744 __vfs_statfs(struct mount *mp, struct statfs *sbp)
2745 {
2746         /*
2747          * Filesystems only fill in part of the structure for updates, we
2748          * have to read the entirety first to get all content.
2749          */
2750         if (sbp != &mp->mnt_stat)
2751                 memcpy(sbp, &mp->mnt_stat, sizeof(*sbp));
2752
2753         /*
2754          * Set these in case the underlying filesystem fails to do so.
2755          */
2756         sbp->f_version = STATFS_VERSION;
2757         sbp->f_namemax = NAME_MAX;
2758         sbp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2759         sbp->f_nvnodelistsize = mp->mnt_nvnodelistsize;
2760
2761         return (mp->mnt_op->vfs_statfs(mp, sbp));
2762 }
2763
2764 void
2765 vfs_mountedfrom(struct mount *mp, const char *from)
2766 {
2767
2768         bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
2769         strlcpy(mp->mnt_stat.f_mntfromname, from,
2770             sizeof mp->mnt_stat.f_mntfromname);
2771 }
2772
2773 /*
2774  * ---------------------------------------------------------------------
2775  * This is the api for building mount args and mounting filesystems from
2776  * inside the kernel.
2777  *
2778  * The API works by accumulation of individual args.  First error is
2779  * latched.
2780  *
2781  * XXX: should be documented in new manpage kernel_mount(9)
2782  */
2783
/*
 * A memory allocation which must be freed when we are done.
 *
 * The structure itself is only the list linkage.  The functions in this
 * file that create one allocate sizeof(struct mntaarg) plus the payload
 * size and use the bytes directly after the header (maa + 1) as the
 * payload.
 */
struct mntaarg {
        SLIST_ENTRY(mntaarg)    next;   /* link in the owning mntarg's list */
};
2788
/*
 * The header for the mount arguments.
 *
 * Arguments accumulate as consecutive iovec entries: the mount_arg*()
 * functions append one entry for the name followed by one for the value.
 */
struct mntarg {
        struct iovec *v;                /* argument vector, grown with realloc() */
        int len;                        /* number of iovec entries used in v */
        int error;                      /* latched error; once set, adds are skipped */
        SLIST_HEAD(, mntaarg)   list;   /* allocations to release with the header */
};
2796
2797 /*
2798  * Add a boolean argument.
2799  *
2800  * flag is the boolean value.
2801  * name must start with "no".
2802  */
2803 struct mntarg *
2804 mount_argb(struct mntarg *ma, int flag, const char *name)
2805 {
2806
2807         KASSERT(name[0] == 'n' && name[1] == 'o',
2808             ("mount_argb(...,%s): name must start with 'no'", name));
2809
2810         return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0));
2811 }
2812
2813 /*
2814  * Add an argument printf style
2815  */
2816 struct mntarg *
2817 mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
2818 {
2819         va_list ap;
2820         struct mntaarg *maa;
2821         struct sbuf *sb;
2822         int len;
2823
2824         if (ma == NULL) {
2825                 ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
2826                 SLIST_INIT(&ma->list);
2827         }
2828         if (ma->error)
2829                 return (ma);
2830
2831         ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
2832             M_MOUNT, M_WAITOK);
2833         ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
2834         ma->v[ma->len].iov_len = strlen(name) + 1;
2835         ma->len++;
2836
2837         sb = sbuf_new_auto();
2838         va_start(ap, fmt);
2839         sbuf_vprintf(sb, fmt, ap);
2840         va_end(ap);
2841         sbuf_finish(sb);
2842         len = sbuf_len(sb) + 1;
2843         maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
2844         SLIST_INSERT_HEAD(&ma->list, maa, next);
2845         bcopy(sbuf_data(sb), maa + 1, len);
2846         sbuf_delete(sb);
2847
2848         ma->v[ma->len].iov_base = maa + 1;
2849         ma->v[ma->len].iov_len = len;
2850         ma->len++;
2851
2852         return (ma);
2853 }
2854
2855 /*
2856  * Add an argument which is a userland string.
2857  */
2858 struct mntarg *
2859 mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
2860 {
2861         struct mntaarg *maa;
2862         char *tbuf;
2863
2864         if (val == NULL)
2865                 return (ma);
2866         if (ma == NULL) {
2867                 ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
2868                 SLIST_INIT(&ma->list);
2869         }
2870         if (ma->error)
2871                 return (ma);
2872         maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
2873         SLIST_INSERT_HEAD(&ma->list, maa, next);
2874         tbuf = (void *)(maa + 1);
2875         ma->error = copyinstr(val, tbuf, len, NULL);
2876         return (mount_arg(ma, name, tbuf, -1));
2877 }
2878
2879 /*
2880  * Plain argument.
2881  *
2882  * If length is -1, treat value as a C string.
2883  */
2884 struct mntarg *
2885 mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
2886 {
2887
2888         if (ma == NULL) {
2889                 ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
2890                 SLIST_INIT(&ma->list);
2891         }
2892         if (ma->error)
2893                 return (ma);
2894
2895         ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
2896             M_MOUNT, M_WAITOK);
2897         ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
2898         ma->v[ma->len].iov_len = strlen(name) + 1;
2899         ma->len++;
2900
2901         ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
2902         if (len < 0)
2903                 ma->v[ma->len].iov_len = strlen(val) + 1;
2904         else
2905                 ma->v[ma->len].iov_len = len;
2906         ma->len++;
2907         return (ma);
2908 }
2909
2910 /*
2911  * Free a mntarg structure
2912  */
2913 static void
2914 free_mntarg(struct mntarg *ma)
2915 {
2916         struct mntaarg *maa;
2917
2918         while (!SLIST_EMPTY(&ma->list)) {
2919                 maa = SLIST_FIRST(&ma->list);
2920                 SLIST_REMOVE_HEAD(&ma->list, next);
2921                 free(maa, M_MOUNT);
2922         }
2923         free(ma->v, M_MOUNT);
2924         free(ma, M_MOUNT);
2925 }
2926
2927 /*
2928  * Mount a filesystem
2929  */
2930 int
2931 kernel_mount(struct mntarg *ma, uint64_t flags)
2932 {
2933         struct uio auio;
2934         int error;
2935
2936         KASSERT(ma != NULL, ("kernel_mount NULL ma"));
2937         KASSERT(ma->error != 0 || ma->v != NULL, ("kernel_mount NULL ma->v"));
2938         KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
2939
2940         error = ma->error;
2941         if (error == 0) {
2942                 auio.uio_iov = ma->v;
2943                 auio.uio_iovcnt = ma->len;
2944                 auio.uio_segflg = UIO_SYSSPACE;
2945                 error = vfs_donmount(curthread, flags, &auio);
2946         }
2947         free_mntarg(ma);
2948         return (error);
2949 }
2950
/*
 * Map from mount options to printable formats.
 *
 * mount_devctl_event() walks this table until it reaches an entry whose
 * o_opt is 0, so the table has to end with such an entry.
 * NOTE(review): presumably MNTOPT_NAMES supplies that terminator --
 * confirm against its definition.
 */
static struct mntoptnames optnames[] = {
        MNTOPT_NAMES
};
2955
2956 #define DEVCTL_LEN 1024
2957 static void
2958 mount_devctl_event(const char *type, struct mount *mp, bool donew)
2959 {
2960         const uint8_t *cp;
2961         struct mntoptnames *fp;
2962         struct sbuf sb;
2963         struct statfs *sfp = &mp->mnt_stat;
2964         char *buf;
2965
2966         buf = malloc(DEVCTL_LEN, M_MOUNT, M_NOWAIT);
2967         if (buf == NULL)
2968                 return;
2969         sbuf_new(&sb, buf, DEVCTL_LEN, SBUF_FIXEDLEN);
2970         sbuf_cpy(&sb, "mount-point=\"");
2971         devctl_safe_quote_sb(&sb, sfp->f_mntonname);
2972         sbuf_cat(&sb, "\" mount-dev=\"");
2973         devctl_safe_quote_sb(&sb, sfp->f_mntfromname);
2974         sbuf_cat(&sb, "\" mount-type=\"");
2975         devctl_safe_quote_sb(&sb, sfp->f_fstypename);
2976         sbuf_cat(&sb, "\" fsid=0x");
2977         cp = (const uint8_t *)&sfp->f_fsid.val[0];
2978         for (int i = 0; i < sizeof(sfp->f_fsid); i++)
2979                 sbuf_printf(&sb, "%02x", cp[i]);
2980         sbuf_printf(&sb, " owner=%u flags=\"", sfp->f_owner);
2981         for (fp = optnames; fp->o_opt != 0; fp++) {
2982                 if ((mp->mnt_flag & fp->o_opt) != 0) {
2983                         sbuf_cat(&sb, fp->o_name);
2984                         sbuf_putc(&sb, ';');
2985                 }
2986         }
2987         sbuf_putc(&sb, '"');
2988         sbuf_finish(&sb);
2989
2990         /*
2991          * Options are not published because the form of the options depends on
2992          * the file system and may include binary data. In addition, they don't
2993          * necessarily provide enough useful information to be actionable when
2994          * devd processes them.
2995          */
2996
2997         if (sbuf_error(&sb) == 0)
2998                 devctl_notify("VFS", "FS", type, sbuf_data(&sb));
2999         sbuf_delete(&sb);
3000         free(buf, M_MOUNT);
3001 }
3002
/*
 * Force remount specified mount point to read-only.  The argument
 * must be busied to avoid parallel unmount attempts.
 *
 * Intended use is to prevent further writes if some metadata
 * inconsistency is detected.  Note that the function still flushes
 * all cached metadata and data for the mount point, which might be
 * not always suitable.
 *
 * Returns 0 on success.  Otherwise returns the error from vget() on the
 * covered vnode, EBUSY if the covered vnode already has VI_MOUNT set or
 * the mount is already read-only, or the error from VFS_MOUNT().
 */
int
vfs_remount_ro(struct mount *mp)
{
        struct vfsoptlist *opts;
        struct vfsopt *opt;
        struct vnode *vp_covered, *rootvp;
        int error;

        vfs_op_enter(mp);
        KASSERT(mp->mnt_lockref > 0,
            ("vfs_remount_ro: mp %p is not busied", mp));
        KASSERT((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0,
            ("vfs_remount_ro: mp %p is being unmounted (and busy?)", mp));

        rootvp = NULL;
        vp_covered = mp->mnt_vnodecovered;
        /* Do not sleep for the covered vnode's lock; fail instead. */
        error = vget(vp_covered, LK_EXCLUSIVE | LK_NOWAIT);
        if (error != 0) {
                vfs_op_exit(mp);
                return (error);
        }
        /* Claim VI_MOUNT on the covered vnode, or back out if it is taken. */
        VI_LOCK(vp_covered);
        if ((vp_covered->v_iflag & VI_MOUNT) != 0) {
                VI_UNLOCK(vp_covered);
                vput(vp_covered);
                vfs_op_exit(mp);
                return (EBUSY);
        }
        vp_covered->v_iflag |= VI_MOUNT;
        VI_UNLOCK(vp_covered);
        vn_seqc_write_begin(vp_covered);

        MNT_ILOCK(mp);
        if ((mp->mnt_flag & MNT_RDONLY) != 0) {
                MNT_IUNLOCK(mp);
                error = EBUSY;
                goto out;
        }
        /* Mark the mount as undergoing a forced update to read-only. */
        mp->mnt_flag |= MNT_UPDATE | MNT_FORCE | MNT_RDONLY;
        rootvp = vfs_cache_root_clear(mp);
        MNT_IUNLOCK(mp);

        /*
         * Build the new option list: a valueless "ro" option merged with
         * the mount's current options.
         */
        opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK | M_ZERO);
        TAILQ_INIT(opts);
        opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK | M_ZERO);
        opt->name = strdup("ro", M_MOUNT);
        opt->value = NULL;
        TAILQ_INSERT_TAIL(opts, opt, link);
        vfs_mergeopts(opts, mp->mnt_opt);
        mp->mnt_optnew = opts;

        error = VFS_MOUNT(mp);

        if (error == 0) {
                /* Keep MNT_RDONLY and adopt the new option list. */
                MNT_ILOCK(mp);
                mp->mnt_flag &= ~(MNT_UPDATE | MNT_FORCE);
                MNT_IUNLOCK(mp);
                vfs_deallocate_syncvnode(mp);
                if (mp->mnt_opt != NULL)
                        vfs_freeopts(mp->mnt_opt);
                mp->mnt_opt = mp->mnt_optnew;
        } else {
                /* Undo the flag changes, MNT_RDONLY included. */
                MNT_ILOCK(mp);
                mp->mnt_flag &= ~(MNT_UPDATE | MNT_FORCE | MNT_RDONLY);
                MNT_IUNLOCK(mp);
                vfs_freeopts(mp->mnt_optnew);
        }
        mp->mnt_optnew = NULL;

out:
        vfs_op_exit(mp);
        VI_LOCK(vp_covered);
        vp_covered->v_iflag &= ~VI_MOUNT;
        VI_UNLOCK(vp_covered);
        vput(vp_covered);
        /*
         * NOTE(review): vp_covered is still used after vput() here;
         * presumably the mount's own reference to its covered vnode keeps
         * it valid -- confirm.
         */
        vn_seqc_write_end(vp_covered);
        if (rootvp != NULL) {
                vn_seqc_write_end(rootvp);
                vrele(rootvp);
        }
        return (error);
}
3094
/*
 * Suspend write operations on all local writeable filesystems.  Does
 * full sync of them in the process.
 *
 * Iterate over the mount points in reverse order, suspending most
 * recently mounted filesystems first.  It handles a case where a
 * filesystem mounted from a md(4) vnode-backed device should be
 * suspended before the filesystem that owns the vnode.
 *
 * A filesystem that was suspended here is flagged MNTK_SUSPEND_ALL and
 * is left busied by this function; resume_all_fs() clears the flag and
 * unbusies it.
 */
void
suspend_all_fs(void)
{
        struct mount *mp;
        int error;

        mtx_lock(&mountlist_mtx);
        TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
                /*
                 * NOTE(review): every path after a successful vfs_busy()
                 * re-takes mountlist_mtx, so MBF_MNTLSTLOCK presumably
                 * makes vfs_busy() drop it on success and keep it on
                 * failure -- confirm against vfs_busy().
                 */
                error = vfs_busy(mp, MBF_MNTLSTLOCK | MBF_NOWAIT);
                if (error != 0)
                        continue;
                /*
                 * Skip mounts that are not both local and writeable, and
                 * those that already have MNTK_SUSPEND set.
                 */
                if ((mp->mnt_flag & (MNT_RDONLY | MNT_LOCAL)) != MNT_LOCAL ||
                    (mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
                        mtx_lock(&mountlist_mtx);
                        vfs_unbusy(mp);
                        continue;
                }
                error = vfs_write_suspend(mp, 0);
                if (error == 0) {
                        /* Remember that this pass did the suspend. */
                        MNT_ILOCK(mp);
                        MPASS((mp->mnt_kern_flag & MNTK_SUSPEND_ALL) == 0);
                        mp->mnt_kern_flag |= MNTK_SUSPEND_ALL;
                        MNT_IUNLOCK(mp);
                        mtx_lock(&mountlist_mtx);
                } else {
                        printf("suspend of %s failed, error %d\n",
                            mp->mnt_stat.f_mntonname, error);
                        mtx_lock(&mountlist_mtx);
                        vfs_unbusy(mp);
                }
        }
        mtx_unlock(&mountlist_mtx);
}
3137
3138 /*
3139  * Clone the mnt_exjail field to a new mount point.
3140  */
3141 void
3142 vfs_exjail_clone(struct mount *inmp, struct mount *outmp)
3143 {
3144         struct ucred *cr;
3145         struct prison *pr;
3146
3147         MNT_ILOCK(inmp);
3148         cr = inmp->mnt_exjail;
3149         if (cr != NULL) {
3150                 crhold(cr);
3151                 MNT_IUNLOCK(inmp);
3152                 pr = cr->cr_prison;
3153                 sx_slock(&allprison_lock);
3154                 if (!prison_isalive(pr)) {
3155                         sx_sunlock(&allprison_lock);
3156                         crfree(cr);
3157                         return;
3158                 }
3159                 MNT_ILOCK(outmp);
3160                 if (outmp->mnt_exjail == NULL) {
3161                         outmp->mnt_exjail = cr;
3162                         atomic_add_int(&pr->pr_exportcnt, 1);
3163                         cr = NULL;
3164                 }
3165                 MNT_IUNLOCK(outmp);
3166                 sx_sunlock(&allprison_lock);
3167                 if (cr != NULL)
3168                         crfree(cr);
3169         } else
3170                 MNT_IUNLOCK(inmp);
3171 }
3172
3173 void
3174 resume_all_fs(void)
3175 {
3176         struct mount *mp;
3177
3178         mtx_lock(&mountlist_mtx);
3179         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3180                 if ((mp->mnt_kern_flag & MNTK_SUSPEND_ALL) == 0)
3181                         continue;
3182                 mtx_unlock(&mountlist_mtx);
3183                 MNT_ILOCK(mp);
3184                 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) != 0);
3185                 mp->mnt_kern_flag &= ~MNTK_SUSPEND_ALL;
3186                 MNT_IUNLOCK(mp);
3187                 vfs_write_resume(mp, 0);
3188                 mtx_lock(&mountlist_mtx);
3189                 vfs_unbusy(mp);
3190         }
3191         mtx_unlock(&mountlist_mtx);
3192 }