sys/kern/vfs_mount.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1999-2004 Poul-Henning Kamp
   5  * Copyright (c) 1999 Michael Smith
   6  * Copyright (c) 1989, 1993
   7  *      The Regents of the University of California.  All rights reserved.
   8  * (c) UNIX System Laboratories, Inc.
   9  * All or some portions of this file are derived from material licensed
  10  * to the University of California by American Telephone and Telegraph
  11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  12  * the permission of UNIX System Laboratories, Inc.
  13  *
  14  * Redistribution and use in source and binary forms, with or without
  15  * modification, are permitted provided that the following conditions
  16  * are met:
  17  * 1. Redistributions of source code must retain the above copyright
  18  *    notice, this list of conditions and the following disclaimer.
  19  * 2. Redistributions in binary form must reproduce the above copyright
  20  *    notice, this list of conditions and the following disclaimer in the
  21  *    documentation and/or other materials provided with the distribution.
  22  * 3. Neither the name of the University nor the names of its contributors
  23  *    may be used to endorse or promote products derived from this software
  24  *    without specific prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  */
  38
  39 #include <sys/cdefs.h>
  40 #include <sys/param.h>
  41 #include <sys/conf.h>
  42 #include <sys/smp.h>
  43 #include <sys/devctl.h>
  44 #include <sys/eventhandler.h>
  45 #include <sys/fcntl.h>
  46 #include <sys/jail.h>
  47 #include <sys/kernel.h>
  48 #include <sys/ktr.h>
  49 #include <sys/libkern.h>
  50 #include <sys/limits.h>
  51 #include <sys/malloc.h>
  52 #include <sys/mount.h>
  53 #include <sys/mutex.h>
  54 #include <sys/namei.h>
  55 #include <sys/priv.h>
  56 #include <sys/proc.h>
  57 #include <sys/filedesc.h>
  58 #include <sys/reboot.h>
  59 #include <sys/sbuf.h>
  60 #include <sys/syscallsubr.h>
  61 #include <sys/sysproto.h>
  62 #include <sys/sx.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/systm.h>
  65 #include <sys/taskqueue.h>
  66 #include <sys/vnode.h>
  67 #include <vm/uma.h>
  68
  69 #include <geom/geom.h>
  70
  71 #include <machine/stdarg.h>
  72
  73 #include <security/audit/audit.h>
  74 #include <security/mac/mac_framework.h>
  75
/*
 * Upper bound, in bytes, applied by vfs_buildopts() to the accumulated size
 * of a mount option list as well as to each individual option name and value.
 */
#define VFS_MOUNTARG_SIZE_MAX   (1024 * 64)

/* Prototypes for file-local (static) helpers used before their definitions. */
static int      vfs_domount(struct thread *td, const char *fstype, char *fspath,
                    uint64_t fsflags, bool jail_export,
                    struct vfsoptlist **optlist);
static void     free_mntarg(struct mntarg *ma);
  82
/* Backing variable for the read/write "usermount" knob under _vfs; starts at 0. */
static int      usermount = 0;
SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
    "Unprivileged users may mount and unmount file systems");

/* Initial value for the "autoro" decision made in vfs_donmount(). */
static bool     default_autoro = false;
SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0,
    "Retry failed r/w mount as r/o if no explicit ro/rw option is specified");

/* Read/write knob; disabled by default. */
static bool     recursive_forced_unmount = false;
SYSCTL_BOOL(_vfs, OID_AUTO, recursive_forced_unmount, CTLFLAG_RW,
    &recursive_forced_unmount, 0, "Recursively unmount stacked upper mounts"
    " when a file system is forcibly unmounted");

/* Parent node for the deferred unmount tunables and statistics below. */
static SYSCTL_NODE(_vfs, OID_AUTO, deferred_unmount,
    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "deferred unmount controls");

/* Read/write knob; defaults to 10. */
static unsigned int     deferred_unmount_retry_limit = 10;
SYSCTL_UINT(_vfs_deferred_unmount, OID_AUTO, retry_limit, CTLFLAG_RW,
    &deferred_unmount_retry_limit, 0,
    "Maximum number of retries for deferred unmount failure");

/* Read/write knob; vfs_mount_init() sets it to hz at boot. */
static int      deferred_unmount_retry_delay_hz;
SYSCTL_INT(_vfs_deferred_unmount, OID_AUTO, retry_delay_hz, CTLFLAG_RW,
    &deferred_unmount_retry_delay_hz, 0,
    "Delay in units of [1/kern.hz]s when retrying a failed deferred unmount");

/* Read-only counter exported to userland; starts at 0. */
static int      deferred_unmount_total_retries = 0;
SYSCTL_INT(_vfs_deferred_unmount, OID_AUTO, total_retries, CTLFLAG_RD,
    &deferred_unmount_total_retries, 0,
    "Total number of retried deferred unmounts");
 113
/* malloc(9) types used for mount-related and statfs allocations. */
MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure");
/* UMA zone for struct mount; created in vfs_mount_init(). */
static uma_zone_t mount_zone;

/* List of mounted filesystems. */
struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);

/* For any iteration/modification of mountlist */
struct mtx_padalign __exclusive_cache_line mountlist_mtx;

/* Event handler lists for the vfs_mounted and vfs_unmounted events. */
EVENTHANDLER_LIST_DEFINE(vfs_mounted);
EVENTHANDLER_LIST_DEFINE(vfs_unmounted);

/*
 * Deferred unmount machinery: the task handler, the timeout task that
 * vfs_mount_init() binds to it, a mutex set up through MTX_SYSINIT, a list
 * of mounts (presumably those awaiting a deferred unmount -- confirm against
 * the users of deferred_unmount_list), and a dedicated taskqueue thread.
 */
static void vfs_deferred_unmount(void *arg, int pending);
static struct timeout_task deferred_unmount_task;
static struct mtx deferred_unmount_lock;
MTX_SYSINIT(deferred_unmount, &deferred_unmount_lock, "deferred_unmount",
    MTX_DEF);
static STAILQ_HEAD(, mount) deferred_unmount_list =
    STAILQ_HEAD_INITIALIZER(deferred_unmount_list);
TASKQUEUE_DEFINE_THREAD(deferred_unmount);

/* Prototype for a file-local helper used before its definition. */
static void mount_devctl_event(const char *type, struct mount *mp, bool donew);
 137
/*
 * Global opts, taken by all filesystems.
 *
 * The list of option names ends with a NULL sentinel.
 */
static const char *global_opts[] = {
        "errmsg",
        "fstype",
        "fspath",
        "ro",
        "rw",
        "nosuid",
        "noexec",
        NULL
};
 151
 152 static int
 153 mount_init(void *mem, int size, int flags)
 154 {
 155         struct mount *mp;
 156
 157         mp = (struct mount *)mem;
 158         mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
 159         mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF);
 160         lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
 161         mp->mnt_pcpu = uma_zalloc_pcpu(pcpu_zone_16, M_WAITOK | M_ZERO);
 162         mp->mnt_ref = 0;
 163         mp->mnt_vfs_ops = 1;
 164         mp->mnt_rootvnode = NULL;
 165         return (0);
 166 }
 167
 168 static void
 169 mount_fini(void *mem, int size)
 170 {
 171         struct mount *mp;
 172
 173         mp = (struct mount *)mem;
 174         uma_zfree_pcpu(pcpu_zone_16, mp->mnt_pcpu);
 175         lockdestroy(&mp->mnt_explock);
 176         mtx_destroy(&mp->mnt_listmtx);
 177         mtx_destroy(&mp->mnt_mtx);
 178 }
 179
 180 static void
 181 vfs_mount_init(void *dummy __unused)
 182 {
 183         TIMEOUT_TASK_INIT(taskqueue_deferred_unmount, &deferred_unmount_task,
 184             0, vfs_deferred_unmount, NULL);
 185         deferred_unmount_retry_delay_hz = hz;
 186         mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL,
 187             NULL, mount_init, mount_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
 188         mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
 189 }
 190 SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL);
 191
 192 /*
 193  * ---------------------------------------------------------------------
 194  * Functions for building and sanitizing the mount options
 195  */
 196
 197 /* Remove one mount option. */
 198 static void
 199 vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
 200 {
 201
 202         TAILQ_REMOVE(opts, opt, link);
 203         free(opt->name, M_MOUNT);
 204         if (opt->value != NULL)
 205                 free(opt->value, M_MOUNT);
 206         free(opt, M_MOUNT);
 207 }
 208
 209 /* Release all resources related to the mount options. */
 210 void
 211 vfs_freeopts(struct vfsoptlist *opts)
 212 {
 213         struct vfsopt *opt;
 214
 215         while (!TAILQ_EMPTY(opts)) {
 216                 opt = TAILQ_FIRST(opts);
 217                 vfs_freeopt(opts, opt);
 218         }
 219         free(opts, M_MOUNT);
 220 }
 221
 222 void
 223 vfs_deleteopt(struct vfsoptlist *opts, const char *name)
 224 {
 225         struct vfsopt *opt, *temp;
 226
 227         if (opts == NULL)
 228                 return;
 229         TAILQ_FOREACH_SAFE(opt, opts, link, temp)  {
 230                 if (strcmp(opt->name, name) == 0)
 231                         vfs_freeopt(opts, opt);
 232         }
 233 }
 234
 235 static int
 236 vfs_isopt_ro(const char *opt)
 237 {
 238
 239         if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 ||
 240             strcmp(opt, "norw") == 0)
 241                 return (1);
 242         return (0);
 243 }
 244
 245 static int
 246 vfs_isopt_rw(const char *opt)
 247 {
 248
 249         if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0)
 250                 return (1);
 251         return (0);
 252 }
 253
 254 /*
 255  * Check if options are equal (with or without the "no" prefix).
 256  */
 257 static int
 258 vfs_equalopts(const char *opt1, const char *opt2)
 259 {
 260         char *p;
 261
 262         /* "opt" vs. "opt" or "noopt" vs. "noopt" */
 263         if (strcmp(opt1, opt2) == 0)
 264                 return (1);
 265         /* "noopt" vs. "opt" */
 266         if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
 267                 return (1);
 268         /* "opt" vs. "noopt" */
 269         if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
 270                 return (1);
 271         while ((p = strchr(opt1, '.')) != NULL &&
 272             !strncmp(opt1, opt2, ++p - opt1)) {
 273                 opt2 += p - opt1;
 274                 opt1 = p;
 275                 /* "foo.noopt" vs. "foo.opt" */
 276                 if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
 277                         return (1);
 278                 /* "foo.opt" vs. "foo.noopt" */
 279                 if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
 280                         return (1);
 281         }
 282         /* "ro" / "rdonly" / "norw" / "rw" / "noro" */
 283         if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) &&
 284             (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2)))
 285                 return (1);
 286         return (0);
 287 }
 288
 289 /*
 290  * If a mount option is specified several times,
 291  * (with or without the "no" prefix) only keep
 292  * the last occurrence of it.
 293  */
 294 static void
 295 vfs_sanitizeopts(struct vfsoptlist *opts)
 296 {
 297         struct vfsopt *opt, *opt2, *tmp;
 298
 299         TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) {
 300                 opt2 = TAILQ_PREV(opt, vfsoptlist, link);
 301                 while (opt2 != NULL) {
 302                         if (vfs_equalopts(opt->name, opt2->name)) {
 303                                 tmp = TAILQ_PREV(opt2, vfsoptlist, link);
 304                                 vfs_freeopt(opts, opt2);
 305                                 opt2 = tmp;
 306                         } else {
 307                                 opt2 = TAILQ_PREV(opt2, vfsoptlist, link);
 308                         }
 309                 }
 310         }
 311 }
 312
 313 /*
 314  * Build a linked list of mount options from a struct uio.
 315  */
 316 int
 317 vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
 318 {
 319         struct vfsoptlist *opts;
 320         struct vfsopt *opt;
 321         size_t memused, namelen, optlen;
 322         unsigned int i, iovcnt;
 323         int error;
 324
 325         opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
 326         TAILQ_INIT(opts);
 327         memused = 0;
 328         iovcnt = auio->uio_iovcnt;
 329         for (i = 0; i < iovcnt; i += 2) {
 330                 namelen = auio->uio_iov[i].iov_len;
 331                 optlen = auio->uio_iov[i + 1].iov_len;
 332                 memused += sizeof(struct vfsopt) + optlen + namelen;
 333                 /*
 334                  * Avoid consuming too much memory, and attempts to overflow
 335                  * memused.
 336                  */
 337                 if (memused > VFS_MOUNTARG_SIZE_MAX ||
 338                     optlen > VFS_MOUNTARG_SIZE_MAX ||
 339                     namelen > VFS_MOUNTARG_SIZE_MAX) {
 340                         error = EINVAL;
 341                         goto bad;
 342                 }
 343
 344                 opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
 345                 opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
 346                 opt->value = NULL;
 347                 opt->len = 0;
 348                 opt->pos = i / 2;
 349                 opt->seen = 0;
 350
 351                 /*
 352                  * Do this early, so jumps to "bad" will free the current
 353                  * option.
 354                  */
 355                 TAILQ_INSERT_TAIL(opts, opt, link);
 356
 357                 if (auio->uio_segflg == UIO_SYSSPACE) {
 358                         bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
 359                 } else {
 360                         error = copyin(auio->uio_iov[i].iov_base, opt->name,
 361                             namelen);
 362                         if (error)
 363                                 goto bad;
 364                 }
 365                 /* Ensure names are null-terminated strings. */
 366                 if (namelen == 0 || opt->name[namelen - 1] != '\0') {
 367                         error = EINVAL;
 368                         goto bad;
 369                 }
 370                 if (optlen != 0) {
 371                         opt->len = optlen;
 372                         opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
 373                         if (auio->uio_segflg == UIO_SYSSPACE) {
 374                                 bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
 375                                     optlen);
 376                         } else {
 377                                 error = copyin(auio->uio_iov[i + 1].iov_base,
 378                                     opt->value, optlen);
 379                                 if (error)
 380                                         goto bad;
 381                         }
 382                 }
 383         }
 384         vfs_sanitizeopts(opts);
 385         *options = opts;
 386         return (0);
 387 bad:
 388         vfs_freeopts(opts);
 389         return (error);
 390 }
 391
 392 /*
 393  * Merge the old mount options with the new ones passed
 394  * in the MNT_UPDATE case.
 395  *
 396  * XXX: This function will keep a "nofoo" option in the new
 397  * options.  E.g, if the option's canonical name is "foo",
 398  * "nofoo" ends up in the mount point's active options.
 399  */
 400 static void
 401 vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts)
 402 {
 403         struct vfsopt *opt, *new;
 404
 405         TAILQ_FOREACH(opt, oldopts, link) {
 406                 new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
 407                 new->name = strdup(opt->name, M_MOUNT);
 408                 if (opt->len != 0) {
 409                         new->value = malloc(opt->len, M_MOUNT, M_WAITOK);
 410                         bcopy(opt->value, new->value, opt->len);
 411                 } else
 412                         new->value = NULL;
 413                 new->len = opt->len;
 414                 new->seen = opt->seen;
 415                 TAILQ_INSERT_HEAD(toopts, new, link);
 416         }
 417         vfs_sanitizeopts(toopts);
 418 }
 419
 420 /*
 421  * Mount a filesystem.
 422  */
 423 #ifndef _SYS_SYSPROTO_H_
 424 struct nmount_args {
 425         struct iovec *iovp;
 426         unsigned int iovcnt;
 427         int flags;
 428 };
 429 #endif
 430 int
 431 sys_nmount(struct thread *td, struct nmount_args *uap)
 432 {
 433         struct uio *auio;
 434         int error;
 435         u_int iovcnt;
 436         uint64_t flags;
 437
 438         /*
 439          * Mount flags are now 64-bits. On 32-bit archtectures only
 440          * 32-bits are passed in, but from here on everything handles
 441          * 64-bit flags correctly.
 442          */
 443         flags = uap->flags;
 444
 445         AUDIT_ARG_FFLAGS(flags);
 446         CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__,
 447             uap->iovp, uap->iovcnt, flags);
 448
 449         /*
 450          * Filter out MNT_ROOTFS.  We do not want clients of nmount() in
 451          * userspace to set this flag, but we must filter it out if we want
 452          * MNT_UPDATE on the root file system to work.
 453          * MNT_ROOTFS should only be set by the kernel when mounting its
 454          * root file system.
 455          */
 456         flags &= ~MNT_ROOTFS;
 457
 458         iovcnt = uap->iovcnt;
 459         /*
 460          * Check that we have an even number of iovec's
 461          * and that we have at least two options.
 462          */
 463         if ((iovcnt & 1) || (iovcnt < 4)) {
 464                 CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__,
 465                     uap->iovcnt);
 466                 return (EINVAL);
 467         }
 468
 469         error = copyinuio(uap->iovp, iovcnt, &auio);
 470         if (error) {
 471                 CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno",
 472                     __func__, error);
 473                 return (error);
 474         }
 475         error = vfs_donmount(td, flags, auio);
 476
 477         free(auio, M_IOV);
 478         return (error);
 479 }
 480
 481 /*
 482  * ---------------------------------------------------------------------
 483  * Various utility functions
 484  */
 485
 486 /*
 487  * Get a reference on a mount point from a vnode.
 488  *
 489  * The vnode is allowed to be passed unlocked and race against dooming. Note in
 490  * such case there are no guarantees the referenced mount point will still be
 491  * associated with it after the function returns.
 492  */
 493 struct mount *
 494 vfs_ref_from_vp(struct vnode *vp)
 495 {
 496         struct mount *mp;
 497         struct mount_pcpu *mpcpu;
 498
 499         mp = atomic_load_ptr(&vp->v_mount);
 500         if (__predict_false(mp == NULL)) {
 501                 return (mp);
 502         }
 503         if (vfs_op_thread_enter(mp, mpcpu)) {
 504                 if (__predict_true(mp == vp->v_mount)) {
 505                         vfs_mp_count_add_pcpu(mpcpu, ref, 1);
 506                         vfs_op_thread_exit(mp, mpcpu);
 507                 } else {
 508                         vfs_op_thread_exit(mp, mpcpu);
 509                         mp = NULL;
 510                 }
 511         } else {
 512                 MNT_ILOCK(mp);
 513                 if (mp == vp->v_mount) {
 514                         MNT_REF(mp);
 515                         MNT_IUNLOCK(mp);
 516                 } else {
 517                         MNT_IUNLOCK(mp);
 518                         mp = NULL;
 519                 }
 520         }
 521         return (mp);
 522 }
 523
 524 void
 525 vfs_ref(struct mount *mp)
 526 {
 527         struct mount_pcpu *mpcpu;
 528
 529         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 530         if (vfs_op_thread_enter(mp, mpcpu)) {
 531                 vfs_mp_count_add_pcpu(mpcpu, ref, 1);
 532                 vfs_op_thread_exit(mp, mpcpu);
 533                 return;
 534         }
 535
 536         MNT_ILOCK(mp);
 537         MNT_REF(mp);
 538         MNT_IUNLOCK(mp);
 539 }
 540
 541 /*
 542  * Register ump as an upper mount of the mount associated with
 543  * vnode vp.  This registration will be tracked through
 544  * mount_upper_node upper, which should be allocated by the
 545  * caller and stored in per-mount data associated with mp.
 546  *
 547  * If successful, this function will return the mount associated
 548  * with vp, and will ensure that it cannot be unmounted until
 549  * ump has been unregistered as one of its upper mounts.
 550  *
 551  * Upon failure this function will return NULL.
 552  */
 553 struct mount *
 554 vfs_register_upper_from_vp(struct vnode *vp, struct mount *ump,
 555     struct mount_upper_node *upper)
 556 {
 557         struct mount *mp;
 558
 559         mp = atomic_load_ptr(&vp->v_mount);
 560         if (mp == NULL)
 561                 return (NULL);
 562         MNT_ILOCK(mp);
 563         if (mp != vp->v_mount ||
 564             ((mp->mnt_kern_flag & (MNTK_UNMOUNT | MNTK_RECURSE)) != 0)) {
 565                 MNT_IUNLOCK(mp);
 566                 return (NULL);
 567         }
 568         KASSERT(ump != mp, ("upper and lower mounts are identical"));
 569         upper->mp = ump;
 570         MNT_REF(mp);
 571         TAILQ_INSERT_TAIL(&mp->mnt_uppers, upper, mnt_upper_link);
 572         MNT_IUNLOCK(mp);
 573         return (mp);
 574 }
 575
 576 /*
 577  * Register upper mount ump to receive vnode unlink/reclaim
 578  * notifications from lower mount mp. This registration will
 579  * be tracked through mount_upper_node upper, which should be
 580  * allocated by the caller and stored in per-mount data
 581  * associated with mp.
 582  *
 583  * ump must already be registered as an upper mount of mp
 584  * through a call to vfs_register_upper_from_vp().
 585  */
 586 void
 587 vfs_register_for_notification(struct mount *mp, struct mount *ump,
 588     struct mount_upper_node *upper)
 589 {
 590         upper->mp = ump;
 591         MNT_ILOCK(mp);
 592         TAILQ_INSERT_TAIL(&mp->mnt_notify, upper, mnt_upper_link);
 593         MNT_IUNLOCK(mp);
 594 }
 595
 596 static void
 597 vfs_drain_upper_locked(struct mount *mp)
 598 {
 599         mtx_assert(MNT_MTX(mp), MA_OWNED);
 600         while (mp->mnt_upper_pending != 0) {
 601                 mp->mnt_kern_flag |= MNTK_UPPER_WAITER;
 602                 msleep(&mp->mnt_uppers, MNT_MTX(mp), 0, "mntupw", 0);
 603         }
 604 }
 605
 606 /*
 607  * Undo a previous call to vfs_register_for_notification().
 608  * The mount represented by upper must be currently registered
 609  * as an upper mount for mp.
 610  */
 611 void
 612 vfs_unregister_for_notification(struct mount *mp,
 613     struct mount_upper_node *upper)
 614 {
 615         MNT_ILOCK(mp);
 616         vfs_drain_upper_locked(mp);
 617         TAILQ_REMOVE(&mp->mnt_notify, upper, mnt_upper_link);
 618         MNT_IUNLOCK(mp);
 619 }
 620
 621 /*
 622  * Undo a previous call to vfs_register_upper_from_vp().
 623  * This must be done before mp can be unmounted.
 624  */
 625 void
 626 vfs_unregister_upper(struct mount *mp, struct mount_upper_node *upper)
 627 {
 628         MNT_ILOCK(mp);
 629         KASSERT((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0,
 630             ("registered upper with pending unmount"));
 631         vfs_drain_upper_locked(mp);
 632         TAILQ_REMOVE(&mp->mnt_uppers, upper, mnt_upper_link);
 633         if ((mp->mnt_kern_flag & MNTK_TASKQUEUE_WAITER) != 0 &&
 634             TAILQ_EMPTY(&mp->mnt_uppers)) {
 635                 mp->mnt_kern_flag &= ~MNTK_TASKQUEUE_WAITER;
 636                 wakeup(&mp->mnt_taskqueue_link);
 637         }
 638         MNT_REL(mp);
 639         MNT_IUNLOCK(mp);
 640 }
 641
 642 void
 643 vfs_rel(struct mount *mp)
 644 {
 645         struct mount_pcpu *mpcpu;
 646
 647         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 648         if (vfs_op_thread_enter(mp, mpcpu)) {
 649                 vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
 650                 vfs_op_thread_exit(mp, mpcpu);
 651                 return;
 652         }
 653
 654         MNT_ILOCK(mp);
 655         MNT_REL(mp);
 656         MNT_IUNLOCK(mp);
 657 }
 658
/*
 * Allocate and initialize the mount point struct.
 *
 * vp is recorded as the covered vnode, vfsp supplies the operations vector
 * and type information and has its vfc_refcount bumped, fspath is copied
 * into mnt_stat.f_mntonname, and cred is duplicated into mnt_cred and also
 * provides f_owner.  vfs_busy() is called on the new mount with MBF_NOWAIT
 * and its result is ignored.  The allocation uses M_WAITOK.
 */
struct mount *
vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
    struct ucred *cred)
{
        struct mount *mp;

        mp = uma_zalloc(mount_zone, M_WAITOK);
        /*
         * Only the mnt_startzero..mnt_endzero range is cleared here; the
         * remaining fields are either set individually below or were set
         * up by mount_init().
         */
        bzero(&mp->mnt_startzero,
            __rangeof(struct mount, mnt_startzero, mnt_endzero));
        mp->mnt_kern_flag = 0;
        mp->mnt_flag = 0;
        mp->mnt_rootvnode = NULL;
        mp->mnt_vnodecovered = NULL;
        mp->mnt_op = NULL;
        mp->mnt_vfc = NULL;
        TAILQ_INIT(&mp->mnt_nvnodelist);
        mp->mnt_nvnodelistsize = 0;
        TAILQ_INIT(&mp->mnt_lazyvnodelist);
        mp->mnt_lazyvnodelistsize = 0;
        /* The item must come out of the zone with no outstanding counts. */
        MPPASS(mp->mnt_ref == 0 && mp->mnt_lockref == 0 &&
            mp->mnt_writeopcount == 0, mp);
        /* mount_init() leaves mnt_vfs_ops at 1. */
        MPASSERT(mp->mnt_vfs_ops == 1, mp,
            ("vfs_ops should be 1 but %d found", mp->mnt_vfs_ops));
        (void) vfs_busy(mp, MBF_NOWAIT);
        /* The mount holds a reference on its filesystem type. */
        atomic_add_acq_int(&vfsp->vfc_refcount, 1);
        mp->mnt_op = vfsp->vfc_vfsops;
        mp->mnt_vfc = vfsp;
        mp->mnt_stat.f_type = vfsp->vfc_typenum;
        /* NOTE(review): mnt_gen is not reset above, so it presumably counts reuses of this item -- confirm. */
        mp->mnt_gen++;
        strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
        mp->mnt_vnodecovered = vp;
        mp->mnt_cred = crdup(cred);
        mp->mnt_stat.f_owner = cred->cr_uid;
        strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
        mp->mnt_iosize_max = DFLTPHYS;
#ifdef MAC
        mac_mount_init(mp);
        mac_mount_create(cred, mp);
#endif
        /* Fill the hash seed with random bytes. */
        arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
        /* Upper-mount tracking and deferred unmount state start out empty. */
        mp->mnt_upper_pending = 0;
        TAILQ_INIT(&mp->mnt_uppers);
        TAILQ_INIT(&mp->mnt_notify);
        mp->mnt_taskqueue_flags = 0;
        mp->mnt_unmount_retries = 0;
        return (mp);
}
 709
/*
 * Destroy the mount struct previously allocated by vfs_mount_alloc().
 *
 * Waits for all references to drain, verifies that no vnodes, upper mounts,
 * notification registrations or outstanding counts remain (panicking on a
 * dangling vnode), then releases everything the mount owns and returns the
 * structure to the mount zone.
 */
void
vfs_mount_destroy(struct mount *mp)
{

        MPPASS(mp->mnt_vfs_ops != 0, mp);

        vfs_assert_mount_counters(mp);

        MNT_ILOCK(mp);
        /* Flag the mount as waiting for its references to go away. */
        mp->mnt_kern_flag |= MNTK_REFEXPIRE;
        /* Wake anyone sleeping on the mount with MNTK_MWAIT set. */
        if (mp->mnt_kern_flag & MNTK_MWAIT) {
                mp->mnt_kern_flag &= ~MNTK_MWAIT;
                wakeup(mp);
        }
        /*
         * Sleep on mp until the reference count reaches zero; the interlock
         * is handed to msleep() for each wait.
         */
        while (mp->mnt_ref)
                msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0);
        KASSERT(mp->mnt_ref == 0,
            ("%s: invalid refcount in the drain path @ %s:%d", __func__,
            __FILE__, __LINE__));
        MPPASS(mp->mnt_writeopcount == 0, mp);
        MPPASS(mp->mnt_secondary_writes == 0, mp);
        /* Drop the filesystem type reference taken in vfs_mount_alloc(). */
        atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1);
        /* Any vnode still on the mount's list is fatal: report, then panic. */
        if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
                struct vnode *vp;

                TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
                        vn_printf(vp, "dangling vnode ");
                panic("unmount: dangling vnode");
        }
        KASSERT(mp->mnt_upper_pending == 0, ("mnt_upper_pending"));
        KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers"));
        KASSERT(TAILQ_EMPTY(&mp->mnt_notify), ("mnt_notify"));
        MPPASS(mp->mnt_nvnodelistsize == 0, mp);
        MPPASS(mp->mnt_lazyvnodelistsize == 0, mp);
        MPPASS(mp->mnt_lockref == 0, mp);
        MNT_IUNLOCK(mp);

        MPASSERT(mp->mnt_vfs_ops == 1, mp,
            ("vfs_ops should be 1 but %d found", mp->mnt_vfs_ops));

        MPASSERT(mp->mnt_rootvnode == NULL, mp,
            ("mount point still has a root vnode %p", mp->mnt_rootvnode));

        /* Release the covered vnode recorded by vfs_mount_alloc(), if any. */
        if (mp->mnt_vnodecovered != NULL)
                vrele(mp->mnt_vnodecovered);
#ifdef MAC
        mac_mount_destroy(mp);
#endif
        if (mp->mnt_opt != NULL)
                vfs_freeopts(mp->mnt_opt);
        /* Undo the per-prison export accounting before dropping the cred. */
        if (mp->mnt_exjail != NULL) {
                atomic_subtract_int(&mp->mnt_exjail->cr_prison->pr_exportcnt,
                    1);
                crfree(mp->mnt_exjail);
        }
        if (mp->mnt_export != NULL) {
                vfs_free_addrlist(mp->mnt_export);
                free(mp->mnt_export, M_MOUNT);
        }
        crfree(mp->mnt_cred);
        uma_zfree(mount_zone, mp);
}
 775
 776 static bool
 777 vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int error)
 778 {
 779         /* This is an upgrade of an exisiting mount. */
 780         if ((fsflags & MNT_UPDATE) != 0)
 781                 return (false);
 782         /* This is already an R/O mount. */
 783         if ((fsflags & MNT_RDONLY) != 0)
 784                 return (false);
 785
 786         switch (error) {
 787         case ENODEV:    /* generic, geom, ... */
 788         case EACCES:    /* cam/scsi, ... */
 789         case EROFS:     /* md, mmcsd, ... */
 790                 /*
 791                  * These errors can be returned by the storage layer to signal
 792                  * that the media is read-only.  No harm in the R/O mount
 793                  * attempt if the error was returned for some other reason.
 794                  */
 795                 return (true);
 796         default:
 797                 return (false);
 798         }
 799 }
 800
 801 int
 802 vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions)
 803 {
 804         struct vfsoptlist *optlist;
 805         struct vfsopt *opt, *tmp_opt;
 806         char *fstype, *fspath, *errmsg;
 807         int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
 808         bool autoro, has_nonexport, jail_export;
 809
 810         errmsg = fspath = NULL;
 811         errmsg_len = fspathlen = 0;
 812         errmsg_pos = -1;
 813         autoro = default_autoro;
 814
 815         error = vfs_buildopts(fsoptions, &optlist);
 816         if (error)
 817                 return (error);
 818
 819         if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
 820                 errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
 821
 822         /*
 823          * We need these two options before the others,
 824          * and they are mandatory for any filesystem.
 825          * Ensure they are NUL terminated as well.
 826          */
 827         fstypelen = 0;
 828         error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
 829         if (error || fstypelen <= 0 || fstype[fstypelen - 1] != '\0') {
 830                 error = EINVAL;
 831                 if (errmsg != NULL)
 832                         strncpy(errmsg, "Invalid fstype", errmsg_len);
 833                 goto bail;
 834         }
 835         fspathlen = 0;
 836         error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
 837         if (error || fspathlen <= 0 || fspath[fspathlen - 1] != '\0') {
 838                 error = EINVAL;
 839                 if (errmsg != NULL)
 840                         strncpy(errmsg, "Invalid fspath", errmsg_len);
 841                 goto bail;
 842         }
 843
 844         /*
 845          * Check to see that "export" is only used with the "update", "fstype",
 846          * "fspath", "from" and "errmsg" options when in a vnet jail.
 847          * These are the ones used to set/update exports by mountd(8).
 848          * If only the above options are set in a jail that can run mountd(8),
 849          * then the jail_export argument of vfs_domount() will be true.
 850          * When jail_export is true, the vfs_suser() check does not cause
 851          * failure, but limits the update to exports only.
 852          * This allows mountd(8) running within the vnet jail
 853          * to export file systems visible within the jail, but
 854          * mounted outside of the jail.
 855          */
 856         /*
 857          * We need to see if we have the "update" option
 858          * before we call vfs_domount(), since vfs_domount() has special
 859          * logic based on MNT_UPDATE.  This is very important
 860          * when we want to update the root filesystem.
 861          */
 862         has_nonexport = false;
 863         jail_export = false;
 864         TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) {
 865                 int do_freeopt = 0;
 866
 867                 if (jailed(td->td_ucred) &&
 868                     strcmp(opt->name, "export") != 0 &&
 869                     strcmp(opt->name, "update") != 0 &&
 870                     strcmp(opt->name, "fstype") != 0 &&
 871                     strcmp(opt->name, "fspath") != 0 &&
 872                     strcmp(opt->name, "from") != 0 &&
 873                     strcmp(opt->name, "errmsg") != 0)
 874                         has_nonexport = true;
 875                 if (strcmp(opt->name, "update") == 0) {
 876                         fsflags |= MNT_UPDATE;
 877                         do_freeopt = 1;
 878                 }
 879                 else if (strcmp(opt->name, "async") == 0)
 880                         fsflags |= MNT_ASYNC;
 881                 else if (strcmp(opt->name, "force") == 0) {
 882                         fsflags |= MNT_FORCE;
 883                         do_freeopt = 1;
 884                 }
 885                 else if (strcmp(opt->name, "reload") == 0) {
 886                         fsflags |= MNT_RELOAD;
 887                         do_freeopt = 1;
 888                 }
 889                 else if (strcmp(opt->name, "multilabel") == 0)
 890                         fsflags |= MNT_MULTILABEL;
 891                 else if (strcmp(opt->name, "noasync") == 0)
 892                         fsflags &= ~MNT_ASYNC;
 893                 else if (strcmp(opt->name, "noatime") == 0)
 894                         fsflags |= MNT_NOATIME;
 895                 else if (strcmp(opt->name, "atime") == 0) {
 896                         free(opt->name, M_MOUNT);
 897                         opt->name = strdup("nonoatime", M_MOUNT);
 898                 }
 899                 else if (strcmp(opt->name, "noclusterr") == 0)
 900                         fsflags |= MNT_NOCLUSTERR;
 901                 else if (strcmp(opt->name, "clusterr") == 0) {
 902                         free(opt->name, M_MOUNT);
 903                         opt->name = strdup("nonoclusterr", M_MOUNT);
 904                 }
 905                 else if (strcmp(opt->name, "noclusterw") == 0)
 906                         fsflags |= MNT_NOCLUSTERW;
 907                 else if (strcmp(opt->name, "clusterw") == 0) {
 908                         free(opt->name, M_MOUNT);
 909                         opt->name = strdup("nonoclusterw", M_MOUNT);
 910                 }
 911                 else if (strcmp(opt->name, "noexec") == 0)
 912                         fsflags |= MNT_NOEXEC;
 913                 else if (strcmp(opt->name, "exec") == 0) {
 914                         free(opt->name, M_MOUNT);
 915                         opt->name = strdup("nonoexec", M_MOUNT);
 916                 }
 917                 else if (strcmp(opt->name, "nosuid") == 0)
 918                         fsflags |= MNT_NOSUID;
 919                 else if (strcmp(opt->name, "suid") == 0) {
 920                         free(opt->name, M_MOUNT);
 921                         opt->name = strdup("nonosuid", M_MOUNT);
 922                 }
 923                 else if (strcmp(opt->name, "nosymfollow") == 0)
 924                         fsflags |= MNT_NOSYMFOLLOW;
 925                 else if (strcmp(opt->name, "symfollow") == 0) {
 926                         free(opt->name, M_MOUNT);
 927                         opt->name = strdup("nonosymfollow", M_MOUNT);
 928                 }
 929                 else if (strcmp(opt->name, "noro") == 0) {
 930                         fsflags &= ~MNT_RDONLY;
 931                         autoro = false;
 932                 }
 933                 else if (strcmp(opt->name, "rw") == 0) {
 934                         fsflags &= ~MNT_RDONLY;
 935                         autoro = false;
 936                 }
 937                 else if (strcmp(opt->name, "ro") == 0) {
 938                         fsflags |= MNT_RDONLY;
 939                         autoro = false;
 940                 }
 941                 else if (strcmp(opt->name, "rdonly") == 0) {
 942                         free(opt->name, M_MOUNT);
 943                         opt->name = strdup("ro", M_MOUNT);
 944                         fsflags |= MNT_RDONLY;
 945                         autoro = false;
 946                 }
 947                 else if (strcmp(opt->name, "autoro") == 0) {
 948                         do_freeopt = 1;
 949                         autoro = true;
 950                 }
 951                 else if (strcmp(opt->name, "suiddir") == 0)
 952                         fsflags |= MNT_SUIDDIR;
 953                 else if (strcmp(opt->name, "sync") == 0)
 954                         fsflags |= MNT_SYNCHRONOUS;
 955                 else if (strcmp(opt->name, "union") == 0)
 956                         fsflags |= MNT_UNION;
 957                 else if (strcmp(opt->name, "export") == 0) {
 958                         fsflags |= MNT_EXPORTED;
 959                         jail_export = true;
 960                 } else if (strcmp(opt->name, "automounted") == 0) {
 961                         fsflags |= MNT_AUTOMOUNTED;
 962                         do_freeopt = 1;
 963                 } else if (strcmp(opt->name, "nocover") == 0) {
 964                         fsflags |= MNT_NOCOVER;
 965                         do_freeopt = 1;
 966                 } else if (strcmp(opt->name, "cover") == 0) {
 967                         fsflags &= ~MNT_NOCOVER;
 968                         do_freeopt = 1;
 969                 } else if (strcmp(opt->name, "emptydir") == 0) {
 970                         fsflags |= MNT_EMPTYDIR;
 971                         do_freeopt = 1;
 972                 } else if (strcmp(opt->name, "noemptydir") == 0) {
 973                         fsflags &= ~MNT_EMPTYDIR;
 974                         do_freeopt = 1;
 975                 }
 976                 if (do_freeopt)
 977                         vfs_freeopt(optlist, opt);
 978         }
 979
 980         /*
 981          * Be ultra-paranoid about making sure the type and fspath
 982          * variables will fit in our mp buffers, including the
 983          * terminating NUL.
 984          */
 985         if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) {
 986                 error = ENAMETOOLONG;
 987                 goto bail;
 988         }
 989
 990         /*
 991          * If has_nonexport is true or the caller is not running within a
 992          * vnet prison that can run mountd(8), set jail_export false.
 993          */
 994         if (has_nonexport || !jailed(td->td_ucred) ||
 995             !prison_check_nfsd(td->td_ucred))
 996                 jail_export = false;
 997
 998         error = vfs_domount(td, fstype, fspath, fsflags, jail_export, &optlist);
 999         if (error == ENOENT) {
1000                 error = EINVAL;
1001                 if (errmsg != NULL)
1002                         strncpy(errmsg, "Invalid fstype", errmsg_len);
1003                 goto bail;
1004         }
1005
1006         /*
1007          * See if we can mount in the read-only mode if the error code suggests
1008          * that it could be possible and the mount options allow for that.
1009          * Never try it if "[no]{ro|rw}" has been explicitly requested and not
1010          * overridden by "autoro".
1011          */
1012         if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) {
1013                 printf("%s: R/W mount failed, possibly R/O media,"
1014                     " trying R/O mount\n", __func__);
1015                 fsflags |= MNT_RDONLY;
1016                 error = vfs_domount(td, fstype, fspath, fsflags, jail_export,
1017                     &optlist);
1018         }
1019 bail:
1020         /* copyout the errmsg */
1021         if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
1022             && errmsg_len > 0 && errmsg != NULL) {
1023                 if (fsoptions->uio_segflg == UIO_SYSSPACE) {
1024                         bcopy(errmsg,
1025                             fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
1026                             fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
1027                 } else {
1028                         copyout(errmsg,
1029                             fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
1030                             fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
1031                 }
1032         }
1033
1034         if (optlist != NULL)
1035                 vfs_freeopts(optlist);
1036         return (error);
1037 }
1038
1039 /*
1040  * Old mount API.
1041  */
1042 #ifndef _SYS_SYSPROTO_H_
1043 struct mount_args {
1044         char    *type;
1045         char    *path;
1046         int     flags;
1047         caddr_t data;
1048 };
1049 #endif
1050 /* ARGSUSED */
1051 int
1052 sys_mount(struct thread *td, struct mount_args *uap)
1053 {
1054         char *fstype;
1055         struct vfsconf *vfsp = NULL;
1056         struct mntarg *ma = NULL;
1057         uint64_t flags;
1058         int error;
1059
1060         /*
1061          * Mount flags are now 64-bits. On 32-bit architectures only
1062          * 32-bits are passed in, but from here on everything handles
1063          * 64-bit flags correctly.
1064          */
1065         flags = uap->flags;
1066
1067         AUDIT_ARG_FFLAGS(flags);
1068
1069         /*
1070          * Filter out MNT_ROOTFS.  We do not want clients of mount() in
1071          * userspace to set this flag, but we must filter it out if we want
1072          * MNT_UPDATE on the root file system to work.
1073          * MNT_ROOTFS should only be set by the kernel when mounting its
1074          * root file system.
1075          */
1076         flags &= ~MNT_ROOTFS;
1077
1078         fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
1079         error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
1080         if (error) {
1081                 free(fstype, M_TEMP);
1082                 return (error);
1083         }
1084
1085         AUDIT_ARG_TEXT(fstype);
1086         vfsp = vfs_byname_kld(fstype, td, &error);
1087         free(fstype, M_TEMP);
1088         if (vfsp == NULL)
1089                 return (ENOENT);
1090         if (((vfsp->vfc_flags & VFCF_SBDRY) != 0 &&
1091             vfsp->vfc_vfsops_sd->vfs_cmount == NULL) ||
1092             ((vfsp->vfc_flags & VFCF_SBDRY) == 0 &&
1093             vfsp->vfc_vfsops->vfs_cmount == NULL))
1094                 return (EOPNOTSUPP);
1095
1096         ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN);
1097         ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
1098         ma = mount_argb(ma, flags & MNT_RDONLY, "noro");
1099         ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid");
1100         ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec");
1101
1102         if ((vfsp->vfc_flags & VFCF_SBDRY) != 0)
1103                 return (vfsp->vfc_vfsops_sd->vfs_cmount(ma, uap->data, flags));
1104         return (vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags));
1105 }
1106
1107 /*
1108  * vfs_domount_first(): first file system mount (not update)
1109  */
1110 static int
1111 vfs_domount_first(
1112         struct thread *td,              /* Calling thread. */
1113         struct vfsconf *vfsp,           /* File system type. */
1114         char *fspath,                   /* Mount path. */
1115         struct vnode *vp,               /* Vnode to be covered. */
1116         uint64_t fsflags,               /* Flags common to all filesystems. */
1117         struct vfsoptlist **optlist     /* Options local to the filesystem. */
1118         )
1119 {
1120         struct vattr va;
1121         struct mount *mp;
1122         struct vnode *newdp, *rootvp;
1123         int error, error1;
1124         bool unmounted;
1125
1126         ASSERT_VOP_ELOCKED(vp, __func__);
1127         KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here"));
1128
1129         /*
1130          * If the jail of the calling thread lacks permission for this type of
1131          * file system, or is trying to cover its own root, deny immediately.
1132          */
1133         if (jailed(td->td_ucred) && (!prison_allow(td->td_ucred,
1134             vfsp->vfc_prison_flag) || vp == td->td_ucred->cr_prison->pr_root)) {
1135                 vput(vp);
1136                 return (EPERM);
1137         }
1138
1139         /*
1140          * If the user is not root, ensure that they own the directory
1141          * onto which we are attempting to mount.
1142          */
1143         error = VOP_GETATTR(vp, &va, td->td_ucred);
1144         if (error == 0 && va.va_uid != td->td_ucred->cr_uid)
1145                 error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN);
1146         if (error == 0)
1147                 error = vinvalbuf(vp, V_SAVE, 0, 0);
1148         if (vfsp->vfc_flags & VFCF_FILEMOUNT) {
1149                 if (error == 0 && vp->v_type != VDIR && vp->v_type != VREG)
1150                         error = EINVAL;
1151                 /*
1152                  * For file mounts, ensure that there is only one hardlink to the file.
1153                  */
1154                 if (error == 0 && vp->v_type == VREG && va.va_nlink != 1)
1155                         error = EINVAL;
1156         } else {
1157                 if (error == 0 && vp->v_type != VDIR)
1158                         error = ENOTDIR;
1159         }
1160         if (error == 0 && (fsflags & MNT_EMPTYDIR) != 0)
1161                 error = vn_dir_check_empty(vp);
1162         if (error == 0) {
1163                 VI_LOCK(vp);
1164                 if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
1165                         vp->v_iflag |= VI_MOUNT;
1166                 else
1167                         error = EBUSY;
1168                 VI_UNLOCK(vp);
1169         }
1170         if (error != 0) {
1171                 vput(vp);
1172                 return (error);
1173         }
1174         vn_seqc_write_begin(vp);
1175         VOP_UNLOCK(vp);
1176
1177         /* Allocate and initialize the filesystem. */
1178         mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
1179         /* XXXMAC: pass to vfs_mount_alloc? */
1180         mp->mnt_optnew = *optlist;
1181         /* Set the mount level flags. */
1182         mp->mnt_flag = (fsflags &
1183             (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY | MNT_FORCE));
1184
1185         /*
1186          * Mount the filesystem.
1187          * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
1188          * get.  No freeing of cn_pnbuf.
1189          */
1190         error1 = 0;
1191         unmounted = true;
1192         if ((error = VFS_MOUNT(mp)) != 0 ||
1193             (error1 = VFS_STATFS(mp, &mp->mnt_stat)) != 0 ||
1194             (error1 = VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) != 0) {
1195                 rootvp = NULL;
1196                 if (error1 != 0) {
1197                         MPASS(error == 0);
1198                         rootvp = vfs_cache_root_clear(mp);
1199                         if (rootvp != NULL) {
1200                                 vhold(rootvp);
1201                                 vrele(rootvp);
1202                         }
1203                         (void)vn_start_write(NULL, &mp, V_WAIT);
1204                         MNT_ILOCK(mp);
1205                         mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_UNMOUNTF;
1206                         MNT_IUNLOCK(mp);
1207                         VFS_PURGE(mp);
1208                         error = VFS_UNMOUNT(mp, 0);
1209                         vn_finished_write(mp);
1210                         if (error != 0) {
1211                                 printf(
1212                     "failed post-mount (%d): rollback unmount returned %d\n",
1213                                     error1, error);
1214                                 unmounted = false;
1215                         }
1216                         error = error1;
1217                 }
1218                 vfs_unbusy(mp);
1219                 mp->mnt_vnodecovered = NULL;
1220                 if (unmounted) {
1221                         /* XXXKIB wait for mnt_lockref drain? */
1222                         vfs_mount_destroy(mp);
1223                 }
1224                 VI_LOCK(vp);
1225                 vp->v_iflag &= ~VI_MOUNT;
1226                 VI_UNLOCK(vp);
1227                 if (rootvp != NULL) {
1228                         vn_seqc_write_end(rootvp);
1229                         vdrop(rootvp);
1230                 }
1231                 vn_seqc_write_end(vp);
1232                 vrele(vp);
1233                 return (error);
1234         }
1235         vn_seqc_write_begin(newdp);
1236         VOP_UNLOCK(newdp);
1237
1238         if (mp->mnt_opt != NULL)
1239                 vfs_freeopts(mp->mnt_opt);
1240         mp->mnt_opt = mp->mnt_optnew;
1241         *optlist = NULL;
1242
1243         /*
1244          * Prevent external consumers of mount options from reading mnt_optnew.
1245          */
1246         mp->mnt_optnew = NULL;
1247
1248         MNT_ILOCK(mp);
1249         if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
1250             (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
1251                 mp->mnt_kern_flag |= MNTK_ASYNC;
1252         else
1253                 mp->mnt_kern_flag &= ~MNTK_ASYNC;
1254         MNT_IUNLOCK(mp);
1255
1256         /*
1257          * VIRF_MOUNTPOINT and v_mountedhere need to be set under the
1258          * vp lock to satisfy vfs_lookup() requirements.
1259          */
1260         VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY);
1261         VI_LOCK(vp);
1262         vn_irflag_set_locked(vp, VIRF_MOUNTPOINT);
1263         vp->v_mountedhere = mp;
1264         VI_UNLOCK(vp);
1265         VOP_UNLOCK(vp);
1266         cache_purge(vp);
1267
1268         /*
1269          * We need to lock both vnodes.
1270          *
1271          * Use vn_lock_pair to avoid establishing an ordering between vnodes
1272          * from different filesystems.
1273          */
1274         vn_lock_pair(vp, false, LK_EXCLUSIVE, newdp, false, LK_EXCLUSIVE);
1275
1276         VI_LOCK(vp);
1277         vp->v_iflag &= ~VI_MOUNT;
1278         VI_UNLOCK(vp);
1279         /* Place the new filesystem at the end of the mount list. */
1280         mtx_lock(&mountlist_mtx);
1281         TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
1282         mtx_unlock(&mountlist_mtx);
1283         vfs_event_signal(NULL, VQ_MOUNT, 0);
1284         VOP_UNLOCK(vp);
1285         EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td);
1286         VOP_UNLOCK(newdp);
1287         mount_devctl_event("MOUNT", mp, false);
1288         mountcheckdirs(vp, newdp);
1289         vn_seqc_write_end(vp);
1290         vn_seqc_write_end(newdp);
1291         vrele(newdp);
1292         if ((mp->mnt_flag & MNT_RDONLY) == 0)
1293                 vfs_allocate_syncvnode(mp);
1294         vfs_op_exit(mp);
1295         vfs_unbusy(mp);
1296         return (0);
1297 }
1298
1299 /*
1300  * vfs_domount_update(): update of mounted file system
1301  */
1302 static int
1303 vfs_domount_update(
1304         struct thread *td,              /* Calling thread. */
1305         struct vnode *vp,               /* Mount point vnode. */
1306         uint64_t fsflags,               /* Flags common to all filesystems. */
1307         bool jail_export,               /* Got export option in vnet prison. */
1308         struct vfsoptlist **optlist     /* Options local to the filesystem. */
1309         )
1310 {
1311         struct export_args export;
1312         struct o2export_args o2export;
1313         struct vnode *rootvp;
1314         void *bufp;
1315         struct mount *mp;
1316         int error, export_error, i, len;
1317         uint64_t flag;
1318         gid_t *grps;
1319         bool vfs_suser_failed;
1320
1321         ASSERT_VOP_ELOCKED(vp, __func__);
1322         KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here"));
1323         mp = vp->v_mount;
1324
1325         if ((vp->v_vflag & VV_ROOT) == 0) {
1326                 if (vfs_copyopt(*optlist, "export", &export, sizeof(export))
1327                     == 0)
1328                         error = EXDEV;
1329                 else
1330                         error = EINVAL;
1331                 vput(vp);
1332                 return (error);
1333         }
1334
1335         /*
1336          * We only allow the filesystem to be reloaded if it
1337          * is currently mounted read-only.
1338          */
1339         flag = mp->mnt_flag;
1340         if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) {
1341                 vput(vp);
1342                 return (EOPNOTSUPP);    /* Needs translation */
1343         }
1344         /*
1345          * Only privileged root, or (if MNT_USER is set) the user that
1346          * did the original mount is permitted to update it.
1347          */
1348         /*
1349          * For the case of mountd(8) doing exports in a jail, the vfs_suser()
1350          * call does not cause failure.  vfs_domount() has already checked
1351          * that "root" is doing this and vfs_suser() will fail when
1352          * the file system has been mounted outside the jail.
1353          * jail_export set true indicates that "export" is not mixed
1354          * with other options that change mount behaviour.
1355          */
1356         vfs_suser_failed = false;
1357         error = vfs_suser(mp, td);
1358         if (jail_export && error != 0) {
1359                 error = 0;
1360                 vfs_suser_failed = true;
1361         }
1362         if (error != 0) {
1363                 vput(vp);
1364                 return (error);
1365         }
1366         if (vfs_busy(mp, MBF_NOWAIT)) {
1367                 vput(vp);
1368                 return (EBUSY);
1369         }
1370         VI_LOCK(vp);
1371         if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) {
1372                 VI_UNLOCK(vp);
1373                 vfs_unbusy(mp);
1374                 vput(vp);
1375                 return (EBUSY);
1376         }
1377         vp->v_iflag |= VI_MOUNT;
1378         VI_UNLOCK(vp);
1379         VOP_UNLOCK(vp);
1380
1381         vfs_op_enter(mp);
1382         vn_seqc_write_begin(vp);
1383
1384         rootvp = NULL;
1385         MNT_ILOCK(mp);
1386         if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
1387                 MNT_IUNLOCK(mp);
1388                 error = EBUSY;
1389                 goto end;
1390         }
1391         if (vfs_suser_failed) {
1392                 KASSERT((fsflags & (MNT_EXPORTED | MNT_UPDATE)) ==
1393                     (MNT_EXPORTED | MNT_UPDATE),
1394                     ("%s: jailed export did not set expected fsflags",
1395                      __func__));
1396                 /*
1397                  * For this case, only MNT_UPDATE and
1398                  * MNT_EXPORTED have been set in fsflags
1399                  * by the options.  Only set MNT_UPDATE,
1400                  * since that is the one that would be set
1401                  * when set in fsflags, below.
1402                  */
1403                 mp->mnt_flag |= MNT_UPDATE;
1404         } else {
1405                 mp->mnt_flag &= ~MNT_UPDATEMASK;
1406                 mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE |
1407                     MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY);
1408                 if ((mp->mnt_flag & MNT_ASYNC) == 0)
1409                         mp->mnt_kern_flag &= ~MNTK_ASYNC;
1410         }
1411         rootvp = vfs_cache_root_clear(mp);
1412         MNT_IUNLOCK(mp);
1413         mp->mnt_optnew = *optlist;
1414         vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
1415
1416         /*
1417          * Mount the filesystem.
1418          * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
1419          * get.  No freeing of cn_pnbuf.
1420          */
1421         /*
1422          * For the case of mountd(8) doing exports from within a vnet jail,
1423          * "from" is typically not set correctly such that VFS_MOUNT() will
1424          * return ENOENT. It is not obvious that VFS_MOUNT() ever needs to be
1425          * called when mountd is doing exports, but this check only applies to
1426          * the specific case where it is running inside a vnet jail, to
1427          * avoid any POLA violation.
1428          */
1429         error = 0;
1430         if (!jail_export)
1431                 error = VFS_MOUNT(mp);
1432
1433         export_error = 0;
1434         /* Process the export option. */
1435         if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp,
1436             &len) == 0) {
1437                 /* Assume that there is only 1 ABI for each length. */
1438                 switch (len) {
1439                 case (sizeof(struct oexport_args)):
1440                         bzero(&o2export, sizeof(o2export));
1441                         /* FALLTHROUGH */
1442                 case (sizeof(o2export)):
1443                         bcopy(bufp, &o2export, len);
1444                         export.ex_flags = (uint64_t)o2export.ex_flags;
1445                         export.ex_root = o2export.ex_root;
1446                         export.ex_uid = o2export.ex_anon.cr_uid;
1447                         export.ex_groups = NULL;
1448                         export.ex_ngroups = o2export.ex_anon.cr_ngroups;
1449                         if (export.ex_ngroups > 0) {
1450                                 if (export.ex_ngroups <= XU_NGROUPS) {
1451                                         export.ex_groups = malloc(
1452                                             export.ex_ngroups * sizeof(gid_t),
1453                                             M_TEMP, M_WAITOK);
1454                                         for (i = 0; i < export.ex_ngroups; i++)
1455                                                 export.ex_groups[i] =
1456                                                   o2export.ex_anon.cr_groups[i];
1457                                 } else
1458                                         export_error = EINVAL;
1459                         } else if (export.ex_ngroups < 0)
1460                                 export_error = EINVAL;
1461                         export.ex_addr = o2export.ex_addr;
1462                         export.ex_addrlen = o2export.ex_addrlen;
1463                         export.ex_mask = o2export.ex_mask;
1464                         export.ex_masklen = o2export.ex_masklen;
1465                         export.ex_indexfile = o2export.ex_indexfile;
1466                         export.ex_numsecflavors = o2export.ex_numsecflavors;
1467                         if (export.ex_numsecflavors < MAXSECFLAVORS) {
1468                                 for (i = 0; i < export.ex_numsecflavors; i++)
1469                                         export.ex_secflavors[i] =
1470                                             o2export.ex_secflavors[i];
1471                         } else
1472                                 export_error = EINVAL;
1473                         if (export_error == 0)
1474                                 export_error = vfs_export(mp, &export, true);
1475                         free(export.ex_groups, M_TEMP);
1476                         break;
1477                 case (sizeof(export)):
1478                         bcopy(bufp, &export, len);
1479                         grps = NULL;
1480                         if (export.ex_ngroups > 0) {
1481                                 if (export.ex_ngroups <= NGROUPS_MAX) {
1482                                         grps = malloc(export.ex_ngroups *
1483                                             sizeof(gid_t), M_TEMP, M_WAITOK);
1484                                         export_error = copyin(export.ex_groups,
1485                                             grps, export.ex_ngroups *
1486                                             sizeof(gid_t));
1487                                         if (export_error == 0)
1488                                                 export.ex_groups = grps;
1489                                 } else
1490                                         export_error = EINVAL;
1491                         } else if (export.ex_ngroups == 0)
1492                                 export.ex_groups = NULL;
1493                         else
1494                                 export_error = EINVAL;
1495                         if (export_error == 0)
1496                                 export_error = vfs_export(mp, &export, true);
1497                         free(grps, M_TEMP);
1498                         break;
1499                 default:
1500                         export_error = EINVAL;
1501                         break;
1502                 }
1503         }
1504
1505         MNT_ILOCK(mp);
1506         if (error == 0) {
1507                 mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE |
1508                     MNT_SNAPSHOT);
1509         } else {
1510                 /*
1511                  * If we fail, restore old mount flags. MNT_QUOTA is special,
1512                  * because it is not part of MNT_UPDATEMASK, but it could have
1513                  * changed in the meantime if quotactl(2) was called.
1514                  * All in all we want current value of MNT_QUOTA, not the old
1515                  * one.
1516                  */
1517                 mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
1518         }
1519         if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
1520             (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
1521                 mp->mnt_kern_flag |= MNTK_ASYNC;
1522         else
1523                 mp->mnt_kern_flag &= ~MNTK_ASYNC;
1524         MNT_IUNLOCK(mp);
1525
1526         if (error != 0)
1527                 goto end;
1528
1529         mount_devctl_event("REMOUNT", mp, true);
1530         if (mp->mnt_opt != NULL)
1531                 vfs_freeopts(mp->mnt_opt);
1532         mp->mnt_opt = mp->mnt_optnew;
1533         *optlist = NULL;
1534         (void)VFS_STATFS(mp, &mp->mnt_stat);
1535         /*
1536          * Prevent external consumers of mount options from reading
1537          * mnt_optnew.
1538          */
1539         mp->mnt_optnew = NULL;
1540
1541         if ((mp->mnt_flag & MNT_RDONLY) == 0)
1542                 vfs_allocate_syncvnode(mp);
1543         else
1544                 vfs_deallocate_syncvnode(mp);
1545 end:
1546         vfs_op_exit(mp);
1547         if (rootvp != NULL) {
1548                 vn_seqc_write_end(rootvp);
1549                 vrele(rootvp);
1550         }
1551         vn_seqc_write_end(vp);
1552         vfs_unbusy(mp);
1553         VI_LOCK(vp);
1554         vp->v_iflag &= ~VI_MOUNT;
1555         VI_UNLOCK(vp);
1556         vrele(vp);
1557         return (error != 0 ? error : export_error);
1558 }
1559
1560 /*
1561  * vfs_domount(): actually attempt a filesystem mount.
1562  */
1563 static int
1564 vfs_domount(
1565         struct thread *td,              /* Calling thread. */
1566         const char *fstype,             /* Filesystem type. */
1567         char *fspath,                   /* Mount path. */
1568         uint64_t fsflags,               /* Flags common to all filesystems. */
1569         bool jail_export,               /* Got export option in vnet prison. */
1570         struct vfsoptlist **optlist     /* Options local to the filesystem. */
1571         )
1572 {
1573         struct vfsconf *vfsp;
1574         struct nameidata nd;
1575         struct vnode *vp;
1576         char *pathbuf;
1577         int error;
1578
1579         /*
1580          * Be ultra-paranoid about making sure the type and fspath
1581          * variables will fit in our mp buffers, including the
1582          * terminating NUL.
1583          */
1584         if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
1585                 return (ENAMETOOLONG);
1586
1587         if (jail_export) {
1588                 error = priv_check(td, PRIV_NFS_DAEMON);
1589                 if (error)
1590                         return (error);
1591         } else if (jailed(td->td_ucred) || usermount == 0) {
1592                 if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
1593                         return (error);
1594         }
1595
1596         /*
1597          * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
1598          */
1599         if (fsflags & MNT_EXPORTED) {
1600                 error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
1601                 if (error)
1602                         return (error);
1603         }
1604         if (fsflags & MNT_SUIDDIR) {
1605                 error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
1606                 if (error)
1607                         return (error);
1608         }
1609         /*
1610          * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
1611          */
1612         if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
1613                 if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
1614                         fsflags |= MNT_NOSUID | MNT_USER;
1615         }
1616
1617         /* Load KLDs before we lock the covered vnode to avoid reversals. */
1618         vfsp = NULL;
1619         if ((fsflags & MNT_UPDATE) == 0) {
1620                 /* Don't try to load KLDs if we're mounting the root. */
1621                 if (fsflags & MNT_ROOTFS) {
1622                         if ((vfsp = vfs_byname(fstype)) == NULL)
1623                                 return (ENODEV);
1624                 } else {
1625                         if ((vfsp = vfs_byname_kld(fstype, td, &error)) == NULL)
1626                                 return (error);
1627                 }
1628         }
1629
1630         /*
1631          * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE.
1632          */
1633         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1 | WANTPARENT,
1634             UIO_SYSSPACE, fspath);
1635         error = namei(&nd);
1636         if (error != 0)
1637                 return (error);
1638         vp = nd.ni_vp;
1639         /*
1640          * Don't allow stacking file mounts to work around problems with the way
1641          * that namei sets nd.ni_dvp to vp_crossmp for these.
1642          */
1643         if (vp->v_type == VREG)
1644                 fsflags |= MNT_NOCOVER;
1645         if ((fsflags & MNT_UPDATE) == 0) {
1646                 if ((vp->v_vflag & VV_ROOT) != 0 &&
1647                     (fsflags & MNT_NOCOVER) != 0) {
1648                         vput(vp);
1649                         error = EBUSY;
1650                         goto out;
1651                 }
1652                 pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
1653                 strcpy(pathbuf, fspath);
1654                 /*
1655                  * Note: we allow any vnode type here. If the path sanity check
1656                  * succeeds, the type will be validated in vfs_domount_first
1657                  * above.
1658                  */
1659                 if (vp->v_type == VDIR)
1660                         error = vn_path_to_global_path(td, vp, pathbuf,
1661                             MNAMELEN);
1662                 else
1663                         error = vn_path_to_global_path_hardlink(td, vp,
1664                             nd.ni_dvp, pathbuf, MNAMELEN,
1665                             nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen);
1666                 if (error == 0) {
1667                         error = vfs_domount_first(td, vfsp, pathbuf, vp,
1668                             fsflags, optlist);
1669                 }
1670                 free(pathbuf, M_TEMP);
1671         } else
1672                 error = vfs_domount_update(td, vp, fsflags, jail_export,
1673                     optlist);
1674
1675 out:
1676         NDFREE_PNBUF(&nd);
1677         vrele(nd.ni_dvp);
1678
1679         return (error);
1680 }
1681
/*
 * Unmount a filesystem: unmount(2) system call entry point.
 *
 * Note: unmount takes a path to the vnode mounted on as argument, not
 * special file (as before).
 *
 * The arguments are forwarded unchanged to kern_unmount(), whose result
 * is returned.
 */
#ifndef _SYS_SYSPROTO_H_
struct unmount_args {
	char	*path;		/* User-supplied path or "FSID:..." string. */
	int	flags;		/* MNT_* unmount flags. */
};
#endif
/* ARGSUSED */
int
sys_unmount(struct thread *td, struct unmount_args *uap)
{
	int error;

	error = kern_unmount(td, uap->path, uap->flags);
	return (error);
}
1701
1702 int
1703 kern_unmount(struct thread *td, const char *path, int flags)
1704 {
1705         struct nameidata nd;
1706         struct mount *mp;
1707         char *fsidbuf, *pathbuf;
1708         fsid_t fsid;
1709         int error;
1710
1711         AUDIT_ARG_VALUE(flags);
1712         if (jailed(td->td_ucred) || usermount == 0) {
1713                 error = priv_check(td, PRIV_VFS_UNMOUNT);
1714                 if (error)
1715                         return (error);
1716         }
1717
1718         if (flags & MNT_BYFSID) {
1719                 fsidbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
1720                 error = copyinstr(path, fsidbuf, MNAMELEN, NULL);
1721                 if (error) {
1722                         free(fsidbuf, M_TEMP);
1723                         return (error);
1724                 }
1725
1726                 AUDIT_ARG_TEXT(fsidbuf);
1727                 /* Decode the filesystem ID. */
1728                 if (sscanf(fsidbuf, "FSID:%d:%d", &fsid.val[0], &fsid.val[1]) != 2) {
1729                         free(fsidbuf, M_TEMP);
1730                         return (EINVAL);
1731                 }
1732
1733                 mp = vfs_getvfs(&fsid);
1734                 free(fsidbuf, M_TEMP);
1735                 if (mp == NULL) {
1736                         return (ENOENT);
1737                 }
1738         } else {
1739                 pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
1740                 error = copyinstr(path, pathbuf, MNAMELEN, NULL);
1741                 if (error) {
1742                         free(pathbuf, M_TEMP);
1743                         return (error);
1744                 }
1745
1746                 /*
1747                  * Try to find global path for path argument.
1748                  */
1749                 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
1750                     UIO_SYSSPACE, pathbuf);
1751                 if (namei(&nd) == 0) {
1752                         NDFREE_PNBUF(&nd);
1753                         error = vn_path_to_global_path(td, nd.ni_vp, pathbuf,
1754                             MNAMELEN);
1755                         if (error == 0)
1756                                 vput(nd.ni_vp);
1757                 }
1758                 mtx_lock(&mountlist_mtx);
1759                 TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
1760                         if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) {
1761                                 vfs_ref(mp);
1762                                 break;
1763                         }
1764                 }
1765                 mtx_unlock(&mountlist_mtx);
1766                 free(pathbuf, M_TEMP);
1767                 if (mp == NULL) {
1768                         /*
1769                          * Previously we returned ENOENT for a nonexistent path and
1770                          * EINVAL for a non-mountpoint.  We cannot tell these apart
1771                          * now, so in the !MNT_BYFSID case return the more likely
1772                          * EINVAL for compatibility.
1773                          */
1774                         return (EINVAL);
1775                 }
1776         }
1777
1778         /*
1779          * Don't allow unmounting the root filesystem.
1780          */
1781         if (mp->mnt_flag & MNT_ROOTFS) {
1782                 vfs_rel(mp);
1783                 return (EINVAL);
1784         }
1785         error = dounmount(mp, flags, td);
1786         return (error);
1787 }
1788
1789 /*
1790  * Return error if any of the vnodes, ignoring the root vnode
1791  * and the syncer vnode, have non-zero usecount.
1792  *
1793  * This function is purely advisory - it can return false positives
1794  * and negatives.
1795  */
1796 static int
1797 vfs_check_usecounts(struct mount *mp)
1798 {
1799         struct vnode *vp, *mvp;
1800
1801         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
1802                 if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON &&
1803                     vp->v_usecount != 0) {
1804                         VI_UNLOCK(vp);
1805                         MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
1806                         return (EBUSY);
1807                 }
1808                 VI_UNLOCK(vp);
1809         }
1810
1811         return (0);
1812 }
1813
1814 static void
1815 dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags)
1816 {
1817
1818         mtx_assert(MNT_MTX(mp), MA_OWNED);
1819         mp->mnt_kern_flag &= ~mntkflags;
1820         if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) {
1821                 mp->mnt_kern_flag &= ~MNTK_MWAIT;
1822                 wakeup(mp);
1823         }
1824         vfs_op_exit_locked(mp);
1825         MNT_IUNLOCK(mp);
1826         if (coveredvp != NULL) {
1827                 VOP_UNLOCK(coveredvp);
1828                 vdrop(coveredvp);
1829         }
1830         vn_finished_write(mp);
1831         vfs_rel(mp);
1832 }
1833
1834 /*
1835  * There are various reference counters associated with the mount point.
1836  * Normally it is permitted to modify them without taking the mnt ilock,
1837  * but this behavior can be temporarily disabled if stable value is needed
1838  * or callers are expected to block (e.g. to not allow new users during
1839  * forced unmount).
1840  */
1841 void
1842 vfs_op_enter(struct mount *mp)
1843 {
1844         struct mount_pcpu *mpcpu;
1845         int cpu;
1846
1847         MNT_ILOCK(mp);
1848         mp->mnt_vfs_ops++;
1849         if (mp->mnt_vfs_ops > 1) {
1850                 MNT_IUNLOCK(mp);
1851                 return;
1852         }
1853         vfs_op_barrier_wait(mp);
1854         CPU_FOREACH(cpu) {
1855                 mpcpu = vfs_mount_pcpu_remote(mp, cpu);
1856
1857                 mp->mnt_ref += mpcpu->mntp_ref;
1858                 mpcpu->mntp_ref = 0;
1859
1860                 mp->mnt_lockref += mpcpu->mntp_lockref;
1861                 mpcpu->mntp_lockref = 0;
1862
1863                 mp->mnt_writeopcount += mpcpu->mntp_writeopcount;
1864                 mpcpu->mntp_writeopcount = 0;
1865         }
1866         MPASSERT(mp->mnt_ref > 0 && mp->mnt_lockref >= 0 &&
1867             mp->mnt_writeopcount >= 0, mp,
1868             ("invalid count(s): ref %d lockref %d writeopcount %d",
1869             mp->mnt_ref, mp->mnt_lockref, mp->mnt_writeopcount));
1870         MNT_IUNLOCK(mp);
1871         vfs_assert_mount_counters(mp);
1872 }
1873
1874 void
1875 vfs_op_exit_locked(struct mount *mp)
1876 {
1877
1878         mtx_assert(MNT_MTX(mp), MA_OWNED);
1879
1880         MPASSERT(mp->mnt_vfs_ops > 0, mp,
1881             ("invalid vfs_ops count %d", mp->mnt_vfs_ops));
1882         MPASSERT(mp->mnt_vfs_ops > 1 ||
1883             (mp->mnt_kern_flag & (MNTK_UNMOUNT | MNTK_SUSPEND)) == 0, mp,
1884             ("vfs_ops too low %d in unmount or suspend", mp->mnt_vfs_ops));
1885         mp->mnt_vfs_ops--;
1886 }
1887
/*
 * Convenience wrapper around vfs_op_exit_locked() for callers that do
 * not already hold the mount interlock: take it, drop one vfs_ops
 * count, release it again.
 */
void
vfs_op_exit(struct mount *mp)
{

	MNT_ILOCK(mp);
	vfs_op_exit_locked(mp);
	MNT_IUNLOCK(mp);
}
1896
1897 struct vfs_op_barrier_ipi {
1898         struct mount *mp;
1899         struct smp_rendezvous_cpus_retry_arg srcra;
1900 };
1901
1902 static void
1903 vfs_op_action_func(void *arg)
1904 {
1905         struct vfs_op_barrier_ipi *vfsopipi;
1906         struct mount *mp;
1907
1908         vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra);
1909         mp = vfsopipi->mp;
1910
1911         if (!vfs_op_thread_entered(mp))
1912                 smp_rendezvous_cpus_done(arg);
1913 }
1914
1915 static void
1916 vfs_op_wait_func(void *arg, int cpu)
1917 {
1918         struct vfs_op_barrier_ipi *vfsopipi;
1919         struct mount *mp;
1920         struct mount_pcpu *mpcpu;
1921
1922         vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra);
1923         mp = vfsopipi->mp;
1924
1925         mpcpu = vfs_mount_pcpu_remote(mp, cpu);
1926         while (atomic_load_int(&mpcpu->mntp_thread_in_ops))
1927                 cpu_spinwait();
1928 }
1929
1930 void
1931 vfs_op_barrier_wait(struct mount *mp)
1932 {
1933         struct vfs_op_barrier_ipi vfsopipi;
1934
1935         vfsopipi.mp = mp;
1936
1937         smp_rendezvous_cpus_retry(all_cpus,
1938             smp_no_rendezvous_barrier,
1939             vfs_op_action_func,
1940             smp_no_rendezvous_barrier,
1941             vfs_op_wait_func,
1942             &vfsopipi.srcra);
1943 }
1944
1945 #ifdef DIAGNOSTIC
1946 void
1947 vfs_assert_mount_counters(struct mount *mp)
1948 {
1949         struct mount_pcpu *mpcpu;
1950         int cpu;
1951
1952         if (mp->mnt_vfs_ops == 0)
1953                 return;
1954
1955         CPU_FOREACH(cpu) {
1956                 mpcpu = vfs_mount_pcpu_remote(mp, cpu);
1957                 if (mpcpu->mntp_ref != 0 ||
1958                     mpcpu->mntp_lockref != 0 ||
1959                     mpcpu->mntp_writeopcount != 0)
1960                         vfs_dump_mount_counters(mp);
1961         }
1962 }
1963
1964 void
1965 vfs_dump_mount_counters(struct mount *mp)
1966 {
1967         struct mount_pcpu *mpcpu;
1968         int ref, lockref, writeopcount;
1969         int cpu;
1970
1971         printf("%s: mp %p vfs_ops %d\n", __func__, mp, mp->mnt_vfs_ops);
1972
1973         printf("        ref : ");
1974         ref = mp->mnt_ref;
1975         CPU_FOREACH(cpu) {
1976                 mpcpu = vfs_mount_pcpu_remote(mp, cpu);
1977                 printf("%d ", mpcpu->mntp_ref);
1978                 ref += mpcpu->mntp_ref;
1979         }
1980         printf("\n");
1981         printf("    lockref : ");
1982         lockref = mp->mnt_lockref;
1983         CPU_FOREACH(cpu) {
1984                 mpcpu = vfs_mount_pcpu_remote(mp, cpu);
1985                 printf("%d ", mpcpu->mntp_lockref);
1986                 lockref += mpcpu->mntp_lockref;
1987         }
1988         printf("\n");
1989         printf("writeopcount: ");
1990         writeopcount = mp->mnt_writeopcount;
1991         CPU_FOREACH(cpu) {
1992                 mpcpu = vfs_mount_pcpu_remote(mp, cpu);
1993                 printf("%d ", mpcpu->mntp_writeopcount);
1994                 writeopcount += mpcpu->mntp_writeopcount;
1995         }
1996         printf("\n");
1997
1998         printf("counter       struct total\n");
1999         printf("ref             %-5d  %-5d\n", mp->mnt_ref, ref);
2000         printf("lockref         %-5d  %-5d\n", mp->mnt_lockref, lockref);
2001         printf("writeopcount    %-5d  %-5d\n", mp->mnt_writeopcount, writeopcount);
2002
2003         panic("invalid counts on struct mount");
2004 }
2005 #endif
2006
2007 int
2008 vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which)
2009 {
2010         struct mount_pcpu *mpcpu;
2011         int cpu, sum;
2012
2013         switch (which) {
2014         case MNT_COUNT_REF:
2015                 sum = mp->mnt_ref;
2016                 break;
2017         case MNT_COUNT_LOCKREF:
2018                 sum = mp->mnt_lockref;
2019                 break;
2020         case MNT_COUNT_WRITEOPCOUNT:
2021                 sum = mp->mnt_writeopcount;
2022                 break;
2023         }
2024
2025         CPU_FOREACH(cpu) {
2026                 mpcpu = vfs_mount_pcpu_remote(mp, cpu);
2027                 switch (which) {
2028                 case MNT_COUNT_REF:
2029                         sum += mpcpu->mntp_ref;
2030                         break;
2031                 case MNT_COUNT_LOCKREF:
2032                         sum += mpcpu->mntp_lockref;
2033                         break;
2034                 case MNT_COUNT_WRITEOPCOUNT:
2035                         sum += mpcpu->mntp_writeopcount;
2036                         break;
2037                 }
2038         }
2039         return (sum);
2040 }
2041
2042 static bool
2043 deferred_unmount_enqueue(struct mount *mp, uint64_t flags, bool requeue,
2044     int timeout_ticks)
2045 {
2046         bool enqueued;
2047
2048         enqueued = false;
2049         mtx_lock(&deferred_unmount_lock);
2050         if ((mp->mnt_taskqueue_flags & MNT_DEFERRED) == 0 || requeue) {
2051                 mp->mnt_taskqueue_flags = flags | MNT_DEFERRED;
2052                 STAILQ_INSERT_TAIL(&deferred_unmount_list, mp,
2053                     mnt_taskqueue_link);
2054                 enqueued = true;
2055         }
2056         mtx_unlock(&deferred_unmount_lock);
2057
2058         if (enqueued) {
2059                 taskqueue_enqueue_timeout(taskqueue_deferred_unmount,
2060                     &deferred_unmount_task, timeout_ticks);
2061         }
2062
2063         return (enqueued);
2064 }
2065
2066 /*
2067  * Taskqueue handler for processing async/recursive unmounts
2068  */
2069 static void
2070 vfs_deferred_unmount(void *argi __unused, int pending __unused)
2071 {
2072         STAILQ_HEAD(, mount) local_unmounts;
2073         uint64_t flags;
2074         struct mount *mp, *tmp;
2075         int error;
2076         unsigned int retries;
2077         bool unmounted;
2078
2079         STAILQ_INIT(&local_unmounts);
2080         mtx_lock(&deferred_unmount_lock);
2081         STAILQ_CONCAT(&local_unmounts, &deferred_unmount_list);
2082         mtx_unlock(&deferred_unmount_lock);
2083
2084         STAILQ_FOREACH_SAFE(mp, &local_unmounts, mnt_taskqueue_link, tmp) {
2085                 flags = mp->mnt_taskqueue_flags;
2086                 KASSERT((flags & MNT_DEFERRED) != 0,
2087                     ("taskqueue unmount without MNT_DEFERRED"));
2088                 error = dounmount(mp, flags, curthread);
2089                 if (error != 0) {
2090                         MNT_ILOCK(mp);
2091                         unmounted = ((mp->mnt_kern_flag & MNTK_REFEXPIRE) != 0);
2092                         MNT_IUNLOCK(mp);
2093
2094                         /*
2095                          * The deferred unmount thread is the only thread that
2096                          * modifies the retry counts, so locking/atomics aren't
2097                          * needed here.
2098                          */
2099                         retries = (mp->mnt_unmount_retries)++;
2100                         deferred_unmount_total_retries++;
2101                         if (!unmounted && retries < deferred_unmount_retry_limit) {
2102                                 deferred_unmount_enqueue(mp, flags, true,
2103                                     -deferred_unmount_retry_delay_hz);
2104                         } else {
2105                                 if (retries >= deferred_unmount_retry_limit) {
2106                                         printf("giving up on deferred unmount "
2107                                             "of %s after %d retries, error %d\n",
2108                                             mp->mnt_stat.f_mntonname, retries, error);
2109                                 }
2110                                 vfs_rel(mp);
2111                         }
2112                 }
2113         }
2114 }
2115
2116 /*
2117  * Do the actual filesystem unmount.
2118  */
2119 int
2120 dounmount(struct mount *mp, uint64_t flags, struct thread *td)
2121 {
2122         struct mount_upper_node *upper;
2123         struct vnode *coveredvp, *rootvp;
2124         int error;
2125         uint64_t async_flag;
2126         int mnt_gen_r;
2127         unsigned int retries;
2128
2129         KASSERT((flags & MNT_DEFERRED) == 0 ||
2130             (flags & (MNT_RECURSE | MNT_FORCE)) == (MNT_RECURSE | MNT_FORCE),
2131             ("MNT_DEFERRED requires MNT_RECURSE | MNT_FORCE"));
2132
2133         /*
2134          * If the caller has explicitly requested the unmount to be handled by
2135          * the taskqueue and we're not already in taskqueue context, queue
2136          * up the unmount request and exit.  This is done prior to any
2137          * credential checks; MNT_DEFERRED should be used only for kernel-
2138          * initiated unmounts and will therefore be processed with the
2139          * (kernel) credentials of the taskqueue thread.  Still, callers
2140          * should be sure this is the behavior they want.
2141          */
2142         if ((flags & MNT_DEFERRED) != 0 &&
2143             taskqueue_member(taskqueue_deferred_unmount, curthread) == 0) {
2144                 if (!deferred_unmount_enqueue(mp, flags, false, 0))
2145                         vfs_rel(mp);
2146                 return (EINPROGRESS);
2147         }
2148
2149         /*
2150          * Only privileged root, or (if MNT_USER is set) the user that did the
2151          * original mount is permitted to unmount this filesystem.
2152          * This check should be made prior to queueing up any recursive
2153          * unmounts of upper filesystems.  Those unmounts will be executed
2154          * with kernel thread credentials and are expected to succeed, so
2155          * we must at least ensure the originating context has sufficient
2156          * privilege to unmount the base filesystem before proceeding with
2157          * the uppers.
2158          */
2159         error = vfs_suser(mp, td);
2160         if (error != 0) {
2161                 KASSERT((flags & MNT_DEFERRED) == 0,
2162                     ("taskqueue unmount with insufficient privilege"));
2163                 vfs_rel(mp);
2164                 return (error);
2165         }
2166
2167         if (recursive_forced_unmount && ((flags & MNT_FORCE) != 0))
2168                 flags |= MNT_RECURSE;
2169
2170         if ((flags & MNT_RECURSE) != 0) {
2171                 KASSERT((flags & MNT_FORCE) != 0,
2172                     ("MNT_RECURSE requires MNT_FORCE"));
2173
2174                 MNT_ILOCK(mp);
2175                 /*
2176                  * Set MNTK_RECURSE to prevent new upper mounts from being
2177                  * added, and note that an operation on the uppers list is in
2178                  * progress.  This will ensure that unregistration from the
2179                  * uppers list, and therefore any pending unmount of the upper
2180                  * FS, can't complete until after we finish walking the list.
2181                  */
2182                 mp->mnt_kern_flag |= MNTK_RECURSE;
2183                 mp->mnt_upper_pending++;
2184                 TAILQ_FOREACH(upper, &mp->mnt_uppers, mnt_upper_link) {
2185                         retries = upper->mp->mnt_unmount_retries;
2186                         if (retries > deferred_unmount_retry_limit) {
2187                                 error = EBUSY;
2188                                 continue;
2189                         }
2190                         MNT_IUNLOCK(mp);
2191
2192                         vfs_ref(upper->mp);
2193                         if (!deferred_unmount_enqueue(upper->mp, flags,
2194                             false, 0))
2195                                 vfs_rel(upper->mp);
2196                         MNT_ILOCK(mp);
2197                 }
2198                 mp->mnt_upper_pending--;
2199                 if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 &&
2200                     mp->mnt_upper_pending == 0) {
2201                         mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER;
2202                         wakeup(&mp->mnt_uppers);
2203                 }
2204
2205                 /*
2206                  * If we're not on the taskqueue, wait until the uppers list
2207                  * is drained before proceeding with unmount.  Otherwise, if
2208                  * we are on the taskqueue and there are still pending uppers,
2209                  * just re-enqueue on the end of the taskqueue.
2210                  */
2211                 if ((flags & MNT_DEFERRED) == 0) {
2212                         while (error == 0 && !TAILQ_EMPTY(&mp->mnt_uppers)) {
2213                                 mp->mnt_kern_flag |= MNTK_TASKQUEUE_WAITER;
2214                                 error = msleep(&mp->mnt_taskqueue_link,
2215                                     MNT_MTX(mp), PCATCH, "umntqw", 0);
2216                         }
2217                         if (error != 0) {
2218                                 MNT_REL(mp);
2219                                 MNT_IUNLOCK(mp);
2220                                 return (error);
2221                         }
2222                 } else if (!TAILQ_EMPTY(&mp->mnt_uppers)) {
2223                         MNT_IUNLOCK(mp);
2224                         if (error == 0)
2225                                 deferred_unmount_enqueue(mp, flags, true, 0);
2226                         return (error);
2227                 }
2228                 MNT_IUNLOCK(mp);
2229                 KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers not empty"));
2230         }
2231
2232         /* Allow the taskqueue to safely re-enqueue on failure */
2233         if ((flags & MNT_DEFERRED) != 0)
2234                 vfs_ref(mp);
2235
2236         if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
2237                 mnt_gen_r = mp->mnt_gen;
2238                 VI_LOCK(coveredvp);
2239                 vholdl(coveredvp);
2240                 vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
2241                 /*
2242                  * Check for mp being unmounted while waiting for the
2243                  * covered vnode lock.
2244                  */
2245                 if (coveredvp->v_mountedhere != mp ||
2246                     coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
2247                         VOP_UNLOCK(coveredvp);
2248                         vdrop(coveredvp);
2249                         vfs_rel(mp);
2250                         return (EBUSY);
2251                 }
2252         }
2253
2254         vfs_op_enter(mp);
2255
2256         vn_start_write(NULL, &mp, V_WAIT);
2257         MNT_ILOCK(mp);
2258         if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 ||
2259             (mp->mnt_flag & MNT_UPDATE) != 0 ||
2260             !TAILQ_EMPTY(&mp->mnt_uppers)) {
2261                 dounmount_cleanup(mp, coveredvp, 0);
2262                 return (EBUSY);
2263         }
2264         mp->mnt_kern_flag |= MNTK_UNMOUNT;
2265         rootvp = vfs_cache_root_clear(mp);
2266         if (coveredvp != NULL)
2267                 vn_seqc_write_begin(coveredvp);
2268         if (flags & MNT_NONBUSY) {
2269                 MNT_IUNLOCK(mp);
2270                 error = vfs_check_usecounts(mp);
2271                 MNT_ILOCK(mp);
2272                 if (error != 0) {
2273                         vn_seqc_write_end(coveredvp);
2274                         dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT);
2275                         if (rootvp != NULL) {
2276                                 vn_seqc_write_end(rootvp);
2277                                 vrele(rootvp);
2278                         }
2279                         return (error);
2280                 }
2281         }
2282         /* Allow filesystems to detect that a forced unmount is in progress. */
2283         if (flags & MNT_FORCE) {
2284                 mp->mnt_kern_flag |= MNTK_UNMOUNTF;
2285                 MNT_IUNLOCK(mp);
2286                 /*
2287                  * Must be done after setting MNTK_UNMOUNTF and before
2288                  * waiting for mnt_lockref to become 0.
2289                  */
2290                 VFS_PURGE(mp);
2291                 MNT_ILOCK(mp);
2292         }
2293         error = 0;
2294         if (mp->mnt_lockref) {
2295                 mp->mnt_kern_flag |= MNTK_DRAINING;
2296                 error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
2297                     "mount drain", 0);
2298         }
2299         MNT_IUNLOCK(mp);
2300         KASSERT(mp->mnt_lockref == 0,
2301             ("%s: invalid lock refcount in the drain path @ %s:%d",
2302             __func__, __FILE__, __LINE__));
2303         KASSERT(error == 0,
2304             ("%s: invalid return value for msleep in the drain path @ %s:%d",
2305             __func__, __FILE__, __LINE__));
2306
2307         /*
2308          * We want to keep the vnode around so that we can vn_seqc_write_end
2309          * after we are done with unmount. Downgrade our reference to a mere
2310          * hold count so that we don't interefere with anything.
2311          */
2312         if (rootvp != NULL) {
2313                 vhold(rootvp);
2314                 vrele(rootvp);
2315         }
2316
2317         if (mp->mnt_flag & MNT_EXPUBLIC)
2318                 vfs_setpublicfs(NULL, NULL, NULL);
2319
2320         vfs_periodic(mp, MNT_WAIT);
2321         MNT_ILOCK(mp);
2322         async_flag = mp->mnt_flag & MNT_ASYNC;
2323         mp->mnt_flag &= ~MNT_ASYNC;
2324         mp->mnt_kern_flag &= ~MNTK_ASYNC;
2325         MNT_IUNLOCK(mp);
2326         vfs_deallocate_syncvnode(mp);
2327         error = VFS_UNMOUNT(mp, flags);
2328         vn_finished_write(mp);
2329         vfs_rel(mp);
2330         /*
2331          * If we failed to flush the dirty blocks for this mount point,
2332          * undo all the cdir/rdir and rootvnode changes we made above.
2333          * Unless we failed to do so because the device is reporting that
2334          * it doesn't exist anymore.
2335          */
2336         if (error && error != ENXIO) {
2337                 MNT_ILOCK(mp);
2338                 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2339                         MNT_IUNLOCK(mp);
2340                         vfs_allocate_syncvnode(mp);
2341                         MNT_ILOCK(mp);
2342                 }
2343                 mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
2344                 mp->mnt_flag |= async_flag;
2345                 if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
2346                     (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
2347                         mp->mnt_kern_flag |= MNTK_ASYNC;
2348                 if (mp->mnt_kern_flag & MNTK_MWAIT) {
2349                         mp->mnt_kern_flag &= ~MNTK_MWAIT;
2350                         wakeup(mp);
2351                 }
2352                 vfs_op_exit_locked(mp);
2353                 MNT_IUNLOCK(mp);
2354                 if (coveredvp) {
2355                         vn_seqc_write_end(coveredvp);
2356                         VOP_UNLOCK(coveredvp);
2357                         vdrop(coveredvp);
2358                 }
2359                 if (rootvp != NULL) {
2360                         vn_seqc_write_end(rootvp);
2361                         vdrop(rootvp);
2362                 }
2363                 return (error);
2364         }
2365
2366         mtx_lock(&mountlist_mtx);
2367         TAILQ_REMOVE(&mountlist, mp, mnt_list);
2368         mtx_unlock(&mountlist_mtx);
2369         EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td);
2370         if (coveredvp != NULL) {
2371                 VI_LOCK(coveredvp);
2372                 vn_irflag_unset_locked(coveredvp, VIRF_MOUNTPOINT);
2373                 coveredvp->v_mountedhere = NULL;
2374                 vn_seqc_write_end_locked(coveredvp);
2375                 VI_UNLOCK(coveredvp);
2376                 VOP_UNLOCK(coveredvp);
2377                 vdrop(coveredvp);
2378         }
2379         mount_devctl_event("UNMOUNT", mp, false);
2380         if (rootvp != NULL) {
2381                 vn_seqc_write_end(rootvp);
2382                 vdrop(rootvp);
2383         }
2384         vfs_event_signal(NULL, VQ_UNMOUNT, 0);
2385         if (rootvnode != NULL && mp == rootvnode->v_mount) {
2386                 vrele(rootvnode);
2387                 rootvnode = NULL;
2388         }
2389         if (mp == rootdevmp)
2390                 rootdevmp = NULL;
2391         if ((flags & MNT_DEFERRED) != 0)
2392                 vfs_rel(mp);
2393         vfs_mount_destroy(mp);
2394         return (0);
2395 }
2396
2397 /*
2398  * Report errors during filesystem mounting.
2399  */
2400 void
2401 vfs_mount_error(struct mount *mp, const char *fmt, ...)
2402 {
2403         struct vfsoptlist *moptlist = mp->mnt_optnew;
2404         va_list ap;
2405         int error, len;
2406         char *errmsg;
2407
2408         error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
2409         if (error || errmsg == NULL || len <= 0)
2410                 return;
2411
2412         va_start(ap, fmt);
2413         vsnprintf(errmsg, (size_t)len, fmt, ap);
2414         va_end(ap);
2415 }
2416
/*
 * Format a printf-style error message into the buffer of the "errmsg"
 * option of 'opts'.  The call is a no-op when the option cannot be
 * found, carries no value buffer, or has a non-positive length.
 */
void
vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...)
{
	va_list args;
	char *msgbuf;
	int buflen;

	if (vfs_getopt(opts, "errmsg", (void **)&msgbuf, &buflen) != 0)
		return;
	if (msgbuf == NULL || buflen <= 0)
		return;

	va_start(args, fmt);
	vsnprintf(msgbuf, (size_t)buflen, fmt, args);
	va_end(args);
}
2432
2433 /*
2434  * ---------------------------------------------------------------------
2435  * Functions for querying mount options/arguments from filesystems.
2436  */
2437
2438 /*
2439  * Check that no unknown options are given
2440  */
2441 int
2442 vfs_filteropt(struct vfsoptlist *opts, const char **legal)
2443 {
2444         struct vfsopt *opt;
2445         char errmsg[255];
2446         const char **t, *p, *q;
2447         int ret = 0;
2448
2449         TAILQ_FOREACH(opt, opts, link) {
2450                 p = opt->name;
2451                 q = NULL;
2452                 if (p[0] == 'n' && p[1] == 'o')
2453                         q = p + 2;
2454                 for(t = global_opts; *t != NULL; t++) {
2455                         if (strcmp(*t, p) == 0)
2456                                 break;
2457                         if (q != NULL) {
2458                                 if (strcmp(*t, q) == 0)
2459                                         break;
2460                         }
2461                 }
2462                 if (*t != NULL)
2463                         continue;
2464                 for(t = legal; *t != NULL; t++) {
2465                         if (strcmp(*t, p) == 0)
2466                                 break;
2467                         if (q != NULL) {
2468                                 if (strcmp(*t, q) == 0)
2469                                         break;
2470                         }
2471                 }
2472                 if (*t != NULL)
2473                         continue;
2474                 snprintf(errmsg, sizeof(errmsg),
2475                     "mount option <%s> is unknown", p);
2476                 ret = EINVAL;
2477         }
2478         if (ret != 0) {
2479                 TAILQ_FOREACH(opt, opts, link) {
2480                         if (strcmp(opt->name, "errmsg") == 0) {
2481                                 strncpy((char *)opt->value, errmsg, opt->len);
2482                                 break;
2483                         }
2484                 }
2485                 if (opt == NULL)
2486                         printf("%s\n", errmsg);
2487         }
2488         return (ret);
2489 }
2490
2491 /*
2492  * Get a mount option by its name.
2493  *
2494  * Return 0 if the option was found, ENOENT otherwise.
2495  * If len is non-NULL it will be filled with the length
2496  * of the option. If buf is non-NULL, it will be filled
2497  * with the address of the option.
2498  */
2499 int
2500 vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len)
2501 {
2502         struct vfsopt *opt;
2503
2504         KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
2505
2506         TAILQ_FOREACH(opt, opts, link) {
2507                 if (strcmp(name, opt->name) == 0) {
2508                         opt->seen = 1;
2509                         if (len != NULL)
2510                                 *len = opt->len;
2511                         if (buf != NULL)
2512                                 *buf = opt->value;
2513                         return (0);
2514                 }
2515         }
2516         return (ENOENT);
2517 }
2518
2519 int
2520 vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
2521 {
2522         struct vfsopt *opt;
2523
2524         if (opts == NULL)
2525                 return (-1);
2526
2527         TAILQ_FOREACH(opt, opts, link) {
2528                 if (strcmp(name, opt->name) == 0) {
2529                         opt->seen = 1;
2530                         return (opt->pos);
2531                 }
2532         }
2533         return (-1);
2534 }
2535
2536 int
2537 vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value)
2538 {
2539         char *opt_value, *vtp;
2540         quad_t iv;
2541         int error, opt_len;
2542
2543         error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len);
2544         if (error != 0)
2545                 return (error);
2546         if (opt_len == 0 || opt_value == NULL)
2547                 return (EINVAL);
2548         if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0')
2549                 return (EINVAL);
2550         iv = strtoq(opt_value, &vtp, 0);
2551         if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0'))
2552                 return (EINVAL);
2553         if (iv < 0)
2554                 return (EINVAL);
2555         switch (vtp[0]) {
2556         case 't': case 'T':
2557                 iv *= 1024;
2558                 /* FALLTHROUGH */
2559         case 'g': case 'G':
2560                 iv *= 1024;
2561                 /* FALLTHROUGH */
2562         case 'm': case 'M':
2563                 iv *= 1024;
2564                 /* FALLTHROUGH */
2565         case 'k': case 'K':
2566                 iv *= 1024;
2567         case '\0':
2568                 break;
2569         default:
2570                 return (EINVAL);
2571         }
2572         *value = iv;
2573
2574         return (0);
2575 }
2576
2577 char *
2578 vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
2579 {
2580         struct vfsopt *opt;
2581
2582         *error = 0;
2583         TAILQ_FOREACH(opt, opts, link) {
2584                 if (strcmp(name, opt->name) != 0)
2585                         continue;
2586                 opt->seen = 1;
2587                 if (opt->len == 0 ||
2588                     ((char *)opt->value)[opt->len - 1] != '\0') {
2589                         *error = EINVAL;
2590                         return (NULL);
2591                 }
2592                 return (opt->value);
2593         }
2594         *error = ENOENT;
2595         return (NULL);
2596 }
2597
2598 int
2599 vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w,
2600         uint64_t val)
2601 {
2602         struct vfsopt *opt;
2603
2604         TAILQ_FOREACH(opt, opts, link) {
2605                 if (strcmp(name, opt->name) == 0) {
2606                         opt->seen = 1;
2607                         if (w != NULL)
2608                                 *w |= val;
2609                         return (1);
2610                 }
2611         }
2612         if (w != NULL)
2613                 *w &= ~val;
2614         return (0);
2615 }
2616
2617 int
2618 vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
2619 {
2620         va_list ap;
2621         struct vfsopt *opt;
2622         int ret;
2623
2624         KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
2625
2626         TAILQ_FOREACH(opt, opts, link) {
2627                 if (strcmp(name, opt->name) != 0)
2628                         continue;
2629                 opt->seen = 1;
2630                 if (opt->len == 0 || opt->value == NULL)
2631                         return (0);
2632                 if (((char *)opt->value)[opt->len - 1] != '\0')
2633                         return (0);
2634                 va_start(ap, fmt);
2635                 ret = vsscanf(opt->value, fmt, ap);
2636                 va_end(ap);
2637                 return (ret);
2638         }
2639         return (0);
2640 }
2641
2642 int
2643 vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len)
2644 {
2645         struct vfsopt *opt;
2646
2647         TAILQ_FOREACH(opt, opts, link) {
2648                 if (strcmp(name, opt->name) != 0)
2649                         continue;
2650                 opt->seen = 1;
2651                 if (opt->value == NULL)
2652                         opt->len = len;
2653                 else {
2654                         if (opt->len != len)
2655                                 return (EINVAL);
2656                         bcopy(value, opt->value, len);
2657                 }
2658                 return (0);
2659         }
2660         return (ENOENT);
2661 }
2662
2663 int
2664 vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len)
2665 {
2666         struct vfsopt *opt;
2667
2668         TAILQ_FOREACH(opt, opts, link) {
2669                 if (strcmp(name, opt->name) != 0)
2670                         continue;
2671                 opt->seen = 1;
2672                 if (opt->value == NULL)
2673                         opt->len = len;
2674                 else {
2675                         if (opt->len < len)
2676                                 return (EINVAL);
2677                         opt->len = len;
2678                         bcopy(value, opt->value, len);
2679                 }
2680                 return (0);
2681         }
2682         return (ENOENT);
2683 }
2684
2685 int
2686 vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value)
2687 {
2688         struct vfsopt *opt;
2689
2690         TAILQ_FOREACH(opt, opts, link) {
2691                 if (strcmp(name, opt->name) != 0)
2692                         continue;
2693                 opt->seen = 1;
2694                 if (opt->value == NULL)
2695                         opt->len = strlen(value) + 1;
2696                 else if (strlcpy(opt->value, value, opt->len) >= opt->len)
2697                         return (EINVAL);
2698                 return (0);
2699         }
2700         return (ENOENT);
2701 }
2702
2703 /*
2704  * Find and copy a mount option.
2705  *
2706  * The size of the buffer has to be specified
2707  * in len, if it is not the same length as the
2708  * mount option, EINVAL is returned.
2709  * Returns ENOENT if the option is not found.
2710  */
2711 int
2712 vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len)
2713 {
2714         struct vfsopt *opt;
2715
2716         KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
2717
2718         TAILQ_FOREACH(opt, opts, link) {
2719                 if (strcmp(name, opt->name) == 0) {
2720                         opt->seen = 1;
2721                         if (len != opt->len)
2722                                 return (EINVAL);
2723                         bcopy(opt->value, dest, opt->len);
2724                         return (0);
2725                 }
2726         }
2727         return (ENOENT);
2728 }
2729
2730 int
2731 __vfs_statfs(struct mount *mp, struct statfs *sbp)
2732 {
2733         /*
2734          * Filesystems only fill in part of the structure for updates, we
2735          * have to read the entirety first to get all content.
2736          */
2737         if (sbp != &mp->mnt_stat)
2738                 memcpy(sbp, &mp->mnt_stat, sizeof(*sbp));
2739
2740         /*
2741          * Set these in case the underlying filesystem fails to do so.
2742          */
2743         sbp->f_version = STATFS_VERSION;
2744         sbp->f_namemax = NAME_MAX;
2745         sbp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2746         sbp->f_nvnodelistsize = mp->mnt_nvnodelistsize;
2747
2748         return (mp->mnt_op->vfs_statfs(mp, sbp));
2749 }
2750
2751 void
2752 vfs_mountedfrom(struct mount *mp, const char *from)
2753 {
2754
2755         bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
2756         strlcpy(mp->mnt_stat.f_mntfromname, from,
2757             sizeof mp->mnt_stat.f_mntfromname);
2758 }
2759
2760 /*
2761  * ---------------------------------------------------------------------
2762  * This is the api for building mount args and mounting filesystems from
2763  * inside the kernel.
2764  *
2765  * The API works by accumulation of individual args.  First error is
2766  * latched.
2767  *
2768  * XXX: should be documented in new manpage kernel_mount(9)
2769  */
2770
/*
 * A memory allocation which must be freed when we are done.
 *
 * The structure is only a list header: mount_argf() and mount_argsu()
 * allocate it with extra room and keep their payload directly behind it
 * (at maa + 1).
 */
struct mntaarg {
	SLIST_ENTRY(mntaarg)	next;	/* linkage on the 'list' of a mntarg */
};
2775
/* The header for the mount arguments */
struct mntarg {
	struct iovec *v;		/* name/value iovecs, grown by realloc() */
	int len;			/* number of iovec entries used in v */
	int error;			/* latched error; non-zero stops further adds */
	SLIST_HEAD(, mntaarg)	list;	/* allocations released by free_mntarg() */
};
2783
/*
 * Add a boolean argument.
 *
 * 'name' must start with "no".  When 'flag' is non-zero the option is
 * added without that prefix, otherwise it is added as given.  The option
 * carries no value.
 */
struct mntarg *
mount_argb(struct mntarg *ma, int flag, const char *name)
{
	const char *optname;

	KASSERT(name[0] == 'n' && name[1] == 'o',
	    ("mount_argb(...,%s): name must start with 'no'", name));

	optname = (flag != 0) ? name + 2 : name;
	return (mount_arg(ma, optname, NULL, 0));
}
2799
2800 /*
2801  * Add an argument printf style
2802  */
2803 struct mntarg *
2804 mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
2805 {
2806         va_list ap;
2807         struct mntaarg *maa;
2808         struct sbuf *sb;
2809         int len;
2810
2811         if (ma == NULL) {
2812                 ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
2813                 SLIST_INIT(&ma->list);
2814         }
2815         if (ma->error)
2816                 return (ma);
2817
2818         ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
2819             M_MOUNT, M_WAITOK);
2820         ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
2821         ma->v[ma->len].iov_len = strlen(name) + 1;
2822         ma->len++;
2823
2824         sb = sbuf_new_auto();
2825         va_start(ap, fmt);
2826         sbuf_vprintf(sb, fmt, ap);
2827         va_end(ap);
2828         sbuf_finish(sb);
2829         len = sbuf_len(sb) + 1;
2830         maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
2831         SLIST_INSERT_HEAD(&ma->list, maa, next);
2832         bcopy(sbuf_data(sb), maa + 1, len);
2833         sbuf_delete(sb);
2834
2835         ma->v[ma->len].iov_base = maa + 1;
2836         ma->v[ma->len].iov_len = len;
2837         ma->len++;
2838
2839         return (ma);
2840 }
2841
2842 /*
2843  * Add an argument which is a userland string.
2844  */
2845 struct mntarg *
2846 mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
2847 {
2848         struct mntaarg *maa;
2849         char *tbuf;
2850
2851         if (val == NULL)
2852                 return (ma);
2853         if (ma == NULL) {
2854                 ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
2855                 SLIST_INIT(&ma->list);
2856         }
2857         if (ma->error)
2858                 return (ma);
2859         maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
2860         SLIST_INSERT_HEAD(&ma->list, maa, next);
2861         tbuf = (void *)(maa + 1);
2862         ma->error = copyinstr(val, tbuf, len, NULL);
2863         return (mount_arg(ma, name, tbuf, -1));
2864 }
2865
2866 /*
2867  * Plain argument.
2868  *
2869  * If length is -1, treat value as a C string.
2870  */
2871 struct mntarg *
2872 mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
2873 {
2874
2875         if (ma == NULL) {
2876                 ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
2877                 SLIST_INIT(&ma->list);
2878         }
2879         if (ma->error)
2880                 return (ma);
2881
2882         ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
2883             M_MOUNT, M_WAITOK);
2884         ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
2885         ma->v[ma->len].iov_len = strlen(name) + 1;
2886         ma->len++;
2887
2888         ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
2889         if (len < 0)
2890                 ma->v[ma->len].iov_len = strlen(val) + 1;
2891         else
2892                 ma->v[ma->len].iov_len = len;
2893         ma->len++;
2894         return (ma);
2895 }
2896
2897 /*
2898  * Free a mntarg structure
2899  */
2900 static void
2901 free_mntarg(struct mntarg *ma)
2902 {
2903         struct mntaarg *maa;
2904
2905         while (!SLIST_EMPTY(&ma->list)) {
2906                 maa = SLIST_FIRST(&ma->list);
2907                 SLIST_REMOVE_HEAD(&ma->list, next);
2908                 free(maa, M_MOUNT);
2909         }
2910         free(ma->v, M_MOUNT);
2911         free(ma, M_MOUNT);
2912 }
2913
2914 /*
2915  * Mount a filesystem
2916  */
2917 int
2918 kernel_mount(struct mntarg *ma, uint64_t flags)
2919 {
2920         struct uio auio;
2921         int error;
2922
2923         KASSERT(ma != NULL, ("kernel_mount NULL ma"));
2924         KASSERT(ma->error != 0 || ma->v != NULL, ("kernel_mount NULL ma->v"));
2925         KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
2926
2927         error = ma->error;
2928         if (error == 0) {
2929                 auio.uio_iov = ma->v;
2930                 auio.uio_iovcnt = ma->len;
2931                 auio.uio_segflg = UIO_SYSSPACE;
2932                 error = vfs_donmount(curthread, flags, &auio);
2933         }
2934         free_mntarg(ma);
2935         return (error);
2936 }
2937
/*
 * Map from mount options to printable formats.
 *
 * mount_devctl_event() walks this table until it reaches an entry whose
 * o_opt is 0, so the table must end with such an entry (presumably
 * supplied by MNTOPT_NAMES; verify against its definition).
 */
static struct mntoptnames optnames[] = {
	MNTOPT_NAMES
};
2942
2943 #define DEVCTL_LEN 1024
2944 static void
2945 mount_devctl_event(const char *type, struct mount *mp, bool donew)
2946 {
2947         const uint8_t *cp;
2948         struct mntoptnames *fp;
2949         struct sbuf sb;
2950         struct statfs *sfp = &mp->mnt_stat;
2951         char *buf;
2952
2953         buf = malloc(DEVCTL_LEN, M_MOUNT, M_NOWAIT);
2954         if (buf == NULL)
2955                 return;
2956         sbuf_new(&sb, buf, DEVCTL_LEN, SBUF_FIXEDLEN);
2957         sbuf_cpy(&sb, "mount-point=\"");
2958         devctl_safe_quote_sb(&sb, sfp->f_mntonname);
2959         sbuf_cat(&sb, "\" mount-dev=\"");
2960         devctl_safe_quote_sb(&sb, sfp->f_mntfromname);
2961         sbuf_cat(&sb, "\" mount-type=\"");
2962         devctl_safe_quote_sb(&sb, sfp->f_fstypename);
2963         sbuf_cat(&sb, "\" fsid=0x");
2964         cp = (const uint8_t *)&sfp->f_fsid.val[0];
2965         for (int i = 0; i < sizeof(sfp->f_fsid); i++)
2966                 sbuf_printf(&sb, "%02x", cp[i]);
2967         sbuf_printf(&sb, " owner=%u flags=\"", sfp->f_owner);
2968         for (fp = optnames; fp->o_opt != 0; fp++) {
2969                 if ((mp->mnt_flag & fp->o_opt) != 0) {
2970                         sbuf_cat(&sb, fp->o_name);
2971                         sbuf_putc(&sb, ';');
2972                 }
2973         }
2974         sbuf_putc(&sb, '"');
2975         sbuf_finish(&sb);
2976
2977         /*
2978          * Options are not published because the form of the options depends on
2979          * the file system and may include binary data. In addition, they don't
2980          * necessarily provide enough useful information to be actionable when
2981          * devd processes them.
2982          */
2983
2984         if (sbuf_error(&sb) == 0)
2985                 devctl_notify("VFS", "FS", type, sbuf_data(&sb));
2986         sbuf_delete(&sb);
2987         free(buf, M_MOUNT);
2988 }
2989
2990 /*
2991  * Force remount specified mount point to read-only.  The argument
2992  * must be busied to avoid parallel unmount attempts.
2993  *
2994  * Intended use is to prevent further writes if some metadata
2995  * inconsistency is detected.  Note that the function still flushes
2996  * all cached metadata and data for the mount point, which might be
2997  * not always suitable.
2998  */
2999 int
3000 vfs_remount_ro(struct mount *mp)
3001 {
3002         struct vfsoptlist *opts;
3003         struct vfsopt *opt;
3004         struct vnode *vp_covered, *rootvp;
3005         int error;
3006
3007         KASSERT(mp->mnt_lockref > 0,
3008             ("vfs_remount_ro: mp %p is not busied", mp));
3009         KASSERT((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0,
3010             ("vfs_remount_ro: mp %p is being unmounted (and busy?)", mp));
3011
3012         rootvp = NULL;
3013         vp_covered = mp->mnt_vnodecovered;
3014         error = vget(vp_covered, LK_EXCLUSIVE | LK_NOWAIT);
3015         if (error != 0)
3016                 return (error);
3017         VI_LOCK(vp_covered);
3018         if ((vp_covered->v_iflag & VI_MOUNT) != 0) {
3019                 VI_UNLOCK(vp_covered);
3020                 vput(vp_covered);
3021                 return (EBUSY);
3022         }
3023         vp_covered->v_iflag |= VI_MOUNT;
3024         VI_UNLOCK(vp_covered);
3025         vfs_op_enter(mp);
3026         vn_seqc_write_begin(vp_covered);
3027
3028         MNT_ILOCK(mp);
3029         if ((mp->mnt_flag & MNT_RDONLY) != 0) {
3030                 MNT_IUNLOCK(mp);
3031                 error = EBUSY;
3032                 goto out;
3033         }
3034         mp->mnt_flag |= MNT_UPDATE | MNT_FORCE | MNT_RDONLY;
3035         rootvp = vfs_cache_root_clear(mp);
3036         MNT_IUNLOCK(mp);
3037
3038         opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK | M_ZERO);
3039         TAILQ_INIT(opts);
3040         opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK | M_ZERO);
3041         opt->name = strdup("ro", M_MOUNT);
3042         opt->value = NULL;
3043         TAILQ_INSERT_TAIL(opts, opt, link);
3044         vfs_mergeopts(opts, mp->mnt_opt);
3045         mp->mnt_optnew = opts;
3046
3047         error = VFS_MOUNT(mp);
3048
3049         if (error == 0) {
3050                 MNT_ILOCK(mp);
3051                 mp->mnt_flag &= ~(MNT_UPDATE | MNT_FORCE);
3052                 MNT_IUNLOCK(mp);
3053                 vfs_deallocate_syncvnode(mp);
3054                 if (mp->mnt_opt != NULL)
3055                         vfs_freeopts(mp->mnt_opt);
3056                 mp->mnt_opt = mp->mnt_optnew;
3057         } else {
3058                 MNT_ILOCK(mp);
3059                 mp->mnt_flag &= ~(MNT_UPDATE | MNT_FORCE | MNT_RDONLY);
3060                 MNT_IUNLOCK(mp);
3061                 vfs_freeopts(mp->mnt_optnew);
3062         }
3063         mp->mnt_optnew = NULL;
3064
3065 out:
3066         vfs_op_exit(mp);
3067         VI_LOCK(vp_covered);
3068         vp_covered->v_iflag &= ~VI_MOUNT;
3069         VI_UNLOCK(vp_covered);
3070         vput(vp_covered);
3071         vn_seqc_write_end(vp_covered);
3072         if (rootvp != NULL) {
3073                 vn_seqc_write_end(rootvp);
3074                 vrele(rootvp);
3075         }
3076         return (error);
3077 }
3078
3079 /*
3080  * Suspend write operations on all local writeable filesystems.  Does
3081  * full sync of them in the process.
3082  *
3083  * Iterate over the mount points in reverse order, suspending most
3084  * recently mounted filesystems first.  It handles a case where a
3085  * filesystem mounted from a md(4) vnode-backed device should be
3086  * suspended before the filesystem that owns the vnode.
3087  */
3088 void
3089 suspend_all_fs(void)
3090 {
3091         struct mount *mp;
3092         int error;
3093
3094         mtx_lock(&mountlist_mtx);
3095         TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
3096                 error = vfs_busy(mp, MBF_MNTLSTLOCK | MBF_NOWAIT);
3097                 if (error != 0)
3098                         continue;
3099                 if ((mp->mnt_flag & (MNT_RDONLY | MNT_LOCAL)) != MNT_LOCAL ||
3100                     (mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
3101                         mtx_lock(&mountlist_mtx);
3102                         vfs_unbusy(mp);
3103                         continue;
3104                 }
3105                 error = vfs_write_suspend(mp, 0);
3106                 if (error == 0) {
3107                         MNT_ILOCK(mp);
3108                         MPASS((mp->mnt_kern_flag & MNTK_SUSPEND_ALL) == 0);
3109                         mp->mnt_kern_flag |= MNTK_SUSPEND_ALL;
3110                         MNT_IUNLOCK(mp);
3111                         mtx_lock(&mountlist_mtx);
3112                 } else {
3113                         printf("suspend of %s failed, error %d\n",
3114                             mp->mnt_stat.f_mntonname, error);
3115                         mtx_lock(&mountlist_mtx);
3116                         vfs_unbusy(mp);
3117                 }
3118         }
3119         mtx_unlock(&mountlist_mtx);
3120 }
3121
3122 void
3123 resume_all_fs(void)
3124 {
3125         struct mount *mp;
3126
3127         mtx_lock(&mountlist_mtx);
3128         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3129                 if ((mp->mnt_kern_flag & MNTK_SUSPEND_ALL) == 0)
3130                         continue;
3131                 mtx_unlock(&mountlist_mtx);
3132                 MNT_ILOCK(mp);
3133                 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) != 0);
3134                 mp->mnt_kern_flag &= ~MNTK_SUSPEND_ALL;
3135                 MNT_IUNLOCK(mp);
3136                 vfs_write_resume(mp, 0);
3137                 mtx_lock(&mountlist_mtx);
3138                 vfs_unbusy(mp);
3139         }
3140         mtx_unlock(&mountlist_mtx);
3141 }