sys/kern/vfs_subr.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  * (c) UNIX System Laboratories, Inc.
   7  * All or some portions of this file are derived from material licensed
   8  * to the University of California by American Telephone and Telegraph
   9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  10  * the permission of UNIX System Laboratories, Inc.
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  * 1. Redistributions of source code must retain the above copyright
  16  *    notice, this list of conditions and the following disclaimer.
  17  * 2. Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in the
  19  *    documentation and/or other materials provided with the distribution.
  20  * 3. Neither the name of the University nor the names of its contributors
  21  *    may be used to endorse or promote products derived from this software
  22  *    without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  *
  36  *      @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
  37  */
  38
  39 /*
  40  * External virtual filesystem routines
  41  */
  42
  43 #include <sys/cdefs.h>
  44 __FBSDID("$FreeBSD$");
  45
  46 #include "opt_ddb.h"
  47 #include "opt_watchdog.h"
  48
  49 #include <sys/param.h>
  50 #include <sys/systm.h>
  51 #include <sys/bio.h>
  52 #include <sys/buf.h>
  53 #include <sys/capsicum.h>
  54 #include <sys/condvar.h>
  55 #include <sys/conf.h>
  56 #include <sys/counter.h>
  57 #include <sys/dirent.h>
  58 #include <sys/event.h>
  59 #include <sys/eventhandler.h>
  60 #include <sys/extattr.h>
  61 #include <sys/file.h>
  62 #include <sys/fcntl.h>
  63 #include <sys/jail.h>
  64 #include <sys/kdb.h>
  65 #include <sys/kernel.h>
  66 #include <sys/kthread.h>
  67 #include <sys/ktr.h>
  68 #include <sys/lockf.h>
  69 #include <sys/malloc.h>
  70 #include <sys/mount.h>
  71 #include <sys/namei.h>
  72 #include <sys/pctrie.h>
  73 #include <sys/priv.h>
  74 #include <sys/reboot.h>
  75 #include <sys/refcount.h>
  76 #include <sys/rwlock.h>
  77 #include <sys/sched.h>
  78 #include <sys/sleepqueue.h>
  79 #include <sys/smp.h>
  80 #include <sys/stat.h>
  81 #include <sys/sysctl.h>
  82 #include <sys/syslog.h>
  83 #include <sys/vmmeter.h>
  84 #include <sys/vnode.h>
  85 #include <sys/watchdog.h>
  86
  87 #include <machine/stdarg.h>
  88
  89 #include <security/mac/mac_framework.h>
  90
  91 #include <vm/vm.h>
  92 #include <vm/vm_object.h>
  93 #include <vm/vm_extern.h>
  94 #include <vm/pmap.h>
  95 #include <vm/vm_map.h>
  96 #include <vm/vm_page.h>
  97 #include <vm/vm_kern.h>
  98 #include <vm/uma.h>
  99
 100 #ifdef DDB
 101 #include <ddb/ddb.h>
 102 #endif
 103
 104 static void     delmntque(struct vnode *vp);
 105 static int      flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
 106                     int slpflag, int slptimeo);
 107 static void     syncer_shutdown(void *arg, int howto);
 108 static int      vtryrecycle(struct vnode *vp);
 109 static void     v_init_counters(struct vnode *);
 110 static void     v_incr_devcount(struct vnode *);
 111 static void     v_decr_devcount(struct vnode *);
 112 static void     vgonel(struct vnode *);
 113 static void     vfs_knllock(void *arg);
 114 static void     vfs_knlunlock(void *arg);
 115 static void     vfs_knl_assert_locked(void *arg);
 116 static void     vfs_knl_assert_unlocked(void *arg);
 117 static void     vnlru_return_batches(struct vfsops *mnt_op);
 118 static void     destroy_vpollinfo(struct vpollinfo *vi);
 119 static int      v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
 120                     daddr_t startlbn, daddr_t endlbn);
 121
 122 /*
 123  * These fences are intended for cases where some synchronization is
 124  * needed between access of v_iflags and lockless vnode refcount (v_holdcnt
 125  * and v_usecount) updates.  Access to v_iflags is generally synchronized
 126  * by the interlock, but we have some internal assertions that check vnode
 127  * flags without acquiring the lock.  Thus, these fences are INVARIANTS-only
 128  * for now.
 129  */
 130 #ifdef INVARIANTS
     /* With INVARIANTS the macros are real acquire/release thread fences. */
 131 #define VNODE_REFCOUNT_FENCE_ACQ()      atomic_thread_fence_acq()
 132 #define VNODE_REFCOUNT_FENCE_REL()      atomic_thread_fence_rel()
 133 #else
     /* Without INVARIANTS both macros expand to nothing. */
 134 #define VNODE_REFCOUNT_FENCE_ACQ()
 135 #define VNODE_REFCOUNT_FENCE_REL()
 136 #endif
 137
 138 /*
 139  * Number of vnodes in existence.  Increased whenever getnewvnode()
 140  * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
 141  */
 142 static u_long __exclusive_cache_line numvnodes;
 143
 144 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
 145     "Number of vnodes in existence");
 146
     /* Statistics counter, exported read-only; allocated in vntblinit(). */
 147 static counter_u64_t vnodes_created;
 148 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
 149     "Number of vnodes created by getnewvnode");
 150
     /* Run-time tunable (vfs.mnt_free_list_batch); defaults to 128. */
 151 static u_long mnt_free_list_batch = 128;
 152 SYSCTL_ULONG(_vfs, OID_AUTO, mnt_free_list_batch, CTLFLAG_RW,
 153     &mnt_free_list_batch, 0, "Limit of vnodes held on mnt's free list");
 154
 155 /*
 156  * Conversion tables for conversion from vnode types to inode formats
 157  * and back.
 158  */
 159 enum vtype iftovt_tab[16] = {
             /*
              * NOTE(review): presumably indexed by the 4-bit file type taken
              * from an inode mode (hence 16 slots) -- confirm against users.
              */
 160         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 161         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
 162 };
 163 int vttoif_tab[10] = {
             /*
              * NOTE(review): presumably indexed by enum vtype, yielding the
              * matching S_IF* format bits -- confirm against users.
              */
 164         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 165         S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
 166 };
 167
 168 /*
 169  * List of vnodes that are ready for recycling.
      * Initialized in vntblinit(); guarded by vnode_free_list_mtx.
 170  */
 171 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
 172
 173 /*
 174  * "Free" vnode target.  Free vnodes are rarely completely free, but are
 175  * just ones that are cheap to recycle.  Usually they are for files which
 176  * have been stat'd but not read; these usually have inode and namecache
 177  * data attached to them.  This target is the preferred minimum size of a
 178  * sub-cache consisting mostly of such files. The system balances the size
 179  * of this sub-cache with its complement to try to prevent either from
 180  * thrashing while the other is relatively inactive.  The targets express
 181  * a preference for the best balance.
 182  *
 183  * "Above" this target there are 2 further targets (watermarks) related
 184  * to recycling of free vnodes.  In the best-operating case, the cache is
 185  * exactly full, the free list has size between vlowat and vhiwat above the
 186  * free target, and recycling from it and normal use maintains this state.
 187  * Sometimes the free list is below vlowat or even empty, but this state
 188  * is even better for immediate use provided the cache is not full.
 189  * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
 190  * ones) to reach one of these states.  The watermarks are currently hard-
 191  * coded as 4% and 9% of the available space higher.  These and the default
 192  * of 25% for wantfreevnodes are too large if the memory size is large.
 193  * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
 194  * whenever vnlru_proc() becomes active.
 195  */
 196 static u_long wantfreevnodes;
 197 SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
 198     &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
     /* Current count of "free" vnodes; exported read-only. */
 199 static u_long freevnodes;
 200 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
 201     &freevnodes, 0, "Number of \"free\" vnodes");
 202
     /* The counter_u64_t statistics below are allocated in vntblinit(). */
 203 static counter_u64_t recycles_count;
 204 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
 205     "Number of vnodes recycled to meet vnode cache targets");
 206
 207 static counter_u64_t recycles_free_count;
 208 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count,
 209     "Number of free vnodes recycled to meet vnode cache targets");
 210
 211 /*
 212  * Various variables used for debugging the new implementation of
 213  * reassignbuf().
 214  * XXX these are probably of (very) limited utility now.
 215  */
 216 static int reassignbufcalls;
 217 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW | CTLFLAG_STATS,
 218     &reassignbufcalls, 0, "Number of calls to reassignbuf");
 219
 220 static counter_u64_t free_owe_inact;
 221 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact,
 222     "Number of times free vnodes kept on active list due to VFS "
 223     "owing inactivation");
 224
 225 /* To keep more than one thread at a time from running vfs_getnewfsid */
 226 static struct mtx mntid_mtx;
 227
 228 /*
 229  * Lock for any access to the following:
 230  *      vnode_free_list
 231  *      numvnodes
 232  *      freevnodes
 233  */
 234 static struct mtx __exclusive_cache_line vnode_free_list_mtx;
 235
 236 /* Publicly exported FS */
 237 struct nfs_public nfs_pub;
 238
     /* Zone the bufobj pctrie nodes come from (buf_trie_alloc/buf_trie_free). */
 239 static uma_zone_t buf_trie_zone;
 240
 241 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
 242 static uma_zone_t vnode_zone;
     /* Zone of struct vpollinfo objects; created in vntblinit(). */
 243 static uma_zone_t vnodepoll_zone;
 244
 245 /*
 246  * The workitem queue.
 247  *
 248  * It is useful to delay writes of file data and filesystem metadata
 249  * for tens of seconds so that quickly created and deleted files need
 250  * not waste disk bandwidth being created and removed. To realize this,
 251  * we append vnodes to a "workitem" queue. When running with a soft
 252  * updates implementation, most pending metadata dependencies should
 253  * not wait for more than a few seconds. Thus, updates to mounted block
 254  * devices are delayed only about half the time that file data is delayed.
 255  * Similarly, directory updates are more critical, so are only delayed
 256  * about a third the time that file data is delayed. Thus, there are
 257  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 258  * one each second (driven off the filesystem syncer process). The
 259  * syncer_delayno variable indicates the next queue that is to be processed.
 260  * Items that need to be processed soon are placed in this queue:
 261  *
 262  *      syncer_workitem_pending[syncer_delayno]
 263  *
 264  * A delay of fifteen seconds is done by placing the request fifteen
 265  * entries later in the queue:
 266  *
 267  *      syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 268  *
 269  */
 270 static int syncer_delayno;
     /* Mask returned by hashinit() in vntblinit(); table size is mask + 1. */
 271 static long syncer_mask;
 272 LIST_HEAD(synclist, bufobj);
     /* Array of list heads allocated by hashinit() in vntblinit(). */
 273 static struct synclist *syncer_workitem_pending;
 274 /*
 275  * The sync_mtx protects:
 276  *      bo->bo_synclist
 277  *      sync_vnode_count
 278  *      syncer_delayno
 279  *      syncer_state
 280  *      syncer_workitem_pending
 281  *      syncer_worklist_len
 282  *      rushjob
 283  */
 284 static struct mtx sync_mtx;
 285 static struct cv sync_wakeup;
 286
 287 #define SYNCER_MAXDELAY         32
 288 static int syncer_maxdelay = SYNCER_MAXDELAY;   /* maximum delay time */
 289 static int syncdelay = 30;              /* max time to delay syncing data */
 290 static int filedelay = 30;              /* time to delay syncing files */
 291 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
 292     "Time to delay syncing files (in seconds)");
 293 static int dirdelay = 29;               /* time to delay syncing directories */
 294 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
 295     "Time to delay syncing directories (in seconds)");
 296 static int metadelay = 28;              /* time to delay syncing metadata */
 297 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
 298     "Time to delay syncing metadata (in seconds)");
 299 static int rushjob;             /* number of slots to run ASAP */
 300 static int stat_rush_requests;  /* number of times I/O speeded up */
 301 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
 302     "Number of times I/O speeded up (rush requests)");
 303
 304 /*
 305  * When shutting down the syncer, run it at four times normal speed.
 306  */
 307 #define SYNCER_SHUTDOWN_SPEEDUP         4
 308 static int sync_vnode_count;
 309 static int syncer_worklist_len;
     /* Syncer life-cycle state; listed above as protected by sync_mtx. */
 310 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
 311     syncer_state;
 312
 313 /* Target for maximum number of vnodes. */
 314 int desiredvnodes;
 315 static int gapvnodes;           /* gap between wanted and desired */
 316 static int vhiwat;              /* enough extras after expansion */
 317 static int vlowat;              /* minimal extras before expansion */
 318 static int vstir;               /* nonzero to stir non-free vnodes */
 319 static volatile int vsmalltrigger = 8;  /* pref to keep if > this many pages */
 320
 321 static int
 322 sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
 323 {
 324         int error, old_desiredvnodes;
 325
 326         old_desiredvnodes = desiredvnodes;
 327         if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
 328                 return (error);
 329         if (old_desiredvnodes != desiredvnodes) {
 330                 wantfreevnodes = desiredvnodes / 4;
 331                 /* XXX locking seems to be incomplete. */
 332                 vfs_hash_changesize(desiredvnodes);
 333                 cache_changesize(desiredvnodes);
 334         }
 335         return (0);
 336 }
 337
 338 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
 339     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
 340     sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
     /* kern.minvnodes exposes the same variable as vfs.wantfreevnodes. */
 341 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
 342     &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
     /* Debug statistic exported as debug.vnlru_nowhere. */
 343 static int vnlru_nowhere;
 344 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
 345     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
 346
 347 static int
 348 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
 349 {
 350         struct vnode *vp;
 351         struct nameidata nd;
 352         char *buf;
 353         unsigned long ndflags;
 354         int error;
 355
 356         if (req->newptr == NULL)
 357                 return (EINVAL);
 358         if (req->newlen >= PATH_MAX)
 359                 return (E2BIG);
 360
 361         buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
 362         error = SYSCTL_IN(req, buf, req->newlen);
 363         if (error != 0)
 364                 goto out;
 365
 366         buf[req->newlen] = '\0';
 367
 368         ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | NOCACHE | SAVENAME;
 369         NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread);
 370         if ((error = namei(&nd)) != 0)
 371                 goto out;
 372         vp = nd.ni_vp;
 373
 374         if (VN_IS_DOOMED(vp)) {
 375                 /*
 376                  * This vnode is being recycled.  Return != 0 to let the caller
 377                  * know that the sysctl had no effect.  Return EAGAIN because a
 378                  * subsequent call will likely succeed (since namei will create
 379                  * a new vnode if necessary)
 380                  */
 381                 error = EAGAIN;
 382                 goto putvnode;
 383         }
 384
 385         counter_u64_add(recycles_count, 1);
 386         vgone(vp);
 387 putvnode:
 388         NDFREE(&nd, 0);
 389 out:
 390         free(buf, M_TEMP);
 391         return (error);
 392 }
 393
 394 static int
 395 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
 396 {
 397         struct thread *td = curthread;
 398         struct vnode *vp;
 399         struct file *fp;
 400         int error;
 401         int fd;
 402
 403         if (req->newptr == NULL)
 404                 return (EBADF);
 405
 406         error = sysctl_handle_int(oidp, &fd, 0, req);
 407         if (error != 0)
 408                 return (error);
 409         error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
 410         if (error != 0)
 411                 return (error);
 412         vp = fp->f_vnode;
 413
 414         error = vn_lock(vp, LK_EXCLUSIVE);
 415         if (error != 0)
 416                 goto drop;
 417
 418         counter_u64_add(recycles_count, 1);
 419         vgone(vp);
 420         VOP_UNLOCK(vp);
 421 drop:
 422         fdrop(fp, td);
 423         return (error);
 424 }
 425
 426 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
 427     CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
 428     sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
     /*
      * Both debug knobs are write-only (CTLFLAG_WR) and have no backing
      * variable (arg1 is NULL); the handlers act on the written value.
      */
 429 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
 430     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
 431     sysctl_ftry_reclaim_vnode, "I",
 432     "Try to reclaim a vnode by its file descriptor");
 433
 434 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
 435 static int vnsz2log;
 436
 437 /*
 438  * Support for the bufobj clean & dirty pctrie.
 439  */
 440 static void *
 441 buf_trie_alloc(struct pctrie *ptree)
 442 {
 443
 444         return uma_zalloc(buf_trie_zone, M_NOWAIT);
 445 }
 446
 447 static void
 448 buf_trie_free(struct pctrie *ptree, void *node)
 449 {
 450
 451         uma_zfree(buf_trie_zone, node);
 452 }
 453 PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
 454
 455 /*
 456  * Initialize the vnode management data structures.
 457  *
 458  * Reevaluate the following cap on the number of vnodes after the physical
 459  * memory size exceeds 512GB.  In the limit, as the physical memory size
 460  * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
 461  */
     /* The cap can be overridden at build time by predefining MAXVNODES_MAX. */
 462 #ifndef MAXVNODES_MAX
 463 #define MAXVNODES_MAX   (512 * 1024 * 1024 / 64)        /* 8M */
 464 #endif
 465
     /* malloc type for the marker vnodes of vn_alloc_marker()/vn_free_marker(). */
 466 static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
 467
 468 static struct vnode *
 469 vn_alloc_marker(struct mount *mp)
 470 {
 471         struct vnode *vp;
 472
 473         vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
 474         vp->v_type = VMARKER;
 475         vp->v_mount = mp;
 476
 477         return (vp);
 478 }
 479
 480 static void
 481 vn_free_marker(struct vnode *vp)
 482 {
 483
 484         MPASS(vp->v_type == VMARKER);
 485         free(vp, M_VNODE_MARKER);
 486 }
 487
 488 /*
 489  * Initialize a vnode as it first enters the zone.
 490  */
 491 static int
 492 vnode_init(void *mem, int size, int flags)
 493 {
 494         struct vnode *vp;
 495
 496         vp = mem;
 497         bzero(vp, size);
 498         /*
 499          * Setup locks.
 500          */
 501         vp->v_vnlock = &vp->v_lock;
 502         mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
 503         /*
 504          * By default, don't allow shared locks unless filesystems opt-in.
 505          */
 506         lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
 507             LK_NOSHARE | LK_IS_VNODE);
 508         /*
 509          * Initialize bufobj.
 510          */
 511         bufobj_init(&vp->v_bufobj, vp);
 512         /*
 513          * Initialize namecache.
 514          */
 515         LIST_INIT(&vp->v_cache_src);
 516         TAILQ_INIT(&vp->v_cache_dst);
 517         /*
 518          * Initialize rangelocks.
 519          */
 520         rangelock_init(&vp->v_rl);
 521         return (0);
 522 }
 523
 524 /*
 525  * Free a vnode when it is cleared from the zone.
 526  */
 527 static void
 528 vnode_fini(void *mem, int size)
 529 {
 530         struct vnode *vp;
 531         struct bufobj *bo;
 532
 533         vp = mem;
 534         rangelock_destroy(&vp->v_rl);
 535         lockdestroy(vp->v_vnlock);
 536         mtx_destroy(&vp->v_interlock);
 537         bo = &vp->v_bufobj;
 538         rw_destroy(BO_LOCKPTR(bo));
 539 }
 540
 541 /*
 542  * Provide the size of NFS nclnode and NFS fh for calculation of the
 543  * vnode memory consumption.  The size is specified directly to
 544  * eliminate dependency on NFS-private header.
 545  *
 546  * Other filesystems may use bigger or smaller (like UFS and ZFS)
 547  * private inode data, but the NFS-based estimation is ample enough.
 548  * Still, we care about differences in the size between 64- and 32-bit
 549  * platforms.
 550  *
 551  * Namecache structure size is heuristically
 552  * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
 553  */
 554 #ifdef _LP64
     /* Estimates for 64-bit platforms. */
 555 #define NFS_NCLNODE_SZ  (528 + 64)
 556 #define NC_SZ           148
 557 #else
     /* Estimates for 32-bit platforms. */
 558 #define NFS_NCLNODE_SZ  (360 + 32)
 559 #define NC_SZ           92
 560 #endif
 561
 562 static void
 563 vntblinit(void *dummy __unused)
 564 {
             /*
              * One-time initialization, registered through the SYSINIT()
              * that follows this function: sizes the vnode cache, creates
              * the locks, UMA zones and counters declared earlier in this
              * file, and sets up the syncer work queues.
              */
 565         u_int i;
 566         int physvnodes, virtvnodes;
 567
 568         /*
 569          * Desiredvnodes is a function of the physical memory size and the
 570          * kernel's heap size.  Generally speaking, it scales with the
 571          * physical memory size.  The ratio of desiredvnodes to the physical
 572          * memory size is 1:16 until desiredvnodes exceeds 98,304.
 573          * Thereafter, the
 574          * marginal ratio of desiredvnodes to the physical memory size is
 575          * 1:64.  However, desiredvnodes is limited by the kernel's heap
 576          * size.  The memory required by desiredvnodes vnodes and vm objects
 577          * must not exceed 1/10th of the kernel's heap size.
 578          */
 579         physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
 580             3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
 581         virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
 582             sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
 583         desiredvnodes = min(physvnodes, virtvnodes);
             /* Clamp to the compile-time ceiling. */
 584         if (desiredvnodes > MAXVNODES_MAX) {
 585                 if (bootverbose)
 586                         printf("Reducing kern.maxvnodes %d -> %d\n",
 587                             desiredvnodes, MAXVNODES_MAX);
 588                 desiredvnodes = MAXVNODES_MAX;
 589         }
             /* The default free-vnode target is a quarter of the maximum. */
 590         wantfreevnodes = desiredvnodes / 4;
 591         mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
 592         TAILQ_INIT(&vnode_free_list);
 593         mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
             /* vnode_init()/vnode_fini() are this zone's init/fini hooks. */
 594         vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
 595             vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
 596         vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 597             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 598         /*
 599          * Preallocate enough nodes to support one-per buf so that
 600          * we can not fail an insert.  reassignbuf() callers can not
 601          * tolerate the insertion failure.
 602          */
 603         buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
 604             NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
 605             UMA_ZONE_NOFREE | UMA_ZONE_VM);
 606         uma_prealloc(buf_trie_zone, nbuf);
 607
             /* Statistics counters exported through the sysctls above. */
 608         vnodes_created = counter_u64_alloc(M_WAITOK);
 609         recycles_count = counter_u64_alloc(M_WAITOK);
 610         recycles_free_count = counter_u64_alloc(M_WAITOK);
 611         free_owe_inact = counter_u64_alloc(M_WAITOK);
 612
 613         /*
 614          * Initialize the filesystem syncer.
              * hashinit() stores the table mask in syncer_mask; the real
              * number of queues is then taken back as syncer_mask + 1.
 615          */
 616         syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
 617             &syncer_mask);
 618         syncer_maxdelay = syncer_mask + 1;
 619         mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
 620         cv_init(&sync_wakeup, "syncer");
             /*
              * vnsz2log = floor(log2(sizeof(struct vnode))): count the powers
              * of two that do not exceed the size, then subtract one.
              */
 621         for (i = 1; i <= sizeof(struct vnode); i <<= 1)
 622                 vnsz2log++;
 623         vnsz2log--;
 624 }
 625 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
 626
 627
 628 /*
 629  * Mark a mount point as busy. Used to synchronize access and to delay
 630  * unmounting. Eventually, mountlist_mtx is not released on failure.
 631  *
 632  * vfs_busy() is a custom lock, it can block the caller.
 633  * vfs_busy() only sleeps if the unmount is active on the mount point.
 634  * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
 635  * vnode belonging to mp.
 636  *
 637  * Lookup uses vfs_busy() to traverse mount points.
 638  * root fs                      var fs
 639  * / vnode lock         A       / vnode lock (/var)             D
 640  * /var vnode lock      B       /log vnode lock(/var/log)       E
 641  * vfs_busy lock        C       vfs_busy lock                   F
 642  *
 643  * Within each file system, the lock order is C->A->B and F->D->E.
 644  *
 645  * When traversing across mounts, the system follows that lock order:
 646  *
 647  *        C->A->B
 648  *              |
 649  *              +->F->D->E
 650  *
 651  * The lookup() process for namei("/var") illustrates the process:
 652  *  VOP_LOOKUP() obtains B while A is held
 653  *  vfs_busy() obtains a shared lock on F while A and B are held
 654  *  vput() releases lock on B
 655  *  vput() releases lock on A
 656  *  VFS_ROOT() obtains lock on D while shared lock on F is held
 657  *  vfs_unbusy() releases shared lock on F
 658  *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
 659  *    Attempt to lock A (instead of vp_crossmp) while D is held would
 660  *    violate the global order, causing deadlocks.
 661  *
 662  * dounmount() locks B while F is drained.
      *
      * Flags, as handled below:
      *   MBF_NOWAIT     - fail instead of sleeping when an unmount is in
      *                    progress.
      *   MBF_MNTLSTLOCK - the caller holds mountlist_mtx; it is dropped
      *                    before a successful return and around the sleep,
      *                    and is held again when ENOENT is returned.
      *
      * Returns 0 with both the reference and the lock-reference count of mp
      * incremented, or ENOENT if the mount point could not be busied.
 663  */
 664 int
 665 vfs_busy(struct mount *mp, int flags)
 666 {
 667
 668         MPASS((flags & ~MBF_MASK) == 0);
 669         CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
 670
             /*
              * Fast path: when vfs_op_thread_enter() succeeds both counts are
              * bumped through the pcpu helpers, without the mount interlock.
              */
 671         if (vfs_op_thread_enter(mp)) {
 672                 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
 673                 MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
 674                 MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
 675                 vfs_mp_count_add_pcpu(mp, ref, 1);
 676                 vfs_mp_count_add_pcpu(mp, lockref, 1);
 677                 vfs_op_thread_exit(mp);
 678                 if (flags & MBF_MNTLSTLOCK)
 679                         mtx_unlock(&mountlist_mtx);
 680                 return (0);
 681         }
 682
             /* Slow path: everything below runs under the mount interlock. */
 683         MNT_ILOCK(mp);
 684         vfs_assert_mount_counters(mp);
 685         MNT_REF(mp);
 686         /*
 687          * If mount point is currently being unmounted, sleep until the
 688          * mount point fate is decided.  If thread doing the unmounting fails,
 689          * it will clear MNTK_UNMOUNT flag before waking us up, indicating
 690          * that this mount point has survived the unmount attempt and vfs_busy
 691          * should retry.  Otherwise the unmounter thread will set MNTK_REFEXPIRE
 692          * flag in addition to MNTK_UNMOUNT, indicating that mount point is
 693          * about to be really destroyed.  vfs_busy needs to release its
 694          * reference on the mount point in this case and return with ENOENT,
 695          * telling the caller that the mount it tried to busy is no longer
 696          * valid.
 697          */
 698         while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 699                 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
 700                         MNT_REL(mp);
 701                         MNT_IUNLOCK(mp);
 702                         CTR1(KTR_VFS, "%s: failed busying before sleeping",
 703                             __func__);
 704                         return (ENOENT);
 705                 }
 706                 if (flags & MBF_MNTLSTLOCK)
 707                         mtx_unlock(&mountlist_mtx);
 708                 mp->mnt_kern_flag |= MNTK_MWAIT;
                     /*
                      * PDROP: the interlock is not held when msleep() returns,
                      * so it is re-taken below, after mountlist_mtx.
                      */
 709                 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
 710                 if (flags & MBF_MNTLSTLOCK)
 711                         mtx_lock(&mountlist_mtx);
 712                 MNT_ILOCK(mp);
 713         }
 714         if (flags & MBF_MNTLSTLOCK)
 715                 mtx_unlock(&mountlist_mtx);
 716         mp->mnt_lockref++;
 717         MNT_IUNLOCK(mp);
 718         return (0);
 719 }
 720
 721 /*
 722  * Free a busy filesystem.
 723  */
 724 void
 725 vfs_unbusy(struct mount *mp)
 726 {
 727         int c;
 728
 729         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 730
 731         if (vfs_op_thread_enter(mp)) {
 732                 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
 733                 vfs_mp_count_sub_pcpu(mp, lockref, 1);
 734                 vfs_mp_count_sub_pcpu(mp, ref, 1);
 735                 vfs_op_thread_exit(mp);
 736                 return;
 737         }
 738
 739         MNT_ILOCK(mp);
 740         vfs_assert_mount_counters(mp);
 741         MNT_REL(mp);
 742         c = --mp->mnt_lockref;
 743         if (mp->mnt_vfs_ops == 0) {
 744                 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
 745                 MNT_IUNLOCK(mp);
 746                 return;
 747         }
 748         if (c < 0)
 749                 vfs_dump_mount_counters(mp);
 750         if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
 751                 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
 752                 CTR1(KTR_VFS, "%s: waking up waiters", __func__);
 753                 mp->mnt_kern_flag &= ~MNTK_DRAINING;
 754                 wakeup(&mp->mnt_lockref);
 755         }
 756         MNT_IUNLOCK(mp);
 757 }
 758
 759 /*
 760  * Lookup a mount point by filesystem identifier.
 761  */
 762 struct mount *
 763 vfs_getvfs(fsid_t *fsid)
 764 {
 765         struct mount *mp;
 766
 767         CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
 768         mtx_lock(&mountlist_mtx);
 769         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 770                 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
 771                     mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
 772                         vfs_ref(mp);
 773                         mtx_unlock(&mountlist_mtx);
 774                         return (mp);
 775                 }
 776         }
 777         mtx_unlock(&mountlist_mtx);
 778         CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
 779         return ((struct mount *) 0);
 780 }
 781
 782 /*
 783  * Lookup a mount point by filesystem identifier, busying it before
 784  * returning.
 785  *
 786  * To avoid congestion on mountlist_mtx, implement simple direct-mapped
 787  * cache for popular filesystem identifiers.  The cache is lockless, using
 788  * the fact that struct mount's are never freed.  In worst case we may
 789  * get pointer to unmounted or even different filesystem, so we have to
 790  * check what we got, and go slow way if so.
      *
      * Returns the busied mount point (release it with vfs_unbusy()), or
      * NULL if no mount matches or the matching mount could not be busied.
 791  */
 792 struct mount *
 793 vfs_busyfs(fsid_t *fsid)
 794 {
 795 #define FSID_CACHE_SIZE 256
 796         typedef struct mount * volatile vmp_t;
 797         static vmp_t cache[FSID_CACHE_SIZE];
 798         struct mount *mp;
 799         int error;
 800         uint32_t hash;
 801
 802         CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
             /* Fold both fsid words into an index into the cache array. */
 803         hash = fsid->val[0] ^ fsid->val[1];
 804         hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
 805         mp = cache[hash];
 806         if (mp == NULL ||
 807             mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
 808             mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
 809                 goto slow;
             /* The cached mount cannot be busied: forget it and go slow. */
 810         if (vfs_busy(mp, 0) != 0) {
 811                 cache[hash] = NULL;
 812                 goto slow;
 813         }
             /* Check the fsid again now that the mount is busied. */
 814         if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
 815             mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
 816                 return (mp);
 817         else
 818             vfs_unbusy(mp);
 819
 820 slow:
             /* Slow path: walk the mount list under mountlist_mtx. */
 821         mtx_lock(&mountlist_mtx);
 822         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 823                 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
 824                     mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
                             /*
                              * With MBF_MNTLSTLOCK, vfs_busy() drops
                              * mountlist_mtx on success only; on failure it
                              * is still held and is released here.
                              */
 825                         error = vfs_busy(mp, MBF_MNTLSTLOCK);
 826                         if (error) {
 827                                 cache[hash] = NULL;
 828                                 mtx_unlock(&mountlist_mtx);
 829                                 return (NULL);
 830                         }
                             /* Remember the hit for later lookups. */
 831                         cache[hash] = mp;
 832                         return (mp);
 833                 }
 834         }
 835         CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
 836         mtx_unlock(&mountlist_mtx);
 837         return ((struct mount *) 0);
 838 }
 839
 840 /*
 841  * Check if a user can access privileged mount options.
 842  */
 843 int
 844 vfs_suser(struct mount *mp, struct thread *td)
 845 {
 846         int error;
 847
 848         if (jailed(td->td_ucred)) {
 849                 /*
 850                  * If the jail of the calling thread lacks permission for
 851                  * this type of file system, deny immediately.
 852                  */
 853                 if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag))
 854                         return (EPERM);
 855
 856                 /*
 857                  * If the file system was mounted outside the jail of the
 858                  * calling thread, deny immediately.
 859                  */
 860                 if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
 861                         return (EPERM);
 862         }
 863
 864         /*
 865          * If file system supports delegated administration, we don't check
 866          * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
 867          * by the file system itself.
 868          * If this is not the user that did original mount, we check for
 869          * the PRIV_VFS_MOUNT_OWNER privilege.
 870          */
 871         if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
 872             mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
 873                 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
 874                         return (error);
 875         }
 876         return (0);
 877 }
 878
 879 /*
 880  * Get a new unique fsid.  Try to make its val[0] unique, since this value
 881  * will be used to create fake device numbers for stat().  Also try (but
 882  * not so hard) make its val[0] unique mod 2^16, since some emulators only
 883  * support 16-bit device numbers.  We end up with unique val[0]'s for the
 884  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 885  *
 886  * Keep in mind that several mounts may be running in parallel.  Starting
 887  * the search one past where the previous search terminated is both a
 888  * micro-optimization and a defense against returning the same fsid to
 889  * different mounts.
 890  */
 891 void
 892 vfs_getnewfsid(struct mount *mp)
 893 {
 894         static uint16_t mntid_base;
 895         struct mount *nmp;
 896         fsid_t tfsid;
 897         int mtype;
 898
 899         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 900         mtx_lock(&mntid_mtx);
 901         mtype = mp->mnt_vfc->vfc_typenum;
 902         tfsid.val[1] = mtype;
 903         mtype = (mtype & 0xFF) << 24;
 904         for (;;) {
 905                 tfsid.val[0] = makedev(255,
 906                     mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
 907                 mntid_base++;
 908                 if ((nmp = vfs_getvfs(&tfsid)) == NULL)
 909                         break;
 910                 vfs_rel(nmp);
 911         }
 912         mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
 913         mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
 914         mtx_unlock(&mntid_mtx);
 915 }
 916
 917 /*
 918  * Knob to control the precision of file timestamps:
 919  *
 920  *   0 = seconds only; nanoseconds zeroed.
 921  *   1 = seconds and nanoseconds, accurate within 1/HZ.
 922  *   2 = seconds and nanoseconds, truncated to microseconds.
 923  * >=3 = seconds and nanoseconds, maximum precision.
 924  */
 925 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 926
     /*
      * Current precision selection, read by vfs_timestamp().  It starts out
      * as TSP_USEC and is exported read-write through the sysctl declared
      * right below.
      */
 927 static int timestamp_precision = TSP_USEC;
 928 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
 929     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
 930     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
 931     "3+: sec + ns (max. precision))");
 932
 933 /*
 934  * Get a current timestamp.
 935  */
 936 void
 937 vfs_timestamp(struct timespec *tsp)
 938 {
 939         struct timeval tv;
 940
 941         switch (timestamp_precision) {
 942         case TSP_SEC:
 943                 tsp->tv_sec = time_second;
 944                 tsp->tv_nsec = 0;
 945                 break;
 946         case TSP_HZ:
 947                 getnanotime(tsp);
 948                 break;
 949         case TSP_USEC:
 950                 microtime(&tv);
 951                 TIMEVAL_TO_TIMESPEC(&tv, tsp);
 952                 break;
 953         case TSP_NSEC:
 954         default:
 955                 nanotime(tsp);
 956                 break;
 957         }
 958 }
 959
 960 /*
 961  * Set vnode attributes to VNOVAL
 962  */
 963 void
 964 vattr_null(struct vattr *vap)
 965 {
 966
 967         vap->va_type = VNON;
 968         vap->va_size = VNOVAL;
 969         vap->va_bytes = VNOVAL;
 970         vap->va_mode = VNOVAL;
 971         vap->va_nlink = VNOVAL;
 972         vap->va_uid = VNOVAL;
 973         vap->va_gid = VNOVAL;
 974         vap->va_fsid = VNOVAL;
 975         vap->va_fileid = VNOVAL;
 976         vap->va_blocksize = VNOVAL;
 977         vap->va_rdev = VNOVAL;
 978         vap->va_atime.tv_sec = VNOVAL;
 979         vap->va_atime.tv_nsec = VNOVAL;
 980         vap->va_mtime.tv_sec = VNOVAL;
 981         vap->va_mtime.tv_nsec = VNOVAL;
 982         vap->va_ctime.tv_sec = VNOVAL;
 983         vap->va_ctime.tv_nsec = VNOVAL;
 984         vap->va_birthtime.tv_sec = VNOVAL;
 985         vap->va_birthtime.tv_nsec = VNOVAL;
 986         vap->va_flags = VNOVAL;
 987         vap->va_gen = VNOVAL;
 988         vap->va_vaflags = 0;
 989 }
 990
 991 /*
 992  * This routine is called when we have too many vnodes.  It attempts
 993  * to free <count> vnodes and will potentially free vnodes that still
 994  * have VM backing store (VM backing store is typically the cause
 995  * of a vnode blowout so we want to do this).  Therefore, this operation
 996  * is not considered cheap.
 997  *
 998  * A number of conditions may prevent a vnode from being reclaimed.
 999  * the buffer cache may have references on the vnode, a directory
1000  * vnode may still have references due to the namei cache representing
1001  * underlying files, or the vnode may be in active use.   It is not
1002  * desirable to reuse such vnodes.  These conditions may cause the
1003  * number of vnodes to reach some minimum value regardless of what
1004  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
1005  *
1006  * @param mp             Try to reclaim vnodes from this mountpoint
1007  * @param reclaim_nc_src Only reclaim directories with outgoing namecache
1008  *                       entries if this argument is true
1009  * @param trigger        Only reclaim vnodes with fewer than this many resident
1010  *                       pages.
1011  * @return               The number of vnodes that were reclaimed.
1012  */
1013 static int
1014 vlrureclaim(struct mount *mp, bool reclaim_nc_src, int trigger)
1015 {
1016         struct vnode *vp;
1017         int count, done, target;
1018
1019         done = 0;
             /* Paired with the vn_finished_write() call at the end. */
1020         vn_start_write(NULL, &mp, V_WAIT);
1021         MNT_ILOCK(mp);
1022         count = mp->mnt_nvnodelistsize;
             /*
              * Reclaim goal: this mount's vnode count scaled by
              * gapvnodes / desiredvnodes, divided by ten, plus one so the goal
              * is never zero.  imax(..., 1) keeps the divisor non-zero.
              */
1023         target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
1024         target = target / 10 + 1;
             /*
              * count is decremented once per vnode examined, so the scan looks
              * at no more vnodes than were on the list when it was sampled.
              */
1025         while (count != 0 && done < target) {
1026                 vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
                     /* Step over VMARKER entries at the head of the list. */
1027                 while (vp != NULL && vp->v_type == VMARKER)
1028                         vp = TAILQ_NEXT(vp, v_nmntvnodes);
1029                 if (vp == NULL)
1030                         break;
1031                 /*
1032                  * XXX LRU is completely broken for non-free vnodes.  First
1033                  * by calling here in mountpoint order, then by moving
1034                  * unselected vnodes to the end here, and most grossly by
1035                  * removing the vlruvp() function that was supposed to
1036                  * maintain the order.  (This function was born broken
1037                  * since syncer problems prevented it doing anything.)  The
1038                  * order is closer to LRC (C = Created).
1039                  *
1040                  * LRU reclaiming of vnodes seems to have last worked in
1041                  * FreeBSD-3 where LRU wasn't mentioned under any spelling.
1042                  * Then there was no hold count, and inactive vnodes were
1043                  * simply put on the free list in LRU order.  The separate
1044                  * lists also break LRU.  We prefer to reclaim from the
1045                  * free list for technical reasons.  This tends to thrash
1046                  * the free list to keep very unrecently used held vnodes.
1047                  * The problem is mitigated by keeping the free list large.
1048                  */
1049                 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1050                 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1051                 --count;
1052                 if (!VI_TRYLOCK(vp))
1053                         goto next_iter;
1054                 /*
1055                  * If it's been deconstructed already, it's still
1056                  * referenced, or it exceeds the trigger, skip it.
1057                  * Also skip free vnodes.  We are trying to make space
1058                  * to expand the free list, not reduce it.
1059                  */
1060                 if (vp->v_usecount ||
1061                     (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
1062                     ((vp->v_iflag & VI_FREE) != 0) ||
1063                     VN_IS_DOOMED(vp) || (vp->v_object != NULL &&
1064                     vp->v_object->resident_page_count > trigger)) {
1065                         VI_UNLOCK(vp);
1066                         goto next_iter;
1067                 }
                     /*
                      * From here until relock_mnt the mount interlock is not
                      * held; every exit below uses next_iter_mntunlocked.
                      */
1068                 MNT_IUNLOCK(mp);
1069                 vholdl(vp);
1070                 if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
1071                         vdrop(vp);
1072                         goto next_iter_mntunlocked;
1073                 }
1074                 VI_LOCK(vp);
1075                 /*
1076                  * v_usecount may have been bumped after VOP_LOCK() dropped
1077                  * the vnode interlock and before it was locked again.
1078                  *
1079                  * It is not necessary to recheck VIRF_DOOMED because it can
1080                  * only be set by another thread that holds both the vnode
1081                  * lock and vnode interlock.  If another thread has the
1082                  * vnode lock before we get to VOP_LOCK() and obtains the
1083                  * vnode interlock after VOP_LOCK() drops the vnode
1084                  * interlock, the other thread will be unable to drop the
1085                  * vnode lock before our VOP_LOCK() call fails.
1086                  */
1087                 if (vp->v_usecount ||
1088                     (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
1089                     (vp->v_object != NULL &&
1090                     vp->v_object->resident_page_count > trigger)) {
1091                         VOP_UNLOCK(vp);
1092                         vdropl(vp);
1093                         goto next_iter_mntunlocked;
1094                 }
1095                 KASSERT(!VN_IS_DOOMED(vp),
1096                     ("VIRF_DOOMED unexpectedly detected in vlrureclaim()"));
1097                 counter_u64_add(recycles_count, 1);
1098                 vgonel(vp);
1099                 VOP_UNLOCK(vp);
1100                 vdropl(vp);
1101                 done++;
                     /*
                      * Reached with the mount interlock dropped: yield if asked
                      * to, then retake the interlock at relock_mnt.
                      */
1102 next_iter_mntunlocked:
1103                 if (!should_yield())
1104                         goto relock_mnt;
1105                 goto yield;
                     /*
                      * Reached with the mount interlock still held: it is only
                      * dropped when we are about to yield.
                      */
1106 next_iter:
1107                 if (!should_yield())
1108                         continue;
1109                 MNT_IUNLOCK(mp);
1110 yield:
1111                 kern_yield(PRI_USER);
1112 relock_mnt:
1113                 MNT_ILOCK(mp);
1114         }
1115         MNT_IUNLOCK(mp);
1116         vn_finished_write(mp);
1117         return done;
1118 }
1119
/*
 * Upper bound that vnlru_free_locked() applies to its count argument.
 * Exported read-write through the sysctl declared right below.
 */
1120 static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
1121 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
1122     0,
1123     "limit on vnode free requests per call to the vnlru_free routine");
1124
1125 /*
1126  * Attempt to reduce the free list by the requested amount.
      *
      * Must be called with vnode_free_list_mtx held.  The mutex is dropped
      * and retaken around each recycle attempt and around the one-time call
      * to vnlru_return_batches(); it is held again on return.
      *
      * count is capped at max_vnlru_free.  Every loop iteration consumes one
      * unit of count, including iterations that only skip a vnode.
      *
      * When mnt_op is not NULL, vnodes whose mount uses a different vfsops
      * are put back at the tail of the free list instead of being recycled.
1127  */
1128 static void
1129 vnlru_free_locked(int count, struct vfsops *mnt_op)
1130 {
1131         struct vnode *vp;
1132         struct mount *mp;
1133         bool tried_batches;
1134
1135         tried_batches = false;
1136         mtx_assert(&vnode_free_list_mtx, MA_OWNED);
1137         if (count > max_vnlru_free)
1138                 count = max_vnlru_free;
1139         for (; count > 0; count--) {
1140                 vp = TAILQ_FIRST(&vnode_free_list);
1141                 /*
1142                  * The list can be modified while the free_list_mtx
1143                  * has been dropped and vp could be NULL here.
                      * On the first empty list, ask the mounts to hand back
                      * their batched free vnodes and retry; on the second,
                      * give up.
1144                  */
1145                 if (vp == NULL) {
1146                         if (tried_batches)
1147                                 break;
1148                         mtx_unlock(&vnode_free_list_mtx);
1149                         vnlru_return_batches(mnt_op);
1150                         tried_batches = true;
1151                         mtx_lock(&vnode_free_list_mtx);
1152                         continue;
1153                 }
1154
1155                 VNASSERT(vp->v_op != NULL, vp,
1156                     ("vnlru_free: vnode already reclaimed."));
1157                 KASSERT((vp->v_iflag & VI_FREE) != 0,
1158                     ("Removing vnode not on freelist"));
1159                 KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
1160                     ("Mangling active vnode"));
1161                 TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
1162
1163                 /*
1164                  * Don't recycle if our vnode is from different type
1165                  * of mount point.  Note that mp is type-safe, the
1166                  * check does not reach unmapped address even if
1167                  * vnode is reclaimed.
1168                  * Don't recycle if we can't get the interlock without
1169                  * blocking.
1170                  */
1171                 if ((mnt_op != NULL && (mp = vp->v_mount) != NULL &&
1172                     mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) {
1173                         TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
1174                         continue;
1175                 }
1176                 VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
1177                     vp, ("vp inconsistent on freelist"));
1178
1179                 /*
1180                  * The clear of VI_FREE prevents activation of the
1181                  * vnode.  There is no sense in putting the vnode on
1182                  * the mount point active list, only to remove it
1183                  * later during recycling.  Inline the relevant part
1184                  * of vholdl(), to avoid triggering assertions or
1185                  * activating.
1186                  */
1187                 freevnodes--;
1188                 vp->v_iflag &= ~VI_FREE;
1189                 VNODE_REFCOUNT_FENCE_REL();
1190                 refcount_acquire(&vp->v_holdcnt);
1191
                     /* The recycle attempt runs without the free list mutex. */
1192                 mtx_unlock(&vnode_free_list_mtx);
1193                 VI_UNLOCK(vp);
1194                 vtryrecycle(vp);
1195                 /*
1196                  * If the recycle succeeded this vdrop will actually free
1197                  * the vnode.  If not it will simply place it back on
1198                  * the free list.
1199                  */
1200                 vdrop(vp);
1201                 mtx_lock(&vnode_free_list_mtx);
1202         }
1203 }
1204
/*
 * Convenience wrapper around vnlru_free_locked() for callers that do not
 * hold vnode_free_list_mtx: take the mutex, forward count and mnt_op
 * unchanged, and release the mutex again.
 */
1205 void
1206 vnlru_free(int count, struct vfsops *mnt_op)
1207 {
1208
1209         mtx_lock(&vnode_free_list_mtx);
1210         vnlru_free_locked(count, mnt_op);
1211         mtx_unlock(&vnode_free_list_mtx);
1212 }
1213
1214
1215 /* XXX some names and initialization are bad for limits and watermarks. */
1216 static int
1217 vspace(void)
1218 {
1219         u_long rnumvnodes, rfreevnodes;
1220         int space;
1221
1222         gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
1223         vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
1224         vlowat = vhiwat / 2;
1225         rnumvnodes = atomic_load_long(&numvnodes);
1226         rfreevnodes = atomic_load_long(&freevnodes);
1227         if (rnumvnodes > desiredvnodes)
1228                 return (0);
1229         space = desiredvnodes - rnumvnodes;
1230         if (freevnodes > wantfreevnodes)
1231                 space += rfreevnodes - wantfreevnodes;
1232         return (space);
1233 }
1234
1235 static void
1236 vnlru_return_batch_locked(struct mount *mp)
1237 {
1238         struct vnode *vp;
1239
1240         mtx_assert(&mp->mnt_listmtx, MA_OWNED);
1241
1242         if (mp->mnt_tmpfreevnodelistsize == 0)
1243                 return;
1244
1245         TAILQ_FOREACH(vp, &mp->mnt_tmpfreevnodelist, v_actfreelist) {
1246                 VNASSERT((vp->v_mflag & VMP_TMPMNTFREELIST) != 0, vp,
1247                     ("vnode without VMP_TMPMNTFREELIST on mnt_tmpfreevnodelist"));
1248                 vp->v_mflag &= ~VMP_TMPMNTFREELIST;
1249         }
1250         mtx_lock(&vnode_free_list_mtx);
1251         TAILQ_CONCAT(&vnode_free_list, &mp->mnt_tmpfreevnodelist, v_actfreelist);
1252         freevnodes += mp->mnt_tmpfreevnodelistsize;
1253         mtx_unlock(&vnode_free_list_mtx);
1254         mp->mnt_tmpfreevnodelistsize = 0;
1255 }
1256
/*
 * Wrapper around vnlru_return_batch_locked() for callers that do not hold
 * mp->mnt_listmtx: take the mutex, return the mount's batch, release it.
 */
1257 static void
1258 vnlru_return_batch(struct mount *mp)
1259 {
1260
1261         mtx_lock(&mp->mnt_listmtx);
1262         vnlru_return_batch_locked(mp);
1263         mtx_unlock(&mp->mnt_listmtx);
1264 }
1265
1266 static void
1267 vnlru_return_batches(struct vfsops *mnt_op)
1268 {
1269         struct mount *mp, *nmp;
1270         bool need_unbusy;
1271
1272         mtx_lock(&mountlist_mtx);
1273         for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
1274                 need_unbusy = false;
1275                 if (mnt_op != NULL && mp->mnt_op != mnt_op)
1276                         goto next;
1277                 if (mp->mnt_tmpfreevnodelistsize == 0)
1278                         goto next;
1279                 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) == 0) {
1280                         vnlru_return_batch(mp);
1281                         need_unbusy = true;
1282                         mtx_lock(&mountlist_mtx);
1283                 }
1284 next:
1285                 nmp = TAILQ_NEXT(mp, mnt_list);
1286                 if (need_unbusy)
1287                         vfs_unbusy(mp);
1288         }
1289         mtx_unlock(&mountlist_mtx);
1290 }
1291
1292 /*
1293  * Attempt to recycle vnodes in a context that is always safe to block.
1294  * Calling vlrureclaim() from the bowels of filesystem code has some
1295  * interesting deadlock problems.
1296  */
     /*
      * The vnlru process.  The pointer doubles as the channel that
      * vnlru_proc() sleeps on and that other threads pass to wakeup().
      */
1297 static struct proc *vnlruproc;
     /*
      * Set to 1 by a thread just before it wakes the vnlru process and
      * cleared by vnlru_proc() before it goes to sleep.  Its address is the
      * channel on which getnewvnode_wait() sleeps.
      */
1298 static int vnlruproc_sig;
1299
/*
 * Main loop of the vnlru kernel process.  Each pass first trims the free
 * list if numvnodes exceeds desiredvnodes, then either sleeps (when
 * vspace() reports at least vlowat of room and no forced pass is pending)
 * or calls vlrureclaim() on every mount that can be busied without
 * waiting.  A pass that reclaims nothing escalates "force", which relaxes
 * the reclaim criteria on the next pass.  The loop never returns.
 */
1300 static void
1301 vnlru_proc(void)
1302 {
1303         u_long rnumvnodes, rfreevnodes;
1304         struct mount *mp, *nmp;
1305         unsigned long onumvnodes;
1306         int done, force, trigger, usevnodes, vsp;
1307         bool reclaim_nc_src;
1308
1309         EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
1310             SHUTDOWN_PRI_FIRST);
1311
1312         force = 0;
1313         for (;;) {
1314                 kproc_suspend_check(vnlruproc);
1315                 mtx_lock(&vnode_free_list_mtx);
1316                 rnumvnodes = atomic_load_long(&numvnodes);
1317                 /*
1318                  * If numvnodes is too large (due to desiredvnodes being
1319                  * adjusted using its sysctl, or emergency growth), first
1320                  * try to reduce it by discarding from the free list.
1321                  */
1322                 if (rnumvnodes > desiredvnodes)
1323                         vnlru_free_locked(rnumvnodes - desiredvnodes, NULL);
1324                 /*
1325                  * Sleep if the vnode cache is in a good state.  This is
1326                  * when it is not over-full and has space for about a 4%
1327                  * or 9% expansion (by growing its size or inexcessively
1328                  * reducing its free list).  Otherwise, try to reclaim
1329                  * space for a 10% expansion.
1330                  */
                     /* A pending vstir request forces one pass even if idle. */
1331                 if (vstir && force == 0) {
1332                         force = 1;
1333                         vstir = 0;
1334                 }
1335                 vsp = vspace();
1336                 if (vsp >= vlowat && force == 0) {
                             /*
                              * Clear the signal, release threads sleeping on
                              * it, and sleep for up to hz ticks.  PDROP makes
                              * msleep() return with the mutex released.
                              */
1337                         vnlruproc_sig = 0;
1338                         wakeup(&vnlruproc_sig);
1339                         msleep(vnlruproc, &vnode_free_list_mtx,
1340                             PVFS|PDROP, "vlruwt", hz);
1341                         continue;
1342                 }
1343                 mtx_unlock(&vnode_free_list_mtx);
1344                 done = 0;
1345                 rnumvnodes = atomic_load_long(&numvnodes);
1346                 rfreevnodes = atomic_load_long(&freevnodes);
1347
1348                 onumvnodes = rnumvnodes;
1349                 /*
1350                  * Calculate parameters for recycling.  These are the same
1351                  * throughout the loop to give some semblance of fairness.
1352                  * The trigger point is to avoid recycling vnodes with lots
1353                  * of resident pages.  We aren't trying to free memory; we
1354                  * are trying to recycle or at least free vnodes.
1355                  */
1356                 if (rnumvnodes <= desiredvnodes)
1357                         usevnodes = rnumvnodes - rfreevnodes;
1358                 else
1359                         usevnodes = rnumvnodes;
1360                 if (usevnodes <= 0)
1361                         usevnodes = 1;
1362                 /*
1363                  * The trigger value is chosen to give a conservatively
1364                  * large value to ensure that it alone doesn't prevent
1365                  * making progress.  The value can easily be so large that
1366                  * it is effectively infinite in some congested and
1367                  * misconfigured cases, and this is necessary.  Normally
1368                  * it is about 8 to 100 (pages), which is quite large.
1369                  */
1370                 trigger = vm_cnt.v_page_count * 2 / usevnodes;
                     /*
                      * Escalation levels: below 2 the small trigger is used
                      * instead of the computed one; from 3 on, directories
                      * with outgoing namecache entries may be reclaimed too.
                      */
1371                 if (force < 2)
1372                         trigger = vsmalltrigger;
1373                 reclaim_nc_src = force >= 3;
1374                 mtx_lock(&mountlist_mtx);
1375                 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
1376                         if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
1377                                 nmp = TAILQ_NEXT(mp, mnt_list);
1378                                 continue;
1379                         }
1380                         done += vlrureclaim(mp, reclaim_nc_src, trigger);
                             /* Retake the list lock before reading the successor. */
1381                         mtx_lock(&mountlist_mtx);
1382                         nmp = TAILQ_NEXT(mp, mnt_list);
1383                         vfs_unbusy(mp);
1384                 }
1385                 mtx_unlock(&mountlist_mtx);
                     /* Drain UMA once this pass brought us back under the limit. */
1386                 if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
1387                         uma_reclaim(UMA_RECLAIM_DRAIN);
1388                 if (done == 0) {
                             /*
                              * Nothing reclaimed: retry at once with force 2,
                              * then 3; after that count the failure and pause
                              * for up to three seconds.
                              */
1389                         if (force == 0 || force == 1) {
1390                                 force = 2;
1391                                 continue;
1392                         }
1393                         if (force == 2) {
1394                                 force = 3;
1395                                 continue;
1396                         }
1397                         force = 0;
1398                         vnlru_nowhere++;
1399                         tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
1400                 } else
1401                         kern_yield(PRI_USER);
1402                 /*
1403                  * After becoming active to expand above low water, keep
1404                  * active until above high water.
1405                  */
1406                 vsp = vspace();
1407                 force = vsp < vhiwat;
1408         }
1409 }
1410
/*
 * Descriptor passed to kproc_start() by the SYSINIT below.  It carries the
 * string "vnlru", the vnlru_proc() entry point and the address of
 * vnlruproc (presumably where the created process pointer is stored --
 * confirm against struct kproc_desc).
 */
1411 static struct kproc_desc vnlru_kp = {
1412         "vnlru",
1413         vnlru_proc,
1414         &vnlruproc
1415 };
1416 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
1417     &vnlru_kp);
1418
1419 /*
1420  * Routines having to do with the management of the vnode table.
1421  */
1422
1423 /*
1424  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
1425  * before we actually vgone().  This function must be called with the vnode
1426  * held to prevent the vnode from being returned to the free list midway
1427  * through vgone().
1428  */
1429 static int
1430 vtryrecycle(struct vnode *vp)
1431 {
1432         struct mount *vnmp;
1433
1434         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
1435         VNASSERT(vp->v_holdcnt, vp,
1436             ("vtryrecycle: Recycling vp %p without a reference.", vp));
1437         /*
1438          * This vnode may found and locked via some other list, if so we
1439          * can't recycle it yet.
1440          */
1441         if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
1442                 CTR2(KTR_VFS,
1443                     "%s: impossible to recycle, vp %p lock is already held",
1444                     __func__, vp);
1445                 return (EWOULDBLOCK);
1446         }
1447         /*
1448          * Don't recycle if its filesystem is being suspended.
1449          */
1450         if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
1451                 VOP_UNLOCK(vp);
1452                 CTR2(KTR_VFS,
1453                     "%s: impossible to recycle, cannot start the write for %p",
1454                     __func__, vp);
1455                 return (EBUSY);
1456         }
1457         /*
1458          * If we got this far, we need to acquire the interlock and see if
1459          * anyone picked up this vnode from another list.  If not, we will
1460          * mark it with DOOMED via vgonel() so that anyone who does find it
1461          * will skip over it.
1462          */
1463         VI_LOCK(vp);
1464         if (vp->v_usecount) {
1465                 VOP_UNLOCK(vp);
1466                 VI_UNLOCK(vp);
1467                 vn_finished_write(vnmp);
1468                 CTR2(KTR_VFS,
1469                     "%s: impossible to recycle, %p is already referenced",
1470                     __func__, vp);
1471                 return (EBUSY);
1472         }
1473         if (!VN_IS_DOOMED(vp)) {
1474                 counter_u64_add(recycles_free_count, 1);
1475                 vgonel(vp);
1476         }
1477         VOP_UNLOCK(vp);
1478         VI_UNLOCK(vp);
1479         vn_finished_write(vnmp);
1480         return (0);
1481 }
1482
1483 static void
1484 vcheckspace(void)
1485 {
1486         int vsp;
1487
1488         vsp = vspace();
1489         if (vsp < vlowat && vnlruproc_sig == 0) {
1490                 vnlruproc_sig = 1;
1491                 wakeup(vnlruproc);
1492         }
1493 }
1494
1495 /*
1496  * Wait if necessary for space for a new vnode.
      *
      * Must be called with vnode_free_list_mtx held.  If numvnodes has reached
      * desiredvnodes the vnlru process is woken (unless already signalled) and
      * the caller sleeps on vnlruproc_sig for up to hz ticks.
      *
      * A non-zero "suspended" makes the over-limit case return 0 at once,
      * without sleeping.
      *
      * Returns 0 when a vnode may be allocated, or ENFILE when numvnodes is
      * still at or above desiredvnodes after the wait.
1497  */
1498 static int
1499 getnewvnode_wait(int suspended)
1500 {
1501
1502         mtx_assert(&vnode_free_list_mtx, MA_OWNED);
1503         if (numvnodes >= desiredvnodes) {
1504                 if (suspended) {
1505                         /*
1506                          * The file system is being suspended.  We cannot
1507                          * risk a deadlock here, so allow allocation of
1508                          * another vnode even if this would give too many.
1509                          */
1510                         return (0);
1511                 }
1512                 if (vnlruproc_sig == 0) {
1513                         vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
1514                         wakeup(vnlruproc);
1515                 }
1516                 msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
1517                     "vlruwk", hz);
1518         }
1519         /* Post-adjust like the pre-adjust in getnewvnode(). */
1520         if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
1521                 vnlru_free_locked(1, NULL);
             /* numvnodes is re-read here; it may have changed while we slept. */
1522         return (numvnodes >= desiredvnodes ? ENFILE : 0);
1523 }
1524
1525 /*
1526  * This hack is fragile, and probably not needed any more now that the
1527  * watermark handling works.
      *
      * Reserve "count" vnodes for the current thread: numvnodes is raised by
      * count and the same amount is recorded in td_vp_reserv.  The fast path
      * does this with a single atomic add; if that would exceed
      * desiredvnodes the add is undone and the vnodes are reserved one at a
      * time through getnewvnode_wait().
1528  */
1529 void
1530 getnewvnode_reserve(u_int count)
1531 {
1532         u_long rnumvnodes, rfreevnodes;
1533         struct thread *td;
1534
1535         /* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
1536         /* XXX no longer so quick, but this part is not racy. */
1537         mtx_lock(&vnode_free_list_mtx);
1538         rnumvnodes = atomic_load_long(&numvnodes);
1539         rfreevnodes = atomic_load_long(&freevnodes);
             /*
              * Free at most the overshoot, and no more than the free list holds
              * above wantfreevnodes.
              * NOTE(review): the ulmin() result is passed to an int parameter;
              * confirm that the value always fits.
              */
1540         if (rnumvnodes + count > desiredvnodes && rfreevnodes > wantfreevnodes)
1541                 vnlru_free_locked(ulmin(rnumvnodes + count - desiredvnodes,
1542                     rfreevnodes - wantfreevnodes), NULL);
1543         mtx_unlock(&vnode_free_list_mtx);
1544
1545         td = curthread;
1546         /* First try to be quick and racy. */
1547         if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
1548                 td->td_vp_reserv += count;
1549                 vcheckspace();  /* XXX no longer so quick, but more racy */
1550                 return;
1551         } else
1552                 atomic_subtract_long(&numvnodes, count);
1553
             /*
              * Slow path.  The loop only advances when getnewvnode_wait()
              * returns 0, so it keeps retrying for as long as ENFILE is
              * returned.
              */
1554         mtx_lock(&vnode_free_list_mtx);
1555         while (count > 0) {
1556                 if (getnewvnode_wait(0) == 0) {
1557                         count--;
1558                         td->td_vp_reserv++;
1559                         atomic_add_long(&numvnodes, 1);
1560                 }
1561         }
1562         vcheckspace();
1563         mtx_unlock(&vnode_free_list_mtx);
1564 }
1565
1566 /*
1567  * This hack is fragile, especially if desiredvnodes or wantvnodes are
1568  * misconfgured or changed significantly.  Reducing desiredvnodes below
1569  * the reserved amount should cause bizarre behaviour like reducing it
1570  * below the number of active vnodes -- the system will try to reduce
1571  * numvnodes to match, but should fail, so the subtraction below should
1572  * not overflow.
1573  */
1574 void
1575 getnewvnode_drop_reserve(void)
1576 {
1577         struct thread *td;
1578
1579         td = curthread;
1580         atomic_subtract_long(&numvnodes, td->td_vp_reserv);
1581         td->td_vp_reserv = 0;
1582 }
1583
/*
 * Allocate a vnode from vnode_zone, initialize its identity fields and
 * return it through *vpp.
 *
 *   tag   name installed on the vnode lock; compared by pointer, so each
 *         filesystem is expected to pass one static string.
 *   mp    mount the vnode is created for, or NULL.
 *   vops  operations vector stored in v_op; must be registered.
 *   vpp   receives the new vnode.
 *
 * A slot reserved with getnewvnode_reserve() is consumed first, which
 * skips the free list mutex and the watermark handling altogether.
 *
 * Always returns 0: the result of getnewvnode_wait() is deliberately
 * ignored (see the disabled block below), and the zone allocation uses
 * M_WAITOK.
 */
int
getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
    struct vnode **vpp)
{
        struct vnode *vp;
        struct thread *td;
        struct lock_object *lo;
        static int cyclecount;
        int error __unused;

        CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);

        KASSERT(vops->registered,
            ("%s: not registered vector op %p\n", __func__, vops));

        vp = NULL;
        td = curthread;
        /* A reserved slot is already counted in numvnodes. */
        if (td->td_vp_reserv > 0) {
                td->td_vp_reserv -= 1;
                goto alloc;
        }
        mtx_lock(&vnode_free_list_mtx);
        /*
         * Count consecutive allocations made while at or above the
         * target; once that count reaches freevnodes, set vstir.
         */
        if (numvnodes < desiredvnodes)
                cyclecount = 0;
        else if (cyclecount++ >= freevnodes) {
                cyclecount = 0;
                vstir = 1;
        }
        /*
         * Grow the vnode cache if it will not be above its target max
         * after growing.  Otherwise, if the free list is nonempty, try
         * to reclaim 1 item from it before growing the cache (possibly
         * above its target max if the reclamation failed or is delayed).
         * Otherwise, wait for some space.  In all cases, schedule
         * vnlru_proc() if we are getting short of space.  The watermarks
         * should be chosen so that we never wait or even reclaim from
         * the free list to below its target minimum.
         */
        if (numvnodes + 1 <= desiredvnodes)
                ;
        else if (freevnodes > 0)
                vnlru_free_locked(1, NULL);
        else {
                error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
                    MNTK_SUSPEND));
#if 0   /* XXX Not all VFS_VGET/ffs_vget callers check returns. */
                if (error != 0) {
                        mtx_unlock(&vnode_free_list_mtx);
                        return (error);
                }
#endif
        }
        vcheckspace();
        atomic_add_long(&numvnodes, 1);
        mtx_unlock(&vnode_free_list_mtx);
alloc:
        counter_u64_add(vnodes_created, 1);
        vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
        /*
         * Locks are given the generic name "vnode" when created.
         * Follow the historic practice of using the filesystem
         * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
         *
         * Locks live in a witness group keyed on their name. Thus,
         * when a lock is renamed, it must also move from the witness
         * group of its old name to the witness group of its new name.
         *
         * The change only needs to be made when the vnode moves
         * from one filesystem type to another. We ensure that each
         * filesystem use a single static name pointer for its tag so
         * that we can compare pointers rather than doing a strcmp().
         */
        lo = &vp->v_vnlock->lock_object;
        if (lo->lo_name != tag) {
                lo->lo_name = tag;
                WITNESS_DESTROY(lo);
                WITNESS_INIT(lo, tag);
        }
        /*
         * By default, don't allow shared locks unless filesystems opt-in.
         */
        vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
        /*
         * Finalize various vnode identity bits.
         */
        KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
        KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
        KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
        vp->v_type = VNON;
        vp->v_op = vops;
        v_init_counters(vp);
        vp->v_bufobj.bo_ops = &buf_ops_bio;
#ifdef DIAGNOSTIC
        if (mp == NULL && vops != &dead_vnodeops)
                printf("NULL mp in getnewvnode(9), tag %s\n", tag);
#endif
#ifdef MAC
        mac_vnode_init(vp);
        if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
                mac_vnode_associate_singlelabel(mp, vp);
#endif
        /* Inherit the I/O size and the no-knote setting from the mount. */
        if (mp != NULL) {
                vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
                if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
                        vp->v_vflag |= VV_NOKNOTE;
        }

        /*
         * For the filesystems which do not use vfs_hash_insert(),
         * still initialize v_hash to have vfs_hash_index() useful.
         * E.g., nullfs uses vfs_hash_index() on the lower vnode for
         * its own hashing.
         */
        vp->v_hash = (uintptr_t)vp >> vnsz2log;

        *vpp = vp;
        return (0);
}
1705
/*
 * Return a fully cleaned vnode to vnode_zone and drop it from the
 * numvnodes count.
 *
 * The vnode interlock is released here without being taken in this
 * function, so the caller is expected to hold it on entry.
 */
static void
freevnode(struct vnode *vp)
{
        struct bufobj *bo;

        /*
         * The vnode has been marked for destruction, so free it.
         *
         * The vnode will be returned to the zone where it will
         * normally remain until it is needed for another vnode. We
         * need to cleanup (or verify that the cleanup has already
         * been done) any residual data left from its current use
         * so as not to contaminate the freshly allocated vnode.
         */
        CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
        atomic_subtract_long(&numvnodes, 1);
        bo = &vp->v_bufobj;
        /* Verify that nothing still references or hangs off the vnode. */
        VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
            ("cleaned vnode still on the free list."));
        VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
        VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
        VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
        VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
        VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
        VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
        VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
            ("clean blk trie not empty"));
        VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
        VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
            ("dirty blk trie not empty"));
        VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
        VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
        VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
        VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
            ("Dangling rangelock waiters"));
        VI_UNLOCK(vp);
#ifdef MAC
        mac_vnode_destroy(vp);
#endif
        if (vp->v_pollinfo != NULL) {
                destroy_vpollinfo(vp->v_pollinfo);
                vp->v_pollinfo = NULL;
        }
#ifdef INVARIANTS
        /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
        vp->v_op = NULL;
#endif
        /* Reset the remaining per-use fields before the zone free. */
        vp->v_mountedhere = NULL;
        vp->v_unpcb = NULL;
        vp->v_rdev = NULL;
        vp->v_fifoinfo = NULL;
        vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
        vp->v_irflag = 0;
        vp->v_iflag = 0;
        vp->v_vflag = 0;
        bo->bo_flag = 0;
        uma_zfree(vnode_zone, vp);
}
1764
1765 /*
1766  * Delete from old mount point vnode list, if on one.
1767  */
1768 static void
1769 delmntque(struct vnode *vp)
1770 {
1771         struct mount *mp;
1772
1773         mp = vp->v_mount;
1774         if (mp == NULL)
1775                 return;
1776         MNT_ILOCK(mp);
1777         VI_LOCK(vp);
1778         KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
1779             ("Active vnode list size %d > Vnode list size %d",
1780              mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
1781         if (vp->v_iflag & VI_ACTIVE) {
1782                 vp->v_iflag &= ~VI_ACTIVE;
1783                 mtx_lock(&mp->mnt_listmtx);
1784                 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
1785                 mp->mnt_activevnodelistsize--;
1786                 mtx_unlock(&mp->mnt_listmtx);
1787         }
1788         vp->v_mount = NULL;
1789         VI_UNLOCK(vp);
1790         VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1791                 ("bad mount point vnode list size"));
1792         TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1793         mp->mnt_nvnodelistsize--;
1794         MNT_REL(mp);
1795         MNT_IUNLOCK(mp);
1796 }
1797
/*
 * Default destructor that insmntque() hands to insmntque1(): clear the
 * filesystem private data pointer, switch the vnode to dead_vnodeops,
 * then vgone() and vput() it.  dtr_arg is not used.
 */
static void
insmntque_stddtr(struct vnode *vp, void *dtr_arg)
{

        vp->v_data = NULL;
        vp->v_op = &dead_vnodeops;
        vgone(vp);
        vput(vp);
}
1807
1808 /*
1809  * Insert into list of vnodes for the new mount point, if available.
1810  */
1811 int
1812 insmntque1(struct vnode *vp, struct mount *mp,
1813         void (*dtr)(struct vnode *, void *), void *dtr_arg)
1814 {
1815
1816         KASSERT(vp->v_mount == NULL,
1817                 ("insmntque: vnode already on per mount vnode list"));
1818         VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1819         ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
1820
1821         /*
1822          * We acquire the vnode interlock early to ensure that the
1823          * vnode cannot be recycled by another process releasing a
1824          * holdcnt on it before we get it on both the vnode list
1825          * and the active vnode list. The mount mutex protects only
1826          * manipulation of the vnode list and the vnode freelist
1827          * mutex protects only manipulation of the active vnode list.
1828          * Hence the need to hold the vnode interlock throughout.
1829          */
1830         MNT_ILOCK(mp);
1831         VI_LOCK(vp);
1832         if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 &&
1833             ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1834             mp->mnt_nvnodelistsize == 0)) &&
1835             (vp->v_vflag & VV_FORCEINSMQ) == 0) {
1836                 VI_UNLOCK(vp);
1837                 MNT_IUNLOCK(mp);
1838                 if (dtr != NULL)
1839                         dtr(vp, dtr_arg);
1840                 return (EBUSY);
1841         }
1842         vp->v_mount = mp;
1843         MNT_REF(mp);
1844         TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1845         VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1846                 ("neg mount point vnode list size"));
1847         mp->mnt_nvnodelistsize++;
1848         KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
1849             ("Activating already active vnode"));
1850         vp->v_iflag |= VI_ACTIVE;
1851         mtx_lock(&mp->mnt_listmtx);
1852         TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
1853         mp->mnt_activevnodelistsize++;
1854         mtx_unlock(&mp->mnt_listmtx);
1855         VI_UNLOCK(vp);
1856         MNT_IUNLOCK(mp);
1857         return (0);
1858 }
1859
1860 int
1861 insmntque(struct vnode *vp, struct mount *mp)
1862 {
1863
1864         return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1865 }
1866
/*
 * Flush out and invalidate all buffers associated with a bufobj
 * Called with the underlying object locked.
 *
 * Flag handling as implemented below:
 *   V_SAVE       wait for pending writes and sync dirty buffers first;
 *                VM pages are then removed with OBJPR_CLEANONLY.
 *   V_CLEANONLY  only the clean list is flushed.
 *   V_NORMAL /   passed through to flushbuflist() to select buffers by
 *   V_ALT        their BX_ALTDATA marking.
 *   V_VMIO       skip the wait on the VM object's paging-in-progress.
 * If any of V_ALT, V_NORMAL, V_CLEANONLY or V_VMIO is set, the VM
 * object pages are left alone.
 *
 * slpflag and slptimeo are handed to bufobj_wwait() (V_SAVE case) and
 * to flushbuflist().
 *
 * Returns 0 on success, or the error from bufobj_wwait(), BO_SYNC() or
 * flushbuflist(); EAGAIN from flushbuflist() is retried internally.
 */
int
bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
{
        int error;

        BO_LOCK(bo);
        if (flags & V_SAVE) {
                error = bufobj_wwait(bo, slpflag, slptimeo);
                if (error) {
                        BO_UNLOCK(bo);
                        return (error);
                }
                if (bo->bo_dirty.bv_cnt > 0) {
                        /* BO_SYNC() is called with the bufobj unlocked. */
                        BO_UNLOCK(bo);
                        if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
                                return (error);
                        /*
                         * XXX We could save a lock/unlock if this was only
                         * enabled under INVARIANTS
                         */
                        BO_LOCK(bo);
                        if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
                                panic("vinvalbuf: dirty bufs");
                }
        }
        /*
         * If you alter this loop please notice that interlock is dropped and
         * reacquired in flushbuflist.  Special care is needed to ensure that
         * no race conditions occur from this.
         */
        do {
                error = flushbuflist(&bo->bo_clean,
                    flags, bo, slpflag, slptimeo);
                if (error == 0 && !(flags & V_CLEANONLY))
                        error = flushbuflist(&bo->bo_dirty,
                            flags, bo, slpflag, slptimeo);
                /* EAGAIN means "rescan"; anything else is a real error. */
                if (error != 0 && error != EAGAIN) {
                        BO_UNLOCK(bo);
                        return (error);
                }
        } while (error != 0);

        /*
         * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
         * have write I/O in-progress but if there is a VM object then the
         * VM object can also have read-I/O in-progress.
         */
        do {
                bufobj_wwait(bo, 0, 0);
                if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) {
                        BO_UNLOCK(bo);
                        vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx");
                        BO_LOCK(bo);
                }
        } while (bo->bo_numoutput > 0);
        BO_UNLOCK(bo);

        /*
         * Destroy the copy in the VM cache, too.
         */
        if (bo->bo_object != NULL &&
            (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) {
                VM_OBJECT_WLOCK(bo->bo_object);
                vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
                    OBJPR_CLEANONLY : 0);
                VM_OBJECT_WUNLOCK(bo->bo_object);
        }

#ifdef INVARIANTS
        /*
         * After an unrestricted flush both lists must be empty, unless
         * V_ALLOWCLEAN permits leftover clean buffers; dirty buffers are
         * never acceptable.
         */
        BO_LOCK(bo);
        if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO |
            V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 ||
            bo->bo_clean.bv_cnt > 0))
                panic("vinvalbuf: flush failed");
        if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 &&
            bo->bo_dirty.bv_cnt > 0)
                panic("vinvalbuf: flush dirty failed");
        BO_UNLOCK(bo);
#endif
        return (0);
}
1952
1953 /*
1954  * Flush out and invalidate all buffers associated with a vnode.
1955  * Called with the underlying object locked.
1956  */
1957 int
1958 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1959 {
1960
1961         CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
1962         ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1963         if (vp->v_object != NULL && vp->v_object->handle != vp)
1964                 return (0);
1965         return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1966 }
1967
/*
 * Flush out buffers on the specified list.
 *
 * Must be entered with the bufobj write-locked.  The lock is handed to
 * BUF_TIMELOCK() as the interlock for every buffer processed and is
 * taken again before continuing or returning, so it is held on return
 * but the list may have changed in between.
 *
 * Returns 0 if no buffer on the list was eligible, EAGAIN if at least
 * one buffer was processed (the caller is expected to rescan), or the
 * error from BUF_TIMELOCK() (ENOLCK is mapped to EAGAIN).
 */
static int
flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
    int slptimeo)
{
        struct buf *bp, *nbp;
        int retval, error;
        daddr_t lblkno;
        b_xflags_t xflags;

        ASSERT_BO_WLOCKED(bo);

        retval = 0;
        TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
                /*
                 * If we are flushing both V_NORMAL and V_ALT buffers then
                 * do not skip any buffers. If we are flushing only V_NORMAL
                 * buffers then skip buffers marked as BX_ALTDATA. If we are
                 * flushing only V_ALT buffers then skip buffers not marked
                 * as BX_ALTDATA.
                 */
                if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) &&
                   (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) ||
                    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) {
                        continue;
                }
                /*
                 * Remember the successor's identity so that it can be
                 * looked up again once the bufobj lock has been dropped
                 * and retaken.
                 */
                if (nbp != NULL) {
                        lblkno = nbp->b_lblkno;
                        xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
                }
                retval = EAGAIN;
                error = BUF_TIMELOCK(bp,
                    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
                    "flushbuf", slpflag, slptimeo);
                if (error) {
                        BO_LOCK(bo);
                        return (error != ENOLCK ? error : EAGAIN);
                }
                KASSERT(bp->b_bufobj == bo,
                    ("bp %p wrong b_bufobj %p should be %p",
                    bp, bp->b_bufobj, bo));
                /*
                 * XXX Since there are no node locks for NFS, I
                 * believe there is a slight chance that a delayed
                 * write will occur while sleeping just above, so
                 * check for it.
                 */
                if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
                    (flags & V_SAVE)) {
                        bremfree(bp);
                        bp->b_flags |= B_ASYNC;
                        bwrite(bp);
                        BO_LOCK(bo);
                        return (EAGAIN);        /* XXX: why not loop ? */
                }
                /* Invalidate the buffer and release it. */
                bremfree(bp);
                bp->b_flags |= (B_INVAL | B_RELBUF);
                bp->b_flags &= ~B_ASYNC;
                brelse(bp);
                BO_LOCK(bo);
                if (nbp == NULL)
                        break;
                /*
                 * lblkno and xflags are only read here, where nbp was
                 * non-NULL above and they have therefore been set.
                 */
                nbp = gbincore(bo, lblkno);
                if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
                    != xflags)
                        break;                  /* nbp invalid */
        }
        return (retval);
}
2040
/*
 * Release every buffer on bufv whose logical block number lies in
 * [startn, endn), marking it B_RELBUF and, for VMIO buffers, B_NOREUSE.
 *
 * Must be entered with the bufobj locked.  The lock is passed to
 * BUF_TIMELOCK() as the interlock for each buffer and retaken with
 * BO_RLOCK() afterwards, so it is held again on return.
 *
 * Returns 0 when the range has been walked, or the error from
 * BUF_TIMELOCK(); ENOLCK restarts the lookup at the same block.
 */
int
bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
{
        struct buf *bp;
        int error;
        daddr_t lblkno;

        ASSERT_BO_LOCKED(bo);

        for (lblkno = startn;;) {
again:
                /* Next buffer at or after lblkno; stop outside the range. */
                bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
                if (bp == NULL || bp->b_lblkno >= endn ||
                    bp->b_lblkno < startn)
                        break;
                error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
                    LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
                if (error != 0) {
                        BO_RLOCK(bo);
                        if (error == ENOLCK)
                                goto again;
                        return (error);
                }
                KASSERT(bp->b_bufobj == bo,
                    ("bp %p wrong b_bufobj %p should be %p",
                    bp, bp->b_bufobj, bo));
                /* Continue the walk from the block after this one. */
                lblkno = bp->b_lblkno + 1;
                if ((bp->b_flags & B_MANAGED) == 0)
                        bremfree(bp);
                bp->b_flags |= B_RELBUF;
                /*
                 * In the VMIO case, use the B_NOREUSE flag to hint that the
                 * pages backing each buffer in the range are unlikely to be
                 * reused.  Dirty buffers will have the hint applied once
                 * they've been written.
                 */
                if ((bp->b_flags & B_VMIO) != 0)
                        bp->b_flags |= B_NOREUSE;
                brelse(bp);
                BO_RLOCK(bo);
        }
        return (0);
}
2084
/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 *
 *   vp       locked vnode being truncated.
 *   length   new length in bytes.
 *   blksize  block size used to convert length to a logical block number.
 *
 * Buffers from the first block at or past the new length onward are
 * invalidated, pending writes are waited for, and the pager is told the
 * new size.  Always returns 0.
 */
int
vtruncbuf(struct vnode *vp, off_t length, int blksize)
{
        struct buf *bp, *nbp;
        struct bufobj *bo;
        daddr_t startlbn;

        CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__,
            vp, blksize, (uintmax_t)length);

        /*
         * Round up to the *next* lbn.
         */
        startlbn = howmany(length, blksize);

        ASSERT_VOP_LOCKED(vp, "vtruncbuf");

        bo = &vp->v_bufobj;
restart_unlocked:
        BO_LOCK(bo);

        /* Repeat until a whole pass completes without EAGAIN. */
        while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN)
                ;

        if (length > 0) {
restartsync:
                /*
                 * Start an asynchronous write of every dirty buffer whose
                 * block number is not positive.
                 * NOTE(review): presumably these are metadata blocks kept
                 * at non-positive block numbers -- confirm against the
                 * filesystems that call this.
                 */
                TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
                        if (bp->b_lblkno > 0)
                                continue;
                        /*
                         * Since we hold the vnode lock this should only
                         * fail if we're racing with the buf daemon.
                         */
                        if (BUF_LOCK(bp,
                            LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
                            BO_LOCKPTR(bo)) == ENOLCK)
                                goto restart_unlocked;

                        VNASSERT((bp->b_flags & B_DELWRI), vp,
                            ("buf(%p) on dirty queue without DELWRI", bp));

                        bremfree(bp);
                        bawrite(bp);
                        /*
                         * The bufobj lock was dropped as the interlock;
                         * retake it and rescan the dirty list from the top.
                         */
                        BO_LOCK(bo);
                        goto restartsync;
                }
        }

        bufobj_wwait(bo, 0, 0);
        BO_UNLOCK(bo);
        vnode_pager_setsize(vp, length);

        return (0);
}
2144
2145 /*
2146  * Invalidate the cached pages of a file's buffer within the range of block
2147  * numbers [startlbn, endlbn).
2148  */
2149 void
2150 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
2151     int blksize)
2152 {
2153         struct bufobj *bo;
2154         off_t start, end;
2155
2156         ASSERT_VOP_LOCKED(vp, "v_inval_buf_range");
2157
2158         start = blksize * startlbn;
2159         end = blksize * endlbn;
2160
2161         bo = &vp->v_bufobj;
2162         BO_LOCK(bo);
2163         MPASS(blksize == bo->bo_bsize);
2164
2165         while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN)
2166                 ;
2167
2168         BO_UNLOCK(bo);
2169         vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1));
2170 }
2171
/*
 * Invalidate the clean and dirty buffers of a bufobj whose logical
 * block numbers lie in [startlbn, endlbn).
 *
 * Entered and left with the bufobj locked; the lock is handed to
 * BUF_LOCK() as the interlock for each buffer and retaken afterwards.
 *
 * Returns 0 once a complete pass over both lists frees nothing, or
 * EAGAIN when a buffer lock attempt fails with ENOLCK or the saved
 * successor no longer looks like a member of the list being walked.
 * Callers retry on EAGAIN.
 */
static int
v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
    daddr_t startlbn, daddr_t endlbn)
{
        struct buf *bp, *nbp;
        bool anyfreed;

        ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked");
        ASSERT_BO_LOCKED(bo);

        do {
                anyfreed = false;
                TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
                        if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
                                continue;
                        if (BUF_LOCK(bp,
                            LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
                            BO_LOCKPTR(bo)) == ENOLCK) {
                                BO_LOCK(bo);
                                return (EAGAIN);
                        }

                        bremfree(bp);
                        bp->b_flags |= B_INVAL | B_RELBUF;
                        bp->b_flags &= ~B_ASYNC;
                        brelse(bp);
                        anyfreed = true;

                        BO_LOCK(bo);
                        /*
                         * The list was unlocked for a while: bail out if
                         * the successor is no longer a clean buffer of
                         * this vnode.
                         */
                        if (nbp != NULL &&
                            (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
                            nbp->b_vp != vp ||
                            (nbp->b_flags & B_DELWRI) != 0))
                                return (EAGAIN);
                }

                TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
                        if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
                                continue;
                        if (BUF_LOCK(bp,
                            LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
                            BO_LOCKPTR(bo)) == ENOLCK) {
                                BO_LOCK(bo);
                                return (EAGAIN);
                        }
                        bremfree(bp);
                        bp->b_flags |= B_INVAL | B_RELBUF;
                        bp->b_flags &= ~B_ASYNC;
                        brelse(bp);
                        anyfreed = true;

                        BO_LOCK(bo);
                        /*
                         * Same check for the dirty list: the successor
                         * must still be a delayed-write buffer of this
                         * vnode.
                         */
                        if (nbp != NULL &&
                            (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
                            (nbp->b_vp != vp) ||
                            (nbp->b_flags & B_DELWRI) == 0))
                                return (EAGAIN);
                }
        } while (anyfreed);
        return (0);
}
2233
2234 static void
2235 buf_vlist_remove(struct buf *bp)
2236 {
2237         struct bufv *bv;
2238
2239         KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
2240         ASSERT_BO_WLOCKED(bp->b_bufobj);
2241         KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
2242             (BX_VNDIRTY|BX_VNCLEAN),
2243             ("buf_vlist_remove: Buf %p is on two lists", bp));
2244         if (bp->b_xflags & BX_VNDIRTY)
2245                 bv = &bp->b_bufobj->bo_dirty;
2246         else
2247                 bv = &bp->b_bufobj->bo_clean;
2248         BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
2249         TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
2250         bv->bv_cnt--;
2251         bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
2252 }
2253
2254 /*
2255  * Add the buffer to the sorted clean or dirty block list.
2256  *
2257  * NOTE: xflags is passed as a constant, optimizing this inline function!
2258  */
2259 static void
2260 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
2261 {
2262         struct bufv *bv;
2263         struct buf *n;
2264         int error;
2265
2266         ASSERT_BO_WLOCKED(bo);
2267         KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
2268             ("dead bo %p", bo));
2269         KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
2270             ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
2271         bp->b_xflags |= xflags;
2272         if (xflags & BX_VNDIRTY)
2273                 bv = &bo->bo_dirty;
2274         else
2275                 bv = &bo->bo_clean;
2276
2277         /*
2278          * Keep the list ordered.  Optimize empty list insertion.  Assume
2279          * we tend to grow at the tail so lookup_le should usually be cheaper
2280          * than _ge.
2281          */
2282         if (bv->bv_cnt == 0 ||
2283             bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
2284                 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
2285         else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
2286                 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
2287         else
2288                 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
2289         error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
2290         if (error)
2291                 panic("buf_vlist_add:  Preallocated nodes insufficient.");
2292         bv->bv_cnt++;
2293 }
2294
2295 /*
2296  * Look up a buffer using the buffer tries.
2297  */
2298 struct buf *
2299 gbincore(struct bufobj *bo, daddr_t lblkno)
2300 {
2301         struct buf *bp;
2302
2303         ASSERT_BO_LOCKED(bo);
2304         bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
2305         if (bp != NULL)
2306                 return (bp);
2307         return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
2308 }
2309
2310 /*
2311  * Associate a buffer with a vnode.
2312  */
2313 void
2314 bgetvp(struct vnode *vp, struct buf *bp)
2315 {
2316         struct bufobj *bo;
2317
2318         bo = &vp->v_bufobj;
2319         ASSERT_BO_WLOCKED(bo);
2320         VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
2321
2322         CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
2323         VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
2324             ("bgetvp: bp already attached! %p", bp));
2325
2326         vhold(vp);
2327         bp->b_vp = vp;
2328         bp->b_bufobj = bo;
2329         /*
2330          * Insert onto list for new vnode.
2331          */
2332         buf_vlist_add(bp, bo, BX_VNCLEAN);
2333 }
2334
2335 /*
2336  * Disassociate a buffer from a vnode.
2337  */
2338 void
2339 brelvp(struct buf *bp)
2340 {
2341         struct bufobj *bo;
2342         struct vnode *vp;
2343
2344         CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
2345         KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
2346
2347         /*
2348          * Delete from old vnode list, if on one.
2349          */
2350         vp = bp->b_vp;          /* XXX */
2351         bo = bp->b_bufobj;
2352         BO_LOCK(bo);
2353         if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
2354                 buf_vlist_remove(bp);
2355         else
2356                 panic("brelvp: Buffer %p not on queue.", bp);
2357         if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2358                 bo->bo_flag &= ~BO_ONWORKLST;
2359                 mtx_lock(&sync_mtx);
2360                 LIST_REMOVE(bo, bo_synclist);
2361                 syncer_worklist_len--;
2362                 mtx_unlock(&sync_mtx);
2363         }
2364         bp->b_vp = NULL;
2365         bp->b_bufobj = NULL;
2366         BO_UNLOCK(bo);
2367         vdrop(vp);
2368 }
2369
2370 /*
2371  * Add an item to the syncer work queue.
2372  */
2373 static void
2374 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
2375 {
2376         int slot;
2377
2378         ASSERT_BO_WLOCKED(bo);
2379
2380         mtx_lock(&sync_mtx);
2381         if (bo->bo_flag & BO_ONWORKLST)
2382                 LIST_REMOVE(bo, bo_synclist);
2383         else {
2384                 bo->bo_flag |= BO_ONWORKLST;
2385                 syncer_worklist_len++;
2386         }
2387
2388         if (delay > syncer_maxdelay - 2)
2389                 delay = syncer_maxdelay - 2;
2390         slot = (syncer_delayno + delay) & syncer_mask;
2391
2392         LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
2393         mtx_unlock(&sync_mtx);
2394 }
2395
2396 static int
2397 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
2398 {
2399         int error, len;
2400
2401         mtx_lock(&sync_mtx);
2402         len = syncer_worklist_len - sync_vnode_count;
2403         mtx_unlock(&sync_mtx);
2404         error = SYSCTL_OUT(req, &len, sizeof(len));
2405         return (error);
2406 }
2407
2408 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
2409     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
2410
/*
 * Process pointer for the syncer; its address is stored in up_kp below and
 * it is the process that syncer_suspend()/syncer_resume() operate on.
 */
static struct proc *updateproc;
static void sched_sync(void);
/*
 * Descriptor handed to kproc_start() by the SYSINIT below.
 * NOTE(review): the positional fields are presumably name, entry point and
 * the location that receives the new process pointer -- confirm against
 * struct kproc_desc.
 */
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
2419
2420 static int
2421 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
2422 {
2423         struct vnode *vp;
2424         struct mount *mp;
2425
2426         *bo = LIST_FIRST(slp);
2427         if (*bo == NULL)
2428                 return (0);
2429         vp = bo2vnode(*bo);
2430         if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
2431                 return (1);
2432         /*
2433          * We use vhold in case the vnode does not
2434          * successfully sync.  vhold prevents the vnode from
2435          * going away when we unlock the sync_mtx so that
2436          * we can acquire the vnode interlock.
2437          */
2438         vholdl(vp);
2439         mtx_unlock(&sync_mtx);
2440         VI_UNLOCK(vp);
2441         if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
2442                 vdrop(vp);
2443                 mtx_lock(&sync_mtx);
2444                 return (*bo == LIST_FIRST(slp));
2445         }
2446         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2447         (void) VOP_FSYNC(vp, MNT_LAZY, td);
2448         VOP_UNLOCK(vp);
2449         vn_finished_write(mp);
2450         BO_LOCK(*bo);
2451         if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
2452                 /*
2453                  * Put us back on the worklist.  The worklist
2454                  * routine will remove us from our current
2455                  * position and then add us back in at a later
2456                  * position.
2457                  */
2458                 vn_syncer_add_to_worklist(*bo, syncdelay);
2459         }
2460         BO_UNLOCK(*bo);
2461         vdrop(vp);
2462         mtx_lock(&sync_mtx);
2463         return (0);
2464 }
2465
/*
 * Nonzero until the "Syncing disks" banner has been printed by sched_sync();
 * set back to 1 by syncer_resume().  Once it is zero, sched_sync() also pats
 * the watchdog after each sync_vnode() call.
 */
static int first_printf = 1;

/*
 * System filesystem synchronizer daemon.
 *
 * Main loop of the syncer process; never returns.  sync_mtx is held for
 * the whole loop except where it is explicitly dropped (around
 * kproc_suspend_check(), inside sync_vnode(), around the watchdog pat, and
 * while sleeping in cv_timedwait()).
 *
 * Each pass picks the slot at syncer_delayno, advances syncer_delayno
 * (wrapping at syncer_maxdelay), and runs sync_vnode() on the slot until it
 * is empty.  Entries sync_vnode() reports as unprocessable (return 1) are
 * moved to the following slot.
 *
 * While syncer_state is not SYNCER_RUNNING the loop prints progress, skips
 * empty slots and sleeps only hz / SYNCER_SHUTDOWN_SPEEDUP ticks per pass.
 */
static void
sched_sync(void)
{
	struct synclist *next, *slp;
	struct bufobj *bo;
	long starttime;
	struct thread *td = curthread;
	int last_work_seen;
	int net_worklist_len;
	int syncer_final_iter;
	int error;

	last_work_seen = 0;
	syncer_final_iter = 0;
	syncer_state = SYNCER_RUNNING;
	starttime = time_uptime;
	td->td_pflags |= TDP_NORUNNINGBUF;

	/* syncer_shutdown() receives this process as its argument. */
	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
	    SHUTDOWN_PRI_LAST);

	mtx_lock(&sync_mtx);
	for (;;) {
		/*
		 * Only honor a suspend request once the final-delay
		 * countdown has run out.
		 */
		if (syncer_state == SYNCER_FINAL_DELAY &&
		    syncer_final_iter == 0) {
			mtx_unlock(&sync_mtx);
			kproc_suspend_check(td->td_proc);
			mtx_lock(&sync_mtx);
		}
		net_worklist_len = syncer_worklist_len - sync_vnode_count;
		/* Progress output, at most once per time_uptime tick. */
		if (syncer_state != SYNCER_RUNNING &&
		    starttime != time_uptime) {
			if (first_printf) {
				printf("\nSyncing disks, vnodes remaining... ");
				first_printf = 0;
			}
			printf("%d ", net_worklist_len);
		}
		starttime = time_uptime;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 *
		 * Skip over empty worklist slots when shutting down.
		 */
		do {
			slp = &syncer_workitem_pending[syncer_delayno];
			syncer_delayno += 1;
			if (syncer_delayno == syncer_maxdelay)
				syncer_delayno = 0;
			next = &syncer_workitem_pending[syncer_delayno];
			/*
			 * If the worklist has wrapped since it was
			 * emptied of all but syncer vnodes,
			 * switch to the FINAL_DELAY state and run
			 * for one more second.
			 */
			if (syncer_state == SYNCER_SHUTTING_DOWN &&
			    net_worklist_len == 0 &&
			    last_work_seen == syncer_delayno) {
				syncer_state = SYNCER_FINAL_DELAY;
				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
			}
		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
		    syncer_worklist_len > 0);

		/*
		 * Keep track of the last time there was anything
		 * on the worklist other than syncer vnodes.
		 * Return to the SHUTTING_DOWN state if any
		 * new work appears.
		 */
		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
			last_work_seen = syncer_delayno;
		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
			syncer_state = SYNCER_SHUTTING_DOWN;
		while (!LIST_EMPTY(slp)) {
			error = sync_vnode(slp, &bo, td);
			if (error == 1) {
				/*
				 * sync_vnode() could not process the head
				 * entry; push it to the next slot so this
				 * loop can drain the current one.
				 */
				LIST_REMOVE(bo, bo_synclist);
				LIST_INSERT_HEAD(next, bo, bo_synclist);
				continue;
			}

			if (first_printf == 0) {
				/*
				 * Drop the sync mutex, because some watchdog
				 * drivers need to sleep while patting
				 */
				mtx_unlock(&sync_mtx);
				wdog_kern_pat(WD_LASTVAL);
				mtx_lock(&sync_mtx);
			}

		}
		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
			syncer_final_iter--;
		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process. A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP. Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * Just sleep for a short period of time between
		 * iterations when shutting down to allow some I/O
		 * to happen.
		 *
		 * If it has taken us less than a second to process the
		 * current work, then wait. Otherwise start right over
		 * again. We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (syncer_state != SYNCER_RUNNING ||
		    time_uptime == starttime) {
			thread_lock(td);
			sched_prio(td, PPAUSE);
			thread_unlock(td);
		}
		if (syncer_state != SYNCER_RUNNING)
			cv_timedwait(&sync_wakeup, &sync_mtx,
			    hz / SYNCER_SHUTDOWN_SPEEDUP);
		else if (time_uptime == starttime)
			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
	}
}
2608
2609 /*
2610  * Request the syncer daemon to speed up its work.
2611  * We never push it to speed up more than half of its
2612  * normal turn time, otherwise it could take over the cpu.
2613  */
2614 int
2615 speedup_syncer(void)
2616 {
2617         int ret = 0;
2618
2619         mtx_lock(&sync_mtx);
2620         if (rushjob < syncdelay / 2) {
2621                 rushjob += 1;
2622                 stat_rush_requests += 1;
2623                 ret = 1;
2624         }
2625         mtx_unlock(&sync_mtx);
2626         cv_broadcast(&sync_wakeup);
2627         return (ret);
2628 }
2629
2630 /*
2631  * Tell the syncer to speed up its work and run though its work
2632  * list several times, then tell it to shut down.
2633  */
2634 static void
2635 syncer_shutdown(void *arg, int howto)
2636 {
2637
2638         if (howto & RB_NOSYNC)
2639                 return;
2640         mtx_lock(&sync_mtx);
2641         syncer_state = SYNCER_SHUTTING_DOWN;
2642         rushjob = 0;
2643         mtx_unlock(&sync_mtx);
2644         cv_broadcast(&sync_wakeup);
2645         kproc_shutdown(arg, howto);
2646 }
2647
/*
 * Suspend the syncer by running the shutdown sequence on updateproc.
 * howto is passed as 0, so the RB_NOSYNC early return in
 * syncer_shutdown() is never taken.
 */
void
syncer_suspend(void)
{

	syncer_shutdown(updateproc, 0);
}
2654
2655 void
2656 syncer_resume(void)
2657 {
2658
2659         mtx_lock(&sync_mtx);
2660         first_printf = 1;
2661         syncer_state = SYNCER_RUNNING;
2662         mtx_unlock(&sync_mtx);
2663         cv_broadcast(&sync_wakeup);
2664         kproc_resume(updateproc);
2665 }
2666
2667 /*
2668  * Reassign a buffer from one vnode to another.
2669  * Used to assign file specific control information
2670  * (indirect blocks) to the vnode to which they belong.
2671  */
2672 void
2673 reassignbuf(struct buf *bp)
2674 {
2675         struct vnode *vp;
2676         struct bufobj *bo;
2677         int delay;
2678 #ifdef INVARIANTS
2679         struct bufv *bv;
2680 #endif
2681
2682         vp = bp->b_vp;
2683         bo = bp->b_bufobj;
2684         ++reassignbufcalls;
2685
2686         CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
2687             bp, bp->b_vp, bp->b_flags);
2688         /*
2689          * B_PAGING flagged buffers cannot be reassigned because their vp
2690          * is not fully linked in.
2691          */
2692         if (bp->b_flags & B_PAGING)
2693                 panic("cannot reassign paging buffer");
2694
2695         /*
2696          * Delete from old vnode list, if on one.
2697          */
2698         BO_LOCK(bo);
2699         if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
2700                 buf_vlist_remove(bp);
2701         else
2702                 panic("reassignbuf: Buffer %p not on queue.", bp);
2703         /*
2704          * If dirty, put on list of dirty buffers; otherwise insert onto list
2705          * of clean buffers.
2706          */
2707         if (bp->b_flags & B_DELWRI) {
2708                 if ((bo->bo_flag & BO_ONWORKLST) == 0) {
2709                         switch (vp->v_type) {
2710                         case VDIR:
2711                                 delay = dirdelay;
2712                                 break;
2713                         case VCHR:
2714                                 delay = metadelay;
2715                                 break;
2716                         default:
2717                                 delay = filedelay;
2718                         }
2719                         vn_syncer_add_to_worklist(bo, delay);
2720                 }
2721                 buf_vlist_add(bp, bo, BX_VNDIRTY);
2722         } else {
2723                 buf_vlist_add(bp, bo, BX_VNCLEAN);
2724
2725                 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2726                         mtx_lock(&sync_mtx);
2727                         LIST_REMOVE(bo, bo_synclist);
2728                         syncer_worklist_len--;
2729                         mtx_unlock(&sync_mtx);
2730                         bo->bo_flag &= ~BO_ONWORKLST;
2731                 }
2732         }
2733 #ifdef INVARIANTS
2734         bv = &bo->bo_clean;
2735         bp = TAILQ_FIRST(&bv->bv_hd);
2736         KASSERT(bp == NULL || bp->b_bufobj == bo,
2737             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2738         bp = TAILQ_LAST(&bv->bv_hd, buflists);
2739         KASSERT(bp == NULL || bp->b_bufobj == bo,
2740             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2741         bv = &bo->bo_dirty;
2742         bp = TAILQ_FIRST(&bv->bv_hd);
2743         KASSERT(bp == NULL || bp->b_bufobj == bo,
2744             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2745         bp = TAILQ_LAST(&bv->bv_hd, buflists);
2746         KASSERT(bp == NULL || bp->b_bufobj == bo,
2747             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2748 #endif
2749         BO_UNLOCK(bo);
2750 }
2751
2752 static void
2753 v_init_counters(struct vnode *vp)
2754 {
2755
2756         VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
2757             vp, ("%s called for an initialized vnode", __FUNCTION__));
2758         ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
2759
2760         refcount_init(&vp->v_holdcnt, 1);
2761         refcount_init(&vp->v_usecount, 1);
2762 }
2763
2764 /*
2765  * Increment si_usecount of the associated device, if any.
2766  */
2767 static void
2768 v_incr_devcount(struct vnode *vp)
2769 {
2770
2771         ASSERT_VI_LOCKED(vp, __FUNCTION__);
2772         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2773                 dev_lock();
2774                 vp->v_rdev->si_usecount++;
2775                 dev_unlock();
2776         }
2777 }
2778
2779 /*
2780  * Decrement si_usecount of the associated device, if any.
2781  */
2782 static void
2783 v_decr_devcount(struct vnode *vp)
2784 {
2785
2786         ASSERT_VI_LOCKED(vp, __FUNCTION__);
2787         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2788                 dev_lock();
2789                 vp->v_rdev->si_usecount--;
2790                 dev_unlock();
2791         }
2792 }
2793
2794 /*
2795  * Grab a particular vnode from the free list, increment its
2796  * reference count and lock it.  VIRF_DOOMED is set if the vnode
2797  * is being destroyed.  Only callers who specify LK_RETRY will
2798  * see doomed vnodes.  If inactive processing was delayed in
2799  * vput try to do it here.
2800  *
2801  * Both holdcnt and usecount can be manipulated using atomics without holding
2802  * any locks except in these cases which require the vnode interlock:
2803  * holdcnt: 1->0 and 0->1
2804  * usecount: 0->1
2805  *
2806  * usecount is permitted to transition 1->0 without the interlock because
2807  * vnode is kept live by holdcnt.
2808  */
2809 static enum vgetstate __always_inline
2810 _vget_prep(struct vnode *vp, bool interlock)
2811 {
2812         enum vgetstate vs;
2813
2814         if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
2815                 vs = VGET_USECOUNT;
2816         } else {
2817                 if (interlock)
2818                         vholdl(vp);
2819                 else
2820                         vhold(vp);
2821                 vs = VGET_HOLDCNT;
2822         }
2823         return (vs);
2824 }
2825
/*
 * Public entry point for the first half of vget for callers that do not
 * hold the vnode interlock.  Returns the state token to pass to
 * vget_finish().
 */
enum vgetstate
vget_prep(struct vnode *vp)
{

	return (_vget_prep(vp, false));
}
2832
2833 int
2834 vget(struct vnode *vp, int flags, struct thread *td)
2835 {
2836         enum vgetstate vs;
2837
2838         MPASS(td == curthread);
2839
2840         vs = _vget_prep(vp, (flags & LK_INTERLOCK) != 0);
2841         return (vget_finish(vp, flags, vs));
2842 }
2843
2844 int
2845 vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
2846 {
2847         int error, oweinact;
2848
2849         VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2850             ("%s: invalid lock operation", __func__));
2851
2852         if ((flags & LK_INTERLOCK) != 0)
2853                 ASSERT_VI_LOCKED(vp, __func__);
2854         else
2855                 ASSERT_VI_UNLOCKED(vp, __func__);
2856         VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
2857         if (vs == VGET_USECOUNT) {
2858                 VNASSERT(vp->v_usecount > 0, vp,
2859                     ("%s: vnode without usecount when VGET_USECOUNT was passed",
2860                     __func__));
2861         }
2862
2863         if ((error = vn_lock(vp, flags)) != 0) {
2864                 if (vs == VGET_USECOUNT)
2865                         vrele(vp);
2866                 else
2867                         vdrop(vp);
2868                 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2869                     vp);
2870                 return (error);
2871         }
2872
2873         if (vs == VGET_USECOUNT) {
2874                 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2875                     ("%s: vnode with usecount and VI_OWEINACT set", __func__));
2876                 return (0);
2877         }
2878
2879         /*
2880          * We hold the vnode. If the usecount is 0 it will be utilized to keep
2881          * the vnode around. Otherwise someone else lended their hold count and
2882          * we have to drop ours.
2883          */
2884         if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
2885 #ifdef INVARIANTS
2886                 int old = atomic_fetchadd_int(&vp->v_holdcnt, -1);
2887                 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old));
2888 #else
2889                 refcount_release(&vp->v_holdcnt);
2890 #endif
2891                 VNODE_REFCOUNT_FENCE_ACQ();
2892                 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2893                     ("%s: vnode with usecount and VI_OWEINACT set", __func__));
2894                 return (0);
2895         }
2896
2897         /*
2898          * We don't guarantee that any particular close will
2899          * trigger inactive processing so just make a best effort
2900          * here at preventing a reference to a removed file.  If
2901          * we don't succeed no harm is done.
2902          *
2903          * Upgrade our holdcnt to a usecount.
2904          */
2905         VI_LOCK(vp);
2906         /*
2907          * See the previous section. By the time we get here we may find
2908          * ourselves in the same spot.
2909          */
2910         if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
2911 #ifdef INVARIANTS
2912                 int old = atomic_fetchadd_int(&vp->v_holdcnt, -1);
2913                 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old));
2914 #else
2915                 refcount_release(&vp->v_holdcnt);
2916 #endif
2917                 VNODE_REFCOUNT_FENCE_ACQ();
2918                 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2919                     ("%s: vnode with usecount and VI_OWEINACT set",
2920                     __func__));
2921                 VI_UNLOCK(vp);
2922                 return (0);
2923         }
2924         if ((vp->v_iflag & VI_OWEINACT) == 0) {
2925                 oweinact = 0;
2926         } else {
2927                 oweinact = 1;
2928                 vp->v_iflag &= ~VI_OWEINACT;
2929                 VNODE_REFCOUNT_FENCE_REL();
2930         }
2931         v_incr_devcount(vp);
2932         refcount_acquire(&vp->v_usecount);
2933         if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2934             (flags & LK_NOWAIT) == 0)
2935                 vinactive(vp);
2936         VI_UNLOCK(vp);
2937         return (0);
2938 }
2939
2940 /*
2941  * Increase the reference (use) and hold count of a vnode.
2942  * This will also remove the vnode from the free list if it is presently free.
2943  */
2944 void
2945 vref(struct vnode *vp)
2946 {
2947
2948         ASSERT_VI_UNLOCKED(vp, __func__);
2949         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2950         if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
2951                 VNODE_REFCOUNT_FENCE_ACQ();
2952                 VNASSERT(vp->v_holdcnt > 0, vp,
2953                     ("%s: active vnode not held", __func__));
2954                 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2955                     ("%s: vnode with usecount and VI_OWEINACT set", __func__));
2956                 return;
2957         }
2958         VI_LOCK(vp);
2959         vrefl(vp);
2960         VI_UNLOCK(vp);
2961 }
2962
2963 void
2964 vrefl(struct vnode *vp)
2965 {
2966
2967         ASSERT_VI_LOCKED(vp, __func__);
2968         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2969         if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
2970                 VNODE_REFCOUNT_FENCE_ACQ();
2971                 VNASSERT(vp->v_holdcnt > 0, vp,
2972                     ("%s: active vnode not held", __func__));
2973                 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2974                     ("%s: vnode with usecount and VI_OWEINACT set", __func__));
2975                 return;
2976         }
2977         vholdl(vp);
2978         if ((vp->v_iflag & VI_OWEINACT) != 0) {
2979                 vp->v_iflag &= ~VI_OWEINACT;
2980                 VNODE_REFCOUNT_FENCE_REL();
2981         }
2982         v_incr_devcount(vp);
2983         refcount_acquire(&vp->v_usecount);
2984 }
2985
2986 void
2987 vrefact(struct vnode *vp)
2988 {
2989
2990         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2991 #ifdef INVARIANTS
2992         int old = atomic_fetchadd_int(&vp->v_usecount, 1);
2993         VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old));
2994 #else
2995         refcount_acquire(&vp->v_usecount);
2996 #endif
2997 }
2998
/*
 * Return reference count of a vnode.
 *
 * This is a plain, unlocked read of v_usecount.
 *
 * The results of this call are only guaranteed when some mechanism is used to
 * stop other processes from gaining references to the vnode.  This may be the
 * case if the caller holds the only reference.  This is also useful when stale
 * data is acceptable as race conditions may be accounted for by some other
 * means.
 */
int
vrefcnt(struct vnode *vp)
{

	return (vp->v_usecount);
}
3014
3015 enum vputx_op { VPUTX_VRELE, VPUTX_VPUT, VPUTX_VUNREF };
3016
3017 /*
3018  * Decrement the use and hold counts for a vnode.
3019  *
3020  * See an explanation near vget() as to why atomic operation is safe.
3021  */
3022 static void
3023 vputx(struct vnode *vp, enum vputx_op func)
3024 {
3025         int error;
3026
3027         KASSERT(vp != NULL, ("vputx: null vp"));
3028         if (func == VPUTX_VUNREF)
3029                 ASSERT_VOP_LOCKED(vp, "vunref");
3030         ASSERT_VI_UNLOCKED(vp, __func__);
3031         VNASSERT(vp->v_holdcnt > 0 && vp->v_usecount > 0, vp,
3032             ("%s: wrong ref counts", __func__));
3033
3034         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3035
3036         /*
3037          * We want to hold the vnode until the inactive finishes to
3038          * prevent vgone() races.  We drop the use count here and the
3039          * hold count below when we're done.
3040          *
3041          * If we release the last usecount we take ownership of the hold
3042          * count which provides liveness of the vnode, in which case we
3043          * have to vdrop.
3044          */
3045         if (!refcount_release(&vp->v_usecount))
3046                 return;
3047         VI_LOCK(vp);
3048         v_decr_devcount(vp);
3049         /*
3050          * By the time we got here someone else might have transitioned
3051          * the count back to > 0.
3052          */
3053         if (vp->v_usecount > 0) {
3054                 vdropl(vp);
3055                 return;
3056         }
3057         if (vp->v_iflag & VI_DOINGINACT) {
3058                 vdropl(vp);
3059                 return;
3060         }
3061
3062         /*
3063          * Check if the fs wants to perform inactive processing. Note we
3064          * may be only holding the interlock, in which case it is possible
3065          * someone else called vgone on the vnode and ->v_data is now NULL.
3066          * Since vgone performs inactive on its own there is nothing to do
3067          * here but to drop our hold count.
3068          */
3069         if (__predict_false(VN_IS_DOOMED(vp)) ||
3070             VOP_NEED_INACTIVE(vp) == 0) {
3071                 vdropl(vp);
3072                 return;
3073         }
3074
3075         /*
3076          * We must call VOP_INACTIVE with the node locked. Mark
3077          * as VI_DOINGINACT to avoid recursion.
3078          */
3079         vp->v_iflag |= VI_OWEINACT;
3080         switch (func) {
3081         case VPUTX_VRELE:
3082                 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
3083                 VI_LOCK(vp);
3084                 break;
3085         case VPUTX_VPUT:
3086                 error = VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT);
3087                 VI_LOCK(vp);
3088                 break;
3089         case VPUTX_VUNREF:
3090                 error = 0;
3091                 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
3092                         error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
3093                         VI_LOCK(vp);
3094                 }
3095                 break;
3096         }
3097         VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp,
3098             ("vnode with usecount and VI_OWEINACT set"));
3099         if (error == 0) {
3100                 if (vp->v_iflag & VI_OWEINACT)
3101                         vinactive(vp);
3102                 if (func != VPUTX_VUNREF)
3103                         VOP_UNLOCK(vp);
3104         }
3105         vdropl(vp);
3106 }
3107
/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 *
 * Implemented as vputx() with VPUTX_VRELE, which takes the vnode lock
 * itself if inactive processing is required.
 */
void
vrele(struct vnode *vp)
{

	vputx(vp, VPUTX_VRELE);
}
3118
/*
 * Release an already locked vnode.  This gives the same effects as
 * unlock+vrele(), but takes less time and avoids releasing and
 * re-acquiring the lock (as vrele() acquires the lock internally.)
 *
 * It is an invariant that all VOP_* calls operate on a held vnode.
 * We may be only having an implicit hold stemming from our usecount,
 * which we are about to release. If we unlock the vnode afterwards we
 * open a time window where someone else dropped the last usecount and
 * proceeded to free the vnode before our unlock finished. For this
 * reason we unlock the vnode early. This is a little bit wasteful as
 * it may be the vnode is exclusively locked and inactive processing is
 * needed, in which case we are adding work.
 */
void
vput(struct vnode *vp)
{

	/* Unlock first, while our use reference still pins the vnode. */
	VOP_UNLOCK(vp);
	vputx(vp, VPUTX_VPUT);
}
3140
/*
 * Release a use reference on a locked vnode.  Do not unlock the vnode lock.
 *
 * Implemented as vputx() with VPUTX_VUNREF, which asserts that the vnode
 * is locked and skips the final VOP_UNLOCK().
 */
void
vunref(struct vnode *vp)
{

	vputx(vp, VPUTX_VUNREF);
}
3150
3151 /*
3152  * Increase the hold count and activate if this is the first reference.
3153  */
3154 static void
3155 vhold_activate(struct vnode *vp)
3156 {
3157         struct mount *mp;
3158
3159         ASSERT_VI_LOCKED(vp, __func__);
3160         VNASSERT(vp->v_holdcnt == 0, vp,
3161             ("%s: wrong hold count", __func__));
3162         VNASSERT(vp->v_op != NULL, vp,
3163             ("%s: vnode already reclaimed.", __func__));
3164         /*
3165          * Remove a vnode from the free list, mark it as in use,
3166          * and put it on the active list.
3167          */
3168         VNASSERT(vp->v_mount != NULL, vp,
3169             ("_vhold: vnode not on per mount vnode list"));
3170         mp = vp->v_mount;
3171         mtx_lock(&mp->mnt_listmtx);
3172         if ((vp->v_mflag & VMP_TMPMNTFREELIST) != 0) {
3173                 TAILQ_REMOVE(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist);
3174                 mp->mnt_tmpfreevnodelistsize--;
3175                 vp->v_mflag &= ~VMP_TMPMNTFREELIST;
3176         } else {
3177                 mtx_lock(&vnode_free_list_mtx);
3178                 TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
3179                 freevnodes--;
3180                 mtx_unlock(&vnode_free_list_mtx);
3181         }
3182         KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
3183             ("Activating already active vnode"));
3184         vp->v_iflag &= ~VI_FREE;
3185         vp->v_iflag |= VI_ACTIVE;
3186         TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
3187         mp->mnt_activevnodelistsize++;
3188         mtx_unlock(&mp->mnt_listmtx);
3189         refcount_acquire(&vp->v_holdcnt);
3190 }
3191
3192 void
3193 vhold(struct vnode *vp)
3194 {
3195
3196         ASSERT_VI_UNLOCKED(vp, __func__);
3197         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3198         if (refcount_acquire_if_not_zero(&vp->v_holdcnt)) {
3199                 VNODE_REFCOUNT_FENCE_ACQ();
3200                 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
3201                     ("vhold: vnode with holdcnt is free"));
3202                 return;
3203         }
3204         VI_LOCK(vp);
3205         vholdl(vp);
3206         VI_UNLOCK(vp);
3207 }
3208
3209 void
3210 vholdl(struct vnode *vp)
3211 {
3212
3213         ASSERT_VI_LOCKED(vp, __func__);
3214         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3215         if ((vp->v_iflag & VI_FREE) == 0) {
3216                 refcount_acquire(&vp->v_holdcnt);
3217                 return;
3218         }
3219         vhold_activate(vp);
3220 }
3221
3222 void
3223 vholdnz(struct vnode *vp)
3224 {
3225
3226         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3227 #ifdef INVARIANTS
3228         int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
3229         VNASSERT(old > 0, vp, ("%s: wrong hold count %d", __func__, old));
3230 #else
3231         atomic_add_int(&vp->v_holdcnt, 1);
3232 #endif
3233 }
3234
3235 /*
3236  * Drop the hold count of the vnode.  If this is the last reference to
3237  * the vnode we place it on the free list unless it has been vgone'd
3238  * (marked VIRF_DOOMED) in which case we will free it.
3239  *
3240  * Because the vnode vm object keeps a hold reference on the vnode if
3241  * there is at least one resident non-cached page, the vnode cannot
3242  * leave the active list without the page cleanup done.
3243  */
3244 static void
3245 vdrop_deactivate(struct vnode *vp)
3246 {
3247         struct mount *mp;
3248
3249         ASSERT_VI_LOCKED(vp, __func__);
3250         /*
3251          * Mark a vnode as free: remove it from its active list
3252          * and put it up for recycling on the freelist.
3253          */
3254         VNASSERT(!VN_IS_DOOMED(vp), vp,
3255             ("vdrop: returning doomed vnode"));
3256         VNASSERT(vp->v_op != NULL, vp,
3257             ("vdrop: vnode already reclaimed."));
3258         VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
3259             ("vnode already free"));
3260         VNASSERT(vp->v_holdcnt == 0, vp,
3261             ("vdrop: freeing when we shouldn't"));
3262         if ((vp->v_iflag & VI_OWEINACT) == 0) {
3263                 mp = vp->v_mount;
3264                 mtx_lock(&mp->mnt_listmtx);
3265                 if (vp->v_iflag & VI_ACTIVE) {
3266                         vp->v_iflag &= ~VI_ACTIVE;
3267                         TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
3268                         mp->mnt_activevnodelistsize--;
3269                 }
3270                 TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist);
3271                 mp->mnt_tmpfreevnodelistsize++;
3272                 vp->v_iflag |= VI_FREE;
3273                 vp->v_mflag |= VMP_TMPMNTFREELIST;
3274                 VI_UNLOCK(vp);
3275                 if (mp->mnt_tmpfreevnodelistsize >= mnt_free_list_batch)
3276                         vnlru_return_batch_locked(mp);
3277                 mtx_unlock(&mp->mnt_listmtx);
3278         } else {
3279                 VI_UNLOCK(vp);
3280                 counter_u64_add(free_owe_inact, 1);
3281         }
3282 }
3283
3284 void
3285 vdrop(struct vnode *vp)
3286 {
3287
3288         ASSERT_VI_UNLOCKED(vp, __func__);
3289         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3290         if (refcount_release_if_not_last(&vp->v_holdcnt))
3291                 return;
3292         VI_LOCK(vp);
3293         vdropl(vp);
3294 }
3295
3296 void
3297 vdropl(struct vnode *vp)
3298 {
3299
3300         ASSERT_VI_LOCKED(vp, __func__);
3301         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3302         if (!refcount_release(&vp->v_holdcnt)) {
3303                 VI_UNLOCK(vp);
3304                 return;
3305         }
3306         if (VN_IS_DOOMED(vp)) {
3307                 freevnode(vp);
3308                 return;
3309         }
3310         vdrop_deactivate(vp);
3311 }
3312
3313 /*
3314  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
3315  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
3316  * OWEINACT tracks whether a vnode missed a call to inactive due to a
3317  * failed lock upgrade.
3318  */
3319 void
3320 vinactive(struct vnode *vp)
3321 {
3322         struct vm_object *obj;
3323
3324         ASSERT_VOP_ELOCKED(vp, "vinactive");
3325         ASSERT_VI_LOCKED(vp, "vinactive");
3326         VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
3327             ("vinactive: recursed on VI_DOINGINACT"));
3328         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3329         vp->v_iflag |= VI_DOINGINACT;
3330         vp->v_iflag &= ~VI_OWEINACT;
3331         VI_UNLOCK(vp);
3332         /*
3333          * Before moving off the active list, we must be sure that any
3334          * modified pages are converted into the vnode's dirty
3335          * buffers, since these will no longer be checked once the
3336          * vnode is on the inactive list.
3337          *
3338          * The write-out of the dirty pages is asynchronous.  At the
3339          * point that VOP_INACTIVE() is called, there could still be
3340          * pending I/O and dirty pages in the object.
3341          */
3342         if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
3343             vm_object_mightbedirty(obj)) {
3344                 VM_OBJECT_WLOCK(obj);
3345                 vm_object_page_clean(obj, 0, 0, 0);
3346                 VM_OBJECT_WUNLOCK(obj);
3347         }
3348         VOP_INACTIVE(vp, curthread);
3349         VI_LOCK(vp);
3350         VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
3351             ("vinactive: lost VI_DOINGINACT"));
3352         vp->v_iflag &= ~VI_DOINGINACT;
3353 }
3354
3355 /*
3356  * Remove any vnodes in the vnode table belonging to mount point mp.
3357  *
3358  * If FORCECLOSE is not specified, there should not be any active ones,
3359  * return error if any are found (nb: this is a user error, not a
3360  * system error). If FORCECLOSE is specified, detach any active vnodes
3361  * that are found.
3362  *
3363  * If WRITECLOSE is set, only flush out regular file vnodes open for
3364  * writing.
3365  *
3366  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
3367  *
3368  * `rootrefs' specifies the base reference count for the root vnode
3369  * of this filesystem. The root vnode is considered busy if its
3370  * v_usecount exceeds this value. On a successful return, vflush()
3371  * will call vrele() on the root vnode exactly rootrefs times.
3372  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
3373  * be zero.
      *
      * Returns 0 on success, the error reported by VFS_ROOT() or VOP_FSYNC()
      * if one of them fails, or EBUSY if busy vnodes remain after the scan.
3374  */
3375 #ifdef DIAGNOSTIC
3376 static int busyprt = 0;         /* print out busy vnodes */
3377 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
3378 #endif
3379
3380 int
3381 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
3382 {
3383         struct vnode *vp, *mvp, *rootvp = NULL;
3384         struct vattr vattr;
3385         int busy = 0, error;
3386
3387         CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
3388             rootrefs, flags);
3389         if (rootrefs > 0) {
3390                 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
3391                     ("vflush: bad args"));
3392                 /*
3393                  * Get the filesystem root vnode. We can vput() it
3394                  * immediately, since with rootrefs > 0, it won't go away.
3395                  */
3396                 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
3397                         CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
3398                             __func__, error);
3399                         return (error);
3400                 }
3401                 vput(rootvp);
3402         }
3403 loop:
3404         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
                     /*
                      * Hold the vnode across the lock acquisition below.
                      * vholdl() asserts that the vnode interlock is held.
                      */
3405                 vholdl(vp);
3406                 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
3407                 if (error) {
                             /* Could not lock vp: drop the hold and rescan. */
3408                         vdrop(vp);
3409                         MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
3410                         goto loop;
3411                 }
3412                 /*
3413                  * Skip over vnodes marked VV_SYSTEM.
3414                  */
3415                 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
3416                         VOP_UNLOCK(vp);
3417                         vdrop(vp);
3418                         continue;
3419                 }
3420                 /*
3421                  * If WRITECLOSE is set, flush out unlinked but still open
3422                  * files (even if open only for reading) and regular file
3423                  * vnodes open for writing.
3424                  */
3425                 if (flags & WRITECLOSE) {
3426                         if (vp->v_object != NULL) {
3427                                 VM_OBJECT_WLOCK(vp->v_object);
3428                                 vm_object_page_clean(vp->v_object, 0, 0, 0);
3429                                 VM_OBJECT_WUNLOCK(vp->v_object);
3430                         }
3431                         error = VOP_FSYNC(vp, MNT_WAIT, td);
3432                         if (error != 0) {
3433                                 VOP_UNLOCK(vp);
3434                                 vdrop(vp);
3435                                 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
3436                                 return (error);
3437                         }
3438                         error = VOP_GETATTR(vp, &vattr, td->td_ucred);
                             /*
                              * Both exits below go through vdropl(), which
                              * asserts that the interlock is held.
                              */
3439                         VI_LOCK(vp);
3440
                             /*
                              * Leave the vnode alone when it is either
                              * untyped or still linked, and it is not a
                              * regular file open for writing.
                              */
3441                         if ((vp->v_type == VNON ||
3442                             (error == 0 && vattr.va_nlink > 0)) &&
3443                             (vp->v_writecount <= 0 || vp->v_type != VREG)) {
3444                                 VOP_UNLOCK(vp);
3445                                 vdropl(vp);
3446                                 continue;
3447                         }
3448                 } else
3449                         VI_LOCK(vp);
3450                 /*
3451                  * With v_usecount == 0, all we need to do is clear out the
3452                  * vnode data structures and we are done.
3453                  *
3454                  * If FORCECLOSE is set, forcibly close the vnode.
3455                  */
3456                 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
3457                         vgonel(vp);
3458                 } else {
3459                         busy++;
3460 #ifdef DIAGNOSTIC
3461                         if (busyprt)
3462                                 vn_printf(vp, "vflush: busy vnode ");
3463 #endif
3464                 }
3465                 VOP_UNLOCK(vp);
3466                 vdropl(vp);
3467         }
3468         if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
3469                 /*
3470                  * If just the root vnode is busy, and if its refcount
3471                  * is equal to `rootrefs', then go ahead and kill it.
3472                  */
3473                 VI_LOCK(rootvp);
3474                 KASSERT(busy > 0, ("vflush: not busy"));
3475                 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
3476                     ("vflush: usecount %d < rootrefs %d",
3477                      rootvp->v_usecount, rootrefs));
3478                 if (busy == 1 && rootvp->v_usecount == rootrefs) {
3479                         VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
3480                         vgone(rootvp);
3481                         VOP_UNLOCK(rootvp);
3482                         busy = 0;
3483                 } else
3484                         VI_UNLOCK(rootvp);
3485         }
3486         if (busy) {
3487                 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
3488                     busy);
3489                 return (EBUSY);
3490         }
             /* Success: drop the `rootrefs' references on the root vnode. */
3491         for (; rootrefs > 0; rootrefs--)
3492                 vrele(rootvp);
3493         return (0);
3494 }
3495
/*
 * Recycle an unused vnode to the front of the free list.
 *
 * Takes the vnode interlock around vrecyclel() and passes its result
 * through: non-zero if the vnode was recycled, zero otherwise.
 */
int
vrecycle(struct vnode *vp)
{
        int done;

        VI_LOCK(vp);
        done = vrecyclel(vp);
        VI_UNLOCK(vp);
        return (done);
}
3509
3510 /*
3511  * vrecycle, with the vp interlock held.
3512  */
3513 int
3514 vrecyclel(struct vnode *vp)
3515 {
3516         int recycled;
3517
3518         ASSERT_VOP_ELOCKED(vp, __func__);
3519         ASSERT_VI_LOCKED(vp, __func__);
3520         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3521         recycled = 0;
3522         if (vp->v_usecount == 0) {
3523                 recycled = 1;
3524                 vgonel(vp);
3525         }
3526         return (recycled);
3527 }
3528
/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  This is vgonel() wrapped in acquisition and release of the
 * vnode interlock.
 */
void
vgone(struct vnode *vp)
{

        VI_LOCK(vp);
        vgonel(vp);
        VI_UNLOCK(vp);
}
3540
/*
 * Do-nothing lower-vnode notification handler.  vfs_notify_upper()
 * installs it as both the vfs_reclaim_lowervp and vfs_unlink_lowervp
 * operation of its temporary marker mount.
 */
3541 static void
3542 notify_lowervp_vfs_dummy(struct mount *mp __unused,
3543     struct vnode *lowervp __unused)
3544 {
3545 }
3546
3547 /*
3548  * Notify upper mounts about reclaimed or unlinked vnode.
3549  */
3550 void
3551 vfs_notify_upper(struct vnode *vp, int event)
3552 {
3553         static struct vfsops vgonel_vfsops = {
3554                 .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
3555                 .vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
3556         };
3557         struct mount *mp, *ump, *mmp;
3558
3559         mp = vp->v_mount;
3560         if (mp == NULL)
3561                 return;
3562         if (TAILQ_EMPTY(&mp->mnt_uppers))
3563                 return;
3564
3565         mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
3566         mmp->mnt_op = &vgonel_vfsops;
3567         mmp->mnt_kern_flag |= MNTK_MARKER;
3568         MNT_ILOCK(mp);
3569         mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
3570         for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
3571                 if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
3572                         ump = TAILQ_NEXT(ump, mnt_upper_link);
3573                         continue;
3574                 }
3575                 TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
3576                 MNT_IUNLOCK(mp);
3577                 switch (event) {
3578                 case VFS_NOTIFY_UPPER_RECLAIM:
3579                         VFS_RECLAIM_LOWERVP(ump, vp);
3580                         break;
3581                 case VFS_NOTIFY_UPPER_UNLINK:
3582                         VFS_UNLINK_LOWERVP(ump, vp);
3583                         break;
3584                 default:
3585                         KASSERT(0, ("invalid event %d", event));
3586                         break;
3587                 }
3588                 MNT_ILOCK(mp);
3589                 ump = TAILQ_NEXT(mmp, mnt_upper_link);
3590                 TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
3591         }
3592         free(mmp, M_TEMP);
3593         mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
3594         if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
3595                 mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
3596                 wakeup(&mp->mnt_uppers);
3597         }
3598         MNT_IUNLOCK(mp);
3599 }
3600
3601 /*
3602  * vgone, with the vp interlock held.
      *
      * The vnode must be exclusively locked, its interlock held, and it must
      * carry at least one hold reference.  A vnode already marked VIRF_DOOMED
      * is left untouched.  Otherwise the vnode is marked doomed, upper mounts
      * are notified, the vnode is closed and inactivated if needed, its
      * buffers and VM object are torn down, VOP_RECLAIM() is called, and the
      * vnode is removed from its mount and switched to dead_vnodeops.
      *
      * The interlock is dropped during the teardown and is held again on
      * return.
3603  */
3604 static void
3605 vgonel(struct vnode *vp)
3606 {
3607         struct thread *td;
3608         struct mount *mp;
3609         vm_object_t object;
3610         bool active, oweinact;
3611
3612         ASSERT_VOP_ELOCKED(vp, "vgonel");
3613         ASSERT_VI_LOCKED(vp, "vgonel");
3614         VNASSERT(vp->v_holdcnt, vp,
3615             ("vgonel: vp %p has no reference.", vp));
3616         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3617         td = curthread;
3618
3619         /*
3620          * Don't vgonel if we're already doomed.
3621          */
3622         if (vp->v_irflag & VIRF_DOOMED)
3623                 return;
3624         vp->v_irflag |= VIRF_DOOMED;
3625
3626         /*
3627          * Check to see if the vnode is in use.  If so, we have to call
3628          * VOP_CLOSE() and VOP_INACTIVE().
3629          */
3630         active = vp->v_usecount > 0;
3631         oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
             /* Both values were sampled under the interlock; now drop it. */
3632         VI_UNLOCK(vp);
3633         vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
3634
3635         /*
3636          * If purging an active vnode, it must be closed and
3637          * deactivated before being reclaimed.
3638          */
3639         if (active)
3640                 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
3641         if (oweinact || active) {
3642                 VI_LOCK(vp);
                     /*
                      * vinactive() asserts that VI_DOINGINACT is clear, so
                      * skip it when an inactivation is already in progress.
                      */
3643                 if ((vp->v_iflag & VI_DOINGINACT) == 0)
3644                         vinactive(vp);
3645                 VI_UNLOCK(vp);
3646         }
3647         if (vp->v_type == VSOCK)
3648                 vfs_unp_reclaim(vp);
3649
3650         /*
3651          * Clean out any buffers associated with the vnode.
3652          * If the flush fails, just toss the buffers.
3653          */
             /*
              * mp stays NULL unless vn_start_secondary_write() sets it; a
              * non-NULL mp is passed to vn_finished_secondary_write() after
              * the reclaim below.
              */
3654         mp = NULL;
3655         if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
3656                 (void) vn_start_secondary_write(vp, &mp, V_WAIT);
3657         if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
                     /* Retry without V_SAVE until vinvalbuf() succeeds. */
3658                 while (vinvalbuf(vp, 0, 0, 0) != 0)
3659                         ;
3660         }
3661
3662         BO_LOCK(&vp->v_bufobj);
3663         KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
3664             vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
3665             TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
3666             vp->v_bufobj.bo_clean.bv_cnt == 0,
3667             ("vp %p bufobj not invalidated", vp));
3668
3669         /*
3670          * For VMIO bufobj, BO_DEAD is set later, or in
3671          * vm_object_terminate() after the object's page queue is
3672          * flushed.
3673          */
3674         object = vp->v_bufobj.bo_object;
3675         if (object == NULL)
3676                 vp->v_bufobj.bo_flag |= BO_DEAD;
3677         BO_UNLOCK(&vp->v_bufobj);
3678
3679         /*
3680          * Handle the VM part.  Tmpfs handles v_object on its own (the
3681          * OBJT_VNODE check).  Nullfs or other bypassing filesystems
3682          * should not touch the object borrowed from the lower vnode
3683          * (the handle check).
3684          */
3685         if (object != NULL && object->type == OBJT_VNODE &&
3686             object->handle == vp)
3687                 vnode_destroy_vobject(vp);
3688
3689         /*
3690          * Reclaim the vnode.
3691          */
3692         if (VOP_RECLAIM(vp, td))
3693                 panic("vgone: cannot reclaim");
3694         if (mp != NULL)
3695                 vn_finished_secondary_write(mp);
3696         VNASSERT(vp->v_object == NULL, vp,
3697             ("vop_reclaim left v_object vp=%p", vp));
3698         /*
3699          * Clear the advisory locks and wake up waiting threads.
3700          */
3701         (void)VOP_ADVLOCKPURGE(vp);
3702         vp->v_lockf = NULL;
3703         /*
3704          * Delete from old mount point vnode list.
3705          */
3706         delmntque(vp);
3707         cache_purge(vp);
3708         /*
3709          * Done with purge, reset to the standard lock and invalidate
3710          * the vnode.
3711          */
             /* The interlock taken here is still held when we return. */
3712         VI_LOCK(vp);
3713         vp->v_vnlock = &vp->v_lock;
3714         vp->v_op = &dead_vnodeops;
3715         vp->v_type = VBAD;
3716 }
3717
3718 /*
3719  * Calculate the total number of references to a special device.
3720  */
3721 int
3722 vcount(struct vnode *vp)
3723 {
3724         int count;
3725
3726         dev_lock();
3727         count = vp->v_rdev->si_usecount;
3728         dev_unlock();
3729         return (count);
3730 }
3731
3732 /*
3733  * Print out a description of a vnode.
3734  */
3735 static char *typename[] =
3736 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
3737  "VMARKER"};
3738
3739 void
3740 vn_printf(struct vnode *vp, const char *fmt, ...)
3741 {
3742         va_list ap;
3743         char buf[256], buf2[16];
3744         u_long flags;
3745
3746         va_start(ap, fmt);
3747         vprintf(fmt, ap);
3748         va_end(ap);
3749         printf("%p: ", (void *)vp);
3750         printf("type %s\n", typename[vp->v_type]);
3751         printf("    usecount %d, writecount %d, refcount %d",
3752             vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
3753         switch (vp->v_type) {
3754         case VDIR:
3755                 printf(" mountedhere %p\n", vp->v_mountedhere);
3756                 break;
3757         case VCHR:
3758                 printf(" rdev %p\n", vp->v_rdev);
3759                 break;
3760         case VSOCK:
3761                 printf(" socket %p\n", vp->v_unpcb);
3762                 break;
3763         case VFIFO:
3764                 printf(" fifoinfo %p\n", vp->v_fifoinfo);
3765                 break;
3766         default:
3767                 printf("\n");
3768                 break;
3769         }
3770         buf[0] = '\0';
3771         buf[1] = '\0';
3772         if (vp->v_irflag & VIRF_DOOMED)
3773                 strlcat(buf, "|VIRF_DOOMED", sizeof(buf));
3774         flags = vp->v_irflag & ~(VIRF_DOOMED);
3775         if (flags != 0) {
3776                 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags);
3777                 strlcat(buf, buf2, sizeof(buf));
3778         }
3779         if (vp->v_vflag & VV_ROOT)
3780                 strlcat(buf, "|VV_ROOT", sizeof(buf));
3781         if (vp->v_vflag & VV_ISTTY)
3782                 strlcat(buf, "|VV_ISTTY", sizeof(buf));
3783         if (vp->v_vflag & VV_NOSYNC)
3784                 strlcat(buf, "|VV_NOSYNC", sizeof(buf));
3785         if (vp->v_vflag & VV_ETERNALDEV)
3786                 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
3787         if (vp->v_vflag & VV_CACHEDLABEL)
3788                 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
3789         if (vp->v_vflag & VV_VMSIZEVNLOCK)
3790                 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf));
3791         if (vp->v_vflag & VV_COPYONWRITE)
3792                 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
3793         if (vp->v_vflag & VV_SYSTEM)
3794                 strlcat(buf, "|VV_SYSTEM", sizeof(buf));
3795         if (vp->v_vflag & VV_PROCDEP)
3796                 strlcat(buf, "|VV_PROCDEP", sizeof(buf));
3797         if (vp->v_vflag & VV_NOKNOTE)
3798                 strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
3799         if (vp->v_vflag & VV_DELETED)
3800                 strlcat(buf, "|VV_DELETED", sizeof(buf));
3801         if (vp->v_vflag & VV_MD)
3802                 strlcat(buf, "|VV_MD", sizeof(buf));
3803         if (vp->v_vflag & VV_FORCEINSMQ)
3804                 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
3805         if (vp->v_vflag & VV_READLINK)
3806                 strlcat(buf, "|VV_READLINK", sizeof(buf));
3807         flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
3808             VV_CACHEDLABEL | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
3809             VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
3810         if (flags != 0) {
3811                 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
3812                 strlcat(buf, buf2, sizeof(buf));
3813         }
3814         if (vp->v_iflag & VI_TEXT_REF)
3815                 strlcat(buf, "|VI_TEXT_REF", sizeof(buf));
3816         if (vp->v_iflag & VI_MOUNT)
3817                 strlcat(buf, "|VI_MOUNT", sizeof(buf));
3818         if (vp->v_iflag & VI_FREE)
3819                 strlcat(buf, "|VI_FREE", sizeof(buf));
3820         if (vp->v_iflag & VI_ACTIVE)
3821                 strlcat(buf, "|VI_ACTIVE", sizeof(buf));
3822         if (vp->v_iflag & VI_DOINGINACT)
3823                 strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
3824         if (vp->v_iflag & VI_OWEINACT)
3825                 strlcat(buf, "|VI_OWEINACT", sizeof(buf));
3826         flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_FREE | VI_ACTIVE |
3827             VI_DOINGINACT | VI_OWEINACT);
3828         if (flags != 0) {
3829                 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
3830                 strlcat(buf, buf2, sizeof(buf));
3831         }
3832         if (vp->v_mflag & VMP_TMPMNTFREELIST)
3833                 strlcat(buf, "|VMP_TMPMNTFREELIST", sizeof(buf));
3834         flags = vp->v_mflag & ~(VMP_TMPMNTFREELIST);
3835         if (flags != 0) {
3836                 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags);
3837                 strlcat(buf, buf2, sizeof(buf));
3838         }
3839         printf("    flags (%s)\n", buf + 1);
3840         if (mtx_owned(VI_MTX(vp)))
3841                 printf(" VI_LOCKed");
3842         if (vp->v_object != NULL)
3843                 printf("    v_object %p ref %d pages %d "
3844                     "cleanbuf %d dirtybuf %d\n",
3845                     vp->v_object, vp->v_object->ref_count,
3846                     vp->v_object->resident_page_count,
3847                     vp->v_bufobj.bo_clean.bv_cnt,
3848                     vp->v_bufobj.bo_dirty.bv_cnt);
3849         printf("    ");
3850         lockmgr_printinfo(vp->v_vnlock);
3851         if (vp->v_data != NULL)
3852                 VOP_PRINT(vp);
3853 }
3854
3855 #ifdef DDB
3856 /*
3857  * List all of the locked vnodes in the system.
3858  * Called when debugging the kernel.
3859  */
3860 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
3861 {
3862         struct mount *mp;
3863         struct vnode *vp;
3864
3865         /*
3866          * Note: because this is DDB, we can't obey the locking semantics
3867          * for these structures, which means we could catch an inconsistent
3868          * state and dereference a nasty pointer.  Not much to be done
3869          * about that.
3870          */
3871         db_printf("Locked vnodes\n");
3872         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3873                 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3874                         if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
3875                                 vn_printf(vp, "vnode ");
3876                 }
3877         }
3878 }
3879
3880 /*
3881  * Show details about the given vnode.
3882  */
3883 DB_SHOW_COMMAND(vnode, db_show_vnode)
3884 {
3885         struct vnode *vp;
3886
3887         if (!have_addr)
3888                 return;
3889         vp = (struct vnode *)addr;
3890         vn_printf(vp, "vnode ");
3891 }
3892
3893 /*
3894  * Show details about the given mount point.
3895  */
3896 DB_SHOW_COMMAND(mount, db_show_mount)
3897 {
3898         struct mount *mp;
3899         struct vfsopt *opt;
3900         struct statfs *sp;
3901         struct vnode *vp;
3902         char buf[512];
3903         uint64_t mflags;
3904         u_int flags;
3905
3906         if (!have_addr) {
3907                 /* No address given, print short info about all mount points. */
3908                 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3909                         db_printf("%p %s on %s (%s)\n", mp,
3910                             mp->mnt_stat.f_mntfromname,
3911                             mp->mnt_stat.f_mntonname,
3912                             mp->mnt_stat.f_fstypename);
3913                         if (db_pager_quit)
3914                                 break;
3915                 }
3916                 db_printf("\nMore info: show mount <addr>\n");
3917                 return;
3918         }
3919
3920         mp = (struct mount *)addr;
3921         db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
3922             mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
3923
3924         buf[0] = '\0';
3925         mflags = mp->mnt_flag;
3926 #define MNT_FLAG(flag)  do {                                            \
3927         if (mflags & (flag)) {                                          \
3928                 if (buf[0] != '\0')                                     \
3929                         strlcat(buf, ", ", sizeof(buf));                \
3930                 strlcat(buf, (#flag) + 4, sizeof(buf));                 \
3931                 mflags &= ~(flag);                                      \
3932         }                                                               \
3933 } while (0)
3934         MNT_FLAG(MNT_RDONLY);
3935         MNT_FLAG(MNT_SYNCHRONOUS);
3936         MNT_FLAG(MNT_NOEXEC);
3937         MNT_FLAG(MNT_NOSUID);
3938         MNT_FLAG(MNT_NFS4ACLS);
3939         MNT_FLAG(MNT_UNION);
3940         MNT_FLAG(MNT_ASYNC);
3941         MNT_FLAG(MNT_SUIDDIR);
3942         MNT_FLAG(MNT_SOFTDEP);
3943         MNT_FLAG(MNT_NOSYMFOLLOW);
3944         MNT_FLAG(MNT_GJOURNAL);
3945         MNT_FLAG(MNT_MULTILABEL);
3946         MNT_FLAG(MNT_ACLS);
3947         MNT_FLAG(MNT_NOATIME);
3948         MNT_FLAG(MNT_NOCLUSTERR);
3949         MNT_FLAG(MNT_NOCLUSTERW);
3950         MNT_FLAG(MNT_SUJ);
3951         MNT_FLAG(MNT_EXRDONLY);
3952         MNT_FLAG(MNT_EXPORTED);
3953         MNT_FLAG(MNT_DEFEXPORTED);
3954         MNT_FLAG(MNT_EXPORTANON);
3955         MNT_FLAG(MNT_EXKERB);
3956         MNT_FLAG(MNT_EXPUBLIC);
3957         MNT_FLAG(MNT_LOCAL);
3958         MNT_FLAG(MNT_QUOTA);
3959         MNT_FLAG(MNT_ROOTFS);
3960         MNT_FLAG(MNT_USER);
3961         MNT_FLAG(MNT_IGNORE);
3962         MNT_FLAG(MNT_UPDATE);
3963         MNT_FLAG(MNT_DELEXPORT);
3964         MNT_FLAG(MNT_RELOAD);
3965         MNT_FLAG(MNT_FORCE);
3966         MNT_FLAG(MNT_SNAPSHOT);
3967         MNT_FLAG(MNT_BYFSID);
3968 #undef MNT_FLAG
3969         if (mflags != 0) {
3970                 if (buf[0] != '\0')
3971                         strlcat(buf, ", ", sizeof(buf));
3972                 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3973                     "0x%016jx", mflags);
3974         }
3975         db_printf("    mnt_flag = %s\n", buf);
3976
3977         buf[0] = '\0';
3978         flags = mp->mnt_kern_flag;
3979 #define MNT_KERN_FLAG(flag)     do {                                    \
3980         if (flags & (flag)) {                                           \
3981                 if (buf[0] != '\0')                                     \
3982                         strlcat(buf, ", ", sizeof(buf));                \
3983                 strlcat(buf, (#flag) + 5, sizeof(buf));                 \
3984                 flags &= ~(flag);                                       \
3985         }                                                               \
3986 } while (0)
3987         MNT_KERN_FLAG(MNTK_UNMOUNTF);
3988         MNT_KERN_FLAG(MNTK_ASYNC);
3989         MNT_KERN_FLAG(MNTK_SOFTDEP);
3990         MNT_KERN_FLAG(MNTK_DRAINING);
3991         MNT_KERN_FLAG(MNTK_REFEXPIRE);
3992         MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
3993         MNT_KERN_FLAG(MNTK_SHARED_WRITES);
3994         MNT_KERN_FLAG(MNTK_NO_IOPF);
3995         MNT_KERN_FLAG(MNTK_VGONE_UPPER);
3996         MNT_KERN_FLAG(MNTK_VGONE_WAITER);
3997         MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
3998         MNT_KERN_FLAG(MNTK_MARKER);
3999         MNT_KERN_FLAG(MNTK_USES_BCACHE);
4000         MNT_KERN_FLAG(MNTK_NOASYNC);
4001         MNT_KERN_FLAG(MNTK_UNMOUNT);
4002         MNT_KERN_FLAG(MNTK_MWAIT);
4003         MNT_KERN_FLAG(MNTK_SUSPEND);
4004         MNT_KERN_FLAG(MNTK_SUSPEND2);
4005         MNT_KERN_FLAG(MNTK_SUSPENDED);
4006         MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
4007         MNT_KERN_FLAG(MNTK_NOKNOTE);
4008 #undef MNT_KERN_FLAG
4009         if (flags != 0) {
4010                 if (buf[0] != '\0')
4011                         strlcat(buf, ", ", sizeof(buf));
4012                 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
4013                     "0x%08x", flags);
4014         }
4015         db_printf("    mnt_kern_flag = %s\n", buf);
4016
4017         db_printf("    mnt_opt = ");
4018         opt = TAILQ_FIRST(mp->mnt_opt);
4019         if (opt != NULL) {
4020                 db_printf("%s", opt->name);
4021                 opt = TAILQ_NEXT(opt, link);
4022                 while (opt != NULL) {
4023                         db_printf(", %s", opt->name);
4024                         opt = TAILQ_NEXT(opt, link);
4025                 }
4026         }
4027         db_printf("\n");
4028
4029         sp = &mp->mnt_stat;
4030         db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
4031             "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
4032             "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
4033             "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
4034             (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
4035             (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
4036             (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
4037             (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
4038             (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
4039             (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
4040             (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
4041             (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
4042
4043         db_printf("    mnt_cred = { uid=%u ruid=%u",
4044             (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
4045         if (jailed(mp->mnt_cred))
4046                 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
4047         db_printf(" }\n");
4048         db_printf("    mnt_ref = %d (with %d in the struct)\n",
4049             vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref);
4050         db_printf("    mnt_gen = %d\n", mp->mnt_gen);
4051         db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
4052         db_printf("    mnt_activevnodelistsize = %d\n",
4053             mp->mnt_activevnodelistsize);
4054         db_printf("    mnt_writeopcount = %d (with %d in the struct)\n",
4055             vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount);
4056         db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
4057         db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
4058         db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
4059         db_printf("    mnt_lockref = %d (with %d in the struct)\n",
4060             vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref);
4061         db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
4062         db_printf("    mnt_secondary_accwrites = %d\n",
4063             mp->mnt_secondary_accwrites);
4064         db_printf("    mnt_gjprovider = %s\n",
4065             mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
4066         db_printf("    mnt_vfs_ops = %d\n", mp->mnt_vfs_ops);
4067
4068         db_printf("\n\nList of active vnodes\n");
4069         TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
4070                 if (vp->v_type != VMARKER) {
4071                         vn_printf(vp, "vnode ");
4072                         if (db_pager_quit)
4073                                 break;
4074                 }
4075         }
4076         db_printf("\n\nList of inactive vnodes\n");
4077         TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
4078                 if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
4079                         vn_printf(vp, "vnode ");
4080                         if (db_pager_quit)
4081                                 break;
4082                 }
4083         }
4084 }
4085 #endif  /* DDB */
4086
4087 /*
4088  * Fill in a struct xvfsconf based on a struct vfsconf.
4089  */
4090 static int
4091 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
4092 {
4093         struct xvfsconf xvfsp;
4094
4095         bzero(&xvfsp, sizeof(xvfsp));
4096         strcpy(xvfsp.vfc_name, vfsp->vfc_name);
4097         xvfsp.vfc_typenum = vfsp->vfc_typenum;
4098         xvfsp.vfc_refcount = vfsp->vfc_refcount;
4099         xvfsp.vfc_flags = vfsp->vfc_flags;
4100         /*
4101          * These are unused in userland, we keep them
4102          * to not break binary compatibility.
4103          */
4104         xvfsp.vfc_vfsops = NULL;
4105         xvfsp.vfc_next = NULL;
4106         return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
4107 }
4108
4109 #ifdef COMPAT_FREEBSD32
4110 struct xvfsconf32 {
4111         uint32_t        vfc_vfsops;
4112         char            vfc_name[MFSNAMELEN];
4113         int32_t         vfc_typenum;
4114         int32_t         vfc_refcount;
4115         int32_t         vfc_flags;
4116         uint32_t        vfc_next;
4117 };
4118
4119 static int
4120 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
4121 {
4122         struct xvfsconf32 xvfsp;
4123
4124         bzero(&xvfsp, sizeof(xvfsp));
4125         strcpy(xvfsp.vfc_name, vfsp->vfc_name);
4126         xvfsp.vfc_typenum = vfsp->vfc_typenum;
4127         xvfsp.vfc_refcount = vfsp->vfc_refcount;
4128         xvfsp.vfc_flags = vfsp->vfc_flags;
4129         return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
4130 }
4131 #endif
4132
4133 /*
4134  * Top level filesystem related information gathering.
4135  */
4136 static int
4137 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
4138 {
4139         struct vfsconf *vfsp;
4140         int error;
4141
4142         error = 0;
4143         vfsconf_slock();
4144         TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
4145 #ifdef COMPAT_FREEBSD32
4146                 if (req->flags & SCTL_MASK32)
4147                         error = vfsconf2x32(req, vfsp);
4148                 else
4149 #endif
4150                         error = vfsconf2x(req, vfsp);
4151                 if (error)
4152                         break;
4153         }
4154         vfsconf_sunlock();
4155         return (error);
4156 }
4157
4158 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
4159     CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
4160     "S,xvfsconf", "List of all configured filesystems");
4161
4162 #ifndef BURN_BRIDGES
4163 static int      sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
4164
4165 static int
4166 vfs_sysctl(SYSCTL_HANDLER_ARGS)
4167 {
4168         int *name = (int *)arg1 - 1;    /* XXX */
4169         u_int namelen = arg2 + 1;       /* XXX */
4170         struct vfsconf *vfsp;
4171
4172         log(LOG_WARNING, "userland calling deprecated sysctl, "
4173             "please rebuild world\n");
4174
4175 #if 1 || defined(COMPAT_PRELITE2)
4176         /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
4177         if (namelen == 1)
4178                 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
4179 #endif
4180
4181         switch (name[1]) {
4182         case VFS_MAXTYPENUM:
4183                 if (namelen != 2)
4184                         return (ENOTDIR);
4185                 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
4186         case VFS_CONF:
4187                 if (namelen != 3)
4188                         return (ENOTDIR);       /* overloaded */
4189                 vfsconf_slock();
4190                 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
4191                         if (vfsp->vfc_typenum == name[2])
4192                                 break;
4193                 }
4194                 vfsconf_sunlock();
4195                 if (vfsp == NULL)
4196                         return (EOPNOTSUPP);
4197 #ifdef COMPAT_FREEBSD32
4198                 if (req->flags & SCTL_MASK32)
4199                         return (vfsconf2x32(req, vfsp));
4200                 else
4201 #endif
4202                         return (vfsconf2x(req, vfsp));
4203         }
4204         return (EOPNOTSUPP);
4205 }
4206
4207 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
4208     CTLFLAG_MPSAFE, vfs_sysctl,
4209     "Generic filesystem");
4210
4211 #if 1 || defined(COMPAT_PRELITE2)
4212
4213 static int
4214 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
4215 {
4216         int error;
4217         struct vfsconf *vfsp;
4218         struct ovfsconf ovfs;
4219
4220         vfsconf_slock();
4221         TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
4222                 bzero(&ovfs, sizeof(ovfs));
4223                 ovfs.vfc_vfsops = vfsp->vfc_vfsops;     /* XXX used as flag */
4224                 strcpy(ovfs.vfc_name, vfsp->vfc_name);
4225                 ovfs.vfc_index = vfsp->vfc_typenum;
4226                 ovfs.vfc_refcount = vfsp->vfc_refcount;
4227                 ovfs.vfc_flags = vfsp->vfc_flags;
4228                 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
4229                 if (error != 0) {
4230                         vfsconf_sunlock();
4231                         return (error);
4232                 }
4233         }
4234         vfsconf_sunlock();
4235         return (0);
4236 }
4237
4238 #endif /* 1 || COMPAT_PRELITE2 */
4239 #endif /* !BURN_BRIDGES */
4240
4241 #define KINFO_VNODESLOP         10
4242 #ifdef notyet
4243 /*
4244  * Dump vnode list (via sysctl).
4245  */
4246 /* ARGSUSED */
4247 static int
4248 sysctl_vnode(SYSCTL_HANDLER_ARGS)
4249 {
4250         struct xvnode *xvn;
4251         struct mount *mp;
4252         struct vnode *vp;
4253         int error, len, n;
4254
4255         /*
4256          * Stale numvnodes access is not fatal here.
4257          */
4258         req->lock = 0;
4259         len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
4260         if (!req->oldptr)
4261                 /* Make an estimate */
4262                 return (SYSCTL_OUT(req, 0, len));
4263
4264         error = sysctl_wire_old_buffer(req, 0);
4265         if (error != 0)
4266                 return (error);
4267         xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
4268         n = 0;
4269         mtx_lock(&mountlist_mtx);
4270         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
4271                 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
4272                         continue;
4273                 MNT_ILOCK(mp);
4274                 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
4275                         if (n == len)
4276                                 break;
4277                         vref(vp);
4278                         xvn[n].xv_size = sizeof *xvn;
4279                         xvn[n].xv_vnode = vp;
4280                         xvn[n].xv_id = 0;       /* XXX compat */
4281 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
4282                         XV_COPY(usecount);
4283                         XV_COPY(writecount);
4284                         XV_COPY(holdcnt);
4285                         XV_COPY(mount);
4286                         XV_COPY(numoutput);
4287                         XV_COPY(type);
4288 #undef XV_COPY
4289                         xvn[n].xv_flag = vp->v_vflag;
4290
4291                         switch (vp->v_type) {
4292                         case VREG:
4293                         case VDIR:
4294                         case VLNK:
4295                                 break;
4296                         case VBLK:
4297                         case VCHR:
4298                                 if (vp->v_rdev == NULL) {
4299                                         vrele(vp);
4300                                         continue;
4301                                 }
4302                                 xvn[n].xv_dev = dev2udev(vp->v_rdev);
4303                                 break;
4304                         case VSOCK:
4305                                 xvn[n].xv_socket = vp->v_socket;
4306                                 break;
4307                         case VFIFO:
4308                                 xvn[n].xv_fifo = vp->v_fifoinfo;
4309                                 break;
4310                         case VNON:
4311                         case VBAD:
4312                         default:
4313                                 /* shouldn't happen? */
4314                                 vrele(vp);
4315                                 continue;
4316                         }
4317                         vrele(vp);
4318                         ++n;
4319                 }
4320                 MNT_IUNLOCK(mp);
4321                 mtx_lock(&mountlist_mtx);
4322                 vfs_unbusy(mp);
4323                 if (n == len)
4324                         break;
4325         }
4326         mtx_unlock(&mountlist_mtx);
4327
4328         error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
4329         free(xvn, M_TEMP);
4330         return (error);
4331 }
4332
4333 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
4334     CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
4335     "");
4336 #endif
4337
4338 static void
4339 unmount_or_warn(struct mount *mp)
4340 {
4341         int error;
4342
4343         error = dounmount(mp, MNT_FORCE, curthread);
4344         if (error != 0) {
4345                 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
4346                 if (error == EBUSY)
4347                         printf("BUSY)\n");
4348                 else
4349                         printf("%d)\n", error);
4350         }
4351 }
4352
4353 /*
4354  * Unmount all filesystems. The list is traversed in reverse order
4355  * of mounting to avoid dependencies.
4356  */
4357 void
4358 vfs_unmountall(void)
4359 {
4360         struct mount *mp, *tmp;
4361
4362         CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
4363
4364         /*
4365          * Since this only runs when rebooting, it is not interlocked.
4366          */
4367         TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
4368                 vfs_ref(mp);
4369
4370                 /*
4371                  * Forcibly unmounting "/dev" before "/" would prevent clean
4372                  * unmount of the latter.
4373                  */
4374                 if (mp == rootdevmp)
4375                         continue;
4376
4377                 unmount_or_warn(mp);
4378         }
4379
4380         if (rootdevmp != NULL)
4381                 unmount_or_warn(rootdevmp);
4382 }
4383
4384 /*
4385  * perform msync on all vnodes under a mount point
4386  * the mount point must be locked.
4387  */
4388 void
4389 vfs_msync(struct mount *mp, int flags)
4390 {
4391         struct vnode *vp, *mvp;
4392         struct vm_object *obj;
4393         struct thread *td;
4394         int lkflags, objflags;
4395
4396         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
4397
4398         if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0)
4399                 return;
4400
4401         td = curthread;
4402
4403         lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
4404         if (flags != MNT_WAIT) {
4405                 lkflags |= LK_NOWAIT;
4406                 objflags = OBJPC_NOSYNC;
4407         } else {
4408                 objflags = OBJPC_SYNC;
4409         }
4410
4411         MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
4412                 obj = vp->v_object;
4413                 if (obj == NULL || !vm_object_mightbedirty(obj)) {
4414                         VI_UNLOCK(vp);
4415                         continue;
4416                 }
4417                 if (vget(vp, lkflags, td) == 0) {
4418                         obj = vp->v_object;
4419                         if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) {
4420                                 VM_OBJECT_WLOCK(obj);
4421                                 vm_object_page_clean(obj, 0, 0, objflags);
4422                                 VM_OBJECT_WUNLOCK(obj);
4423                         }
4424                         vput(vp);
4425                 }
4426         }
4427 }
4428
4429 static void
4430 destroy_vpollinfo_free(struct vpollinfo *vi)
4431 {
4432
4433         knlist_destroy(&vi->vpi_selinfo.si_note);
4434         mtx_destroy(&vi->vpi_lock);
4435         uma_zfree(vnodepoll_zone, vi);
4436 }
4437
4438 static void
4439 destroy_vpollinfo(struct vpollinfo *vi)
4440 {
4441
4442         knlist_clear(&vi->vpi_selinfo.si_note, 1);
4443         seldrain(&vi->vpi_selinfo);
4444         destroy_vpollinfo_free(vi);
4445 }
4446
4447 /*
4448  * Initialize per-vnode helper structure to hold poll-related state.
4449  */
4450 void
4451 v_addpollinfo(struct vnode *vp)
4452 {
4453         struct vpollinfo *vi;
4454
4455         if (vp->v_pollinfo != NULL)
4456                 return;
4457         vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO);
4458         mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
4459         knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
4460             vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
4461         VI_LOCK(vp);
4462         if (vp->v_pollinfo != NULL) {
4463                 VI_UNLOCK(vp);
4464                 destroy_vpollinfo_free(vi);
4465                 return;
4466         }
4467         vp->v_pollinfo = vi;
4468         VI_UNLOCK(vp);
4469 }
4470
4471 /*
4472  * Record a process's interest in events which might happen to
4473  * a vnode.  Because poll uses the historic select-style interface
4474  * internally, this routine serves as both the ``check for any
4475  * pending events'' and the ``record my interest in future events''
4476  * functions.  (These are done together, while the lock is held,
4477  * to avoid race conditions.)
4478  */
4479 int
4480 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
4481 {
4482
4483         v_addpollinfo(vp);
4484         mtx_lock(&vp->v_pollinfo->vpi_lock);
4485         if (vp->v_pollinfo->vpi_revents & events) {
4486                 /*
4487                  * This leaves events we are not interested
4488                  * in available for the other process which
4489                  * which presumably had requested them
4490                  * (otherwise they would never have been
4491                  * recorded).
4492                  */
4493                 events &= vp->v_pollinfo->vpi_revents;
4494                 vp->v_pollinfo->vpi_revents &= ~events;
4495
4496                 mtx_unlock(&vp->v_pollinfo->vpi_lock);
4497                 return (events);
4498         }
4499         vp->v_pollinfo->vpi_events |= events;
4500         selrecord(td, &vp->v_pollinfo->vpi_selinfo);
4501         mtx_unlock(&vp->v_pollinfo->vpi_lock);
4502         return (0);
4503 }
4504
4505 /*
4506  * Routine to create and manage a filesystem syncer vnode.
4507  */
4508 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
4509 static int      sync_fsync(struct  vop_fsync_args *);
4510 static int      sync_inactive(struct  vop_inactive_args *);
4511 static int      sync_reclaim(struct  vop_reclaim_args *);
4512
4513 static struct vop_vector sync_vnodeops = {
4514         .vop_bypass =   VOP_EOPNOTSUPP,
4515         .vop_close =    sync_close,             /* close */
4516         .vop_fsync =    sync_fsync,             /* fsync */
4517         .vop_inactive = sync_inactive,  /* inactive */
4518         .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */
4519         .vop_reclaim =  sync_reclaim,   /* reclaim */
4520         .vop_lock1 =    vop_stdlock,    /* lock */
4521         .vop_unlock =   vop_stdunlock,  /* unlock */
4522         .vop_islocked = vop_stdislocked,        /* islocked */
4523 };
4524 VFS_VOP_VECTOR_REGISTER(sync_vnodeops);
4525
4526 /*
4527  * Create a new filesystem syncer vnode for the specified mount point.
4528  */
4529 void
4530 vfs_allocate_syncvnode(struct mount *mp)
4531 {
4532         struct vnode *vp;
4533         struct bufobj *bo;
4534         static long start, incr, next;
4535         int error;
4536
4537         /* Allocate a new vnode */
4538         error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
4539         if (error != 0)
4540                 panic("vfs_allocate_syncvnode: getnewvnode() failed");
4541         vp->v_type = VNON;
4542         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4543         vp->v_vflag |= VV_FORCEINSMQ;
4544         error = insmntque(vp, mp);
4545         if (error != 0)
4546                 panic("vfs_allocate_syncvnode: insmntque() failed");
4547         vp->v_vflag &= ~VV_FORCEINSMQ;
4548         VOP_UNLOCK(vp);
4549         /*
4550          * Place the vnode onto the syncer worklist. We attempt to
4551          * scatter them about on the list so that they will go off
4552          * at evenly distributed times even if all the filesystems
4553          * are mounted at once.
4554          */
4555         next += incr;
4556         if (next == 0 || next > syncer_maxdelay) {
4557                 start /= 2;
4558                 incr /= 2;
4559                 if (start == 0) {
4560                         start = syncer_maxdelay / 2;
4561                         incr = syncer_maxdelay;
4562                 }
4563                 next = start;
4564         }
4565         bo = &vp->v_bufobj;
4566         BO_LOCK(bo);
4567         vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
4568         /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
4569         mtx_lock(&sync_mtx);
4570         sync_vnode_count++;
4571         if (mp->mnt_syncer == NULL) {
4572                 mp->mnt_syncer = vp;
4573                 vp = NULL;
4574         }
4575         mtx_unlock(&sync_mtx);
4576         BO_UNLOCK(bo);
4577         if (vp != NULL) {
4578                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4579                 vgone(vp);
4580                 vput(vp);
4581         }
4582 }
4583
4584 void
4585 vfs_deallocate_syncvnode(struct mount *mp)
4586 {
4587         struct vnode *vp;
4588
4589         mtx_lock(&sync_mtx);
4590         vp = mp->mnt_syncer;
4591         if (vp != NULL)
4592                 mp->mnt_syncer = NULL;
4593         mtx_unlock(&sync_mtx);
4594         if (vp != NULL)
4595                 vrele(vp);
4596 }
4597
4598 /*
4599  * Do a lazy sync of the filesystem.
4600  */
4601 static int
4602 sync_fsync(struct vop_fsync_args *ap)
4603 {
4604         struct vnode *syncvp = ap->a_vp;
4605         struct mount *mp = syncvp->v_mount;
4606         int error, save;
4607         struct bufobj *bo;
4608
4609         /*
4610          * We only need to do something if this is a lazy evaluation.
4611          */
4612         if (ap->a_waitfor != MNT_LAZY)
4613                 return (0);
4614
4615         /*
4616          * Move ourselves to the back of the sync list.
4617          */
4618         bo = &syncvp->v_bufobj;
4619         BO_LOCK(bo);
4620         vn_syncer_add_to_worklist(bo, syncdelay);
4621         BO_UNLOCK(bo);
4622
4623         /*
4624          * Walk the list of vnodes pushing all that are dirty and
4625          * not already on the sync list.
4626          */
4627         if (vfs_busy(mp, MBF_NOWAIT) != 0)
4628                 return (0);
4629         if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
4630                 vfs_unbusy(mp);
4631                 return (0);
4632         }
4633         save = curthread_pflags_set(TDP_SYNCIO);
4634         /*
4635          * The filesystem at hand may be idle with free vnodes stored in the
4636          * batch.  Return them instead of letting them stay there indefinitely.
4637          */
4638         vnlru_return_batch(mp);
4639         vfs_msync(mp, MNT_NOWAIT);
4640         error = VFS_SYNC(mp, MNT_LAZY);
4641         curthread_pflags_restore(save);
4642         vn_finished_write(mp);
4643         vfs_unbusy(mp);
4644         return (error);
4645 }
4646
4647 /*
4648  * The syncer vnode is no referenced.
4649  */
4650 static int
4651 sync_inactive(struct vop_inactive_args *ap)
4652 {
4653
4654         vgone(ap->a_vp);
4655         return (0);
4656 }
4657
4658 /*
4659  * The syncer vnode is no longer needed and is being decommissioned.
4660  *
4661  * Modifications to the worklist must be protected by sync_mtx.
4662  */
4663 static int
4664 sync_reclaim(struct vop_reclaim_args *ap)
4665 {
4666         struct vnode *vp = ap->a_vp;
4667         struct bufobj *bo;
4668
4669         bo = &vp->v_bufobj;
4670         BO_LOCK(bo);
4671         mtx_lock(&sync_mtx);
4672         if (vp->v_mount->mnt_syncer == vp)
4673                 vp->v_mount->mnt_syncer = NULL;
4674         if (bo->bo_flag & BO_ONWORKLST) {
4675                 LIST_REMOVE(bo, bo_synclist);
4676                 syncer_worklist_len--;
4677                 sync_vnode_count--;
4678                 bo->bo_flag &= ~BO_ONWORKLST;
4679         }
4680         mtx_unlock(&sync_mtx);
4681         BO_UNLOCK(bo);
4682
4683         return (0);
4684 }
4685
4686 int
4687 vn_need_pageq_flush(struct vnode *vp)
4688 {
4689         struct vm_object *obj;
4690         int need;
4691
4692         MPASS(mtx_owned(VI_MTX(vp)));
4693         need = 0;
4694         if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
4695             vm_object_mightbedirty(obj))
4696                 need = 1;
4697         return (need);
4698 }
4699
4700 /*
4701  * Check if vnode represents a disk device
4702  */
4703 int
4704 vn_isdisk(struct vnode *vp, int *errp)
4705 {
4706         int error;
4707
4708         if (vp->v_type != VCHR) {
4709                 error = ENOTBLK;
4710                 goto out;
4711         }
4712         error = 0;
4713         dev_lock();
4714         if (vp->v_rdev == NULL)
4715                 error = ENXIO;
4716         else if (vp->v_rdev->si_devsw == NULL)
4717                 error = ENXIO;
4718         else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
4719                 error = ENOTBLK;
4720         dev_unlock();
4721 out:
4722         if (errp != NULL)
4723                 *errp = error;
4724         return (error == 0);
4725 }
4726
4727 /*
4728  * Common filesystem object access control check routine.  Accepts a
4729  * vnode's type, "mode", uid and gid, requested access mode, credentials,
4730  * and optional call-by-reference privused argument allowing vaccess()
4731  * to indicate to the caller whether privilege was used to satisfy the
4732  * request (obsoleted).  Returns 0 on success, or an errno on failure.
4733  */
4734 int
4735 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
4736     accmode_t accmode, struct ucred *cred, int *privused)
4737 {
4738         accmode_t dac_granted;
4739         accmode_t priv_granted;
4740
4741         KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
4742             ("invalid bit in accmode"));
4743         KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
4744             ("VAPPEND without VWRITE"));
4745
4746         /*
4747          * Look for a normal, non-privileged way to access the file/directory
4748          * as requested.  If it exists, go with that.
4749          */
4750
4751         if (privused != NULL)
4752                 *privused = 0;
4753
4754         dac_granted = 0;
4755
4756         /* Check the owner. */
4757         if (cred->cr_uid == file_uid) {
4758                 dac_granted |= VADMIN;
4759                 if (file_mode & S_IXUSR)
4760                         dac_granted |= VEXEC;
4761                 if (file_mode & S_IRUSR)
4762                         dac_granted |= VREAD;
4763                 if (file_mode & S_IWUSR)
4764                         dac_granted |= (VWRITE | VAPPEND);
4765
4766                 if ((accmode & dac_granted) == accmode)
4767                         return (0);
4768
4769                 goto privcheck;
4770         }
4771
4772         /* Otherwise, check the groups (first match) */
4773         if (groupmember(file_gid, cred)) {
4774                 if (file_mode & S_IXGRP)
4775                         dac_granted |= VEXEC;
4776                 if (file_mode & S_IRGRP)
4777                         dac_granted |= VREAD;
4778                 if (file_mode & S_IWGRP)
4779                         dac_granted |= (VWRITE | VAPPEND);
4780
4781                 if ((accmode & dac_granted) == accmode)
4782                         return (0);
4783
4784                 goto privcheck;
4785         }
4786
4787         /* Otherwise, check everyone else. */
4788         if (file_mode & S_IXOTH)
4789                 dac_granted |= VEXEC;
4790         if (file_mode & S_IROTH)
4791                 dac_granted |= VREAD;
4792         if (file_mode & S_IWOTH)
4793                 dac_granted |= (VWRITE | VAPPEND);
4794         if ((accmode & dac_granted) == accmode)
4795                 return (0);
4796
4797 privcheck:
4798         /*
4799          * Build a privilege mask to determine if the set of privileges
4800          * satisfies the requirements when combined with the granted mask
4801          * from above.  For each privilege, if the privilege is required,
4802          * bitwise or the request type onto the priv_granted mask.
4803          */
4804         priv_granted = 0;
4805
4806         if (type == VDIR) {
4807                 /*
4808                  * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
4809                  * requests, instead of PRIV_VFS_EXEC.
4810                  */
4811                 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
4812                     !priv_check_cred(cred, PRIV_VFS_LOOKUP))
4813                         priv_granted |= VEXEC;
4814         } else {
4815                 /*
4816                  * Ensure that at least one execute bit is on. Otherwise,
4817                  * a privileged user will always succeed, and we don't want
4818                  * this to happen unless the file really is executable.
4819                  */
4820                 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
4821                     (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
4822                     !priv_check_cred(cred, PRIV_VFS_EXEC))
4823                         priv_granted |= VEXEC;
4824         }
4825
4826         if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
4827             !priv_check_cred(cred, PRIV_VFS_READ))
4828                 priv_granted |= VREAD;
4829
4830         if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
4831             !priv_check_cred(cred, PRIV_VFS_WRITE))
4832                 priv_granted |= (VWRITE | VAPPEND);
4833
4834         if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
4835             !priv_check_cred(cred, PRIV_VFS_ADMIN))
4836                 priv_granted |= VADMIN;
4837
4838         if ((accmode & (priv_granted | dac_granted)) == accmode) {
4839                 /* XXX audit: privilege used */
4840                 if (privused != NULL)
4841                         *privused = 1;
4842                 return (0);
4843         }
4844
4845         return ((accmode & VADMIN) ? EPERM : EACCES);
4846 }
4847
4848 /*
4849  * Credential check based on process requesting service, and per-attribute
4850  * permissions.
4851  */
4852 int
4853 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
4854     struct thread *td, accmode_t accmode)
4855 {
4856
4857         /*
4858          * Kernel-invoked always succeeds.
4859          */
4860         if (cred == NOCRED)
4861                 return (0);
4862
4863         /*
4864          * Do not allow privileged processes in jail to directly manipulate
4865          * system attributes.
4866          */
4867         switch (attrnamespace) {
4868         case EXTATTR_NAMESPACE_SYSTEM:
4869                 /* Potentially should be: return (EPERM); */
4870                 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM));
4871         case EXTATTR_NAMESPACE_USER:
4872                 return (VOP_ACCESS(vp, accmode, cred, td));
4873         default:
4874                 return (EPERM);
4875         }
4876 }
4877
4878 #ifdef DEBUG_VFS_LOCKS
/*
 * This only exists to suppress warnings from unlocked specfs accesses.  It is
 * no longer ok to have an unlocked VFS.
 *
 * IGNORE_LOCK(vp) evaluates to true when the vnode lock assertions should
 * be skipped for vp: panicstr is set, vp is NULL, or vp is of type VCHR
 * or VBAD.
 */
#define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||            \
        (vp)->v_type == VCHR || (vp)->v_type == VBAD)

/*
 * Read/write integer knobs registered under the _debug sysctl node.  They
 * gate the individual lock-violation checks and reports in this file and
 * all default to 1 (enabled).
 */
int vfs_badlock_ddb = 1;        /* Drop into debugger on violation. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
    "Drop into debugger on lock violation");

int vfs_badlock_mutex = 1;      /* Check for interlock across VOPs. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
    0, "Check for interlock across VOPs");

int vfs_badlock_print = 1;      /* Print lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
    0, "Print lock violations");

int vfs_badlock_vnode = 1;      /* Print vnode details on lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode,
    0, "Print vnode details on lock violations");

/* The backtrace knob only exists in kernels built with KDB. */
#ifdef KDB
int vfs_badlock_backtrace = 1;  /* Print backtrace at lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
    &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
#endif
4907
4908 static void
4909 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
4910 {
4911
4912 #ifdef KDB
4913         if (vfs_badlock_backtrace)
4914                 kdb_backtrace();
4915 #endif
4916         if (vfs_badlock_vnode)
4917                 vn_printf(vp, "vnode ");
4918         if (vfs_badlock_print)
4919                 printf("%s: %p %s\n", str, (void *)vp, msg);
4920         if (vfs_badlock_ddb)
4921                 kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4922 }
4923
4924 void
4925 assert_vi_locked(struct vnode *vp, const char *str)
4926 {
4927
4928         if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
4929                 vfs_badlock("interlock is not locked but should be", str, vp);
4930 }
4931
4932 void
4933 assert_vi_unlocked(struct vnode *vp, const char *str)
4934 {
4935
4936         if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
4937                 vfs_badlock("interlock is locked but should not be", str, vp);
4938 }
4939
4940 void
4941 assert_vop_locked(struct vnode *vp, const char *str)
4942 {
4943         int locked;
4944
4945         if (!IGNORE_LOCK(vp)) {
4946                 locked = VOP_ISLOCKED(vp);
4947                 if (locked == 0 || locked == LK_EXCLOTHER)
4948                         vfs_badlock("is not locked but should be", str, vp);
4949         }
4950 }
4951
4952 void
4953 assert_vop_unlocked(struct vnode *vp, const char *str)
4954 {
4955
4956         if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
4957                 vfs_badlock("is locked but should not be", str, vp);
4958 }
4959
4960 void
4961 assert_vop_elocked(struct vnode *vp, const char *str)
4962 {
4963
4964         if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
4965                 vfs_badlock("is not exclusive locked but should be", str, vp);
4966 }
4967 #endif /* DEBUG_VFS_LOCKS */
4968
4969 void
4970 vop_rename_fail(struct vop_rename_args *ap)
4971 {
4972
4973         if (ap->a_tvp != NULL)
4974                 vput(ap->a_tvp);
4975         if (ap->a_tdvp == ap->a_tvp)
4976                 vrele(ap->a_tdvp);
4977         else
4978                 vput(ap->a_tdvp);
4979         vrele(ap->a_fdvp);
4980         vrele(ap->a_fvp);
4981 }
4982
4983 void
4984 vop_rename_pre(void *ap)
4985 {
4986         struct vop_rename_args *a = ap;
4987
4988 #ifdef DEBUG_VFS_LOCKS
4989         if (a->a_tvp)
4990                 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
4991         ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
4992         ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
4993         ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
4994
4995         /* Check the source (from). */
4996         if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
4997             (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
4998                 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
4999         if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
5000                 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
5001
5002         /* Check the target. */
5003         if (a->a_tvp)
5004                 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
5005         ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
5006 #endif
5007         if (a->a_tdvp != a->a_fdvp)
5008                 vhold(a->a_fdvp);
5009         if (a->a_tvp != a->a_fvp)
5010                 vhold(a->a_fvp);
5011         vhold(a->a_tdvp);
5012         if (a->a_tvp)
5013                 vhold(a->a_tvp);
5014 }
5015
5016 #ifdef DEBUG_VFS_LOCKS
5017 void
5018 vop_strategy_pre(void *ap)
5019 {
5020         struct vop_strategy_args *a;
5021         struct buf *bp;
5022
5023         a = ap;
5024         bp = a->a_bp;
5025
5026         /*
5027          * Cluster ops lock their component buffers but not the IO container.
5028          */
5029         if ((bp->b_flags & B_CLUSTER) != 0)
5030                 return;
5031
5032         if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
5033                 if (vfs_badlock_print)
5034                         printf(
5035                             "VOP_STRATEGY: bp is not locked but should be\n");
5036                 if (vfs_badlock_ddb)
5037                         kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
5038         }
5039 }
5040
5041 void
5042 vop_lock_pre(void *ap)
5043 {
5044         struct vop_lock1_args *a = ap;
5045
5046         if ((a->a_flags & LK_INTERLOCK) == 0)
5047                 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
5048         else
5049                 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
5050 }
5051
5052 void
5053 vop_lock_post(void *ap, int rc)
5054 {
5055         struct vop_lock1_args *a = ap;
5056
5057         ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
5058         if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
5059                 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
5060 }
5061
5062 void
5063 vop_unlock_pre(void *ap)
5064 {
5065         struct vop_unlock_args *a = ap;
5066
5067         ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
5068 }
5069
/*
 * Post-operation hook for unlock; performs no checks.
 */
void
vop_unlock_post(void *ap, int rc)
{

        /* Intentionally empty. */
}
5075
5076 void
5077 vop_need_inactive_pre(void *ap)
5078 {
5079         struct vop_need_inactive_args *a = ap;
5080
5081         ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
5082 }
5083
5084 void
5085 vop_need_inactive_post(void *ap, int rc)
5086 {
5087         struct vop_need_inactive_args *a = ap;
5088
5089         ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
5090 }
5091 #endif
5092
5093 void
5094 vop_create_post(void *ap, int rc)
5095 {
5096         struct vop_create_args *a = ap;
5097
5098         if (!rc)
5099                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
5100 }
5101
5102 void
5103 vop_deleteextattr_post(void *ap, int rc)
5104 {
5105         struct vop_deleteextattr_args *a = ap;
5106
5107         if (!rc)
5108                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
5109 }
5110
5111 void
5112 vop_link_post(void *ap, int rc)
5113 {
5114         struct vop_link_args *a = ap;
5115
5116         if (!rc) {
5117                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
5118                 VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
5119         }
5120 }
5121
5122 void
5123 vop_mkdir_post(void *ap, int rc)
5124 {
5125         struct vop_mkdir_args *a = ap;
5126
5127         if (!rc)
5128                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
5129 }
5130
5131 void
5132 vop_mknod_post(void *ap, int rc)
5133 {
5134         struct vop_mknod_args *a = ap;
5135
5136         if (!rc)
5137                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
5138 }
5139
5140 void
5141 vop_reclaim_post(void *ap, int rc)
5142 {
5143         struct vop_reclaim_args *a = ap;
5144
5145         if (!rc)
5146                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE);
5147 }
5148
5149 void
5150 vop_remove_post(void *ap, int rc)
5151 {
5152         struct vop_remove_args *a = ap;
5153
5154         if (!rc) {
5155                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
5156                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
5157         }
5158 }
5159
5160 void
5161 vop_rename_post(void *ap, int rc)
5162 {
5163         struct vop_rename_args *a = ap;
5164         long hint;
5165
5166         if (!rc) {
5167                 hint = NOTE_WRITE;
5168                 if (a->a_fdvp == a->a_tdvp) {
5169                         if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
5170                                 hint |= NOTE_LINK;
5171                         VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
5172                         VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
5173                 } else {
5174                         hint |= NOTE_EXTEND;
5175                         if (a->a_fvp->v_type == VDIR)
5176                                 hint |= NOTE_LINK;
5177                         VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
5178
5179                         if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
5180                             a->a_tvp->v_type == VDIR)
5181                                 hint &= ~NOTE_LINK;
5182                         VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
5183                 }
5184
5185                 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
5186                 if (a->a_tvp)
5187                         VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
5188         }
5189         if (a->a_tdvp != a->a_fdvp)
5190                 vdrop(a->a_fdvp);
5191         if (a->a_tvp != a->a_fvp)
5192                 vdrop(a->a_fvp);
5193         vdrop(a->a_tdvp);
5194         if (a->a_tvp)
5195                 vdrop(a->a_tvp);
5196 }
5197
5198 void
5199 vop_rmdir_post(void *ap, int rc)
5200 {
5201         struct vop_rmdir_args *a = ap;
5202
5203         if (!rc) {
5204                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
5205                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
5206         }
5207 }
5208
5209 void
5210 vop_setattr_post(void *ap, int rc)
5211 {
5212         struct vop_setattr_args *a = ap;
5213
5214         if (!rc)
5215                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
5216 }
5217
5218 void
5219 vop_setextattr_post(void *ap, int rc)
5220 {
5221         struct vop_setextattr_args *a = ap;
5222
5223         if (!rc)
5224                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
5225 }
5226
5227 void
5228 vop_symlink_post(void *ap, int rc)
5229 {
5230         struct vop_symlink_args *a = ap;
5231
5232         if (!rc)
5233                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
5234 }
5235
5236 void
5237 vop_open_post(void *ap, int rc)
5238 {
5239         struct vop_open_args *a = ap;
5240
5241         if (!rc)
5242                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
5243 }
5244
5245 void
5246 vop_close_post(void *ap, int rc)
5247 {
5248         struct vop_close_args *a = ap;
5249
5250         if (!rc && (a->a_cred != NOCRED || /* filter out revokes */
5251             !VN_IS_DOOMED(a->a_vp))) {
5252                 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
5253                     NOTE_CLOSE_WRITE : NOTE_CLOSE);
5254         }
5255 }
5256
5257 void
5258 vop_read_post(void *ap, int rc)
5259 {
5260         struct vop_read_args *a = ap;
5261
5262         if (!rc)
5263                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
5264 }
5265
5266 void
5267 vop_readdir_post(void *ap, int rc)
5268 {
5269         struct vop_readdir_args *a = ap;
5270
5271         if (!rc)
5272                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
5273 }
5274
/*
 * knote list for filesystem events; vfs_event_signal() posts to it and
 * the fs_filtops attach/detach routines add and remove knotes.
 */
static struct knlist fs_knlist;

/*
 * SYSINIT callback that initialises fs_knlist.  knlist_init_mtx() is
 * passed a NULL lock argument; 'arg' is unused.
 */
static void
vfs_event_init(void *arg)
{
        knlist_init_mtx(&fs_knlist, NULL);
}
/* XXX - correct order? */
SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
5284
/*
 * Post 'event' as the hint to every knote on fs_knlist.  Neither 'fsid'
 * nor 'data' is consulted by this implementation.
 */
void
vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
{

        KNOTE_UNLOCKED(&fs_knlist, event);
}
5291
/* Filter callbacks for filesystem-event knotes, defined below. */
static int      filt_fsattach(struct knote *kn);
static void     filt_fsdetach(struct knote *kn);
static int      filt_fsevent(struct knote *kn, long hint);

/*
 * Filter operations for filesystem events.  f_isfd is 0: this filter is
 * not flagged as file-descriptor based.
 */
struct filterops fs_filtops = {
        .f_isfd = 0,
        .f_attach = filt_fsattach,
        .f_detach = filt_fsdetach,
        .f_event = filt_fsevent
};
5302
/*
 * Attach a knote to the filesystem event list.  EV_CLEAR is forced on
 * the knote before it is added to fs_knlist.  Always returns 0.
 */
static int
filt_fsattach(struct knote *kn)
{

        kn->kn_flags |= EV_CLEAR;
        knlist_add(&fs_knlist, kn, 0);
        return (0);
}
5311
/*
 * Detach a knote from the filesystem event list (fs_knlist).
 */
static void
filt_fsdetach(struct knote *kn)
{

        knlist_remove(&fs_knlist, kn, 0);
}
5318
5319 static int
5320 filt_fsevent(struct knote *kn, long hint)
5321 {
5322
5323         kn->kn_fflags |= hint;
5324         return (kn->kn_fflags != 0);
5325 }
5326
5327 static int
5328 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
5329 {
5330         struct vfsidctl vc;
5331         int error;
5332         struct mount *mp;
5333
5334         error = SYSCTL_IN(req, &vc, sizeof(vc));
5335         if (error)
5336                 return (error);
5337         if (vc.vc_vers != VFS_CTL_VERS1)
5338                 return (EINVAL);
5339         mp = vfs_getvfs(&vc.vc_fsid);
5340         if (mp == NULL)
5341                 return (ENOENT);
5342         /* ensure that a specific sysctl goes to the right filesystem. */
5343         if (strcmp(vc.vc_fstypename, "*") != 0 &&
5344             strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
5345                 vfs_rel(mp);
5346                 return (EINVAL);
5347         }
5348         VCTLTOREQ(&vc, req);
5349         error = VFS_SYSCTL(mp, vc.vc_op, req);
5350         vfs_rel(mp);
5351         return (error);
5352 }
5353
5354 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
5355     NULL, 0, sysctl_vfs_ctl, "",
5356     "Sysctl by fsid");
5357
5358 /*
5359  * Function to initialize a va_filerev field sensibly.
5360  * XXX: Wouldn't a random number make a lot more sense ??
5361  */
5362 u_quad_t
5363 init_va_filerev(void)
5364 {
5365         struct bintime bt;
5366
5367         getbinuptime(&bt);
5368         return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
5369 }
5370
/* Filter callbacks for vnode knotes, defined below. */
static int      filt_vfsread(struct knote *kn, long hint);
static int      filt_vfswrite(struct knote *kn, long hint);
static int      filt_vfsvnode(struct knote *kn, long hint);
static void     filt_vfsdetach(struct knote *kn);
/*
 * Filter operation tables installed by vfs_kqfilter() for EVFILT_READ,
 * EVFILT_WRITE and EVFILT_VNODE respectively.  All three set f_isfd to 1,
 * share filt_vfsdetach() and provide no f_attach routine.
 */
static struct filterops vfsread_filtops = {
        .f_isfd = 1,
        .f_detach = filt_vfsdetach,
        .f_event = filt_vfsread
};
static struct filterops vfswrite_filtops = {
        .f_isfd = 1,
        .f_detach = filt_vfsdetach,
        .f_event = filt_vfswrite
};
static struct filterops vfsvnode_filtops = {
        .f_isfd = 1,
        .f_detach = filt_vfsdetach,
        .f_event = filt_vfsvnode
};
5390
5391 static void
5392 vfs_knllock(void *arg)
5393 {
5394         struct vnode *vp = arg;
5395
5396         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
5397 }
5398
/*
 * knlist unlock callback: 'arg' is the vnode, which is unlocked.
 */
static void
vfs_knlunlock(void *arg)
{

        VOP_UNLOCK((struct vnode *)arg);
}
5406
/*
 * knlist callback asserting that the vnode passed as 'arg' is locked.
 * It is a no-op unless the kernel is built with DEBUG_VFS_LOCKS.
 */
static void
vfs_knl_assert_locked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
        ASSERT_VOP_LOCKED((struct vnode *)arg, "vfs_knl_assert_locked");
#endif
}
5416
/*
 * knlist callback asserting that the vnode passed as 'arg' is unlocked.
 * It is a no-op unless the kernel is built with DEBUG_VFS_LOCKS.
 */
static void
vfs_knl_assert_unlocked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
        ASSERT_VOP_UNLOCKED((struct vnode *)arg, "vfs_knl_assert_unlocked");
#endif
}
5426
5427 int
5428 vfs_kqfilter(struct vop_kqfilter_args *ap)
5429 {
5430         struct vnode *vp = ap->a_vp;
5431         struct knote *kn = ap->a_kn;
5432         struct knlist *knl;
5433
5434         switch (kn->kn_filter) {
5435         case EVFILT_READ:
5436                 kn->kn_fop = &vfsread_filtops;
5437                 break;
5438         case EVFILT_WRITE:
5439                 kn->kn_fop = &vfswrite_filtops;
5440                 break;
5441         case EVFILT_VNODE:
5442                 kn->kn_fop = &vfsvnode_filtops;
5443                 break;
5444         default:
5445                 return (EINVAL);
5446         }
5447
5448         kn->kn_hook = (caddr_t)vp;
5449
5450         v_addpollinfo(vp);
5451         if (vp->v_pollinfo == NULL)
5452                 return (ENOMEM);
5453         knl = &vp->v_pollinfo->vpi_selinfo.si_note;
5454         vhold(vp);
5455         knlist_add(knl, kn, 0);
5456
5457         return (0);
5458 }
5459
5460 /*
5461  * Detach knote from vnode
5462  */
5463 static void
5464 filt_vfsdetach(struct knote *kn)
5465 {
5466         struct vnode *vp = (struct vnode *)kn->kn_hook;
5467
5468         KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
5469         knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
5470         vdrop(vp);
5471 }
5472
5473 /*ARGSUSED*/
5474 static int
5475 filt_vfsread(struct knote *kn, long hint)
5476 {
5477         struct vnode *vp = (struct vnode *)kn->kn_hook;
5478         struct vattr va;
5479         int res;
5480
5481         /*
5482          * filesystem is gone, so set the EOF flag and schedule
5483          * the knote for deletion.
5484          */
5485         if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
5486                 VI_LOCK(vp);
5487                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
5488                 VI_UNLOCK(vp);
5489                 return (1);
5490         }
5491
5492         if (VOP_GETATTR(vp, &va, curthread->td_ucred))
5493                 return (0);
5494
5495         VI_LOCK(vp);
5496         kn->kn_data = va.va_size - kn->kn_fp->f_offset;
5497         res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
5498         VI_UNLOCK(vp);
5499         return (res);
5500 }
5501
5502 /*ARGSUSED*/
5503 static int
5504 filt_vfswrite(struct knote *kn, long hint)
5505 {
5506         struct vnode *vp = (struct vnode *)kn->kn_hook;
5507
5508         VI_LOCK(vp);
5509
5510         /*
5511          * filesystem is gone, so set the EOF flag and schedule
5512          * the knote for deletion.
5513          */
5514         if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
5515                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
5516
5517         kn->kn_data = 0;
5518         VI_UNLOCK(vp);
5519         return (1);
5520 }
5521
5522 static int
5523 filt_vfsvnode(struct knote *kn, long hint)
5524 {
5525         struct vnode *vp = (struct vnode *)kn->kn_hook;
5526         int res;
5527
5528         VI_LOCK(vp);
5529         if (kn->kn_sfflags & hint)
5530                 kn->kn_fflags |= hint;
5531         if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
5532                 kn->kn_flags |= EV_EOF;
5533                 VI_UNLOCK(vp);
5534                 return (1);
5535         }
5536         res = (kn->kn_fflags != 0);
5537         VI_UNLOCK(vp);
5538         return (res);
5539 }
5540
5541 /*
5542  * Returns whether the directory is empty or not.
5543  * If it is empty, the return value is 0; otherwise
5544  * the return value is an error value (which may
5545  * be ENOTEMPTY).
5546  */
5547 int
5548 vfs_emptydir(struct vnode *vp)
5549 {
5550         struct uio uio;
5551         struct iovec iov;
5552         struct dirent *dirent, *dp, *endp;
5553         int error, eof;
5554
5555         error = 0;
5556         eof = 0;
5557
5558         ASSERT_VOP_LOCKED(vp, "vfs_emptydir");
5559
5560         dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK);
5561         iov.iov_base = dirent;
5562         iov.iov_len = sizeof(struct dirent);
5563
5564         uio.uio_iov = &iov;
5565         uio.uio_iovcnt = 1;
5566         uio.uio_offset = 0;
5567         uio.uio_resid = sizeof(struct dirent);
5568         uio.uio_segflg = UIO_SYSSPACE;
5569         uio.uio_rw = UIO_READ;
5570         uio.uio_td = curthread;
5571
5572         while (eof == 0 && error == 0) {
5573                 error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof,
5574                     NULL, NULL);
5575                 if (error != 0)
5576                         break;
5577                 endp = (void *)((uint8_t *)dirent +
5578                     sizeof(struct dirent) - uio.uio_resid);
5579                 for (dp = dirent; dp < endp;
5580                      dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) {
5581                         if (dp->d_type == DT_WHT)
5582                                 continue;
5583                         if (dp->d_namlen == 0)
5584                                 continue;
5585                         if (dp->d_type != DT_DIR &&
5586                             dp->d_type != DT_UNKNOWN) {
5587                                 error = ENOTEMPTY;
5588                                 break;
5589                         }
5590                         if (dp->d_namlen > 2) {
5591                                 error = ENOTEMPTY;
5592                                 break;
5593                         }
5594                         if (dp->d_namlen == 1 &&
5595                             dp->d_name[0] != '.') {
5596                                 error = ENOTEMPTY;
5597                                 break;
5598                         }
5599                         if (dp->d_namlen == 2 &&
5600                             dp->d_name[1] != '.') {
5601                                 error = ENOTEMPTY;
5602                                 break;
5603                         }
5604                         uio.uio_resid = sizeof(struct dirent);
5605                 }
5606         }
5607         free(dirent, M_TEMP);
5608         return (error);
5609 }
5610
5611 int
5612 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
5613 {
5614         int error;
5615
5616         if (dp->d_reclen > ap->a_uio->uio_resid)
5617                 return (ENAMETOOLONG);
5618         error = uiomove(dp, dp->d_reclen, ap->a_uio);
5619         if (error) {
5620                 if (ap->a_ncookies != NULL) {
5621                         if (ap->a_cookies != NULL)
5622                                 free(ap->a_cookies, M_TEMP);
5623                         ap->a_cookies = NULL;
5624                         *ap->a_ncookies = 0;
5625                 }
5626                 return (error);
5627         }
5628         if (ap->a_ncookies == NULL)
5629                 return (0);
5630
5631         KASSERT(ap->a_cookies,
5632             ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
5633
5634         *ap->a_cookies = realloc(*ap->a_cookies,
5635             (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
5636         (*ap->a_cookies)[*ap->a_ncookies] = off;
5637         *ap->a_ncookies += 1;
5638         return (0);
5639 }
5640
5641 /*
5642  * Mark for update the access time of the file if the filesystem
5643  * supports VOP_MARKATIME.  This functionality is used by execve and
5644  * mmap, so we want to avoid the I/O implied by directly setting
5645  * va_atime for the sake of efficiency.
5646  */
5647 void
5648 vfs_mark_atime(struct vnode *vp, struct ucred *cred)
5649 {
5650         struct mount *mp;
5651
5652         mp = vp->v_mount;
5653         ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
5654         if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
5655                 (void)VOP_MARKATIME(vp);
5656 }
5657
5658 /*
5659  * The purpose of this routine is to remove granularity from accmode_t,
5660  * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
5661  * VADMIN and VAPPEND.
5662  *
5663  * If it returns 0, the caller is supposed to continue with the usual
5664  * access checks using 'accmode' as modified by this routine.  If it
5665  * returns nonzero value, the caller is supposed to return that value
5666  * as errno.
5667  *
5668  * Note that after this routine runs, accmode may be zero.
5669  */
5670 int
5671 vfs_unixify_accmode(accmode_t *accmode)
5672 {
5673         /*
5674          * There is no way to specify explicit "deny" rule using
5675          * file mode or POSIX.1e ACLs.
5676          */
5677         if (*accmode & VEXPLICIT_DENY) {
5678                 *accmode = 0;
5679                 return (0);
5680         }
5681
5682         /*
5683          * None of these can be translated into usual access bits.
5684          * Also, the common case for NFSv4 ACLs is to not contain
5685          * either of these bits. Caller should check for VWRITE
5686          * on the containing directory instead.
5687          */
5688         if (*accmode & (VDELETE_CHILD | VDELETE))
5689                 return (EPERM);
5690
5691         if (*accmode & VADMIN_PERMS) {
5692                 *accmode &= ~VADMIN_PERMS;
5693                 *accmode |= VADMIN;
5694         }
5695
5696         /*
5697          * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
5698          * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
5699          */
5700         *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
5701
5702         return (0);
5703 }
5704
/*
 * Clear out a doomed vnode (if any) and replace it with a new one as long
 * as the fs is not being unmounted. Return the root vnode to the caller.
 *
 * On success 0 is returned and *vpp holds the root vnode, locked according
 * to 'flags' by vn_lock() or as returned by VFS_CACHEDROOT().  Otherwise
 * the error from VFS_CACHEDROOT() is returned.
 */
static int __noinline
vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp)
{
        struct vnode *vp;
        int error;

restart:
        /* Unlocked peek; the pointer is re-read under the mount interlock. */
        if (mp->mnt_rootvnode != NULL) {
                MNT_ILOCK(mp);
                vp = mp->mnt_rootvnode;
                if (vp != NULL) {
                        if (!VN_IS_DOOMED(vp)) {
                                /*
                                 * The cached vnode is usable: reference it,
                                 * drop the interlock and lock the vnode.
                                 * If locking fails, drop the reference and
                                 * start over.
                                 */
                                vrefact(vp);
                                MNT_IUNLOCK(mp);
                                error = vn_lock(vp, flags);
                                if (error == 0) {
                                        *vpp = vp;
                                        return (0);
                                }
                                vrele(vp);
                                goto restart;
                        }
                        /*
                         * Clear the old one.
                         */
                        mp->mnt_rootvnode = NULL;
                }
                MNT_IUNLOCK(mp);
                if (vp != NULL) {
                        /*
                         * Paired with a fence in vfs_op_thread_exit().
                         */
                        atomic_thread_fence_acq();
                        /*
                         * Wait on the op barrier before dropping the
                         * reference that the cache held on the doomed
                         * vnode.
                         */
                        vfs_op_barrier_wait(mp);
                        vrele(vp);
                }
        }
        /* Ask the filesystem for its root vnode. */
        error = VFS_CACHEDROOT(mp, flags, vpp);
        if (error != 0)
                return (error);
        /*
         * Only populate the cache while mnt_vfs_ops is 0; the value is
         * checked again once the mount interlock is held.
         */
        if (mp->mnt_vfs_ops == 0) {
                MNT_ILOCK(mp);
                if (mp->mnt_vfs_ops != 0) {
                        MNT_IUNLOCK(mp);
                        return (0);
                }
                if (mp->mnt_rootvnode == NULL) {
                        /* The cache takes its own reference on the vnode. */
                        vrefact(*vpp);
                        mp->mnt_rootvnode = *vpp;
                } else {
                        /*
                         * Something is already cached.  A different,
                         * non-doomed vnode there is a fatal inconsistency.
                         */
                        if (mp->mnt_rootvnode != *vpp) {
                                if (!VN_IS_DOOMED(mp->mnt_rootvnode)) {
                                        panic("%s: mismatch between vnode returned "
                                            " by VFS_CACHEDROOT and the one cached "
                                            " (%p != %p)",
                                            __func__, *vpp, mp->mnt_rootvnode);
                                }
                        }
                }
                MNT_IUNLOCK(mp);
        }
        return (0);
}
5772
5773 int
5774 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp)
5775 {
5776         struct vnode *vp;
5777         int error;
5778
5779         if (!vfs_op_thread_enter(mp))
5780                 return (vfs_cache_root_fallback(mp, flags, vpp));
5781         vp = (struct vnode *)atomic_load_ptr(&mp->mnt_rootvnode);
5782         if (vp == NULL || VN_IS_DOOMED(vp)) {
5783                 vfs_op_thread_exit(mp);
5784                 return (vfs_cache_root_fallback(mp, flags, vpp));
5785         }
5786         vrefact(vp);
5787         vfs_op_thread_exit(mp);
5788         error = vn_lock(vp, flags);
5789         if (error != 0) {
5790                 vrele(vp);
5791                 return (vfs_cache_root_fallback(mp, flags, vpp));
5792         }
5793         *vpp = vp;
5794         return (0);
5795 }
5796
5797 struct vnode *
5798 vfs_cache_root_clear(struct mount *mp)
5799 {
5800         struct vnode *vp;
5801
5802         /*
5803          * ops > 0 guarantees there is nobody who can see this vnode
5804          */
5805         MPASS(mp->mnt_vfs_ops > 0);
5806         vp = mp->mnt_rootvnode;
5807         mp->mnt_rootvnode = NULL;
5808         return (vp);
5809 }
5810
/*
 * Install 'vp' as the cached root vnode of 'mp', taking a reference on it
 * with vrefact().  Asserts that mnt_vfs_ops is greater than 0.
 */
void
vfs_cache_root_set(struct mount *mp, struct vnode *vp)
{

        MPASS(mp->mnt_vfs_ops > 0);
        vrefact(vp);
        mp->mnt_rootvnode = vp;
}
5819
5820 /*
5821  * These are helper functions for filesystems to traverse all
5822  * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
5823  *
5824  * This interface replaces MNT_VNODE_FOREACH.
5825  */
5826
5827
5828 struct vnode *
5829 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
5830 {
5831         struct vnode *vp;
5832
5833         if (should_yield())
5834                 kern_yield(PRI_USER);
5835         MNT_ILOCK(mp);
5836         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
5837         for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
5838             vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
5839                 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
5840                 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
5841                         continue;
5842                 VI_LOCK(vp);
5843                 if (VN_IS_DOOMED(vp)) {
5844                         VI_UNLOCK(vp);
5845                         continue;
5846                 }
5847                 break;
5848         }
5849         if (vp == NULL) {
5850                 __mnt_vnode_markerfree_all(mvp, mp);
5851                 /* MNT_IUNLOCK(mp); -- done in above function */
5852                 mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
5853                 return (NULL);
5854         }
5855         TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
5856         TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
5857         MNT_IUNLOCK(mp);
5858         return (vp);
5859 }
5860
5861 struct vnode *
5862 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
5863 {
5864         struct vnode *vp;
5865
5866         *mvp = vn_alloc_marker(mp);
5867         MNT_ILOCK(mp);
5868         MNT_REF(mp);
5869
5870         TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
5871                 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
5872                 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
5873                         continue;
5874                 VI_LOCK(vp);
5875                 if (VN_IS_DOOMED(vp)) {
5876                         VI_UNLOCK(vp);
5877                         continue;
5878                 }
5879                 break;
5880         }
5881         if (vp == NULL) {
5882                 MNT_REL(mp);
5883                 MNT_IUNLOCK(mp);
5884                 vn_free_marker(*mvp);
5885                 *mvp = NULL;
5886                 return (NULL);
5887         }
5888         TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
5889         MNT_IUNLOCK(mp);
5890         return (vp);
5891 }
5892
5893 void
5894 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
5895 {
5896
5897         if (*mvp == NULL) {
5898                 MNT_IUNLOCK(mp);
5899                 return;
5900         }
5901
5902         mtx_assert(MNT_MTX(mp), MA_OWNED);
5903
5904         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
5905         TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
5906         MNT_REL(mp);
5907         MNT_IUNLOCK(mp);
5908         vn_free_marker(*mvp);
5909         *mvp = NULL;
5910 }
5911
5912 /*
5913  * These are helper functions for filesystems to traverse their
5914  * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
5915  */
5916 static void
5917 mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
5918 {
5919
5920         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
5921
5922         MNT_ILOCK(mp);
5923         MNT_REL(mp);
5924         MNT_IUNLOCK(mp);
5925         vn_free_marker(*mvp);
5926         *mvp = NULL;
5927 }
5928
5929 /*
5930  * Relock the mp mount vnode list lock with the vp vnode interlock in the
5931  * conventional lock order during mnt_vnode_next_active iteration.
5932  *
5933  * On entry, the mount vnode list lock is held and the vnode interlock is not.
5934  * The list lock is dropped and reacquired.  On success, both locks are held.
5935  * On failure, the mount vnode list lock is held but the vnode interlock is
5936  * not, and the procedure may have yielded.
5937  */
5938 static bool
5939 mnt_vnode_next_active_relock(struct vnode *mvp, struct mount *mp,
5940     struct vnode *vp)
5941 {
5942         const struct vnode *tmp;
5943         bool held, ret;
5944
5945         VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
5946             TAILQ_NEXT(mvp, v_actfreelist) != NULL, mvp,
5947             ("%s: bad marker", __func__));
5948         VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
5949             ("%s: inappropriate vnode", __func__));
5950         ASSERT_VI_UNLOCKED(vp, __func__);
5951         mtx_assert(&mp->mnt_listmtx, MA_OWNED);
5952
5953         ret = false;
5954
5955         TAILQ_REMOVE(&mp->mnt_activevnodelist, mvp, v_actfreelist);
5956         TAILQ_INSERT_BEFORE(vp, mvp, v_actfreelist);
5957
5958         /*
5959          * Use a hold to prevent vp from disappearing while the mount vnode
5960          * list lock is dropped and reacquired.  Normally a hold would be
5961          * acquired with vhold(), but that might try to acquire the vnode
5962          * interlock, which would be a LOR with the mount vnode list lock.
5963          */
5964         held = refcount_acquire_if_not_zero(&vp->v_holdcnt);
5965         mtx_unlock(&mp->mnt_listmtx);
5966         if (!held)
5967                 goto abort;
5968         VI_LOCK(vp);
5969         if (!refcount_release_if_not_last(&vp->v_holdcnt)) {
5970                 vdropl(vp);
5971                 goto abort;
5972         }
5973         mtx_lock(&mp->mnt_listmtx);
5974
5975         /*
5976          * Determine whether the vnode is still the next one after the marker,
5977          * excepting any other markers.  If the vnode has not been doomed by
5978          * vgone() then the hold should have ensured that it remained on the
5979          * active list.  If it has been doomed but is still on the active list,
5980          * don't abort, but rather skip over it (avoid spinning on doomed
5981          * vnodes).
5982          */
5983         tmp = mvp;
5984         do {
5985                 tmp = TAILQ_NEXT(tmp, v_actfreelist);
5986         } while (tmp != NULL && tmp->v_type == VMARKER);
5987         if (tmp != vp) {
5988                 mtx_unlock(&mp->mnt_listmtx);
5989                 VI_UNLOCK(vp);
5990                 goto abort;
5991         }
5992
5993         ret = true;
5994         goto out;
5995 abort:
5996         maybe_yield();
5997         mtx_lock(&mp->mnt_listmtx);
5998 out:
5999         if (ret)
6000                 ASSERT_VI_LOCKED(vp, __func__);
6001         else
6002                 ASSERT_VI_UNLOCKED(vp, __func__);
6003         mtx_assert(&mp->mnt_listmtx, MA_OWNED);
6004         return (ret);
6005 }
6006
6007 static struct vnode *
6008 mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
6009 {
6010         struct vnode *vp, *nvp;
6011
6012         mtx_assert(&mp->mnt_listmtx, MA_OWNED);
6013         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
6014 restart:
6015         vp = TAILQ_NEXT(*mvp, v_actfreelist);
6016         while (vp != NULL) {
6017                 if (vp->v_type == VMARKER) {
6018                         vp = TAILQ_NEXT(vp, v_actfreelist);
6019                         continue;
6020                 }
6021                 /*
6022                  * Try-lock because this is the wrong lock order.  If that does
6023                  * not succeed, drop the mount vnode list lock and try to
6024                  * reacquire it and the vnode interlock in the right order.
6025                  */
6026                 if (!VI_TRYLOCK(vp) &&
6027                     !mnt_vnode_next_active_relock(*mvp, mp, vp))
6028                         goto restart;
6029                 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
6030                 KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
6031                     ("alien vnode on the active list %p %p", vp, mp));
6032                 if (vp->v_mount == mp && !VN_IS_DOOMED(vp))
6033                         break;
6034                 nvp = TAILQ_NEXT(vp, v_actfreelist);
6035                 VI_UNLOCK(vp);
6036                 vp = nvp;
6037         }
6038         TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
6039
6040         /* Check if we are done */
6041         if (vp == NULL) {
6042                 mtx_unlock(&mp->mnt_listmtx);
6043                 mnt_vnode_markerfree_active(mvp, mp);
6044                 return (NULL);
6045         }
6046         TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
6047         mtx_unlock(&mp->mnt_listmtx);
6048         ASSERT_VI_LOCKED(vp, "active iter");
6049         KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
6050         return (vp);
6051 }
6052
6053 struct vnode *
6054 __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
6055 {
6056
6057         if (should_yield())
6058                 kern_yield(PRI_USER);
6059         mtx_lock(&mp->mnt_listmtx);
6060         return (mnt_vnode_next_active(mvp, mp));
6061 }
6062
6063 struct vnode *
6064 __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
6065 {
6066         struct vnode *vp;
6067
6068         *mvp = vn_alloc_marker(mp);
6069         MNT_ILOCK(mp);
6070         MNT_REF(mp);
6071         MNT_IUNLOCK(mp);
6072
6073         mtx_lock(&mp->mnt_listmtx);
6074         vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
6075         if (vp == NULL) {
6076                 mtx_unlock(&mp->mnt_listmtx);
6077                 mnt_vnode_markerfree_active(mvp, mp);
6078                 return (NULL);
6079         }
6080         TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
6081         return (mnt_vnode_next_active(mvp, mp));
6082 }
6083
6084 void
6085 __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
6086 {
6087
6088         if (*mvp == NULL)
6089                 return;
6090
6091         mtx_lock(&mp->mnt_listmtx);
6092         TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
6093         mtx_unlock(&mp->mnt_listmtx);
6094         mnt_vnode_markerfree_active(mvp, mp);
6095 }