sys/kern/vfs_subr.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  * (c) UNIX System Laboratories, Inc.
   7  * All or some portions of this file are derived from material licensed
   8  * to the University of California by American Telephone and Telegraph
   9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  10  * the permission of UNIX System Laboratories, Inc.
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  * 1. Redistributions of source code must retain the above copyright
  16  *    notice, this list of conditions and the following disclaimer.
  17  * 2. Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in the
  19  *    documentation and/or other materials provided with the distribution.
  20  * 3. Neither the name of the University nor the names of its contributors
  21  *    may be used to endorse or promote products derived from this software
  22  *    without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  */
  36
  37 /*
  38  * External virtual filesystem routines
  39  */
  40
  41 #include <sys/cdefs.h>
  42 #include "opt_ddb.h"
  43 #include "opt_watchdog.h"
  44
  45 #include <sys/param.h>
  46 #include <sys/systm.h>
  47 #include <sys/asan.h>
  48 #include <sys/bio.h>
  49 #include <sys/buf.h>
  50 #include <sys/capsicum.h>
  51 #include <sys/condvar.h>
  52 #include <sys/conf.h>
  53 #include <sys/counter.h>
  54 #include <sys/dirent.h>
  55 #include <sys/event.h>
  56 #include <sys/eventhandler.h>
  57 #include <sys/extattr.h>
  58 #include <sys/file.h>
  59 #include <sys/fcntl.h>
  60 #include <sys/jail.h>
  61 #include <sys/kdb.h>
  62 #include <sys/kernel.h>
  63 #include <sys/kthread.h>
  64 #include <sys/ktr.h>
  65 #include <sys/limits.h>
  66 #include <sys/lockf.h>
  67 #include <sys/malloc.h>
  68 #include <sys/mount.h>
  69 #include <sys/namei.h>
  70 #include <sys/pctrie.h>
  71 #include <sys/priv.h>
  72 #include <sys/reboot.h>
  73 #include <sys/refcount.h>
  74 #include <sys/rwlock.h>
  75 #include <sys/sched.h>
  76 #include <sys/sleepqueue.h>
  77 #include <sys/smr.h>
  78 #include <sys/smp.h>
  79 #include <sys/stat.h>
  80 #include <sys/sysctl.h>
  81 #include <sys/syslog.h>
  82 #include <sys/vmmeter.h>
  83 #include <sys/vnode.h>
  84 #include <sys/watchdog.h>
  85
  86 #include <machine/stdarg.h>
  87
  88 #include <security/mac/mac_framework.h>
  89
  90 #include <vm/vm.h>
  91 #include <vm/vm_object.h>
  92 #include <vm/vm_extern.h>
  93 #include <vm/pmap.h>
  94 #include <vm/vm_map.h>
  95 #include <vm/vm_page.h>
  96 #include <vm/vm_kern.h>
  97 #include <vm/uma.h>
  98
  99 #if defined(DEBUG_VFS_LOCKS) && (!defined(INVARIANTS) || !defined(WITNESS))
 100 #error DEBUG_VFS_LOCKS requires INVARIANTS and WITNESS
 101 #endif
 102
 103 #ifdef DDB
 104 #include <ddb/ddb.h>
 105 #endif
 106
 107 static void     delmntque(struct vnode *vp);
 108 static int      flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
 109                     int slpflag, int slptimeo);
 110 static void     syncer_shutdown(void *arg, int howto);
 111 static int      vtryrecycle(struct vnode *vp, bool isvnlru);
 112 static void     v_init_counters(struct vnode *);
 113 static void     vn_seqc_init(struct vnode *);
 114 static void     vn_seqc_write_end_free(struct vnode *vp);
 115 static void     vgonel(struct vnode *);
 116 static bool     vhold_recycle_free(struct vnode *);
 117 static void     vdropl_recycle(struct vnode *vp);
 118 static void     vdrop_recycle(struct vnode *vp);
 119 static void     vfs_knllock(void *arg);
 120 static void     vfs_knlunlock(void *arg);
 121 static void     vfs_knl_assert_lock(void *arg, int what);
 122 static void     destroy_vpollinfo(struct vpollinfo *vi);
 123 static int      v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
 124                     daddr_t startlbn, daddr_t endlbn);
 125 static void     vnlru_recalc(void);
 126
 127 static SYSCTL_NODE(_vfs, OID_AUTO, vnode, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 128     "vnode configuration and statistics");
 129 static SYSCTL_NODE(_vfs_vnode, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 130     "vnode configuration");
 131 static SYSCTL_NODE(_vfs_vnode, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 132     "vnode statistics");
 133 static SYSCTL_NODE(_vfs_vnode, OID_AUTO, vnlru, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 134     "vnode recycling");
 135
 136 /*
 137  * Number of vnodes in existence.  Increased whenever getnewvnode()
 138  * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
 139  */
 140 static u_long __exclusive_cache_line numvnodes;
 141
 142 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
 143     "Number of vnodes in existence (legacy)");
 144 SYSCTL_ULONG(_vfs_vnode_stats, OID_AUTO, count, CTLFLAG_RD, &numvnodes, 0,
 145     "Number of vnodes in existence");
 146
 147 static counter_u64_t vnodes_created;
 148 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
 149     "Number of vnodes created by getnewvnode (legacy)");
 150 SYSCTL_COUNTER_U64(_vfs_vnode_stats, OID_AUTO, created, CTLFLAG_RD, &vnodes_created,
 151     "Number of vnodes created by getnewvnode");
 152
 153 /*
 154  * Conversion tables for conversion from vnode types to inode formats
 155  * and back.
 156  */
 157 __enum_uint8(vtype) iftovt_tab[16] = {
 158         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 159         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
 160 };
 161 int vttoif_tab[10] = {
 162         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 163         S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
 164 };
 165
 166 /*
 167  * List of allocated vnodes in the system.
 168  */
 169 static TAILQ_HEAD(freelst, vnode) vnode_list;
 170 static struct vnode *vnode_list_free_marker;
 171 static struct vnode *vnode_list_reclaim_marker;
 172
 173 /*
 174  * "Free" vnode target.  Free vnodes are rarely completely free, but are
 175  * just ones that are cheap to recycle.  Usually they are for files which
 176  * have been stat'd but not read; these usually have inode and namecache
 177  * data attached to them.  This target is the preferred minimum size of a
 178  * sub-cache consisting mostly of such files. The system balances the size
 179  * of this sub-cache with its complement to try to prevent either from
 180  * thrashing while the other is relatively inactive.  The targets express
 181  * a preference for the best balance.
 182  *
 183  * "Above" this target there are 2 further targets (watermarks) related
 184  * to recycling of free vnodes.  In the best-operating case, the cache is
 185  * exactly full, the free list has size between vlowat and vhiwat above the
 186  * free target, and recycling from it and normal use maintains this state.
 187  * Sometimes the free list is below vlowat or even empty, but this state
 188  * is even better for immediate use provided the cache is not full.
 189  * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
 190  * ones) to reach one of these states.  The watermarks are currently hard-
 191  * coded as 4% and 9% of the available space higher.  These and the default
 192  * of 25% for wantfreevnodes are too large if the memory size is large.
 193  * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
 194  * whenever vnlru_proc() becomes active.
 195  */
 196 static long wantfreevnodes;
 197 static long __exclusive_cache_line freevnodes;
 198 static long freevnodes_old;
 199
 200 static u_long recycles_count;
 201 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD | CTLFLAG_STATS, &recycles_count, 0,
 202     "Number of vnodes recycled to meet vnode cache targets (legacy)");
 203 SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, recycles, CTLFLAG_RD | CTLFLAG_STATS,
 204     &recycles_count, 0,
 205     "Number of vnodes recycled to meet vnode cache targets");
 206
 207 static u_long recycles_free_count;
 208 SYSCTL_ULONG(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD | CTLFLAG_STATS,
 209     &recycles_free_count, 0,
 210     "Number of free vnodes recycled to meet vnode cache targets (legacy)");
 211 SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, recycles_free, CTLFLAG_RD | CTLFLAG_STATS,
 212     &recycles_free_count, 0,
 213     "Number of free vnodes recycled to meet vnode cache targets");
 214
 215 static counter_u64_t direct_recycles_free_count;
 216 SYSCTL_COUNTER_U64(_vfs_vnode_vnlru, OID_AUTO, direct_recycles_free, CTLFLAG_RD,
 217     &direct_recycles_free_count,
 218     "Number of free vnodes recycled by vn_alloc callers to meet vnode cache targets");
 219
 220 static counter_u64_t vnode_skipped_requeues;
 221 SYSCTL_COUNTER_U64(_vfs_vnode_stats, OID_AUTO, skipped_requeues, CTLFLAG_RD, &vnode_skipped_requeues,
 222     "Number of times LRU requeue was skipped due to lock contention");
 223
 224 static u_long deferred_inact;
 225 SYSCTL_ULONG(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD,
 226     &deferred_inact, 0, "Number of times inactive processing was deferred");
 227
 228 /* To keep more than one thread at a time from running vfs_getnewfsid */
 229 static struct mtx mntid_mtx;
 230
 231 /*
 232  * Lock for any access to the following:
 233  *      vnode_list
 234  *      numvnodes
 235  *      freevnodes
 236  */
 237 static struct mtx __exclusive_cache_line vnode_list_mtx;
 238
 239 /* Publicly exported FS */
 240 struct nfs_public nfs_pub;
 241
 242 static uma_zone_t buf_trie_zone;
 243 static smr_t buf_trie_smr;
 244
 245 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
 246 static uma_zone_t vnode_zone;
 247 MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll");
 248
 249 __read_frequently smr_t vfs_smr;
 250
 251 /*
 252  * The workitem queue.
 253  *
 254  * It is useful to delay writes of file data and filesystem metadata
 255  * for tens of seconds so that quickly created and deleted files need
 256  * not waste disk bandwidth being created and removed. To realize this,
 257  * we append vnodes to a "workitem" queue. When running with a soft
 258  * updates implementation, most pending metadata dependencies should
 259  * not wait for more than a few seconds. Thus, writes to mounted block
 260  * devices are delayed only about half the time that file data is delayed.
 261  * Similarly, directory updates are more critical, so are only delayed
 262  * about a third the time that file data is delayed. Thus, there are
 263  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 264  * one each second (driven off the filesystem syncer process). The
 265  * syncer_delayno variable indicates the next queue that is to be processed.
 266  * Items that need to be processed soon are placed in this queue:
 267  *
 268  *      syncer_workitem_pending[syncer_delayno]
 269  *
 270  * A delay of fifteen seconds is done by placing the request fifteen
 271  * entries later in the queue:
 272  *
 273  *      syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 274  *
 275  */
 276 static int syncer_delayno;
 277 static long syncer_mask;
 278 LIST_HEAD(synclist, bufobj);
 279 static struct synclist *syncer_workitem_pending;
 280 /*
 281  * The sync_mtx protects:
 282  *      bo->bo_synclist
 283  *      sync_vnode_count
 284  *      syncer_delayno
 285  *      syncer_state
 286  *      syncer_workitem_pending
 287  *      syncer_worklist_len
 288  *      rushjob
 289  */
 290 static struct mtx sync_mtx;
 291 static struct cv sync_wakeup;
 292
 293 #define SYNCER_MAXDELAY         32
 294 static int syncer_maxdelay = SYNCER_MAXDELAY;   /* maximum delay time */
 295 static int syncdelay = 30;              /* max time to delay syncing data */
 296 static int filedelay = 30;              /* time to delay syncing files */
 297 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
 298     "Time to delay syncing files (in seconds)");
 299 static int dirdelay = 29;               /* time to delay syncing directories */
 300 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
 301     "Time to delay syncing directories (in seconds)");
 302 static int metadelay = 28;              /* time to delay syncing metadata */
 303 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
 304     "Time to delay syncing metadata (in seconds)");
 305 static int rushjob;             /* number of slots to run ASAP */
 306 static int stat_rush_requests;  /* number of times I/O speeded up */
 307 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
 308     "Number of times I/O speeded up (rush requests)");
 309
 310 #define VDBATCH_SIZE 8     /* number of vnode slots in one struct vdbatch */
 311 struct vdbatch {           /* one instance exists per CPU (DPCPU "vd" below) */
 312         u_int index;       /* NOTE(review): presumably the count of used tab[] slots -- confirm in the vdbatch users */
 313         struct mtx lock;   /* "vdbatch" MTX_DEF mutex, initialized per CPU in vntblinit() */
 314         struct vnode *tab[VDBATCH_SIZE];       /* vnode pointers held in this batch */
 315 };
 316 DPCPU_DEFINE_STATIC(struct vdbatch, vd);       /* file-local per-CPU variable named vd */
 317
 318 static void     vdbatch_dequeue(struct vnode *vp);
 319
 320 /*
 321  * When shutting down the syncer, run it at four times normal speed.
 322  */
 323 #define SYNCER_SHUTDOWN_SPEEDUP         4
 324 static int sync_vnode_count;
 325 static int syncer_worklist_len;
 326 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
 327     syncer_state;
 328
 329 /* Target for maximum number of vnodes. */
 330 u_long desiredvnodes;
 331 static u_long gapvnodes;                /* gap between wanted and desired */
 332 static u_long vhiwat;           /* enough extras after expansion */
 333 static u_long vlowat;           /* minimal extras before expansion */
 334 static bool vstir;              /* nonzero to stir non-free vnodes */
 335 static volatile int vsmalltrigger = 8;  /* pref to keep if > this many pages */
 336
 337 static u_long vnlru_read_freevnodes(void);
 338
 339 /*
 340  * Note that no attempt is made to sanitize these parameters.
 341  */
 342 static int
 343 sysctl_maxvnodes(SYSCTL_HANDLER_ARGS)
 344 {
 345         u_long val;
 346         int error;
 347
 348         val = desiredvnodes;
 349         error = sysctl_handle_long(oidp, &val, 0, req);
 350         if (error != 0 || req->newptr == NULL)
 351                 return (error);
 352
 353         if (val == desiredvnodes)
 354                 return (0);
 355         mtx_lock(&vnode_list_mtx);
 356         desiredvnodes = val;
 357         wantfreevnodes = desiredvnodes / 4;
 358         vnlru_recalc();
 359         mtx_unlock(&vnode_list_mtx);
 360         /*
 361          * XXX There is no protection against multiple threads changing
 362          * desiredvnodes at the same time. Locking above only helps vnlru and
 363          * getnewvnode.
 364          */
 365         vfs_hash_changesize(desiredvnodes);
 366         cache_changesize(desiredvnodes);
 367         return (0);
 368 }
 369
 370 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
 371     CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes,
 372     "LU", "Target for maximum number of vnodes (legacy)");
 373 SYSCTL_PROC(_vfs_vnode_param, OID_AUTO, limit,
 374     CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes,
 375     "LU", "Target for maximum number of vnodes");
 376
 377 static int
 378 sysctl_freevnodes(SYSCTL_HANDLER_ARGS)
 379 {
 380         u_long rfreevnodes;
 381
 382         rfreevnodes = vnlru_read_freevnodes();
 383         return (sysctl_handle_long(oidp, &rfreevnodes, 0, req));
 384 }
 385
 386 SYSCTL_PROC(_vfs, OID_AUTO, freevnodes,
 387     CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_freevnodes,
 388     "LU", "Number of \"free\" vnodes (legacy)");
 389 SYSCTL_PROC(_vfs_vnode_stats, OID_AUTO, free,
 390     CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_freevnodes,
 391     "LU", "Number of \"free\" vnodes");
 392
 393 static int
 394 sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS)
 395 {
 396         u_long val;
 397         int error;
 398
 399         val = wantfreevnodes;
 400         error = sysctl_handle_long(oidp, &val, 0, req);
 401         if (error != 0 || req->newptr == NULL)
 402                 return (error);
 403
 404         if (val == wantfreevnodes)
 405                 return (0);
 406         mtx_lock(&vnode_list_mtx);
 407         wantfreevnodes = val;
 408         vnlru_recalc();
 409         mtx_unlock(&vnode_list_mtx);
 410         return (0);
 411 }
 412
 413 SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes,
 414     CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes,
 415     "LU", "Target for minimum number of \"free\" vnodes (legacy)");
 416 SYSCTL_PROC(_vfs_vnode_param, OID_AUTO, wantfree,
 417     CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes,
 418     "LU", "Target for minimum number of \"free\" vnodes");
 419
 420 static int vnlru_nowhere;
 421 SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, failed_runs, CTLFLAG_RD | CTLFLAG_STATS,
 422     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
 423
 424 static int
 425 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
 426 {
 427         struct vnode *vp;
 428         struct nameidata nd;
 429         char *buf;
 430         unsigned long ndflags;
 431         int error;
 432
 433         if (req->newptr == NULL)
 434                 return (EINVAL);
 435         if (req->newlen >= PATH_MAX)
 436                 return (E2BIG);
 437
 438         buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
 439         error = SYSCTL_IN(req, buf, req->newlen);
 440         if (error != 0)
 441                 goto out;
 442
 443         buf[req->newlen] = '\0';
 444
 445         ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1;
 446         NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf);
 447         if ((error = namei(&nd)) != 0)
 448                 goto out;
 449         vp = nd.ni_vp;
 450
 451         if (VN_IS_DOOMED(vp)) {
 452                 /*
 453                  * This vnode is being recycled.  Return != 0 to let the caller
 454                  * know that the sysctl had no effect.  Return EAGAIN because a
 455                  * subsequent call will likely succeed (since namei will create
 456                  * a new vnode if necessary)
 457                  */
 458                 error = EAGAIN;
 459                 goto putvnode;
 460         }
 461
 462         vgone(vp);
 463 putvnode:
 464         vput(vp);
 465         NDFREE_PNBUF(&nd);
 466 out:
 467         free(buf, M_TEMP);
 468         return (error);
 469 }
 470
 471 static int
 472 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
 473 {
 474         struct thread *td = curthread;
 475         struct vnode *vp;
 476         struct file *fp;
 477         int error;
 478         int fd;
 479
 480         if (req->newptr == NULL)
 481                 return (EBADF);
 482
 483         error = sysctl_handle_int(oidp, &fd, 0, req);
 484         if (error != 0)
 485                 return (error);
 486         error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
 487         if (error != 0)
 488                 return (error);
 489         vp = fp->f_vnode;
 490
 491         error = vn_lock(vp, LK_EXCLUSIVE);
 492         if (error != 0)
 493                 goto drop;
 494
 495         vgone(vp);
 496         VOP_UNLOCK(vp);
 497 drop:
 498         fdrop(fp, td);
 499         return (error);
 500 }
 501
 502 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
 503     CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
 504     sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
 505 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
 506     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
 507     sysctl_ftry_reclaim_vnode, "I",
 508     "Try to reclaim a vnode by its file descriptor");
 509
 510 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
 511 #define vnsz2log 8
 512 #ifndef DEBUG_LOCKS
 513 _Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log &&
 514     sizeof(struct vnode) < 1UL << (vnsz2log + 1),
 515     "vnsz2log needs to be updated");
 516 #endif
 517
 518 /*
 519  * Support for the bufobj clean & dirty pctrie.
 520  */
 521 static void *
 522 buf_trie_alloc(struct pctrie *ptree)
 523 {
 524         return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT));
 525 }
 526
 527 static void
 528 buf_trie_free(struct pctrie *ptree, void *node)
 529 {
 530         uma_zfree_smr(buf_trie_zone, node);
 531 }
 532 PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free,
 533     buf_trie_smr);
 534
 535 /*
 536  * Initialize the vnode management data structures.
 537  *
 538  * Reevaluate the following cap on the number of vnodes after the physical
 539  * memory size exceeds 512GB.  In the limit, as the physical memory size
 540  * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
 541  */
 542 #ifndef MAXVNODES_MAX
 543 #define MAXVNODES_MAX   (512UL * 1024 * 1024 / 64)      /* 8M */
 544 #endif
 545
 546 static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
 547
 548 static struct vnode *
 549 vn_alloc_marker(struct mount *mp)
 550 {
 551         struct vnode *vp;
 552
 553         vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
 554         vp->v_type = VMARKER;
 555         vp->v_mount = mp;
 556
 557         return (vp);
 558 }
 559
 560 static void
 561 vn_free_marker(struct vnode *vp)
 562 {
 563
 564         MPASS(vp->v_type == VMARKER);
 565         free(vp, M_VNODE_MARKER);
 566 }
 567
 568 #ifdef KASAN
 569 static int
 570 vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused)
 571 {
 572         kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0);
 573         return (0);
 574 }
 575
 576 static void
 577 vnode_dtor(void *mem, int size, void *arg __unused)
 578 {
 579         size_t end1, end2, off1, off2;
 580
 581         _Static_assert(offsetof(struct vnode, v_vnodelist) <
 582             offsetof(struct vnode, v_dbatchcpu),
 583             "KASAN marks require updating");
 584
 585         off1 = offsetof(struct vnode, v_vnodelist);
 586         off2 = offsetof(struct vnode, v_dbatchcpu);
 587         end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist);
 588         end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu);
 589
 590         /*
 591          * Access to the v_vnodelist and v_dbatchcpu fields are permitted even
 592          * after the vnode has been freed.  Try to get some KASAN coverage by
 593          * marking everything except those two fields as invalid.  Because
 594          * KASAN's tracking is not byte-granular, any preceding fields sharing
 595          * the same 8-byte aligned word must also be marked valid.
 596          */
 597
 598         /* Handle the area from the start until v_vnodelist... */
 599         off1 = rounddown2(off1, KASAN_SHADOW_SCALE);
 600         kasan_mark(mem, off1, off1, KASAN_UMA_FREED);
 601
 602         /* ... then the area between v_vnodelist and v_dbatchcpu ... */
 603         off1 = roundup2(end1, KASAN_SHADOW_SCALE);
 604         off2 = rounddown2(off2, KASAN_SHADOW_SCALE);
 605         if (off2 > off1)
 606                 kasan_mark((void *)((char *)mem + off1), off2 - off1,
 607                     off2 - off1, KASAN_UMA_FREED);
 608
 609         /* ... and finally the area from v_dbatchcpu to the end. */
 610         off2 = roundup2(end2, KASAN_SHADOW_SCALE);
 611         kasan_mark((void *)((char *)mem + off2), size - off2, size - off2,
 612             KASAN_UMA_FREED);
 613 }
 614 #endif /* KASAN */
 615
 616 /*
 617  * Initialize a vnode as it first enters the zone.
 618  */
 619 static int
 620 vnode_init(void *mem, int size, int flags)
 621 {
 622         struct vnode *vp;
 623
 624         vp = mem;
 625         bzero(vp, size);
 626         /*
 627          * Setup locks.
 628          */
 629         vp->v_vnlock = &vp->v_lock;
 630         mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
 631         /*
 632          * By default, don't allow shared locks unless filesystems opt-in.
 633          */
 634         lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
 635             LK_NOSHARE | LK_IS_VNODE);
 636         /*
 637          * Initialize bufobj.
 638          */
 639         bufobj_init(&vp->v_bufobj, vp);
 640         /*
 641          * Initialize namecache.
 642          */
 643         cache_vnode_init(vp);
 644         /*
 645          * Initialize rangelocks.
 646          */
 647         rangelock_init(&vp->v_rl);
 648
 649         vp->v_dbatchcpu = NOCPU;
 650
 651         vp->v_state = VSTATE_DEAD;
 652
 653         /*
 654          * Check vhold_recycle_free for an explanation.
 655          */
 656         vp->v_holdcnt = VHOLD_NO_SMR;
 657         vp->v_type = VNON;
 658         mtx_lock(&vnode_list_mtx);
 659         TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
 660         mtx_unlock(&vnode_list_mtx);
 661         return (0);
 662 }
 663
 664 /*
 665  * Free a vnode when it is cleared from the zone.
 666  */
 667 static void
 668 vnode_fini(void *mem, int size)
 669 {
 670         struct vnode *vp;
 671         struct bufobj *bo;
 672
 673         vp = mem;
 674         vdbatch_dequeue(vp);
 675         mtx_lock(&vnode_list_mtx);
 676         TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
 677         mtx_unlock(&vnode_list_mtx);
 678         rangelock_destroy(&vp->v_rl);
 679         lockdestroy(vp->v_vnlock);
 680         mtx_destroy(&vp->v_interlock);
 681         bo = &vp->v_bufobj;
 682         rw_destroy(BO_LOCKPTR(bo));
 683
 684         kasan_mark(mem, size, size, 0);
 685 }
 686
 687 /*
 688  * Provide the size of NFS nclnode and NFS fh for calculation of the
 689  * vnode memory consumption.  The size is specified directly to
 690  * eliminate dependency on NFS-private header.
 691  *
 692  * Other filesystems may use bigger or smaller (like UFS and ZFS)
 693  * private inode data, but the NFS-based estimation is ample enough.
 694  * Still, we care about differences in the size between 64- and 32-bit
 695  * platforms.
 696  *
 697  * Namecache structure size is heuristically
 698  * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
 699  */
 700 #ifdef _LP64
 701 #define NFS_NCLNODE_SZ  (528 + 64)
 702 #define NC_SZ           148
 703 #else
 704 #define NFS_NCLNODE_SZ  (360 + 32)
 705 #define NC_SZ           92
 706 #endif
 707
 708 static void
 709 vntblinit(void *dummy __unused)
 710 {
 711         struct vdbatch *vd;
 712         uma_ctor ctor;
 713         uma_dtor dtor;
 714         int cpu, physvnodes, virtvnodes;
 715
 716         /*
 717          * Desiredvnodes is a function of the physical memory size and the
 718          * kernel's heap size.  Generally speaking, it scales with the
 719          * physical memory size.  The ratio of desiredvnodes to the physical
 720          * memory size is 1:16 until desiredvnodes exceeds 98,304.
 721          * Thereafter, the
 722          * marginal ratio of desiredvnodes to the physical memory size is
 723          * 1:64.  However, desiredvnodes is limited by the kernel's heap
 724          * size.  The memory required by desiredvnodes vnodes and vm objects
 725          * must not exceed 1/10th of the kernel's heap size.
 726          */
 727         physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
 728             3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
 729         virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
 730             sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
 731         desiredvnodes = min(physvnodes, virtvnodes);
 732         if (desiredvnodes > MAXVNODES_MAX) {
 733                 if (bootverbose)
 734                         printf("Reducing kern.maxvnodes %lu -> %lu\n",
 735                             desiredvnodes, MAXVNODES_MAX);
 736                 desiredvnodes = MAXVNODES_MAX;
 737         }
 738         wantfreevnodes = desiredvnodes / 4;
 739         mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
 740         TAILQ_INIT(&vnode_list);
 741         mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF);
 742         /*
 743          * The lock is taken to appease WITNESS.
 744          */
 745         mtx_lock(&vnode_list_mtx);
 746         vnlru_recalc();
 747         mtx_unlock(&vnode_list_mtx);
 748         vnode_list_free_marker = vn_alloc_marker(NULL);
 749         TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist);
 750         vnode_list_reclaim_marker = vn_alloc_marker(NULL);
 751         TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
 752
 753 #ifdef KASAN
 754         ctor = vnode_ctor;
 755         dtor = vnode_dtor;
 756 #else
 757         ctor = NULL;
 758         dtor = NULL;
 759 #endif
 760         vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor,
 761             vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN);
 762         uma_zone_set_smr(vnode_zone, vfs_smr);
 763
 764         /*
 765          * Preallocate enough nodes to support one-per buf so that
 766          * we can not fail an insert.  reassignbuf() callers can not
 767          * tolerate the insertion failure.
 768          */
 769         buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
 770             NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
 771             UMA_ZONE_NOFREE | UMA_ZONE_SMR);
 772         buf_trie_smr = uma_zone_get_smr(buf_trie_zone);
 773         uma_prealloc(buf_trie_zone, nbuf);
 774
 775         vnodes_created = counter_u64_alloc(M_WAITOK);
 776         direct_recycles_free_count = counter_u64_alloc(M_WAITOK);
 777         vnode_skipped_requeues = counter_u64_alloc(M_WAITOK);
 778
 779         /*
 780          * Initialize the filesystem syncer.
 781          */
 782         syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
 783             &syncer_mask);
 784         syncer_maxdelay = syncer_mask + 1;
 785         mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
 786         cv_init(&sync_wakeup, "syncer");
 787
 788         CPU_FOREACH(cpu) {
 789                 vd = DPCPU_ID_PTR((cpu), vd);
 790                 bzero(vd, sizeof(*vd));
 791                 mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
 792         }
 793 }
 794 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
 795
 796 /*
 797  * Mark a mount point as busy. Used to synchronize access and to delay
 798  * unmounting. Eventually, mountlist_mtx is not released on failure.
 799  *
 800  * vfs_busy() is a custom lock, it can block the caller.
 801  * vfs_busy() only sleeps if the unmount is active on the mount point.
 802  * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
 803  * vnode belonging to mp.
 804  *
 805  * Lookup uses vfs_busy() to traverse mount points.
 806  * root fs                      var fs
 807  * / vnode lock         A       / vnode lock (/var)             D
 808  * /var vnode lock      B       /log vnode lock(/var/log)       E
 809  * vfs_busy lock        C       vfs_busy lock                   F
 810  *
 811  * Within each file system, the lock order is C->A->B and F->D->E.
 812  *
 813  * When traversing across mounts, the system follows that lock order:
 814  *
 815  *        C->A->B
 816  *              |
 817  *              +->F->D->E
 818  *
 819  * The lookup() process for namei("/var") illustrates the process:
 820  *  1. VOP_LOOKUP() obtains B while A is held
 821  *  2. vfs_busy() obtains a shared lock on F while A and B are held
 822  *  3. vput() releases lock on B
 823  *  4. vput() releases lock on A
 824  *  5. VFS_ROOT() obtains lock on D while shared lock on F is held
 825  *  6. vfs_unbusy() releases shared lock on F
 826  *  7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
 827  *     Attempt to lock A (instead of vp_crossmp) while D is held would
 828  *     violate the global order, causing deadlocks.
 829  *
 830  * dounmount() locks B while F is drained.  Note that for stacked
 831  * filesystems, D and B in the example above may be the same lock,
 832  * which introduces a potential lock order reversal deadlock between
 833  * dounmount() and step 5 above.  These filesystems may avoid the LOR
 834  * by setting VV_CROSSLOCK on the covered vnode so that lock B will
 835  * remain held until after step 5.
 836  */
 837 int
 838 vfs_busy(struct mount *mp, int flags)
 839 {
 840         struct mount_pcpu *mpcpu;
 841
 842         MPASS((flags & ~MBF_MASK) == 0); /* only MBF_MASK bits are accepted */
 843         CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
 844         /* Fast path: bump ref and lockref through mpcpu, no MNT_ILOCK taken. */
 845         if (vfs_op_thread_enter(mp, mpcpu)) {
 846                 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
 847                 MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
 848                 MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
 849                 vfs_mp_count_add_pcpu(mpcpu, ref, 1);
 850                 vfs_mp_count_add_pcpu(mpcpu, lockref, 1);
 851                 vfs_op_thread_exit(mp, mpcpu);
 852                 if (flags & MBF_MNTLSTLOCK)
 853                         mtx_unlock(&mountlist_mtx);
 854                 return (0);
 855         }
 856         /* Slow path: take MNT_ILOCK and a reference, then check for unmount. */
 857         MNT_ILOCK(mp);
 858         vfs_assert_mount_counters(mp);
 859         MNT_REF(mp);
 860         /*
 861          * If mount point is currently being unmounted, sleep until the
 862          * mount point fate is decided.  If thread doing the unmounting fails,
 863          * it will clear MNTK_UNMOUNT flag before waking us up, indicating
 864          * that this mount point has survived the unmount attempt and vfs_busy
 865          * should retry.  Otherwise the unmounter thread will set MNTK_REFEXPIRE
 866          * flag in addition to MNTK_UNMOUNT, indicating that mount point is
 867          * about to be really destroyed.  vfs_busy needs to release its
 868          * reference on the mount point in this case and return with ENOENT,
 869          * telling the caller the mount it tried to busy is no longer valid.
 870          */
 871         while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 872                 KASSERT(TAILQ_EMPTY(&mp->mnt_uppers),
 873                     ("%s: non-empty upper mount list with pending unmount",
 874                     __func__));
 875                 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
 876                         MNT_REL(mp); /* undo the MNT_REF() above */
 877                         MNT_IUNLOCK(mp);
 878                         CTR1(KTR_VFS, "%s: failed busying before sleeping",
 879                             __func__);
 880                         return (ENOENT); /* mountlist_mtx, if passed in, stays held */
 881                 }
 882                 if (flags & MBF_MNTLSTLOCK) /* not held across the sleep */
 883                         mtx_unlock(&mountlist_mtx);
 884                 mp->mnt_kern_flag |= MNTK_MWAIT;
 885                 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
 886                 if (flags & MBF_MNTLSTLOCK)
 887                         mtx_lock(&mountlist_mtx);
 888                 MNT_ILOCK(mp); /* PDROP left MNT_MTX(mp) released */
 889         }
 890         if (flags & MBF_MNTLSTLOCK) /* success: release mountlist_mtx */
 891                 mtx_unlock(&mountlist_mtx);
 892         mp->mnt_lockref++;
 893         MNT_IUNLOCK(mp);
 894         return (0);
 895 }
 896
 897 /*
 898  * Free a busy filesystem.
 899  */
 900 void
 901 vfs_unbusy(struct mount *mp)
 902 {
 903         struct mount_pcpu *mpcpu;
 904         int c;
 905
 906         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 907         /* Fast path: drop lockref and ref through mpcpu, mirroring vfs_busy(). */
 908         if (vfs_op_thread_enter(mp, mpcpu)) {
 909                 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
 910                 vfs_mp_count_sub_pcpu(mpcpu, lockref, 1);
 911                 vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
 912                 vfs_op_thread_exit(mp, mpcpu);
 913                 return;
 914         }
 915         /* Slow path: update the counters directly under MNT_ILOCK. */
 916         MNT_ILOCK(mp);
 917         vfs_assert_mount_counters(mp);
 918         MNT_REL(mp);
 919         c = --mp->mnt_lockref;
 920         if (mp->mnt_vfs_ops == 0) {
 921                 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
 922                 MNT_IUNLOCK(mp);
 923                 return;
 924         }
 925         if (c < 0) /* lockref went negative */
 926                 vfs_dump_mount_counters(mp);
 927         if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
 928                 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
 929                 CTR1(KTR_VFS, "%s: waking up waiters", __func__);
 930                 mp->mnt_kern_flag &= ~MNTK_DRAINING;
 931                 wakeup(&mp->mnt_lockref); /* wakeup channel is the lockref address */
 932         }
 933         MNT_IUNLOCK(mp);
 934 }
 935
 936 /*
 937  * Lookup a mount point by filesystem identifier.
 938  */
 939 struct mount *
 940 vfs_getvfs(fsid_t *fsid)
 941 {
 942         struct mount *mp;
 943         /* Linear search of mountlist under mountlist_mtx; NULL if no match. */
 944         CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
 945         mtx_lock(&mountlist_mtx);
 946         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 947                 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
 948                         vfs_ref(mp); /* returned referenced; cf. vfs_rel() in vfs_getnewfsid() */
 949                         mtx_unlock(&mountlist_mtx);
 950                         return (mp);
 951                 }
 952         }
 953         mtx_unlock(&mountlist_mtx);
 954         CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
 955         return ((struct mount *) 0);
 956 }
 957
 958 /*
 959  * Lookup a mount point by filesystem identifier, busying it before
 960  * returning.
 961  *
 962  * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
 963  * cache for popular filesystem identifiers.  The cache is lockless, relying
 964  * on the fact that struct mounts are never freed.  In the worst case we may
 965  * get a pointer to an unmounted or even a different filesystem, so we have
 966  * to check what we got, and take the slow path if so.
 967  */
 968 struct mount *
 969 vfs_busyfs(fsid_t *fsid)
 970 {
 971 #define FSID_CACHE_SIZE 256 /* power of two: (size - 1) is used as a mask */
 972         typedef struct mount * volatile vmp_t;
 973         static vmp_t cache[FSID_CACHE_SIZE];
 974         struct mount *mp;
 975         int error;
 976         uint32_t hash;
 977         /* Try the cache slot first; any doubt falls through to the list walk. */
 978         CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
 979         hash = fsid->val[0] ^ fsid->val[1];
 980         hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
 981         mp = cache[hash];
 982         if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0)
 983                 goto slow;
 984         if (vfs_busy(mp, 0) != 0) {
 985                 cache[hash] = NULL; /* cached mount could not be busied: forget it */
 986                 goto slow;
 987         }
 988         if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) /* recheck now that mp is busied */
 989                 return (mp);
 990         else
 991             vfs_unbusy(mp);
 992
 993 slow:
 994         mtx_lock(&mountlist_mtx);
 995         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 996                 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
 997                         error = vfs_busy(mp, MBF_MNTLSTLOCK); /* unlocks on success only */
 998                         if (error) {
 999                                 cache[hash] = NULL;
1000                                 mtx_unlock(&mountlist_mtx);
1001                                 return (NULL);
1002                         }
1003                         cache[hash] = mp; /* remember for later lookups */
1004                         return (mp);
1005                 }
1006         }
1007         CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
1008         mtx_unlock(&mountlist_mtx);
1009         return ((struct mount *) 0);
1010 }
1011
1012 /*
1013  * Check if a user can access privileged mount options.
1014  */
1015 int
1016 vfs_suser(struct mount *mp, struct thread *td)
1017 {
1018         int error;
1019         /* Returns 0 if allowed, EPERM or the priv_check() error otherwise. */
1020         if (jailed(td->td_ucred)) {
1021                 /*
1022                  * If the jail of the calling thread lacks permission for
1023                  * this type of file system, deny immediately.
1024                  */
1025                 if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag))
1026                         return (EPERM);
1027
1028                 /*
1029                  * If the file system was mounted outside the jail of the
1030                  * calling thread, deny immediately.
1031                  */
1032                 if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
1033                         return (EPERM);
1034         }
1035
1036         /*
1037          * If the file system supports delegated administration, we don't
1038          * check for the PRIV_VFS_MOUNT_OWNER privilege - it will be better
1039          * verified by the file system itself.
1040          * Otherwise, if this is not the user that did the original mount,
1041          * we check for the PRIV_VFS_MOUNT_OWNER privilege.
1042          */
1043         if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
1044             mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
1045                 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
1046                         return (error);
1047         }
1048         return (0);
1049 }
1050
1051 /*
1052  * Get a new unique fsid.  Try to make its val[0] unique, since this value
1053  * will be used to create fake device numbers for stat().  Also try (but
1054  * not so hard) to make its val[0] unique mod 2^16, since some emulators only
1055  * support 16-bit device numbers.  We end up with unique val[0]'s for the
1056  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
1057  *
1058  * Keep in mind that several mounts may be running in parallel.  Starting
1059  * the search one past where the previous search terminated is both a
1060  * micro-optimization and a defense against returning the same fsid to
1061  * different mounts.
1062  */
1063 void
1064 vfs_getnewfsid(struct mount *mp)
1065 {
1066         static uint16_t mntid_base; /* next candidate; persists across calls */
1067         struct mount *nmp;
1068         fsid_t tfsid;
1069         int mtype;
1070
1071         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
1072         mtx_lock(&mntid_mtx); /* held across the whole search and assignment */
1073         mtype = mp->mnt_vfc->vfc_typenum;
1074         tfsid.val[1] = mtype; /* val[1] carries the filesystem type number */
1075         mtype = (mtype & 0xFF) << 24;
1076         for (;;) { /* probe candidates until vfs_getvfs() finds no user */
1077                 tfsid.val[0] = makedev(255,
1078                     mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
1079                 mntid_base++;
1080                 if ((nmp = vfs_getvfs(&tfsid)) == NULL)
1081                         break;
1082                 vfs_rel(nmp); /* in use: drop the vfs_getvfs() reference, try the next */
1083         }
1084         mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
1085         mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
1086         mtx_unlock(&mntid_mtx);
1087 }
1088
1089 /*
1090  * Knob to control the precision of file timestamps:
1091  *
1092  *   0 = seconds only; nanoseconds zeroed.
1093  *   1 = seconds and nanoseconds, accurate within 1/HZ.
1094  *   2 = seconds and nanoseconds, truncated to microseconds.
1095  * >=3 = seconds and nanoseconds, maximum precision.
1096  */
1097 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; /* values 0..3 of the knob */
1098
1099 static int timestamp_precision = TSP_USEC; /* default: truncate to microseconds */
1100 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
1101     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
1102     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
1103     "3+: sec + ns (max. precision))");
1104
1105 /*
1106  * Get a current timestamp.
1107  */
1108 void
1109 vfs_timestamp(struct timespec *tsp)
1110 {
1111         struct timeval tv;
1112         /* Fill *tsp at the precision selected by timestamp_precision. */
1113         switch (timestamp_precision) {
1114         case TSP_SEC:
1115                 tsp->tv_sec = time_second;
1116                 tsp->tv_nsec = 0;
1117                 break;
1118         case TSP_HZ:
1119                 getnanotime(tsp);
1120                 break;
1121         case TSP_USEC:
1122                 microtime(&tv);
1123                 TIMEVAL_TO_TIMESPEC(&tv, tsp); /* timeval carries microseconds only */
1124                 break;
1125         case TSP_NSEC:
1126         default: /* also reached for any value outside the enum */
1127                 nanotime(tsp);
1128                 break;
1129         }
1130 }
1131
1132 /*
1133  * Set vnode attributes to VNOVAL
1134  */
1135 void
1136 vattr_null(struct vattr *vap)
1137 {
1138         /* Everything assigned below is VNOVAL except va_type and va_vaflags. */
1139         vap->va_type = VNON;
1140         vap->va_size = VNOVAL;
1141         vap->va_bytes = VNOVAL;
1142         vap->va_mode = VNOVAL;
1143         vap->va_nlink = VNOVAL;
1144         vap->va_uid = VNOVAL;
1145         vap->va_gid = VNOVAL;
1146         vap->va_fsid = VNOVAL;
1147         vap->va_fileid = VNOVAL;
1148         vap->va_blocksize = VNOVAL;
1149         vap->va_rdev = VNOVAL;
1150         vap->va_atime.tv_sec = VNOVAL;
1151         vap->va_atime.tv_nsec = VNOVAL;
1152         vap->va_mtime.tv_sec = VNOVAL;
1153         vap->va_mtime.tv_nsec = VNOVAL;
1154         vap->va_ctime.tv_sec = VNOVAL;
1155         vap->va_ctime.tv_nsec = VNOVAL;
1156         vap->va_birthtime.tv_sec = VNOVAL;
1157         vap->va_birthtime.tv_nsec = VNOVAL;
1158         vap->va_flags = VNOVAL;
1159         vap->va_gen = VNOVAL;
1160         vap->va_vaflags = 0;
1161 }
1162
1163 /*
1164  * Try to reduce the total number of vnodes.
1165  *
1166  * This routine (and its user) are buggy in at least the following ways:
1167  * - all parameters were picked years ago when RAM sizes were significantly
1168  *   smaller
1169  * - it can pick vnodes based on pages used by the vm object, but filesystems
1170  *   like ZFS don't use it making the pick broken
1171  * - since ZFS has its own aging policy it gets partially combated by this one
1172  * - a dedicated method should be provided for filesystems to let them decide
1173  *   whether the vnode should be recycled
1174  *
1175  * This routine is called when we have too many vnodes.  It attempts
1176  * to free <count> vnodes and will potentially free vnodes that still
1177  * have VM backing store (VM backing store is typically the cause
1178  * of a vnode blowout so we want to do this).  Therefore, this operation
1179  * is not considered cheap.
1180  *
1181  * A number of conditions may prevent a vnode from being reclaimed.
1182  * The buffer cache may have references on the vnode, a directory
1183  * vnode may still have references due to the namei cache representing
1184  * underlying files, or the vnode may be in active use.   It is not
1185  * desirable to reuse such vnodes.  These conditions may cause the
1186  * number of vnodes to reach some minimum value regardless of what
1187  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
1188  *
1189  * @param reclaim_nc_src Only reclaim directories with outgoing namecache
1190  *                       entries if this argument is true
1191  * @param trigger        Only reclaim vnodes with fewer than this many resident
1192  *                       pages.
1193  * @param target         How many vnodes to reclaim.
1194  * @return               The number of vnodes that were reclaimed.
1195  */
1196 static int
1197 vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
1198 {
1199         struct vnode *vp, *mvp;
1200         struct mount *mp;
1201         struct vm_object *object;
1202         u_long done;
1203         bool retried;
1204
1205         mtx_assert(&vnode_list_mtx, MA_OWNED); /* held on entry and again on return */
1206
1207         retried = false;
1208         done = 0;
1209
1210         mvp = vnode_list_reclaim_marker; /* the scan continues after this marker */
1211 restart:
1212         vp = mvp;
1213         while (done < target) {
1214                 vp = TAILQ_NEXT(vp, v_vnodelist);
1215                 if (__predict_false(vp == NULL)) /* ran off the end of vnode_list */
1216                         break;
1217
1218                 if (__predict_false(vp->v_type == VMARKER))
1219                         continue;
1220
1221                 /*
1222                  * If it's been deconstructed already, it's still
1223                  * referenced, or it exceeds the trigger, skip it.
1224                  * Also skip free vnodes.  We are trying to make space
1225                  * for more free vnodes, not reduce their count.
1226                  */
1227                 if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
1228                     (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
1229                         goto next_iter;
1230
1231                 if (vp->v_type == VBAD || vp->v_type == VNON)
1232                         goto next_iter;
1233
1234                 object = atomic_load_ptr(&vp->v_object);
1235                 if (object == NULL || object->resident_page_count > trigger) {
1236                         goto next_iter;
1237                 }
1238
1239                 /*
1240                  * Handle races against vnode allocation. Filesystems lock the
1241                  * vnode some time after it gets returned from getnewvnode,
1242                  * despite type and hold count being manipulated earlier.
1243                  * Resorting to checking v_mount restores guarantees present
1244                  * before the global list was reworked to contain all vnodes.
1245                  */
1246                 if (!VI_TRYLOCK(vp))
1247                         goto next_iter;
1248                 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
1249                         VI_UNLOCK(vp);
1250                         goto next_iter;
1251                 }
1252                 if (vp->v_mount == NULL) {
1253                         VI_UNLOCK(vp);
1254                         goto next_iter;
1255                 }
1256                 vholdl(vp); /* hold taken before the list lock is dropped */
1257                 VI_UNLOCK(vp);
1258                 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
1259                 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
1260                 mtx_unlock(&vnode_list_mtx); /* marker is now parked after vp */
1261                 /* From here every exit path drops the hold and restarts at the marker. */
1262                 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1263                         vdrop_recycle(vp);
1264                         goto next_iter_unlocked;
1265                 }
1266                 if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) {
1267                         vdrop_recycle(vp);
1268                         vn_finished_write(mp);
1269                         goto next_iter_unlocked;
1270                 }
1271                 /* Recheck the earlier conditions with VOP_LOCK and VI_LOCK held. */
1272                 VI_LOCK(vp);
1273                 if (vp->v_usecount > 0 ||
1274                     (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
1275                     (vp->v_object != NULL && vp->v_object->handle == vp &&
1276                     vp->v_object->resident_page_count > trigger)) {
1277                         VOP_UNLOCK(vp);
1278                         vdropl_recycle(vp);
1279                         vn_finished_write(mp);
1280                         goto next_iter_unlocked;
1281                 }
1282                 recycles_count++;
1283                 vgonel(vp);
1284                 VOP_UNLOCK(vp);
1285                 vdropl_recycle(vp);
1286                 vn_finished_write(mp);
1287                 done++;
1288 next_iter_unlocked:
1289                 maybe_yield();
1290                 mtx_lock(&vnode_list_mtx);
1291                 goto restart;
1292 next_iter:
1293                 MPASS(vp->v_type != VMARKER);
1294                 if (!should_yield())
1295                         continue;
1296                 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
1297                 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
1298                 mtx_unlock(&vnode_list_mtx);
1299                 kern_yield(PRI_USER);
1300                 mtx_lock(&vnode_list_mtx);
1301                 goto restart;
1302         }
1303         if (done == 0 && !retried) { /* nothing reclaimed: rescan once from the head */
1304                 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
1305                 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
1306                 retried = true;
1307                 goto restart;
1308         }
1309         return (done); /* u_long narrowed to the int return type */
1310 }
1311
1312 static int max_free_per_call = 10000; /* cap applied to count in vnlru_free_impl() */
1313 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_free_per_call, 0,
1314     "limit on vnode free requests per call to the vnlru_free routine (legacy)");
1315 SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, max_free_per_call, CTLFLAG_RW,
1316     &max_free_per_call, 0,
1317     "limit on vnode free requests per call to the vnlru_free routine");
1318
1319 /*
1320  * Attempt to recycle requested amount of free vnodes.
1321  */
1322 static int
1323 vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp, bool isvnlru)
1324 {
1325         struct vnode *vp;
1326         struct mount *mp;
1327         int ocount;
1328         bool retried;
1329         /* Entered with vnode_list_mtx held; every return path has released it. */
1330         mtx_assert(&vnode_list_mtx, MA_OWNED);
1331         if (count > max_free_per_call)
1332                 count = max_free_per_call;
1333         if (count == 0) {
1334                 mtx_unlock(&vnode_list_mtx);
1335                 return (0);
1336         }
1337         ocount = count;
1338         retried = false;
1339         vp = mvp;
1340         for (;;) {
1341                 vp = TAILQ_NEXT(vp, v_vnodelist);
1342                 if (__predict_false(vp == NULL)) {
1343                         /*
1344                          * The free vnode marker can be past eligible vnodes:
1345                          * 1. if vdbatch_process trylock failed
1346                          * 2. if vtryrecycle failed
1347                          *
1348                          * If so, start the scan from scratch.
1349                          */
1350                         if (!retried && vnlru_read_freevnodes() > 0) {
1351                                 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
1352                                 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
1353                                 vp = mvp;
1354                                 retried = true;
1355                                 continue;
1356                         }
1357
1358                         /*
1359                          * Give up
1360                          */
1361                         TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
1362                         TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist);
1363                         mtx_unlock(&vnode_list_mtx);
1364                         break;
1365                 }
1366                 if (__predict_false(vp->v_type == VMARKER))
1367                         continue;
1368                 if (vp->v_holdcnt > 0) /* held vnodes are not candidates here */
1369                         continue;
1370                 /*
1371                  * Don't recycle if our vnode is from different type
1372                  * of mount point.  Note that mp is type-safe, the
1373                  * check does not reach unmapped address even if
1374                  * vnode is reclaimed.
1375                  */
1376                 if (mnt_op != NULL && (mp = vp->v_mount) != NULL &&
1377                     mp->mnt_op != mnt_op) {
1378                         continue;
1379                 }
1380                 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
1381                         continue;
1382                 }
1383                 if (!vhold_recycle_free(vp))
1384                         continue;
1385                 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
1386                 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
1387                 mtx_unlock(&vnode_list_mtx); /* marker is now parked after vp */
1388                 /*
1389                  * FIXME: ignores the return value, meaning it may be nothing
1390                  * got recycled but it claims otherwise to the caller.
1391                  *
1392                  * Originally the value started being ignored in 2005 with
1393                  * 114a1006a8204aa156e1f9ad6476cdff89cada7f .
1394                  *
1395                  * Respecting the value can run into significant stalls if most
1396                  * vnodes belong to one file system and it has writes
1397                  * suspended.  In presence of many threads and millions of
1398                  * vnodes they keep contending on the vnode_list_mtx lock only
1399                  * to find vnodes they can't recycle.
1400                  *
1401                  * The solution would be to pre-check if the vnode is likely to
1402                  * be recycle-able, but it needs to happen with the
1403                  * vnode_list_mtx lock held. This runs into a problem where
1404                  * VOP_GETWRITEMOUNT (currently needed to find out about if
1405                  * writes are frozen) can take locks which LOR against it.
1406                  *
1407                  * Check nullfs for one example (null_getwritemount).
1408                  */
1409                 vtryrecycle(vp, isvnlru);
1410                 count--; /* counted even if vtryrecycle() failed, see FIXME above */
1411                 if (count == 0) {
1412                         break; /* the list lock is already released here */
1413                 }
1414                 mtx_lock(&vnode_list_mtx);
1415                 vp = mvp;
1416         }
1417         mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
1418         return (ocount - count); /* number of recycle attempts made */
1419 }
1420
1421 /*
1422  * XXX: returns without vnode_list_mtx locked!
1423  */
1424 static int
1425 vnlru_free_locked_direct(int count)
1426 {
1427         int ret;
1428         /* Scan from vnode_list_free_marker, any mount type, isvnlru == false. */
1429         mtx_assert(&vnode_list_mtx, MA_OWNED);
1430         ret = vnlru_free_impl(count, NULL, vnode_list_free_marker, false);
1431         mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
1432         return (ret);
1433 }
1434
1435 static int
1436 vnlru_free_locked_vnlru(int count)
1437 {
1438         int ret;
1439         /* Same scan as vnlru_free_locked_direct(), but with isvnlru == true. */
1440         mtx_assert(&vnode_list_mtx, MA_OWNED);
1441         ret = vnlru_free_impl(count, NULL, vnode_list_free_marker, true);
1442         mtx_assert(&vnode_list_mtx, MA_NOTOWNED); /* the callee dropped the lock */
1443         return (ret);
1444 }
1445
1446 static int
1447 vnlru_free_vnlru(int count)
1448 {
1449         /* Locking wrapper: the callee releases vnode_list_mtx before returning. */
1450         mtx_lock(&vnode_list_mtx);
1451         return (vnlru_free_locked_vnlru(count));
1452 }
1453
1454 void
1455 vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp)
1456 {
1457         /* Scan from mvp, skipping vnodes whose mount uses a different mnt_op. */
1458         MPASS(mnt_op != NULL);
1459         MPASS(mvp != NULL);
1460         VNPASS(mvp->v_type == VMARKER, mvp); /* presumably from vnlru_alloc_marker() */
1461         mtx_lock(&vnode_list_mtx);
1462         vnlru_free_impl(count, mnt_op, mvp, true); /* result is discarded */
1463         mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
1464 }
1465
1466 struct vnode *
1467 vnlru_alloc_marker(void)
1468 {
1469         struct vnode *mvp;
1470         /* Allocate a marker and link it just before vnode_list_free_marker. */
1471         mvp = vn_alloc_marker(NULL);
1472         mtx_lock(&vnode_list_mtx);
1473         TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist);
1474         mtx_unlock(&vnode_list_mtx);
1475         return (mvp);
1476 }
1477
1478 void
1479 vnlru_free_marker(struct vnode *mvp)
1480 {
1481         mtx_lock(&vnode_list_mtx);
1482         TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); /* unlink under the list lock */
1483         mtx_unlock(&vnode_list_mtx);
1484         vn_free_marker(mvp); /* released only after it is off vnode_list */
1485 }
1486
1487 static void
1488 vnlru_recalc(void)
1489 {
1490         /* Derive gapvnodes, vhiwat and vlowat from desiredvnodes and wantfreevnodes. */
1491         mtx_assert(&vnode_list_mtx, MA_OWNED);
1492         gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); /* floor of 100 */
1493         vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
1494         vlowat = vhiwat / 2;
1495 }
1496
1497 /*
1498  * Attempt to recycle vnodes in a context that is always safe to block.
1499  * Calling vlrureclaim() from the bowels of filesystem code has some
1500  * interesting deadlock problems.
1501  */
1502 static struct proc *vnlruproc; /* also the channel for wakeup()/msleep() below */
1503 static int vnlruproc_sig; /* 1 while a kick is pending; cleared in vnlru_proc_sleep() */
1504 static u_long vnlruproc_kicks; /* bumped by vnlru_kick_locked() */
1505
1506 SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, kicks, CTLFLAG_RD, &vnlruproc_kicks, 0,
1507     "Number of times vnlru awakened due to vnode shortage");
1508
1509 #define VNLRU_COUNT_SLOP 100
1510
1511 /*
1512  * The main freevnodes counter is only updated when a counter local to CPU
1513  * diverges from 0 by more than VNLRU_FREEVNODES_SLOP. CPUs are conditionally
1514  * walked to compute a more accurate total.
1515  *
1516  * Note: the actual value at any given moment can still exceed slop, but it
1517  * should not be by significant margin in practice.
1518  */
1519 #define VNLRU_FREEVNODES_SLOP 126 /* compared against an int8_t per-CPU counter */
1520
1521 static void __noinline
1522 vfs_freevnodes_rollup(int8_t *lfreevnodes)
1523 {
1524         /* Fold the per-CPU delta into the global freevnodes and reset it. */
1525         atomic_add_long(&freevnodes, *lfreevnodes);
1526         *lfreevnodes = 0;
1527         critical_exit(); /* pairs with critical_enter() in the inc/dec callers */
1528 }
1529
1530 static __inline void
1531 vfs_freevnodes_inc(void)
1532 {
1533         int8_t *lfreevnodes;
1534         /* Bump this CPU's counter; roll it up when it reaches the slop value. */
1535         critical_enter();
1536         lfreevnodes = PCPU_PTR(vfs_freevnodes);
1537         (*lfreevnodes)++;
1538         if (__predict_false(*lfreevnodes == VNLRU_FREEVNODES_SLOP))
1539                 vfs_freevnodes_rollup(lfreevnodes); /* also exits the critical section */
1540         else
1541                 critical_exit();
1542 }
1543
1544 static __inline void
1545 vfs_freevnodes_dec(void)
1546 {
1547         int8_t *lfreevnodes;
1548         /* Drop this CPU's counter; roll it up when it reaches minus the slop value. */
1549         critical_enter();
1550         lfreevnodes = PCPU_PTR(vfs_freevnodes);
1551         (*lfreevnodes)--;
1552         if (__predict_false(*lfreevnodes == -VNLRU_FREEVNODES_SLOP))
1553                 vfs_freevnodes_rollup(lfreevnodes); /* also exits the critical section */
1554         else
1555                 critical_exit();
1556 }
1557
1558 static u_long
1559 vnlru_read_freevnodes(void)
1560 {
1561         long slop, rfreevnodes, rfreevnodes_old;
1562         int cpu;
1563         /* Estimate the number of free vnodes; the result is clamped at zero. */
1564         rfreevnodes = atomic_load_long(&freevnodes);
1565         rfreevnodes_old = atomic_load_long(&freevnodes_old);
1566         /* slop = |freevnodes - freevnodes_old|, drift since the last full walk. */
1567         if (rfreevnodes > rfreevnodes_old)
1568                 slop = rfreevnodes - rfreevnodes_old;
1569         else
1570                 slop = rfreevnodes_old - rfreevnodes;
1571         if (slop < VNLRU_FREEVNODES_SLOP) /* close enough: skip the per-CPU walk */
1572                 return (rfreevnodes >= 0 ? rfreevnodes : 0);
1573         CPU_FOREACH(cpu) {
1574                 rfreevnodes += cpuid_to_pcpu[cpu]->pc_vfs_freevnodes;
1575         }
1576         atomic_store_long(&freevnodes_old, rfreevnodes); /* publish the refreshed total */
1577         return (rfreevnodes >= 0 ? rfreevnodes : 0); /* local copy: the global may change */
1578 }
1579
1580 static bool
1581 vnlru_under(u_long rnumvnodes, u_long limit)
1582 {
1583         u_long rfreevnodes, space;
1584         /* True if room under desiredvnodes, plus surplus free vnodes, is < limit. */
1585         if (__predict_false(rnumvnodes > desiredvnodes)) /* already over the cap */
1586                 return (true);
1587
1588         space = desiredvnodes - rnumvnodes;
1589         if (space < limit) { /* only consult the free count when it can matter */
1590                 rfreevnodes = vnlru_read_freevnodes();
1591                 if (rfreevnodes > wantfreevnodes)
1592                         space += rfreevnodes - wantfreevnodes;
1593         }
1594         return (space < limit);
1595 }
1596
1597 static void
1598 vnlru_kick_locked(void)
1599 {
1600         /* Wake sleepers on vnlruproc unless a kick is already pending. */
1601         mtx_assert(&vnode_list_mtx, MA_OWNED);
1602         if (vnlruproc_sig == 0) {
1603                 vnlruproc_sig = 1;
1604                 vnlruproc_kicks++;
1605                 wakeup(vnlruproc);
1606         }
1607 }
1608
1609 static void
1610 vnlru_kick_cond(void)
1611 {
1612         /* Kick only when free vnodes are at or below wantfreevnodes. */
1613         if (vnlru_read_freevnodes() > wantfreevnodes)
1614                 return;
1615
1616         if (vnlruproc_sig) /* unlocked peek; vnlru_kick_locked() rechecks under the lock */
1617                 return;
1618         mtx_lock(&vnode_list_mtx);
1619         vnlru_kick_locked();
1620         mtx_unlock(&vnode_list_mtx);
1621 }
1622
1623 static void
1624 vnlru_proc_sleep(void)
1625 {
1626
1627         if (vnlruproc_sig) {
1628                 vnlruproc_sig = 0;
1629                 wakeup(&vnlruproc_sig);
1630         }
1631         msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz);
1632 }
1633
1634 /*
1635  * A lighter version of the machinery below.
1636  *
1637  * Tries to reach goals only by recycling free vnodes and does not invoke
1638  * uma_reclaim(UMA_RECLAIM_DRAIN).
1639  *
1640  * This works around pathological behavior in vnlru in presence of tons of free
1641  * vnodes, but without having to rewrite the machinery at this time. Said
1642  * behavior boils down to continuously trying to reclaim all kinds of vnodes
1643  * (cycling through all levels of "force") when the count is transiently above
1644  * limit. This happens a lot when all vnodes are used up and vn_alloc
1645  * speculatively increments the counter.
1646  *
1647  * Sample testcase: vnode limit 8388608, 20 separate directory trees each with
1648  * 1 million files in total and 20 find(1) processes stating them in parallel
1649  * (one per each tree).
1650  *
1651  * On a kernel with only stock machinery this needs anywhere between 60 and 120
1652  * seconds to execute (time varies *wildly* between runs). With the workaround
1653  * it consistently stays around 20 seconds [it got further down with later
1654  * changes].
1655  *
1656  * That is to say the entire thing needs a fundamental redesign (most notably
1657  * to accommodate faster recycling), the above only tries to get it ouf the way.
1658  *
1659  * Return values are:
1660  * -1 -- fallback to regular vnlru loop
1661  *  0 -- do nothing, go to sleep
1662  * >0 -- recycle this many vnodes
1663  */
1664 static long
1665 vnlru_proc_light_pick(void)
1666 {
1667         u_long rnumvnodes, rfreevnodes;
1668
1669         if (vstir || vnlruproc_sig == 1)
1670                 return (-1);
1671
1672         rnumvnodes = atomic_load_long(&numvnodes);
1673         rfreevnodes = vnlru_read_freevnodes();
1674
1675         /*
1676          * vnode limit might have changed and now we may be at a significant
1677          * excess. Bail if we can't sort it out with free vnodes.
1678          *
1679          * Due to atomic updates the count can legitimately go above
1680          * the limit for a short period, don't bother doing anything in
1681          * that case.
1682          */
1683         if (rnumvnodes > desiredvnodes + VNLRU_COUNT_SLOP + 10) {
1684                 if (rnumvnodes - rfreevnodes >= desiredvnodes ||
1685                     rfreevnodes <= wantfreevnodes) {
1686                         return (-1);
1687                 }
1688
1689                 return (rnumvnodes - desiredvnodes);
1690         }
1691
1692         /*
1693          * Don't try to reach wantfreevnodes target if there are too few vnodes
1694          * to begin with.
1695          */
1696         if (rnumvnodes < wantfreevnodes) {
1697                 return (0);
1698         }
1699
1700         if (rfreevnodes < wantfreevnodes) {
1701                 return (-1);
1702         }
1703
1704         return (0);
1705 }
1706
1707 static bool
1708 vnlru_proc_light(void)
1709 {
1710         long freecount;
1711
1712         mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
1713
1714         freecount = vnlru_proc_light_pick();
1715         if (freecount == -1)
1716                 return (false);
1717
1718         if (freecount != 0) {
1719                 vnlru_free_vnlru(freecount);
1720         }
1721
1722         mtx_lock(&vnode_list_mtx);
1723         vnlru_proc_sleep();
1724         mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
1725         return (true);
1726 }
1727
1728 static u_long uma_reclaim_calls;
1729 SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, uma_reclaim_calls, CTLFLAG_RD | CTLFLAG_STATS,
1730     &uma_reclaim_calls, 0, "Number of calls to uma_reclaim");
1731
1732 static void
1733 vnlru_proc(void)
1734 {
1735         u_long rnumvnodes, rfreevnodes, target;
1736         unsigned long onumvnodes;
1737         int done, force, trigger, usevnodes;
1738         bool reclaim_nc_src, want_reread;
1739
1740         EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
1741             SHUTDOWN_PRI_FIRST);
1742
1743         force = 0;
1744         want_reread = false;
1745         for (;;) {
1746                 kproc_suspend_check(vnlruproc);
1747
1748                 if (force == 0 && vnlru_proc_light())
1749                         continue;
1750
1751                 mtx_lock(&vnode_list_mtx);
1752                 rnumvnodes = atomic_load_long(&numvnodes);
1753
1754                 if (want_reread) {
1755                         force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
1756                         want_reread = false;
1757                 }
1758
1759                 /*
1760                  * If numvnodes is too large (due to desiredvnodes being
1761                  * adjusted using its sysctl, or emergency growth), first
1762                  * try to reduce it by discarding free vnodes.
1763                  */
1764                 if (rnumvnodes > desiredvnodes + 10) {
1765                         vnlru_free_locked_vnlru(rnumvnodes - desiredvnodes);
1766                         mtx_lock(&vnode_list_mtx);
1767                         rnumvnodes = atomic_load_long(&numvnodes);
1768                 }
1769                 /*
1770                  * Sleep if the vnode cache is in a good state.  This is
1771                  * when it is not over-full and has space for about a 4%
1772                  * or 9% expansion (by growing its size or inexcessively
1773                  * reducing free vnode count).  Otherwise, try to reclaim
1774                  * space for a 10% expansion.
1775                  */
1776                 if (vstir && force == 0) {
1777                         force = 1;
1778                         vstir = false;
1779                 }
1780                 if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) {
1781                         vnlru_proc_sleep();
1782                         continue;
1783                 }
1784                 rfreevnodes = vnlru_read_freevnodes();
1785
1786                 onumvnodes = rnumvnodes;
1787                 /*
1788                  * Calculate parameters for recycling.  These are the same
1789                  * throughout the loop to give some semblance of fairness.
1790                  * The trigger point is to avoid recycling vnodes with lots
1791                  * of resident pages.  We aren't trying to free memory; we
1792                  * are trying to recycle or at least free vnodes.
1793                  */
1794                 if (rnumvnodes <= desiredvnodes)
1795                         usevnodes = rnumvnodes - rfreevnodes;
1796                 else
1797                         usevnodes = rnumvnodes;
1798                 if (usevnodes <= 0)
1799                         usevnodes = 1;
1800                 /*
1801                  * The trigger value is chosen to give a conservatively
1802                  * large value to ensure that it alone doesn't prevent
1803                  * making progress.  The value can easily be so large that
1804                  * it is effectively infinite in some congested and
1805                  * misconfigured cases, and this is necessary.  Normally
1806                  * it is about 8 to 100 (pages), which is quite large.
1807                  */
1808                 trigger = vm_cnt.v_page_count * 2 / usevnodes;
1809                 if (force < 2)
1810                         trigger = vsmalltrigger;
1811                 reclaim_nc_src = force >= 3;
1812                 target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1);
1813                 target = target / 10 + 1;
1814                 done = vlrureclaim(reclaim_nc_src, trigger, target);
1815                 mtx_unlock(&vnode_list_mtx);
1816                 /*
1817                  * Total number of vnodes can transiently go slightly above the
1818                  * limit (see vn_alloc_hard), no need to call uma_reclaim if
1819                  * this happens.
1820                  */
1821                 if (onumvnodes + VNLRU_COUNT_SLOP + 1000 > desiredvnodes &&
1822                     numvnodes <= desiredvnodes) {
1823                         uma_reclaim_calls++;
1824                         uma_reclaim(UMA_RECLAIM_DRAIN);
1825                 }
1826                 if (done == 0) {
1827                         if (force == 0 || force == 1) {
1828                                 force = 2;
1829                                 continue;
1830                         }
1831                         if (force == 2) {
1832                                 force = 3;
1833                                 continue;
1834                         }
1835                         want_reread = true;
1836                         force = 0;
1837                         vnlru_nowhere++;
1838                         tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
1839                 } else {
1840                         want_reread = true;
1841                         kern_yield(PRI_USER);
1842                 }
1843         }
1844 }
1845
1846 static struct kproc_desc vnlru_kp = {
1847         "vnlru",
1848         vnlru_proc,
1849         &vnlruproc
1850 };
1851 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
1852     &vnlru_kp);
1853
1854 /*
1855  * Routines having to do with the management of the vnode table.
1856  */
1857
1858 /*
1859  * Try to recycle a freed vnode.
1860  */
1861 static int
1862 vtryrecycle(struct vnode *vp, bool isvnlru)
1863 {
1864         struct mount *vnmp;
1865
1866         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
1867         VNPASS(vp->v_holdcnt > 0, vp);
1868         /*
1869          * This vnode may found and locked via some other list, if so we
1870          * can't recycle it yet.
1871          */
1872         if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
1873                 CTR2(KTR_VFS,
1874                     "%s: impossible to recycle, vp %p lock is already held",
1875                     __func__, vp);
1876                 vdrop_recycle(vp);
1877                 return (EWOULDBLOCK);
1878         }
1879         /*
1880          * Don't recycle if its filesystem is being suspended.
1881          */
1882         if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
1883                 VOP_UNLOCK(vp);
1884                 CTR2(KTR_VFS,
1885                     "%s: impossible to recycle, cannot start the write for %p",
1886                     __func__, vp);
1887                 vdrop_recycle(vp);
1888                 return (EBUSY);
1889         }
1890         /*
1891          * If we got this far, we need to acquire the interlock and see if
1892          * anyone picked up this vnode from another list.  If not, we will
1893          * mark it with DOOMED via vgonel() so that anyone who does find it
1894          * will skip over it.
1895          */
1896         VI_LOCK(vp);
1897         if (vp->v_usecount) {
1898                 VOP_UNLOCK(vp);
1899                 vdropl_recycle(vp);
1900                 vn_finished_write(vnmp);
1901                 CTR2(KTR_VFS,
1902                     "%s: impossible to recycle, %p is already referenced",
1903                     __func__, vp);
1904                 return (EBUSY);
1905         }
1906         if (!VN_IS_DOOMED(vp)) {
1907                 if (isvnlru)
1908                         recycles_free_count++;
1909                 else
1910                         counter_u64_add(direct_recycles_free_count, 1);
1911                 vgonel(vp);
1912         }
1913         VOP_UNLOCK(vp);
1914         vdropl_recycle(vp);
1915         vn_finished_write(vnmp);
1916         return (0);
1917 }
1918
1919 /*
1920  * Allocate a new vnode.
1921  *
1922  * The operation never returns an error. Returning an error was disabled
1923  * in r145385 (dated 2005) with the following comment:
1924  *
1925  * XXX Not all VFS_VGET/ffs_vget callers check returns.
1926  *
1927  * Given the age of this commit (almost 15 years at the time of writing this
1928  * comment) restoring the ability to fail requires a significant audit of
1929  * all codepaths.
1930  *
1931  * The routine can try to free a vnode or stall for up to 1 second waiting for
1932  * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation.
1933  */
1934 static u_long vn_alloc_cyclecount;
1935 static u_long vn_alloc_sleeps;
1936
1937 SYSCTL_ULONG(_vfs_vnode_stats, OID_AUTO, alloc_sleeps, CTLFLAG_RD, &vn_alloc_sleeps, 0,
1938     "Number of times vnode allocation blocked waiting on vnlru");
1939
1940 static struct vnode * __noinline
1941 vn_alloc_hard(struct mount *mp, u_long rnumvnodes, bool bumped)
1942 {
1943         u_long rfreevnodes;
1944
1945         if (bumped) {
1946                 if (rnumvnodes > desiredvnodes + VNLRU_COUNT_SLOP) {
1947                         atomic_subtract_long(&numvnodes, 1);
1948                         bumped = false;
1949                 }
1950         }
1951
1952         mtx_lock(&vnode_list_mtx);
1953
1954         if (vn_alloc_cyclecount != 0) {
1955                 rnumvnodes = atomic_load_long(&numvnodes);
1956                 if (rnumvnodes + 1 < desiredvnodes) {
1957                         vn_alloc_cyclecount = 0;
1958                         mtx_unlock(&vnode_list_mtx);
1959                         goto alloc;
1960                 }
1961
1962                 rfreevnodes = vnlru_read_freevnodes();
1963                 if (rfreevnodes < wantfreevnodes) {
1964                         if (vn_alloc_cyclecount++ >= rfreevnodes) {
1965                                 vn_alloc_cyclecount = 0;
1966                                 vstir = true;
1967                         }
1968                 } else {
1969                         vn_alloc_cyclecount = 0;
1970                 }
1971         }
1972
1973         /*
1974          * Grow the vnode cache if it will not be above its target max after
1975          * growing.  Otherwise, if there is at least one free vnode, try to
1976          * reclaim 1 item from it before growing the cache (possibly above its
1977          * target max if the reclamation failed or is delayed).
1978          */
1979         if (vnlru_free_locked_direct(1) > 0)
1980                 goto alloc;
1981         mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
1982         if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
1983                 /*
1984                  * Wait for space for a new vnode.
1985                  */
1986                 if (bumped) {
1987                         atomic_subtract_long(&numvnodes, 1);
1988                         bumped = false;
1989                 }
1990                 mtx_lock(&vnode_list_mtx);
1991                 vnlru_kick_locked();
1992                 vn_alloc_sleeps++;
1993                 msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz);
1994                 if (atomic_load_long(&numvnodes) + 1 > desiredvnodes &&
1995                     vnlru_read_freevnodes() > 1)
1996                         vnlru_free_locked_direct(1);
1997                 else
1998                         mtx_unlock(&vnode_list_mtx);
1999         }
2000 alloc:
2001         mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
2002         if (!bumped)
2003                 atomic_add_long(&numvnodes, 1);
2004         vnlru_kick_cond();
2005         return (uma_zalloc_smr(vnode_zone, M_WAITOK));
2006 }
2007
2008 static struct vnode *
2009 vn_alloc(struct mount *mp)
2010 {
2011         u_long rnumvnodes;
2012
2013         if (__predict_false(vn_alloc_cyclecount != 0))
2014                 return (vn_alloc_hard(mp, 0, false));
2015         rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
2016         if (__predict_false(vnlru_under(rnumvnodes, vlowat))) {
2017                 return (vn_alloc_hard(mp, rnumvnodes, true));
2018         }
2019
2020         return (uma_zalloc_smr(vnode_zone, M_WAITOK));
2021 }
2022
2023 static void
2024 vn_free(struct vnode *vp)
2025 {
2026
2027         atomic_subtract_long(&numvnodes, 1);
2028         uma_zfree_smr(vnode_zone, vp);
2029 }
2030
2031 /*
2032  * Allocate a new vnode.
2033  */
2034 int
2035 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
2036     struct vnode **vpp)
2037 {
2038         struct vnode *vp;
2039         struct thread *td;
2040         struct lock_object *lo;
2041
2042         CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
2043
2044         KASSERT(vops->registered,
2045             ("%s: not registered vector op %p\n", __func__, vops));
2046         cache_validate_vop_vector(mp, vops);
2047
2048         td = curthread;
2049         if (td->td_vp_reserved != NULL) {
2050                 vp = td->td_vp_reserved;
2051                 td->td_vp_reserved = NULL;
2052         } else {
2053                 vp = vn_alloc(mp);
2054         }
2055         counter_u64_add(vnodes_created, 1);
2056
2057         vn_set_state(vp, VSTATE_UNINITIALIZED);
2058
2059         /*
2060          * Locks are given the generic name "vnode" when created.
2061          * Follow the historic practice of using the filesystem
2062          * name when they allocated, e.g., "zfs", "ufs", "nfs, etc.
2063          *
2064          * Locks live in a witness group keyed on their name. Thus,
2065          * when a lock is renamed, it must also move from the witness
2066          * group of its old name to the witness group of its new name.
2067          *
2068          * The change only needs to be made when the vnode moves
2069          * from one filesystem type to another. We ensure that each
2070          * filesystem use a single static name pointer for its tag so
2071          * that we can compare pointers rather than doing a strcmp().
2072          */
2073         lo = &vp->v_vnlock->lock_object;
2074 #ifdef WITNESS
2075         if (lo->lo_name != tag) {
2076 #endif
2077                 lo->lo_name = tag;
2078 #ifdef WITNESS
2079                 WITNESS_DESTROY(lo);
2080                 WITNESS_INIT(lo, tag);
2081         }
2082 #endif
2083         /*
2084          * By default, don't allow shared locks unless filesystems opt-in.
2085          */
2086         vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
2087         /*
2088          * Finalize various vnode identity bits.
2089          */
2090         KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
2091         KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
2092         KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
2093         vp->v_type = VNON;
2094         vp->v_op = vops;
2095         vp->v_irflag = 0;
2096         v_init_counters(vp);
2097         vn_seqc_init(vp);
2098         vp->v_bufobj.bo_ops = &buf_ops_bio;
2099 #ifdef DIAGNOSTIC
2100         if (mp == NULL && vops != &dead_vnodeops)
2101                 printf("NULL mp in getnewvnode(9), tag %s\n", tag);
2102 #endif
2103 #ifdef MAC
2104         mac_vnode_init(vp);
2105         if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
2106                 mac_vnode_associate_singlelabel(mp, vp);
2107 #endif
2108         if (mp != NULL) {
2109                 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
2110         }
2111
2112         /*
2113          * For the filesystems which do not use vfs_hash_insert(),
2114          * still initialize v_hash to have vfs_hash_index() useful.
2115          * E.g., nullfs uses vfs_hash_index() on the lower vnode for
2116          * its own hashing.
2117          */
2118         vp->v_hash = (uintptr_t)vp >> vnsz2log;
2119
2120         *vpp = vp;
2121         return (0);
2122 }
2123
2124 void
2125 getnewvnode_reserve(void)
2126 {
2127         struct thread *td;
2128
2129         td = curthread;
2130         MPASS(td->td_vp_reserved == NULL);
2131         td->td_vp_reserved = vn_alloc(NULL);
2132 }
2133
2134 void
2135 getnewvnode_drop_reserve(void)
2136 {
2137         struct thread *td;
2138
2139         td = curthread;
2140         if (td->td_vp_reserved != NULL) {
2141                 vn_free(td->td_vp_reserved);
2142                 td->td_vp_reserved = NULL;
2143         }
2144 }
2145
2146 static void __noinline
2147 freevnode(struct vnode *vp)
2148 {
2149         struct bufobj *bo;
2150
2151         /*
2152          * The vnode has been marked for destruction, so free it.
2153          *
2154          * The vnode will be returned to the zone where it will
2155          * normally remain until it is needed for another vnode. We
2156          * need to cleanup (or verify that the cleanup has already
2157          * been done) any residual data left from its current use
2158          * so as not to contaminate the freshly allocated vnode.
2159          */
2160         CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
2161         /*
2162          * Paired with vgone.
2163          */
2164         vn_seqc_write_end_free(vp);
2165
2166         bo = &vp->v_bufobj;
2167         VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
2168         VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
2169         VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
2170         VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
2171         VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
2172         VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
2173         VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
2174             ("clean blk trie not empty"));
2175         VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
2176         VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
2177             ("dirty blk trie not empty"));
2178         VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
2179             ("Dangling rangelock waiters"));
2180         VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp,
2181             ("Leaked inactivation"));
2182         VI_UNLOCK(vp);
2183         cache_assert_no_entries(vp);
2184
2185 #ifdef MAC
2186         mac_vnode_destroy(vp);
2187 #endif
2188         if (vp->v_pollinfo != NULL) {
2189                 /*
2190                  * Use LK_NOWAIT to shut up witness about the lock. We may get
2191                  * here while having another vnode locked when trying to
2192                  * satisfy a lookup and needing to recycle.
2193                  */
2194                 VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT);
2195                 destroy_vpollinfo(vp->v_pollinfo);
2196                 VOP_UNLOCK(vp);
2197                 vp->v_pollinfo = NULL;
2198         }
2199         vp->v_mountedhere = NULL;
2200         vp->v_unpcb = NULL;
2201         vp->v_rdev = NULL;
2202         vp->v_fifoinfo = NULL;
2203         vp->v_iflag = 0;
2204         vp->v_vflag = 0;
2205         bo->bo_flag = 0;
2206         vn_free(vp);
2207 }
2208
2209 /*
2210  * Delete from old mount point vnode list, if on one.
2211  */
2212 static void
2213 delmntque(struct vnode *vp)
2214 {
2215         struct mount *mp;
2216
2217         VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
2218
2219         mp = vp->v_mount;
2220         MNT_ILOCK(mp);
2221         VI_LOCK(vp);
2222         vp->v_mount = NULL;
2223         VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
2224                 ("bad mount point vnode list size"));
2225         TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2226         mp->mnt_nvnodelistsize--;
2227         MNT_REL(mp);
2228         MNT_IUNLOCK(mp);
2229         /*
2230          * The caller expects the interlock to be still held.
2231          */
2232         ASSERT_VI_LOCKED(vp, __func__);
2233 }
2234
2235 static int
2236 insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr)
2237 {
2238
2239         KASSERT(vp->v_mount == NULL,
2240                 ("insmntque: vnode already on per mount vnode list"));
2241         VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
2242         if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) {
2243                 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
2244         } else {
2245                 KASSERT(!dtr,
2246                     ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup",
2247                     __func__));
2248         }
2249
2250         /*
2251          * We acquire the vnode interlock early to ensure that the
2252          * vnode cannot be recycled by another process releasing a
2253          * holdcnt on it before we get it on both the vnode list
2254          * and the active vnode list. The mount mutex protects only
2255          * manipulation of the vnode list and the vnode freelist
2256          * mutex protects only manipulation of the active vnode list.
2257          * Hence the need to hold the vnode interlock throughout.
2258          */
2259         MNT_ILOCK(mp);
2260         VI_LOCK(vp);
2261         if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 &&
2262             ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
2263             mp->mnt_nvnodelistsize == 0)) &&
2264             (vp->v_vflag & VV_FORCEINSMQ) == 0) {
2265                 VI_UNLOCK(vp);
2266                 MNT_IUNLOCK(mp);
2267                 if (dtr) {
2268                         vp->v_data = NULL;
2269                         vp->v_op = &dead_vnodeops;
2270                         vgone(vp);
2271                         vput(vp);
2272                 }
2273                 return (EBUSY);
2274         }
2275         vp->v_mount = mp;
2276         MNT_REF(mp);
2277         TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2278         VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
2279                 ("neg mount point vnode list size"));
2280         mp->mnt_nvnodelistsize++;
2281         VI_UNLOCK(vp);
2282         MNT_IUNLOCK(mp);
2283         return (0);
2284 }
2285
2286 /*
2287  * Insert into list of vnodes for the new mount point, if available.
2288  * insmntque() reclaims the vnode on insertion failure, insmntque1()
2289  * leaves handling of the vnode to the caller.
2290  */
2291 int
2292 insmntque(struct vnode *vp, struct mount *mp)
2293 {
2294         return (insmntque1_int(vp, mp, true));
2295 }
2296
2297 int
2298 insmntque1(struct vnode *vp, struct mount *mp)
2299 {
2300         return (insmntque1_int(vp, mp, false));
2301 }
2302
2303 /*
2304  * Flush out and invalidate all buffers associated with a bufobj
2305  * Called with the underlying object locked.
2306  */
2307 int
2308 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
2309 {
2310         int error;
2311
2312         BO_LOCK(bo);
2313         if (flags & V_SAVE) {
2314                 error = bufobj_wwait(bo, slpflag, slptimeo);
2315                 if (error) {
2316                         BO_UNLOCK(bo);
2317                         return (error);
2318                 }
2319                 if (bo->bo_dirty.bv_cnt > 0) {
2320                         BO_UNLOCK(bo);
2321                         do {
2322                                 error = BO_SYNC(bo, MNT_WAIT);
2323                         } while (error == ERELOOKUP);
2324                         if (error != 0)
2325                                 return (error);
2326                         BO_LOCK(bo);
2327                         if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) {
2328                                 BO_UNLOCK(bo);
2329                                 return (EBUSY);
2330                         }
2331                 }
2332         }
2333         /*
2334          * If you alter this loop please notice that interlock is dropped and
2335          * reacquired in flushbuflist.  Special care is needed to ensure that
2336          * no race conditions occur from this.
2337          */
2338         do {
2339                 error = flushbuflist(&bo->bo_clean,
2340                     flags, bo, slpflag, slptimeo);
2341                 if (error == 0 && !(flags & V_CLEANONLY))
2342                         error = flushbuflist(&bo->bo_dirty,
2343                             flags, bo, slpflag, slptimeo);
2344                 if (error != 0 && error != EAGAIN) {
2345                         BO_UNLOCK(bo);
2346                         return (error);
2347                 }
2348         } while (error != 0);
2349
2350         /*
2351          * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
2352          * have write I/O in-progress but if there is a VM object then the
2353          * VM object can also have read-I/O in-progress.
2354          */
2355         do {
2356                 bufobj_wwait(bo, 0, 0);
2357                 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) {
2358                         BO_UNLOCK(bo);
2359                         vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx");
2360                         BO_LOCK(bo);
2361                 }
2362         } while (bo->bo_numoutput > 0);
2363         BO_UNLOCK(bo);
2364
2365         /*
2366          * Destroy the copy in the VM cache, too.
2367          */
2368         if (bo->bo_object != NULL &&
2369             (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) {
2370                 VM_OBJECT_WLOCK(bo->bo_object);
2371                 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
2372                     OBJPR_CLEANONLY : 0);
2373                 VM_OBJECT_WUNLOCK(bo->bo_object);
2374         }
2375
2376 #ifdef INVARIANTS
2377         BO_LOCK(bo);
2378         if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO |
2379             V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 ||
2380             bo->bo_clean.bv_cnt > 0))
2381                 panic("vinvalbuf: flush failed");
2382         if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 &&
2383             bo->bo_dirty.bv_cnt > 0)
2384                 panic("vinvalbuf: flush dirty failed");
2385         BO_UNLOCK(bo);
2386 #endif
2387         return (0);
2388 }
2389
2390 /*
2391  * Flush out and invalidate all buffers associated with a vnode.
2392  * Called with the underlying object locked.
2393  */
2394 int
2395 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
2396 {
2397
2398         CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2399         ASSERT_VOP_LOCKED(vp, "vinvalbuf");
2400         if (vp->v_object != NULL && vp->v_object->handle != vp)
2401                 return (0);
2402         return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
2403 }
2404
/*
 * Flush out buffers on the specified list.
 *
 * Walks the buffers queued on bufv, skipping those excluded by the
 * V_NORMAL / V_ALT selection in flags.  Every selected buffer is locked and
 * then either written asynchronously (delayed-write buffer that is not
 * already invalid, with V_SAVE set in flags) or marked B_INVAL | B_RELBUF
 * and released.
 *
 * The bufobj lock must be write-held on entry (asserted).  It is passed as
 * the interlock to the buffer lock call and re-acquired with BO_LOCK()
 * before every return that follows.
 *
 * Returns 0 if no buffer was selected, EAGAIN once at least one buffer has
 * been processed (presumably so the caller rescans the list -- confirm
 * against the caller), or the error from BUF_TIMELOCK() with ENOLCK mapped
 * to EAGAIN.
 */
static int
flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
    int slptimeo)
{
        struct buf *bp, *nbp;
        int retval, error;
        daddr_t lblkno;
        b_xflags_t xflags;

        ASSERT_BO_WLOCKED(bo);

        retval = 0;
        TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
                /*
                 * If we are flushing both V_NORMAL and V_ALT buffers then
                 * do not skip any buffers. If we are flushing only V_NORMAL
                 * buffers then skip buffers marked as BX_ALTDATA. If we are
                 * flushing only V_ALT buffers then skip buffers not marked
                 * as BX_ALTDATA.
                 */
                if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) &&
                   (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) ||
                    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) {
                        continue;
                }
                /*
                 * Remember the identity (block number and list membership)
                 * of the successor now, while the bufobj lock is still
                 * held; it is re-validated after the lock is re-acquired.
                 * lblkno and xflags are only consumed when nbp != NULL.
                 */
                if (nbp != NULL) {
                        lblkno = nbp->b_lblkno;
                        xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
                }
                /* From here on at least one buffer has been acted upon. */
                retval = EAGAIN;
                error = BUF_TIMELOCK(bp,
                    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
                    "flushbuf", slpflag, slptimeo);
                if (error) {
                        BO_LOCK(bo);
                        return (error != ENOLCK ? error : EAGAIN);
                }
                KASSERT(bp->b_bufobj == bo,
                    ("bp %p wrong b_bufobj %p should be %p",
                    bp, bp->b_bufobj, bo));
                /*
                 * XXX Since there are no node locks for NFS, I
                 * believe there is a slight chance that a delayed
                 * write will occur while sleeping just above, so
                 * check for it.
                 */
                if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
                    (flags & V_SAVE)) {
                        bremfree(bp);
                        bp->b_flags |= B_ASYNC;
                        bwrite(bp);
                        BO_LOCK(bo);
                        return (EAGAIN);        /* XXX: why not loop ? */
                }
                /* Discard the buffer contents and release it. */
                bremfree(bp);
                bp->b_flags |= (B_INVAL | B_RELBUF);
                bp->b_flags &= ~B_ASYNC;
                brelse(bp);
                BO_LOCK(bo);
                if (nbp == NULL)
                        break;
                /*
                 * Look the successor up again by block number and stop the
                 * walk if it is gone or has moved to a different list.
                 */
                nbp = gbincore(bo, lblkno);
                if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
                    != xflags)
                        break;                  /* nbp invalid */
        }
        return (retval);
}
2477
2478 int
2479 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
2480 {
2481         struct buf *bp;
2482         int error;
2483         daddr_t lblkno;
2484
2485         ASSERT_BO_LOCKED(bo);
2486
2487         for (lblkno = startn;;) {
2488 again:
2489                 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
2490                 if (bp == NULL || bp->b_lblkno >= endn ||
2491                     bp->b_lblkno < startn)
2492                         break;
2493                 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
2494                     LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
2495                 if (error != 0) {
2496                         BO_RLOCK(bo);
2497                         if (error == ENOLCK)
2498                                 goto again;
2499                         return (error);
2500                 }
2501                 KASSERT(bp->b_bufobj == bo,
2502                     ("bp %p wrong b_bufobj %p should be %p",
2503                     bp, bp->b_bufobj, bo));
2504                 lblkno = bp->b_lblkno + 1;
2505                 if ((bp->b_flags & B_MANAGED) == 0)
2506                         bremfree(bp);
2507                 bp->b_flags |= B_RELBUF;
2508                 /*
2509                  * In the VMIO case, use the B_NOREUSE flag to hint that the
2510                  * pages backing each buffer in the range are unlikely to be
2511                  * reused.  Dirty buffers will have the hint applied once
2512                  * they've been written.
2513                  */
2514                 if ((bp->b_flags & B_VMIO) != 0)
2515                         bp->b_flags |= B_NOREUSE;
2516                 brelse(bp);
2517                 BO_RLOCK(bo);
2518         }
2519         return (0);
2520 }
2521
/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 *
 * vp must be locked (asserted).  length is the new size in bytes and
 * blksize the block size used to convert it to a logical block number.
 * Buffers from the first block at or beyond the new length are invalidated,
 * then, if length > 0, dirty buffers with a non-positive logical block
 * number are written out, pending writes are waited for, and the pager is
 * told the new size.  Always returns 0.
 */
int
vtruncbuf(struct vnode *vp, off_t length, int blksize)
{
        struct buf *bp, *nbp;
        struct bufobj *bo;
        daddr_t startlbn;

        CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__,
            vp, blksize, (uintmax_t)length);

        /*
         * Round up to the *next* lbn.
         */
        startlbn = howmany(length, blksize);

        ASSERT_VOP_LOCKED(vp, "vtruncbuf");

        bo = &vp->v_bufobj;
restart_unlocked:
        /* Entered (or re-entered) without the bufobj lock held. */
        BO_LOCK(bo);

        /* Retry until a complete pass over [startlbn, INT64_MAX) succeeds. */
        while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN)
                ;

        if (length > 0) {
restartsync:
                /*
                 * Push out dirty buffers whose block number is <= 0.
                 * NOTE(review): presumably these are metadata blocks
                 * (e.g. indirect blocks) -- confirm.
                 */
                TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
                        if (bp->b_lblkno > 0)
                                continue;
                        /*
                         * Since we hold the vnode lock this should only
                         * fail if we're racing with the buf daemon.
                         */
                        if (BUF_LOCK(bp,
                            LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
                            BO_LOCKPTR(bo)) == ENOLCK)
                                goto restart_unlocked;

                        VNASSERT((bp->b_flags & B_DELWRI), vp,
                            ("buf(%p) on dirty queue without DELWRI", bp));

                        bremfree(bp);
                        bawrite(bp);
                        /*
                         * The bufobj lock was given up as the interlock;
                         * retake it and rescan the dirty list from the top.
                         */
                        BO_LOCK(bo);
                        goto restartsync;
                }
        }

        bufobj_wwait(bo, 0, 0);
        BO_UNLOCK(bo);
        vnode_pager_setsize(vp, length);

        return (0);
}
2581
2582 /*
2583  * Invalidate the cached pages of a file's buffer within the range of block
2584  * numbers [startlbn, endlbn).
2585  */
2586 void
2587 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
2588     int blksize)
2589 {
2590         struct bufobj *bo;
2591         off_t start, end;
2592
2593         ASSERT_VOP_LOCKED(vp, "v_inval_buf_range");
2594
2595         start = blksize * startlbn;
2596         end = blksize * endlbn;
2597
2598         bo = &vp->v_bufobj;
2599         BO_LOCK(bo);
2600         MPASS(blksize == bo->bo_bsize);
2601
2602         while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN)
2603                 ;
2604
2605         BO_UNLOCK(bo);
2606         vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1));
2607 }
2608
/*
 * Invalidate and release all clean and dirty buffers of bo whose logical
 * block number lies in [startlbn, endlbn).
 *
 * vp must be locked and the bufobj lock held on entry (both asserted); the
 * bufobj lock is held again on every return.  Because that lock is handed
 * over as the interlock while each buffer is locked, the lists can change
 * underneath the walk.
 *
 * Returns EAGAIN when a buffer lock attempt yields ENOLCK or when the saved
 * successor no longer looks like a member of the list being walked; the
 * caller is expected to call again.  Returns 0 once a full pass over both
 * lists frees nothing.
 */
static int
v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
    daddr_t startlbn, daddr_t endlbn)
{
        struct buf *bp, *nbp;
        bool anyfreed;

        ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked");
        ASSERT_BO_LOCKED(bo);

        do {
                anyfreed = false;
                /* Pass over the clean list. */
                TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
                        if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
                                continue;
                        if (BUF_LOCK(bp,
                            LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
                            BO_LOCKPTR(bo)) == ENOLCK) {
                                BO_LOCK(bo);
                                return (EAGAIN);
                        }

                        bremfree(bp);
                        bp->b_flags |= B_INVAL | B_RELBUF;
                        bp->b_flags &= ~B_ASYNC;
                        brelse(bp);
                        anyfreed = true;

                        BO_LOCK(bo);
                        /*
                         * The successor was sampled before the lock was
                         * dropped; bail out if it is no longer a clean
                         * buffer of this vnode.
                         */
                        if (nbp != NULL &&
                            (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
                            nbp->b_vp != vp ||
                            (nbp->b_flags & B_DELWRI) != 0))
                                return (EAGAIN);
                }

                /* Pass over the dirty list. */
                TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
                        if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
                                continue;
                        if (BUF_LOCK(bp,
                            LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
                            BO_LOCKPTR(bo)) == ENOLCK) {
                                BO_LOCK(bo);
                                return (EAGAIN);
                        }
                        bremfree(bp);
                        bp->b_flags |= B_INVAL | B_RELBUF;
                        bp->b_flags &= ~B_ASYNC;
                        brelse(bp);
                        anyfreed = true;

                        BO_LOCK(bo);
                        /*
                         * Same check as above, for the dirty list: the
                         * successor must still be a dirty, delayed-write
                         * buffer of this vnode.
                         */
                        if (nbp != NULL &&
                            (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
                            (nbp->b_vp != vp) ||
                            (nbp->b_flags & B_DELWRI) == 0))
                                return (EAGAIN);
                }
        } while (anyfreed);
        return (0);
}
2670
2671 static void
2672 buf_vlist_remove(struct buf *bp)
2673 {
2674         struct bufv *bv;
2675         b_xflags_t flags;
2676
2677         flags = bp->b_xflags;
2678
2679         KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
2680         ASSERT_BO_WLOCKED(bp->b_bufobj);
2681         KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 &&
2682             (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN),
2683             ("%s: buffer %p has invalid queue state", __func__, bp));
2684
2685         if ((flags & BX_VNDIRTY) != 0)
2686                 bv = &bp->b_bufobj->bo_dirty;
2687         else
2688                 bv = &bp->b_bufobj->bo_clean;
2689         BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
2690         TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
2691         bv->bv_cnt--;
2692         bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
2693 }
2694
2695 /*
2696  * Add the buffer to the sorted clean or dirty block list.
2697  *
2698  * NOTE: xflags is passed as a constant, optimizing this inline function!
2699  */
2700 static void
2701 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
2702 {
2703         struct bufv *bv;
2704         struct buf *n;
2705         int error;
2706
2707         ASSERT_BO_WLOCKED(bo);
2708         KASSERT((bo->bo_flag & BO_NOBUFS) == 0,
2709             ("buf_vlist_add: bo %p does not allow bufs", bo));
2710         KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
2711             ("dead bo %p", bo));
2712         KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
2713             ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
2714         bp->b_xflags |= xflags;
2715         if (xflags & BX_VNDIRTY)
2716                 bv = &bo->bo_dirty;
2717         else
2718                 bv = &bo->bo_clean;
2719
2720         /*
2721          * Keep the list ordered.  Optimize empty list insertion.  Assume
2722          * we tend to grow at the tail so lookup_le should usually be cheaper
2723          * than _ge.
2724          */
2725         if (bv->bv_cnt == 0 ||
2726             bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
2727                 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
2728         else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
2729                 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
2730         else
2731                 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
2732         error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
2733         if (error)
2734                 panic("buf_vlist_add:  Preallocated nodes insufficient.");
2735         bv->bv_cnt++;
2736 }
2737
2738 /*
2739  * Look up a buffer using the buffer tries.
2740  */
2741 struct buf *
2742 gbincore(struct bufobj *bo, daddr_t lblkno)
2743 {
2744         struct buf *bp;
2745
2746         ASSERT_BO_LOCKED(bo);
2747         bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
2748         if (bp != NULL)
2749                 return (bp);
2750         return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno));
2751 }
2752
2753 /*
2754  * Look up a buf using the buffer tries, without the bufobj lock.  This relies
2755  * on SMR for safe lookup, and bufs being in a no-free zone to provide type
2756  * stability of the result.  Like other lockless lookups, the found buf may
2757  * already be invalid by the time this function returns.
2758  */
2759 struct buf *
2760 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno)
2761 {
2762         struct buf *bp;
2763
2764         ASSERT_BO_UNLOCKED(bo);
2765         bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno);
2766         if (bp != NULL)
2767                 return (bp);
2768         return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno));
2769 }
2770
2771 /*
2772  * Associate a buffer with a vnode.
2773  */
2774 void
2775 bgetvp(struct vnode *vp, struct buf *bp)
2776 {
2777         struct bufobj *bo;
2778
2779         bo = &vp->v_bufobj;
2780         ASSERT_BO_WLOCKED(bo);
2781         VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
2782
2783         CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
2784         VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
2785             ("bgetvp: bp already attached! %p", bp));
2786
2787         vhold(vp);
2788         bp->b_vp = vp;
2789         bp->b_bufobj = bo;
2790         /*
2791          * Insert onto list for new vnode.
2792          */
2793         buf_vlist_add(bp, bo, BX_VNCLEAN);
2794 }
2795
2796 /*
2797  * Disassociate a buffer from a vnode.
2798  */
2799 void
2800 brelvp(struct buf *bp)
2801 {
2802         struct bufobj *bo;
2803         struct vnode *vp;
2804
2805         CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
2806         KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
2807
2808         /*
2809          * Delete from old vnode list, if on one.
2810          */
2811         vp = bp->b_vp;          /* XXX */
2812         bo = bp->b_bufobj;
2813         BO_LOCK(bo);
2814         buf_vlist_remove(bp);
2815         if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2816                 bo->bo_flag &= ~BO_ONWORKLST;
2817                 mtx_lock(&sync_mtx);
2818                 LIST_REMOVE(bo, bo_synclist);
2819                 syncer_worklist_len--;
2820                 mtx_unlock(&sync_mtx);
2821         }
2822         bp->b_vp = NULL;
2823         bp->b_bufobj = NULL;
2824         BO_UNLOCK(bo);
2825         vdrop(vp);
2826 }
2827
2828 /*
2829  * Add an item to the syncer work queue.
2830  */
2831 static void
2832 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
2833 {
2834         int slot;
2835
2836         ASSERT_BO_WLOCKED(bo);
2837
2838         mtx_lock(&sync_mtx);
2839         if (bo->bo_flag & BO_ONWORKLST)
2840                 LIST_REMOVE(bo, bo_synclist);
2841         else {
2842                 bo->bo_flag |= BO_ONWORKLST;
2843                 syncer_worklist_len++;
2844         }
2845
2846         if (delay > syncer_maxdelay - 2)
2847                 delay = syncer_maxdelay - 2;
2848         slot = (syncer_delayno + delay) & syncer_mask;
2849
2850         LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
2851         mtx_unlock(&sync_mtx);
2852 }
2853
2854 static int
2855 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
2856 {
2857         int error, len;
2858
2859         mtx_lock(&sync_mtx);
2860         len = syncer_worklist_len - sync_vnode_count;
2861         mtx_unlock(&sync_mtx);
2862         error = SYSCTL_OUT(req, &len, sizeof(len));
2863         return (error);
2864 }
2865
/*
 * Read-only integer sysctl "worklist_len" under the vfs node, served by
 * sysctl_vfs_worklist_len().
 */
SYSCTL_PROC(_vfs, OID_AUTO, worklist_len,
    CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0,
    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");

/*
 * Description of the "syncer" kernel process: its entry point is
 * sched_sync() and updateproc is handed over by address (presumably to
 * receive the process pointer -- confirm against kproc_start()).  The
 * process is started through SYSINIT via kproc_start().
 */
static struct proc *updateproc;
static void sched_sync(void);
static struct kproc_desc up_kp = {
        "syncer",
        sched_sync,
        &updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
2878
/*
 * Try to sync the vnode behind the first bufobj on worklist slot slp.
 *
 * Called by sched_sync() with sync_mtx held; the mutex is dropped while the
 * vnode is being synced and is held again on every return.  *bo is set to
 * the bufobj that was examined (NULL if the slot is empty).
 *
 * Returns 0 if the slot was empty or the sync was attempted, and 1 if the
 * vnode was busy (already locked, or its interlock could not be taken), in
 * which case sched_sync() moves *bo to the next slot.  If starting the write
 * fails, the return value is whether *bo is still at the head of the slot.
 */
static int
sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
{
        struct vnode *vp;
        struct mount *mp;

        *bo = LIST_FIRST(slp);
        if (*bo == NULL)
                return (0);
        vp = bo2vnode(*bo);
        if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
                return (1);
        /*
         * We use vhold in case the vnode does not
         * successfully sync.  vhold prevents the vnode from
         * going away when we unlock the sync_mtx so that
         * we can acquire the vnode interlock.
         */
        vholdl(vp);
        mtx_unlock(&sync_mtx);
        VI_UNLOCK(vp);
        if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
                vdrop(vp);
                mtx_lock(&sync_mtx);
                /* Non-zero only if the bufobj did not move meanwhile. */
                return (*bo == LIST_FIRST(slp));
        }
        MPASSERT(mp == NULL || (curthread->td_pflags & TDP_IGNSUSP) != 0 ||
            (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0, mp,
            ("suspended mp syncing vp %p", vp));
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        /* The result of the lazy fsync is deliberately ignored. */
        (void) VOP_FSYNC(vp, MNT_LAZY, td);
        VOP_UNLOCK(vp);
        vn_finished_write(mp);
        BO_LOCK(*bo);
        if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
                /*
                 * Put us back on the worklist.  The worklist
                 * routine will remove us from our current
                 * position and then add us back in at a later
                 * position.
                 */
                vn_syncer_add_to_worklist(*bo, syncdelay);
        }
        BO_UNLOCK(*bo);
        vdrop(vp);
        mtx_lock(&sync_mtx);
        return (0);
}
2927
/*
 * Non-zero until the "Syncing disks" banner has been printed during
 * shutdown; syncer_resume() sets it back to 1.  Once it is 0 the syncer also
 * pats the watchdog after each vnode it handles.
 */
static int first_printf = 1;

/*
 * System filesystem synchronizer daemon.
 *
 * Never returns.  Each iteration of the outer loop handles one slot of
 * syncer_workitem_pending[], indexed by syncer_delayno, which advances by
 * one per slot and wraps at syncer_maxdelay.  sync_mtx is held throughout
 * except where it is dropped explicitly (suspend check, watchdog pat, and
 * inside sync_vnode()).
 *
 * syncer_state drives the shutdown handling: SYNCER_RUNNING is normal
 * operation, SYNCER_SHUTTING_DOWN skips empty slots and reports progress,
 * and SYNCER_FINAL_DELAY runs for SYNCER_SHUTDOWN_SPEEDUP more iterations
 * before the process checks for suspension.
 */
static void
sched_sync(void)
{
        struct synclist *next, *slp;
        struct bufobj *bo;
        long starttime;
        struct thread *td = curthread;
        int last_work_seen;
        int net_worklist_len;
        int syncer_final_iter;
        int error;

        last_work_seen = 0;
        syncer_final_iter = 0;
        syncer_state = SYNCER_RUNNING;
        starttime = time_uptime;
        td->td_pflags |= TDP_NORUNNINGBUF;

        EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
            SHUTDOWN_PRI_LAST);

        mtx_lock(&sync_mtx);
        for (;;) {
                /* The final delay has run its course; allow suspension. */
                if (syncer_state == SYNCER_FINAL_DELAY &&
                    syncer_final_iter == 0) {
                        mtx_unlock(&sync_mtx);
                        kproc_suspend_check(td->td_proc);
                        mtx_lock(&sync_mtx);
                }
                /* Worklist length not counting the syncer vnodes. */
                net_worklist_len = syncer_worklist_len - sync_vnode_count;
                /* While shutting down, report progress once per second. */
                if (syncer_state != SYNCER_RUNNING &&
                    starttime != time_uptime) {
                        if (first_printf) {
                                printf("\nSyncing disks, vnodes remaining... ");
                                first_printf = 0;
                        }
                        printf("%d ", net_worklist_len);
                }
                starttime = time_uptime;

                /*
                 * Push files whose dirty time has expired.  Be careful
                 * of interrupt race on slp queue.
                 *
                 * Skip over empty worklist slots when shutting down.
                 */
                do {
                        slp = &syncer_workitem_pending[syncer_delayno];
                        syncer_delayno += 1;
                        if (syncer_delayno == syncer_maxdelay)
                                syncer_delayno = 0;
                        next = &syncer_workitem_pending[syncer_delayno];
                        /*
                         * If the worklist has wrapped since it was
                         * emptied of all but syncer vnodes,
                         * switch to the FINAL_DELAY state and run
                         * for one more second.
                         */
                        if (syncer_state == SYNCER_SHUTTING_DOWN &&
                            net_worklist_len == 0 &&
                            last_work_seen == syncer_delayno) {
                                syncer_state = SYNCER_FINAL_DELAY;
                                syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
                        }
                } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
                    syncer_worklist_len > 0);

                /*
                 * Keep track of the last time there was anything
                 * on the worklist other than syncer vnodes.
                 * Return to the SHUTTING_DOWN state if any
                 * new work appears.
                 */
                if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
                        last_work_seen = syncer_delayno;
                if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
                        syncer_state = SYNCER_SHUTTING_DOWN;
                /* Drain the current slot. */
                while (!LIST_EMPTY(slp)) {
                        error = sync_vnode(slp, &bo, td);
                        if (error == 1) {
                                /*
                                 * The vnode could not be synced right now;
                                 * defer it to the next slot.
                                 */
                                LIST_REMOVE(bo, bo_synclist);
                                LIST_INSERT_HEAD(next, bo, bo_synclist);
                                continue;
                        }

                        if (first_printf == 0) {
                                /*
                                 * Drop the sync mutex, because some watchdog
                                 * drivers need to sleep while patting
                                 */
                                mtx_unlock(&sync_mtx);
                                wdog_kern_pat(WD_LASTVAL);
                                mtx_lock(&sync_mtx);
                        }
                }
                if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
                        syncer_final_iter--;
                /*
                 * The variable rushjob allows the kernel to speed up the
                 * processing of the filesystem syncer process. A rushjob
                 * value of N tells the filesystem syncer to process the next
                 * N seconds worth of work on its queue ASAP. Currently rushjob
                 * is used by the soft update code to speed up the filesystem
                 * syncer process when the incore state is getting so far
                 * ahead of the disk that the kernel memory pool is being
                 * threatened with exhaustion.
                 */
                if (rushjob > 0) {
                        rushjob -= 1;
                        continue;
                }
                /*
                 * Just sleep for a short period of time between
                 * iterations when shutting down to allow some I/O
                 * to happen.
                 *
                 * If it has taken us less than a second to process the
                 * current work, then wait. Otherwise start right over
                 * again. We can still lose time if any single round
                 * takes more than two seconds, but it does not really
                 * matter as we are just trying to generally pace the
                 * filesystem activity.
                 */
                if (syncer_state != SYNCER_RUNNING ||
                    time_uptime == starttime) {
                        /* Set the thread priority to PPAUSE before sleeping. */
                        thread_lock(td);
                        sched_prio(td, PPAUSE);
                        thread_unlock(td);
                }
                if (syncer_state != SYNCER_RUNNING)
                        cv_timedwait(&sync_wakeup, &sync_mtx,
                            hz / SYNCER_SHUTDOWN_SPEEDUP);
                else if (time_uptime == starttime)
                        cv_timedwait(&sync_wakeup, &sync_mtx, hz);
        }
}
3069
3070 /*
3071  * Request the syncer daemon to speed up its work.
3072  * We never push it to speed up more than half of its
3073  * normal turn time, otherwise it could take over the cpu.
3074  */
3075 int
3076 speedup_syncer(void)
3077 {
3078         int ret = 0;
3079
3080         mtx_lock(&sync_mtx);
3081         if (rushjob < syncdelay / 2) {
3082                 rushjob += 1;
3083                 stat_rush_requests += 1;
3084                 ret = 1;
3085         }
3086         mtx_unlock(&sync_mtx);
3087         cv_broadcast(&sync_wakeup);
3088         return (ret);
3089 }
3090
3091 /*
3092  * Tell the syncer to speed up its work and run though its work
3093  * list several times, then tell it to shut down.
3094  */
3095 static void
3096 syncer_shutdown(void *arg, int howto)
3097 {
3098
3099         if (howto & RB_NOSYNC)
3100                 return;
3101         mtx_lock(&sync_mtx);
3102         syncer_state = SYNCER_SHUTTING_DOWN;
3103         rushjob = 0;
3104         mtx_unlock(&sync_mtx);
3105         cv_broadcast(&sync_wakeup);
3106         kproc_shutdown(arg, howto);
3107 }
3108
/*
 * Suspend the syncer by running the shutdown handler on the syncer process
 * with no howto flags set (so the RB_NOSYNC early return is not taken).
 */
void
syncer_suspend(void)
{

        syncer_shutdown(updateproc, 0);
}
3115
3116 void
3117 syncer_resume(void)
3118 {
3119
3120         mtx_lock(&sync_mtx);
3121         first_printf = 1;
3122         syncer_state = SYNCER_RUNNING;
3123         mtx_unlock(&sync_mtx);
3124         cv_broadcast(&sync_wakeup);
3125         kproc_resume(updateproc);
3126 }
3127
3128 /*
3129  * Move the buffer between the clean and dirty lists of its vnode.
3130  */
3131 void
3132 reassignbuf(struct buf *bp)
3133 {
3134         struct vnode *vp;
3135         struct bufobj *bo;
3136         int delay;
3137 #ifdef INVARIANTS
3138         struct bufv *bv;
3139 #endif
3140
3141         vp = bp->b_vp;
3142         bo = bp->b_bufobj;
3143
3144         KASSERT((bp->b_flags & B_PAGING) == 0,
3145             ("%s: cannot reassign paging buffer %p", __func__, bp));
3146
3147         CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
3148             bp, bp->b_vp, bp->b_flags);
3149
3150         BO_LOCK(bo);
3151         buf_vlist_remove(bp);
3152
3153         /*
3154          * If dirty, put on list of dirty buffers; otherwise insert onto list
3155          * of clean buffers.
3156          */
3157         if (bp->b_flags & B_DELWRI) {
3158                 if ((bo->bo_flag & BO_ONWORKLST) == 0) {
3159                         switch (vp->v_type) {
3160                         case VDIR:
3161                                 delay = dirdelay;
3162                                 break;
3163                         case VCHR:
3164                                 delay = metadelay;
3165                                 break;
3166                         default:
3167                                 delay = filedelay;
3168                         }
3169                         vn_syncer_add_to_worklist(bo, delay);
3170                 }
3171                 buf_vlist_add(bp, bo, BX_VNDIRTY);
3172         } else {
3173                 buf_vlist_add(bp, bo, BX_VNCLEAN);
3174
3175                 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
3176                         mtx_lock(&sync_mtx);
3177                         LIST_REMOVE(bo, bo_synclist);
3178                         syncer_worklist_len--;
3179                         mtx_unlock(&sync_mtx);
3180                         bo->bo_flag &= ~BO_ONWORKLST;
3181                 }
3182         }
3183 #ifdef INVARIANTS
3184         bv = &bo->bo_clean;
3185         bp = TAILQ_FIRST(&bv->bv_hd);
3186         KASSERT(bp == NULL || bp->b_bufobj == bo,
3187             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
3188         bp = TAILQ_LAST(&bv->bv_hd, buflists);
3189         KASSERT(bp == NULL || bp->b_bufobj == bo,
3190             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
3191         bv = &bo->bo_dirty;
3192         bp = TAILQ_FIRST(&bv->bv_hd);
3193         KASSERT(bp == NULL || bp->b_bufobj == bo,
3194             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
3195         bp = TAILQ_LAST(&bv->bv_hd, buflists);
3196         KASSERT(bp == NULL || bp->b_bufobj == bo,
3197             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
3198 #endif
3199         BO_UNLOCK(bo);
3200 }
3201
3202 static void
3203 v_init_counters(struct vnode *vp)
3204 {
3205
3206         VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
3207             vp, ("%s called for an initialized vnode", __FUNCTION__));
3208         ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
3209
3210         refcount_init(&vp->v_holdcnt, 1);
3211         refcount_init(&vp->v_usecount, 1);
3212 }
3213
3214 /*
3215  * Get a usecount on a vnode.
3216  *
3217  * vget and vget_finish may fail to lock the vnode if they lose a race against
3218  * it being doomed. LK_RETRY can be passed in flags to lock it anyway.
3219  *
3220  * Consumers which don't guarantee liveness of the vnode can use SMR to
3221  * try to get a reference. Note this operation can fail since the vnode
3222  * may be awaiting getting freed by the time they get to it.
3223  */
3224 enum vgetstate
3225 vget_prep_smr(struct vnode *vp)
3226 {
3227         enum vgetstate vs;
3228
3229         VFS_SMR_ASSERT_ENTERED();
3230
3231         if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
3232                 vs = VGET_USECOUNT;
3233         } else {
3234                 if (vhold_smr(vp))
3235                         vs = VGET_HOLDCNT;
3236                 else
3237                         vs = VGET_NONE;
3238         }
3239         return (vs);
3240 }
3241
3242 enum vgetstate
3243 vget_prep(struct vnode *vp)
3244 {
3245         enum vgetstate vs;
3246
3247         if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
3248                 vs = VGET_USECOUNT;
3249         } else {
3250                 vhold(vp);
3251                 vs = VGET_HOLDCNT;
3252         }
3253         return (vs);
3254 }
3255
3256 void
3257 vget_abort(struct vnode *vp, enum vgetstate vs)
3258 {
3259
3260         switch (vs) {
3261         case VGET_USECOUNT:
3262                 vrele(vp);
3263                 break;
3264         case VGET_HOLDCNT:
3265                 vdrop(vp);
3266                 break;
3267         default:
3268                 __assert_unreachable();
3269         }
3270 }
3271
/*
 * Reference and lock a vnode: vget_prep() followed by vget_finish() with
 * the given lock flags.  Returns whatever vget_finish() returns.
 */
int
vget(struct vnode *vp, int flags)
{

        return (vget_finish(vp, flags, vget_prep(vp)));
}
3280
3281 int
3282 vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
3283 {
3284         int error;
3285
3286         if ((flags & LK_INTERLOCK) != 0)
3287                 ASSERT_VI_LOCKED(vp, __func__);
3288         else
3289                 ASSERT_VI_UNLOCKED(vp, __func__);
3290         VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
3291         VNPASS(vp->v_holdcnt > 0, vp);
3292         VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
3293
3294         error = vn_lock(vp, flags);
3295         if (__predict_false(error != 0)) {
3296                 vget_abort(vp, vs);
3297                 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
3298                     vp);
3299                 return (error);
3300         }
3301
3302         vget_finish_ref(vp, vs);
3303         return (0);
3304 }
3305
3306 void
3307 vget_finish_ref(struct vnode *vp, enum vgetstate vs)
3308 {
3309         int old;
3310
3311         VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
3312         VNPASS(vp->v_holdcnt > 0, vp);
3313         VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
3314
3315         if (vs == VGET_USECOUNT)
3316                 return;
3317
3318         /*
3319          * We hold the vnode. If the usecount is 0 it will be utilized to keep
3320          * the vnode around. Otherwise someone else lended their hold count and
3321          * we have to drop ours.
3322          */
3323         old = atomic_fetchadd_int(&vp->v_usecount, 1);
3324         VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old));
3325         if (old != 0) {
3326 #ifdef INVARIANTS
3327                 old = atomic_fetchadd_int(&vp->v_holdcnt, -1);
3328                 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old));
3329 #else
3330                 refcount_release(&vp->v_holdcnt);
3331 #endif
3332         }
3333 }
3334
3335 void
3336 vref(struct vnode *vp)
3337 {
3338         enum vgetstate vs;
3339
3340         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3341         vs = vget_prep(vp);
3342         vget_finish_ref(vp, vs);
3343 }
3344
3345 void
3346 vrefact(struct vnode *vp)
3347 {
3348
3349         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3350 #ifdef INVARIANTS
3351         int old = atomic_fetchadd_int(&vp->v_usecount, 1);
3352         VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old));
3353 #else
3354         refcount_acquire(&vp->v_usecount);
3355 #endif
3356 }
3357
3358 void
3359 vlazy(struct vnode *vp)
3360 {
3361         struct mount *mp;
3362
3363         VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
3364
3365         if ((vp->v_mflag & VMP_LAZYLIST) != 0)
3366                 return;
3367         /*
3368          * We may get here for inactive routines after the vnode got doomed.
3369          */
3370         if (VN_IS_DOOMED(vp))
3371                 return;
3372         mp = vp->v_mount;
3373         mtx_lock(&mp->mnt_listmtx);
3374         if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
3375                 vp->v_mflag |= VMP_LAZYLIST;
3376                 TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist);
3377                 mp->mnt_lazyvnodelistsize++;
3378         }
3379         mtx_unlock(&mp->mnt_listmtx);
3380 }
3381
3382 static void
3383 vunlazy(struct vnode *vp)
3384 {
3385         struct mount *mp;
3386
3387         ASSERT_VI_LOCKED(vp, __func__);
3388         VNPASS(!VN_IS_DOOMED(vp), vp);
3389
3390         mp = vp->v_mount;
3391         mtx_lock(&mp->mnt_listmtx);
3392         VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
3393         /*
3394          * Don't remove the vnode from the lazy list if another thread
3395          * has increased the hold count. It may have re-enqueued the
3396          * vnode to the lazy list and is now responsible for its
3397          * removal.
3398          */
3399         if (vp->v_holdcnt == 0) {
3400                 vp->v_mflag &= ~VMP_LAZYLIST;
3401                 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
3402                 mp->mnt_lazyvnodelistsize--;
3403         }
3404         mtx_unlock(&mp->mnt_listmtx);
3405 }
3406
3407 /*
3408  * This routine is only meant to be called from vgonel prior to dooming
3409  * the vnode.
3410  */
3411 static void
3412 vunlazy_gone(struct vnode *vp)
3413 {
3414         struct mount *mp;
3415
3416         ASSERT_VOP_ELOCKED(vp, __func__);
3417         ASSERT_VI_LOCKED(vp, __func__);
3418         VNPASS(!VN_IS_DOOMED(vp), vp);
3419
3420         if (vp->v_mflag & VMP_LAZYLIST) {
3421                 mp = vp->v_mount;
3422                 mtx_lock(&mp->mnt_listmtx);
3423                 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
3424                 vp->v_mflag &= ~VMP_LAZYLIST;
3425                 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
3426                 mp->mnt_lazyvnodelistsize--;
3427                 mtx_unlock(&mp->mnt_listmtx);
3428         }
3429 }
3430
3431 static void
3432 vdefer_inactive(struct vnode *vp)
3433 {
3434
3435         ASSERT_VI_LOCKED(vp, __func__);
3436         VNPASS(vp->v_holdcnt > 0, vp);
3437         if (VN_IS_DOOMED(vp)) {
3438                 vdropl(vp);
3439                 return;
3440         }
3441         if (vp->v_iflag & VI_DEFINACT) {
3442                 VNPASS(vp->v_holdcnt > 1, vp);
3443                 vdropl(vp);
3444                 return;
3445         }
3446         if (vp->v_usecount > 0) {
3447                 vp->v_iflag &= ~VI_OWEINACT;
3448                 vdropl(vp);
3449                 return;
3450         }
3451         vlazy(vp);
3452         vp->v_iflag |= VI_DEFINACT;
3453         VI_UNLOCK(vp);
3454         atomic_add_long(&deferred_inact, 1);
3455 }
3456
3457 static void
3458 vdefer_inactive_unlocked(struct vnode *vp)
3459 {
3460
3461         VI_LOCK(vp);
3462         if ((vp->v_iflag & VI_OWEINACT) == 0) {
3463                 vdropl(vp);
3464                 return;
3465         }
3466         vdefer_inactive(vp);
3467 }
3468
/* Identifies the release routine on whose behalf vput_final() is running. */
enum vput_op {
        VRELE,          /* passed by vrele() */
        VPUT,           /* passed by vput() */
        VUNREF          /* passed by vunref() */
};
3470
3471 /*
3472  * Handle ->v_usecount transitioning to 0.
3473  *
3474  * By releasing the last usecount we take ownership of the hold count which
3475  * provides liveness of the vnode, meaning we have to vdrop.
3476  *
3477  * For all vnodes we may need to perform inactive processing. It requires an
3478  * exclusive lock on the vnode, while it is legal to call here with only a
3479  * shared lock (or no locks). If locking the vnode in an expected manner fails,
3480  * inactive processing gets deferred to the syncer.
3481  *
3482  * XXX Some filesystems pass in an exclusively locked vnode and strongly depend
3483  * on the lock being held all the way until VOP_INACTIVE. This in particular
3484  * happens with UFS which adds half-constructed vnodes to the hash, where they
3485  * can be found by other code.
3486  */
3487 static void
3488 vput_final(struct vnode *vp, enum vput_op func)
3489 {
3490         int error;
3491         bool want_unlock;
3492
3493         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3494         VNPASS(vp->v_holdcnt > 0, vp);
3495
3496         VI_LOCK(vp);
3497
3498         /*
3499          * By the time we got here someone else might have transitioned
3500          * the count back to > 0.
3501          */
3502         if (vp->v_usecount > 0)
3503                 goto out;
3504
3505         /*
3506          * If the vnode is doomed vgone already performed inactive processing
3507          * (if needed).
3508          */
3509         if (VN_IS_DOOMED(vp))
3510                 goto out;
3511
3512         if (__predict_true(VOP_NEED_INACTIVE(vp) == 0))
3513                 goto out;
3514
3515         if (vp->v_iflag & VI_DOINGINACT)
3516                 goto out;
3517
3518         /*
3519          * Locking operations here will drop the interlock and possibly the
3520          * vnode lock, opening a window where the vnode can get doomed all the
3521          * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to
3522          * perform inactive.
3523          */
3524         vp->v_iflag |= VI_OWEINACT;
3525         want_unlock = false;
3526         error = 0;
3527         switch (func) {
3528         case VRELE:
3529                 switch (VOP_ISLOCKED(vp)) {
3530                 case LK_EXCLUSIVE:
3531                         break;
3532                 case LK_EXCLOTHER:
3533                 case 0:
3534                         want_unlock = true;
3535                         error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
3536                         VI_LOCK(vp);
3537                         break;
3538                 default:
3539                         /*
3540                          * The lock has at least one sharer, but we have no way
3541                          * to conclude whether this is us. Play it safe and
3542                          * defer processing.
3543                          */
3544                         error = EAGAIN;
3545                         break;
3546                 }
3547                 break;
3548         case VPUT:
3549                 want_unlock = true;
3550                 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
3551                         error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
3552                             LK_NOWAIT);
3553                         VI_LOCK(vp);
3554                 }
3555                 break;
3556         case VUNREF:
3557                 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
3558                         error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
3559                         VI_LOCK(vp);
3560                 }
3561                 break;
3562         }
3563         if (error == 0) {
3564                 if (func == VUNREF) {
3565                         VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp,
3566                             ("recursive vunref"));
3567                         vp->v_vflag |= VV_UNREF;
3568                 }
3569                 for (;;) {
3570                         error = vinactive(vp);
3571                         if (want_unlock)
3572                                 VOP_UNLOCK(vp);
3573                         if (error != ERELOOKUP || !want_unlock)
3574                                 break;
3575                         VOP_LOCK(vp, LK_EXCLUSIVE);
3576                 }
3577                 if (func == VUNREF)
3578                         vp->v_vflag &= ~VV_UNREF;
3579                 vdropl(vp);
3580         } else {
3581                 vdefer_inactive(vp);
3582         }
3583         return;
3584 out:
3585         if (func == VPUT)
3586                 VOP_UNLOCK(vp);
3587         vdropl(vp);
3588 }
3589
3590 /*
3591  * Decrement ->v_usecount for a vnode.
3592  *
3593  * Releasing the last use count requires additional processing, see vput_final
3594  * above for details.
3595  *
3596  * Comment above each variant denotes lock state on entry and exit.
3597  */
3598
3599 /*
3600  * in: any
3601  * out: same as passed in
3602  */
3603 void
3604 vrele(struct vnode *vp)
3605 {
3606
3607         ASSERT_VI_UNLOCKED(vp, __func__);
3608         if (!refcount_release(&vp->v_usecount))
3609                 return;
3610         vput_final(vp, VRELE);
3611 }
3612
3613 /*
3614  * in: locked
3615  * out: unlocked
3616  */
3617 void
3618 vput(struct vnode *vp)
3619 {
3620
3621         ASSERT_VOP_LOCKED(vp, __func__);
3622         ASSERT_VI_UNLOCKED(vp, __func__);
3623         if (!refcount_release(&vp->v_usecount)) {
3624                 VOP_UNLOCK(vp);
3625                 return;
3626         }
3627         vput_final(vp, VPUT);
3628 }
3629
3630 /*
3631  * in: locked
3632  * out: locked
3633  */
3634 void
3635 vunref(struct vnode *vp)
3636 {
3637
3638         ASSERT_VOP_LOCKED(vp, __func__);
3639         ASSERT_VI_UNLOCKED(vp, __func__);
3640         if (!refcount_release(&vp->v_usecount))
3641                 return;
3642         vput_final(vp, VUNREF);
3643 }
3644
3645 void
3646 vhold(struct vnode *vp)
3647 {
3648         int old;
3649
3650         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3651         old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
3652         VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
3653             ("%s: wrong hold count %d", __func__, old));
3654         if (old == 0)
3655                 vfs_freevnodes_dec();
3656 }
3657
3658 void
3659 vholdnz(struct vnode *vp)
3660 {
3661
3662         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3663 #ifdef INVARIANTS
3664         int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
3665         VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
3666             ("%s: wrong hold count %d", __func__, old));
3667 #else
3668         atomic_add_int(&vp->v_holdcnt, 1);
3669 #endif
3670 }
3671
3672 /*
3673  * Grab a hold count unless the vnode is freed.
3674  *
3675  * Only use this routine if vfs smr is the only protection you have against
3676  * freeing the vnode.
3677  *
3678  * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag
3679  * is not set.  After the flag is set the vnode becomes immutable to anyone but
3680  * the thread which managed to set the flag.
3681  *
3682  * It may be tempting to replace the loop with:
3683  * count = atomic_fetchadd_int(&vp->v_holdcnt, 1);
3684  * if (count & VHOLD_NO_SMR) {
3685  *     backpedal and error out;
3686  * }
3687  *
3688  * However, while this is more performant, it hinders debugging by eliminating
3689  * the previously mentioned invariant.
3690  */
3691 bool
3692 vhold_smr(struct vnode *vp)
3693 {
3694         int count;
3695
3696         VFS_SMR_ASSERT_ENTERED();
3697
3698         count = atomic_load_int(&vp->v_holdcnt);
3699         for (;;) {
3700                 if (count & VHOLD_NO_SMR) {
3701                         VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
3702                             ("non-zero hold count with flags %d\n", count));
3703                         return (false);
3704                 }
3705                 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
3706                 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
3707                         if (count == 0)
3708                                 vfs_freevnodes_dec();
3709                         return (true);
3710                 }
3711         }
3712 }
3713
3714 /*
3715  * Hold a free vnode for recycling.
3716  *
3717  * Note: vnode_init references this comment.
3718  *
3719  * Attempts to recycle only need the global vnode list lock and have no use for
3720  * SMR.
3721  *
3722  * However, vnodes get inserted into the global list before they get fully
3723  * initialized and stay there until UMA decides to free the memory. This in
3724  * particular means the target can be found before it becomes usable and after
3725  * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to
3726  * VHOLD_NO_SMR.
3727  *
3728  * Note: the vnode may gain more references after we transition the count 0->1.
3729  */
3730 static bool
3731 vhold_recycle_free(struct vnode *vp)
3732 {
3733         int count;
3734
3735         mtx_assert(&vnode_list_mtx, MA_OWNED);
3736
3737         count = atomic_load_int(&vp->v_holdcnt);
3738         for (;;) {
3739                 if (count & VHOLD_NO_SMR) {
3740                         VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
3741                             ("non-zero hold count with flags %d\n", count));
3742                         return (false);
3743                 }
3744                 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
3745                 if (count > 0) {
3746                         return (false);
3747                 }
3748                 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
3749                         vfs_freevnodes_dec();
3750                         return (true);
3751                 }
3752         }
3753 }
3754
3755 static void __noinline
3756 vdbatch_process(struct vdbatch *vd)
3757 {
3758         struct vnode *vp;
3759         int i;
3760
3761         mtx_assert(&vd->lock, MA_OWNED);
3762         MPASS(curthread->td_pinned > 0);
3763         MPASS(vd->index == VDBATCH_SIZE);
3764
3765         /*
3766          * Attempt to requeue the passed batch, but give up easily.
3767          *
3768          * Despite batching the mechanism is prone to transient *significant*
3769          * lock contention, where vnode_list_mtx becomes the primary bottleneck
3770          * if multiple CPUs get here (one real-world example is highly parallel
3771          * do-nothing make , which will stat *tons* of vnodes). Since it is
3772          * quasi-LRU (read: not that great even if fully honoured) just dodge
3773          * the problem. Parties which don't like it are welcome to implement
3774          * something better.
3775          */
3776         critical_enter();
3777         if (mtx_trylock(&vnode_list_mtx)) {
3778                 for (i = 0; i < VDBATCH_SIZE; i++) {
3779                         vp = vd->tab[i];
3780                         vd->tab[i] = NULL;
3781                         TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
3782                         TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
3783                         MPASS(vp->v_dbatchcpu != NOCPU);
3784                         vp->v_dbatchcpu = NOCPU;
3785                 }
3786                 mtx_unlock(&vnode_list_mtx);
3787         } else {
3788                 counter_u64_add(vnode_skipped_requeues, 1);
3789
3790                 for (i = 0; i < VDBATCH_SIZE; i++) {
3791                         vp = vd->tab[i];
3792                         vd->tab[i] = NULL;
3793                         MPASS(vp->v_dbatchcpu != NOCPU);
3794                         vp->v_dbatchcpu = NOCPU;
3795                 }
3796         }
3797         vd->index = 0;
3798         critical_exit();
3799 }
3800
3801 static void
3802 vdbatch_enqueue(struct vnode *vp)
3803 {
3804         struct vdbatch *vd;
3805
3806         ASSERT_VI_LOCKED(vp, __func__);
3807         VNPASS(!VN_IS_DOOMED(vp), vp);
3808
3809         if (vp->v_dbatchcpu != NOCPU) {
3810                 VI_UNLOCK(vp);
3811                 return;
3812         }
3813
3814         sched_pin();
3815         vd = DPCPU_PTR(vd);
3816         mtx_lock(&vd->lock);
3817         MPASS(vd->index < VDBATCH_SIZE);
3818         MPASS(vd->tab[vd->index] == NULL);
3819         /*
3820          * A hack: we depend on being pinned so that we know what to put in
3821          * ->v_dbatchcpu.
3822          */
3823         vp->v_dbatchcpu = curcpu;
3824         vd->tab[vd->index] = vp;
3825         vd->index++;
3826         VI_UNLOCK(vp);
3827         if (vd->index == VDBATCH_SIZE)
3828                 vdbatch_process(vd);
3829         mtx_unlock(&vd->lock);
3830         sched_unpin();
3831 }
3832
3833 /*
3834  * This routine must only be called for vnodes which are about to be
3835  * deallocated. Supporting dequeue for arbitrary vnodes would require
3836  * validating that the locked batch matches.
3837  */
3838 static void
3839 vdbatch_dequeue(struct vnode *vp)
3840 {
3841         struct vdbatch *vd;
3842         int i;
3843         short cpu;
3844
3845         VNPASS(vp->v_type == VBAD || vp->v_type == VNON, vp);
3846
3847         cpu = vp->v_dbatchcpu;
3848         if (cpu == NOCPU)
3849                 return;
3850
3851         vd = DPCPU_ID_PTR(cpu, vd);
3852         mtx_lock(&vd->lock);
3853         for (i = 0; i < vd->index; i++) {
3854                 if (vd->tab[i] != vp)
3855                         continue;
3856                 vp->v_dbatchcpu = NOCPU;
3857                 vd->index--;
3858                 vd->tab[i] = vd->tab[vd->index];
3859                 vd->tab[vd->index] = NULL;
3860                 break;
3861         }
3862         mtx_unlock(&vd->lock);
3863         /*
3864          * Either we dequeued the vnode above or the target CPU beat us to it.
3865          */
3866         MPASS(vp->v_dbatchcpu == NOCPU);
3867 }
3868
3869 /*
3870  * Drop the hold count of the vnode.
3871  *
3872  * It will only get freed if this is the last hold *and* it has been vgone'd.
3873  *
3874  * Because the vnode vm object keeps a hold reference on the vnode if
3875  * there is at least one resident non-cached page, the vnode cannot
3876  * leave the active list without the page cleanup done.
3877  */
3878 static void __noinline
3879 vdropl_final(struct vnode *vp)
3880 {
3881
3882         ASSERT_VI_LOCKED(vp, __func__);
3883         VNPASS(VN_IS_DOOMED(vp), vp);
3884         /*
3885          * Set the VHOLD_NO_SMR flag.
3886          *
3887          * We may be racing against vhold_smr. If they win we can just pretend
3888          * we never got this far, they will vdrop later.
3889          */
3890         if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) {
3891                 vfs_freevnodes_inc();
3892                 VI_UNLOCK(vp);
3893                 /*
3894                  * We lost the aforementioned race. Any subsequent access is
3895                  * invalid as they might have managed to vdropl on their own.
3896                  */
3897                 return;
3898         }
3899         /*
3900          * Don't bump freevnodes as this one is going away.
3901          */
3902         freevnode(vp);
3903 }
3904
3905 void
3906 vdrop(struct vnode *vp)
3907 {
3908
3909         ASSERT_VI_UNLOCKED(vp, __func__);
3910         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3911         if (refcount_release_if_not_last(&vp->v_holdcnt))
3912                 return;
3913         VI_LOCK(vp);
3914         vdropl(vp);
3915 }
3916
3917 static void __always_inline
3918 vdropl_impl(struct vnode *vp, bool enqueue)
3919 {
3920
3921         ASSERT_VI_LOCKED(vp, __func__);
3922         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3923         if (!refcount_release(&vp->v_holdcnt)) {
3924                 VI_UNLOCK(vp);
3925                 return;
3926         }
3927         VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp);
3928         VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp);
3929         if (VN_IS_DOOMED(vp)) {
3930                 vdropl_final(vp);
3931                 return;
3932         }
3933
3934         vfs_freevnodes_inc();
3935         if (vp->v_mflag & VMP_LAZYLIST) {
3936                 vunlazy(vp);
3937         }
3938
3939         if (!enqueue) {
3940                 VI_UNLOCK(vp);
3941                 return;
3942         }
3943
3944         /*
3945          * Also unlocks the interlock. We can't assert on it as we
3946          * released our hold and by now the vnode might have been
3947          * freed.
3948          */
3949         vdbatch_enqueue(vp);
3950 }
3951
3952 void
3953 vdropl(struct vnode *vp)
3954 {
3955
3956         vdropl_impl(vp, true);
3957 }
3958
3959 /*
3960  * vdrop a vnode when recycling
3961  *
3962  * This is a special case routine only to be used when recycling; it differs
3963  * from regular vdrop by not requeueing the vnode on the LRU.
3964  *
3965  * Consider a case where vtryrecycle continuously fails with all vnodes (due to
3966  * e.g., frozen writes on the filesystem), filling the batch and causing it to
3967  * be requeued. Then vnlru will end up revisiting the same vnodes. This is a
3968  * loop which can last for as long as writes are frozen.
3969  */
3970 static void
3971 vdropl_recycle(struct vnode *vp)
3972 {
3973
3974         vdropl_impl(vp, false);
3975 }
3976
static void
vdrop_recycle(struct vnode *vp)
{

        /*
         * Unlocked flavor of vdropl_recycle(): take the interlock here, the
         * callee releases it on every path.
         */
        VI_LOCK(vp);
        vdropl_recycle(vp);
}
3984
3985 /*
3986  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
3987  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
3988  */
3989 static int
3990 vinactivef(struct vnode *vp)
3991 {
3992         struct vm_object *obj;
3993         int error;
3994
3995         ASSERT_VOP_ELOCKED(vp, "vinactive");
3996         ASSERT_VI_LOCKED(vp, "vinactive");
3997         VNPASS((vp->v_iflag & VI_DOINGINACT) == 0, vp);
3998         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3999         vp->v_iflag |= VI_DOINGINACT;
4000         vp->v_iflag &= ~VI_OWEINACT;
4001         VI_UNLOCK(vp);
4002         /*
4003          * Before moving off the active list, we must be sure that any
4004          * modified pages are converted into the vnode's dirty
4005          * buffers, since these will no longer be checked once the
4006          * vnode is on the inactive list.
4007          *
4008          * The write-out of the dirty pages is asynchronous.  At the
4009          * point that VOP_INACTIVE() is called, there could still be
4010          * pending I/O and dirty pages in the object.
4011          */
4012         if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
4013             vm_object_mightbedirty(obj)) {
4014                 VM_OBJECT_WLOCK(obj);
4015                 vm_object_page_clean(obj, 0, 0, 0);
4016                 VM_OBJECT_WUNLOCK(obj);
4017         }
4018         error = VOP_INACTIVE(vp);
4019         VI_LOCK(vp);
4020         VNPASS(vp->v_iflag & VI_DOINGINACT, vp);
4021         vp->v_iflag &= ~VI_DOINGINACT;
4022         return (error);
4023 }
4024
4025 int
4026 vinactive(struct vnode *vp)
4027 {
4028
4029         ASSERT_VOP_ELOCKED(vp, "vinactive");
4030         ASSERT_VI_LOCKED(vp, "vinactive");
4031         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
4032
4033         if ((vp->v_iflag & VI_OWEINACT) == 0)
4034                 return (0);
4035         if (vp->v_iflag & VI_DOINGINACT)
4036                 return (0);
4037         if (vp->v_usecount > 0) {
4038                 vp->v_iflag &= ~VI_OWEINACT;
4039                 return (0);
4040         }
4041         return (vinactivef(vp));
4042 }
4043
4044 /*
4045  * Remove any vnodes in the vnode table belonging to mount point mp.
4046  *
4047  * If FORCECLOSE is not specified, there should not be any active ones,
4048  * return error if any are found (nb: this is a user error, not a
4049  * system error). If FORCECLOSE is specified, detach any active vnodes
4050  * that are found.
4051  *
4052  * If WRITECLOSE is set, only flush out regular file vnodes open for
4053  * writing.
4054  *
4055  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
4056  *
4057  * `rootrefs' specifies the base reference count for the root vnode
4058  * of this filesystem. The root vnode is considered busy if its
4059  * v_usecount exceeds this value. On a successful return, vflush()
4060  * will call vrele() on the root vnode exactly rootrefs times.
4061  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
4062  * be zero.
4063  */
#ifdef DIAGNOSTIC
/* When non-zero, vflush prints every busy vnode it runs into. */
static int busyprt;
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0,
    "Print out busy vnodes");
#endif
4068
4069 int
4070 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
4071 {
4072         struct vnode *vp, *mvp, *rootvp = NULL;
4073         struct vattr vattr;
4074         int busy = 0, error;
4075
4076         CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
4077             rootrefs, flags);
4078         if (rootrefs > 0) {
4079                 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
4080                     ("vflush: bad args"));
4081                 /*
4082                  * Get the filesystem root vnode. We can vput() it
4083                  * immediately, since with rootrefs > 0, it won't go away.
4084                  */
4085                 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
4086                         CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
4087                             __func__, error);
4088                         return (error);
4089                 }
4090                 vput(rootvp);
4091         }
4092 loop:
4093         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
4094                 vholdl(vp);
4095                 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
4096                 if (error) {
4097                         vdrop(vp);
4098                         MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
4099                         goto loop;
4100                 }
4101                 /*
4102                  * Skip over a vnodes marked VV_SYSTEM.
4103                  */
4104                 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
4105                         VOP_UNLOCK(vp);
4106                         vdrop(vp);
4107                         continue;
4108                 }
4109                 /*
4110                  * If WRITECLOSE is set, flush out unlinked but still open
4111                  * files (even if open only for reading) and regular file
4112                  * vnodes open for writing.
4113                  */
4114                 if (flags & WRITECLOSE) {
4115                         if (vp->v_object != NULL) {
4116                                 VM_OBJECT_WLOCK(vp->v_object);
4117                                 vm_object_page_clean(vp->v_object, 0, 0, 0);
4118                                 VM_OBJECT_WUNLOCK(vp->v_object);
4119                         }
4120                         do {
4121                                 error = VOP_FSYNC(vp, MNT_WAIT, td);
4122                         } while (error == ERELOOKUP);
4123                         if (error != 0) {
4124                                 VOP_UNLOCK(vp);
4125                                 vdrop(vp);
4126                                 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
4127                                 return (error);
4128                         }
4129                         error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4130                         VI_LOCK(vp);
4131
4132                         if ((vp->v_type == VNON ||
4133                             (error == 0 && vattr.va_nlink > 0)) &&
4134                             (vp->v_writecount <= 0 || vp->v_type != VREG)) {
4135                                 VOP_UNLOCK(vp);
4136                                 vdropl(vp);
4137                                 continue;
4138                         }
4139                 } else
4140                         VI_LOCK(vp);
4141                 /*
4142                  * With v_usecount == 0, all we need to do is clear out the
4143                  * vnode data structures and we are done.
4144                  *
4145                  * If FORCECLOSE is set, forcibly close the vnode.
4146                  */
4147                 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
4148                         vgonel(vp);
4149                 } else {
4150                         busy++;
4151 #ifdef DIAGNOSTIC
4152                         if (busyprt)
4153                                 vn_printf(vp, "vflush: busy vnode ");
4154 #endif
4155                 }
4156                 VOP_UNLOCK(vp);
4157                 vdropl(vp);
4158         }
4159         if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
4160                 /*
4161                  * If just the root vnode is busy, and if its refcount
4162                  * is equal to `rootrefs', then go ahead and kill it.
4163                  */
4164                 VI_LOCK(rootvp);
4165                 KASSERT(busy > 0, ("vflush: not busy"));
4166                 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
4167                     ("vflush: usecount %d < rootrefs %d",
4168                      rootvp->v_usecount, rootrefs));
4169                 if (busy == 1 && rootvp->v_usecount == rootrefs) {
4170                         VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
4171                         vgone(rootvp);
4172                         VOP_UNLOCK(rootvp);
4173                         busy = 0;
4174                 } else
4175                         VI_UNLOCK(rootvp);
4176         }
4177         if (busy) {
4178                 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
4179                     busy);
4180                 return (EBUSY);
4181         }
4182         for (; rootrefs > 0; rootrefs--)
4183                 vrele(rootvp);
4184         return (0);
4185 }
4186
/*
 * Recycle a vnode that nobody is using.
 *
 * Takes the vnode interlock around a call to vrecyclel() and passes its
 * result through: non-zero when the vnode was recycled, zero otherwise.
 */
int
vrecycle(struct vnode *vp)
{
        int rv;

        VI_LOCK(vp);
        rv = vrecyclel(vp);
        VI_UNLOCK(vp);

        return (rv);
}
4200
4201 /*
4202  * vrecycle, with the vp interlock held.
4203  */
4204 int
4205 vrecyclel(struct vnode *vp)
4206 {
4207         int recycled;
4208
4209         ASSERT_VOP_ELOCKED(vp, __func__);
4210         ASSERT_VI_LOCKED(vp, __func__);
4211         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
4212         recycled = 0;
4213         if (vp->v_usecount == 0) {
4214                 recycled = 1;
4215                 vgonel(vp);
4216         }
4217         return (recycled);
4218 }
4219
/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 *
 * Convenience wrapper: takes the vnode interlock, calls vgonel() and
 * releases the interlock again.  vgonel() asserts that the vnode lock is
 * held exclusively, so the caller has to hold it already.
 */
void
vgone(struct vnode *vp)
{
        VI_LOCK(vp);
        vgonel(vp);
        VI_UNLOCK(vp);
}
4231
4232 /*
4233  * Notify upper mounts about reclaimed or unlinked vnode.
4234  */
4235 void
4236 vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event)
4237 {
4238         struct mount *mp;
4239         struct mount_upper_node *ump;
4240
4241         mp = atomic_load_ptr(&vp->v_mount);
4242         if (mp == NULL)
4243                 return;
4244         if (TAILQ_EMPTY(&mp->mnt_notify))
4245                 return;
4246
4247         MNT_ILOCK(mp);
4248         mp->mnt_upper_pending++;
4249         KASSERT(mp->mnt_upper_pending > 0,
4250             ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending));
4251         TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) {
4252                 MNT_IUNLOCK(mp);
4253                 switch (event) {
4254                 case VFS_NOTIFY_UPPER_RECLAIM:
4255                         VFS_RECLAIM_LOWERVP(ump->mp, vp);
4256                         break;
4257                 case VFS_NOTIFY_UPPER_UNLINK:
4258                         VFS_UNLINK_LOWERVP(ump->mp, vp);
4259                         break;
4260                 }
4261                 MNT_ILOCK(mp);
4262         }
4263         mp->mnt_upper_pending--;
4264         if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 &&
4265             mp->mnt_upper_pending == 0) {
4266                 mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER;
4267                 wakeup(&mp->mnt_uppers);
4268         }
4269         MNT_IUNLOCK(mp);
4270 }
4271
/*
 * vgone, with the vp interlock held.
 *
 * Dooms the vnode and tears it down: the name cache is purged, upper
 * mounts are notified, an in-use vnode is closed and inactivated, its
 * buffers are flushed or discarded, VOP_RECLAIM() is called, advisory
 * locks are purged, the vnode is taken off its mount's list, and finally
 * it is switched to dead_vnodeops with type VBAD and state VSTATE_DEAD.
 *
 * The caller must hold the vnode lock exclusively and the interlock, and
 * the vnode must have a non-zero hold count (all asserted below).  The
 * interlock is given up part-way through and is held again on return.
 * A vnode that is already doomed is left untouched.
 */
static void
vgonel(struct vnode *vp)
{
        struct thread *td;
        struct mount *mp;
        vm_object_t object;
        bool active, doinginact, oweinact;

        ASSERT_VOP_ELOCKED(vp, "vgonel");
        ASSERT_VI_LOCKED(vp, "vgonel");
        VNASSERT(vp->v_holdcnt, vp,
            ("vgonel: vp %p has no reference.", vp));
        CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
        td = curthread;

        /*
         * Don't vgonel if we're already doomed.
         */
        if (VN_IS_DOOMED(vp)) {
                VNPASS(vn_get_state(vp) == VSTATE_DESTROYING || \
                    vn_get_state(vp) == VSTATE_DEAD, vp);
                return;
        }
        /*
         * Paired with freevnode.
         */
        vn_seqc_write_begin_locked(vp);
        vunlazy_gone(vp);
        vn_irflag_set_locked(vp, VIRF_DOOMED);
        vn_set_state(vp, VSTATE_DESTROYING);

        /*
         * Check to see if the vnode is in use.  If so, we have to
         * call VOP_CLOSE() and VOP_INACTIVE().
         *
         * It could be that VOP_INACTIVE() requested reclamation, in
         * which case we should avoid recursion, so check
         * VI_DOINGINACT.  This is not precise but good enough.
         *
         * All three values are sampled here, while the interlock is
         * still held.
         */
        active = vp->v_usecount > 0;
        oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
        doinginact = (vp->v_iflag & VI_DOINGINACT) != 0;

        /*
         * If we need to do inactive VI_OWEINACT will be set.
         *
         * NOTE(review): vdropl() is expected to release the interlock,
         * matching the explicit VI_UNLOCK() in the other branch -- confirm.
         */
        if (vp->v_iflag & VI_DEFINACT) {
                VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
                vp->v_iflag &= ~VI_DEFINACT;
                vdropl(vp);
        } else {
                VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count"));
                VI_UNLOCK(vp);
        }
        cache_purge_vgone(vp);
        vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);

        /*
         * If purging an active vnode, it must be closed and
         * deactivated before being reclaimed.
         */
        if (active)
                VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
        if (!doinginact) {
                /*
                 * Repeat for as long as VI_OWEINACT is found set again
                 * after vinactivef() returns.
                 */
                do {
                        if (oweinact || active) {
                                VI_LOCK(vp);
                                vinactivef(vp);
                                oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
                                VI_UNLOCK(vp);
                        }
                } while (oweinact);
        }
        if (vp->v_type == VSOCK)
                vfs_unp_reclaim(vp);

        /*
         * Clean out any buffers associated with the vnode.
         * If the flush fails, just toss the buffers.
         */
        mp = NULL;
        if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
                (void) vn_start_secondary_write(vp, &mp, V_WAIT);
        if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
                /* Saving failed: discard without V_SAVE until it works. */
                while (vinvalbuf(vp, 0, 0, 0) != 0)
                        ;
        }

        BO_LOCK(&vp->v_bufobj);
        KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
            vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
            TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
            vp->v_bufobj.bo_clean.bv_cnt == 0,
            ("vp %p bufobj not invalidated", vp));

        /*
         * For VMIO bufobj, BO_DEAD is set later, or in
         * vm_object_terminate() after the object's page queue is
         * flushed.
         */
        object = vp->v_bufobj.bo_object;
        if (object == NULL)
                vp->v_bufobj.bo_flag |= BO_DEAD;
        BO_UNLOCK(&vp->v_bufobj);

        /*
         * Handle the VM part.  Tmpfs handles v_object on its own (the
         * OBJT_VNODE check).  Nullfs or other bypassing filesystems
         * should not touch the object borrowed from the lower vnode
         * (the handle check).
         */
        if (object != NULL && object->type == OBJT_VNODE &&
            object->handle == vp)
                vnode_destroy_vobject(vp);

        /*
         * Reclaim the vnode.
         */
        if (VOP_RECLAIM(vp))
                panic("vgone: cannot reclaim");
        /* Close the secondary write opened before the buffer flush. */
        if (mp != NULL)
                vn_finished_secondary_write(mp);
        VNASSERT(vp->v_object == NULL, vp,
            ("vop_reclaim left v_object vp=%p", vp));
        /*
         * Clear the advisory locks and wake up waiting threads.
         */
        if (vp->v_lockf != NULL) {
                (void)VOP_ADVLOCKPURGE(vp);
                vp->v_lockf = NULL;
        }
        /*
         * Delete from old mount point vnode list.
         *
         * Either branch leaves the interlock held: it is taken directly
         * when there is no mount, and asserted after delmntque().
         */
        if (vp->v_mount == NULL) {
                VI_LOCK(vp);
        } else {
                delmntque(vp);
                ASSERT_VI_LOCKED(vp, "vgonel 2");
        }
        /*
         * Done with purge, reset to the standard lock and invalidate
         * the vnode.
         */
        vp->v_vnlock = &vp->v_lock;
        vp->v_op = &dead_vnodeops;
        vp->v_type = VBAD;
        vn_set_state(vp, VSTATE_DEAD);
}
4424
/*
 * Print out a description of a vnode.
 *
 * vtypename[] maps a vnode type value to its printable name and is
 * indexed by vn_printf() with vp->v_type.  The static assertion breaks
 * the build when the table does not have exactly VLASTTYPE + 1 entries.
 */
static const char *const vtypename[] = {
        [VNON] = "VNON",
        [VREG] = "VREG",
        [VDIR] = "VDIR",
        [VBLK] = "VBLK",
        [VCHR] = "VCHR",
        [VLNK] = "VLNK",
        [VSOCK] = "VSOCK",
        [VFIFO] = "VFIFO",
        [VBAD] = "VBAD",
        [VMARKER] = "VMARKER",
};
_Static_assert(nitems(vtypename) == VLASTTYPE + 1,
    "vnode type name not added to vtypename");
4442
/*
 * Printable names for the vnode states, indexed by vn_printf() with
 * vp->v_state.  The first static assertion breaks the build when the
 * table does not have exactly VLASTSTATE + 1 entries.
 */
static const char *const vstatename[] = {
        [VSTATE_UNINITIALIZED] = "VSTATE_UNINITIALIZED",
        [VSTATE_CONSTRUCTED] = "VSTATE_CONSTRUCTED",
        [VSTATE_DESTROYING] = "VSTATE_DESTROYING",
        [VSTATE_DEAD] = "VSTATE_DEAD",
};
_Static_assert(nitems(vstatename) == VLASTSTATE + 1,
    "vnode state name not added to vstatename");

/*
 * vn_printf() names only VHOLD_NO_SMR among the hold count flags; fail
 * the build if VHOLD_ALL_FLAGS ever contains any other bit.
 */
_Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0,
    "new hold count flag not added to vn_printf");
4454
4455 void
4456 vn_printf(struct vnode *vp, const char *fmt, ...)
4457 {
4458         va_list ap;
4459         char buf[256], buf2[16];
4460         u_long flags;
4461         u_int holdcnt;
4462         short irflag;
4463
4464         va_start(ap, fmt);
4465         vprintf(fmt, ap);
4466         va_end(ap);
4467         printf("%p: ", (void *)vp);
4468         printf("type %s state %s op %p\n", vtypename[vp->v_type],
4469             vstatename[vp->v_state], vp->v_op);
4470         holdcnt = atomic_load_int(&vp->v_holdcnt);
4471         printf("    usecount %d, writecount %d, refcount %d seqc users %d",
4472             vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS,
4473             vp->v_seqc_users);
4474         switch (vp->v_type) {
4475         case VDIR:
4476                 printf(" mountedhere %p\n", vp->v_mountedhere);
4477                 break;
4478         case VCHR:
4479                 printf(" rdev %p\n", vp->v_rdev);
4480                 break;
4481         case VSOCK:
4482                 printf(" socket %p\n", vp->v_unpcb);
4483                 break;
4484         case VFIFO:
4485                 printf(" fifoinfo %p\n", vp->v_fifoinfo);
4486                 break;
4487         default:
4488                 printf("\n");
4489                 break;
4490         }
4491         buf[0] = '\0';
4492         buf[1] = '\0';
4493         if (holdcnt & VHOLD_NO_SMR)
4494                 strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf));
4495         printf("    hold count flags (%s)\n", buf + 1);
4496
4497         buf[0] = '\0';
4498         buf[1] = '\0';
4499         irflag = vn_irflag_read(vp);
4500         if (irflag & VIRF_DOOMED)
4501                 strlcat(buf, "|VIRF_DOOMED", sizeof(buf));
4502         if (irflag & VIRF_PGREAD)
4503                 strlcat(buf, "|VIRF_PGREAD", sizeof(buf));
4504         if (irflag & VIRF_MOUNTPOINT)
4505                 strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf));
4506         if (irflag & VIRF_TEXT_REF)
4507                 strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf));
4508         flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF);
4509         if (flags != 0) {
4510                 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags);
4511                 strlcat(buf, buf2, sizeof(buf));
4512         }
4513         if (vp->v_vflag & VV_ROOT)
4514                 strlcat(buf, "|VV_ROOT", sizeof(buf));
4515         if (vp->v_vflag & VV_ISTTY)
4516                 strlcat(buf, "|VV_ISTTY", sizeof(buf));
4517         if (vp->v_vflag & VV_NOSYNC)
4518                 strlcat(buf, "|VV_NOSYNC", sizeof(buf));
4519         if (vp->v_vflag & VV_ETERNALDEV)
4520                 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
4521         if (vp->v_vflag & VV_CACHEDLABEL)
4522                 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
4523         if (vp->v_vflag & VV_VMSIZEVNLOCK)
4524                 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf));
4525         if (vp->v_vflag & VV_COPYONWRITE)
4526                 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
4527         if (vp->v_vflag & VV_SYSTEM)
4528                 strlcat(buf, "|VV_SYSTEM", sizeof(buf));
4529         if (vp->v_vflag & VV_PROCDEP)
4530                 strlcat(buf, "|VV_PROCDEP", sizeof(buf));
4531         if (vp->v_vflag & VV_DELETED)
4532                 strlcat(buf, "|VV_DELETED", sizeof(buf));
4533         if (vp->v_vflag & VV_MD)
4534                 strlcat(buf, "|VV_MD", sizeof(buf));
4535         if (vp->v_vflag & VV_FORCEINSMQ)
4536                 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
4537         if (vp->v_vflag & VV_READLINK)
4538                 strlcat(buf, "|VV_READLINK", sizeof(buf));
4539         flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
4540             VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM |
4541             VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK);
4542         if (flags != 0) {
4543                 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
4544                 strlcat(buf, buf2, sizeof(buf));
4545         }
4546         if (vp->v_iflag & VI_MOUNT)
4547                 strlcat(buf, "|VI_MOUNT", sizeof(buf));
4548         if (vp->v_iflag & VI_DOINGINACT)
4549                 strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
4550         if (vp->v_iflag & VI_OWEINACT)
4551                 strlcat(buf, "|VI_OWEINACT", sizeof(buf));
4552         if (vp->v_iflag & VI_DEFINACT)
4553                 strlcat(buf, "|VI_DEFINACT", sizeof(buf));
4554         if (vp->v_iflag & VI_FOPENING)
4555                 strlcat(buf, "|VI_FOPENING", sizeof(buf));
4556         flags = vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT |
4557             VI_OWEINACT | VI_DEFINACT | VI_FOPENING);
4558         if (flags != 0) {
4559                 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
4560                 strlcat(buf, buf2, sizeof(buf));
4561         }
4562         if (vp->v_mflag & VMP_LAZYLIST)
4563                 strlcat(buf, "|VMP_LAZYLIST", sizeof(buf));
4564         flags = vp->v_mflag & ~(VMP_LAZYLIST);
4565         if (flags != 0) {
4566                 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags);
4567                 strlcat(buf, buf2, sizeof(buf));
4568         }
4569         printf("    flags (%s)", buf + 1);
4570         if (mtx_owned(VI_MTX(vp)))
4571                 printf(" VI_LOCKed");
4572         printf("\n");
4573         if (vp->v_object != NULL)
4574                 printf("    v_object %p ref %d pages %d "
4575                     "cleanbuf %d dirtybuf %d\n",
4576                     vp->v_object, vp->v_object->ref_count,
4577                     vp->v_object->resident_page_count,
4578                     vp->v_bufobj.bo_clean.bv_cnt,
4579                     vp->v_bufobj.bo_dirty.bv_cnt);
4580         printf("    ");
4581         lockmgr_printinfo(vp->v_vnlock);
4582         if (vp->v_data != NULL)
4583                 VOP_PRINT(vp);
4584 }
4585
4586 #ifdef DDB
4587 /*
4588  * List all of the locked vnodes in the system.
4589  * Called when debugging the kernel.
4590  */
4591 DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE)
4592 {
4593         struct mount *mp;
4594         struct vnode *vp;
4595
4596         /*
4597          * Note: because this is DDB, we can't obey the locking semantics
4598          * for these structures, which means we could catch an inconsistent
4599          * state and dereference a nasty pointer.  Not much to be done
4600          * about that.
4601          */
4602         db_printf("Locked vnodes\n");
4603         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
4604                 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
4605                         if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
4606                                 vn_printf(vp, "vnode ");
4607                 }
4608         }
4609 }
4610
4611 /*
4612  * Show details about the given vnode.
4613  */
4614 DB_SHOW_COMMAND(vnode, db_show_vnode)
4615 {
4616         struct vnode *vp;
4617
4618         if (!have_addr)
4619                 return;
4620         vp = (struct vnode *)addr;
4621         vn_printf(vp, "vnode ");
4622 }
4623
4624 /*
4625  * Show details about the given mount point.
4626  */
4627 DB_SHOW_COMMAND(mount, db_show_mount)
4628 {
4629         struct mount *mp;
4630         struct vfsopt *opt;
4631         struct statfs *sp;
4632         struct vnode *vp;
4633         char buf[512];
4634         uint64_t mflags;
4635         u_int flags;
4636
4637         if (!have_addr) {
4638                 /* No address given, print short info about all mount points. */
4639                 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
4640                         db_printf("%p %s on %s (%s)\n", mp,
4641                             mp->mnt_stat.f_mntfromname,
4642                             mp->mnt_stat.f_mntonname,
4643                             mp->mnt_stat.f_fstypename);
4644                         if (db_pager_quit)
4645                                 break;
4646                 }
4647                 db_printf("\nMore info: show mount <addr>\n");
4648                 return;
4649         }
4650
4651         mp = (struct mount *)addr;
4652         db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
4653             mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
4654
4655         buf[0] = '\0';
4656         mflags = mp->mnt_flag;
4657 #define MNT_FLAG(flag)  do {                                            \
4658         if (mflags & (flag)) {                                          \
4659                 if (buf[0] != '\0')                                     \
4660                         strlcat(buf, ", ", sizeof(buf));                \
4661                 strlcat(buf, (#flag) + 4, sizeof(buf));                 \
4662                 mflags &= ~(flag);                                      \
4663         }                                                               \
4664 } while (0)
4665         MNT_FLAG(MNT_RDONLY);
4666         MNT_FLAG(MNT_SYNCHRONOUS);
4667         MNT_FLAG(MNT_NOEXEC);
4668         MNT_FLAG(MNT_NOSUID);
4669         MNT_FLAG(MNT_NFS4ACLS);
4670         MNT_FLAG(MNT_UNION);
4671         MNT_FLAG(MNT_ASYNC);
4672         MNT_FLAG(MNT_SUIDDIR);
4673         MNT_FLAG(MNT_SOFTDEP);
4674         MNT_FLAG(MNT_NOSYMFOLLOW);
4675         MNT_FLAG(MNT_GJOURNAL);
4676         MNT_FLAG(MNT_MULTILABEL);
4677         MNT_FLAG(MNT_ACLS);
4678         MNT_FLAG(MNT_NOATIME);
4679         MNT_FLAG(MNT_NOCLUSTERR);
4680         MNT_FLAG(MNT_NOCLUSTERW);
4681         MNT_FLAG(MNT_SUJ);
4682         MNT_FLAG(MNT_EXRDONLY);
4683         MNT_FLAG(MNT_EXPORTED);
4684         MNT_FLAG(MNT_DEFEXPORTED);
4685         MNT_FLAG(MNT_EXPORTANON);
4686         MNT_FLAG(MNT_EXKERB);
4687         MNT_FLAG(MNT_EXPUBLIC);
4688         MNT_FLAG(MNT_LOCAL);
4689         MNT_FLAG(MNT_QUOTA);
4690         MNT_FLAG(MNT_ROOTFS);
4691         MNT_FLAG(MNT_USER);
4692         MNT_FLAG(MNT_IGNORE);
4693         MNT_FLAG(MNT_UPDATE);
4694         MNT_FLAG(MNT_DELEXPORT);
4695         MNT_FLAG(MNT_RELOAD);
4696         MNT_FLAG(MNT_FORCE);
4697         MNT_FLAG(MNT_SNAPSHOT);
4698         MNT_FLAG(MNT_BYFSID);
4699 #undef MNT_FLAG
4700         if (mflags != 0) {
4701                 if (buf[0] != '\0')
4702                         strlcat(buf, ", ", sizeof(buf));
4703                 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
4704                     "0x%016jx", mflags);
4705         }
4706         db_printf("    mnt_flag = %s\n", buf);
4707
4708         buf[0] = '\0';
4709         flags = mp->mnt_kern_flag;
4710 #define MNT_KERN_FLAG(flag)     do {                                    \
4711         if (flags & (flag)) {                                           \
4712                 if (buf[0] != '\0')                                     \
4713                         strlcat(buf, ", ", sizeof(buf));                \
4714                 strlcat(buf, (#flag) + 5, sizeof(buf));                 \
4715                 flags &= ~(flag);                                       \
4716         }                                                               \
4717 } while (0)
4718         MNT_KERN_FLAG(MNTK_UNMOUNTF);
4719         MNT_KERN_FLAG(MNTK_ASYNC);
4720         MNT_KERN_FLAG(MNTK_SOFTDEP);
4721         MNT_KERN_FLAG(MNTK_NOMSYNC);
4722         MNT_KERN_FLAG(MNTK_DRAINING);
4723         MNT_KERN_FLAG(MNTK_REFEXPIRE);
4724         MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
4725         MNT_KERN_FLAG(MNTK_SHARED_WRITES);
4726         MNT_KERN_FLAG(MNTK_NO_IOPF);
4727         MNT_KERN_FLAG(MNTK_RECURSE);
4728         MNT_KERN_FLAG(MNTK_UPPER_WAITER);
4729         MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE);
4730         MNT_KERN_FLAG(MNTK_USES_BCACHE);
4731         MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG);
4732         MNT_KERN_FLAG(MNTK_FPLOOKUP);
4733         MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER);
4734         MNT_KERN_FLAG(MNTK_NOASYNC);
4735         MNT_KERN_FLAG(MNTK_UNMOUNT);
4736         MNT_KERN_FLAG(MNTK_MWAIT);
4737         MNT_KERN_FLAG(MNTK_SUSPEND);
4738         MNT_KERN_FLAG(MNTK_SUSPEND2);
4739         MNT_KERN_FLAG(MNTK_SUSPENDED);
4740         MNT_KERN_FLAG(MNTK_NULL_NOCACHE);
4741         MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
4742 #undef MNT_KERN_FLAG
4743         if (flags != 0) {
4744                 if (buf[0] != '\0')
4745                         strlcat(buf, ", ", sizeof(buf));
4746                 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
4747                     "0x%08x", flags);
4748         }
4749         db_printf("    mnt_kern_flag = %s\n", buf);
4750
4751         db_printf("    mnt_opt = ");
4752         opt = TAILQ_FIRST(mp->mnt_opt);
4753         if (opt != NULL) {
4754                 db_printf("%s", opt->name);
4755                 opt = TAILQ_NEXT(opt, link);
4756                 while (opt != NULL) {
4757                         db_printf(", %s", opt->name);
4758                         opt = TAILQ_NEXT(opt, link);
4759                 }
4760         }
4761         db_printf("\n");
4762
4763         sp = &mp->mnt_stat;
4764         db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
4765             "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
4766             "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
4767             "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
4768             (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
4769             (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
4770             (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
4771             (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
4772             (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
4773             (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
4774             (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
4775             (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
4776
4777         db_printf("    mnt_cred = { uid=%u ruid=%u",
4778             (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
4779         if (jailed(mp->mnt_cred))
4780                 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
4781         db_printf(" }\n");
4782         db_printf("    mnt_ref = %d (with %d in the struct)\n",
4783             vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref);
4784         db_printf("    mnt_gen = %d\n", mp->mnt_gen);
4785         db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
4786         db_printf("    mnt_lazyvnodelistsize = %d\n",
4787             mp->mnt_lazyvnodelistsize);
4788         db_printf("    mnt_writeopcount = %d (with %d in the struct)\n",
4789             vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount);
4790         db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
4791         db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
4792         db_printf("    mnt_lockref = %d (with %d in the struct)\n",
4793             vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref);
4794         db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
4795         db_printf("    mnt_secondary_accwrites = %d\n",
4796             mp->mnt_secondary_accwrites);
4797         db_printf("    mnt_gjprovider = %s\n",
4798             mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
4799         db_printf("    mnt_vfs_ops = %d\n", mp->mnt_vfs_ops);
4800
4801         db_printf("\n\nList of active vnodes\n");
4802         TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
4803                 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) {
4804                         vn_printf(vp, "vnode ");
4805                         if (db_pager_quit)
4806                                 break;
4807                 }
4808         }
4809         db_printf("\n\nList of inactive vnodes\n");
4810         TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
4811                 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) {
4812                         vn_printf(vp, "vnode ");
4813                         if (db_pager_quit)
4814                                 break;
4815                 }
4816         }
4817 }
4818 #endif  /* DDB */
4819
4820 /*
4821  * Fill in a struct xvfsconf based on a struct vfsconf.
4822  */
4823 static int
4824 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
4825 {
4826         struct xvfsconf xvfsp;
4827
4828         bzero(&xvfsp, sizeof(xvfsp));
4829         strcpy(xvfsp.vfc_name, vfsp->vfc_name);
4830         xvfsp.vfc_typenum = vfsp->vfc_typenum;
4831         xvfsp.vfc_refcount = vfsp->vfc_refcount;
4832         xvfsp.vfc_flags = vfsp->vfc_flags;
4833         /*
4834          * These are unused in userland, we keep them
4835          * to not break binary compatibility.
4836          */
4837         xvfsp.vfc_vfsops = NULL;
4838         xvfsp.vfc_next = NULL;
4839         return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
4840 }
4841
4842 #ifdef COMPAT_FREEBSD32
/*
 * Record emitted by vfsconf2x32() for requests flagged SCTL_MASK32.
 * vfc_vfsops and vfc_next are never assigned by vfsconf2x32() and so go
 * out as zero.
 * NOTE(review): presumably this mirrors the layout of struct xvfsconf as
 * seen by 32-bit processes, with the pointer members narrowed to
 * uint32_t -- confirm against the userland definition.
 */
struct xvfsconf32 {
        uint32_t        vfc_vfsops;
        char            vfc_name[MFSNAMELEN];
        int32_t         vfc_typenum;
        int32_t         vfc_refcount;
        int32_t         vfc_flags;
        uint32_t        vfc_next;
};
4851
4852 static int
4853 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
4854 {
4855         struct xvfsconf32 xvfsp;
4856
4857         bzero(&xvfsp, sizeof(xvfsp));
4858         strcpy(xvfsp.vfc_name, vfsp->vfc_name);
4859         xvfsp.vfc_typenum = vfsp->vfc_typenum;
4860         xvfsp.vfc_refcount = vfsp->vfc_refcount;
4861         xvfsp.vfc_flags = vfsp->vfc_flags;
4862         return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
4863 }
4864 #endif
4865
4866 /*
4867  * Top level filesystem related information gathering.
4868  */
4869 static int
4870 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
4871 {
4872         struct vfsconf *vfsp;
4873         int error;
4874
4875         error = 0;
4876         vfsconf_slock();
4877         TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
4878 #ifdef COMPAT_FREEBSD32
4879                 if (req->flags & SCTL_MASK32)
4880                         error = vfsconf2x32(req, vfsp);
4881                 else
4882 #endif
4883                         error = vfsconf2x(req, vfsp);
4884                 if (error)
4885                         break;
4886         }
4887         vfsconf_sunlock();
4888         return (error);
4889 }
4890
/* Read-only, opaque "conflist" node under _vfs, served by sysctl_vfs_conflist(). */
SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
    "S,xvfsconf", "List of all configured filesystems");
4894
4895 #ifndef BURN_BRIDGES
/* Forward declaration: vfs_sysctl() calls this ahead of its definition. */
static int      sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
4897
4898 static int
4899 vfs_sysctl(SYSCTL_HANDLER_ARGS)
4900 {
4901         int *name = (int *)arg1 - 1;    /* XXX */
4902         u_int namelen = arg2 + 1;       /* XXX */
4903         struct vfsconf *vfsp;
4904
4905         log(LOG_WARNING, "userland calling deprecated sysctl, "
4906             "please rebuild world\n");
4907
4908 #if 1 || defined(COMPAT_PRELITE2)
4909         /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
4910         if (namelen == 1)
4911                 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
4912 #endif
4913
4914         switch (name[1]) {
4915         case VFS_MAXTYPENUM:
4916                 if (namelen != 2)
4917                         return (ENOTDIR);
4918                 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
4919         case VFS_CONF:
4920                 if (namelen != 3)
4921                         return (ENOTDIR);       /* overloaded */
4922                 vfsconf_slock();
4923                 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
4924                         if (vfsp->vfc_typenum == name[2])
4925                                 break;
4926                 }
4927                 vfsconf_sunlock();
4928                 if (vfsp == NULL)
4929                         return (EOPNOTSUPP);
4930 #ifdef COMPAT_FREEBSD32
4931                 if (req->flags & SCTL_MASK32)
4932                         return (vfsconf2x32(req, vfsp));
4933                 else
4934 #endif
4935                         return (vfsconf2x(req, vfsp));
4936         }
4937         return (EOPNOTSUPP);
4938 }
4939
/* Legacy "generic" node under _vfs (number VFS_GENERIC), served by vfs_sysctl(). */
static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
    CTLFLAG_MPSAFE, vfs_sysctl,
    "Generic filesystem");
4943
4944 #if 1 || defined(COMPAT_PRELITE2)
4945
4946 static int
4947 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
4948 {
4949         int error;
4950         struct vfsconf *vfsp;
4951         struct ovfsconf ovfs;
4952
4953         vfsconf_slock();
4954         TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
4955                 bzero(&ovfs, sizeof(ovfs));
4956                 ovfs.vfc_vfsops = vfsp->vfc_vfsops;     /* XXX used as flag */
4957                 strcpy(ovfs.vfc_name, vfsp->vfc_name);
4958                 ovfs.vfc_index = vfsp->vfc_typenum;
4959                 ovfs.vfc_refcount = vfsp->vfc_refcount;
4960                 ovfs.vfc_flags = vfsp->vfc_flags;
4961                 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
4962                 if (error != 0) {
4963                         vfsconf_sunlock();
4964                         return (error);
4965                 }
4966         }
4967         vfsconf_sunlock();
4968         return (0);
4969 }
4970
4971 #endif /* 1 || COMPAT_PRELITE2 */
4972 #endif /* !BURN_BRIDGES */
4973
4974 static void
4975 unmount_or_warn(struct mount *mp)
4976 {
4977         int error;
4978
4979         error = dounmount(mp, MNT_FORCE, curthread);
4980         if (error != 0) {
4981                 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
4982                 if (error == EBUSY)
4983                         printf("BUSY)\n");
4984                 else
4985                         printf("%d)\n", error);
4986         }
4987 }
4988
4989 /*
4990  * Unmount all filesystems. The list is traversed in reverse order
4991  * of mounting to avoid dependencies.
4992  */
4993 void
4994 vfs_unmountall(void)
4995 {
4996         struct mount *mp, *tmp;
4997
4998         CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
4999
5000         /*
5001          * Since this only runs when rebooting, it is not interlocked.
5002          */
5003         TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
5004                 vfs_ref(mp);
5005
5006                 /*
5007                  * Forcibly unmounting "/dev" before "/" would prevent clean
5008                  * unmount of the latter.
5009                  */
5010                 if (mp == rootdevmp)
5011                         continue;
5012
5013                 unmount_or_warn(mp);
5014         }
5015
5016         if (rootdevmp != NULL)
5017                 unmount_or_warn(rootdevmp);
5018 }
5019
5020 static void
5021 vfs_deferred_inactive(struct vnode *vp, int lkflags)
5022 {
5023
5024         ASSERT_VI_LOCKED(vp, __func__);
5025         VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp);
5026         if ((vp->v_iflag & VI_OWEINACT) == 0) {
5027                 vdropl(vp);
5028                 return;
5029         }
5030         if (vn_lock(vp, lkflags) == 0) {
5031                 VI_LOCK(vp);
5032                 vinactive(vp);
5033                 VOP_UNLOCK(vp);
5034                 vdropl(vp);
5035                 return;
5036         }
5037         vdefer_inactive_unlocked(vp);
5038 }
5039
5040 static int
5041 vfs_periodic_inactive_filter(struct vnode *vp, void *arg)
5042 {
5043
5044         return (vp->v_iflag & VI_DEFINACT);
5045 }
5046
5047 static void __noinline
5048 vfs_periodic_inactive(struct mount *mp, int flags)
5049 {
5050         struct vnode *vp, *mvp;
5051         int lkflags;
5052
5053         lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
5054         if (flags != MNT_WAIT)
5055                 lkflags |= LK_NOWAIT;
5056
5057         MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) {
5058                 if ((vp->v_iflag & VI_DEFINACT) == 0) {
5059                         VI_UNLOCK(vp);
5060                         continue;
5061                 }
5062                 vp->v_iflag &= ~VI_DEFINACT;
5063                 vfs_deferred_inactive(vp, lkflags);
5064         }
5065 }
5066
5067 static inline bool
5068 vfs_want_msync(struct vnode *vp)
5069 {
5070         struct vm_object *obj;
5071
5072         /*
5073          * This test may be performed without any locks held.
5074          * We rely on vm_object's type stability.
5075          */
5076         if (vp->v_vflag & VV_NOSYNC)
5077                 return (false);
5078         obj = vp->v_object;
5079         return (obj != NULL && vm_object_mightbedirty(obj));
5080 }
5081
5082 static int
5083 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused)
5084 {
5085
5086         if (vp->v_vflag & VV_NOSYNC)
5087                 return (false);
5088         if (vp->v_iflag & VI_DEFINACT)
5089                 return (true);
5090         return (vfs_want_msync(vp));
5091 }
5092
5093 static void __noinline
5094 vfs_periodic_msync_inactive(struct mount *mp, int flags)
5095 {
5096         struct vnode *vp, *mvp;
5097         struct vm_object *obj;
5098         int lkflags, objflags;
5099         bool seen_defer;
5100
5101         lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
5102         if (flags != MNT_WAIT) {
5103                 lkflags |= LK_NOWAIT;
5104                 objflags = OBJPC_NOSYNC;
5105         } else {
5106                 objflags = OBJPC_SYNC;
5107         }
5108
5109         MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) {
5110                 seen_defer = false;
5111                 if (vp->v_iflag & VI_DEFINACT) {
5112                         vp->v_iflag &= ~VI_DEFINACT;
5113                         seen_defer = true;
5114                 }
5115                 if (!vfs_want_msync(vp)) {
5116                         if (seen_defer)
5117                                 vfs_deferred_inactive(vp, lkflags);
5118                         else
5119                                 VI_UNLOCK(vp);
5120                         continue;
5121                 }
5122                 if (vget(vp, lkflags) == 0) {
5123                         obj = vp->v_object;
5124                         if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) {
5125                                 VM_OBJECT_WLOCK(obj);
5126                                 vm_object_page_clean(obj, 0, 0, objflags);
5127                                 VM_OBJECT_WUNLOCK(obj);
5128                         }
5129                         vput(vp);
5130                         if (seen_defer)
5131                                 vdrop(vp);
5132                 } else {
5133                         if (seen_defer)
5134                                 vdefer_inactive_unlocked(vp);
5135                 }
5136         }
5137 }
5138
5139 void
5140 vfs_periodic(struct mount *mp, int flags)
5141 {
5142
5143         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
5144
5145         if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0)
5146                 vfs_periodic_inactive(mp, flags);
5147         else
5148                 vfs_periodic_msync_inactive(mp, flags);
5149 }
5150
5151 static void
5152 destroy_vpollinfo_free(struct vpollinfo *vi)
5153 {
5154
5155         knlist_destroy(&vi->vpi_selinfo.si_note);
5156         mtx_destroy(&vi->vpi_lock);
5157         free(vi, M_VNODEPOLL);
5158 }
5159
5160 static void
5161 destroy_vpollinfo(struct vpollinfo *vi)
5162 {
5163
5164         knlist_clear(&vi->vpi_selinfo.si_note, 1);
5165         seldrain(&vi->vpi_selinfo);
5166         destroy_vpollinfo_free(vi);
5167 }
5168
5169 /*
5170  * Initialize per-vnode helper structure to hold poll-related state.
5171  */
5172 void
5173 v_addpollinfo(struct vnode *vp)
5174 {
5175         struct vpollinfo *vi;
5176
5177         if (vp->v_pollinfo != NULL)
5178                 return;
5179         vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO);
5180         mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
5181         knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
5182             vfs_knlunlock, vfs_knl_assert_lock);
5183         VI_LOCK(vp);
5184         if (vp->v_pollinfo != NULL) {
5185                 VI_UNLOCK(vp);
5186                 destroy_vpollinfo_free(vi);
5187                 return;
5188         }
5189         vp->v_pollinfo = vi;
5190         VI_UNLOCK(vp);
5191 }
5192
5193 /*
5194  * Record a process's interest in events which might happen to
5195  * a vnode.  Because poll uses the historic select-style interface
5196  * internally, this routine serves as both the ``check for any
5197  * pending events'' and the ``record my interest in future events''
5198  * functions.  (These are done together, while the lock is held,
5199  * to avoid race conditions.)
5200  */
5201 int
5202 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
5203 {
5204
5205         v_addpollinfo(vp);
5206         mtx_lock(&vp->v_pollinfo->vpi_lock);
5207         if (vp->v_pollinfo->vpi_revents & events) {
5208                 /*
5209                  * This leaves events we are not interested
5210                  * in available for the other process which
5211                  * which presumably had requested them
5212                  * (otherwise they would never have been
5213                  * recorded).
5214                  */
5215                 events &= vp->v_pollinfo->vpi_revents;
5216                 vp->v_pollinfo->vpi_revents &= ~events;
5217
5218                 mtx_unlock(&vp->v_pollinfo->vpi_lock);
5219                 return (events);
5220         }
5221         vp->v_pollinfo->vpi_events |= events;
5222         selrecord(td, &vp->v_pollinfo->vpi_selinfo);
5223         mtx_unlock(&vp->v_pollinfo->vpi_lock);
5224         return (0);
5225 }
5226
5227 /*
5228  * Routine to create and manage a filesystem syncer vnode.
5229  */
5230 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
5231 static int      sync_fsync(struct  vop_fsync_args *);
5232 static int      sync_inactive(struct  vop_inactive_args *);
5233 static int      sync_reclaim(struct  vop_reclaim_args *);
5234
5235 static struct vop_vector sync_vnodeops = {
5236         .vop_bypass =   VOP_EOPNOTSUPP,
5237         .vop_close =    sync_close,
5238         .vop_fsync =    sync_fsync,
5239         .vop_getwritemount = vop_stdgetwritemount,
5240         .vop_inactive = sync_inactive,
5241         .vop_need_inactive = vop_stdneed_inactive,
5242         .vop_reclaim =  sync_reclaim,
5243         .vop_lock1 =    vop_stdlock,
5244         .vop_unlock =   vop_stdunlock,
5245         .vop_islocked = vop_stdislocked,
5246         .vop_fplookup_vexec = VOP_EAGAIN,
5247         .vop_fplookup_symlink = VOP_EAGAIN,
5248 };
5249 VFS_VOP_VECTOR_REGISTER(sync_vnodeops);
5250
5251 /*
5252  * Create a new filesystem syncer vnode for the specified mount point.
5253  */
5254 void
5255 vfs_allocate_syncvnode(struct mount *mp)
5256 {
5257         struct vnode *vp;
5258         struct bufobj *bo;
5259         static long start, incr, next;
5260         int error;
5261
5262         /* Allocate a new vnode */
5263         error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
5264         if (error != 0)
5265                 panic("vfs_allocate_syncvnode: getnewvnode() failed");
5266         vp->v_type = VNON;
5267         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
5268         vp->v_vflag |= VV_FORCEINSMQ;
5269         error = insmntque1(vp, mp);
5270         if (error != 0)
5271                 panic("vfs_allocate_syncvnode: insmntque() failed");
5272         vp->v_vflag &= ~VV_FORCEINSMQ;
5273         vn_set_state(vp, VSTATE_CONSTRUCTED);
5274         VOP_UNLOCK(vp);
5275         /*
5276          * Place the vnode onto the syncer worklist. We attempt to
5277          * scatter them about on the list so that they will go off
5278          * at evenly distributed times even if all the filesystems
5279          * are mounted at once.
5280          */
5281         next += incr;
5282         if (next == 0 || next > syncer_maxdelay) {
5283                 start /= 2;
5284                 incr /= 2;
5285                 if (start == 0) {
5286                         start = syncer_maxdelay / 2;
5287                         incr = syncer_maxdelay;
5288                 }
5289                 next = start;
5290         }
5291         bo = &vp->v_bufobj;
5292         BO_LOCK(bo);
5293         vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
5294         /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
5295         mtx_lock(&sync_mtx);
5296         sync_vnode_count++;
5297         if (mp->mnt_syncer == NULL) {
5298                 mp->mnt_syncer = vp;
5299                 vp = NULL;
5300         }
5301         mtx_unlock(&sync_mtx);
5302         BO_UNLOCK(bo);
5303         if (vp != NULL) {
5304                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
5305                 vgone(vp);
5306                 vput(vp);
5307         }
5308 }
5309
5310 void
5311 vfs_deallocate_syncvnode(struct mount *mp)
5312 {
5313         struct vnode *vp;
5314
5315         mtx_lock(&sync_mtx);
5316         vp = mp->mnt_syncer;
5317         if (vp != NULL)
5318                 mp->mnt_syncer = NULL;
5319         mtx_unlock(&sync_mtx);
5320         if (vp != NULL)
5321                 vrele(vp);
5322 }
5323
5324 /*
5325  * Do a lazy sync of the filesystem.
5326  */
5327 static int
5328 sync_fsync(struct vop_fsync_args *ap)
5329 {
5330         struct vnode *syncvp = ap->a_vp;
5331         struct mount *mp = syncvp->v_mount;
5332         int error, save;
5333         struct bufobj *bo;
5334
5335         /*
5336          * We only need to do something if this is a lazy evaluation.
5337          */
5338         if (ap->a_waitfor != MNT_LAZY)
5339                 return (0);
5340
5341         /*
5342          * Move ourselves to the back of the sync list.
5343          */
5344         bo = &syncvp->v_bufobj;
5345         BO_LOCK(bo);
5346         vn_syncer_add_to_worklist(bo, syncdelay);
5347         BO_UNLOCK(bo);
5348
5349         /*
5350          * Walk the list of vnodes pushing all that are dirty and
5351          * not already on the sync list.
5352          */
5353         if (vfs_busy(mp, MBF_NOWAIT) != 0)
5354                 return (0);
5355         VOP_UNLOCK(syncvp);
5356         save = curthread_pflags_set(TDP_SYNCIO);
5357         /*
5358          * The filesystem at hand may be idle with free vnodes stored in the
5359          * batch.  Return them instead of letting them stay there indefinitely.
5360          */
5361         vfs_periodic(mp, MNT_NOWAIT);
5362         error = VFS_SYNC(mp, MNT_LAZY);
5363         curthread_pflags_restore(save);
5364         vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY);
5365         vfs_unbusy(mp);
5366         return (error);
5367 }
5368
5369 /*
5370  * The syncer vnode is no referenced.
5371  */
5372 static int
5373 sync_inactive(struct vop_inactive_args *ap)
5374 {
5375
5376         vgone(ap->a_vp);
5377         return (0);
5378 }
5379
5380 /*
5381  * The syncer vnode is no longer needed and is being decommissioned.
5382  *
5383  * Modifications to the worklist must be protected by sync_mtx.
5384  */
5385 static int
5386 sync_reclaim(struct vop_reclaim_args *ap)
5387 {
5388         struct vnode *vp = ap->a_vp;
5389         struct bufobj *bo;
5390
5391         bo = &vp->v_bufobj;
5392         BO_LOCK(bo);
5393         mtx_lock(&sync_mtx);
5394         if (vp->v_mount->mnt_syncer == vp)
5395                 vp->v_mount->mnt_syncer = NULL;
5396         if (bo->bo_flag & BO_ONWORKLST) {
5397                 LIST_REMOVE(bo, bo_synclist);
5398                 syncer_worklist_len--;
5399                 sync_vnode_count--;
5400                 bo->bo_flag &= ~BO_ONWORKLST;
5401         }
5402         mtx_unlock(&sync_mtx);
5403         BO_UNLOCK(bo);
5404
5405         return (0);
5406 }
5407
5408 int
5409 vn_need_pageq_flush(struct vnode *vp)
5410 {
5411         struct vm_object *obj;
5412
5413         obj = vp->v_object;
5414         return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
5415             vm_object_mightbedirty(obj));
5416 }
5417
5418 /*
5419  * Check if vnode represents a disk device
5420  */
5421 bool
5422 vn_isdisk_error(struct vnode *vp, int *errp)
5423 {
5424         int error;
5425
5426         if (vp->v_type != VCHR) {
5427                 error = ENOTBLK;
5428                 goto out;
5429         }
5430         error = 0;
5431         dev_lock();
5432         if (vp->v_rdev == NULL)
5433                 error = ENXIO;
5434         else if (vp->v_rdev->si_devsw == NULL)
5435                 error = ENXIO;
5436         else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
5437                 error = ENOTBLK;
5438         dev_unlock();
5439 out:
5440         *errp = error;
5441         return (error == 0);
5442 }
5443
5444 bool
5445 vn_isdisk(struct vnode *vp)
5446 {
5447         int error;
5448
5449         return (vn_isdisk_error(vp, &error));
5450 }
5451
5452 /*
5453  * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
5454  * the comment above cache_fplookup for details.
5455  */
5456 int
5457 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred)
5458 {
5459         int error;
5460
5461         VFS_SMR_ASSERT_ENTERED();
5462
5463         /* Check the owner. */
5464         if (cred->cr_uid == file_uid) {
5465                 if (file_mode & S_IXUSR)
5466                         return (0);
5467                 goto out_error;
5468         }
5469
5470         /* Otherwise, check the groups (first match) */
5471         if (groupmember(file_gid, cred)) {
5472                 if (file_mode & S_IXGRP)
5473                         return (0);
5474                 goto out_error;
5475         }
5476
5477         /* Otherwise, check everyone else. */
5478         if (file_mode & S_IXOTH)
5479                 return (0);
5480 out_error:
5481         /*
5482          * Permission check failed, but it is possible denial will get overwritten
5483          * (e.g., when root is traversing through a 700 directory owned by someone
5484          * else).
5485          *
5486          * vaccess() calls priv_check_cred which in turn can descent into MAC
5487          * modules overriding this result. It's quite unclear what semantics
5488          * are allowed for them to operate, thus for safety we don't call them
5489          * from within the SMR section. This also means if any such modules
5490          * are present, we have to let the regular lookup decide.
5491          */
5492         error = priv_check_cred_vfs_lookup_nomac(cred);
5493         switch (error) {
5494         case 0:
5495                 return (0);
5496         case EAGAIN:
5497                 /*
5498                  * MAC modules present.
5499                  */
5500                 return (EAGAIN);
5501         case EPERM:
5502                 return (EACCES);
5503         default:
5504                 return (error);
5505         }
5506 }
5507
5508 /*
5509  * Common filesystem object access control check routine.  Accepts a
5510  * vnode's type, "mode", uid and gid, requested access mode, and credentials.
5511  * Returns 0 on success, or an errno on failure.
5512  */
5513 int
5514 vaccess(__enum_uint8(vtype) type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
5515     accmode_t accmode, struct ucred *cred)
5516 {
5517         accmode_t dac_granted;
5518         accmode_t priv_granted;
5519
5520         KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
5521             ("invalid bit in accmode"));
5522         KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
5523             ("VAPPEND without VWRITE"));
5524
5525         /*
5526          * Look for a normal, non-privileged way to access the file/directory
5527          * as requested.  If it exists, go with that.
5528          */
5529
5530         dac_granted = 0;
5531
5532         /* Check the owner. */
5533         if (cred->cr_uid == file_uid) {
5534                 dac_granted |= VADMIN;
5535                 if (file_mode & S_IXUSR)
5536                         dac_granted |= VEXEC;
5537                 if (file_mode & S_IRUSR)
5538                         dac_granted |= VREAD;
5539                 if (file_mode & S_IWUSR)
5540                         dac_granted |= (VWRITE | VAPPEND);
5541
5542                 if ((accmode & dac_granted) == accmode)
5543                         return (0);
5544
5545                 goto privcheck;
5546         }
5547
5548         /* Otherwise, check the groups (first match) */
5549         if (groupmember(file_gid, cred)) {
5550                 if (file_mode & S_IXGRP)
5551                         dac_granted |= VEXEC;
5552                 if (file_mode & S_IRGRP)
5553                         dac_granted |= VREAD;
5554                 if (file_mode & S_IWGRP)
5555                         dac_granted |= (VWRITE | VAPPEND);
5556
5557                 if ((accmode & dac_granted) == accmode)
5558                         return (0);
5559
5560                 goto privcheck;
5561         }
5562
5563         /* Otherwise, check everyone else. */
5564         if (file_mode & S_IXOTH)
5565                 dac_granted |= VEXEC;
5566         if (file_mode & S_IROTH)
5567                 dac_granted |= VREAD;
5568         if (file_mode & S_IWOTH)
5569                 dac_granted |= (VWRITE | VAPPEND);
5570         if ((accmode & dac_granted) == accmode)
5571                 return (0);
5572
5573 privcheck:
5574         /*
5575          * Build a privilege mask to determine if the set of privileges
5576          * satisfies the requirements when combined with the granted mask
5577          * from above.  For each privilege, if the privilege is required,
5578          * bitwise or the request type onto the priv_granted mask.
5579          */
5580         priv_granted = 0;
5581
5582         if (type == VDIR) {
5583                 /*
5584                  * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
5585                  * requests, instead of PRIV_VFS_EXEC.
5586                  */
5587                 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
5588                     !priv_check_cred(cred, PRIV_VFS_LOOKUP))
5589                         priv_granted |= VEXEC;
5590         } else {
5591                 /*
5592                  * Ensure that at least one execute bit is on. Otherwise,
5593                  * a privileged user will always succeed, and we don't want
5594                  * this to happen unless the file really is executable.
5595                  */
5596                 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
5597                     (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
5598                     !priv_check_cred(cred, PRIV_VFS_EXEC))
5599                         priv_granted |= VEXEC;
5600         }
5601
5602         if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
5603             !priv_check_cred(cred, PRIV_VFS_READ))
5604                 priv_granted |= VREAD;
5605
5606         if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
5607             !priv_check_cred(cred, PRIV_VFS_WRITE))
5608                 priv_granted |= (VWRITE | VAPPEND);
5609
5610         if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
5611             !priv_check_cred(cred, PRIV_VFS_ADMIN))
5612                 priv_granted |= VADMIN;
5613
5614         if ((accmode & (priv_granted | dac_granted)) == accmode) {
5615                 return (0);
5616         }
5617
5618         return ((accmode & VADMIN) ? EPERM : EACCES);
5619 }
5620
5621 /*
5622  * Credential check based on process requesting service, and per-attribute
5623  * permissions.
5624  */
5625 int
5626 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
5627     struct thread *td, accmode_t accmode)
5628 {
5629
5630         /*
5631          * Kernel-invoked always succeeds.
5632          */
5633         if (cred == NOCRED)
5634                 return (0);
5635
5636         /*
5637          * Do not allow privileged processes in jail to directly manipulate
5638          * system attributes.
5639          */
5640         switch (attrnamespace) {
5641         case EXTATTR_NAMESPACE_SYSTEM:
5642                 /* Potentially should be: return (EPERM); */
5643                 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM));
5644         case EXTATTR_NAMESPACE_USER:
5645                 return (VOP_ACCESS(vp, accmode, cred, td));
5646         default:
5647                 return (EPERM);
5648         }
5649 }
5650
5651 #ifdef DEBUG_VFS_LOCKS
/*
 * Knobs consulted by the vnode lock assertion code that follows.  Each one
 * defaults to 1 and is exported read-write under the debug sysctl tree; the
 * backtrace knob only exists when KDB is configured.
 */
int vfs_badlock_ddb = 1;        /* Drop into debugger on violation. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
    "Drop into debugger on lock violation");

int vfs_badlock_mutex = 1;      /* Check for interlock across VOPs. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
    0, "Check for interlock across VOPs");

int vfs_badlock_print = 1;      /* Print lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
    0, "Print lock violations");

int vfs_badlock_vnode = 1;      /* Print vnode details on lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode,
    0, "Print vnode details on lock violations");

#ifdef KDB
int vfs_badlock_backtrace = 1;  /* Print backtrace at lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
    &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
#endif
5673
5674 static void
5675 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
5676 {
5677
5678 #ifdef KDB
5679         if (vfs_badlock_backtrace)
5680                 kdb_backtrace();
5681 #endif
5682         if (vfs_badlock_vnode)
5683                 vn_printf(vp, "vnode ");
5684         if (vfs_badlock_print)
5685                 printf("%s: %p %s\n", str, (void *)vp, msg);
5686         if (vfs_badlock_ddb)
5687                 kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
5688 }
5689
5690 void
5691 assert_vi_locked(struct vnode *vp, const char *str)
5692 {
5693
5694         if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
5695                 vfs_badlock("interlock is not locked but should be", str, vp);
5696 }
5697
5698 void
5699 assert_vi_unlocked(struct vnode *vp, const char *str)
5700 {
5701
5702         if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
5703                 vfs_badlock("interlock is locked but should not be", str, vp);
5704 }
5705
5706 void
5707 assert_vop_locked(struct vnode *vp, const char *str)
5708 {
5709         if (KERNEL_PANICKED() || vp == NULL)
5710                 return;
5711
5712 #ifdef WITNESS
5713         if ((vp->v_irflag & VIRF_CROSSMP) == 0 &&
5714             witness_is_owned(&vp->v_vnlock->lock_object) == -1)
5715 #else
5716         int locked = VOP_ISLOCKED(vp);
5717         if (locked == 0 || locked == LK_EXCLOTHER)
5718 #endif
5719                 vfs_badlock("is not locked but should be", str, vp);
5720 }
5721
5722 void
5723 assert_vop_unlocked(struct vnode *vp, const char *str)
5724 {
5725         if (KERNEL_PANICKED() || vp == NULL)
5726                 return;
5727
5728 #ifdef WITNESS
5729         if ((vp->v_irflag & VIRF_CROSSMP) == 0 &&
5730             witness_is_owned(&vp->v_vnlock->lock_object) == 1)
5731 #else
5732         if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
5733 #endif
5734                 vfs_badlock("is locked but should not be", str, vp);
5735 }
5736
5737 void
5738 assert_vop_elocked(struct vnode *vp, const char *str)
5739 {
5740         if (KERNEL_PANICKED() || vp == NULL)
5741                 return;
5742
5743         if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
5744                 vfs_badlock("is not exclusive locked but should be", str, vp);
5745 }
5746 #endif /* DEBUG_VFS_LOCKS */
5747
5748 void
5749 vop_rename_fail(struct vop_rename_args *ap)
5750 {
5751
5752         if (ap->a_tvp != NULL)
5753                 vput(ap->a_tvp);
5754         if (ap->a_tdvp == ap->a_tvp)
5755                 vrele(ap->a_tdvp);
5756         else
5757                 vput(ap->a_tdvp);
5758         vrele(ap->a_fdvp);
5759         vrele(ap->a_fvp);
5760 }
5761
5762 void
5763 vop_rename_pre(void *ap)
5764 {
5765         struct vop_rename_args *a = ap;
5766
5767 #ifdef DEBUG_VFS_LOCKS
5768         if (a->a_tvp)
5769                 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
5770         ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
5771         ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
5772         ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
5773
5774         /* Check the source (from). */
5775         if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
5776             (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
5777                 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
5778         if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
5779                 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
5780
5781         /* Check the target. */
5782         if (a->a_tvp)
5783                 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
5784         ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
5785 #endif
5786         /*
5787          * It may be tempting to add vn_seqc_write_begin/end calls here and
5788          * in vop_rename_post but that's not going to work out since some
5789          * filesystems relookup vnodes mid-rename. This is probably a bug.
5790          *
5791          * For now filesystems are expected to do the relevant calls after they
5792          * decide what vnodes to operate on.
5793          */
5794         if (a->a_tdvp != a->a_fdvp)
5795                 vhold(a->a_fdvp);
5796         if (a->a_tvp != a->a_fvp)
5797                 vhold(a->a_fvp);
5798         vhold(a->a_tdvp);
5799         if (a->a_tvp)
5800                 vhold(a->a_tvp);
5801 }
5802
5803 #ifdef DEBUG_VFS_LOCKS
5804 void
5805 vop_fplookup_vexec_debugpre(void *ap __unused)
5806 {
5807
5808         VFS_SMR_ASSERT_ENTERED();
5809 }
5810
5811 void
5812 vop_fplookup_vexec_debugpost(void *ap, int rc)
5813 {
5814         struct vop_fplookup_vexec_args *a;
5815         struct vnode *vp;
5816
5817         a = ap;
5818         vp = a->a_vp;
5819
5820         VFS_SMR_ASSERT_ENTERED();
5821         if (rc == EOPNOTSUPP)
5822                 VNPASS(VN_IS_DOOMED(vp), vp);
5823 }
5824
5825 void
5826 vop_fplookup_symlink_debugpre(void *ap __unused)
5827 {
5828
5829         VFS_SMR_ASSERT_ENTERED();
5830 }
5831
5832 void
5833 vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused)
5834 {
5835
5836         VFS_SMR_ASSERT_ENTERED();
5837 }
5838
5839 static void
5840 vop_fsync_debugprepost(struct vnode *vp, const char *name)
5841 {
5842         if (vp->v_type == VCHR)
5843                 ;
5844         else if (MNT_EXTENDED_SHARED(vp->v_mount))
5845                 ASSERT_VOP_LOCKED(vp, name);
5846         else
5847                 ASSERT_VOP_ELOCKED(vp, name);
5848 }
5849
5850 void
5851 vop_fsync_debugpre(void *a)
5852 {
5853         struct vop_fsync_args *ap;
5854
5855         ap = a;
5856         vop_fsync_debugprepost(ap->a_vp, "fsync");
5857 }
5858
5859 void
5860 vop_fsync_debugpost(void *a, int rc __unused)
5861 {
5862         struct vop_fsync_args *ap;
5863
5864         ap = a;
5865         vop_fsync_debugprepost(ap->a_vp, "fsync");
5866 }
5867
5868 void
5869 vop_fdatasync_debugpre(void *a)
5870 {
5871         struct vop_fdatasync_args *ap;
5872
5873         ap = a;
5874         vop_fsync_debugprepost(ap->a_vp, "fsync");
5875 }
5876
5877 void
5878 vop_fdatasync_debugpost(void *a, int rc __unused)
5879 {
5880         struct vop_fdatasync_args *ap;
5881
5882         ap = a;
5883         vop_fsync_debugprepost(ap->a_vp, "fsync");
5884 }
5885
5886 void
5887 vop_strategy_debugpre(void *ap)
5888 {
5889         struct vop_strategy_args *a;
5890         struct buf *bp;
5891
5892         a = ap;
5893         bp = a->a_bp;
5894
5895         /*
5896          * Cluster ops lock their component buffers but not the IO container.
5897          */
5898         if ((bp->b_flags & B_CLUSTER) != 0)
5899                 return;
5900
5901         if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) {
5902                 if (vfs_badlock_print)
5903                         printf(
5904                             "VOP_STRATEGY: bp is not locked but should be\n");
5905                 if (vfs_badlock_ddb)
5906                         kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
5907         }
5908 }
5909
5910 void
5911 vop_lock_debugpre(void *ap)
5912 {
5913         struct vop_lock1_args *a = ap;
5914
5915         if ((a->a_flags & LK_INTERLOCK) == 0)
5916                 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
5917         else
5918                 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
5919 }
5920
5921 void
5922 vop_lock_debugpost(void *ap, int rc)
5923 {
5924         struct vop_lock1_args *a = ap;
5925
5926         ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
5927         if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
5928                 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
5929 }
5930
5931 void
5932 vop_unlock_debugpre(void *ap)
5933 {
5934         struct vop_unlock_args *a = ap;
5935         struct vnode *vp = a->a_vp;
5936
5937         VNPASS(vn_get_state(vp) != VSTATE_UNINITIALIZED, vp);
5938         ASSERT_VOP_LOCKED(vp, "VOP_UNLOCK");
5939 }
5940
5941 void
5942 vop_need_inactive_debugpre(void *ap)
5943 {
5944         struct vop_need_inactive_args *a = ap;
5945
5946         ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
5947 }
5948
5949 void
5950 vop_need_inactive_debugpost(void *ap, int rc)
5951 {
5952         struct vop_need_inactive_args *a = ap;
5953
5954         ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
5955 }
5956 #endif
5957
5958 void
5959 vop_create_pre(void *ap)
5960 {
5961         struct vop_create_args *a;
5962         struct vnode *dvp;
5963
5964         a = ap;
5965         dvp = a->a_dvp;
5966         vn_seqc_write_begin(dvp);
5967 }
5968
5969 void
5970 vop_create_post(void *ap, int rc)
5971 {
5972         struct vop_create_args *a;
5973         struct vnode *dvp;
5974
5975         a = ap;
5976         dvp = a->a_dvp;
5977         vn_seqc_write_end(dvp);
5978         if (!rc)
5979                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
5980 }
5981
5982 void
5983 vop_whiteout_pre(void *ap)
5984 {
5985         struct vop_whiteout_args *a;
5986         struct vnode *dvp;
5987
5988         a = ap;
5989         dvp = a->a_dvp;
5990         vn_seqc_write_begin(dvp);
5991 }
5992
5993 void
5994 vop_whiteout_post(void *ap, int rc)
5995 {
5996         struct vop_whiteout_args *a;
5997         struct vnode *dvp;
5998
5999         a = ap;
6000         dvp = a->a_dvp;
6001         vn_seqc_write_end(dvp);
6002 }
6003
6004 void
6005 vop_deleteextattr_pre(void *ap)
6006 {
6007         struct vop_deleteextattr_args *a;
6008         struct vnode *vp;
6009
6010         a = ap;
6011         vp = a->a_vp;
6012         vn_seqc_write_begin(vp);
6013 }
6014
6015 void
6016 vop_deleteextattr_post(void *ap, int rc)
6017 {
6018         struct vop_deleteextattr_args *a;
6019         struct vnode *vp;
6020
6021         a = ap;
6022         vp = a->a_vp;
6023         vn_seqc_write_end(vp);
6024         if (!rc)
6025                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
6026 }
6027
6028 void
6029 vop_link_pre(void *ap)
6030 {
6031         struct vop_link_args *a;
6032         struct vnode *vp, *tdvp;
6033
6034         a = ap;
6035         vp = a->a_vp;
6036         tdvp = a->a_tdvp;
6037         vn_seqc_write_begin(vp);
6038         vn_seqc_write_begin(tdvp);
6039 }
6040
6041 void
6042 vop_link_post(void *ap, int rc)
6043 {
6044         struct vop_link_args *a;
6045         struct vnode *vp, *tdvp;
6046
6047         a = ap;
6048         vp = a->a_vp;
6049         tdvp = a->a_tdvp;
6050         vn_seqc_write_end(vp);
6051         vn_seqc_write_end(tdvp);
6052         if (!rc) {
6053                 VFS_KNOTE_LOCKED(vp, NOTE_LINK);
6054                 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE);
6055         }
6056 }
6057
6058 void
6059 vop_mkdir_pre(void *ap)
6060 {
6061         struct vop_mkdir_args *a;
6062         struct vnode *dvp;
6063
6064         a = ap;
6065         dvp = a->a_dvp;
6066         vn_seqc_write_begin(dvp);
6067 }
6068
6069 void
6070 vop_mkdir_post(void *ap, int rc)
6071 {
6072         struct vop_mkdir_args *a;
6073         struct vnode *dvp;
6074
6075         a = ap;
6076         dvp = a->a_dvp;
6077         vn_seqc_write_end(dvp);
6078         if (!rc)
6079                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
6080 }
6081
#ifdef DEBUG_VFS_LOCKS
/*
 * Debug check taking VOP_MKDIR arguments and the operation's result: when
 * rc is zero, hand the directory, the new vnode (*a_vpp) and the component
 * name to cache_validate().
 */
void
vop_mkdir_debugpost(void *ap, int rc)
{
        struct vop_mkdir_args *args = ap;

        if (rc != 0)
                return;
        cache_validate(args->a_dvp, *args->a_vpp, args->a_cnp);
}
#endif
6093
6094 void
6095 vop_mknod_pre(void *ap)
6096 {
6097         struct vop_mknod_args *a;
6098         struct vnode *dvp;
6099
6100         a = ap;
6101         dvp = a->a_dvp;
6102         vn_seqc_write_begin(dvp);
6103 }
6104
6105 void
6106 vop_mknod_post(void *ap, int rc)
6107 {
6108         struct vop_mknod_args *a;
6109         struct vnode *dvp;
6110
6111         a = ap;
6112         dvp = a->a_dvp;
6113         vn_seqc_write_end(dvp);
6114         if (!rc)
6115                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
6116 }
6117
6118 void
6119 vop_reclaim_post(void *ap, int rc)
6120 {
6121         struct vop_reclaim_args *a;
6122         struct vnode *vp;
6123
6124         a = ap;
6125         vp = a->a_vp;
6126         ASSERT_VOP_IN_SEQC(vp);
6127         if (!rc)
6128                 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE);
6129 }
6130
6131 void
6132 vop_remove_pre(void *ap)
6133 {
6134         struct vop_remove_args *a;
6135         struct vnode *dvp, *vp;
6136
6137         a = ap;
6138         dvp = a->a_dvp;
6139         vp = a->a_vp;
6140         vn_seqc_write_begin(dvp);
6141         vn_seqc_write_begin(vp);
6142 }
6143
6144 void
6145 vop_remove_post(void *ap, int rc)
6146 {
6147         struct vop_remove_args *a;
6148         struct vnode *dvp, *vp;
6149
6150         a = ap;
6151         dvp = a->a_dvp;
6152         vp = a->a_vp;
6153         vn_seqc_write_end(dvp);
6154         vn_seqc_write_end(vp);
6155         if (!rc) {
6156                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
6157                 VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
6158         }
6159 }
6160
6161 void
6162 vop_rename_post(void *ap, int rc)
6163 {
6164         struct vop_rename_args *a = ap;
6165         long hint;
6166
6167         if (!rc) {
6168                 hint = NOTE_WRITE;
6169                 if (a->a_fdvp == a->a_tdvp) {
6170                         if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
6171                                 hint |= NOTE_LINK;
6172                         VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
6173                         VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
6174                 } else {
6175                         hint |= NOTE_EXTEND;
6176                         if (a->a_fvp->v_type == VDIR)
6177                                 hint |= NOTE_LINK;
6178                         VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
6179
6180                         if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
6181                             a->a_tvp->v_type == VDIR)
6182                                 hint &= ~NOTE_LINK;
6183                         VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
6184                 }
6185
6186                 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
6187                 if (a->a_tvp)
6188                         VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
6189         }
6190         if (a->a_tdvp != a->a_fdvp)
6191                 vdrop(a->a_fdvp);
6192         if (a->a_tvp != a->a_fvp)
6193                 vdrop(a->a_fvp);
6194         vdrop(a->a_tdvp);
6195         if (a->a_tvp)
6196                 vdrop(a->a_tvp);
6197 }
6198
6199 void
6200 vop_rmdir_pre(void *ap)
6201 {
6202         struct vop_rmdir_args *a;
6203         struct vnode *dvp, *vp;
6204
6205         a = ap;
6206         dvp = a->a_dvp;
6207         vp = a->a_vp;
6208         vn_seqc_write_begin(dvp);
6209         vn_seqc_write_begin(vp);
6210 }
6211
6212 void
6213 vop_rmdir_post(void *ap, int rc)
6214 {
6215         struct vop_rmdir_args *a;
6216         struct vnode *dvp, *vp;
6217
6218         a = ap;
6219         dvp = a->a_dvp;
6220         vp = a->a_vp;
6221         vn_seqc_write_end(dvp);
6222         vn_seqc_write_end(vp);
6223         if (!rc) {
6224                 vp->v_vflag |= VV_UNLINKED;
6225                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
6226                 VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
6227         }
6228 }
6229
6230 void
6231 vop_setattr_pre(void *ap)
6232 {
6233         struct vop_setattr_args *a;
6234         struct vnode *vp;
6235
6236         a = ap;
6237         vp = a->a_vp;
6238         vn_seqc_write_begin(vp);
6239 }
6240
6241 void
6242 vop_setattr_post(void *ap, int rc)
6243 {
6244         struct vop_setattr_args *a;
6245         struct vnode *vp;
6246
6247         a = ap;
6248         vp = a->a_vp;
6249         vn_seqc_write_end(vp);
6250         if (!rc)
6251                 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
6252 }
6253
6254 void
6255 vop_setacl_pre(void *ap)
6256 {
6257         struct vop_setacl_args *a;
6258         struct vnode *vp;
6259
6260         a = ap;
6261         vp = a->a_vp;
6262         vn_seqc_write_begin(vp);
6263 }
6264
6265 void
6266 vop_setacl_post(void *ap, int rc __unused)
6267 {
6268         struct vop_setacl_args *a;
6269         struct vnode *vp;
6270
6271         a = ap;
6272         vp = a->a_vp;
6273         vn_seqc_write_end(vp);
6274 }
6275
6276 void
6277 vop_setextattr_pre(void *ap)
6278 {
6279         struct vop_setextattr_args *a;
6280         struct vnode *vp;
6281
6282         a = ap;
6283         vp = a->a_vp;
6284         vn_seqc_write_begin(vp);
6285 }
6286
6287 void
6288 vop_setextattr_post(void *ap, int rc)
6289 {
6290         struct vop_setextattr_args *a;
6291         struct vnode *vp;
6292
6293         a = ap;
6294         vp = a->a_vp;
6295         vn_seqc_write_end(vp);
6296         if (!rc)
6297                 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
6298 }
6299
6300 void
6301 vop_symlink_pre(void *ap)
6302 {
6303         struct vop_symlink_args *a;
6304         struct vnode *dvp;
6305
6306         a = ap;
6307         dvp = a->a_dvp;
6308         vn_seqc_write_begin(dvp);
6309 }
6310
6311 void
6312 vop_symlink_post(void *ap, int rc)
6313 {
6314         struct vop_symlink_args *a;
6315         struct vnode *dvp;
6316
6317         a = ap;
6318         dvp = a->a_dvp;
6319         vn_seqc_write_end(dvp);
6320         if (!rc)
6321                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
6322 }
6323
6324 void
6325 vop_open_post(void *ap, int rc)
6326 {
6327         struct vop_open_args *a = ap;
6328
6329         if (!rc)
6330                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
6331 }
6332
6333 void
6334 vop_close_post(void *ap, int rc)
6335 {
6336         struct vop_close_args *a = ap;
6337
6338         if (!rc && (a->a_cred != NOCRED || /* filter out revokes */
6339             !VN_IS_DOOMED(a->a_vp))) {
6340                 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
6341                     NOTE_CLOSE_WRITE : NOTE_CLOSE);
6342         }
6343 }
6344
6345 void
6346 vop_read_post(void *ap, int rc)
6347 {
6348         struct vop_read_args *a = ap;
6349
6350         if (!rc)
6351                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
6352 }
6353
6354 void
6355 vop_read_pgcache_post(void *ap, int rc)
6356 {
6357         struct vop_read_pgcache_args *a = ap;
6358
6359         if (!rc)
6360                 VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ);
6361 }
6362
6363 void
6364 vop_readdir_post(void *ap, int rc)
6365 {
6366         struct vop_readdir_args *a = ap;
6367
6368         if (!rc)
6369                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
6370 }
6371
/*
 * Knote list used for filesystem events: vfs_event_signal() posts to it
 * and the fs_filtops attach/detach handlers add and remove knotes.
 */
static struct knlist fs_knlist;

/*
 * Initialize fs_knlist.  NULL is passed as the mutex argument of
 * knlist_init_mtx().  The argument is unused.
 */
static void
vfs_event_init(void *arg)
{
        knlist_init_mtx(&fs_knlist, NULL);
}
/* XXX - correct order? */
SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
6381
/*
 * Post 'event' as the hint to every knote on fs_knlist.  Neither 'fsid'
 * nor 'data' is consulted here.
 */
void
vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
{

        KNOTE_UNLOCKED(&fs_knlist, event);
}
6388
static int      filt_fsattach(struct knote *kn);
static void     filt_fsdetach(struct knote *kn);
static int      filt_fsevent(struct knote *kn, long hint);

/*
 * Filter operations for filesystem-event knotes.  f_isfd is 0; the attach
 * and detach handlers keep the knote on fs_knlist.
 */
struct filterops fs_filtops = {
        .f_isfd = 0,
        .f_attach = filt_fsattach,
        .f_detach = filt_fsdetach,
        .f_event = filt_fsevent
};
6399
/*
 * Attach handler for fs_filtops: force EV_CLEAR on the knote and add it
 * to fs_knlist.  Always returns 0.
 */
static int
filt_fsattach(struct knote *kn)
{

        kn->kn_flags |= EV_CLEAR;
        knlist_add(&fs_knlist, kn, 0);
        return (0);
}
6408
/*
 * Detach handler for fs_filtops: remove the knote from fs_knlist.
 */
static void
filt_fsdetach(struct knote *kn)
{

        knlist_remove(&fs_knlist, kn, 0);
}
6415
6416 static int
6417 filt_fsevent(struct knote *kn, long hint)
6418 {
6419
6420         kn->kn_fflags |= kn->kn_sfflags & hint;
6421
6422         return (kn->kn_fflags != 0);
6423 }
6424
6425 static int
6426 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
6427 {
6428         struct vfsidctl vc;
6429         int error;
6430         struct mount *mp;
6431
6432         error = SYSCTL_IN(req, &vc, sizeof(vc));
6433         if (error)
6434                 return (error);
6435         if (vc.vc_vers != VFS_CTL_VERS1)
6436                 return (EINVAL);
6437         mp = vfs_getvfs(&vc.vc_fsid);
6438         if (mp == NULL)
6439                 return (ENOENT);
6440         /* ensure that a specific sysctl goes to the right filesystem. */
6441         if (strcmp(vc.vc_fstypename, "*") != 0 &&
6442             strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
6443                 vfs_rel(mp);
6444                 return (EINVAL);
6445         }
6446         VCTLTOREQ(&vc, req);
6447         error = VFS_SYSCTL(mp, vc.vc_op, req);
6448         vfs_rel(mp);
6449         return (error);
6450 }
6451
6452 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR,
6453     NULL, 0, sysctl_vfs_ctl, "",
6454     "Sysctl by fsid");
6455
6456 /*
6457  * Function to initialize a va_filerev field sensibly.
6458  * XXX: Wouldn't a random number make a lot more sense ??
6459  */
6460 u_quad_t
6461 init_va_filerev(void)
6462 {
6463         struct bintime bt;
6464
6465         getbinuptime(&bt);
6466         return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
6467 }
6468
static int      filt_vfsread(struct knote *kn, long hint);
static int      filt_vfswrite(struct knote *kn, long hint);
static int      filt_vfsvnode(struct knote *kn, long hint);
static void     filt_vfsdetach(struct knote *kn);
/*
 * Filter operation tables installed by vfs_kqfilter() for EVFILT_READ,
 * EVFILT_WRITE and EVFILT_VNODE respectively.  All are descriptor-based
 * (f_isfd = 1) and share filt_vfsdetach(); none sets f_attach.
 */
static struct filterops vfsread_filtops = {
        .f_isfd = 1,
        .f_detach = filt_vfsdetach,
        .f_event = filt_vfsread
};
static struct filterops vfswrite_filtops = {
        .f_isfd = 1,
        .f_detach = filt_vfsdetach,
        .f_event = filt_vfswrite
};
static struct filterops vfsvnode_filtops = {
        .f_isfd = 1,
        .f_detach = filt_vfsdetach,
        .f_event = filt_vfsvnode
};
6488
6489 static void
6490 vfs_knllock(void *arg)
6491 {
6492         struct vnode *vp = arg;
6493
6494         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
6495 }
6496
/*
 * Unlock callback taking a vnode as its opaque argument: release the
 * vnode lock.
 */
static void
vfs_knlunlock(void *arg)
{
        struct vnode *vp;

        vp = arg;
        VOP_UNLOCK(vp);
}
6504
/*
 * Lock-assertion callback taking a vnode as its opaque argument.  With
 * DEBUG_VFS_LOCKS, asserts the vnode lock is held when 'what' is
 * LA_LOCKED and not held otherwise; without it this is a no-op.
 */
static void
vfs_knl_assert_lock(void *arg, int what)
{
#ifdef DEBUG_VFS_LOCKS
        struct vnode *vp;

        vp = arg;
        if (what != LA_LOCKED)
                ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
        else
                ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
#endif
}
6517
6518 int
6519 vfs_kqfilter(struct vop_kqfilter_args *ap)
6520 {
6521         struct vnode *vp = ap->a_vp;
6522         struct knote *kn = ap->a_kn;
6523         struct knlist *knl;
6524
6525         KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ &&
6526             kn->kn_filter != EVFILT_WRITE),
6527             ("READ/WRITE filter on a FIFO leaked through"));
6528         switch (kn->kn_filter) {
6529         case EVFILT_READ:
6530                 kn->kn_fop = &vfsread_filtops;
6531                 break;
6532         case EVFILT_WRITE:
6533                 kn->kn_fop = &vfswrite_filtops;
6534                 break;
6535         case EVFILT_VNODE:
6536                 kn->kn_fop = &vfsvnode_filtops;
6537                 break;
6538         default:
6539                 return (EINVAL);
6540         }
6541
6542         kn->kn_hook = (caddr_t)vp;
6543
6544         v_addpollinfo(vp);
6545         if (vp->v_pollinfo == NULL)
6546                 return (ENOMEM);
6547         knl = &vp->v_pollinfo->vpi_selinfo.si_note;
6548         vhold(vp);
6549         knlist_add(knl, kn, 0);
6550
6551         return (0);
6552 }
6553
6554 /*
6555  * Detach knote from vnode
6556  */
6557 static void
6558 filt_vfsdetach(struct knote *kn)
6559 {
6560         struct vnode *vp = (struct vnode *)kn->kn_hook;
6561
6562         KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
6563         knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
6564         vdrop(vp);
6565 }
6566
6567 /*ARGSUSED*/
6568 static int
6569 filt_vfsread(struct knote *kn, long hint)
6570 {
6571         struct vnode *vp = (struct vnode *)kn->kn_hook;
6572         off_t size;
6573         int res;
6574
6575         /*
6576          * filesystem is gone, so set the EOF flag and schedule
6577          * the knote for deletion.
6578          */
6579         if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
6580                 VI_LOCK(vp);
6581                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
6582                 VI_UNLOCK(vp);
6583                 return (1);
6584         }
6585
6586         if (vn_getsize_locked(vp, &size, curthread->td_ucred) != 0)
6587                 return (0);
6588
6589         VI_LOCK(vp);
6590         kn->kn_data = size - kn->kn_fp->f_offset;
6591         res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
6592         VI_UNLOCK(vp);
6593         return (res);
6594 }
6595
6596 /*ARGSUSED*/
6597 static int
6598 filt_vfswrite(struct knote *kn, long hint)
6599 {
6600         struct vnode *vp = (struct vnode *)kn->kn_hook;
6601
6602         VI_LOCK(vp);
6603
6604         /*
6605          * filesystem is gone, so set the EOF flag and schedule
6606          * the knote for deletion.
6607          */
6608         if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
6609                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
6610
6611         kn->kn_data = 0;
6612         VI_UNLOCK(vp);
6613         return (1);
6614 }
6615
6616 static int
6617 filt_vfsvnode(struct knote *kn, long hint)
6618 {
6619         struct vnode *vp = (struct vnode *)kn->kn_hook;
6620         int res;
6621
6622         VI_LOCK(vp);
6623         if (kn->kn_sfflags & hint)
6624                 kn->kn_fflags |= hint;
6625         if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
6626                 kn->kn_flags |= EV_EOF;
6627                 VI_UNLOCK(vp);
6628                 return (1);
6629         }
6630         res = (kn->kn_fflags != 0);
6631         VI_UNLOCK(vp);
6632         return (res);
6633 }
6634
6635 int
6636 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
6637 {
6638         int error;
6639
6640         if (dp->d_reclen > ap->a_uio->uio_resid)
6641                 return (ENAMETOOLONG);
6642         error = uiomove(dp, dp->d_reclen, ap->a_uio);
6643         if (error) {
6644                 if (ap->a_ncookies != NULL) {
6645                         if (ap->a_cookies != NULL)
6646                                 free(ap->a_cookies, M_TEMP);
6647                         ap->a_cookies = NULL;
6648                         *ap->a_ncookies = 0;
6649                 }
6650                 return (error);
6651         }
6652         if (ap->a_ncookies == NULL)
6653                 return (0);
6654
6655         KASSERT(ap->a_cookies,
6656             ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
6657
6658         *ap->a_cookies = realloc(*ap->a_cookies,
6659             (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO);
6660         (*ap->a_cookies)[*ap->a_ncookies] = off;
6661         *ap->a_ncookies += 1;
6662         return (0);
6663 }
6664
6665 /*
6666  * The purpose of this routine is to remove granularity from accmode_t,
6667  * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
6668  * VADMIN and VAPPEND.
6669  *
6670  * If it returns 0, the caller is supposed to continue with the usual
6671  * access checks using 'accmode' as modified by this routine.  If it
6672  * returns nonzero value, the caller is supposed to return that value
6673  * as errno.
6674  *
6675  * Note that after this routine runs, accmode may be zero.
6676  */
6677 int
6678 vfs_unixify_accmode(accmode_t *accmode)
6679 {
6680         /*
6681          * There is no way to specify explicit "deny" rule using
6682          * file mode or POSIX.1e ACLs.
6683          */
6684         if (*accmode & VEXPLICIT_DENY) {
6685                 *accmode = 0;
6686                 return (0);
6687         }
6688
6689         /*
6690          * None of these can be translated into usual access bits.
6691          * Also, the common case for NFSv4 ACLs is to not contain
6692          * either of these bits. Caller should check for VWRITE
6693          * on the containing directory instead.
6694          */
6695         if (*accmode & (VDELETE_CHILD | VDELETE))
6696                 return (EPERM);
6697
6698         if (*accmode & VADMIN_PERMS) {
6699                 *accmode &= ~VADMIN_PERMS;
6700                 *accmode |= VADMIN;
6701         }
6702
6703         /*
6704          * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
6705          * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
6706          */
6707         *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
6708
6709         return (0);
6710 }
6711
/*
 * Clear out a doomed vnode (if any) and replace it with a new one as long
 * as the fs is not being unmounted. Return the root vnode to the caller.
 *
 * Returns 0 with the vnode stored in *vpp, or the error from
 * VFS_CACHEDROOT().
 */
static int __noinline
vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp)
{
        struct vnode *vp;
        int error;

restart:
        /* Unlocked peek; the pointer is re-read under the mount interlock. */
        if (mp->mnt_rootvnode != NULL) {
                MNT_ILOCK(mp);
                vp = mp->mnt_rootvnode;
                if (vp != NULL) {
                        if (!VN_IS_DOOMED(vp)) {
                                /*
                                 * Cached vnode still usable: reference it,
                                 * drop the interlock and try to lock it.
                                 */
                                vrefact(vp);
                                MNT_IUNLOCK(mp);
                                error = vn_lock(vp, flags);
                                if (error == 0) {
                                        *vpp = vp;
                                        return (0);
                                }
                                /* Locking failed; start over. */
                                vrele(vp);
                                goto restart;
                        }
                        /*
                         * Clear the old one.
                         */
                        mp->mnt_rootvnode = NULL;
                }
                MNT_IUNLOCK(mp);
                if (vp != NULL) {
                        /*
                         * Release the reference the cache held on the
                         * doomed vnode, after vfs_op_barrier_wait().
                         */
                        vfs_op_barrier_wait(mp);
                        vrele(vp);
                }
        }
        error = VFS_CACHEDROOT(mp, flags, vpp);
        if (error != 0)
                return (error);
        /*
         * Only cache the result while mnt_vfs_ops is zero; the counter is
         * checked again once the interlock is held.
         */
        if (mp->mnt_vfs_ops == 0) {
                MNT_ILOCK(mp);
                if (mp->mnt_vfs_ops != 0) {
                        MNT_IUNLOCK(mp);
                        return (0);
                }
                if (mp->mnt_rootvnode == NULL) {
                        /* The cache keeps its own reference. */
                        vrefact(*vpp);
                        mp->mnt_rootvnode = *vpp;
                } else {
                        /*
                         * Something was cached in the meantime.  A
                         * different vnode is tolerated only if the cached
                         * one is doomed.
                         */
                        if (mp->mnt_rootvnode != *vpp) {
                                if (!VN_IS_DOOMED(mp->mnt_rootvnode)) {
                                        panic("%s: mismatch between vnode returned "
                                            " by VFS_CACHEDROOT and the one cached "
                                            " (%p != %p)",
                                            __func__, *vpp, mp->mnt_rootvnode);
                                }
                        }
                }
                MNT_IUNLOCK(mp);
        }
        return (0);
}
6775
6776 int
6777 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp)
6778 {
6779         struct mount_pcpu *mpcpu;
6780         struct vnode *vp;
6781         int error;
6782
6783         if (!vfs_op_thread_enter(mp, mpcpu))
6784                 return (vfs_cache_root_fallback(mp, flags, vpp));
6785         vp = atomic_load_ptr(&mp->mnt_rootvnode);
6786         if (vp == NULL || VN_IS_DOOMED(vp)) {
6787                 vfs_op_thread_exit(mp, mpcpu);
6788                 return (vfs_cache_root_fallback(mp, flags, vpp));
6789         }
6790         vrefact(vp);
6791         vfs_op_thread_exit(mp, mpcpu);
6792         error = vn_lock(vp, flags);
6793         if (error != 0) {
6794                 vrele(vp);
6795                 return (vfs_cache_root_fallback(mp, flags, vpp));
6796         }
6797         *vpp = vp;
6798         return (0);
6799 }
6800
6801 struct vnode *
6802 vfs_cache_root_clear(struct mount *mp)
6803 {
6804         struct vnode *vp;
6805
6806         /*
6807          * ops > 0 guarantees there is nobody who can see this vnode
6808          */
6809         MPASS(mp->mnt_vfs_ops > 0);
6810         vp = mp->mnt_rootvnode;
6811         if (vp != NULL)
6812                 vn_seqc_write_begin(vp);
6813         mp->mnt_rootvnode = NULL;
6814         return (vp);
6815 }
6816
/*
 * Install 'vp' as the mount's cached root vnode, taking an additional
 * reference on it with vrefact().  Asserts that mnt_vfs_ops is positive.
 * Any previously cached pointer is overwritten without being released.
 */
void
vfs_cache_root_set(struct mount *mp, struct vnode *vp)
{

        MPASS(mp->mnt_vfs_ops > 0);
        vrefact(vp);
        mp->mnt_rootvnode = vp;
}
6825
6826 /*
6827  * These are helper functions for filesystems to traverse all
6828  * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
6829  *
6830  * This interface replaces MNT_VNODE_FOREACH.
6831  */
6832
6833 struct vnode *
6834 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
6835 {
6836         struct vnode *vp;
6837
6838         maybe_yield();
6839         MNT_ILOCK(mp);
6840         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
6841         for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
6842             vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
6843                 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
6844                 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
6845                         continue;
6846                 VI_LOCK(vp);
6847                 if (VN_IS_DOOMED(vp)) {
6848                         VI_UNLOCK(vp);
6849                         continue;
6850                 }
6851                 break;
6852         }
6853         if (vp == NULL) {
6854                 __mnt_vnode_markerfree_all(mvp, mp);
6855                 /* MNT_IUNLOCK(mp); -- done in above function */
6856                 mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
6857                 return (NULL);
6858         }
6859         TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
6860         TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
6861         MNT_IUNLOCK(mp);
6862         return (vp);
6863 }
6864
6865 struct vnode *
6866 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
6867 {
6868         struct vnode *vp;
6869
6870         *mvp = vn_alloc_marker(mp);
6871         MNT_ILOCK(mp);
6872         MNT_REF(mp);
6873
6874         TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
6875                 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
6876                 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
6877                         continue;
6878                 VI_LOCK(vp);
6879                 if (VN_IS_DOOMED(vp)) {
6880                         VI_UNLOCK(vp);
6881                         continue;
6882                 }
6883                 break;
6884         }
6885         if (vp == NULL) {
6886                 MNT_REL(mp);
6887                 MNT_IUNLOCK(mp);
6888                 vn_free_marker(*mvp);
6889                 *mvp = NULL;
6890                 return (NULL);
6891         }
6892         TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
6893         MNT_IUNLOCK(mp);
6894         return (vp);
6895 }
6896
6897 void
6898 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
6899 {
6900
6901         if (*mvp == NULL) {
6902                 MNT_IUNLOCK(mp);
6903                 return;
6904         }
6905
6906         mtx_assert(MNT_MTX(mp), MA_OWNED);
6907
6908         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
6909         TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
6910         MNT_REL(mp);
6911         MNT_IUNLOCK(mp);
6912         vn_free_marker(*mvp);
6913         *mvp = NULL;
6914 }
6915
6916 /*
6917  * These are helper functions for filesystems to traverse their
6918  * lazy vnodes.  See MNT_VNODE_FOREACH_LAZY() in sys/mount.h
6919  */
/*
 * Release the resources of a lazy-list iteration: drop the mount
 * reference under the mount interlock, free the marker and set *mvp to
 * NULL.  The marker is not unlinked from any list here.
 */
static void
mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
{

        KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));

        MNT_ILOCK(mp);
        MNT_REL(mp);
        MNT_IUNLOCK(mp);
        vn_free_marker(*mvp);
        *mvp = NULL;
}
6932
/*
 * Relock the mp mount vnode list lock with the vp vnode interlock in the
 * conventional lock order during mnt_vnode_next_lazy iteration.
 *
 * On entry, the mount vnode list lock is held and the vnode interlock is not.
 * The list lock is dropped and reacquired.  On success, both locks are held.
 * On failure, the mount vnode list lock is held but the vnode interlock is
 * not, and the procedure may have yielded.
 *
 * Returns true on success and false on failure.
 */
static bool
mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp,
    struct vnode *vp)
{

        VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
            TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp,
            ("%s: bad marker", __func__));
        VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
            ("%s: inappropriate vnode", __func__));
        ASSERT_VI_UNLOCKED(vp, __func__);
        mtx_assert(&mp->mnt_listmtx, MA_OWNED);

        /* Park the marker just before vp before the list lock is dropped. */
        TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist);
        TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist);

        /*
         * Note we may be racing against vdrop which transitioned the hold
         * count to 0 and now waits for the ->mnt_listmtx lock. This is fine,
         * if we are the only user after we get the interlock we will just
         * vdrop.
         */
        vhold(vp);
        mtx_unlock(&mp->mnt_listmtx);
        VI_LOCK(vp);
        if (VN_IS_DOOMED(vp)) {
                VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
                goto out_lost;
        }
        VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
        /*
         * There is nothing to do if we are the last user.
         */
        if (!refcount_release_if_not_last(&vp->v_holdcnt))
                goto out_lost;
        mtx_lock(&mp->mnt_listmtx);
        return (true);
out_lost:
        /* Drop the hold taken above with the interlock held (vdropl). */
        vdropl(vp);
        maybe_yield();
        mtx_lock(&mp->mnt_listmtx);
        return (false);
}
6985
/*
 * Advance the lazy-list marker *mvp to the next vnode for which the
 * callback 'cb' returns true.
 *
 * Must be called with mnt_listmtx held; it is released before returning.
 * Returns the selected vnode with its interlock held and the marker
 * re-inserted right after it, or NULL once the list is exhausted, in
 * which case the marker has been removed from the list and freed via
 * mnt_vnode_markerfree_lazy().
 */
static struct vnode *
mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
    void *cbarg)
{
        struct vnode *vp;

        mtx_assert(&mp->mnt_listmtx, MA_OWNED);
        KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
restart:
        vp = TAILQ_NEXT(*mvp, v_lazylist);
        while (vp != NULL) {
                /* Skip marker vnodes found on the list. */
                if (vp->v_type == VMARKER) {
                        vp = TAILQ_NEXT(vp, v_lazylist);
                        continue;
                }
                /*
                 * See if we want to process the vnode. Note we may encounter a
                 * long string of vnodes we don't care about and hog the list
                 * as a result. Check for it and requeue the marker.
                 */
                VNPASS(!VN_IS_DOOMED(vp), vp);
                if (!cb(vp, cbarg)) {
                        if (!should_yield()) {
                                vp = TAILQ_NEXT(vp, v_lazylist);
                                continue;
                        }
                        /*
                         * Record progress by moving the marker after vp,
                         * then yield with the list lock dropped.
                         */
                        TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp,
                            v_lazylist);
                        TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp,
                            v_lazylist);
                        mtx_unlock(&mp->mnt_listmtx);
                        kern_yield(PRI_USER);
                        mtx_lock(&mp->mnt_listmtx);
                        goto restart;
                }
                /*
                 * Try-lock because this is the wrong lock order.
                 */
                if (!VI_TRYLOCK(vp) &&
                    !mnt_vnode_next_lazy_relock(*mvp, mp, vp))
                        goto restart;
                KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
                KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
                    ("alien vnode on the lazy list %p %p", vp, mp));
                VNPASS(vp->v_mount == mp, vp);
                VNPASS(!VN_IS_DOOMED(vp), vp);
                break;
        }
        TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);

        /* Check if we are done */
        if (vp == NULL) {
                mtx_unlock(&mp->mnt_listmtx);
                mnt_vnode_markerfree_lazy(mvp, mp);
                return (NULL);
        }
        TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist);
        mtx_unlock(&mp->mnt_listmtx);
        ASSERT_VI_LOCKED(vp, "lazy iter");
        return (vp);
}
7047
7048 struct vnode *
7049 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
7050     void *cbarg)
7051 {
7052
7053         maybe_yield();
7054         mtx_lock(&mp->mnt_listmtx);
7055         return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
7056 }
7057
7058 struct vnode *
7059 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
7060     void *cbarg)
7061 {
7062         struct vnode *vp;
7063
7064         if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist))
7065                 return (NULL);
7066
7067         *mvp = vn_alloc_marker(mp);
7068         MNT_ILOCK(mp);
7069         MNT_REF(mp);
7070         MNT_IUNLOCK(mp);
7071
7072         mtx_lock(&mp->mnt_listmtx);
7073         vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist);
7074         if (vp == NULL) {
7075                 mtx_unlock(&mp->mnt_listmtx);
7076                 mnt_vnode_markerfree_lazy(mvp, mp);
7077                 return (NULL);
7078         }
7079         TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist);
7080         return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
7081 }
7082
7083 void
7084 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
7085 {
7086
7087         if (*mvp == NULL)
7088                 return;
7089
7090         mtx_lock(&mp->mnt_listmtx);
7091         TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
7092         mtx_unlock(&mp->mnt_listmtx);
7093         mnt_vnode_markerfree_lazy(mvp, mp);
7094 }
7095
7096 int
7097 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp)
7098 {
7099
7100         if ((cnp->cn_flags & NOEXECCHECK) != 0) {
7101                 cnp->cn_flags &= ~NOEXECCHECK;
7102                 return (0);
7103         }
7104
7105         return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread));
7106 }
7107
7108 /*
7109  * Do not use this variant unless you have means other than the hold count
7110  * to prevent the vnode from getting freed.
7111  */
7112 void
7113 vn_seqc_write_begin_locked(struct vnode *vp)
7114 {
7115
7116         ASSERT_VI_LOCKED(vp, __func__);
7117         VNPASS(vp->v_holdcnt > 0, vp);
7118         VNPASS(vp->v_seqc_users >= 0, vp);
7119         vp->v_seqc_users++;
7120         if (vp->v_seqc_users == 1)
7121                 seqc_sleepable_write_begin(&vp->v_seqc);
7122 }
7123
/*
 * Same as vn_seqc_write_begin_locked(), for callers that do not hold the
 * vnode interlock: it is taken for the duration of the call.
 */
void
vn_seqc_write_begin(struct vnode *vp)
{

	VI_LOCK(vp);
	vn_seqc_write_begin_locked(vp);
	VI_UNLOCK(vp);
}
7132
7133 void
7134 vn_seqc_write_end_locked(struct vnode *vp)
7135 {
7136
7137         ASSERT_VI_LOCKED(vp, __func__);
7138         VNPASS(vp->v_seqc_users > 0, vp);
7139         vp->v_seqc_users--;
7140         if (vp->v_seqc_users == 0)
7141                 seqc_sleepable_write_end(&vp->v_seqc);
7142 }
7143
/*
 * Same as vn_seqc_write_end_locked(), for callers that do not hold the
 * vnode interlock: it is taken for the duration of the call.
 */
void
vn_seqc_write_end(struct vnode *vp)
{

	VI_LOCK(vp);
	vn_seqc_write_end_locked(vp);
	VI_UNLOCK(vp);
}
7152
7153 /*
7154  * Special case handling for allocating and freeing vnodes.
7155  *
7156  * The counter remains unchanged on free so that a doomed vnode will
7157  * keep testing as in modify as long as it is accessible with SMR.
7158  */
7159 static void
7160 vn_seqc_init(struct vnode *vp)
7161 {
7162
7163         vp->v_seqc = 0;
7164         vp->v_seqc_users = 0;
7165 }
7166
7167 static void
7168 vn_seqc_write_end_free(struct vnode *vp)
7169 {
7170
7171         VNPASS(seqc_in_modify(vp->v_seqc), vp);
7172         VNPASS(vp->v_seqc_users == 1, vp);
7173 }
7174
7175 void
7176 vn_irflag_set_locked(struct vnode *vp, short toset)
7177 {
7178         short flags;
7179
7180         ASSERT_VI_LOCKED(vp, __func__);
7181         flags = vn_irflag_read(vp);
7182         VNASSERT((flags & toset) == 0, vp,
7183             ("%s: some of the passed flags already set (have %d, passed %d)\n",
7184             __func__, flags, toset));
7185         atomic_store_short(&vp->v_irflag, flags | toset);
7186 }
7187
/*
 * Variant of vn_irflag_set_locked() which takes and drops the vnode
 * interlock itself.
 */
void
vn_irflag_set(struct vnode *vp, short toset)
{

	VI_LOCK(vp);
	vn_irflag_set_locked(vp, toset);
	VI_UNLOCK(vp);
}
7196
7197 void
7198 vn_irflag_set_cond_locked(struct vnode *vp, short toset)
7199 {
7200         short flags;
7201
7202         ASSERT_VI_LOCKED(vp, __func__);
7203         flags = vn_irflag_read(vp);
7204         atomic_store_short(&vp->v_irflag, flags | toset);
7205 }
7206
/*
 * Variant of vn_irflag_set_cond_locked() which takes and drops the vnode
 * interlock itself.
 */
void
vn_irflag_set_cond(struct vnode *vp, short toset)
{

	VI_LOCK(vp);
	vn_irflag_set_cond_locked(vp, toset);
	VI_UNLOCK(vp);
}
7215
7216 void
7217 vn_irflag_unset_locked(struct vnode *vp, short tounset)
7218 {
7219         short flags;
7220
7221         ASSERT_VI_LOCKED(vp, __func__);
7222         flags = vn_irflag_read(vp);
7223         VNASSERT((flags & tounset) == tounset, vp,
7224             ("%s: some of the passed flags not set (have %d, passed %d)\n",
7225             __func__, flags, tounset));
7226         atomic_store_short(&vp->v_irflag, flags & ~tounset);
7227 }
7228
/*
 * Variant of vn_irflag_unset_locked() which takes and drops the vnode
 * interlock itself.
 */
void
vn_irflag_unset(struct vnode *vp, short tounset)
{

	VI_LOCK(vp);
	vn_irflag_unset_locked(vp, tounset);
	VI_UNLOCK(vp);
}
7237
7238 int
7239 vn_getsize_locked(struct vnode *vp, off_t *size, struct ucred *cred)
7240 {
7241         struct vattr vattr;
7242         int error;
7243
7244         ASSERT_VOP_LOCKED(vp, __func__);
7245         error = VOP_GETATTR(vp, &vattr, cred);
7246         if (__predict_true(error == 0)) {
7247                 if (vattr.va_size <= OFF_MAX)
7248                         *size = vattr.va_size;
7249                 else
7250                         error = EFBIG;
7251         }
7252         return (error);
7253 }
7254
7255 int
7256 vn_getsize(struct vnode *vp, off_t *size, struct ucred *cred)
7257 {
7258         int error;
7259
7260         VOP_LOCK(vp, LK_SHARED);
7261         error = vn_getsize_locked(vp, size, cred);
7262         VOP_UNLOCK(vp);
7263         return (error);
7264 }
7265
7266 #ifdef INVARIANTS
7267 void
7268 vn_set_state_validate(struct vnode *vp, __enum_uint8(vstate) state)
7269 {
7270
7271         switch (vp->v_state) {
7272         case VSTATE_UNINITIALIZED:
7273                 switch (state) {
7274                 case VSTATE_CONSTRUCTED:
7275                 case VSTATE_DESTROYING:
7276                         return;
7277                 default:
7278                         break;
7279                 }
7280                 break;
7281         case VSTATE_CONSTRUCTED:
7282                 ASSERT_VOP_ELOCKED(vp, __func__);
7283                 switch (state) {
7284                 case VSTATE_DESTROYING:
7285                         return;
7286                 default:
7287                         break;
7288                 }
7289                 break;
7290         case VSTATE_DESTROYING:
7291                 ASSERT_VOP_ELOCKED(vp, __func__);
7292                 switch (state) {
7293                 case VSTATE_DEAD:
7294                         return;
7295                 default:
7296                         break;
7297                 }
7298                 break;
7299         case VSTATE_DEAD:
7300                 switch (state) {
7301                 case VSTATE_UNINITIALIZED:
7302                         return;
7303                 default:
7304                         break;
7305                 }
7306                 break;
7307         }
7308
7309         vn_printf(vp, "invalid state transition %d -> %d\n", vp->v_state, state);
7310         panic("invalid state transition %d -> %d\n", vp->v_state, state);
7311 }
7312 #endif