sys/kern/vfs_subr.c

   1 /*-
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 4. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
  35  */
  36
  37 /*
  38  * External virtual filesystem routines
  39  */
  40
  41 #include <sys/cdefs.h>
  42 __FBSDID("$FreeBSD$");
  43
  44 #include "opt_compat.h"
  45 #include "opt_ddb.h"
  46 #include "opt_watchdog.h"
  47
  48 #include <sys/param.h>
  49 #include <sys/systm.h>
  50 #include <sys/bio.h>
  51 #include <sys/buf.h>
  52 #include <sys/condvar.h>
  53 #include <sys/conf.h>
  54 #include <sys/dirent.h>
  55 #include <sys/event.h>
  56 #include <sys/eventhandler.h>
  57 #include <sys/extattr.h>
  58 #include <sys/file.h>
  59 #include <sys/fcntl.h>
  60 #include <sys/jail.h>
  61 #include <sys/kdb.h>
  62 #include <sys/kernel.h>
  63 #include <sys/kthread.h>
  64 #include <sys/lockf.h>
  65 #include <sys/malloc.h>
  66 #include <sys/mount.h>
  67 #include <sys/namei.h>
  68 #include <sys/pctrie.h>
  69 #include <sys/priv.h>
  70 #include <sys/reboot.h>
  71 #include <sys/rwlock.h>
  72 #include <sys/sched.h>
  73 #include <sys/sleepqueue.h>
  74 #include <sys/smp.h>
  75 #include <sys/stat.h>
  76 #include <sys/sysctl.h>
  77 #include <sys/syslog.h>
  78 #include <sys/vmmeter.h>
  79 #include <sys/vnode.h>
  80 #include <sys/watchdog.h>
  81
  82 #include <machine/stdarg.h>
  83
  84 #include <security/mac/mac_framework.h>
  85
  86 #include <vm/vm.h>
  87 #include <vm/vm_object.h>
  88 #include <vm/vm_extern.h>
  89 #include <vm/pmap.h>
  90 #include <vm/vm_map.h>
  91 #include <vm/vm_page.h>
  92 #include <vm/vm_kern.h>
  93 #include <vm/uma.h>
  94
  95 #ifdef DDB
  96 #include <ddb/ddb.h>
  97 #endif
  98
  99 static void     delmntque(struct vnode *vp);
 100 static int      flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
 101                     int slpflag, int slptimeo);
 102 static void     syncer_shutdown(void *arg, int howto);
 103 static int      vtryrecycle(struct vnode *vp);
 104 static void     v_incr_usecount(struct vnode *);
 105 static void     v_decr_usecount(struct vnode *);
 106 static void     v_decr_useonly(struct vnode *);
 107 static void     v_upgrade_usecount(struct vnode *);
 108 static void     vnlru_free(int);
 109 static void     vgonel(struct vnode *);
 110 static void     vfs_knllock(void *arg);
 111 static void     vfs_knlunlock(void *arg);
 112 static void     vfs_knl_assert_locked(void *arg);
 113 static void     vfs_knl_assert_unlocked(void *arg);
 114 static void     destroy_vpollinfo(struct vpollinfo *vi);
 115
 116 /*
 117  * Number of vnodes in existence.  Increased whenever getnewvnode()
 118  * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
 119  */
 120 static unsigned long    numvnodes;
 121
 122 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
 123     "Number of vnodes in existence");
 124
 125 static u_long vnodes_created;
 126 SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
 127     0, "Number of vnodes created by getnewvnode");
 128
 129 /*
 130  * Conversion tables for conversion from vnode types to inode formats
 131  * and back.
 132  */
 133 enum vtype iftovt_tab[16] = {
 134         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 135         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
 136 };
 137 int vttoif_tab[10] = {
 138         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 139         S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
 140 };
 141
 142 /*
 143  * List of vnodes that are ready for recycling.
 144  */
 145 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
 146
 147 /*
 148  * Free vnode target.  Free vnodes may simply be files which have been stat'd
 149  * but not read.  This is somewhat common, and a small cache of such files
 150  * should be kept to avoid recreation costs.
 151  */
 152 static u_long wantfreevnodes;
 153 SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
 154 /* Number of vnodes in the free list. */
 155 static u_long freevnodes;
 156 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
 157     "Number of vnodes in the free list");
 158
 159 static int vlru_allow_cache_src;
 160 SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
 161     &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
 162
 163 static u_long recycles_count;
 164 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
 165     "Number of vnodes recycled to avoid exceding kern.maxvnodes");
 166
 167 /*
 168  * Various variables used for debugging the new implementation of
 169  * reassignbuf().
 170  * XXX these are probably of (very) limited utility now.
 171  */
 172 static int reassignbufcalls;
 173 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
 174     "Number of calls to reassignbuf");
 175
 176 /*
 177  * Cache for the mount type id assigned to NFS.  This is used for
 178  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
 179  */
 180 int     nfs_mount_type = -1;
 181
 182 /* To keep more than one thread at a time from running vfs_getnewfsid */
 183 static struct mtx mntid_mtx;
 184
 185 /*
 186  * Lock for any access to the following:
 187  *      vnode_free_list
 188  *      numvnodes
 189  *      freevnodes
 190  */
 191 static struct mtx vnode_free_list_mtx;
 192
 193 /* Publicly exported FS */
 194 struct nfs_public nfs_pub;
 195
 196 static uma_zone_t buf_trie_zone;
 197
 198 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
 199 static uma_zone_t vnode_zone;
 200 static uma_zone_t vnodepoll_zone;
 201
 202 /*
 203  * The workitem queue.
 204  *
 205  * It is useful to delay writes of file data and filesystem metadata
 206  * for tens of seconds so that quickly created and deleted files need
 207  * not waste disk bandwidth being created and removed. To realize this,
 208  * we append vnodes to a "workitem" queue. When running with a soft
 209  * updates implementation, most pending metadata dependencies should
 210  * not wait for more than a few seconds. Thus, mounted on block devices
 211  * are delayed only about a half the time that file data is delayed.
 212  * Similarly, directory updates are more critical, so are only delayed
 213  * about a third the time that file data is delayed. Thus, there are
 214  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 215  * one each second (driven off the filesystem syncer process). The
 216  * syncer_delayno variable indicates the next queue that is to be processed.
 217  * Items that need to be processed soon are placed in this queue:
 218  *
 219  *      syncer_workitem_pending[syncer_delayno]
 220  *
 221  * A delay of fifteen seconds is done by placing the request fifteen
 222  * entries later in the queue:
 223  *
 224  *      syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 225  *
 226  */
 227 static int syncer_delayno;
 228 static long syncer_mask;
 229 LIST_HEAD(synclist, bufobj);
 230 static struct synclist *syncer_workitem_pending;
 231 /*
 232  * The sync_mtx protects:
 233  *      bo->bo_synclist
 234  *      sync_vnode_count
 235  *      syncer_delayno
 236  *      syncer_state
 237  *      syncer_workitem_pending
 238  *      syncer_worklist_len
 239  *      rushjob
 240  */
 241 static struct mtx sync_mtx;
 242 static struct cv sync_wakeup;
 243
 244 #define SYNCER_MAXDELAY         32
 245 static int syncer_maxdelay = SYNCER_MAXDELAY;   /* maximum delay time */
 246 static int syncdelay = 30;              /* max time to delay syncing data */
 247 static int filedelay = 30;              /* time to delay syncing files */
 248 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
 249     "Time to delay syncing files (in seconds)");
 250 static int dirdelay = 29;               /* time to delay syncing directories */
 251 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
 252     "Time to delay syncing directories (in seconds)");
 253 static int metadelay = 28;              /* time to delay syncing metadata */
 254 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
 255     "Time to delay syncing metadata (in seconds)");
 256 static int rushjob;             /* number of slots to run ASAP */
 257 static int stat_rush_requests;  /* number of times I/O speeded up */
 258 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
 259     "Number of times I/O speeded up (rush requests)");
 260
 261 /*
 262  * When shutting down the syncer, run it at four times normal speed.
 263  */
 264 #define SYNCER_SHUTDOWN_SPEEDUP         4
 265 static int sync_vnode_count;
 266 static int syncer_worklist_len;
 267 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
 268     syncer_state;
 269
 270 /*
 271  * Number of vnodes we want to exist at any one time.  This is mostly used
 272  * to size hash tables in vnode-related code.  It is normally not used in
 273  * getnewvnode(), as wantfreevnodes is normally nonzero.)
 274  *
 275  * XXX desiredvnodes is historical cruft and should not exist.
 276  */
 277 int desiredvnodes;
 278 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
 279     &desiredvnodes, 0, "Maximum number of vnodes");
 280 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
 281     &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
 282 static int vnlru_nowhere;
 283 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
 284     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
 285
 286 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
 287 static int vnsz2log;
 288
 289 /*
 290  * Support for the bufobj clean & dirty pctrie.
 291  */
 292 static void *
 293 buf_trie_alloc(struct pctrie *ptree)
 294 {
 295
 296         return uma_zalloc(buf_trie_zone, M_NOWAIT);
 297 }
 298
 299 static void
 300 buf_trie_free(struct pctrie *ptree, void *node)
 301 {
 302
 303         uma_zfree(buf_trie_zone, node);
 304 }
 305 PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
 306
 307 /*
 308  * Initialize the vnode management data structures.
 309  *
 310  * Reevaluate the following cap on the number of vnodes after the physical
 311  * memory size exceeds 512GB.  In the limit, as the physical memory size
 312  * grows, the ratio of physical pages to vnodes approaches sixteen to one.
 313  */
 314 #ifndef MAXVNODES_MAX
 315 #define MAXVNODES_MAX   (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
 316 #endif
 317 static void
 318 vntblinit(void *dummy __unused)
 319 {
 320         u_int i;
 321         int physvnodes, virtvnodes;
 322
 323         /*
 324          * Desiredvnodes is a function of the physical memory size and the
 325          * kernel's heap size.  Generally speaking, it scales with the
 326          * physical memory size.  The ratio of desiredvnodes to physical pages
 327          * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
 328          * marginal ratio of desiredvnodes to physical pages is one to
 329          * sixteen.  However, desiredvnodes is limited by the kernel's heap
 330          * size.  The memory required by desiredvnodes vnodes and vm objects
 331          * may not exceed one seventh of the kernel's heap size.
 332          */
 333         physvnodes = maxproc + vm_cnt.v_page_count / 16 + 3 * min(98304 * 4,
 334             vm_cnt.v_page_count) / 16;
 335         virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
 336             sizeof(struct vnode)));
 337         desiredvnodes = min(physvnodes, virtvnodes);
 338         if (desiredvnodes > MAXVNODES_MAX) {
 339                 if (bootverbose)
 340                         printf("Reducing kern.maxvnodes %d -> %d\n",
 341                             desiredvnodes, MAXVNODES_MAX);
 342                 desiredvnodes = MAXVNODES_MAX;
 343         }
 344         wantfreevnodes = desiredvnodes / 4;
 345         mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
 346         TAILQ_INIT(&vnode_free_list);
 347         mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
 348         vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
 349             NULL, NULL, UMA_ALIGN_PTR, 0);
 350         vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 351             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 352         /*
 353          * Preallocate enough nodes to support one-per buf so that
 354          * we can not fail an insert.  reassignbuf() callers can not
 355          * tolerate the insertion failure.
 356          */
 357         buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
 358             NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
 359             UMA_ZONE_NOFREE | UMA_ZONE_VM);
 360         uma_prealloc(buf_trie_zone, nbuf);
 361         /*
 362          * Initialize the filesystem syncer.
 363          */
 364         syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
 365             &syncer_mask);
 366         syncer_maxdelay = syncer_mask + 1;
 367         mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
 368         cv_init(&sync_wakeup, "syncer");
 369         for (i = 1; i <= sizeof(struct vnode); i <<= 1)
 370                 vnsz2log++;
 371         vnsz2log--;
 372 }
 373 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
 374
 375
 376 /*
 377  * Mark a mount point as busy. Used to synchronize access and to delay
 378  * unmounting. Eventually, mountlist_mtx is not released on failure.
 379  *
 380  * vfs_busy() is a custom lock, it can block the caller.
 381  * vfs_busy() only sleeps if the unmount is active on the mount point.
 382  * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
 383  * vnode belonging to mp.
 384  *
 385  * Lookup uses vfs_busy() to traverse mount points.
 386  * root fs                      var fs
 387  * / vnode lock         A       / vnode lock (/var)             D
 388  * /var vnode lock      B       /log vnode lock(/var/log)       E
 389  * vfs_busy lock        C       vfs_busy lock                   F
 390  *
 391  * Within each file system, the lock order is C->A->B and F->D->E.
 392  *
 393  * When traversing across mounts, the system follows that lock order:
 394  *
 395  *        C->A->B
 396  *              |
 397  *              +->F->D->E
 398  *
 399  * The lookup() process for namei("/var") illustrates the process:
 400  *  VOP_LOOKUP() obtains B while A is held
 401  *  vfs_busy() obtains a shared lock on F while A and B are held
 402  *  vput() releases lock on B
 403  *  vput() releases lock on A
 404  *  VFS_ROOT() obtains lock on D while shared lock on F is held
 405  *  vfs_unbusy() releases shared lock on F
 406  *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
 407  *    Attempt to lock A (instead of vp_crossmp) while D is held would
 408  *    violate the global order, causing deadlocks.
 409  *
 410  * dounmount() locks B while F is drained.
 411  */
 412 int
 413 vfs_busy(struct mount *mp, int flags)
 414 {
 415
 416         MPASS((flags & ~MBF_MASK) == 0);
 417         CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
 418
 419         MNT_ILOCK(mp);
 420         MNT_REF(mp);
 421         /*
 422          * If mount point is currenly being unmounted, sleep until the
 423          * mount point fate is decided.  If thread doing the unmounting fails,
 424          * it will clear MNTK_UNMOUNT flag before waking us up, indicating
 425          * that this mount point has survived the unmount attempt and vfs_busy
 426          * should retry.  Otherwise the unmounter thread will set MNTK_REFEXPIRE
 427          * flag in addition to MNTK_UNMOUNT, indicating that mount point is
 428          * about to be really destroyed.  vfs_busy needs to release its
 429          * reference on the mount point in this case and return with ENOENT,
 430          * telling the caller that mount mount it tried to busy is no longer
 431          * valid.
 432          */
 433         while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 434                 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
 435                         MNT_REL(mp);
 436                         MNT_IUNLOCK(mp);
 437                         CTR1(KTR_VFS, "%s: failed busying before sleeping",
 438                             __func__);
 439                         return (ENOENT);
 440                 }
 441                 if (flags & MBF_MNTLSTLOCK)
 442                         mtx_unlock(&mountlist_mtx);
 443                 mp->mnt_kern_flag |= MNTK_MWAIT;
 444                 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
 445                 if (flags & MBF_MNTLSTLOCK)
 446                         mtx_lock(&mountlist_mtx);
 447                 MNT_ILOCK(mp);
 448         }
 449         if (flags & MBF_MNTLSTLOCK)
 450                 mtx_unlock(&mountlist_mtx);
 451         mp->mnt_lockref++;
 452         MNT_IUNLOCK(mp);
 453         return (0);
 454 }
 455
 456 /*
 457  * Free a busy filesystem.
 458  */
 459 void
 460 vfs_unbusy(struct mount *mp)
 461 {
 462
 463         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 464         MNT_ILOCK(mp);
 465         MNT_REL(mp);
 466         KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
 467         mp->mnt_lockref--;
 468         if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
 469                 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
 470                 CTR1(KTR_VFS, "%s: waking up waiters", __func__);
 471                 mp->mnt_kern_flag &= ~MNTK_DRAINING;
 472                 wakeup(&mp->mnt_lockref);
 473         }
 474         MNT_IUNLOCK(mp);
 475 }
 476
 477 /*
 478  * Lookup a mount point by filesystem identifier.
 479  */
 480 struct mount *
 481 vfs_getvfs(fsid_t *fsid)
 482 {
 483         struct mount *mp;
 484
 485         CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
 486         mtx_lock(&mountlist_mtx);
 487         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 488                 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
 489                     mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
 490                         vfs_ref(mp);
 491                         mtx_unlock(&mountlist_mtx);
 492                         return (mp);
 493                 }
 494         }
 495         mtx_unlock(&mountlist_mtx);
 496         CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
 497         return ((struct mount *) 0);
 498 }
 499
 500 /*
 501  * Lookup a mount point by filesystem identifier, busying it before
 502  * returning.
 503  *
 504  * To avoid congestion on mountlist_mtx, implement simple direct-mapped
 505  * cache for popular filesystem identifiers.  The cache is lockess, using
 506  * the fact that struct mount's are never freed.  In worst case we may
 507  * get pointer to unmounted or even different filesystem, so we have to
 508  * check what we got, and go slow way if so.
 509  */
 510 struct mount *
 511 vfs_busyfs(fsid_t *fsid)
 512 {
 513 #define FSID_CACHE_SIZE 256
 514         typedef struct mount * volatile vmp_t;
 515         static vmp_t cache[FSID_CACHE_SIZE];
 516         struct mount *mp;
 517         int error;
 518         uint32_t hash;
 519
 520         CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
 521         hash = fsid->val[0] ^ fsid->val[1];
 522         hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
 523         mp = cache[hash];
 524         if (mp == NULL ||
 525             mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
 526             mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
 527                 goto slow;
 528         if (vfs_busy(mp, 0) != 0) {
 529                 cache[hash] = NULL;
 530                 goto slow;
 531         }
 532         if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
 533             mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
 534                 return (mp);
 535         else
 536             vfs_unbusy(mp);
 537
 538 slow:
 539         mtx_lock(&mountlist_mtx);
 540         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 541                 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
 542                     mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
 543                         error = vfs_busy(mp, MBF_MNTLSTLOCK);
 544                         if (error) {
 545                                 cache[hash] = NULL;
 546                                 mtx_unlock(&mountlist_mtx);
 547                                 return (NULL);
 548                         }
 549                         cache[hash] = mp;
 550                         return (mp);
 551                 }
 552         }
 553         CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
 554         mtx_unlock(&mountlist_mtx);
 555         return ((struct mount *) 0);
 556 }
 557
 558 /*
 559  * Check if a user can access privileged mount options.
 560  */
 561 int
 562 vfs_suser(struct mount *mp, struct thread *td)
 563 {
 564         int error;
 565
 566         /*
 567          * If the thread is jailed, but this is not a jail-friendly file
 568          * system, deny immediately.
 569          */
 570         if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
 571                 return (EPERM);
 572
 573         /*
 574          * If the file system was mounted outside the jail of the calling
 575          * thread, deny immediately.
 576          */
 577         if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
 578                 return (EPERM);
 579
 580         /*
 581          * If file system supports delegated administration, we don't check
 582          * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
 583          * by the file system itself.
 584          * If this is not the user that did original mount, we check for
 585          * the PRIV_VFS_MOUNT_OWNER privilege.
 586          */
 587         if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
 588             mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
 589                 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
 590                         return (error);
 591         }
 592         return (0);
 593 }
 594
 595 /*
 596  * Get a new unique fsid.  Try to make its val[0] unique, since this value
 597  * will be used to create fake device numbers for stat().  Also try (but
 598  * not so hard) make its val[0] unique mod 2^16, since some emulators only
 599  * support 16-bit device numbers.  We end up with unique val[0]'s for the
 600  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 601  *
 602  * Keep in mind that several mounts may be running in parallel.  Starting
 603  * the search one past where the previous search terminated is both a
 604  * micro-optimization and a defense against returning the same fsid to
 605  * different mounts.
 606  */
 607 void
 608 vfs_getnewfsid(struct mount *mp)
 609 {
 610         static uint16_t mntid_base;
 611         struct mount *nmp;
 612         fsid_t tfsid;
 613         int mtype;
 614
 615         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 616         mtx_lock(&mntid_mtx);
 617         mtype = mp->mnt_vfc->vfc_typenum;
 618         tfsid.val[1] = mtype;
 619         mtype = (mtype & 0xFF) << 24;
 620         for (;;) {
 621                 tfsid.val[0] = makedev(255,
 622                     mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
 623                 mntid_base++;
 624                 if ((nmp = vfs_getvfs(&tfsid)) == NULL)
 625                         break;
 626                 vfs_rel(nmp);
 627         }
 628         mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
 629         mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
 630         mtx_unlock(&mntid_mtx);
 631 }
 632
 633 /*
 634  * Knob to control the precision of file timestamps:
 635  *
 636  *   0 = seconds only; nanoseconds zeroed.
 637  *   1 = seconds and nanoseconds, accurate within 1/HZ.
 638  *   2 = seconds and nanoseconds, truncated to microseconds.
 639  * >=3 = seconds and nanoseconds, maximum precision.
 640  */
 641 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 642
 643 static int timestamp_precision = TSP_USEC;
 644 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
 645     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
 646     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, "
 647     "3+: sec + ns (max. precision))");
 648
 649 /*
 650  * Get a current timestamp.
 651  */
 652 void
 653 vfs_timestamp(struct timespec *tsp)
 654 {
 655         struct timeval tv;
 656
 657         switch (timestamp_precision) {
 658         case TSP_SEC:
 659                 tsp->tv_sec = time_second;
 660                 tsp->tv_nsec = 0;
 661                 break;
 662         case TSP_HZ:
 663                 getnanotime(tsp);
 664                 break;
 665         case TSP_USEC:
 666                 microtime(&tv);
 667                 TIMEVAL_TO_TIMESPEC(&tv, tsp);
 668                 break;
 669         case TSP_NSEC:
 670         default:
 671                 nanotime(tsp);
 672                 break;
 673         }
 674 }
 675
 676 /*
 677  * Set vnode attributes to VNOVAL
 678  */
 679 void
 680 vattr_null(struct vattr *vap)
 681 {
 682
 683         vap->va_type = VNON;
 684         vap->va_size = VNOVAL;
 685         vap->va_bytes = VNOVAL;
 686         vap->va_mode = VNOVAL;
 687         vap->va_nlink = VNOVAL;
 688         vap->va_uid = VNOVAL;
 689         vap->va_gid = VNOVAL;
 690         vap->va_fsid = VNOVAL;
 691         vap->va_fileid = VNOVAL;
 692         vap->va_blocksize = VNOVAL;
 693         vap->va_rdev = VNOVAL;
 694         vap->va_atime.tv_sec = VNOVAL;
 695         vap->va_atime.tv_nsec = VNOVAL;
 696         vap->va_mtime.tv_sec = VNOVAL;
 697         vap->va_mtime.tv_nsec = VNOVAL;
 698         vap->va_ctime.tv_sec = VNOVAL;
 699         vap->va_ctime.tv_nsec = VNOVAL;
 700         vap->va_birthtime.tv_sec = VNOVAL;
 701         vap->va_birthtime.tv_nsec = VNOVAL;
 702         vap->va_flags = VNOVAL;
 703         vap->va_gen = VNOVAL;
 704         vap->va_vaflags = 0;
 705 }
 706
 707 /*
 708  * This routine is called when we have too many vnodes.  It attempts
 709  * to free <count> vnodes and will potentially free vnodes that still
 710  * have VM backing store (VM backing store is typically the cause
 711  * of a vnode blowout so we want to do this).  Therefore, this operation
 712  * is not considered cheap.
 713  *
 714  * A number of conditions may prevent a vnode from being reclaimed.
 715  * the buffer cache may have references on the vnode, a directory
 716  * vnode may still have references due to the namei cache representing
 717  * underlying files, or the vnode may be in active use.   It is not
 718  * desireable to reuse such vnodes.  These conditions may cause the
 719  * number of vnodes to reach some minimum value regardless of what
 720  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 721  */
 722 static int
 723 vlrureclaim(struct mount *mp)
 724 {
 725         struct vnode *vp;
 726         int done;
 727         int trigger;
 728         int usevnodes;
 729         int count;
 730
 731         /*
 732          * Calculate the trigger point, don't allow user
 733          * screwups to blow us up.   This prevents us from
 734          * recycling vnodes with lots of resident pages.  We
 735          * aren't trying to free memory, we are trying to
 736          * free vnodes.
 737          */
 738         usevnodes = desiredvnodes;
 739         if (usevnodes <= 0)
 740                 usevnodes = 1;
 741         trigger = vm_cnt.v_page_count * 2 / usevnodes;
 742         done = 0;
 743         vn_start_write(NULL, &mp, V_WAIT);
 744         MNT_ILOCK(mp);
 745         count = mp->mnt_nvnodelistsize / 10 + 1;
 746         while (count != 0) {
 747                 vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
 748                 while (vp != NULL && vp->v_type == VMARKER)
 749                         vp = TAILQ_NEXT(vp, v_nmntvnodes);
 750                 if (vp == NULL)
 751                         break;
 752                 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 753                 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 754                 --count;
 755                 if (!VI_TRYLOCK(vp))
 756                         goto next_iter;
 757                 /*
 758                  * If it's been deconstructed already, it's still
 759                  * referenced, or it exceeds the trigger, skip it.
 760                  */
 761                 if (vp->v_usecount ||
 762                     (!vlru_allow_cache_src &&
 763                         !LIST_EMPTY(&(vp)->v_cache_src)) ||
 764                     (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
 765                     vp->v_object->resident_page_count > trigger)) {
 766                         VI_UNLOCK(vp);
 767                         goto next_iter;
 768                 }
 769                 MNT_IUNLOCK(mp);
 770                 vholdl(vp);
 771                 if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
 772                         vdrop(vp);
 773                         goto next_iter_mntunlocked;
 774                 }
 775                 VI_LOCK(vp);
 776                 /*
 777                  * v_usecount may have been bumped after VOP_LOCK() dropped
 778                  * the vnode interlock and before it was locked again.
 779                  *
 780                  * It is not necessary to recheck VI_DOOMED because it can
 781                  * only be set by another thread that holds both the vnode
 782                  * lock and vnode interlock.  If another thread has the
 783                  * vnode lock before we get to VOP_LOCK() and obtains the
 784                  * vnode interlock after VOP_LOCK() drops the vnode
 785                  * interlock, the other thread will be unable to drop the
 786                  * vnode lock before our VOP_LOCK() call fails.
 787                  */
 788                 if (vp->v_usecount ||
 789                     (!vlru_allow_cache_src &&
 790                         !LIST_EMPTY(&(vp)->v_cache_src)) ||
 791                     (vp->v_object != NULL &&
 792                     vp->v_object->resident_page_count > trigger)) {
 793                         VOP_UNLOCK(vp, LK_INTERLOCK);
 794                         vdrop(vp);
 795                         goto next_iter_mntunlocked;
 796                 }
 797                 KASSERT((vp->v_iflag & VI_DOOMED) == 0,
 798                     ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
 799                 atomic_add_long(&recycles_count, 1);
 800                 vgonel(vp);
 801                 VOP_UNLOCK(vp, 0);
 802                 vdropl(vp);
 803                 done++;
 804 next_iter_mntunlocked:
 805                 if (!should_yield())
 806                         goto relock_mnt;
 807                 goto yield;
 808 next_iter:
 809                 if (!should_yield())
 810                         continue;
 811                 MNT_IUNLOCK(mp);
 812 yield:
 813                 kern_yield(PRI_USER);
 814 relock_mnt:
 815                 MNT_ILOCK(mp);
 816         }
 817         MNT_IUNLOCK(mp);
 818         vn_finished_write(mp);
 819         return done;
 820 }
 821
 822 /*
 823  * Attempt to keep the free list at wantfreevnodes length.
 824  */
 825 static void
 826 vnlru_free(int count)
 827 {
 828         struct vnode *vp;
 829
 830         mtx_assert(&vnode_free_list_mtx, MA_OWNED);
 831         for (; count > 0; count--) {
 832                 vp = TAILQ_FIRST(&vnode_free_list);
 833                 /*
 834                  * The list can be modified while the free_list_mtx
 835                  * has been dropped and vp could be NULL here.
 836                  */
 837                 if (!vp)
 838                         break;
 839                 VNASSERT(vp->v_op != NULL, vp,
 840                     ("vnlru_free: vnode already reclaimed."));
 841                 KASSERT((vp->v_iflag & VI_FREE) != 0,
 842                     ("Removing vnode not on freelist"));
 843                 KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
 844                     ("Mangling active vnode"));
 845                 TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
 846                 /*
 847                  * Don't recycle if we can't get the interlock.
 848                  */
 849                 if (!VI_TRYLOCK(vp)) {
 850                         TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
 851                         continue;
 852                 }
 853                 VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
 854                     vp, ("vp inconsistent on freelist"));
 855
 856                 /*
 857                  * The clear of VI_FREE prevents activation of the
 858                  * vnode.  There is no sense in putting the vnode on
 859                  * the mount point active list, only to remove it
 860                  * later during recycling.  Inline the relevant part
 861                  * of vholdl(), to avoid triggering assertions or
 862                  * activating.
 863                  */
 864                 freevnodes--;
 865                 vp->v_iflag &= ~VI_FREE;
 866                 vp->v_holdcnt++;
 867
 868                 mtx_unlock(&vnode_free_list_mtx);
 869                 VI_UNLOCK(vp);
 870                 vtryrecycle(vp);
 871                 /*
 872                  * If the recycled succeeded this vdrop will actually free
 873                  * the vnode.  If not it will simply place it back on
 874                  * the free list.
 875                  */
 876                 vdrop(vp);
 877                 mtx_lock(&vnode_free_list_mtx);
 878         }
 879 }
 880 /*
 881  * Attempt to recycle vnodes in a context that is always safe to block.
 882  * Calling vlrurecycle() from the bowels of filesystem code has some
 883  * interesting deadlock problems.
 884  */
 885 static struct proc *vnlruproc;
 886 static int vnlruproc_sig;
 887
 888 static void
 889 vnlru_proc(void)
 890 {
 891         struct mount *mp, *nmp;
 892         int done;
 893         struct proc *p = vnlruproc;
 894
 895         EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
 896             SHUTDOWN_PRI_FIRST);
 897
 898         for (;;) {
 899                 kproc_suspend_check(p);
 900                 mtx_lock(&vnode_free_list_mtx);
 901                 if (freevnodes > wantfreevnodes)
 902                         vnlru_free(freevnodes - wantfreevnodes);
 903                 if (numvnodes <= desiredvnodes * 9 / 10) {
 904                         vnlruproc_sig = 0;
 905                         wakeup(&vnlruproc_sig);
 906                         msleep(vnlruproc, &vnode_free_list_mtx,
 907                             PVFS|PDROP, "vlruwt", hz);
 908                         continue;
 909                 }
 910                 mtx_unlock(&vnode_free_list_mtx);
 911                 done = 0;
 912                 mtx_lock(&mountlist_mtx);
 913                 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 914                         if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 915                                 nmp = TAILQ_NEXT(mp, mnt_list);
 916                                 continue;
 917                         }
 918                         done += vlrureclaim(mp);
 919                         mtx_lock(&mountlist_mtx);
 920                         nmp = TAILQ_NEXT(mp, mnt_list);
 921                         vfs_unbusy(mp);
 922                 }
 923                 mtx_unlock(&mountlist_mtx);
 924                 if (done == 0) {
 925 #if 0
 926                         /* These messages are temporary debugging aids */
 927                         if (vnlru_nowhere < 5)
 928                                 printf("vnlru process getting nowhere..\n");
 929                         else if (vnlru_nowhere == 5)
 930                                 printf("vnlru process messages stopped.\n");
 931 #endif
 932                         vnlru_nowhere++;
 933                         tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 934                 } else
 935                         kern_yield(PRI_USER);
 936         }
 937 }
 938
 939 static struct kproc_desc vnlru_kp = {
 940         "vnlru",
 941         vnlru_proc,
 942         &vnlruproc
 943 };
 944 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
 945     &vnlru_kp);
 946
 947 /*
 948  * Routines having to do with the management of the vnode table.
 949  */
 950
 951 /*
 952  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
 953  * before we actually vgone().  This function must be called with the vnode
 954  * held to prevent the vnode from being returned to the free list midway
 955  * through vgone().
 956  */
 957 static int
 958 vtryrecycle(struct vnode *vp)
 959 {
 960         struct mount *vnmp;
 961
 962         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 963         VNASSERT(vp->v_holdcnt, vp,
 964             ("vtryrecycle: Recycling vp %p without a reference.", vp));
 965         /*
 966          * This vnode may found and locked via some other list, if so we
 967          * can't recycle it yet.
 968          */
 969         if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
 970                 CTR2(KTR_VFS,
 971                     "%s: impossible to recycle, vp %p lock is already held",
 972                     __func__, vp);
 973                 return (EWOULDBLOCK);
 974         }
 975         /*
 976          * Don't recycle if its filesystem is being suspended.
 977          */
 978         if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
 979                 VOP_UNLOCK(vp, 0);
 980                 CTR2(KTR_VFS,
 981                     "%s: impossible to recycle, cannot start the write for %p",
 982                     __func__, vp);
 983                 return (EBUSY);
 984         }
 985         /*
 986          * If we got this far, we need to acquire the interlock and see if
 987          * anyone picked up this vnode from another list.  If not, we will
 988          * mark it with DOOMED via vgonel() so that anyone who does find it
 989          * will skip over it.
 990          */
 991         VI_LOCK(vp);
 992         if (vp->v_usecount) {
 993                 VOP_UNLOCK(vp, LK_INTERLOCK);
 994                 vn_finished_write(vnmp);
 995                 CTR2(KTR_VFS,
 996                     "%s: impossible to recycle, %p is already referenced",
 997                     __func__, vp);
 998                 return (EBUSY);
 999         }
1000         if ((vp->v_iflag & VI_DOOMED) == 0) {
1001                 atomic_add_long(&recycles_count, 1);
1002                 vgonel(vp);
1003         }
1004         VOP_UNLOCK(vp, LK_INTERLOCK);
1005         vn_finished_write(vnmp);
1006         return (0);
1007 }
1008
1009 /*
1010  * Wait for available vnodes.
1011  */
1012 static int
1013 getnewvnode_wait(int suspended)
1014 {
1015
1016         mtx_assert(&vnode_free_list_mtx, MA_OWNED);
1017         if (numvnodes > desiredvnodes) {
1018                 if (suspended) {
1019                         /*
1020                          * File system is beeing suspended, we cannot risk a
1021                          * deadlock here, so allocate new vnode anyway.
1022                          */
1023                         if (freevnodes > wantfreevnodes)
1024                                 vnlru_free(freevnodes - wantfreevnodes);
1025                         return (0);
1026                 }
1027                 if (vnlruproc_sig == 0) {
1028                         vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
1029                         wakeup(vnlruproc);
1030                 }
1031                 msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
1032                     "vlruwk", hz);
1033         }
1034         return (numvnodes > desiredvnodes ? ENFILE : 0);
1035 }
1036
1037 void
1038 getnewvnode_reserve(u_int count)
1039 {
1040         struct thread *td;
1041
1042         td = curthread;
1043         /* First try to be quick and racy. */
1044         if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
1045                 td->td_vp_reserv += count;
1046                 return;
1047         } else
1048                 atomic_subtract_long(&numvnodes, count);
1049
1050         mtx_lock(&vnode_free_list_mtx);
1051         while (count > 0) {
1052                 if (getnewvnode_wait(0) == 0) {
1053                         count--;
1054                         td->td_vp_reserv++;
1055                         atomic_add_long(&numvnodes, 1);
1056                 }
1057         }
1058         mtx_unlock(&vnode_free_list_mtx);
1059 }
1060
1061 void
1062 getnewvnode_drop_reserve(void)
1063 {
1064         struct thread *td;
1065
1066         td = curthread;
1067         atomic_subtract_long(&numvnodes, td->td_vp_reserv);
1068         td->td_vp_reserv = 0;
1069 }
1070
1071 /*
1072  * Return the next vnode from the free list.
1073  */
1074 int
1075 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
1076     struct vnode **vpp)
1077 {
1078         struct vnode *vp;
1079         struct bufobj *bo;
1080         struct thread *td;
1081         int error;
1082
1083         CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
1084         vp = NULL;
1085         td = curthread;
1086         if (td->td_vp_reserv > 0) {
1087                 td->td_vp_reserv -= 1;
1088                 goto alloc;
1089         }
1090         mtx_lock(&vnode_free_list_mtx);
1091         /*
1092          * Lend our context to reclaim vnodes if they've exceeded the max.
1093          */
1094         if (freevnodes > wantfreevnodes)
1095                 vnlru_free(1);
1096         error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
1097             MNTK_SUSPEND));
1098 #if 0   /* XXX Not all VFS_VGET/ffs_vget callers check returns. */
1099         if (error != 0) {
1100                 mtx_unlock(&vnode_free_list_mtx);
1101                 return (error);
1102         }
1103 #endif
1104         atomic_add_long(&numvnodes, 1);
1105         mtx_unlock(&vnode_free_list_mtx);
1106 alloc:
1107         atomic_add_long(&vnodes_created, 1);
1108         vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
1109         /*
1110          * Setup locks.
1111          */
1112         vp->v_vnlock = &vp->v_lock;
1113         mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
1114         /*
1115          * By default, don't allow shared locks unless filesystems
1116          * opt-in.
1117          */
1118         lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE);
1119         /*
1120          * Initialize bufobj.
1121          */
1122         bo = &vp->v_bufobj;
1123         bo->__bo_vnode = vp;
1124         rw_init(BO_LOCKPTR(bo), "bufobj interlock");
1125         bo->bo_ops = &buf_ops_bio;
1126         bo->bo_private = vp;
1127         TAILQ_INIT(&bo->bo_clean.bv_hd);
1128         TAILQ_INIT(&bo->bo_dirty.bv_hd);
1129         /*
1130          * Initialize namecache.
1131          */
1132         LIST_INIT(&vp->v_cache_src);
1133         TAILQ_INIT(&vp->v_cache_dst);
1134         /*
1135          * Finalize various vnode identity bits.
1136          */
1137         vp->v_type = VNON;
1138         vp->v_tag = tag;
1139         vp->v_op = vops;
1140         v_incr_usecount(vp);
1141         vp->v_data = NULL;
1142 #ifdef MAC
1143         mac_vnode_init(vp);
1144         if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1145                 mac_vnode_associate_singlelabel(mp, vp);
1146         else if (mp == NULL && vops != &dead_vnodeops)
1147                 printf("NULL mp in getnewvnode()\n");
1148 #endif
1149         if (mp != NULL) {
1150                 bo->bo_bsize = mp->mnt_stat.f_iosize;
1151                 if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
1152                         vp->v_vflag |= VV_NOKNOTE;
1153         }
1154         rangelock_init(&vp->v_rl);
1155
1156         /*
1157          * For the filesystems which do not use vfs_hash_insert(),
1158          * still initialize v_hash to have vfs_hash_index() useful.
1159          * E.g., nullfs uses vfs_hash_index() on the lower vnode for
1160          * its own hashing.
1161          */
1162         vp->v_hash = (uintptr_t)vp >> vnsz2log;
1163
1164         *vpp = vp;
1165         return (0);
1166 }
1167
1168 /*
1169  * Delete from old mount point vnode list, if on one.
1170  */
1171 static void
1172 delmntque(struct vnode *vp)
1173 {
1174         struct mount *mp;
1175         int active;
1176
1177         mp = vp->v_mount;
1178         if (mp == NULL)
1179                 return;
1180         MNT_ILOCK(mp);
1181         VI_LOCK(vp);
1182         KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
1183             ("Active vnode list size %d > Vnode list size %d",
1184              mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
1185         active = vp->v_iflag & VI_ACTIVE;
1186         vp->v_iflag &= ~VI_ACTIVE;
1187         if (active) {
1188                 mtx_lock(&vnode_free_list_mtx);
1189                 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
1190                 mp->mnt_activevnodelistsize--;
1191                 mtx_unlock(&vnode_free_list_mtx);
1192         }
1193         vp->v_mount = NULL;
1194         VI_UNLOCK(vp);
1195         VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1196                 ("bad mount point vnode list size"));
1197         TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1198         mp->mnt_nvnodelistsize--;
1199         MNT_REL(mp);
1200         MNT_IUNLOCK(mp);
1201 }
1202
1203 static void
1204 insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1205 {
1206
1207         vp->v_data = NULL;
1208         vp->v_op = &dead_vnodeops;
1209         vgone(vp);
1210         vput(vp);
1211 }
1212
1213 /*
1214  * Insert into list of vnodes for the new mount point, if available.
1215  */
1216 int
1217 insmntque1(struct vnode *vp, struct mount *mp,
1218         void (*dtr)(struct vnode *, void *), void *dtr_arg)
1219 {
1220
1221         KASSERT(vp->v_mount == NULL,
1222                 ("insmntque: vnode already on per mount vnode list"));
1223         VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1224         ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
1225
1226         /*
1227          * We acquire the vnode interlock early to ensure that the
1228          * vnode cannot be recycled by another process releasing a
1229          * holdcnt on it before we get it on both the vnode list
1230          * and the active vnode list. The mount mutex protects only
1231          * manipulation of the vnode list and the vnode freelist
1232          * mutex protects only manipulation of the active vnode list.
1233          * Hence the need to hold the vnode interlock throughout.
1234          */
1235         MNT_ILOCK(mp);
1236         VI_LOCK(vp);
1237         if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1238             ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1239             mp->mnt_nvnodelistsize == 0)) &&
1240             (vp->v_vflag & VV_FORCEINSMQ) == 0) {
1241                 VI_UNLOCK(vp);
1242                 MNT_IUNLOCK(mp);
1243                 if (dtr != NULL)
1244                         dtr(vp, dtr_arg);
1245                 return (EBUSY);
1246         }
1247         vp->v_mount = mp;
1248         MNT_REF(mp);
1249         TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1250         VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1251                 ("neg mount point vnode list size"));
1252         mp->mnt_nvnodelistsize++;
1253         KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
1254             ("Activating already active vnode"));
1255         vp->v_iflag |= VI_ACTIVE;
1256         mtx_lock(&vnode_free_list_mtx);
1257         TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
1258         mp->mnt_activevnodelistsize++;
1259         mtx_unlock(&vnode_free_list_mtx);
1260         VI_UNLOCK(vp);
1261         MNT_IUNLOCK(mp);
1262         return (0);
1263 }
1264
1265 int
1266 insmntque(struct vnode *vp, struct mount *mp)
1267 {
1268
1269         return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1270 }
1271
1272 /*
1273  * Flush out and invalidate all buffers associated with a bufobj
1274  * Called with the underlying object locked.
1275  */
1276 int
1277 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
1278 {
1279         int error;
1280
1281         BO_LOCK(bo);
1282         if (flags & V_SAVE) {
1283                 error = bufobj_wwait(bo, slpflag, slptimeo);
1284                 if (error) {
1285                         BO_UNLOCK(bo);
1286                         return (error);
1287                 }
1288                 if (bo->bo_dirty.bv_cnt > 0) {
1289                         BO_UNLOCK(bo);
1290                         if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
1291                                 return (error);
1292                         /*
1293                          * XXX We could save a lock/unlock if this was only
1294                          * enabled under INVARIANTS
1295                          */
1296                         BO_LOCK(bo);
1297                         if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1298                                 panic("vinvalbuf: dirty bufs");
1299                 }
1300         }
1301         /*
1302          * If you alter this loop please notice that interlock is dropped and
1303          * reacquired in flushbuflist.  Special care is needed to ensure that
1304          * no race conditions occur from this.
1305          */
1306         do {
1307                 error = flushbuflist(&bo->bo_clean,
1308                     flags, bo, slpflag, slptimeo);
1309                 if (error == 0 && !(flags & V_CLEANONLY))
1310                         error = flushbuflist(&bo->bo_dirty,
1311                             flags, bo, slpflag, slptimeo);
1312                 if (error != 0 && error != EAGAIN) {
1313                         BO_UNLOCK(bo);
1314                         return (error);
1315                 }
1316         } while (error != 0);
1317
1318         /*
1319          * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1320          * have write I/O in-progress but if there is a VM object then the
1321          * VM object can also have read-I/O in-progress.
1322          */
1323         do {
1324                 bufobj_wwait(bo, 0, 0);
1325                 BO_UNLOCK(bo);
1326                 if (bo->bo_object != NULL) {
1327                         VM_OBJECT_WLOCK(bo->bo_object);
1328                         vm_object_pip_wait(bo->bo_object, "bovlbx");
1329                         VM_OBJECT_WUNLOCK(bo->bo_object);
1330                 }
1331                 BO_LOCK(bo);
1332         } while (bo->bo_numoutput > 0);
1333         BO_UNLOCK(bo);
1334
1335         /*
1336          * Destroy the copy in the VM cache, too.
1337          */
1338         if (bo->bo_object != NULL &&
1339             (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
1340                 VM_OBJECT_WLOCK(bo->bo_object);
1341                 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
1342                     OBJPR_CLEANONLY : 0);
1343                 VM_OBJECT_WUNLOCK(bo->bo_object);
1344         }
1345
1346 #ifdef INVARIANTS
1347         BO_LOCK(bo);
1348         if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
1349             (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1350                 panic("vinvalbuf: flush failed");
1351         BO_UNLOCK(bo);
1352 #endif
1353         return (0);
1354 }
1355
1356 /*
1357  * Flush out and invalidate all buffers associated with a vnode.
1358  * Called with the underlying object locked.
1359  */
1360 int
1361 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1362 {
1363
1364         CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
1365         ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1366         if (vp->v_object != NULL && vp->v_object->handle != vp)
1367                 return (0);
1368         return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1369 }
1370
1371 /*
1372  * Flush out buffers on the specified list.
1373  *
1374  */
1375 static int
1376 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
1377     int slptimeo)
1378 {
1379         struct buf *bp, *nbp;
1380         int retval, error;
1381         daddr_t lblkno;
1382         b_xflags_t xflags;
1383
1384         ASSERT_BO_WLOCKED(bo);
1385
1386         retval = 0;
1387         TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1388                 if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1389                     ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1390                         continue;
1391                 }
1392                 lblkno = 0;
1393                 xflags = 0;
1394                 if (nbp != NULL) {
1395                         lblkno = nbp->b_lblkno;
1396                         xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
1397                 }
1398                 retval = EAGAIN;
1399                 error = BUF_TIMELOCK(bp,
1400                     LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
1401                     "flushbuf", slpflag, slptimeo);
1402                 if (error) {
1403                         BO_LOCK(bo);
1404                         return (error != ENOLCK ? error : EAGAIN);
1405                 }
1406                 KASSERT(bp->b_bufobj == bo,
1407                     ("bp %p wrong b_bufobj %p should be %p",
1408                     bp, bp->b_bufobj, bo));
1409                 if (bp->b_bufobj != bo) {       /* XXX: necessary ? */
1410                         BUF_UNLOCK(bp);
1411                         BO_LOCK(bo);
1412                         return (EAGAIN);
1413                 }
1414                 /*
1415                  * XXX Since there are no node locks for NFS, I
1416                  * believe there is a slight chance that a delayed
1417                  * write will occur while sleeping just above, so
1418                  * check for it.
1419                  */
1420                 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1421                     (flags & V_SAVE)) {
1422                         bremfree(bp);
1423                         bp->b_flags |= B_ASYNC;
1424                         bwrite(bp);
1425                         BO_LOCK(bo);
1426                         return (EAGAIN);        /* XXX: why not loop ? */
1427                 }
1428                 bremfree(bp);
1429                 bp->b_flags |= (B_INVAL | B_RELBUF);
1430                 bp->b_flags &= ~B_ASYNC;
1431                 brelse(bp);
1432                 BO_LOCK(bo);
1433                 if (nbp != NULL &&
1434                     (nbp->b_bufobj != bo ||
1435                      nbp->b_lblkno != lblkno ||
1436                      (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1437                         break;                  /* nbp invalid */
1438         }
1439         return (retval);
1440 }
1441
1442 /*
1443  * Truncate a file's buffer and pages to a specified length.  This
1444  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1445  * sync activity.
1446  */
1447 int
1448 vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
1449 {
1450         struct buf *bp, *nbp;
1451         int anyfreed;
1452         int trunclbn;
1453         struct bufobj *bo;
1454
1455         CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
1456             vp, cred, blksize, (uintmax_t)length);
1457
1458         /*
1459          * Round up to the *next* lbn.
1460          */
1461         trunclbn = (length + blksize - 1) / blksize;
1462
1463         ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1464 restart:
1465         bo = &vp->v_bufobj;
1466         BO_LOCK(bo);
1467         anyfreed = 1;
1468         for (;anyfreed;) {
1469                 anyfreed = 0;
1470                 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1471                         if (bp->b_lblkno < trunclbn)
1472                                 continue;
1473                         if (BUF_LOCK(bp,
1474                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1475                             BO_LOCKPTR(bo)) == ENOLCK)
1476                                 goto restart;
1477
1478                         bremfree(bp);
1479                         bp->b_flags |= (B_INVAL | B_RELBUF);
1480                         bp->b_flags &= ~B_ASYNC;
1481                         brelse(bp);
1482                         anyfreed = 1;
1483
1484                         BO_LOCK(bo);
1485                         if (nbp != NULL &&
1486                             (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1487                             (nbp->b_vp != vp) ||
1488                             (nbp->b_flags & B_DELWRI))) {
1489                                 BO_UNLOCK(bo);
1490                                 goto restart;
1491                         }
1492                 }
1493
1494                 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1495                         if (bp->b_lblkno < trunclbn)
1496                                 continue;
1497                         if (BUF_LOCK(bp,
1498                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1499                             BO_LOCKPTR(bo)) == ENOLCK)
1500                                 goto restart;
1501                         bremfree(bp);
1502                         bp->b_flags |= (B_INVAL | B_RELBUF);
1503                         bp->b_flags &= ~B_ASYNC;
1504                         brelse(bp);
1505                         anyfreed = 1;
1506
1507                         BO_LOCK(bo);
1508                         if (nbp != NULL &&
1509                             (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1510                             (nbp->b_vp != vp) ||
1511                             (nbp->b_flags & B_DELWRI) == 0)) {
1512                                 BO_UNLOCK(bo);
1513                                 goto restart;
1514                         }
1515                 }
1516         }
1517
1518         if (length > 0) {
1519 restartsync:
1520                 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1521                         if (bp->b_lblkno > 0)
1522                                 continue;
1523                         /*
1524                          * Since we hold the vnode lock this should only
1525                          * fail if we're racing with the buf daemon.
1526                          */
1527                         if (BUF_LOCK(bp,
1528                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1529                             BO_LOCKPTR(bo)) == ENOLCK) {
1530                                 goto restart;
1531                         }
1532                         VNASSERT((bp->b_flags & B_DELWRI), vp,
1533                             ("buf(%p) on dirty queue without DELWRI", bp));
1534
1535                         bremfree(bp);
1536                         bawrite(bp);
1537                         BO_LOCK(bo);
1538                         goto restartsync;
1539                 }
1540         }
1541
1542         bufobj_wwait(bo, 0, 0);
1543         BO_UNLOCK(bo);
1544         vnode_pager_setsize(vp, length);
1545
1546         return (0);
1547 }
1548
1549 static void
1550 buf_vlist_remove(struct buf *bp)
1551 {
1552         struct bufv *bv;
1553
1554         KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1555         ASSERT_BO_WLOCKED(bp->b_bufobj);
1556         KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1557             (BX_VNDIRTY|BX_VNCLEAN),
1558             ("buf_vlist_remove: Buf %p is on two lists", bp));
1559         if (bp->b_xflags & BX_VNDIRTY)
1560                 bv = &bp->b_bufobj->bo_dirty;
1561         else
1562                 bv = &bp->b_bufobj->bo_clean;
1563         BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
1564         TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1565         bv->bv_cnt--;
1566         bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1567 }
1568
1569 /*
1570  * Add the buffer to the sorted clean or dirty block list.
1571  *
1572  * NOTE: xflags is passed as a constant, optimizing this inline function!
1573  */
1574 static void
1575 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1576 {
1577         struct bufv *bv;
1578         struct buf *n;
1579         int error;
1580
1581         ASSERT_BO_WLOCKED(bo);
1582         KASSERT((bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo));
1583         KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1584             ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1585         bp->b_xflags |= xflags;
1586         if (xflags & BX_VNDIRTY)
1587                 bv = &bo->bo_dirty;
1588         else
1589                 bv = &bo->bo_clean;
1590
1591         /*
1592          * Keep the list ordered.  Optimize empty list insertion.  Assume
1593          * we tend to grow at the tail so lookup_le should usually be cheaper
1594          * than _ge.
1595          */
1596         if (bv->bv_cnt == 0 ||
1597             bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
1598                 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1599         else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
1600                 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
1601         else
1602                 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
1603         error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
1604         if (error)
1605                 panic("buf_vlist_add:  Preallocated nodes insufficient.");
1606         bv->bv_cnt++;
1607 }
1608
1609 /*
1610  * Lookup a buffer using the splay tree.  Note that we specifically avoid
1611  * shadow buffers used in background bitmap writes.
1612  *
1613  * This code isn't quite efficient as it could be because we are maintaining
1614  * two sorted lists and do not know which list the block resides in.
1615  *
1616  * During a "make buildworld" the desired buffer is found at one of
1617  * the roots more than 60% of the time.  Thus, checking both roots
1618  * before performing either splay eliminates unnecessary splays on the
1619  * first tree splayed.
1620  */
1621 struct buf *
1622 gbincore(struct bufobj *bo, daddr_t lblkno)
1623 {
1624         struct buf *bp;
1625
1626         ASSERT_BO_LOCKED(bo);
1627         bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
1628         if (bp != NULL)
1629                 return (bp);
1630         return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
1631 }
1632
1633 /*
1634  * Associate a buffer with a vnode.
1635  */
1636 void
1637 bgetvp(struct vnode *vp, struct buf *bp)
1638 {
1639         struct bufobj *bo;
1640
1641         bo = &vp->v_bufobj;
1642         ASSERT_BO_WLOCKED(bo);
1643         VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1644
1645         CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1646         VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1647             ("bgetvp: bp already attached! %p", bp));
1648
1649         vhold(vp);
1650         bp->b_vp = vp;
1651         bp->b_bufobj = bo;
1652         /*
1653          * Insert onto list for new vnode.
1654          */
1655         buf_vlist_add(bp, bo, BX_VNCLEAN);
1656 }
1657
1658 /*
1659  * Disassociate a buffer from a vnode.
1660  */
1661 void
1662 brelvp(struct buf *bp)
1663 {
1664         struct bufobj *bo;
1665         struct vnode *vp;
1666
1667         CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1668         KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1669
1670         /*
1671          * Delete from old vnode list, if on one.
1672          */
1673         vp = bp->b_vp;          /* XXX */
1674         bo = bp->b_bufobj;
1675         BO_LOCK(bo);
1676         if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1677                 buf_vlist_remove(bp);
1678         else
1679                 panic("brelvp: Buffer %p not on queue.", bp);
1680         if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1681                 bo->bo_flag &= ~BO_ONWORKLST;
1682                 mtx_lock(&sync_mtx);
1683                 LIST_REMOVE(bo, bo_synclist);
1684                 syncer_worklist_len--;
1685                 mtx_unlock(&sync_mtx);
1686         }
1687         bp->b_vp = NULL;
1688         bp->b_bufobj = NULL;
1689         BO_UNLOCK(bo);
1690         vdrop(vp);
1691 }
1692
1693 /*
1694  * Add an item to the syncer work queue.
1695  */
1696 static void
1697 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1698 {
1699         int slot;
1700
1701         ASSERT_BO_WLOCKED(bo);
1702
1703         mtx_lock(&sync_mtx);
1704         if (bo->bo_flag & BO_ONWORKLST)
1705                 LIST_REMOVE(bo, bo_synclist);
1706         else {
1707                 bo->bo_flag |= BO_ONWORKLST;
1708                 syncer_worklist_len++;
1709         }
1710
1711         if (delay > syncer_maxdelay - 2)
1712                 delay = syncer_maxdelay - 2;
1713         slot = (syncer_delayno + delay) & syncer_mask;
1714
1715         LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
1716         mtx_unlock(&sync_mtx);
1717 }
1718
1719 static int
1720 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1721 {
1722         int error, len;
1723
1724         mtx_lock(&sync_mtx);
1725         len = syncer_worklist_len - sync_vnode_count;
1726         mtx_unlock(&sync_mtx);
1727         error = SYSCTL_OUT(req, &len, sizeof(len));
1728         return (error);
1729 }
1730
1731 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1732     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1733
1734 static struct proc *updateproc;
1735 static void sched_sync(void);
1736 static struct kproc_desc up_kp = {
1737         "syncer",
1738         sched_sync,
1739         &updateproc
1740 };
1741 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1742
1743 static int
1744 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1745 {
1746         struct vnode *vp;
1747         struct mount *mp;
1748
1749         *bo = LIST_FIRST(slp);
1750         if (*bo == NULL)
1751                 return (0);
1752         vp = (*bo)->__bo_vnode; /* XXX */
1753         if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
1754                 return (1);
1755         /*
1756          * We use vhold in case the vnode does not
1757          * successfully sync.  vhold prevents the vnode from
1758          * going away when we unlock the sync_mtx so that
1759          * we can acquire the vnode interlock.
1760          */
1761         vholdl(vp);
1762         mtx_unlock(&sync_mtx);
1763         VI_UNLOCK(vp);
1764         if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1765                 vdrop(vp);
1766                 mtx_lock(&sync_mtx);
1767                 return (*bo == LIST_FIRST(slp));
1768         }
1769         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1770         (void) VOP_FSYNC(vp, MNT_LAZY, td);
1771         VOP_UNLOCK(vp, 0);
1772         vn_finished_write(mp);
1773         BO_LOCK(*bo);
1774         if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1775                 /*
1776                  * Put us back on the worklist.  The worklist
1777                  * routine will remove us from our current
1778                  * position and then add us back in at a later
1779                  * position.
1780                  */
1781                 vn_syncer_add_to_worklist(*bo, syncdelay);
1782         }
1783         BO_UNLOCK(*bo);
1784         vdrop(vp);
1785         mtx_lock(&sync_mtx);
1786         return (0);
1787 }
1788
1789 static int first_printf = 1;
1790
1791 /*
1792  * System filesystem synchronizer daemon.
1793  */
1794 static void
1795 sched_sync(void)
1796 {
1797         struct synclist *next, *slp;
1798         struct bufobj *bo;
1799         long starttime;
1800         struct thread *td = curthread;
1801         int last_work_seen;
1802         int net_worklist_len;
1803         int syncer_final_iter;
1804         int error;
1805
1806         last_work_seen = 0;
1807         syncer_final_iter = 0;
1808         syncer_state = SYNCER_RUNNING;
1809         starttime = time_uptime;
1810         td->td_pflags |= TDP_NORUNNINGBUF;
1811
1812         EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1813             SHUTDOWN_PRI_LAST);
1814
1815         mtx_lock(&sync_mtx);
1816         for (;;) {
1817                 if (syncer_state == SYNCER_FINAL_DELAY &&
1818                     syncer_final_iter == 0) {
1819                         mtx_unlock(&sync_mtx);
1820                         kproc_suspend_check(td->td_proc);
1821                         mtx_lock(&sync_mtx);
1822                 }
1823                 net_worklist_len = syncer_worklist_len - sync_vnode_count;
1824                 if (syncer_state != SYNCER_RUNNING &&
1825                     starttime != time_uptime) {
1826                         if (first_printf) {
1827                                 printf("\nSyncing disks, vnodes remaining...");
1828                                 first_printf = 0;
1829                         }
1830                         printf("%d ", net_worklist_len);
1831                 }
1832                 starttime = time_uptime;
1833
1834                 /*
1835                  * Push files whose dirty time has expired.  Be careful
1836                  * of interrupt race on slp queue.
1837                  *
1838                  * Skip over empty worklist slots when shutting down.
1839                  */
1840                 do {
1841                         slp = &syncer_workitem_pending[syncer_delayno];
1842                         syncer_delayno += 1;
1843                         if (syncer_delayno == syncer_maxdelay)
1844                                 syncer_delayno = 0;
1845                         next = &syncer_workitem_pending[syncer_delayno];
1846                         /*
1847                          * If the worklist has wrapped since the
1848                          * it was emptied of all but syncer vnodes,
1849                          * switch to the FINAL_DELAY state and run
1850                          * for one more second.
1851                          */
1852                         if (syncer_state == SYNCER_SHUTTING_DOWN &&
1853                             net_worklist_len == 0 &&
1854                             last_work_seen == syncer_delayno) {
1855                                 syncer_state = SYNCER_FINAL_DELAY;
1856                                 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1857                         }
1858                 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1859                     syncer_worklist_len > 0);
1860
1861                 /*
1862                  * Keep track of the last time there was anything
1863                  * on the worklist other than syncer vnodes.
1864                  * Return to the SHUTTING_DOWN state if any
1865                  * new work appears.
1866                  */
1867                 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1868                         last_work_seen = syncer_delayno;
1869                 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1870                         syncer_state = SYNCER_SHUTTING_DOWN;
1871                 while (!LIST_EMPTY(slp)) {
1872                         error = sync_vnode(slp, &bo, td);
1873                         if (error == 1) {
1874                                 LIST_REMOVE(bo, bo_synclist);
1875                                 LIST_INSERT_HEAD(next, bo, bo_synclist);
1876                                 continue;
1877                         }
1878
1879                         if (first_printf == 0) {
1880                                 /*
1881                                  * Drop the sync mutex, because some watchdog
1882                                  * drivers need to sleep while patting
1883                                  */
1884                                 mtx_unlock(&sync_mtx);
1885                                 wdog_kern_pat(WD_LASTVAL);
1886                                 mtx_lock(&sync_mtx);
1887                         }
1888
1889                 }
1890                 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1891                         syncer_final_iter--;
1892                 /*
1893                  * The variable rushjob allows the kernel to speed up the
1894                  * processing of the filesystem syncer process. A rushjob
1895                  * value of N tells the filesystem syncer to process the next
1896                  * N seconds worth of work on its queue ASAP. Currently rushjob
1897                  * is used by the soft update code to speed up the filesystem
1898                  * syncer process when the incore state is getting so far
1899                  * ahead of the disk that the kernel memory pool is being
1900                  * threatened with exhaustion.
1901                  */
1902                 if (rushjob > 0) {
1903                         rushjob -= 1;
1904                         continue;
1905                 }
1906                 /*
1907                  * Just sleep for a short period of time between
1908                  * iterations when shutting down to allow some I/O
1909                  * to happen.
1910                  *
1911                  * If it has taken us less than a second to process the
1912                  * current work, then wait. Otherwise start right over
1913                  * again. We can still lose time if any single round
1914                  * takes more than two seconds, but it does not really
1915                  * matter as we are just trying to generally pace the
1916                  * filesystem activity.
1917                  */
1918                 if (syncer_state != SYNCER_RUNNING ||
1919                     time_uptime == starttime) {
1920                         thread_lock(td);
1921                         sched_prio(td, PPAUSE);
1922                         thread_unlock(td);
1923                 }
1924                 if (syncer_state != SYNCER_RUNNING)
1925                         cv_timedwait(&sync_wakeup, &sync_mtx,
1926                             hz / SYNCER_SHUTDOWN_SPEEDUP);
1927                 else if (time_uptime == starttime)
1928                         cv_timedwait(&sync_wakeup, &sync_mtx, hz);
1929         }
1930 }
1931
1932 /*
1933  * Request the syncer daemon to speed up its work.
1934  * We never push it to speed up more than half of its
1935  * normal turn time, otherwise it could take over the cpu.
1936  */
1937 int
1938 speedup_syncer(void)
1939 {
1940         int ret = 0;
1941
1942         mtx_lock(&sync_mtx);
1943         if (rushjob < syncdelay / 2) {
1944                 rushjob += 1;
1945                 stat_rush_requests += 1;
1946                 ret = 1;
1947         }
1948         mtx_unlock(&sync_mtx);
1949         cv_broadcast(&sync_wakeup);
1950         return (ret);
1951 }
1952
1953 /*
1954  * Tell the syncer to speed up its work and run though its work
1955  * list several times, then tell it to shut down.
1956  */
1957 static void
1958 syncer_shutdown(void *arg, int howto)
1959 {
1960
1961         if (howto & RB_NOSYNC)
1962                 return;
1963         mtx_lock(&sync_mtx);
1964         syncer_state = SYNCER_SHUTTING_DOWN;
1965         rushjob = 0;
1966         mtx_unlock(&sync_mtx);
1967         cv_broadcast(&sync_wakeup);
1968         kproc_shutdown(arg, howto);
1969 }
1970
1971 void
1972 syncer_suspend(void)
1973 {
1974
1975         syncer_shutdown(updateproc, 0);
1976 }
1977
1978 void
1979 syncer_resume(void)
1980 {
1981
1982         mtx_lock(&sync_mtx);
1983         first_printf = 1;
1984         syncer_state = SYNCER_RUNNING;
1985         mtx_unlock(&sync_mtx);
1986         cv_broadcast(&sync_wakeup);
1987         kproc_resume(updateproc);
1988 }
1989
1990 /*
1991  * Reassign a buffer from one vnode to another.
1992  * Used to assign file specific control information
1993  * (indirect blocks) to the vnode to which they belong.
1994  */
1995 void
1996 reassignbuf(struct buf *bp)
1997 {
1998         struct vnode *vp;
1999         struct bufobj *bo;
2000         int delay;
2001 #ifdef INVARIANTS
2002         struct bufv *bv;
2003 #endif
2004
2005         vp = bp->b_vp;
2006         bo = bp->b_bufobj;
2007         ++reassignbufcalls;
2008
2009         CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
2010             bp, bp->b_vp, bp->b_flags);
2011         /*
2012          * B_PAGING flagged buffers cannot be reassigned because their vp
2013          * is not fully linked in.
2014          */
2015         if (bp->b_flags & B_PAGING)
2016                 panic("cannot reassign paging buffer");
2017
2018         /*
2019          * Delete from old vnode list, if on one.
2020          */
2021         BO_LOCK(bo);
2022         if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
2023                 buf_vlist_remove(bp);
2024         else
2025                 panic("reassignbuf: Buffer %p not on queue.", bp);
2026         /*
2027          * If dirty, put on list of dirty buffers; otherwise insert onto list
2028          * of clean buffers.
2029          */
2030         if (bp->b_flags & B_DELWRI) {
2031                 if ((bo->bo_flag & BO_ONWORKLST) == 0) {
2032                         switch (vp->v_type) {
2033                         case VDIR:
2034                                 delay = dirdelay;
2035                                 break;
2036                         case VCHR:
2037                                 delay = metadelay;
2038                                 break;
2039                         default:
2040                                 delay = filedelay;
2041                         }
2042                         vn_syncer_add_to_worklist(bo, delay);
2043                 }
2044                 buf_vlist_add(bp, bo, BX_VNDIRTY);
2045         } else {
2046                 buf_vlist_add(bp, bo, BX_VNCLEAN);
2047
2048                 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2049                         mtx_lock(&sync_mtx);
2050                         LIST_REMOVE(bo, bo_synclist);
2051                         syncer_worklist_len--;
2052                         mtx_unlock(&sync_mtx);
2053                         bo->bo_flag &= ~BO_ONWORKLST;
2054                 }
2055         }
2056 #ifdef INVARIANTS
2057         bv = &bo->bo_clean;
2058         bp = TAILQ_FIRST(&bv->bv_hd);
2059         KASSERT(bp == NULL || bp->b_bufobj == bo,
2060             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2061         bp = TAILQ_LAST(&bv->bv_hd, buflists);
2062         KASSERT(bp == NULL || bp->b_bufobj == bo,
2063             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2064         bv = &bo->bo_dirty;
2065         bp = TAILQ_FIRST(&bv->bv_hd);
2066         KASSERT(bp == NULL || bp->b_bufobj == bo,
2067             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2068         bp = TAILQ_LAST(&bv->bv_hd, buflists);
2069         KASSERT(bp == NULL || bp->b_bufobj == bo,
2070             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2071 #endif
2072         BO_UNLOCK(bo);
2073 }
2074
2075 /*
2076  * Increment the use and hold counts on the vnode, taking care to reference
2077  * the driver's usecount if this is a chardev.  The vholdl() will remove
2078  * the vnode from the free list if it is presently free.  Requires the
2079  * vnode interlock and returns with it held.
2080  */
2081 static void
2082 v_incr_usecount(struct vnode *vp)
2083 {
2084
2085         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2086         vholdl(vp);
2087         vp->v_usecount++;
2088         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2089                 dev_lock();
2090                 vp->v_rdev->si_usecount++;
2091                 dev_unlock();
2092         }
2093 }
2094
2095 /*
2096  * Turn a holdcnt into a use+holdcnt such that only one call to
2097  * v_decr_usecount is needed.
2098  */
2099 static void
2100 v_upgrade_usecount(struct vnode *vp)
2101 {
2102
2103         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2104         vp->v_usecount++;
2105         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2106                 dev_lock();
2107                 vp->v_rdev->si_usecount++;
2108                 dev_unlock();
2109         }
2110 }
2111
2112 /*
2113  * Decrement the vnode use and hold count along with the driver's usecount
2114  * if this is a chardev.  The vdropl() below releases the vnode interlock
2115  * as it may free the vnode.
2116  */
2117 static void
2118 v_decr_usecount(struct vnode *vp)
2119 {
2120
2121         ASSERT_VI_LOCKED(vp, __FUNCTION__);
2122         VNASSERT(vp->v_usecount > 0, vp,
2123             ("v_decr_usecount: negative usecount"));
2124         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2125         vp->v_usecount--;
2126         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2127                 dev_lock();
2128                 vp->v_rdev->si_usecount--;
2129                 dev_unlock();
2130         }
2131         vdropl(vp);
2132 }
2133
2134 /*
2135  * Decrement only the use count and driver use count.  This is intended to
2136  * be paired with a follow on vdropl() to release the remaining hold count.
2137  * In this way we may vgone() a vnode with a 0 usecount without risk of
2138  * having it end up on a free list because the hold count is kept above 0.
2139  */
2140 static void
2141 v_decr_useonly(struct vnode *vp)
2142 {
2143
2144         ASSERT_VI_LOCKED(vp, __FUNCTION__);
2145         VNASSERT(vp->v_usecount > 0, vp,
2146             ("v_decr_useonly: negative usecount"));
2147         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2148         vp->v_usecount--;
2149         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2150                 dev_lock();
2151                 vp->v_rdev->si_usecount--;
2152                 dev_unlock();
2153         }
2154 }
2155
2156 /*
2157  * Grab a particular vnode from the free list, increment its
2158  * reference count and lock it.  VI_DOOMED is set if the vnode
2159  * is being destroyed.  Only callers who specify LK_RETRY will
2160  * see doomed vnodes.  If inactive processing was delayed in
2161  * vput try to do it here.
2162  */
2163 int
2164 vget(struct vnode *vp, int flags, struct thread *td)
2165 {
2166         int error;
2167
2168         error = 0;
2169         VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2170             ("vget: invalid lock operation"));
2171         CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2172
2173         if ((flags & LK_INTERLOCK) == 0)
2174                 VI_LOCK(vp);
2175         vholdl(vp);
2176         if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
2177                 vdrop(vp);
2178                 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2179                     vp);
2180                 return (error);
2181         }
2182         if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2183                 panic("vget: vn_lock failed to return ENOENT\n");
2184         VI_LOCK(vp);
2185         /* Upgrade our holdcnt to a usecount. */
2186         v_upgrade_usecount(vp);
2187         /*
2188          * We don't guarantee that any particular close will
2189          * trigger inactive processing so just make a best effort
2190          * here at preventing a reference to a removed file.  If
2191          * we don't succeed no harm is done.
2192          */
2193         if (vp->v_iflag & VI_OWEINACT) {
2194                 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2195                     (flags & LK_NOWAIT) == 0)
2196                         vinactive(vp, td);
2197                 vp->v_iflag &= ~VI_OWEINACT;
2198         }
2199         VI_UNLOCK(vp);
2200         return (0);
2201 }
2202
2203 /*
2204  * Increase the reference count of a vnode.
2205  */
2206 void
2207 vref(struct vnode *vp)
2208 {
2209
2210         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2211         VI_LOCK(vp);
2212         v_incr_usecount(vp);
2213         VI_UNLOCK(vp);
2214 }
2215
2216 /*
2217  * Return reference count of a vnode.
2218  *
2219  * The results of this call are only guaranteed when some mechanism other
2220  * than the VI lock is used to stop other processes from gaining references
2221  * to the vnode.  This may be the case if the caller holds the only reference.
2222  * This is also useful when stale data is acceptable as race conditions may
2223  * be accounted for by some other means.
2224  */
2225 int
2226 vrefcnt(struct vnode *vp)
2227 {
2228         int usecnt;
2229
2230         VI_LOCK(vp);
2231         usecnt = vp->v_usecount;
2232         VI_UNLOCK(vp);
2233
2234         return (usecnt);
2235 }
2236
2237 #define VPUTX_VRELE     1
2238 #define VPUTX_VPUT      2
2239 #define VPUTX_VUNREF    3
2240
2241 static void
2242 vputx(struct vnode *vp, int func)
2243 {
2244         int error;
2245
2246         KASSERT(vp != NULL, ("vputx: null vp"));
2247         if (func == VPUTX_VUNREF)
2248                 ASSERT_VOP_LOCKED(vp, "vunref");
2249         else if (func == VPUTX_VPUT)
2250                 ASSERT_VOP_LOCKED(vp, "vput");
2251         else
2252                 KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
2253         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2254         VI_LOCK(vp);
2255
2256         /* Skip this v_writecount check if we're going to panic below. */
2257         VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2258             ("vputx: missed vn_close"));
2259         error = 0;
2260
2261         if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2262             vp->v_usecount == 1)) {
2263                 if (func == VPUTX_VPUT)
2264                         VOP_UNLOCK(vp, 0);
2265                 v_decr_usecount(vp);
2266                 return;
2267         }
2268
2269         if (vp->v_usecount != 1) {
2270                 vprint("vputx: negative ref count", vp);
2271                 panic("vputx: negative ref cnt");
2272         }
2273         CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
2274         /*
2275          * We want to hold the vnode until the inactive finishes to
2276          * prevent vgone() races.  We drop the use count here and the
2277          * hold count below when we're done.
2278          */
2279         v_decr_useonly(vp);
2280         /*
2281          * We must call VOP_INACTIVE with the node locked. Mark
2282          * as VI_DOINGINACT to avoid recursion.
2283          */
2284         vp->v_iflag |= VI_OWEINACT;
2285         switch (func) {
2286         case VPUTX_VRELE:
2287                 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2288                 VI_LOCK(vp);
2289                 break;
2290         case VPUTX_VPUT:
2291                 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2292                         error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
2293                             LK_NOWAIT);
2294                         VI_LOCK(vp);
2295                 }
2296                 break;
2297         case VPUTX_VUNREF:
2298                 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2299                         error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
2300                         VI_LOCK(vp);
2301                 }
2302                 break;
2303         }
2304         if (vp->v_usecount > 0)
2305                 vp->v_iflag &= ~VI_OWEINACT;
2306         if (error == 0) {
2307                 if (vp->v_iflag & VI_OWEINACT)
2308                         vinactive(vp, curthread);
2309                 if (func != VPUTX_VUNREF)
2310                         VOP_UNLOCK(vp, 0);
2311         }
2312         vdropl(vp);
2313 }
2314
2315 /*
2316  * Vnode put/release.
2317  * If count drops to zero, call inactive routine and return to freelist.
2318  */
2319 void
2320 vrele(struct vnode *vp)
2321 {
2322
2323         vputx(vp, VPUTX_VRELE);
2324 }
2325
2326 /*
2327  * Release an already locked vnode.  This give the same effects as
2328  * unlock+vrele(), but takes less time and avoids releasing and
2329  * re-aquiring the lock (as vrele() acquires the lock internally.)
2330  */
2331 void
2332 vput(struct vnode *vp)
2333 {
2334
2335         vputx(vp, VPUTX_VPUT);
2336 }
2337
2338 /*
2339  * Release an exclusively locked vnode. Do not unlock the vnode lock.
2340  */
2341 void
2342 vunref(struct vnode *vp)
2343 {
2344
2345         vputx(vp, VPUTX_VUNREF);
2346 }
2347
2348 /*
2349  * Somebody doesn't want the vnode recycled.
2350  */
2351 void
2352 vhold(struct vnode *vp)
2353 {
2354
2355         VI_LOCK(vp);
2356         vholdl(vp);
2357         VI_UNLOCK(vp);
2358 }
2359
2360 /*
2361  * Increase the hold count and activate if this is the first reference.
2362  */
2363 void
2364 vholdl(struct vnode *vp)
2365 {
2366         struct mount *mp;
2367
2368         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2369 #ifdef INVARIANTS
2370         /* getnewvnode() calls v_incr_usecount() without holding interlock. */
2371         if (vp->v_type != VNON || vp->v_data != NULL) {
2372                 ASSERT_VI_LOCKED(vp, "vholdl");
2373                 VNASSERT(vp->v_holdcnt > 0 || (vp->v_iflag & VI_FREE) != 0,
2374                     vp, ("vholdl: free vnode is held"));
2375         }
2376 #endif
2377         vp->v_holdcnt++;
2378         if ((vp->v_iflag & VI_FREE) == 0)
2379                 return;
2380         VNASSERT(vp->v_holdcnt == 1, vp, ("vholdl: wrong hold count"));
2381         VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed."));
2382         /*
2383          * Remove a vnode from the free list, mark it as in use,
2384          * and put it on the active list.
2385          */
2386         mtx_lock(&vnode_free_list_mtx);
2387         TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
2388         freevnodes--;
2389         vp->v_iflag &= ~(VI_FREE|VI_AGE);
2390         KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
2391             ("Activating already active vnode"));
2392         vp->v_iflag |= VI_ACTIVE;
2393         mp = vp->v_mount;
2394         TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
2395         mp->mnt_activevnodelistsize++;
2396         mtx_unlock(&vnode_free_list_mtx);
2397 }
2398
2399 /*
2400  * Note that there is one less who cares about this vnode.
2401  * vdrop() is the opposite of vhold().
2402  */
2403 void
2404 vdrop(struct vnode *vp)
2405 {
2406
2407         VI_LOCK(vp);
2408         vdropl(vp);
2409 }
2410
2411 /*
2412  * Drop the hold count of the vnode.  If this is the last reference to
2413  * the vnode we place it on the free list unless it has been vgone'd
2414  * (marked VI_DOOMED) in which case we will free it.
2415  */
2416 void
2417 vdropl(struct vnode *vp)
2418 {
2419         struct bufobj *bo;
2420         struct mount *mp;
2421         int active;
2422
2423         ASSERT_VI_LOCKED(vp, "vdropl");
2424         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2425         if (vp->v_holdcnt <= 0)
2426                 panic("vdrop: holdcnt %d", vp->v_holdcnt);
2427         vp->v_holdcnt--;
2428         VNASSERT(vp->v_holdcnt >= vp->v_usecount, vp,
2429             ("hold count less than use count"));
2430         if (vp->v_holdcnt > 0) {
2431                 VI_UNLOCK(vp);
2432                 return;
2433         }
2434         if ((vp->v_iflag & VI_DOOMED) == 0) {
2435                 /*
2436                  * Mark a vnode as free: remove it from its active list
2437                  * and put it up for recycling on the freelist.
2438                  */
2439                 VNASSERT(vp->v_op != NULL, vp,
2440                     ("vdropl: vnode already reclaimed."));
2441                 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2442                     ("vnode already free"));
2443                 VNASSERT(vp->v_holdcnt == 0, vp,
2444                     ("vdropl: freeing when we shouldn't"));
2445                 active = vp->v_iflag & VI_ACTIVE;
2446                 vp->v_iflag &= ~VI_ACTIVE;
2447                 mp = vp->v_mount;
2448                 mtx_lock(&vnode_free_list_mtx);
2449                 if (active) {
2450                         TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
2451                             v_actfreelist);
2452                         mp->mnt_activevnodelistsize--;
2453                 }
2454                 if (vp->v_iflag & VI_AGE) {
2455                         TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_actfreelist);
2456                 } else {
2457                         TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
2458                 }
2459                 freevnodes++;
2460                 vp->v_iflag &= ~VI_AGE;
2461                 vp->v_iflag |= VI_FREE;
2462                 mtx_unlock(&vnode_free_list_mtx);
2463                 VI_UNLOCK(vp);
2464                 return;
2465         }
2466         /*
2467          * The vnode has been marked for destruction, so free it.
2468          */
2469         CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
2470         atomic_subtract_long(&numvnodes, 1);
2471         bo = &vp->v_bufobj;
2472         VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2473             ("cleaned vnode still on the free list."));
2474         VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
2475         VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
2476         VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
2477         VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
2478         VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
2479         VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
2480         VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
2481             ("clean blk trie not empty"));
2482         VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
2483         VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
2484             ("dirty blk trie not empty"));
2485         VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
2486         VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
2487         VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
2488         VI_UNLOCK(vp);
2489 #ifdef MAC
2490         mac_vnode_destroy(vp);
2491 #endif
2492         if (vp->v_pollinfo != NULL)
2493                 destroy_vpollinfo(vp->v_pollinfo);
2494 #ifdef INVARIANTS
2495         /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
2496         vp->v_op = NULL;
2497 #endif
2498         rangelock_destroy(&vp->v_rl);
2499         lockdestroy(vp->v_vnlock);
2500         mtx_destroy(&vp->v_interlock);
2501         rw_destroy(BO_LOCKPTR(bo));
2502         uma_zfree(vnode_zone, vp);
2503 }
2504
2505 /*
2506  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2507  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2508  * OWEINACT tracks whether a vnode missed a call to inactive due to a
2509  * failed lock upgrade.
2510  */
2511 void
2512 vinactive(struct vnode *vp, struct thread *td)
2513 {
2514         struct vm_object *obj;
2515
2516         ASSERT_VOP_ELOCKED(vp, "vinactive");
2517         ASSERT_VI_LOCKED(vp, "vinactive");
2518         VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2519             ("vinactive: recursed on VI_DOINGINACT"));
2520         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2521         vp->v_iflag |= VI_DOINGINACT;
2522         vp->v_iflag &= ~VI_OWEINACT;
2523         VI_UNLOCK(vp);
2524         /*
2525          * Before moving off the active list, we must be sure that any
2526          * modified pages are on the vnode's dirty list since these will
2527          * no longer be checked once the vnode is on the inactive list.
2528          * Because the vnode vm object keeps a hold reference on the vnode
2529          * if there is at least one resident non-cached page, the vnode
2530          * cannot leave the active list without the page cleanup done.
2531          */
2532         obj = vp->v_object;
2533         if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
2534                 VM_OBJECT_WLOCK(obj);
2535                 vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
2536                 VM_OBJECT_WUNLOCK(obj);
2537         }
2538         VOP_INACTIVE(vp, td);
2539         VI_LOCK(vp);
2540         VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2541             ("vinactive: lost VI_DOINGINACT"));
2542         vp->v_iflag &= ~VI_DOINGINACT;
2543 }
2544
2545 /*
2546  * Remove any vnodes in the vnode table belonging to mount point mp.
2547  *
2548  * If FORCECLOSE is not specified, there should not be any active ones,
2549  * return error if any are found (nb: this is a user error, not a
2550  * system error). If FORCECLOSE is specified, detach any active vnodes
2551  * that are found.
2552  *
2553  * If WRITECLOSE is set, only flush out regular file vnodes open for
2554  * writing.
2555  *
2556  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2557  *
2558  * `rootrefs' specifies the base reference count for the root vnode
2559  * of this filesystem. The root vnode is considered busy if its
2560  * v_usecount exceeds this value. On a successful return, vflush(, td)
2561  * will call vrele() on the root vnode exactly rootrefs times.
2562  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2563  * be zero.
2564  */
2565 #ifdef DIAGNOSTIC
2566 static int busyprt = 0;         /* print out busy vnodes */
2567 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
2568 #endif
2569
2570 int
2571 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
2572 {
2573         struct vnode *vp, *mvp, *rootvp = NULL;
2574         struct vattr vattr;
2575         int busy = 0, error;
2576
2577         CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
2578             rootrefs, flags);
2579         if (rootrefs > 0) {
2580                 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2581                     ("vflush: bad args"));
2582                 /*
2583                  * Get the filesystem root vnode. We can vput() it
2584                  * immediately, since with rootrefs > 0, it won't go away.
2585                  */
2586                 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
2587                         CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
2588                             __func__, error);
2589                         return (error);
2590                 }
2591                 vput(rootvp);
2592         }
2593 loop:
2594         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2595                 vholdl(vp);
2596                 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2597                 if (error) {
2598                         vdrop(vp);
2599                         MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2600                         goto loop;
2601                 }
2602                 /*
2603                  * Skip over a vnodes marked VV_SYSTEM.
2604                  */
2605                 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2606                         VOP_UNLOCK(vp, 0);
2607                         vdrop(vp);
2608                         continue;
2609                 }
2610                 /*
2611                  * If WRITECLOSE is set, flush out unlinked but still open
2612                  * files (even if open only for reading) and regular file
2613                  * vnodes open for writing.
2614                  */
2615                 if (flags & WRITECLOSE) {
2616                         if (vp->v_object != NULL) {
2617                                 VM_OBJECT_WLOCK(vp->v_object);
2618                                 vm_object_page_clean(vp->v_object, 0, 0, 0);
2619                                 VM_OBJECT_WUNLOCK(vp->v_object);
2620                         }
2621                         error = VOP_FSYNC(vp, MNT_WAIT, td);
2622                         if (error != 0) {
2623                                 VOP_UNLOCK(vp, 0);
2624                                 vdrop(vp);
2625                                 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2626                                 return (error);
2627                         }
2628                         error = VOP_GETATTR(vp, &vattr, td->td_ucred);
2629                         VI_LOCK(vp);
2630
2631                         if ((vp->v_type == VNON ||
2632                             (error == 0 && vattr.va_nlink > 0)) &&
2633                             (vp->v_writecount == 0 || vp->v_type != VREG)) {
2634                                 VOP_UNLOCK(vp, 0);
2635                                 vdropl(vp);
2636                                 continue;
2637                         }
2638                 } else
2639                         VI_LOCK(vp);
2640                 /*
2641                  * With v_usecount == 0, all we need to do is clear out the
2642                  * vnode data structures and we are done.
2643                  *
2644                  * If FORCECLOSE is set, forcibly close the vnode.
2645                  */
2646                 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2647                         VNASSERT(vp->v_usecount == 0 ||
2648                             vp->v_op != &devfs_specops ||
2649                             (vp->v_type != VCHR && vp->v_type != VBLK), vp,
2650                             ("device VNODE %p is FORCECLOSED", vp));
2651                         vgonel(vp);
2652                 } else {
2653                         busy++;
2654 #ifdef DIAGNOSTIC
2655                         if (busyprt)
2656                                 vprint("vflush: busy vnode", vp);
2657 #endif
2658                 }
2659                 VOP_UNLOCK(vp, 0);
2660                 vdropl(vp);
2661         }
2662         if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2663                 /*
2664                  * If just the root vnode is busy, and if its refcount
2665                  * is equal to `rootrefs', then go ahead and kill it.
2666                  */
2667                 VI_LOCK(rootvp);
2668                 KASSERT(busy > 0, ("vflush: not busy"));
2669                 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2670                     ("vflush: usecount %d < rootrefs %d",
2671                      rootvp->v_usecount, rootrefs));
2672                 if (busy == 1 && rootvp->v_usecount == rootrefs) {
2673                         VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2674                         vgone(rootvp);
2675                         VOP_UNLOCK(rootvp, 0);
2676                         busy = 0;
2677                 } else
2678                         VI_UNLOCK(rootvp);
2679         }
2680         if (busy) {
2681                 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
2682                     busy);
2683                 return (EBUSY);
2684         }
2685         for (; rootrefs > 0; rootrefs--)
2686                 vrele(rootvp);
2687         return (0);
2688 }
2689
2690 /*
2691  * Recycle an unused vnode to the front of the free list.
2692  */
2693 int
2694 vrecycle(struct vnode *vp)
2695 {
2696         int recycled;
2697
2698         ASSERT_VOP_ELOCKED(vp, "vrecycle");
2699         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2700         recycled = 0;
2701         VI_LOCK(vp);
2702         if (vp->v_usecount == 0) {
2703                 recycled = 1;
2704                 vgonel(vp);
2705         }
2706         VI_UNLOCK(vp);
2707         return (recycled);
2708 }
2709
2710 /*
2711  * Eliminate all activity associated with a vnode
2712  * in preparation for reuse.
2713  */
2714 void
2715 vgone(struct vnode *vp)
2716 {
2717         VI_LOCK(vp);
2718         vgonel(vp);
2719         VI_UNLOCK(vp);
2720 }
2721
2722 static void
2723 notify_lowervp_vfs_dummy(struct mount *mp __unused,
2724     struct vnode *lowervp __unused)
2725 {
2726 }
2727
2728 /*
2729  * Notify upper mounts about reclaimed or unlinked vnode.
2730  */
2731 void
2732 vfs_notify_upper(struct vnode *vp, int event)
2733 {
2734         static struct vfsops vgonel_vfsops = {
2735                 .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
2736                 .vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
2737         };
2738         struct mount *mp, *ump, *mmp;
2739
2740         mp = vp->v_mount;
2741         if (mp == NULL)
2742                 return;
2743
2744         MNT_ILOCK(mp);
2745         if (TAILQ_EMPTY(&mp->mnt_uppers))
2746                 goto unlock;
2747         MNT_IUNLOCK(mp);
2748         mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
2749         mmp->mnt_op = &vgonel_vfsops;
2750         mmp->mnt_kern_flag |= MNTK_MARKER;
2751         MNT_ILOCK(mp);
2752         mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
2753         for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
2754                 if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
2755                         ump = TAILQ_NEXT(ump, mnt_upper_link);
2756                         continue;
2757                 }
2758                 TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
2759                 MNT_IUNLOCK(mp);
2760                 switch (event) {
2761                 case VFS_NOTIFY_UPPER_RECLAIM:
2762                         VFS_RECLAIM_LOWERVP(ump, vp);
2763                         break;
2764                 case VFS_NOTIFY_UPPER_UNLINK:
2765                         VFS_UNLINK_LOWERVP(ump, vp);
2766                         break;
2767                 default:
2768                         KASSERT(0, ("invalid event %d", event));
2769                         break;
2770                 }
2771                 MNT_ILOCK(mp);
2772                 ump = TAILQ_NEXT(mmp, mnt_upper_link);
2773                 TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
2774         }
2775         free(mmp, M_TEMP);
2776         mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
2777         if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
2778                 mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
2779                 wakeup(&mp->mnt_uppers);
2780         }
2781 unlock:
2782         MNT_IUNLOCK(mp);
2783 }
2784
2785 /*
2786  * vgone, with the vp interlock held.
2787  */
2788 void
2789 vgonel(struct vnode *vp)
2790 {
2791         struct thread *td;
2792         int oweinact;
2793         int active;
2794         struct mount *mp;
2795
2796         ASSERT_VOP_ELOCKED(vp, "vgonel");
2797         ASSERT_VI_LOCKED(vp, "vgonel");
2798         VNASSERT(vp->v_holdcnt, vp,
2799             ("vgonel: vp %p has no reference.", vp));
2800         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2801         td = curthread;
2802
2803         /*
2804          * Don't vgonel if we're already doomed.
2805          */
2806         if (vp->v_iflag & VI_DOOMED)
2807                 return;
2808         vp->v_iflag |= VI_DOOMED;
2809
2810         /*
2811          * Check to see if the vnode is in use.  If so, we have to call
2812          * VOP_CLOSE() and VOP_INACTIVE().
2813          */
2814         active = vp->v_usecount;
2815         oweinact = (vp->v_iflag & VI_OWEINACT);
2816         VI_UNLOCK(vp);
2817         vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
2818
2819         /*
2820          * If purging an active vnode, it must be closed and
2821          * deactivated before being reclaimed.
2822          */
2823         if (active)
2824                 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2825         if (oweinact || active) {
2826                 VI_LOCK(vp);
2827                 if ((vp->v_iflag & VI_DOINGINACT) == 0)
2828                         vinactive(vp, td);
2829                 VI_UNLOCK(vp);
2830         }
2831         if (vp->v_type == VSOCK)
2832                 vfs_unp_reclaim(vp);
2833
2834         /*
2835          * Clean out any buffers associated with the vnode.
2836          * If the flush fails, just toss the buffers.
2837          */
2838         mp = NULL;
2839         if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2840                 (void) vn_start_secondary_write(vp, &mp, V_WAIT);
2841         if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
2842                 while (vinvalbuf(vp, 0, 0, 0) != 0)
2843                         ;
2844         }
2845 #ifdef INVARIANTS
2846         BO_LOCK(&vp->v_bufobj);
2847         KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
2848             vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
2849             TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
2850             vp->v_bufobj.bo_clean.bv_cnt == 0,
2851             ("vp %p bufobj not invalidated", vp));
2852         vp->v_bufobj.bo_flag |= BO_DEAD;
2853         BO_UNLOCK(&vp->v_bufobj);
2854 #endif
2855
2856         /*
2857          * Reclaim the vnode.
2858          */
2859         if (VOP_RECLAIM(vp, td))
2860                 panic("vgone: cannot reclaim");
2861         if (mp != NULL)
2862                 vn_finished_secondary_write(mp);
2863         VNASSERT(vp->v_object == NULL, vp,
2864             ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2865         /*
2866          * Clear the advisory locks and wake up waiting threads.
2867          */
2868         (void)VOP_ADVLOCKPURGE(vp);
2869         /*
2870          * Delete from old mount point vnode list.
2871          */
2872         delmntque(vp);
2873         cache_purge(vp);
2874         /*
2875          * Done with purge, reset to the standard lock and invalidate
2876          * the vnode.
2877          */
2878         VI_LOCK(vp);
2879         vp->v_vnlock = &vp->v_lock;
2880         vp->v_op = &dead_vnodeops;
2881         vp->v_tag = "none";
2882         vp->v_type = VBAD;
2883 }
2884
2885 /*
2886  * Calculate the total number of references to a special device.
2887  */
2888 int
2889 vcount(struct vnode *vp)
2890 {
2891         int count;
2892
2893         dev_lock();
2894         count = vp->v_rdev->si_usecount;
2895         dev_unlock();
2896         return (count);
2897 }
2898
2899 /*
2900  * Same as above, but using the struct cdev *as argument
2901  */
2902 int
2903 count_dev(struct cdev *dev)
2904 {
2905         int count;
2906
2907         dev_lock();
2908         count = dev->si_usecount;
2909         dev_unlock();
2910         return(count);
2911 }
2912
2913 /*
2914  * Print out a description of a vnode.
2915  */
2916 static char *typename[] =
2917 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
2918  "VMARKER"};
2919
2920 void
2921 vn_printf(struct vnode *vp, const char *fmt, ...)
2922 {
2923         va_list ap;
2924         char buf[256], buf2[16];
2925         u_long flags;
2926
2927         va_start(ap, fmt);
2928         vprintf(fmt, ap);
2929         va_end(ap);
2930         printf("%p: ", (void *)vp);
2931         printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2932         printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
2933             vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
2934         buf[0] = '\0';
2935         buf[1] = '\0';
2936         if (vp->v_vflag & VV_ROOT)
2937                 strlcat(buf, "|VV_ROOT", sizeof(buf));
2938         if (vp->v_vflag & VV_ISTTY)
2939                 strlcat(buf, "|VV_ISTTY", sizeof(buf));
2940         if (vp->v_vflag & VV_NOSYNC)
2941                 strlcat(buf, "|VV_NOSYNC", sizeof(buf));
2942         if (vp->v_vflag & VV_ETERNALDEV)
2943                 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
2944         if (vp->v_vflag & VV_CACHEDLABEL)
2945                 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
2946         if (vp->v_vflag & VV_TEXT)
2947                 strlcat(buf, "|VV_TEXT", sizeof(buf));
2948         if (vp->v_vflag & VV_COPYONWRITE)
2949                 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
2950         if (vp->v_vflag & VV_SYSTEM)
2951                 strlcat(buf, "|VV_SYSTEM", sizeof(buf));
2952         if (vp->v_vflag & VV_PROCDEP)
2953                 strlcat(buf, "|VV_PROCDEP", sizeof(buf));
2954         if (vp->v_vflag & VV_NOKNOTE)
2955                 strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
2956         if (vp->v_vflag & VV_DELETED)
2957                 strlcat(buf, "|VV_DELETED", sizeof(buf));
2958         if (vp->v_vflag & VV_MD)
2959                 strlcat(buf, "|VV_MD", sizeof(buf));
2960         if (vp->v_vflag & VV_FORCEINSMQ)
2961                 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
2962         flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
2963             VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
2964             VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
2965         if (flags != 0) {
2966                 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
2967                 strlcat(buf, buf2, sizeof(buf));
2968         }
2969         if (vp->v_iflag & VI_MOUNT)
2970                 strlcat(buf, "|VI_MOUNT", sizeof(buf));
2971         if (vp->v_iflag & VI_AGE)
2972                 strlcat(buf, "|VI_AGE", sizeof(buf));
2973         if (vp->v_iflag & VI_DOOMED)
2974                 strlcat(buf, "|VI_DOOMED", sizeof(buf));
2975         if (vp->v_iflag & VI_FREE)
2976                 strlcat(buf, "|VI_FREE", sizeof(buf));
2977         if (vp->v_iflag & VI_ACTIVE)
2978                 strlcat(buf, "|VI_ACTIVE", sizeof(buf));
2979         if (vp->v_iflag & VI_DOINGINACT)
2980                 strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
2981         if (vp->v_iflag & VI_OWEINACT)
2982                 strlcat(buf, "|VI_OWEINACT", sizeof(buf));
2983         flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
2984             VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
2985         if (flags != 0) {
2986                 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
2987                 strlcat(buf, buf2, sizeof(buf));
2988         }
2989         printf("    flags (%s)\n", buf + 1);
2990         if (mtx_owned(VI_MTX(vp)))
2991                 printf(" VI_LOCKed");
2992         if (vp->v_object != NULL)
2993                 printf("    v_object %p ref %d pages %d "
2994                     "cleanbuf %d dirtybuf %d\n",
2995                     vp->v_object, vp->v_object->ref_count,
2996                     vp->v_object->resident_page_count,
2997                     vp->v_bufobj.bo_dirty.bv_cnt,
2998                     vp->v_bufobj.bo_clean.bv_cnt);
2999         printf("    ");
3000         lockmgr_printinfo(vp->v_vnlock);
3001         if (vp->v_data != NULL)
3002                 VOP_PRINT(vp);
3003 }
3004
3005 #ifdef DDB
3006 /*
3007  * List all of the locked vnodes in the system.
3008  * Called when debugging the kernel.
3009  */
3010 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
3011 {
3012         struct mount *mp;
3013         struct vnode *vp;
3014
3015         /*
3016          * Note: because this is DDB, we can't obey the locking semantics
3017          * for these structures, which means we could catch an inconsistent
3018          * state and dereference a nasty pointer.  Not much to be done
3019          * about that.
3020          */
3021         db_printf("Locked vnodes\n");
3022         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3023                 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3024                         if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
3025                                 vprint("", vp);
3026                 }
3027         }
3028 }
3029
3030 /*
3031  * Show details about the given vnode.
3032  */
3033 DB_SHOW_COMMAND(vnode, db_show_vnode)
3034 {
3035         struct vnode *vp;
3036
3037         if (!have_addr)
3038                 return;
3039         vp = (struct vnode *)addr;
3040         vn_printf(vp, "vnode ");
3041 }
3042
3043 /*
3044  * Show details about the given mount point.
3045  */
3046 DB_SHOW_COMMAND(mount, db_show_mount)
3047 {
3048         struct mount *mp;
3049         struct vfsopt *opt;
3050         struct statfs *sp;
3051         struct vnode *vp;
3052         char buf[512];
3053         uint64_t mflags;
3054         u_int flags;
3055
3056         if (!have_addr) {
3057                 /* No address given, print short info about all mount points. */
3058                 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3059                         db_printf("%p %s on %s (%s)\n", mp,
3060                             mp->mnt_stat.f_mntfromname,
3061                             mp->mnt_stat.f_mntonname,
3062                             mp->mnt_stat.f_fstypename);
3063                         if (db_pager_quit)
3064                                 break;
3065                 }
3066                 db_printf("\nMore info: show mount <addr>\n");
3067                 return;
3068         }
3069
3070         mp = (struct mount *)addr;
3071         db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
3072             mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
3073
3074         buf[0] = '\0';
3075         mflags = mp->mnt_flag;
3076 #define MNT_FLAG(flag)  do {                                            \
3077         if (mflags & (flag)) {                                          \
3078                 if (buf[0] != '\0')                                     \
3079                         strlcat(buf, ", ", sizeof(buf));                \
3080                 strlcat(buf, (#flag) + 4, sizeof(buf));                 \
3081                 mflags &= ~(flag);                                      \
3082         }                                                               \
3083 } while (0)
3084         MNT_FLAG(MNT_RDONLY);
3085         MNT_FLAG(MNT_SYNCHRONOUS);
3086         MNT_FLAG(MNT_NOEXEC);
3087         MNT_FLAG(MNT_NOSUID);
3088         MNT_FLAG(MNT_NFS4ACLS);
3089         MNT_FLAG(MNT_UNION);
3090         MNT_FLAG(MNT_ASYNC);
3091         MNT_FLAG(MNT_SUIDDIR);
3092         MNT_FLAG(MNT_SOFTDEP);
3093         MNT_FLAG(MNT_NOSYMFOLLOW);
3094         MNT_FLAG(MNT_GJOURNAL);
3095         MNT_FLAG(MNT_MULTILABEL);
3096         MNT_FLAG(MNT_ACLS);
3097         MNT_FLAG(MNT_NOATIME);
3098         MNT_FLAG(MNT_NOCLUSTERR);
3099         MNT_FLAG(MNT_NOCLUSTERW);
3100         MNT_FLAG(MNT_SUJ);
3101         MNT_FLAG(MNT_EXRDONLY);
3102         MNT_FLAG(MNT_EXPORTED);
3103         MNT_FLAG(MNT_DEFEXPORTED);
3104         MNT_FLAG(MNT_EXPORTANON);
3105         MNT_FLAG(MNT_EXKERB);
3106         MNT_FLAG(MNT_EXPUBLIC);
3107         MNT_FLAG(MNT_LOCAL);
3108         MNT_FLAG(MNT_QUOTA);
3109         MNT_FLAG(MNT_ROOTFS);
3110         MNT_FLAG(MNT_USER);
3111         MNT_FLAG(MNT_IGNORE);
3112         MNT_FLAG(MNT_UPDATE);
3113         MNT_FLAG(MNT_DELEXPORT);
3114         MNT_FLAG(MNT_RELOAD);
3115         MNT_FLAG(MNT_FORCE);
3116         MNT_FLAG(MNT_SNAPSHOT);
3117         MNT_FLAG(MNT_BYFSID);
3118 #undef MNT_FLAG
3119         if (mflags != 0) {
3120                 if (buf[0] != '\0')
3121                         strlcat(buf, ", ", sizeof(buf));
3122                 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3123                     "0x%016jx", mflags);
3124         }
3125         db_printf("    mnt_flag = %s\n", buf);
3126
3127         buf[0] = '\0';
3128         flags = mp->mnt_kern_flag;
3129 #define MNT_KERN_FLAG(flag)     do {                                    \
3130         if (flags & (flag)) {                                           \
3131                 if (buf[0] != '\0')                                     \
3132                         strlcat(buf, ", ", sizeof(buf));                \
3133                 strlcat(buf, (#flag) + 5, sizeof(buf));                 \
3134                 flags &= ~(flag);                                       \
3135         }                                                               \
3136 } while (0)
3137         MNT_KERN_FLAG(MNTK_UNMOUNTF);
3138         MNT_KERN_FLAG(MNTK_ASYNC);
3139         MNT_KERN_FLAG(MNTK_SOFTDEP);
3140         MNT_KERN_FLAG(MNTK_NOINSMNTQ);
3141         MNT_KERN_FLAG(MNTK_DRAINING);
3142         MNT_KERN_FLAG(MNTK_REFEXPIRE);
3143         MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
3144         MNT_KERN_FLAG(MNTK_SHARED_WRITES);
3145         MNT_KERN_FLAG(MNTK_NO_IOPF);
3146         MNT_KERN_FLAG(MNTK_VGONE_UPPER);
3147         MNT_KERN_FLAG(MNTK_VGONE_WAITER);
3148         MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
3149         MNT_KERN_FLAG(MNTK_MARKER);
3150         MNT_KERN_FLAG(MNTK_USES_BCACHE);
3151         MNT_KERN_FLAG(MNTK_NOASYNC);
3152         MNT_KERN_FLAG(MNTK_UNMOUNT);
3153         MNT_KERN_FLAG(MNTK_MWAIT);
3154         MNT_KERN_FLAG(MNTK_SUSPEND);
3155         MNT_KERN_FLAG(MNTK_SUSPEND2);
3156         MNT_KERN_FLAG(MNTK_SUSPENDED);
3157         MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
3158         MNT_KERN_FLAG(MNTK_NOKNOTE);
3159 #undef MNT_KERN_FLAG
3160         if (flags != 0) {
3161                 if (buf[0] != '\0')
3162                         strlcat(buf, ", ", sizeof(buf));
3163                 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3164                     "0x%08x", flags);
3165         }
3166         db_printf("    mnt_kern_flag = %s\n", buf);
3167
3168         db_printf("    mnt_opt = ");
3169         opt = TAILQ_FIRST(mp->mnt_opt);
3170         if (opt != NULL) {
3171                 db_printf("%s", opt->name);
3172                 opt = TAILQ_NEXT(opt, link);
3173                 while (opt != NULL) {
3174                         db_printf(", %s", opt->name);
3175                         opt = TAILQ_NEXT(opt, link);
3176                 }
3177         }
3178         db_printf("\n");
3179
3180         sp = &mp->mnt_stat;
3181         db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
3182             "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
3183             "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
3184             "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
3185             (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
3186             (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
3187             (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
3188             (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
3189             (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
3190             (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
3191             (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
3192             (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
3193
3194         db_printf("    mnt_cred = { uid=%u ruid=%u",
3195             (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
3196         if (jailed(mp->mnt_cred))
3197                 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
3198         db_printf(" }\n");
3199         db_printf("    mnt_ref = %d\n", mp->mnt_ref);
3200         db_printf("    mnt_gen = %d\n", mp->mnt_gen);
3201         db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
3202         db_printf("    mnt_activevnodelistsize = %d\n",
3203             mp->mnt_activevnodelistsize);
3204         db_printf("    mnt_writeopcount = %d\n", mp->mnt_writeopcount);
3205         db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
3206         db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
3207         db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
3208         db_printf("    mnt_lockref = %d\n", mp->mnt_lockref);
3209         db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
3210         db_printf("    mnt_secondary_accwrites = %d\n",
3211             mp->mnt_secondary_accwrites);
3212         db_printf("    mnt_gjprovider = %s\n",
3213             mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
3214
3215         db_printf("\n\nList of active vnodes\n");
3216         TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
3217                 if (vp->v_type != VMARKER) {
3218                         vn_printf(vp, "vnode ");
3219                         if (db_pager_quit)
3220                                 break;
3221                 }
3222         }
3223         db_printf("\n\nList of inactive vnodes\n");
3224         TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3225                 if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
3226                         vn_printf(vp, "vnode ");
3227                         if (db_pager_quit)
3228                                 break;
3229                 }
3230         }
3231 }
3232 #endif  /* DDB */
3233
3234 /*
3235  * Fill in a struct xvfsconf based on a struct vfsconf.
3236  */
3237 static int
3238 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
3239 {
3240         struct xvfsconf xvfsp;
3241
3242         bzero(&xvfsp, sizeof(xvfsp));
3243         strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3244         xvfsp.vfc_typenum = vfsp->vfc_typenum;
3245         xvfsp.vfc_refcount = vfsp->vfc_refcount;
3246         xvfsp.vfc_flags = vfsp->vfc_flags;
3247         /*
3248          * These are unused in userland, we keep them
3249          * to not break binary compatibility.
3250          */
3251         xvfsp.vfc_vfsops = NULL;
3252         xvfsp.vfc_next = NULL;
3253         return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3254 }
3255
3256 #ifdef COMPAT_FREEBSD32
3257 struct xvfsconf32 {
3258         uint32_t        vfc_vfsops;
3259         char            vfc_name[MFSNAMELEN];
3260         int32_t         vfc_typenum;
3261         int32_t         vfc_refcount;
3262         int32_t         vfc_flags;
3263         uint32_t        vfc_next;
3264 };
3265
3266 static int
3267 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
3268 {
3269         struct xvfsconf32 xvfsp;
3270
3271         strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3272         xvfsp.vfc_typenum = vfsp->vfc_typenum;
3273         xvfsp.vfc_refcount = vfsp->vfc_refcount;
3274         xvfsp.vfc_flags = vfsp->vfc_flags;
3275         xvfsp.vfc_vfsops = 0;
3276         xvfsp.vfc_next = 0;
3277         return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3278 }
3279 #endif
3280
3281 /*
3282  * Top level filesystem related information gathering.
3283  */
3284 static int
3285 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
3286 {
3287         struct vfsconf *vfsp;
3288         int error;
3289
3290         error = 0;
3291         vfsconf_slock();
3292         TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3293 #ifdef COMPAT_FREEBSD32
3294                 if (req->flags & SCTL_MASK32)
3295                         error = vfsconf2x32(req, vfsp);
3296                 else
3297 #endif
3298                         error = vfsconf2x(req, vfsp);
3299                 if (error)
3300                         break;
3301         }
3302         vfsconf_sunlock();
3303         return (error);
3304 }
3305
3306 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
3307     CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
3308     "S,xvfsconf", "List of all configured filesystems");
3309
3310 #ifndef BURN_BRIDGES
3311 static int      sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
3312
3313 static int
3314 vfs_sysctl(SYSCTL_HANDLER_ARGS)
3315 {
3316         int *name = (int *)arg1 - 1;    /* XXX */
3317         u_int namelen = arg2 + 1;       /* XXX */
3318         struct vfsconf *vfsp;
3319
3320         log(LOG_WARNING, "userland calling deprecated sysctl, "
3321             "please rebuild world\n");
3322
3323 #if 1 || defined(COMPAT_PRELITE2)
3324         /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
3325         if (namelen == 1)
3326                 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
3327 #endif
3328
3329         switch (name[1]) {
3330         case VFS_MAXTYPENUM:
3331                 if (namelen != 2)
3332                         return (ENOTDIR);
3333                 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
3334         case VFS_CONF:
3335                 if (namelen != 3)
3336                         return (ENOTDIR);       /* overloaded */
3337                 vfsconf_slock();
3338                 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3339                         if (vfsp->vfc_typenum == name[2])
3340                                 break;
3341                 }
3342                 vfsconf_sunlock();
3343                 if (vfsp == NULL)
3344                         return (EOPNOTSUPP);
3345 #ifdef COMPAT_FREEBSD32
3346                 if (req->flags & SCTL_MASK32)
3347                         return (vfsconf2x32(req, vfsp));
3348                 else
3349 #endif
3350                         return (vfsconf2x(req, vfsp));
3351         }
3352         return (EOPNOTSUPP);
3353 }
3354
3355 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
3356     CTLFLAG_MPSAFE, vfs_sysctl,
3357     "Generic filesystem");
3358
3359 #if 1 || defined(COMPAT_PRELITE2)
3360
3361 static int
3362 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3363 {
3364         int error;
3365         struct vfsconf *vfsp;
3366         struct ovfsconf ovfs;
3367
3368         vfsconf_slock();
3369         TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3370                 bzero(&ovfs, sizeof(ovfs));
3371                 ovfs.vfc_vfsops = vfsp->vfc_vfsops;     /* XXX used as flag */
3372                 strcpy(ovfs.vfc_name, vfsp->vfc_name);
3373                 ovfs.vfc_index = vfsp->vfc_typenum;
3374                 ovfs.vfc_refcount = vfsp->vfc_refcount;
3375                 ovfs.vfc_flags = vfsp->vfc_flags;
3376                 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3377                 if (error != 0) {
3378                         vfsconf_sunlock();
3379                         return (error);
3380                 }
3381         }
3382         vfsconf_sunlock();
3383         return (0);
3384 }
3385
3386 #endif /* 1 || COMPAT_PRELITE2 */
3387 #endif /* !BURN_BRIDGES */
3388
3389 #define KINFO_VNODESLOP         10
3390 #ifdef notyet
3391 /*
3392  * Dump vnode list (via sysctl).
3393  */
3394 /* ARGSUSED */
3395 static int
3396 sysctl_vnode(SYSCTL_HANDLER_ARGS)
3397 {
3398         struct xvnode *xvn;
3399         struct mount *mp;
3400         struct vnode *vp;
3401         int error, len, n;
3402
3403         /*
3404          * Stale numvnodes access is not fatal here.
3405          */
3406         req->lock = 0;
3407         len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
3408         if (!req->oldptr)
3409                 /* Make an estimate */
3410                 return (SYSCTL_OUT(req, 0, len));
3411
3412         error = sysctl_wire_old_buffer(req, 0);
3413         if (error != 0)
3414                 return (error);
3415         xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
3416         n = 0;
3417         mtx_lock(&mountlist_mtx);
3418         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3419                 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
3420                         continue;
3421                 MNT_ILOCK(mp);
3422                 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3423                         if (n == len)
3424                                 break;
3425                         vref(vp);
3426                         xvn[n].xv_size = sizeof *xvn;
3427                         xvn[n].xv_vnode = vp;
3428                         xvn[n].xv_id = 0;       /* XXX compat */
3429 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3430                         XV_COPY(usecount);
3431                         XV_COPY(writecount);
3432                         XV_COPY(holdcnt);
3433                         XV_COPY(mount);
3434                         XV_COPY(numoutput);
3435                         XV_COPY(type);
3436 #undef XV_COPY
3437                         xvn[n].xv_flag = vp->v_vflag;
3438
3439                         switch (vp->v_type) {
3440                         case VREG:
3441                         case VDIR:
3442                         case VLNK:
3443                                 break;
3444                         case VBLK:
3445                         case VCHR:
3446                                 if (vp->v_rdev == NULL) {
3447                                         vrele(vp);
3448                                         continue;
3449                                 }
3450                                 xvn[n].xv_dev = dev2udev(vp->v_rdev);
3451                                 break;
3452                         case VSOCK:
3453                                 xvn[n].xv_socket = vp->v_socket;
3454                                 break;
3455                         case VFIFO:
3456                                 xvn[n].xv_fifo = vp->v_fifoinfo;
3457                                 break;
3458                         case VNON:
3459                         case VBAD:
3460                         default:
3461                                 /* shouldn't happen? */
3462                                 vrele(vp);
3463                                 continue;
3464                         }
3465                         vrele(vp);
3466                         ++n;
3467                 }
3468                 MNT_IUNLOCK(mp);
3469                 mtx_lock(&mountlist_mtx);
3470                 vfs_unbusy(mp);
3471                 if (n == len)
3472                         break;
3473         }
3474         mtx_unlock(&mountlist_mtx);
3475
3476         error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3477         free(xvn, M_TEMP);
3478         return (error);
3479 }
3480
3481 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
3482     CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
3483     "");
3484 #endif
3485
3486 /*
3487  * Unmount all filesystems. The list is traversed in reverse order
3488  * of mounting to avoid dependencies.
3489  */
3490 void
3491 vfs_unmountall(void)
3492 {
3493         struct mount *mp;
3494         struct thread *td;
3495         int error;
3496
3497         CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
3498         td = curthread;
3499
3500         /*
3501          * Since this only runs when rebooting, it is not interlocked.
3502          */
3503         while(!TAILQ_EMPTY(&mountlist)) {
3504                 mp = TAILQ_LAST(&mountlist, mntlist);
3505                 error = dounmount(mp, MNT_FORCE, td);
3506                 if (error) {
3507                         TAILQ_REMOVE(&mountlist, mp, mnt_list);
3508                         /*
3509                          * XXX: Due to the way in which we mount the root
3510                          * file system off of devfs, devfs will generate a
3511                          * "busy" warning when we try to unmount it before
3512                          * the root.  Don't print a warning as a result in
3513                          * order to avoid false positive errors that may
3514                          * cause needless upset.
3515                          */
3516                         if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
3517                                 printf("unmount of %s failed (",
3518                                     mp->mnt_stat.f_mntonname);
3519                                 if (error == EBUSY)
3520                                         printf("BUSY)\n");
3521                                 else
3522                                         printf("%d)\n", error);
3523                         }
3524                 } else {
3525                         /* The unmount has removed mp from the mountlist */
3526                 }
3527         }
3528 }
3529
3530 /*
3531  * perform msync on all vnodes under a mount point
3532  * the mount point must be locked.
3533  */
3534 void
3535 vfs_msync(struct mount *mp, int flags)
3536 {
3537         struct vnode *vp, *mvp;
3538         struct vm_object *obj;
3539
3540         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
3541         MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
3542                 obj = vp->v_object;
3543                 if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
3544                     (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3545                         if (!vget(vp,
3546                             LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3547                             curthread)) {
3548                                 if (vp->v_vflag & VV_NOSYNC) {  /* unlinked */
3549                                         vput(vp);
3550                                         continue;
3551                                 }
3552
3553                                 obj = vp->v_object;
3554                                 if (obj != NULL) {
3555                                         VM_OBJECT_WLOCK(obj);
3556                                         vm_object_page_clean(obj, 0, 0,
3557                                             flags == MNT_WAIT ?
3558                                             OBJPC_SYNC : OBJPC_NOSYNC);
3559                                         VM_OBJECT_WUNLOCK(obj);
3560                                 }
3561                                 vput(vp);
3562                         }
3563                 } else
3564                         VI_UNLOCK(vp);
3565         }
3566 }
3567
3568 static void
3569 destroy_vpollinfo_free(struct vpollinfo *vi)
3570 {
3571
3572         knlist_destroy(&vi->vpi_selinfo.si_note);
3573         mtx_destroy(&vi->vpi_lock);
3574         uma_zfree(vnodepoll_zone, vi);
3575 }
3576
3577 static void
3578 destroy_vpollinfo(struct vpollinfo *vi)
3579 {
3580
3581         knlist_clear(&vi->vpi_selinfo.si_note, 1);
3582         seldrain(&vi->vpi_selinfo);
3583         destroy_vpollinfo_free(vi);
3584 }
3585
3586 /*
3587  * Initalize per-vnode helper structure to hold poll-related state.
3588  */
3589 void
3590 v_addpollinfo(struct vnode *vp)
3591 {
3592         struct vpollinfo *vi;
3593
3594         if (vp->v_pollinfo != NULL)
3595                 return;
3596         vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
3597         mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3598         knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
3599             vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
3600         VI_LOCK(vp);
3601         if (vp->v_pollinfo != NULL) {
3602                 VI_UNLOCK(vp);
3603                 destroy_vpollinfo_free(vi);
3604                 return;
3605         }
3606         vp->v_pollinfo = vi;
3607         VI_UNLOCK(vp);
3608 }
3609
3610 /*
3611  * Record a process's interest in events which might happen to
3612  * a vnode.  Because poll uses the historic select-style interface
3613  * internally, this routine serves as both the ``check for any
3614  * pending events'' and the ``record my interest in future events''
3615  * functions.  (These are done together, while the lock is held,
3616  * to avoid race conditions.)
3617  */
3618 int
3619 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3620 {
3621
3622         v_addpollinfo(vp);
3623         mtx_lock(&vp->v_pollinfo->vpi_lock);
3624         if (vp->v_pollinfo->vpi_revents & events) {
3625                 /*
3626                  * This leaves events we are not interested
3627                  * in available for the other process which
3628                  * which presumably had requested them
3629                  * (otherwise they would never have been
3630                  * recorded).
3631                  */
3632                 events &= vp->v_pollinfo->vpi_revents;
3633                 vp->v_pollinfo->vpi_revents &= ~events;
3634
3635                 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3636                 return (events);
3637         }
3638         vp->v_pollinfo->vpi_events |= events;
3639         selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3640         mtx_unlock(&vp->v_pollinfo->vpi_lock);
3641         return (0);
3642 }
3643
3644 /*
3645  * Routine to create and manage a filesystem syncer vnode.
3646  */
3647 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
3648 static int      sync_fsync(struct  vop_fsync_args *);
3649 static int      sync_inactive(struct  vop_inactive_args *);
3650 static int      sync_reclaim(struct  vop_reclaim_args *);
3651
3652 static struct vop_vector sync_vnodeops = {
3653         .vop_bypass =   VOP_EOPNOTSUPP,
3654         .vop_close =    sync_close,             /* close */
3655         .vop_fsync =    sync_fsync,             /* fsync */
3656         .vop_inactive = sync_inactive,  /* inactive */
3657         .vop_reclaim =  sync_reclaim,   /* reclaim */
3658         .vop_lock1 =    vop_stdlock,    /* lock */
3659         .vop_unlock =   vop_stdunlock,  /* unlock */
3660         .vop_islocked = vop_stdislocked,        /* islocked */
3661 };
3662
3663 /*
3664  * Create a new filesystem syncer vnode for the specified mount point.
3665  */
3666 void
3667 vfs_allocate_syncvnode(struct mount *mp)
3668 {
3669         struct vnode *vp;
3670         struct bufobj *bo;
3671         static long start, incr, next;
3672         int error;
3673
3674         /* Allocate a new vnode */
3675         error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
3676         if (error != 0)
3677                 panic("vfs_allocate_syncvnode: getnewvnode() failed");
3678         vp->v_type = VNON;
3679         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3680         vp->v_vflag |= VV_FORCEINSMQ;
3681         error = insmntque(vp, mp);
3682         if (error != 0)
3683                 panic("vfs_allocate_syncvnode: insmntque() failed");
3684         vp->v_vflag &= ~VV_FORCEINSMQ;
3685         VOP_UNLOCK(vp, 0);
3686         /*
3687          * Place the vnode onto the syncer worklist. We attempt to
3688          * scatter them about on the list so that they will go off
3689          * at evenly distributed times even if all the filesystems
3690          * are mounted at once.
3691          */
3692         next += incr;
3693         if (next == 0 || next > syncer_maxdelay) {
3694                 start /= 2;
3695                 incr /= 2;
3696                 if (start == 0) {
3697                         start = syncer_maxdelay / 2;
3698                         incr = syncer_maxdelay;
3699                 }
3700                 next = start;
3701         }
3702         bo = &vp->v_bufobj;
3703         BO_LOCK(bo);
3704         vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3705         /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3706         mtx_lock(&sync_mtx);
3707         sync_vnode_count++;
3708         if (mp->mnt_syncer == NULL) {
3709                 mp->mnt_syncer = vp;
3710                 vp = NULL;
3711         }
3712         mtx_unlock(&sync_mtx);
3713         BO_UNLOCK(bo);
3714         if (vp != NULL) {
3715                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3716                 vgone(vp);
3717                 vput(vp);
3718         }
3719 }
3720
3721 void
3722 vfs_deallocate_syncvnode(struct mount *mp)
3723 {
3724         struct vnode *vp;
3725
3726         mtx_lock(&sync_mtx);
3727         vp = mp->mnt_syncer;
3728         if (vp != NULL)
3729                 mp->mnt_syncer = NULL;
3730         mtx_unlock(&sync_mtx);
3731         if (vp != NULL)
3732                 vrele(vp);
3733 }
3734
3735 /*
3736  * Do a lazy sync of the filesystem.
3737  */
3738 static int
3739 sync_fsync(struct vop_fsync_args *ap)
3740 {
3741         struct vnode *syncvp = ap->a_vp;
3742         struct mount *mp = syncvp->v_mount;
3743         int error, save;
3744         struct bufobj *bo;
3745
3746         /*
3747          * We only need to do something if this is a lazy evaluation.
3748          */
3749         if (ap->a_waitfor != MNT_LAZY)
3750                 return (0);
3751
3752         /*
3753          * Move ourselves to the back of the sync list.
3754          */
3755         bo = &syncvp->v_bufobj;
3756         BO_LOCK(bo);
3757         vn_syncer_add_to_worklist(bo, syncdelay);
3758         BO_UNLOCK(bo);
3759
3760         /*
3761          * Walk the list of vnodes pushing all that are dirty and
3762          * not already on the sync list.
3763          */
3764         if (vfs_busy(mp, MBF_NOWAIT) != 0)
3765                 return (0);
3766         if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3767                 vfs_unbusy(mp);
3768                 return (0);
3769         }
3770         save = curthread_pflags_set(TDP_SYNCIO);
3771         vfs_msync(mp, MNT_NOWAIT);
3772         error = VFS_SYNC(mp, MNT_LAZY);
3773         curthread_pflags_restore(save);
3774         vn_finished_write(mp);
3775         vfs_unbusy(mp);
3776         return (error);
3777 }
3778
3779 /*
3780  * The syncer vnode is no referenced.
3781  */
3782 static int
3783 sync_inactive(struct vop_inactive_args *ap)
3784 {
3785
3786         vgone(ap->a_vp);
3787         return (0);
3788 }
3789
3790 /*
3791  * The syncer vnode is no longer needed and is being decommissioned.
3792  *
3793  * Modifications to the worklist must be protected by sync_mtx.
3794  */
3795 static int
3796 sync_reclaim(struct vop_reclaim_args *ap)
3797 {
3798         struct vnode *vp = ap->a_vp;
3799         struct bufobj *bo;
3800
3801         bo = &vp->v_bufobj;
3802         BO_LOCK(bo);
3803         mtx_lock(&sync_mtx);
3804         if (vp->v_mount->mnt_syncer == vp)
3805                 vp->v_mount->mnt_syncer = NULL;
3806         if (bo->bo_flag & BO_ONWORKLST) {
3807                 LIST_REMOVE(bo, bo_synclist);
3808                 syncer_worklist_len--;
3809                 sync_vnode_count--;
3810                 bo->bo_flag &= ~BO_ONWORKLST;
3811         }
3812         mtx_unlock(&sync_mtx);
3813         BO_UNLOCK(bo);
3814
3815         return (0);
3816 }
3817
3818 /*
3819  * Check if vnode represents a disk device
3820  */
3821 int
3822 vn_isdisk(struct vnode *vp, int *errp)
3823 {
3824         int error;
3825
3826         if (vp->v_type != VCHR) {
3827                 error = ENOTBLK;
3828                 goto out;
3829         }
3830         error = 0;
3831         dev_lock();
3832         if (vp->v_rdev == NULL)
3833                 error = ENXIO;
3834         else if (vp->v_rdev->si_devsw == NULL)
3835                 error = ENXIO;
3836         else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3837                 error = ENOTBLK;
3838         dev_unlock();
3839 out:
3840         if (errp != NULL)
3841                 *errp = error;
3842         return (error == 0);
3843 }
3844
3845 /*
3846  * Common filesystem object access control check routine.  Accepts a
3847  * vnode's type, "mode", uid and gid, requested access mode, credentials,
3848  * and optional call-by-reference privused argument allowing vaccess()
3849  * to indicate to the caller whether privilege was used to satisfy the
3850  * request (obsoleted).  Returns 0 on success, or an errno on failure.
3851  */
3852 int
3853 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
3854     accmode_t accmode, struct ucred *cred, int *privused)
3855 {
3856         accmode_t dac_granted;
3857         accmode_t priv_granted;
3858
3859         KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
3860             ("invalid bit in accmode"));
3861         KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
3862             ("VAPPEND without VWRITE"));
3863
3864         /*
3865          * Look for a normal, non-privileged way to access the file/directory
3866          * as requested.  If it exists, go with that.
3867          */
3868
3869         if (privused != NULL)
3870                 *privused = 0;
3871
3872         dac_granted = 0;
3873
3874         /* Check the owner. */
3875         if (cred->cr_uid == file_uid) {
3876                 dac_granted |= VADMIN;
3877                 if (file_mode & S_IXUSR)
3878                         dac_granted |= VEXEC;
3879                 if (file_mode & S_IRUSR)
3880                         dac_granted |= VREAD;
3881                 if (file_mode & S_IWUSR)
3882                         dac_granted |= (VWRITE | VAPPEND);
3883
3884                 if ((accmode & dac_granted) == accmode)
3885                         return (0);
3886
3887                 goto privcheck;
3888         }
3889
3890         /* Otherwise, check the groups (first match) */
3891         if (groupmember(file_gid, cred)) {
3892                 if (file_mode & S_IXGRP)
3893                         dac_granted |= VEXEC;
3894                 if (file_mode & S_IRGRP)
3895                         dac_granted |= VREAD;
3896                 if (file_mode & S_IWGRP)
3897                         dac_granted |= (VWRITE | VAPPEND);
3898
3899                 if ((accmode & dac_granted) == accmode)
3900                         return (0);
3901
3902                 goto privcheck;
3903         }
3904
3905         /* Otherwise, check everyone else. */
3906         if (file_mode & S_IXOTH)
3907                 dac_granted |= VEXEC;
3908         if (file_mode & S_IROTH)
3909                 dac_granted |= VREAD;
3910         if (file_mode & S_IWOTH)
3911                 dac_granted |= (VWRITE | VAPPEND);
3912         if ((accmode & dac_granted) == accmode)
3913                 return (0);
3914
3915 privcheck:
3916         /*
3917          * Build a privilege mask to determine if the set of privileges
3918          * satisfies the requirements when combined with the granted mask
3919          * from above.  For each privilege, if the privilege is required,
3920          * bitwise or the request type onto the priv_granted mask.
3921          */
3922         priv_granted = 0;
3923
3924         if (type == VDIR) {
3925                 /*
3926                  * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
3927                  * requests, instead of PRIV_VFS_EXEC.
3928                  */
3929                 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3930                     !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
3931                         priv_granted |= VEXEC;
3932         } else {
3933                 /*
3934                  * Ensure that at least one execute bit is on. Otherwise,
3935                  * a privileged user will always succeed, and we don't want
3936                  * this to happen unless the file really is executable.
3937                  */
3938                 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3939                     (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
3940                     !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
3941                         priv_granted |= VEXEC;
3942         }
3943
3944         if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
3945             !priv_check_cred(cred, PRIV_VFS_READ, 0))
3946                 priv_granted |= VREAD;
3947
3948         if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3949             !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
3950                 priv_granted |= (VWRITE | VAPPEND);
3951
3952         if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3953             !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
3954                 priv_granted |= VADMIN;
3955
3956         if ((accmode & (priv_granted | dac_granted)) == accmode) {
3957                 /* XXX audit: privilege used */
3958                 if (privused != NULL)
3959                         *privused = 1;
3960                 return (0);
3961         }
3962
3963         return ((accmode & VADMIN) ? EPERM : EACCES);
3964 }
3965
3966 /*
3967  * Credential check based on process requesting service, and per-attribute
3968  * permissions.
3969  */
3970 int
3971 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
3972     struct thread *td, accmode_t accmode)
3973 {
3974
3975         /*
3976          * Kernel-invoked always succeeds.
3977          */
3978         if (cred == NOCRED)
3979                 return (0);
3980
3981         /*
3982          * Do not allow privileged processes in jail to directly manipulate
3983          * system attributes.
3984          */
3985         switch (attrnamespace) {
3986         case EXTATTR_NAMESPACE_SYSTEM:
3987                 /* Potentially should be: return (EPERM); */
3988                 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
3989         case EXTATTR_NAMESPACE_USER:
3990                 return (VOP_ACCESS(vp, accmode, cred, td));
3991         default:
3992                 return (EPERM);
3993         }
3994 }
3995
3996 #ifdef DEBUG_VFS_LOCKS
3997 /*
3998  * This only exists to supress warnings from unlocked specfs accesses.  It is
3999  * no longer ok to have an unlocked VFS.
4000  */
4001 #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||            \
4002         (vp)->v_type == VCHR || (vp)->v_type == VBAD)
4003
4004 int vfs_badlock_ddb = 1;        /* Drop into debugger on violation. */
4005 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
4006     "Drop into debugger on lock violation");
4007
4008 int vfs_badlock_mutex = 1;      /* Check for interlock across VOPs. */
4009 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
4010     0, "Check for interlock across VOPs");
4011
4012 int vfs_badlock_print = 1;      /* Print lock violations. */
4013 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
4014     0, "Print lock violations");
4015
4016 #ifdef KDB
4017 int vfs_badlock_backtrace = 1;  /* Print backtrace at lock violations. */
4018 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
4019     &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
4020 #endif
4021
4022 static void
4023 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
4024 {
4025
4026 #ifdef KDB
4027         if (vfs_badlock_backtrace)
4028                 kdb_backtrace();
4029 #endif
4030         if (vfs_badlock_print)
4031                 printf("%s: %p %s\n", str, (void *)vp, msg);
4032         if (vfs_badlock_ddb)
4033                 kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4034 }
4035
4036 void
4037 assert_vi_locked(struct vnode *vp, const char *str)
4038 {
4039
4040         if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
4041                 vfs_badlock("interlock is not locked but should be", str, vp);
4042 }
4043
4044 void
4045 assert_vi_unlocked(struct vnode *vp, const char *str)
4046 {
4047
4048         if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
4049                 vfs_badlock("interlock is locked but should not be", str, vp);
4050 }
4051
4052 void
4053 assert_vop_locked(struct vnode *vp, const char *str)
4054 {
4055         int locked;
4056
4057         if (!IGNORE_LOCK(vp)) {
4058                 locked = VOP_ISLOCKED(vp);
4059                 if (locked == 0 || locked == LK_EXCLOTHER)
4060                         vfs_badlock("is not locked but should be", str, vp);
4061         }
4062 }
4063
4064 void
4065 assert_vop_unlocked(struct vnode *vp, const char *str)
4066 {
4067
4068         if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
4069                 vfs_badlock("is locked but should not be", str, vp);
4070 }
4071
4072 void
4073 assert_vop_elocked(struct vnode *vp, const char *str)
4074 {
4075
4076         if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
4077                 vfs_badlock("is not exclusive locked but should be", str, vp);
4078 }
4079
4080 #if 0
4081 void
4082 assert_vop_elocked_other(struct vnode *vp, const char *str)
4083 {
4084
4085         if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
4086                 vfs_badlock("is not exclusive locked by another thread",
4087                     str, vp);
4088 }
4089
4090 void
4091 assert_vop_slocked(struct vnode *vp, const char *str)
4092 {
4093
4094         if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
4095                 vfs_badlock("is not locked shared but should be", str, vp);
4096 }
4097 #endif /* 0 */
4098 #endif /* DEBUG_VFS_LOCKS */
4099
4100 void
4101 vop_rename_fail(struct vop_rename_args *ap)
4102 {
4103
4104         if (ap->a_tvp != NULL)
4105                 vput(ap->a_tvp);
4106         if (ap->a_tdvp == ap->a_tvp)
4107                 vrele(ap->a_tdvp);
4108         else
4109                 vput(ap->a_tdvp);
4110         vrele(ap->a_fdvp);
4111         vrele(ap->a_fvp);
4112 }
4113
4114 void
4115 vop_rename_pre(void *ap)
4116 {
4117         struct vop_rename_args *a = ap;
4118
4119 #ifdef DEBUG_VFS_LOCKS
4120         if (a->a_tvp)
4121                 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
4122         ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
4123         ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
4124         ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
4125
4126         /* Check the source (from). */
4127         if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
4128             (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
4129                 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
4130         if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
4131                 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
4132
4133         /* Check the target. */
4134         if (a->a_tvp)
4135                 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
4136         ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
4137 #endif
4138         if (a->a_tdvp != a->a_fdvp)
4139                 vhold(a->a_fdvp);
4140         if (a->a_tvp != a->a_fvp)
4141                 vhold(a->a_fvp);
4142         vhold(a->a_tdvp);
4143         if (a->a_tvp)
4144                 vhold(a->a_tvp);
4145 }
4146
4147 void
4148 vop_strategy_pre(void *ap)
4149 {
4150 #ifdef DEBUG_VFS_LOCKS
4151         struct vop_strategy_args *a;
4152         struct buf *bp;
4153
4154         a = ap;
4155         bp = a->a_bp;
4156
4157         /*
4158          * Cluster ops lock their component buffers but not the IO container.
4159          */
4160         if ((bp->b_flags & B_CLUSTER) != 0)
4161                 return;
4162
4163         if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
4164                 if (vfs_badlock_print)
4165                         printf(
4166                             "VOP_STRATEGY: bp is not locked but should be\n");
4167                 if (vfs_badlock_ddb)
4168                         kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4169         }
4170 #endif
4171 }
4172
4173 void
4174 vop_lock_pre(void *ap)
4175 {
4176 #ifdef DEBUG_VFS_LOCKS
4177         struct vop_lock1_args *a = ap;
4178
4179         if ((a->a_flags & LK_INTERLOCK) == 0)
4180                 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4181         else
4182                 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
4183 #endif
4184 }
4185
4186 void
4187 vop_lock_post(void *ap, int rc)
4188 {
4189 #ifdef DEBUG_VFS_LOCKS
4190         struct vop_lock1_args *a = ap;
4191
4192         ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4193         if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
4194                 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
4195 #endif
4196 }
4197
4198 void
4199 vop_unlock_pre(void *ap)
4200 {
4201 #ifdef DEBUG_VFS_LOCKS
4202         struct vop_unlock_args *a = ap;
4203
4204         if (a->a_flags & LK_INTERLOCK)
4205                 ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
4206         ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
4207 #endif
4208 }
4209
4210 void
4211 vop_unlock_post(void *ap, int rc)
4212 {
4213 #ifdef DEBUG_VFS_LOCKS
4214         struct vop_unlock_args *a = ap;
4215
4216         if (a->a_flags & LK_INTERLOCK)
4217                 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
4218 #endif
4219 }
4220
4221 void
4222 vop_create_post(void *ap, int rc)
4223 {
4224         struct vop_create_args *a = ap;
4225
4226         if (!rc)
4227                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4228 }
4229
4230 void
4231 vop_deleteextattr_post(void *ap, int rc)
4232 {
4233         struct vop_deleteextattr_args *a = ap;
4234
4235         if (!rc)
4236                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4237 }
4238
4239 void
4240 vop_link_post(void *ap, int rc)
4241 {
4242         struct vop_link_args *a = ap;
4243
4244         if (!rc) {
4245                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
4246                 VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
4247         }
4248 }
4249
4250 void
4251 vop_mkdir_post(void *ap, int rc)
4252 {
4253         struct vop_mkdir_args *a = ap;
4254
4255         if (!rc)
4256                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4257 }
4258
4259 void
4260 vop_mknod_post(void *ap, int rc)
4261 {
4262         struct vop_mknod_args *a = ap;
4263
4264         if (!rc)
4265                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4266 }
4267
4268 void
4269 vop_remove_post(void *ap, int rc)
4270 {
4271         struct vop_remove_args *a = ap;
4272
4273         if (!rc) {
4274                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4275                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4276         }
4277 }
4278
4279 void
4280 vop_rename_post(void *ap, int rc)
4281 {
4282         struct vop_rename_args *a = ap;
4283
4284         if (!rc) {
4285                 VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
4286                 VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
4287                 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
4288                 if (a->a_tvp)
4289                         VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
4290         }
4291         if (a->a_tdvp != a->a_fdvp)
4292                 vdrop(a->a_fdvp);
4293         if (a->a_tvp != a->a_fvp)
4294                 vdrop(a->a_fvp);
4295         vdrop(a->a_tdvp);
4296         if (a->a_tvp)
4297                 vdrop(a->a_tvp);
4298 }
4299
4300 void
4301 vop_rmdir_post(void *ap, int rc)
4302 {
4303         struct vop_rmdir_args *a = ap;
4304
4305         if (!rc) {
4306                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4307                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4308         }
4309 }
4310
4311 void
4312 vop_setattr_post(void *ap, int rc)
4313 {
4314         struct vop_setattr_args *a = ap;
4315
4316         if (!rc)
4317                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4318 }
4319
4320 void
4321 vop_setextattr_post(void *ap, int rc)
4322 {
4323         struct vop_setextattr_args *a = ap;
4324
4325         if (!rc)
4326                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4327 }
4328
4329 void
4330 vop_symlink_post(void *ap, int rc)
4331 {
4332         struct vop_symlink_args *a = ap;
4333
4334         if (!rc)
4335                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4336 }
4337
4338 static struct knlist fs_knlist;
4339
4340 static void
4341 vfs_event_init(void *arg)
4342 {
4343         knlist_init_mtx(&fs_knlist, NULL);
4344 }
4345 /* XXX - correct order? */
4346 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
4347
4348 void
4349 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
4350 {
4351
4352         KNOTE_UNLOCKED(&fs_knlist, event);
4353 }
4354
4355 static int      filt_fsattach(struct knote *kn);
4356 static void     filt_fsdetach(struct knote *kn);
4357 static int      filt_fsevent(struct knote *kn, long hint);
4358
4359 struct filterops fs_filtops = {
4360         .f_isfd = 0,
4361         .f_attach = filt_fsattach,
4362         .f_detach = filt_fsdetach,
4363         .f_event = filt_fsevent
4364 };
4365
4366 static int
4367 filt_fsattach(struct knote *kn)
4368 {
4369
4370         kn->kn_flags |= EV_CLEAR;
4371         knlist_add(&fs_knlist, kn, 0);
4372         return (0);
4373 }
4374
4375 static void
4376 filt_fsdetach(struct knote *kn)
4377 {
4378
4379         knlist_remove(&fs_knlist, kn, 0);
4380 }
4381
4382 static int
4383 filt_fsevent(struct knote *kn, long hint)
4384 {
4385
4386         kn->kn_fflags |= hint;
4387         return (kn->kn_fflags != 0);
4388 }
4389
4390 static int
4391 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
4392 {
4393         struct vfsidctl vc;
4394         int error;
4395         struct mount *mp;
4396
4397         error = SYSCTL_IN(req, &vc, sizeof(vc));
4398         if (error)
4399                 return (error);
4400         if (vc.vc_vers != VFS_CTL_VERS1)
4401                 return (EINVAL);
4402         mp = vfs_getvfs(&vc.vc_fsid);
4403         if (mp == NULL)
4404                 return (ENOENT);
4405         /* ensure that a specific sysctl goes to the right filesystem. */
4406         if (strcmp(vc.vc_fstypename, "*") != 0 &&
4407             strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
4408                 vfs_rel(mp);
4409                 return (EINVAL);
4410         }
4411         VCTLTOREQ(&vc, req);
4412         error = VFS_SYSCTL(mp, vc.vc_op, req);
4413         vfs_rel(mp);
4414         return (error);
4415 }
4416
4417 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
4418     NULL, 0, sysctl_vfs_ctl, "",
4419     "Sysctl by fsid");
4420
4421 /*
4422  * Function to initialize a va_filerev field sensibly.
4423  * XXX: Wouldn't a random number make a lot more sense ??
4424  */
4425 u_quad_t
4426 init_va_filerev(void)
4427 {
4428         struct bintime bt;
4429
4430         getbinuptime(&bt);
4431         return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4432 }
4433
4434 static int      filt_vfsread(struct knote *kn, long hint);
4435 static int      filt_vfswrite(struct knote *kn, long hint);
4436 static int      filt_vfsvnode(struct knote *kn, long hint);
4437 static void     filt_vfsdetach(struct knote *kn);
4438 static struct filterops vfsread_filtops = {
4439         .f_isfd = 1,
4440         .f_detach = filt_vfsdetach,
4441         .f_event = filt_vfsread
4442 };
4443 static struct filterops vfswrite_filtops = {
4444         .f_isfd = 1,
4445         .f_detach = filt_vfsdetach,
4446         .f_event = filt_vfswrite
4447 };
4448 static struct filterops vfsvnode_filtops = {
4449         .f_isfd = 1,
4450         .f_detach = filt_vfsdetach,
4451         .f_event = filt_vfsvnode
4452 };
4453
4454 static void
4455 vfs_knllock(void *arg)
4456 {
4457         struct vnode *vp = arg;
4458
4459         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4460 }
4461
4462 static void
4463 vfs_knlunlock(void *arg)
4464 {
4465         struct vnode *vp = arg;
4466
4467         VOP_UNLOCK(vp, 0);
4468 }
4469
4470 static void
4471 vfs_knl_assert_locked(void *arg)
4472 {
4473 #ifdef DEBUG_VFS_LOCKS
4474         struct vnode *vp = arg;
4475
4476         ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
4477 #endif
4478 }
4479
4480 static void
4481 vfs_knl_assert_unlocked(void *arg)
4482 {
4483 #ifdef DEBUG_VFS_LOCKS
4484         struct vnode *vp = arg;
4485
4486         ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
4487 #endif
4488 }
4489
4490 int
4491 vfs_kqfilter(struct vop_kqfilter_args *ap)
4492 {
4493         struct vnode *vp = ap->a_vp;
4494         struct knote *kn = ap->a_kn;
4495         struct knlist *knl;
4496
4497         switch (kn->kn_filter) {
4498         case EVFILT_READ:
4499                 kn->kn_fop = &vfsread_filtops;
4500                 break;
4501         case EVFILT_WRITE:
4502                 kn->kn_fop = &vfswrite_filtops;
4503                 break;
4504         case EVFILT_VNODE:
4505                 kn->kn_fop = &vfsvnode_filtops;
4506                 break;
4507         default:
4508                 return (EINVAL);
4509         }
4510
4511         kn->kn_hook = (caddr_t)vp;
4512
4513         v_addpollinfo(vp);
4514         if (vp->v_pollinfo == NULL)
4515                 return (ENOMEM);
4516         knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4517         vhold(vp);
4518         knlist_add(knl, kn, 0);
4519
4520         return (0);
4521 }
4522
4523 /*
4524  * Detach knote from vnode
4525  */
4526 static void
4527 filt_vfsdetach(struct knote *kn)
4528 {
4529         struct vnode *vp = (struct vnode *)kn->kn_hook;
4530
4531         KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4532         knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4533         vdrop(vp);
4534 }
4535
4536 /*ARGSUSED*/
4537 static int
4538 filt_vfsread(struct knote *kn, long hint)
4539 {
4540         struct vnode *vp = (struct vnode *)kn->kn_hook;
4541         struct vattr va;
4542         int res;
4543
4544         /*
4545          * filesystem is gone, so set the EOF flag and schedule
4546          * the knote for deletion.
4547          */
4548         if (hint == NOTE_REVOKE) {
4549                 VI_LOCK(vp);
4550                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4551                 VI_UNLOCK(vp);
4552                 return (1);
4553         }
4554
4555         if (VOP_GETATTR(vp, &va, curthread->td_ucred))
4556                 return (0);
4557
4558         VI_LOCK(vp);
4559         kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4560         res = (kn->kn_data != 0);
4561         VI_UNLOCK(vp);
4562         return (res);
4563 }
4564
4565 /*ARGSUSED*/
4566 static int
4567 filt_vfswrite(struct knote *kn, long hint)
4568 {
4569         struct vnode *vp = (struct vnode *)kn->kn_hook;
4570
4571         VI_LOCK(vp);
4572
4573         /*
4574          * filesystem is gone, so set the EOF flag and schedule
4575          * the knote for deletion.
4576          */
4577         if (hint == NOTE_REVOKE)
4578                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4579
4580         kn->kn_data = 0;
4581         VI_UNLOCK(vp);
4582         return (1);
4583 }
4584
4585 static int
4586 filt_vfsvnode(struct knote *kn, long hint)
4587 {
4588         struct vnode *vp = (struct vnode *)kn->kn_hook;
4589         int res;
4590
4591         VI_LOCK(vp);
4592         if (kn->kn_sfflags & hint)
4593                 kn->kn_fflags |= hint;
4594         if (hint == NOTE_REVOKE) {
4595                 kn->kn_flags |= EV_EOF;
4596                 VI_UNLOCK(vp);
4597                 return (1);
4598         }
4599         res = (kn->kn_fflags != 0);
4600         VI_UNLOCK(vp);
4601         return (res);
4602 }
4603
4604 int
4605 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4606 {
4607         int error;
4608
4609         if (dp->d_reclen > ap->a_uio->uio_resid)
4610                 return (ENAMETOOLONG);
4611         error = uiomove(dp, dp->d_reclen, ap->a_uio);
4612         if (error) {
4613                 if (ap->a_ncookies != NULL) {
4614                         if (ap->a_cookies != NULL)
4615                                 free(ap->a_cookies, M_TEMP);
4616                         ap->a_cookies = NULL;
4617                         *ap->a_ncookies = 0;
4618                 }
4619                 return (error);
4620         }
4621         if (ap->a_ncookies == NULL)
4622                 return (0);
4623
4624         KASSERT(ap->a_cookies,
4625             ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4626
4627         *ap->a_cookies = realloc(*ap->a_cookies,
4628             (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4629         (*ap->a_cookies)[*ap->a_ncookies] = off;
4630         return (0);
4631 }
4632
4633 /*
4634  * Mark for update the access time of the file if the filesystem
4635  * supports VOP_MARKATIME.  This functionality is used by execve and
4636  * mmap, so we want to avoid the I/O implied by directly setting
4637  * va_atime for the sake of efficiency.
4638  */
4639 void
4640 vfs_mark_atime(struct vnode *vp, struct ucred *cred)
4641 {
4642         struct mount *mp;
4643
4644         mp = vp->v_mount;
4645         ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
4646         if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
4647                 (void)VOP_MARKATIME(vp);
4648 }
4649
4650 /*
4651  * The purpose of this routine is to remove granularity from accmode_t,
4652  * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
4653  * VADMIN and VAPPEND.
4654  *
4655  * If it returns 0, the caller is supposed to continue with the usual
4656  * access checks using 'accmode' as modified by this routine.  If it
4657  * returns nonzero value, the caller is supposed to return that value
4658  * as errno.
4659  *
4660  * Note that after this routine runs, accmode may be zero.
4661  */
4662 int
4663 vfs_unixify_accmode(accmode_t *accmode)
4664 {
4665         /*
4666          * There is no way to specify explicit "deny" rule using
4667          * file mode or POSIX.1e ACLs.
4668          */
4669         if (*accmode & VEXPLICIT_DENY) {
4670                 *accmode = 0;
4671                 return (0);
4672         }
4673
4674         /*
4675          * None of these can be translated into usual access bits.
4676          * Also, the common case for NFSv4 ACLs is to not contain
4677          * either of these bits. Caller should check for VWRITE
4678          * on the containing directory instead.
4679          */
4680         if (*accmode & (VDELETE_CHILD | VDELETE))
4681                 return (EPERM);
4682
4683         if (*accmode & VADMIN_PERMS) {
4684                 *accmode &= ~VADMIN_PERMS;
4685                 *accmode |= VADMIN;
4686         }
4687
4688         /*
4689          * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
4690          * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
4691          */
4692         *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
4693
4694         return (0);
4695 }
4696
4697 /*
4698  * These are helper functions for filesystems to traverse all
4699  * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
4700  *
4701  * This interface replaces MNT_VNODE_FOREACH.
4702  */
4703
4704 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
4705
4706 struct vnode *
4707 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
4708 {
4709         struct vnode *vp;
4710
4711         if (should_yield())
4712                 kern_yield(PRI_USER);
4713         MNT_ILOCK(mp);
4714         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4715         vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
4716         while (vp != NULL && (vp->v_type == VMARKER ||
4717             (vp->v_iflag & VI_DOOMED) != 0))
4718                 vp = TAILQ_NEXT(vp, v_nmntvnodes);
4719
4720         /* Check if we are done */
4721         if (vp == NULL) {
4722                 __mnt_vnode_markerfree_all(mvp, mp);
4723                 /* MNT_IUNLOCK(mp); -- done in above function */
4724                 mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
4725                 return (NULL);
4726         }
4727         TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4728         TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4729         VI_LOCK(vp);
4730         MNT_IUNLOCK(mp);
4731         return (vp);
4732 }
4733
4734 struct vnode *
4735 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
4736 {
4737         struct vnode *vp;
4738
4739         *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4740         MNT_ILOCK(mp);
4741         MNT_REF(mp);
4742         (*mvp)->v_type = VMARKER;
4743
4744         vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
4745         while (vp != NULL && (vp->v_type == VMARKER ||
4746             (vp->v_iflag & VI_DOOMED) != 0))
4747                 vp = TAILQ_NEXT(vp, v_nmntvnodes);
4748
4749         /* Check if we are done */
4750         if (vp == NULL) {
4751                 MNT_REL(mp);
4752                 MNT_IUNLOCK(mp);
4753                 free(*mvp, M_VNODE_MARKER);
4754                 *mvp = NULL;
4755                 return (NULL);
4756         }
4757         (*mvp)->v_mount = mp;
4758         TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4759         VI_LOCK(vp);
4760         MNT_IUNLOCK(mp);
4761         return (vp);
4762 }
4763
4764
4765 void
4766 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
4767 {
4768
4769         if (*mvp == NULL) {
4770                 MNT_IUNLOCK(mp);
4771                 return;
4772         }
4773
4774         mtx_assert(MNT_MTX(mp), MA_OWNED);
4775
4776         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4777         TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4778         MNT_REL(mp);
4779         MNT_IUNLOCK(mp);
4780         free(*mvp, M_VNODE_MARKER);
4781         *mvp = NULL;
4782 }
4783
4784 /*
4785  * These are helper functions for filesystems to traverse their
4786  * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
4787  */
4788 static void
4789 mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4790 {
4791
4792         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4793
4794         MNT_ILOCK(mp);
4795         MNT_REL(mp);
4796         MNT_IUNLOCK(mp);
4797         free(*mvp, M_VNODE_MARKER);
4798         *mvp = NULL;
4799 }
4800
4801 static struct vnode *
4802 mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4803 {
4804         struct vnode *vp, *nvp;
4805
4806         mtx_assert(&vnode_free_list_mtx, MA_OWNED);
4807         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4808 restart:
4809         vp = TAILQ_NEXT(*mvp, v_actfreelist);
4810         TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4811         while (vp != NULL) {
4812                 if (vp->v_type == VMARKER) {
4813                         vp = TAILQ_NEXT(vp, v_actfreelist);
4814                         continue;
4815                 }
4816                 if (!VI_TRYLOCK(vp)) {
4817                         if (mp_ncpus == 1 || should_yield()) {
4818                                 TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4819                                 mtx_unlock(&vnode_free_list_mtx);
4820                                 pause("vnacti", 1);
4821                                 mtx_lock(&vnode_free_list_mtx);
4822                                 goto restart;
4823                         }
4824                         continue;
4825                 }
4826                 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
4827                 KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
4828                     ("alien vnode on the active list %p %p", vp, mp));
4829                 if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
4830                         break;
4831                 nvp = TAILQ_NEXT(vp, v_actfreelist);
4832                 VI_UNLOCK(vp);
4833                 vp = nvp;
4834         }
4835
4836         /* Check if we are done */
4837         if (vp == NULL) {
4838                 mtx_unlock(&vnode_free_list_mtx);
4839                 mnt_vnode_markerfree_active(mvp, mp);
4840                 return (NULL);
4841         }
4842         TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
4843         mtx_unlock(&vnode_free_list_mtx);
4844         ASSERT_VI_LOCKED(vp, "active iter");
4845         KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
4846         return (vp);
4847 }
4848
4849 struct vnode *
4850 __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4851 {
4852
4853         if (should_yield())
4854                 kern_yield(PRI_USER);
4855         mtx_lock(&vnode_free_list_mtx);
4856         return (mnt_vnode_next_active(mvp, mp));
4857 }
4858
4859 struct vnode *
4860 __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
4861 {
4862         struct vnode *vp;
4863
4864         *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4865         MNT_ILOCK(mp);
4866         MNT_REF(mp);
4867         MNT_IUNLOCK(mp);
4868         (*mvp)->v_type = VMARKER;
4869         (*mvp)->v_mount = mp;
4870
4871         mtx_lock(&vnode_free_list_mtx);
4872         vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
4873         if (vp == NULL) {
4874                 mtx_unlock(&vnode_free_list_mtx);
4875                 mnt_vnode_markerfree_active(mvp, mp);
4876                 return (NULL);
4877         }
4878         TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4879         return (mnt_vnode_next_active(mvp, mp));
4880 }
4881
4882 void
4883 __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4884 {
4885
4886         if (*mvp == NULL)
4887                 return;
4888
4889         mtx_lock(&vnode_free_list_mtx);
4890         TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4891         mtx_unlock(&vnode_free_list_mtx);
4892         mnt_vnode_markerfree_active(mvp, mp);
4893 }