sys/kern/vfs_subr.c

   1 /*-
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 4. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
  35  */
  36
  37 /*
  38  * External virtual filesystem routines
  39  */
  40
  41 #include <sys/cdefs.h>
  42 __FBSDID("$FreeBSD$");
  43
  44 #include "opt_compat.h"
  45 #include "opt_ddb.h"
  46 #include "opt_watchdog.h"
  47
  48 #include <sys/param.h>
  49 #include <sys/systm.h>
  50 #include <sys/bio.h>
  51 #include <sys/buf.h>
  52 #include <sys/condvar.h>
  53 #include <sys/conf.h>
  54 #include <sys/dirent.h>
  55 #include <sys/event.h>
  56 #include <sys/eventhandler.h>
  57 #include <sys/extattr.h>
  58 #include <sys/file.h>
  59 #include <sys/fcntl.h>
  60 #include <sys/jail.h>
  61 #include <sys/kdb.h>
  62 #include <sys/kernel.h>
  63 #include <sys/kthread.h>
  64 #include <sys/lockf.h>
  65 #include <sys/malloc.h>
  66 #include <sys/mount.h>
  67 #include <sys/namei.h>
  68 #include <sys/pctrie.h>
  69 #include <sys/priv.h>
  70 #include <sys/reboot.h>
  71 #include <sys/rwlock.h>
  72 #include <sys/sched.h>
  73 #include <sys/sleepqueue.h>
  74 #include <sys/smp.h>
  75 #include <sys/stat.h>
  76 #include <sys/sysctl.h>
  77 #include <sys/syslog.h>
  78 #include <sys/vmmeter.h>
  79 #include <sys/vnode.h>
  80 #include <sys/watchdog.h>
  81
  82 #include <machine/stdarg.h>
  83
  84 #include <security/mac/mac_framework.h>
  85
  86 #include <vm/vm.h>
  87 #include <vm/vm_object.h>
  88 #include <vm/vm_extern.h>
  89 #include <vm/pmap.h>
  90 #include <vm/vm_map.h>
  91 #include <vm/vm_page.h>
  92 #include <vm/vm_kern.h>
  93 #include <vm/uma.h>
  94
  95 #ifdef DDB
  96 #include <ddb/ddb.h>
  97 #endif
  98
  99 static void     delmntque(struct vnode *vp);
 100 static int      flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
 101                     int slpflag, int slptimeo);
 102 static void     syncer_shutdown(void *arg, int howto);
 103 static int      vtryrecycle(struct vnode *vp);
 104 static void     v_incr_usecount(struct vnode *);
 105 static void     v_decr_usecount(struct vnode *);
 106 static void     v_decr_useonly(struct vnode *);
 107 static void     v_upgrade_usecount(struct vnode *);
 108 static void     vnlru_free(int);
 109 static void     vgonel(struct vnode *);
 110 static void     vfs_knllock(void *arg);
 111 static void     vfs_knlunlock(void *arg);
 112 static void     vfs_knl_assert_locked(void *arg);
 113 static void     vfs_knl_assert_unlocked(void *arg);
 114 static void     destroy_vpollinfo(struct vpollinfo *vi);
 115
 116 /*
 117  * Number of vnodes in existence.  Increased whenever getnewvnode()
 118  * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
 119  */
 120 static unsigned long    numvnodes;
 121
 122 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
 123     "Number of vnodes in existence");
 124
 125 static u_long vnodes_created;
 126 SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
 127     0, "Number of vnodes created by getnewvnode");
 128
 129 /*
 130  * Conversion tables for conversion from vnode types to inode formats
 131  * and back.
 132  */
 133 enum vtype iftovt_tab[16] = {
 134         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 135         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
 136 };
 137 int vttoif_tab[10] = {
 138         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 139         S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
 140 };
 141
 142 /*
 143  * List of vnodes that are ready for recycling.
 144  */
 145 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
 146
 147 /*
 148  * Free vnode target.  Free vnodes may simply be files which have been stat'd
 149  * but not read.  This is somewhat common, and a small cache of such files
 150  * should be kept to avoid recreation costs.
 151  */
 152 static u_long wantfreevnodes;
 153 SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
 154 /* Number of vnodes in the free list. */
 155 static u_long freevnodes;
 156 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
 157     "Number of vnodes in the free list");
 158
 159 static int vlru_allow_cache_src;
 160 SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
 161     &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
 162
 163 static u_long recycles_count;
 164 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
 165     "Number of vnodes recycled to avoid exceding kern.maxvnodes");
 166
 167 /*
 168  * Various variables used for debugging the new implementation of
 169  * reassignbuf().
 170  * XXX these are probably of (very) limited utility now.
 171  */
 172 static int reassignbufcalls;
 173 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
 174     "Number of calls to reassignbuf");
 175
 176 /*
 177  * Cache for the mount type id assigned to NFS.  This is used for
 178  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
 179  */
 180 int     nfs_mount_type = -1;
 181
 182 /* To keep more than one thread at a time from running vfs_getnewfsid */
 183 static struct mtx mntid_mtx;
 184
 185 /*
 186  * Lock for any access to the following:
 187  *      vnode_free_list
 188  *      numvnodes
 189  *      freevnodes
 190  */
 191 static struct mtx vnode_free_list_mtx;
 192
 193 /* Publicly exported FS */
 194 struct nfs_public nfs_pub;
 195
 196 static uma_zone_t buf_trie_zone;
 197
 198 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
 199 static uma_zone_t vnode_zone;
 200 static uma_zone_t vnodepoll_zone;
 201
 202 /*
 203  * The workitem queue.
 204  *
 205  * It is useful to delay writes of file data and filesystem metadata
 206  * for tens of seconds so that quickly created and deleted files need
 207  * not waste disk bandwidth being created and removed. To realize this,
 208  * we append vnodes to a "workitem" queue. When running with a soft
 209  * updates implementation, most pending metadata dependencies should
  210  * not wait for more than a few seconds. Thus, mounted block devices
  211  * are delayed only about half the time that file data is delayed.
 212  * Similarly, directory updates are more critical, so are only delayed
 213  * about a third the time that file data is delayed. Thus, there are
 214  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 215  * one each second (driven off the filesystem syncer process). The
 216  * syncer_delayno variable indicates the next queue that is to be processed.
 217  * Items that need to be processed soon are placed in this queue:
 218  *
 219  *      syncer_workitem_pending[syncer_delayno]
 220  *
 221  * A delay of fifteen seconds is done by placing the request fifteen
 222  * entries later in the queue:
 223  *
 224  *      syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 225  *
 226  */
 227 static int syncer_delayno;
 228 static long syncer_mask;
 229 LIST_HEAD(synclist, bufobj);
 230 static struct synclist *syncer_workitem_pending;
 231 /*
 232  * The sync_mtx protects:
 233  *      bo->bo_synclist
 234  *      sync_vnode_count
 235  *      syncer_delayno
 236  *      syncer_state
 237  *      syncer_workitem_pending
 238  *      syncer_worklist_len
 239  *      rushjob
 240  */
 241 static struct mtx sync_mtx;
 242 static struct cv sync_wakeup;
 243
 244 #define SYNCER_MAXDELAY         32
 245 static int syncer_maxdelay = SYNCER_MAXDELAY;   /* maximum delay time */
 246 static int syncdelay = 30;              /* max time to delay syncing data */
 247 static int filedelay = 30;              /* time to delay syncing files */
 248 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
 249     "Time to delay syncing files (in seconds)");
 250 static int dirdelay = 29;               /* time to delay syncing directories */
 251 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
 252     "Time to delay syncing directories (in seconds)");
 253 static int metadelay = 28;              /* time to delay syncing metadata */
 254 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
 255     "Time to delay syncing metadata (in seconds)");
 256 static int rushjob;             /* number of slots to run ASAP */
 257 static int stat_rush_requests;  /* number of times I/O speeded up */
 258 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
 259     "Number of times I/O speeded up (rush requests)");
 260
 261 /*
 262  * When shutting down the syncer, run it at four times normal speed.
 263  */
 264 #define SYNCER_SHUTDOWN_SPEEDUP         4
 265 static int sync_vnode_count;
 266 static int syncer_worklist_len;
 267 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
 268     syncer_state;
 269
 270 /*
 271  * Number of vnodes we want to exist at any one time.  This is mostly used
  272  * to size hash tables in vnode-related code.  (It is normally not used in
  273  * getnewvnode(), as wantfreevnodes is normally nonzero.)
 274  *
 275  * XXX desiredvnodes is historical cruft and should not exist.
 276  */
 277 int desiredvnodes;
 278 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
 279     &desiredvnodes, 0, "Maximum number of vnodes");
 280 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
 281     &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
 282 static int vnlru_nowhere;
 283 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
 284     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
 285
 286 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
 287 static int vnsz2log;
 288
 289 /*
 290  * Support for the bufobj clean & dirty pctrie.
 291  */
 292 static void *
 293 buf_trie_alloc(struct pctrie *ptree)
 294 {
 295
 296         return uma_zalloc(buf_trie_zone, M_NOWAIT);
 297 }
 298
  299 static void
  300 buf_trie_free(struct pctrie *ptree, void *node)
  301 {
  302         /* Give the node back to buf_trie_zone; ptree itself is not used. */
  303         uma_zfree(buf_trie_zone, node);
  304 }
  305 PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
 306
  307 /*
  308  * Initialize the vnode management data structures (run via the SYSINIT below).
  309  *
  310  * Reevaluate the following cap on the number of vnodes after the physical
  311  * memory size exceeds 512GB.  In the limit, as the physical memory size
  312  * grows, the ratio of physical pages to vnodes approaches sixteen to one.
  313  */
  314 #ifndef MAXVNODES_MAX
  315 #define MAXVNODES_MAX   (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
  316 #endif
  317 static void
  318 vntblinit(void *dummy __unused)
  319 {
  320         u_int i;
  321         int physvnodes, virtvnodes;
  322
  323         /*
  324          * Desiredvnodes is a function of the physical memory size and the
  325          * kernel's heap size.  Generally speaking, it scales with the
  326          * physical memory size.  The ratio of desiredvnodes to physical pages
  327          * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
  328          * marginal ratio of desiredvnodes to physical pages is one to
  329          * sixteen.  However, desiredvnodes is limited by the kernel's heap
  330          * size.  The memory required by desiredvnodes vnodes and vm objects
  331          * may not exceed one seventh of the kernel's heap size.
  332          */
  333         physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4,
  334             cnt.v_page_count) / 16;
  335         virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
  336             sizeof(struct vnode)));
  337         desiredvnodes = min(physvnodes, virtvnodes);
  338         if (desiredvnodes > MAXVNODES_MAX) {
  339                 if (bootverbose)
  340                         printf("Reducing kern.maxvnodes %d -> %d\n",
  341                             desiredvnodes, MAXVNODES_MAX);
  342                 desiredvnodes = MAXVNODES_MAX;
  343         }
  344         wantfreevnodes = desiredvnodes / 4;     /* free-list target: 25% */
  345         mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
  346         TAILQ_INIT(&vnode_free_list);
  347         mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
  348         vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
  349             NULL, NULL, UMA_ALIGN_PTR, 0);
  350         vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
  351             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
  352         /*
  353          * Preallocate enough trie nodes to support one per buf (nbuf of
  354          * them) so that an insert cannot fail.  reassignbuf() callers can
  355          * not tolerate the insertion failure.
  356          */
  357         buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
  358             NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
  359             UMA_ZONE_NOFREE | UMA_ZONE_VM);
  360         uma_prealloc(buf_trie_zone, nbuf);
  361         /*
  362          * Initialize the filesystem syncer.
  363          */
  364         syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
  365             &syncer_mask);
  366         syncer_maxdelay = syncer_mask + 1;      /* follow hashinit()'s mask */
  367         mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
  368         cv_init(&sync_wakeup, "syncer");
  369         for (i = 1; i <= sizeof(struct vnode); i <<= 1)
  370                 vnsz2log++;
  371         vnsz2log--;             /* now floor(log2(sizeof(struct vnode))) */
  372 }
  373 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
 374
 375
  376 /*
  377  * Mark a mount point as busy. Used to synchronize access and to delay
  378  * unmounting.  With MBF_MNTLSTLOCK, mountlist_mtx is kept only on failure.
  379  *
  380  * vfs_busy() is a custom lock that can block the caller, but it only
  381  * sleeps while an unmount is active on the mount point.  It returns 0 on
  382  * success and ENOENT on failure.  For a mountpoint mp, the vfs_busy-enforced
  383  * lock is ordered before the lock of any vnode belonging to mp.
  384  *
  385  * Lookup uses vfs_busy() to traverse mount points.
  386  * root fs                      var fs
  387  * / vnode lock         A       / vnode lock (/var)             D
  388  * /var vnode lock      B       /log vnode lock(/var/log)       E
  389  * vfs_busy lock        C       vfs_busy lock                   F
  390  *
  391  * Within each file system, the lock order is C->A->B and F->D->E.
  392  *
  393  * When traversing across mounts, the system follows that lock order:
  394  *
  395  *        C->A->B
  396  *              |
  397  *              +->F->D->E
  398  *
  399  * The lookup() process for namei("/var") illustrates the process:
  400  *  VOP_LOOKUP() obtains B while A is held
  401  *  vfs_busy() obtains a shared lock on F while A and B are held
  402  *  vput() releases lock on B
  403  *  vput() releases lock on A
  404  *  VFS_ROOT() obtains lock on D while shared lock on F is held
  405  *  vfs_unbusy() releases shared lock on F
  406  *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
  407  *    Attempt to lock A (instead of vp_crossmp) while D is held would
  408  *    violate the global order, causing deadlocks.
  409  *
  410  * dounmount() locks B while F is drained.
  411  */
  412 int
  413 vfs_busy(struct mount *mp, int flags)
  414 {
  415
  416         MPASS((flags & ~MBF_MASK) == 0);
  417         CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
  418
  419         MNT_ILOCK(mp);
  420         MNT_REF(mp);            /* paired with MNT_REL() */
  421         /*
  422          * If mount point is currently being unmounted, sleep until the
  423          * mount point fate is decided.  If thread doing the unmounting fails,
  424          * it will clear MNTK_UNMOUNT flag before waking us up, indicating
  425          * that this mount point has survived the unmount attempt and vfs_busy
  426          * should retry.  Otherwise the unmounter thread will set MNTK_REFEXPIRE
  427          * flag in addition to MNTK_UNMOUNT, indicating that mount point is
  428          * about to be really destroyed.  vfs_busy needs to release its
  429          * reference on the mount point in this case and return with ENOENT,
  430          * telling the caller that the mount it tried to busy is no longer
  431          * valid.
  432          */
  433         while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
  434                 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
  435                         MNT_REL(mp);
  436                         MNT_IUNLOCK(mp);
  437                         CTR1(KTR_VFS, "%s: failed busying before sleeping",
  438                             __func__);
  439                         return (ENOENT);
  440                 }
  441                 if (flags & MBF_MNTLSTLOCK)
  442                         mtx_unlock(&mountlist_mtx);
  443                 mp->mnt_kern_flag |= MNTK_MWAIT;
  444                 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
  445                 if (flags & MBF_MNTLSTLOCK)
  446                         mtx_lock(&mountlist_mtx);
  447                 MNT_ILOCK(mp);          /* msleep(PDROP) left it unlocked */
  448         }
  449         if (flags & MBF_MNTLSTLOCK)
  450                 mtx_unlock(&mountlist_mtx);
  451         mp->mnt_lockref++;              /* undone by vfs_unbusy() */
  452         MNT_IUNLOCK(mp);
  453         return (0);
  454 }
 455
  456 /*
  457  * Undo vfs_busy(): drop the reference and the mnt_lockref count taken there.
  458  */
  459 void
  460 vfs_unbusy(struct mount *mp)
  461 {
  462         /* The last unbusy wakes any MNTK_DRAINING sleeper on mnt_lockref. */
  463         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
  464         MNT_ILOCK(mp);
  465         MNT_REL(mp);
  466         KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
  467         mp->mnt_lockref--;
  468         if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
  469                 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
  470                 CTR1(KTR_VFS, "%s: waking up waiters", __func__);
  471                 mp->mnt_kern_flag &= ~MNTK_DRAINING;
  472                 wakeup(&mp->mnt_lockref);
  473         }
  474         MNT_IUNLOCK(mp);
  475 }
 476
 477 /*
 478  * Lookup a mount point by filesystem identifier.
 479  */
 480 struct mount *
 481 vfs_getvfs(fsid_t *fsid)
 482 {
 483         struct mount *mp;
 484
 485         CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
 486         mtx_lock(&mountlist_mtx);
 487         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 488                 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
 489                     mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
 490                         vfs_ref(mp);
 491                         mtx_unlock(&mountlist_mtx);
 492                         return (mp);
 493                 }
 494         }
 495         mtx_unlock(&mountlist_mtx);
 496         CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
 497         return ((struct mount *) 0);
 498 }
 499
  500 /*
  501  * Lookup a mount point by filesystem identifier, busying it before
  502  * returning.  Returns NULL if no mount matches or it cannot be busied.
  503  *
  504  * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
  505  * cache for popular filesystem identifiers.  The cache is lockless, using
  506  * the fact that struct mounts are never freed.  In the worst case we may
  507  * get a pointer to an unmounted or even different filesystem, so we have
  508  * to check what we got, and take the slow path if so.
  509  */
  510 struct mount *
  511 vfs_busyfs(fsid_t *fsid)
  512 {
  513 #define FSID_CACHE_SIZE 256
  514         typedef struct mount * volatile vmp_t;
  515         static vmp_t cache[FSID_CACHE_SIZE];
  516         struct mount *mp;
  517         int error;
  518         uint32_t hash;
  519
  520         CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
  521         hash = fsid->val[0] ^ fsid->val[1];
  522         hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); /* cache index */
  523         mp = cache[hash];
  524         if (mp == NULL ||
  525             mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
  526             mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
  527                 goto slow;              /* empty or mismatched slot */
  528         if (vfs_busy(mp, 0) != 0) {
  529                 cache[hash] = NULL;     /* evict what we could not busy */
  530                 goto slow;
  531         }
  532         if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
  533             mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
  534                 return (mp);            /* fsid re-checked while busied */
  535         else
  536             vfs_unbusy(mp);
  537
  538 slow:
  539         mtx_lock(&mountlist_mtx);
  540         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
  541                 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
  542                     mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
  543                         error = vfs_busy(mp, MBF_MNTLSTLOCK);
  544                         if (error) {    /* mountlist_mtx is still held here */
  545                                 cache[hash] = NULL;
  546                                 mtx_unlock(&mountlist_mtx);
  547                                 return (NULL);
  548                         }
  549                         cache[hash] = mp;       /* remember for next time */
  550                         return (mp);
  551                 }
  552         }
  553         CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
  554         mtx_unlock(&mountlist_mtx);
  555         return ((struct mount *) 0);
  556 }
 557
 558 /*
 559  * Check if a user can access privileged mount options.
 560  */
 561 int
 562 vfs_suser(struct mount *mp, struct thread *td)
 563 {
 564         int error;
 565
 566         /*
 567          * If the thread is jailed, but this is not a jail-friendly file
 568          * system, deny immediately.
 569          */
 570         if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
 571                 return (EPERM);
 572
 573         /*
 574          * If the file system was mounted outside the jail of the calling
 575          * thread, deny immediately.
 576          */
 577         if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
 578                 return (EPERM);
 579
 580         /*
 581          * If file system supports delegated administration, we don't check
 582          * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
 583          * by the file system itself.
 584          * If this is not the user that did original mount, we check for
 585          * the PRIV_VFS_MOUNT_OWNER privilege.
 586          */
 587         if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
 588             mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
 589                 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
 590                         return (error);
 591         }
 592         return (0);
 593 }
 594
 595 /*
 596  * Get a new unique fsid.  Try to make its val[0] unique, since this value
 597  * will be used to create fake device numbers for stat().  Also try (but
 598  * not so hard) make its val[0] unique mod 2^16, since some emulators only
 599  * support 16-bit device numbers.  We end up with unique val[0]'s for the
 600  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 601  *
 602  * Keep in mind that several mounts may be running in parallel.  Starting
 603  * the search one past where the previous search terminated is both a
 604  * micro-optimization and a defense against returning the same fsid to
 605  * different mounts.
 606  */
 607 void
 608 vfs_getnewfsid(struct mount *mp)
 609 {
 610         static uint16_t mntid_base;
 611         struct mount *nmp;
 612         fsid_t tfsid;
 613         int mtype;
 614
 615         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 616         mtx_lock(&mntid_mtx);
 617         mtype = mp->mnt_vfc->vfc_typenum;
 618         tfsid.val[1] = mtype;
 619         mtype = (mtype & 0xFF) << 24;
 620         for (;;) {
 621                 tfsid.val[0] = makedev(255,
 622                     mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
 623                 mntid_base++;
 624                 if ((nmp = vfs_getvfs(&tfsid)) == NULL)
 625                         break;
 626                 vfs_rel(nmp);
 627         }
 628         mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
 629         mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
 630         mtx_unlock(&mntid_mtx);
 631 }
 632
 633 /*
 634  * Knob to control the precision of file timestamps:
 635  *
 636  *   0 = seconds only; nanoseconds zeroed.
 637  *   1 = seconds and nanoseconds, accurate within 1/HZ.
 638  *   2 = seconds and nanoseconds, truncated to microseconds.
 639  * >=3 = seconds and nanoseconds, maximum precision.
 640  */
 641 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 642
 643 static int timestamp_precision = TSP_USEC;
 644 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
 645     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
 646     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, "
 647     "3+: sec + ns (max. precision))");
 648
 649 /*
 650  * Get a current timestamp.
 651  */
 652 void
 653 vfs_timestamp(struct timespec *tsp)
 654 {
 655         struct timeval tv;
 656
 657         switch (timestamp_precision) {
 658         case TSP_SEC:
 659                 tsp->tv_sec = time_second;
 660                 tsp->tv_nsec = 0;
 661                 break;
 662         case TSP_HZ:
 663                 getnanotime(tsp);
 664                 break;
 665         case TSP_USEC:
 666                 microtime(&tv);
 667                 TIMEVAL_TO_TIMESPEC(&tv, tsp);
 668                 break;
 669         case TSP_NSEC:
 670         default:
 671                 nanotime(tsp);
 672                 break;
 673         }
 674 }
 675
 676 /*
 677  * Set vnode attributes to VNOVAL
 678  */
 679 void
 680 vattr_null(struct vattr *vap)
 681 {
 682
 683         vap->va_type = VNON;
 684         vap->va_size = VNOVAL;
 685         vap->va_bytes = VNOVAL;
 686         vap->va_mode = VNOVAL;
 687         vap->va_nlink = VNOVAL;
 688         vap->va_uid = VNOVAL;
 689         vap->va_gid = VNOVAL;
 690         vap->va_fsid = VNOVAL;
 691         vap->va_fileid = VNOVAL;
 692         vap->va_blocksize = VNOVAL;
 693         vap->va_rdev = VNOVAL;
 694         vap->va_atime.tv_sec = VNOVAL;
 695         vap->va_atime.tv_nsec = VNOVAL;
 696         vap->va_mtime.tv_sec = VNOVAL;
 697         vap->va_mtime.tv_nsec = VNOVAL;
 698         vap->va_ctime.tv_sec = VNOVAL;
 699         vap->va_ctime.tv_nsec = VNOVAL;
 700         vap->va_birthtime.tv_sec = VNOVAL;
 701         vap->va_birthtime.tv_nsec = VNOVAL;
 702         vap->va_flags = VNOVAL;
 703         vap->va_gen = VNOVAL;
 704         vap->va_vaflags = 0;
 705 }
 706
  707 /*
  708  * This routine is called when we have too many vnodes.  It examines
  709  * about a tenth of mp's vnodes and will potentially free vnodes that
  710  * still have VM backing store (VM backing store is typically the cause
  711  * of a vnode blowout so we want to do this).  Therefore, this operation
  712  * is not considered cheap.  Returns the number of vnodes reclaimed.
  713  *
  714  * A number of conditions may prevent a vnode from being reclaimed:
  715  * the buffer cache may have references on the vnode, a directory
  716  * vnode may still have references due to the namei cache representing
  717  * underlying files, or the vnode may be in active use.   It is not
  718  * desirable to reuse such vnodes.  These conditions may cause the
  719  * number of vnodes to reach some minimum value regardless of what
  720  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  721  */
  722 static int
  723 vlrureclaim(struct mount *mp)
  724 {
  725         struct vnode *vp;
  726         int done;
  727         int trigger;
  728         int usevnodes;
  729         int count;
  730
  731         /*
  732          * Calculate the trigger point, don't allow user
  733          * screwups to blow us up.   This prevents us from
  734          * recycling vnodes with lots of resident pages.  We
  735          * aren't trying to free memory, we are trying to
  736          * free vnodes.
  737          */
  738         usevnodes = desiredvnodes;
  739         if (usevnodes <= 0)
  740                 usevnodes = 1;
  741         trigger = cnt.v_page_count * 2 / usevnodes;
  742         done = 0;
  743         vn_start_write(NULL, &mp, V_WAIT);
  744         MNT_ILOCK(mp);
  745         count = mp->mnt_nvnodelistsize / 10 + 1;        /* scan budget */
  746         while (count != 0) {
  747                 vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
  748                 while (vp != NULL && vp->v_type == VMARKER)
  749                         vp = TAILQ_NEXT(vp, v_nmntvnodes);
  750                 if (vp == NULL)
  751                         break;
  752                 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
  753                 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
  754                 --count;                /* vp was just moved to the list tail */
  755                 if (!VI_TRYLOCK(vp))
  756                         goto next_iter;
  757                 /*
  758                  * Skip it if it is in use, has v_cache_src entries (unless
  759                  * allowed), is already doomed, or exceeds the trigger.
  760                  */
  761                 if (vp->v_usecount ||
  762                     (!vlru_allow_cache_src &&
  763                         !LIST_EMPTY(&(vp)->v_cache_src)) ||
  764                     (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
  765                     vp->v_object->resident_page_count > trigger)) {
  766                         VI_UNLOCK(vp);
  767                         goto next_iter;
  768                 }
  769                 MNT_IUNLOCK(mp);
  770                 vholdl(vp);
  771                 if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
  772                         vdrop(vp);
  773                         goto next_iter_mntunlocked;
  774                 }
  775                 VI_LOCK(vp);
  776                 /*
  777                  * v_usecount may have been bumped after VOP_LOCK() dropped
  778                  * the vnode interlock and before it was locked again.
  779                  *
  780                  * It is not necessary to recheck VI_DOOMED because it can
  781                  * only be set by another thread that holds both the vnode
  782                  * lock and vnode interlock.  If another thread has the
  783                  * vnode lock before we get to VOP_LOCK() and obtains the
  784                  * vnode interlock after VOP_LOCK() drops the vnode
  785                  * interlock, the other thread will be unable to drop the
  786                  * vnode lock before our VOP_LOCK() call fails.
  787                  */
  788                 if (vp->v_usecount ||
  789                     (!vlru_allow_cache_src &&
  790                         !LIST_EMPTY(&(vp)->v_cache_src)) ||
  791                     (vp->v_object != NULL &&
  792                     vp->v_object->resident_page_count > trigger)) {
  793                         VOP_UNLOCK(vp, LK_INTERLOCK);
  794                         vdrop(vp);
  795                         goto next_iter_mntunlocked;
  796                 }
  797                 KASSERT((vp->v_iflag & VI_DOOMED) == 0,
  798                     ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
  799                 atomic_add_long(&recycles_count, 1);
  800                 vgonel(vp);
  801                 VOP_UNLOCK(vp, 0);
  802                 vdropl(vp);
  803                 done++;
  804 next_iter_mntunlocked:          /* mount interlock is not held here */
  805                 if (!should_yield())
  806                         goto relock_mnt;
  807                 goto yield;
  808 next_iter:                      /* mount interlock is still held here */
  809                 if (!should_yield())
  810                         continue;
  811                 MNT_IUNLOCK(mp);
  812 yield:
  813                 kern_yield(PRI_USER);
  814 relock_mnt:                     /* retake the interlock for the next pass */
  815                 MNT_ILOCK(mp);
  816         }
  817         MNT_IUNLOCK(mp);
  818         vn_finished_write(mp);
  819         return done;
  820 }
 821
 822 /*
 823  * Attempt to keep the free list at wantfreevnodes length.
 824  */
 825 static void
 826 vnlru_free(int count)
 827 {
 828         struct vnode *vp;
 829
 830         mtx_assert(&vnode_free_list_mtx, MA_OWNED);
 831         for (; count > 0; count--) {
 832                 vp = TAILQ_FIRST(&vnode_free_list);
 833                 /*
 834                  * The list can be modified while the free_list_mtx
 835                  * has been dropped and vp could be NULL here.
 836                  */
 837                 if (!vp)
 838                         break;
 839                 VNASSERT(vp->v_op != NULL, vp,
 840                     ("vnlru_free: vnode already reclaimed."));
 841                 KASSERT((vp->v_iflag & VI_FREE) != 0,
 842                     ("Removing vnode not on freelist"));
 843                 KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
 844                     ("Mangling active vnode"));
 845                 TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
 846                 /*
 847                  * Don't recycle if we can't get the interlock.
 848                  */
 849                 if (!VI_TRYLOCK(vp)) {
 850                         TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
 851                         continue;
 852                 }
 853                 VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
 854                     vp, ("vp inconsistent on freelist"));
 855
 856                 /*
 857                  * The clear of VI_FREE prevents activation of the
 858                  * vnode.  There is no sense in putting the vnode on
 859                  * the mount point active list, only to remove it
 860                  * later during recycling.  Inline the relevant part
 861                  * of vholdl(), to avoid triggering assertions or
 862                  * activating.
 863                  */
 864                 freevnodes--;
 865                 vp->v_iflag &= ~VI_FREE;
 866                 vp->v_holdcnt++;
 867
 868                 mtx_unlock(&vnode_free_list_mtx);
 869                 VI_UNLOCK(vp);
 870                 vtryrecycle(vp);
 871                 /*
 872                  * If the recycled succeeded this vdrop will actually free
 873                  * the vnode.  If not it will simply place it back on
 874                  * the free list.
 875                  */
 876                 vdrop(vp);
 877                 mtx_lock(&vnode_free_list_mtx);
 878         }
 879 }
 880 /*
 881  * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of filesystem code has some
 883  * interesting deadlock problems.
 884  */
/*
 * The vnlru kernel process.  Its address is handed to kproc_start()
 * through vnlru_kp, and the pointer value doubles as the sleep/wakeup
 * channel used by vnlru_proc() and getnewvnode_wait().
 */
static struct proc *vnlruproc;
/*
 * Set to 1 by getnewvnode_wait() just before it wakes vnlruproc; reset
 * to 0 by vnlru_proc(), which then wakes the threads sleeping on the
 * address of this variable.
 */
static int vnlruproc_sig;
 887
 888 static void
 889 vnlru_proc(void)
 890 {
 891         struct mount *mp, *nmp;
 892         int done;
 893         struct proc *p = vnlruproc;
 894
 895         EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
 896             SHUTDOWN_PRI_FIRST);
 897
 898         for (;;) {
 899                 kproc_suspend_check(p);
 900                 mtx_lock(&vnode_free_list_mtx);
 901                 if (freevnodes > wantfreevnodes)
 902                         vnlru_free(freevnodes - wantfreevnodes);
 903                 if (numvnodes <= desiredvnodes * 9 / 10) {
 904                         vnlruproc_sig = 0;
 905                         wakeup(&vnlruproc_sig);
 906                         msleep(vnlruproc, &vnode_free_list_mtx,
 907                             PVFS|PDROP, "vlruwt", hz);
 908                         continue;
 909                 }
 910                 mtx_unlock(&vnode_free_list_mtx);
 911                 done = 0;
 912                 mtx_lock(&mountlist_mtx);
 913                 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 914                         if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 915                                 nmp = TAILQ_NEXT(mp, mnt_list);
 916                                 continue;
 917                         }
 918                         done += vlrureclaim(mp);
 919                         mtx_lock(&mountlist_mtx);
 920                         nmp = TAILQ_NEXT(mp, mnt_list);
 921                         vfs_unbusy(mp);
 922                 }
 923                 mtx_unlock(&mountlist_mtx);
 924                 if (done == 0) {
 925 #if 0
 926                         /* These messages are temporary debugging aids */
 927                         if (vnlru_nowhere < 5)
 928                                 printf("vnlru process getting nowhere..\n");
 929                         else if (vnlru_nowhere == 5)
 930                                 printf("vnlru process messages stopped.\n");
 931 #endif
 932                         vnlru_nowhere++;
 933                         tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 934                 } else
 935                         kern_yield(PRI_USER);
 936         }
 937 }
 938
/*
 * Startup descriptor for the vnlru kernel process: its name, its main
 * loop, and the address of vnlruproc (presumably where the created
 * process pointer gets stored -- confirm against struct kproc_desc).
 * The SYSINIT below passes this descriptor to kproc_start().
 */
static struct kproc_desc vnlru_kp = {
        "vnlru",
        vnlru_proc,
        &vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
    &vnlru_kp);
 946
 947 /*
 948  * Routines having to do with the management of the vnode table.
 949  */
 950
 951 /*
 952  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
 953  * before we actually vgone().  This function must be called with the vnode
 954  * held to prevent the vnode from being returned to the free list midway
 955  * through vgone().
 956  */
 957 static int
 958 vtryrecycle(struct vnode *vp)
 959 {
 960         struct mount *vnmp;
 961
 962         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 963         VNASSERT(vp->v_holdcnt, vp,
 964             ("vtryrecycle: Recycling vp %p without a reference.", vp));
 965         /*
 966          * This vnode may found and locked via some other list, if so we
 967          * can't recycle it yet.
 968          */
 969         if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
 970                 CTR2(KTR_VFS,
 971                     "%s: impossible to recycle, vp %p lock is already held",
 972                     __func__, vp);
 973                 return (EWOULDBLOCK);
 974         }
 975         /*
 976          * Don't recycle if its filesystem is being suspended.
 977          */
 978         if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
 979                 VOP_UNLOCK(vp, 0);
 980                 CTR2(KTR_VFS,
 981                     "%s: impossible to recycle, cannot start the write for %p",
 982                     __func__, vp);
 983                 return (EBUSY);
 984         }
 985         /*
 986          * If we got this far, we need to acquire the interlock and see if
 987          * anyone picked up this vnode from another list.  If not, we will
 988          * mark it with DOOMED via vgonel() so that anyone who does find it
 989          * will skip over it.
 990          */
 991         VI_LOCK(vp);
 992         if (vp->v_usecount) {
 993                 VOP_UNLOCK(vp, LK_INTERLOCK);
 994                 vn_finished_write(vnmp);
 995                 CTR2(KTR_VFS,
 996                     "%s: impossible to recycle, %p is already referenced",
 997                     __func__, vp);
 998                 return (EBUSY);
 999         }
1000         if ((vp->v_iflag & VI_DOOMED) == 0) {
1001                 atomic_add_long(&recycles_count, 1);
1002                 vgonel(vp);
1003         }
1004         VOP_UNLOCK(vp, LK_INTERLOCK);
1005         vn_finished_write(vnmp);
1006         return (0);
1007 }
1008
1009 /*
1010  * Wait for available vnodes.
1011  */
1012 static int
1013 getnewvnode_wait(int suspended)
1014 {
1015
1016         mtx_assert(&vnode_free_list_mtx, MA_OWNED);
1017         if (numvnodes > desiredvnodes) {
1018                 if (suspended) {
1019                         /*
1020                          * File system is beeing suspended, we cannot risk a
1021                          * deadlock here, so allocate new vnode anyway.
1022                          */
1023                         if (freevnodes > wantfreevnodes)
1024                                 vnlru_free(freevnodes - wantfreevnodes);
1025                         return (0);
1026                 }
1027                 if (vnlruproc_sig == 0) {
1028                         vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
1029                         wakeup(vnlruproc);
1030                 }
1031                 msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
1032                     "vlruwk", hz);
1033         }
1034         return (numvnodes > desiredvnodes ? ENFILE : 0);
1035 }
1036
1037 void
1038 getnewvnode_reserve(u_int count)
1039 {
1040         struct thread *td;
1041
1042         td = curthread;
1043         /* First try to be quick and racy. */
1044         if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
1045                 td->td_vp_reserv += count;
1046                 return;
1047         } else
1048                 atomic_subtract_long(&numvnodes, count);
1049
1050         mtx_lock(&vnode_free_list_mtx);
1051         while (count > 0) {
1052                 if (getnewvnode_wait(0) == 0) {
1053                         count--;
1054                         td->td_vp_reserv++;
1055                         atomic_add_long(&numvnodes, 1);
1056                 }
1057         }
1058         mtx_unlock(&vnode_free_list_mtx);
1059 }
1060
1061 void
1062 getnewvnode_drop_reserve(void)
1063 {
1064         struct thread *td;
1065
1066         td = curthread;
1067         atomic_subtract_long(&numvnodes, td->td_vp_reserv);
1068         td->td_vp_reserv = 0;
1069 }
1070
1071 /*
1072  * Return the next vnode from the free list.
1073  */
1074 int
1075 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
1076     struct vnode **vpp)
1077 {
1078         struct vnode *vp;
1079         struct bufobj *bo;
1080         struct thread *td;
1081         int error;
1082
1083         CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
1084         vp = NULL;
1085         td = curthread;
1086         if (td->td_vp_reserv > 0) {
1087                 td->td_vp_reserv -= 1;
1088                 goto alloc;
1089         }
1090         mtx_lock(&vnode_free_list_mtx);
1091         /*
1092          * Lend our context to reclaim vnodes if they've exceeded the max.
1093          */
1094         if (freevnodes > wantfreevnodes)
1095                 vnlru_free(1);
1096         error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
1097             MNTK_SUSPEND));
1098 #if 0   /* XXX Not all VFS_VGET/ffs_vget callers check returns. */
1099         if (error != 0) {
1100                 mtx_unlock(&vnode_free_list_mtx);
1101                 return (error);
1102         }
1103 #endif
1104         atomic_add_long(&numvnodes, 1);
1105         mtx_unlock(&vnode_free_list_mtx);
1106 alloc:
1107         atomic_add_long(&vnodes_created, 1);
1108         vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
1109         /*
1110          * Setup locks.
1111          */
1112         vp->v_vnlock = &vp->v_lock;
1113         mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
1114         /*
1115          * By default, don't allow shared locks unless filesystems
1116          * opt-in.
1117          */
1118         lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE);
1119         /*
1120          * Initialize bufobj.
1121          */
1122         bo = &vp->v_bufobj;
1123         bo->__bo_vnode = vp;
1124         rw_init(BO_LOCKPTR(bo), "bufobj interlock");
1125         bo->bo_ops = &buf_ops_bio;
1126         bo->bo_private = vp;
1127         TAILQ_INIT(&bo->bo_clean.bv_hd);
1128         TAILQ_INIT(&bo->bo_dirty.bv_hd);
1129         /*
1130          * Initialize namecache.
1131          */
1132         LIST_INIT(&vp->v_cache_src);
1133         TAILQ_INIT(&vp->v_cache_dst);
1134         /*
1135          * Finalize various vnode identity bits.
1136          */
1137         vp->v_type = VNON;
1138         vp->v_tag = tag;
1139         vp->v_op = vops;
1140         v_incr_usecount(vp);
1141         vp->v_data = NULL;
1142 #ifdef MAC
1143         mac_vnode_init(vp);
1144         if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1145                 mac_vnode_associate_singlelabel(mp, vp);
1146         else if (mp == NULL && vops != &dead_vnodeops)
1147                 printf("NULL mp in getnewvnode()\n");
1148 #endif
1149         if (mp != NULL) {
1150                 bo->bo_bsize = mp->mnt_stat.f_iosize;
1151                 if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
1152                         vp->v_vflag |= VV_NOKNOTE;
1153         }
1154         rangelock_init(&vp->v_rl);
1155
1156         /*
1157          * For the filesystems which do not use vfs_hash_insert(),
1158          * still initialize v_hash to have vfs_hash_index() useful.
1159          * E.g., nullfs uses vfs_hash_index() on the lower vnode for
1160          * its own hashing.
1161          */
1162         vp->v_hash = (uintptr_t)vp >> vnsz2log;
1163
1164         *vpp = vp;
1165         return (0);
1166 }
1167
1168 /*
1169  * Delete from old mount point vnode list, if on one.
1170  */
1171 static void
1172 delmntque(struct vnode *vp)
1173 {
1174         struct mount *mp;
1175         int active;
1176
1177         mp = vp->v_mount;
1178         if (mp == NULL)
1179                 return;
1180         MNT_ILOCK(mp);
1181         VI_LOCK(vp);
1182         KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
1183             ("Active vnode list size %d > Vnode list size %d",
1184              mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
1185         active = vp->v_iflag & VI_ACTIVE;
1186         vp->v_iflag &= ~VI_ACTIVE;
1187         if (active) {
1188                 mtx_lock(&vnode_free_list_mtx);
1189                 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
1190                 mp->mnt_activevnodelistsize--;
1191                 mtx_unlock(&vnode_free_list_mtx);
1192         }
1193         vp->v_mount = NULL;
1194         VI_UNLOCK(vp);
1195         VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1196                 ("bad mount point vnode list size"));
1197         TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1198         mp->mnt_nvnodelistsize--;
1199         MNT_REL(mp);
1200         MNT_IUNLOCK(mp);
1201 }
1202
/*
 * Default destructor that insmntque() hands to insmntque1(); it is
 * invoked when the vnode cannot be inserted into the mount's list.
 * dtr_arg is unused.
 */
static void
insmntque_stddtr(struct vnode *vp, void *dtr_arg)
{

        /* Detach private data and switch to the dead vnode operations. */
        vp->v_data = NULL;
        vp->v_op = &dead_vnodeops;
        /* Destroy the vnode, then release it with vput(). */
        vgone(vp);
        vput(vp);
}
1212
1213 /*
1214  * Insert into list of vnodes for the new mount point, if available.
1215  */
1216 int
1217 insmntque1(struct vnode *vp, struct mount *mp,
1218         void (*dtr)(struct vnode *, void *), void *dtr_arg)
1219 {
1220
1221         KASSERT(vp->v_mount == NULL,
1222                 ("insmntque: vnode already on per mount vnode list"));
1223         VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1224         ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
1225
1226         /*
1227          * We acquire the vnode interlock early to ensure that the
1228          * vnode cannot be recycled by another process releasing a
1229          * holdcnt on it before we get it on both the vnode list
1230          * and the active vnode list. The mount mutex protects only
1231          * manipulation of the vnode list and the vnode freelist
1232          * mutex protects only manipulation of the active vnode list.
1233          * Hence the need to hold the vnode interlock throughout.
1234          */
1235         MNT_ILOCK(mp);
1236         VI_LOCK(vp);
1237         if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1238             ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1239             mp->mnt_nvnodelistsize == 0)) &&
1240             (vp->v_vflag & VV_FORCEINSMQ) == 0) {
1241                 VI_UNLOCK(vp);
1242                 MNT_IUNLOCK(mp);
1243                 if (dtr != NULL)
1244                         dtr(vp, dtr_arg);
1245                 return (EBUSY);
1246         }
1247         vp->v_mount = mp;
1248         MNT_REF(mp);
1249         TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1250         VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1251                 ("neg mount point vnode list size"));
1252         mp->mnt_nvnodelistsize++;
1253         KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
1254             ("Activating already active vnode"));
1255         vp->v_iflag |= VI_ACTIVE;
1256         mtx_lock(&vnode_free_list_mtx);
1257         TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
1258         mp->mnt_activevnodelistsize++;
1259         mtx_unlock(&vnode_free_list_mtx);
1260         VI_UNLOCK(vp);
1261         MNT_IUNLOCK(mp);
1262         return (0);
1263 }
1264
/*
 * Convenience wrapper around insmntque1() that uses insmntque_stddtr()
 * with a NULL argument as the destructor.  Returns whatever insmntque1()
 * returns.
 */
int
insmntque(struct vnode *vp, struct mount *mp)
{

        return (insmntque1(vp, mp, insmntque_stddtr, NULL));
}
1271
1272 /*
1273  * Flush out and invalidate all buffers associated with a bufobj
1274  * Called with the underlying object locked.
1275  */
int
bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
{
        int error;

        /*
         * Overview: optionally write back dirty buffers (V_SAVE), flush
         * the clean and dirty buffer lists, wait for outstanding I/O and
         * finally, unless V_ALT, V_NORMAL or V_CLEANONLY was requested,
         * remove the pages of the backing VM object.  Returns 0 on
         * success or the first error reported by bufobj_wwait(),
         * BO_SYNC() or flushbuflist().  The bufobj lock is taken and
         * released internally.
         */
        BO_LOCK(bo);
        if (flags & V_SAVE) {
                /* Let in-flight writes finish before syncing. */
                error = bufobj_wwait(bo, slpflag, slptimeo);
                if (error) {
                        BO_UNLOCK(bo);
                        return (error);
                }
                if (bo->bo_dirty.bv_cnt > 0) {
                        /* BO_SYNC() is called with the bufobj unlocked. */
                        BO_UNLOCK(bo);
                        if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
                                return (error);
                        /*
                         * XXX We could save a lock/unlock if this was only
                         * enabled under INVARIANTS
                         */
                        BO_LOCK(bo);
                        if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
                                panic("vinvalbuf: dirty bufs");
                }
        }
        /*
         * If you alter this loop please notice that interlock is dropped and
         * reacquired in flushbuflist.  Special care is needed to ensure that
         * no race conditions occur from this.
         *
         * The loop repeats for as long as flushbuflist() reports EAGAIN;
         * any other error aborts the invalidation.  The dirty list is
         * left alone when V_CLEANONLY is set.
         */
        do {
                error = flushbuflist(&bo->bo_clean,
                    flags, bo, slpflag, slptimeo);
                if (error == 0 && !(flags & V_CLEANONLY))
                        error = flushbuflist(&bo->bo_dirty,
                            flags, bo, slpflag, slptimeo);
                if (error != 0 && error != EAGAIN) {
                        BO_UNLOCK(bo);
                        return (error);
                }
        } while (error != 0);

        /*
         * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
         * have write I/O in-progress but if there is a VM object then the
         * VM object can also have read-I/O in-progress.
         *
         * The bufobj lock is dropped while waiting on the VM object, so
         * bo_numoutput is rechecked after relocking.
         */
        do {
                bufobj_wwait(bo, 0, 0);
                BO_UNLOCK(bo);
                if (bo->bo_object != NULL) {
                        VM_OBJECT_WLOCK(bo->bo_object);
                        vm_object_pip_wait(bo->bo_object, "bovlbx");
                        VM_OBJECT_WUNLOCK(bo->bo_object);
                }
                BO_LOCK(bo);
        } while (bo->bo_numoutput > 0);
        BO_UNLOCK(bo);

        /*
         * Destroy the copy in the VM cache, too.  With V_SAVE only clean
         * pages are removed (OBJPR_CLEANONLY).
         */
        if (bo->bo_object != NULL &&
            (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
                VM_OBJECT_WLOCK(bo->bo_object);
                vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
                    OBJPR_CLEANONLY : 0);
                VM_OBJECT_WUNLOCK(bo->bo_object);
        }

#ifdef INVARIANTS
        /* A full invalidation must leave both buffer lists empty. */
        BO_LOCK(bo);
        if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
            (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
                panic("vinvalbuf: flush failed");
        BO_UNLOCK(bo);
#endif
        return (0);
}
1355
1356 /*
1357  * Flush out and invalidate all buffers associated with a vnode.
1358  * Called with the underlying object locked.
1359  */
1360 int
1361 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1362 {
1363
1364         CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
1365         ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1366         if (vp->v_object != NULL && vp->v_object->handle != vp)
1367                 return (0);
1368         return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1369 }
1370
1371 /*
1372  * Flush out buffers on the specified list.
1373  *
1374  */
static int
flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
    int slptimeo)
{
        struct buf *bp, *nbp;
        int retval, error;
        daddr_t lblkno;
        b_xflags_t xflags;

        /*
         * Must be entered with the bufobj write-locked.  The lock is
         * handed to BUF_TIMELOCK() as the interlock for each buffer and
         * is re-taken before every return and before the next iteration.
         *
         * Returns 0 if no buffer was selected for flushing, EAGAIN if at
         * least one was processed (the caller should rescan), or the
         * error from BUF_TIMELOCK() (ENOLCK is reported as EAGAIN).
         */
        ASSERT_BO_WLOCKED(bo);

        retval = 0;
        TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
                /*
                 * V_NORMAL skips buffers marked BX_ALTDATA; V_ALT skips
                 * buffers that are not.
                 */
                if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
                    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
                        continue;
                }
                /*
                 * Remember the identity of the next buffer so it can be
                 * revalidated once the bufobj lock has been re-taken.
                 */
                lblkno = 0;
                xflags = 0;
                if (nbp != NULL) {
                        lblkno = nbp->b_lblkno;
                        xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
                }
                retval = EAGAIN;
                error = BUF_TIMELOCK(bp,
                    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
                    "flushbuf", slpflag, slptimeo);
                if (error) {
                        BO_LOCK(bo);
                        return (error != ENOLCK ? error : EAGAIN);
                }
                KASSERT(bp->b_bufobj == bo,
                    ("bp %p wrong b_bufobj %p should be %p",
                    bp, bp->b_bufobj, bo));
                if (bp->b_bufobj != bo) {       /* XXX: necessary ? */
                        BUF_UNLOCK(bp);
                        BO_LOCK(bo);
                        return (EAGAIN);
                }
                /*
                 * XXX Since there are no node locks for NFS, I
                 * believe there is a slight chance that a delayed
                 * write will occur while sleeping just above, so
                 * check for it.
                 */
                if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
                    (flags & V_SAVE)) {
                        /* Push the delayed write out asynchronously. */
                        bremfree(bp);
                        bp->b_flags |= B_ASYNC;
                        bwrite(bp);
                        BO_LOCK(bo);
                        return (EAGAIN);        /* XXX: why not loop ? */
                }
                /* Invalidate the buffer and release it. */
                bremfree(bp);
                bp->b_flags |= (B_INVAL | B_RELBUF);
                bp->b_flags &= ~B_ASYNC;
                brelse(bp);
                BO_LOCK(bo);
                /*
                 * The lock was dropped above; stop if the saved next
                 * buffer moved to another bufobj, block or list.
                 */
                if (nbp != NULL &&
                    (nbp->b_bufobj != bo ||
                     nbp->b_lblkno != lblkno ||
                     (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags))
                        break;                  /* nbp invalid */
        }
        return (retval);
}
1441
1442 /*
1443  * Truncate a file's buffer and pages to a specified length.  This
1444  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1445  * sync activity.
1446  */
1447 int
1448 vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
1449 {
1450         struct buf *bp, *nbp;
1451         int anyfreed;
1452         int trunclbn;
1453         struct bufobj *bo;
1454
1455         CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
1456             vp, cred, blksize, (uintmax_t)length);
1457
1458         /*
1459          * Round up to the *next* lbn.
1460          */
1461         trunclbn = (length + blksize - 1) / blksize;
1462
1463         ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1464 restart:
1465         bo = &vp->v_bufobj;
1466         BO_LOCK(bo);
1467         anyfreed = 1;
1468         for (;anyfreed;) {
1469                 anyfreed = 0;
1470                 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1471                         if (bp->b_lblkno < trunclbn)
1472                                 continue;
1473                         if (BUF_LOCK(bp,
1474                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1475                             BO_LOCKPTR(bo)) == ENOLCK)
1476                                 goto restart;
1477
1478                         bremfree(bp);
1479                         bp->b_flags |= (B_INVAL | B_RELBUF);
1480                         bp->b_flags &= ~B_ASYNC;
1481                         brelse(bp);
1482                         anyfreed = 1;
1483
1484                         BO_LOCK(bo);
1485                         if (nbp != NULL &&
1486                             (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1487                             (nbp->b_vp != vp) ||
1488                             (nbp->b_flags & B_DELWRI))) {
1489                                 BO_UNLOCK(bo);
1490                                 goto restart;
1491                         }
1492                 }
1493
1494                 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1495                         if (bp->b_lblkno < trunclbn)
1496                                 continue;
1497                         if (BUF_LOCK(bp,
1498                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1499                             BO_LOCKPTR(bo)) == ENOLCK)
1500                                 goto restart;
1501                         bremfree(bp);
1502                         bp->b_flags |= (B_INVAL | B_RELBUF);
1503                         bp->b_flags &= ~B_ASYNC;
1504                         brelse(bp);
1505                         anyfreed = 1;
1506
1507                         BO_LOCK(bo);
1508                         if (nbp != NULL &&
1509                             (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1510                             (nbp->b_vp != vp) ||
1511                             (nbp->b_flags & B_DELWRI) == 0)) {
1512                                 BO_UNLOCK(bo);
1513                                 goto restart;
1514                         }
1515                 }
1516         }
1517
1518         if (length > 0) {
1519 restartsync:
1520                 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1521                         if (bp->b_lblkno > 0)
1522                                 continue;
1523                         /*
1524                          * Since we hold the vnode lock this should only
1525                          * fail if we're racing with the buf daemon.
1526                          */
1527                         if (BUF_LOCK(bp,
1528                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1529                             BO_LOCKPTR(bo)) == ENOLCK) {
1530                                 goto restart;
1531                         }
1532                         VNASSERT((bp->b_flags & B_DELWRI), vp,
1533                             ("buf(%p) on dirty queue without DELWRI", bp));
1534
1535                         bremfree(bp);
1536                         bawrite(bp);
1537                         BO_LOCK(bo);
1538                         goto restartsync;
1539                 }
1540         }
1541
1542         bufobj_wwait(bo, 0, 0);
1543         BO_UNLOCK(bo);
1544         vnode_pager_setsize(vp, length);
1545
1546         return (0);
1547 }
1548
1549 static void
1550 buf_vlist_remove(struct buf *bp)
1551 {
1552         struct bufv *bv;
1553
1554         KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1555         ASSERT_BO_WLOCKED(bp->b_bufobj);
1556         KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1557             (BX_VNDIRTY|BX_VNCLEAN),
1558             ("buf_vlist_remove: Buf %p is on two lists", bp));
1559         if (bp->b_xflags & BX_VNDIRTY)
1560                 bv = &bp->b_bufobj->bo_dirty;
1561         else
1562                 bv = &bp->b_bufobj->bo_clean;
1563         BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
1564         TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1565         bv->bv_cnt--;
1566         bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1567 }
1568
1569 /*
1570  * Add the buffer to the sorted clean or dirty block list.
1571  *
1572  * NOTE: xflags is passed as a constant, optimizing this inline function!
1573  */
1574 static void
1575 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1576 {
1577         struct bufv *bv;
1578         struct buf *n;
1579         int error;
1580
1581         ASSERT_BO_WLOCKED(bo);
1582         KASSERT((bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo));
1583         KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1584             ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1585         bp->b_xflags |= xflags;
1586         if (xflags & BX_VNDIRTY)
1587                 bv = &bo->bo_dirty;
1588         else
1589                 bv = &bo->bo_clean;
1590
1591         /*
1592          * Keep the list ordered.  Optimize empty list insertion.  Assume
1593          * we tend to grow at the tail so lookup_le should usually be cheaper
1594          * than _ge.
1595          */
1596         if (bv->bv_cnt == 0 ||
1597             bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
1598                 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1599         else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
1600                 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
1601         else
1602                 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
1603         error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
1604         if (error)
1605                 panic("buf_vlist_add:  Preallocated nodes insufficient.");
1606         bv->bv_cnt++;
1607 }
1608
1609 /*
1610  * Lookup a buffer using the splay tree.  Note that we specifically avoid
1611  * shadow buffers used in background bitmap writes.
1612  *
1613  * This code isn't quite efficient as it could be because we are maintaining
1614  * two sorted lists and do not know which list the block resides in.
1615  *
1616  * During a "make buildworld" the desired buffer is found at one of
1617  * the roots more than 60% of the time.  Thus, checking both roots
1618  * before performing either splay eliminates unnecessary splays on the
1619  * first tree splayed.
1620  */
1621 struct buf *
1622 gbincore(struct bufobj *bo, daddr_t lblkno)
1623 {
1624         struct buf *bp;
1625
1626         ASSERT_BO_LOCKED(bo);
1627         bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
1628         if (bp != NULL)
1629                 return (bp);
1630         return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
1631 }
1632
1633 /*
1634  * Associate a buffer with a vnode.
1635  */
1636 void
1637 bgetvp(struct vnode *vp, struct buf *bp)
1638 {
1639         struct bufobj *bo;
1640
1641         bo = &vp->v_bufobj;
1642         ASSERT_BO_WLOCKED(bo);
1643         VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1644
1645         CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1646         VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1647             ("bgetvp: bp already attached! %p", bp));
1648
1649         vhold(vp);
1650         bp->b_vp = vp;
1651         bp->b_bufobj = bo;
1652         /*
1653          * Insert onto list for new vnode.
1654          */
1655         buf_vlist_add(bp, bo, BX_VNCLEAN);
1656 }
1657
1658 /*
1659  * Disassociate a buffer from a vnode.
1660  */
1661 void
1662 brelvp(struct buf *bp)
1663 {
1664         struct bufobj *bo;
1665         struct vnode *vp;
1666
1667         CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1668         KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1669
1670         /*
1671          * Delete from old vnode list, if on one.
1672          */
1673         vp = bp->b_vp;          /* XXX */
1674         bo = bp->b_bufobj;
1675         BO_LOCK(bo);
1676         if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1677                 buf_vlist_remove(bp);
1678         else
1679                 panic("brelvp: Buffer %p not on queue.", bp);
1680         if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1681                 bo->bo_flag &= ~BO_ONWORKLST;
1682                 mtx_lock(&sync_mtx);
1683                 LIST_REMOVE(bo, bo_synclist);
1684                 syncer_worklist_len--;
1685                 mtx_unlock(&sync_mtx);
1686         }
1687         bp->b_vp = NULL;
1688         bp->b_bufobj = NULL;
1689         BO_UNLOCK(bo);
1690         vdrop(vp);
1691 }
1692
1693 /*
1694  * Add an item to the syncer work queue.
1695  */
1696 static void
1697 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1698 {
1699         int slot;
1700
1701         ASSERT_BO_WLOCKED(bo);
1702
1703         mtx_lock(&sync_mtx);
1704         if (bo->bo_flag & BO_ONWORKLST)
1705                 LIST_REMOVE(bo, bo_synclist);
1706         else {
1707                 bo->bo_flag |= BO_ONWORKLST;
1708                 syncer_worklist_len++;
1709         }
1710
1711         if (delay > syncer_maxdelay - 2)
1712                 delay = syncer_maxdelay - 2;
1713         slot = (syncer_delayno + delay) & syncer_mask;
1714
1715         LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
1716         mtx_unlock(&sync_mtx);
1717 }
1718
1719 static int
1720 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1721 {
1722         int error, len;
1723
1724         mtx_lock(&sync_mtx);
1725         len = syncer_worklist_len - sync_vnode_count;
1726         mtx_unlock(&sync_mtx);
1727         error = SYSCTL_OUT(req, &len, sizeof(len));
1728         return (error);
1729 }
1730
1731 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1732     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1733
1734 static struct proc *updateproc;
1735 static void sched_sync(void);
1736 static struct kproc_desc up_kp = {
1737         "syncer",
1738         sched_sync,
1739         &updateproc
1740 };
1741 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1742
1743 static int
1744 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1745 {
1746         struct vnode *vp;
1747         struct mount *mp;
1748
1749         *bo = LIST_FIRST(slp);
1750         if (*bo == NULL)
1751                 return (0);
1752         vp = (*bo)->__bo_vnode; /* XXX */
1753         if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
1754                 return (1);
1755         /*
1756          * We use vhold in case the vnode does not
1757          * successfully sync.  vhold prevents the vnode from
1758          * going away when we unlock the sync_mtx so that
1759          * we can acquire the vnode interlock.
1760          */
1761         vholdl(vp);
1762         mtx_unlock(&sync_mtx);
1763         VI_UNLOCK(vp);
1764         if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1765                 vdrop(vp);
1766                 mtx_lock(&sync_mtx);
1767                 return (*bo == LIST_FIRST(slp));
1768         }
1769         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1770         (void) VOP_FSYNC(vp, MNT_LAZY, td);
1771         VOP_UNLOCK(vp, 0);
1772         vn_finished_write(mp);
1773         BO_LOCK(*bo);
1774         if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1775                 /*
1776                  * Put us back on the worklist.  The worklist
1777                  * routine will remove us from our current
1778                  * position and then add us back in at a later
1779                  * position.
1780                  */
1781                 vn_syncer_add_to_worklist(*bo, syncdelay);
1782         }
1783         BO_UNLOCK(*bo);
1784         vdrop(vp);
1785         mtx_lock(&sync_mtx);
1786         return (0);
1787 }
1788
1789 static int first_printf = 1;
1790
1791 /*
1792  * System filesystem synchronizer daemon.
1793  */
1794 static void
1795 sched_sync(void)
1796 {
1797         struct synclist *next, *slp;
1798         struct bufobj *bo;
1799         long starttime;
1800         struct thread *td = curthread;
1801         int last_work_seen;
1802         int net_worklist_len;
1803         int syncer_final_iter;
1804         int error;
1805
1806         last_work_seen = 0;
1807         syncer_final_iter = 0;
1808         syncer_state = SYNCER_RUNNING;
1809         starttime = time_uptime;
1810         td->td_pflags |= TDP_NORUNNINGBUF;
1811
1812         EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1813             SHUTDOWN_PRI_LAST);
1814
1815         mtx_lock(&sync_mtx);
1816         for (;;) {
1817                 if (syncer_state == SYNCER_FINAL_DELAY &&
1818                     syncer_final_iter == 0) {
1819                         mtx_unlock(&sync_mtx);
1820                         kproc_suspend_check(td->td_proc);
1821                         mtx_lock(&sync_mtx);
1822                 }
1823                 net_worklist_len = syncer_worklist_len - sync_vnode_count;
1824                 if (syncer_state != SYNCER_RUNNING &&
1825                     starttime != time_uptime) {
1826                         if (first_printf) {
1827                                 printf("\nSyncing disks, vnodes remaining...");
1828                                 first_printf = 0;
1829                         }
1830                         printf("%d ", net_worklist_len);
1831                 }
1832                 starttime = time_uptime;
1833
1834                 /*
1835                  * Push files whose dirty time has expired.  Be careful
1836                  * of interrupt race on slp queue.
1837                  *
1838                  * Skip over empty worklist slots when shutting down.
1839                  */
1840                 do {
1841                         slp = &syncer_workitem_pending[syncer_delayno];
1842                         syncer_delayno += 1;
1843                         if (syncer_delayno == syncer_maxdelay)
1844                                 syncer_delayno = 0;
1845                         next = &syncer_workitem_pending[syncer_delayno];
1846                         /*
1847                          * If the worklist has wrapped since the
1848                          * it was emptied of all but syncer vnodes,
1849                          * switch to the FINAL_DELAY state and run
1850                          * for one more second.
1851                          */
1852                         if (syncer_state == SYNCER_SHUTTING_DOWN &&
1853                             net_worklist_len == 0 &&
1854                             last_work_seen == syncer_delayno) {
1855                                 syncer_state = SYNCER_FINAL_DELAY;
1856                                 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1857                         }
1858                 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1859                     syncer_worklist_len > 0);
1860
1861                 /*
1862                  * Keep track of the last time there was anything
1863                  * on the worklist other than syncer vnodes.
1864                  * Return to the SHUTTING_DOWN state if any
1865                  * new work appears.
1866                  */
1867                 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1868                         last_work_seen = syncer_delayno;
1869                 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1870                         syncer_state = SYNCER_SHUTTING_DOWN;
1871                 while (!LIST_EMPTY(slp)) {
1872                         error = sync_vnode(slp, &bo, td);
1873                         if (error == 1) {
1874                                 LIST_REMOVE(bo, bo_synclist);
1875                                 LIST_INSERT_HEAD(next, bo, bo_synclist);
1876                                 continue;
1877                         }
1878
1879                         if (first_printf == 0)
1880                                 wdog_kern_pat(WD_LASTVAL);
1881
1882                 }
1883                 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1884                         syncer_final_iter--;
1885                 /*
1886                  * The variable rushjob allows the kernel to speed up the
1887                  * processing of the filesystem syncer process. A rushjob
1888                  * value of N tells the filesystem syncer to process the next
1889                  * N seconds worth of work on its queue ASAP. Currently rushjob
1890                  * is used by the soft update code to speed up the filesystem
1891                  * syncer process when the incore state is getting so far
1892                  * ahead of the disk that the kernel memory pool is being
1893                  * threatened with exhaustion.
1894                  */
1895                 if (rushjob > 0) {
1896                         rushjob -= 1;
1897                         continue;
1898                 }
1899                 /*
1900                  * Just sleep for a short period of time between
1901                  * iterations when shutting down to allow some I/O
1902                  * to happen.
1903                  *
1904                  * If it has taken us less than a second to process the
1905                  * current work, then wait. Otherwise start right over
1906                  * again. We can still lose time if any single round
1907                  * takes more than two seconds, but it does not really
1908                  * matter as we are just trying to generally pace the
1909                  * filesystem activity.
1910                  */
1911                 if (syncer_state != SYNCER_RUNNING ||
1912                     time_uptime == starttime) {
1913                         thread_lock(td);
1914                         sched_prio(td, PPAUSE);
1915                         thread_unlock(td);
1916                 }
1917                 if (syncer_state != SYNCER_RUNNING)
1918                         cv_timedwait(&sync_wakeup, &sync_mtx,
1919                             hz / SYNCER_SHUTDOWN_SPEEDUP);
1920                 else if (time_uptime == starttime)
1921                         cv_timedwait(&sync_wakeup, &sync_mtx, hz);
1922         }
1923 }
1924
1925 /*
1926  * Request the syncer daemon to speed up its work.
1927  * We never push it to speed up more than half of its
1928  * normal turn time, otherwise it could take over the cpu.
1929  */
1930 int
1931 speedup_syncer(void)
1932 {
1933         int ret = 0;
1934
1935         mtx_lock(&sync_mtx);
1936         if (rushjob < syncdelay / 2) {
1937                 rushjob += 1;
1938                 stat_rush_requests += 1;
1939                 ret = 1;
1940         }
1941         mtx_unlock(&sync_mtx);
1942         cv_broadcast(&sync_wakeup);
1943         return (ret);
1944 }
1945
1946 /*
1947  * Tell the syncer to speed up its work and run though its work
1948  * list several times, then tell it to shut down.
1949  */
1950 static void
1951 syncer_shutdown(void *arg, int howto)
1952 {
1953
1954         if (howto & RB_NOSYNC)
1955                 return;
1956         mtx_lock(&sync_mtx);
1957         syncer_state = SYNCER_SHUTTING_DOWN;
1958         rushjob = 0;
1959         mtx_unlock(&sync_mtx);
1960         cv_broadcast(&sync_wakeup);
1961         kproc_shutdown(arg, howto);
1962 }
1963
1964 void
1965 syncer_suspend(void)
1966 {
1967
1968         syncer_shutdown(updateproc, 0);
1969 }
1970
1971 void
1972 syncer_resume(void)
1973 {
1974
1975         mtx_lock(&sync_mtx);
1976         first_printf = 1;
1977         syncer_state = SYNCER_RUNNING;
1978         mtx_unlock(&sync_mtx);
1979         cv_broadcast(&sync_wakeup);
1980         kproc_resume(updateproc);
1981 }
1982
1983 /*
1984  * Reassign a buffer from one vnode to another.
1985  * Used to assign file specific control information
1986  * (indirect blocks) to the vnode to which they belong.
1987  */
1988 void
1989 reassignbuf(struct buf *bp)
1990 {
1991         struct vnode *vp;
1992         struct bufobj *bo;
1993         int delay;
1994 #ifdef INVARIANTS
1995         struct bufv *bv;
1996 #endif
1997
1998         vp = bp->b_vp;
1999         bo = bp->b_bufobj;
2000         ++reassignbufcalls;
2001
2002         CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
2003             bp, bp->b_vp, bp->b_flags);
2004         /*
2005          * B_PAGING flagged buffers cannot be reassigned because their vp
2006          * is not fully linked in.
2007          */
2008         if (bp->b_flags & B_PAGING)
2009                 panic("cannot reassign paging buffer");
2010
2011         /*
2012          * Delete from old vnode list, if on one.
2013          */
2014         BO_LOCK(bo);
2015         if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
2016                 buf_vlist_remove(bp);
2017         else
2018                 panic("reassignbuf: Buffer %p not on queue.", bp);
2019         /*
2020          * If dirty, put on list of dirty buffers; otherwise insert onto list
2021          * of clean buffers.
2022          */
2023         if (bp->b_flags & B_DELWRI) {
2024                 if ((bo->bo_flag & BO_ONWORKLST) == 0) {
2025                         switch (vp->v_type) {
2026                         case VDIR:
2027                                 delay = dirdelay;
2028                                 break;
2029                         case VCHR:
2030                                 delay = metadelay;
2031                                 break;
2032                         default:
2033                                 delay = filedelay;
2034                         }
2035                         vn_syncer_add_to_worklist(bo, delay);
2036                 }
2037                 buf_vlist_add(bp, bo, BX_VNDIRTY);
2038         } else {
2039                 buf_vlist_add(bp, bo, BX_VNCLEAN);
2040
2041                 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2042                         mtx_lock(&sync_mtx);
2043                         LIST_REMOVE(bo, bo_synclist);
2044                         syncer_worklist_len--;
2045                         mtx_unlock(&sync_mtx);
2046                         bo->bo_flag &= ~BO_ONWORKLST;
2047                 }
2048         }
2049 #ifdef INVARIANTS
2050         bv = &bo->bo_clean;
2051         bp = TAILQ_FIRST(&bv->bv_hd);
2052         KASSERT(bp == NULL || bp->b_bufobj == bo,
2053             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2054         bp = TAILQ_LAST(&bv->bv_hd, buflists);
2055         KASSERT(bp == NULL || bp->b_bufobj == bo,
2056             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2057         bv = &bo->bo_dirty;
2058         bp = TAILQ_FIRST(&bv->bv_hd);
2059         KASSERT(bp == NULL || bp->b_bufobj == bo,
2060             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2061         bp = TAILQ_LAST(&bv->bv_hd, buflists);
2062         KASSERT(bp == NULL || bp->b_bufobj == bo,
2063             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2064 #endif
2065         BO_UNLOCK(bo);
2066 }
2067
2068 /*
2069  * Increment the use and hold counts on the vnode, taking care to reference
2070  * the driver's usecount if this is a chardev.  The vholdl() will remove
2071  * the vnode from the free list if it is presently free.  Requires the
2072  * vnode interlock and returns with it held.
2073  */
2074 static void
2075 v_incr_usecount(struct vnode *vp)
2076 {
2077
2078         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2079         vholdl(vp);
2080         vp->v_usecount++;
2081         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2082                 dev_lock();
2083                 vp->v_rdev->si_usecount++;
2084                 dev_unlock();
2085         }
2086 }
2087
2088 /*
2089  * Turn a holdcnt into a use+holdcnt such that only one call to
2090  * v_decr_usecount is needed.
2091  */
2092 static void
2093 v_upgrade_usecount(struct vnode *vp)
2094 {
2095
2096         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2097         vp->v_usecount++;
2098         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2099                 dev_lock();
2100                 vp->v_rdev->si_usecount++;
2101                 dev_unlock();
2102         }
2103 }
2104
2105 /*
2106  * Decrement the vnode use and hold count along with the driver's usecount
2107  * if this is a chardev.  The vdropl() below releases the vnode interlock
2108  * as it may free the vnode.
2109  */
2110 static void
2111 v_decr_usecount(struct vnode *vp)
2112 {
2113
2114         ASSERT_VI_LOCKED(vp, __FUNCTION__);
2115         VNASSERT(vp->v_usecount > 0, vp,
2116             ("v_decr_usecount: negative usecount"));
2117         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2118         vp->v_usecount--;
2119         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2120                 dev_lock();
2121                 vp->v_rdev->si_usecount--;
2122                 dev_unlock();
2123         }
2124         vdropl(vp);
2125 }
2126
2127 /*
2128  * Decrement only the use count and driver use count.  This is intended to
2129  * be paired with a follow on vdropl() to release the remaining hold count.
2130  * In this way we may vgone() a vnode with a 0 usecount without risk of
2131  * having it end up on a free list because the hold count is kept above 0.
2132  */
2133 static void
2134 v_decr_useonly(struct vnode *vp)
2135 {
2136
2137         ASSERT_VI_LOCKED(vp, __FUNCTION__);
2138         VNASSERT(vp->v_usecount > 0, vp,
2139             ("v_decr_useonly: negative usecount"));
2140         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2141         vp->v_usecount--;
2142         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2143                 dev_lock();
2144                 vp->v_rdev->si_usecount--;
2145                 dev_unlock();
2146         }
2147 }
2148
2149 /*
2150  * Grab a particular vnode from the free list, increment its
2151  * reference count and lock it.  VI_DOOMED is set if the vnode
2152  * is being destroyed.  Only callers who specify LK_RETRY will
2153  * see doomed vnodes.  If inactive processing was delayed in
2154  * vput try to do it here.
2155  */
2156 int
2157 vget(struct vnode *vp, int flags, struct thread *td)
2158 {
2159         int error;
2160
2161         error = 0;
2162         VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2163             ("vget: invalid lock operation"));
2164         CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2165
2166         if ((flags & LK_INTERLOCK) == 0)
2167                 VI_LOCK(vp);
2168         vholdl(vp);
2169         if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
2170                 vdrop(vp);
2171                 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2172                     vp);
2173                 return (error);
2174         }
2175         if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2176                 panic("vget: vn_lock failed to return ENOENT\n");
2177         VI_LOCK(vp);
2178         /* Upgrade our holdcnt to a usecount. */
2179         v_upgrade_usecount(vp);
2180         /*
2181          * We don't guarantee that any particular close will
2182          * trigger inactive processing so just make a best effort
2183          * here at preventing a reference to a removed file.  If
2184          * we don't succeed no harm is done.
2185          */
2186         if (vp->v_iflag & VI_OWEINACT) {
2187                 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2188                     (flags & LK_NOWAIT) == 0)
2189                         vinactive(vp, td);
2190                 vp->v_iflag &= ~VI_OWEINACT;
2191         }
2192         VI_UNLOCK(vp);
2193         return (0);
2194 }
2195
2196 /*
2197  * Increase the reference count of a vnode.
2198  */
2199 void
2200 vref(struct vnode *vp)
2201 {
2202
2203         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2204         VI_LOCK(vp);
2205         v_incr_usecount(vp);
2206         VI_UNLOCK(vp);
2207 }
2208
2209 /*
2210  * Return reference count of a vnode.
2211  *
2212  * The results of this call are only guaranteed when some mechanism other
2213  * than the VI lock is used to stop other processes from gaining references
2214  * to the vnode.  This may be the case if the caller holds the only reference.
2215  * This is also useful when stale data is acceptable as race conditions may
2216  * be accounted for by some other means.
2217  */
2218 int
2219 vrefcnt(struct vnode *vp)
2220 {
2221         int usecnt;
2222
2223         VI_LOCK(vp);
2224         usecnt = vp->v_usecount;
2225         VI_UNLOCK(vp);
2226
2227         return (usecnt);
2228 }
2229
2230 #define VPUTX_VRELE     1
2231 #define VPUTX_VPUT      2
2232 #define VPUTX_VUNREF    3
2233
2234 static void
2235 vputx(struct vnode *vp, int func)
2236 {
2237         int error;
2238
2239         KASSERT(vp != NULL, ("vputx: null vp"));
2240         if (func == VPUTX_VUNREF)
2241                 ASSERT_VOP_LOCKED(vp, "vunref");
2242         else if (func == VPUTX_VPUT)
2243                 ASSERT_VOP_LOCKED(vp, "vput");
2244         else
2245                 KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
2246         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2247         VI_LOCK(vp);
2248
2249         /* Skip this v_writecount check if we're going to panic below. */
2250         VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2251             ("vputx: missed vn_close"));
2252         error = 0;
2253
2254         if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2255             vp->v_usecount == 1)) {
2256                 if (func == VPUTX_VPUT)
2257                         VOP_UNLOCK(vp, 0);
2258                 v_decr_usecount(vp);
2259                 return;
2260         }
2261
2262         if (vp->v_usecount != 1) {
2263                 vprint("vputx: negative ref count", vp);
2264                 panic("vputx: negative ref cnt");
2265         }
2266         CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
2267         /*
2268          * We want to hold the vnode until the inactive finishes to
2269          * prevent vgone() races.  We drop the use count here and the
2270          * hold count below when we're done.
2271          */
2272         v_decr_useonly(vp);
2273         /*
2274          * We must call VOP_INACTIVE with the node locked. Mark
2275          * as VI_DOINGINACT to avoid recursion.
2276          */
2277         vp->v_iflag |= VI_OWEINACT;
2278         switch (func) {
2279         case VPUTX_VRELE:
2280                 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2281                 VI_LOCK(vp);
2282                 break;
2283         case VPUTX_VPUT:
2284                 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2285                         error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
2286                             LK_NOWAIT);
2287                         VI_LOCK(vp);
2288                 }
2289                 break;
2290         case VPUTX_VUNREF:
2291                 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2292                         error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
2293                         VI_LOCK(vp);
2294                 }
2295                 break;
2296         }
2297         if (vp->v_usecount > 0)
2298                 vp->v_iflag &= ~VI_OWEINACT;
2299         if (error == 0) {
2300                 if (vp->v_iflag & VI_OWEINACT)
2301                         vinactive(vp, curthread);
2302                 if (func != VPUTX_VUNREF)
2303                         VOP_UNLOCK(vp, 0);
2304         }
2305         vdropl(vp);
2306 }
2307
2308 /*
2309  * Vnode put/release.
2310  * If count drops to zero, call inactive routine and return to freelist.
2311  */
2312 void
2313 vrele(struct vnode *vp)
2314 {
2315
2316         vputx(vp, VPUTX_VRELE);
2317 }
2318
2319 /*
2320  * Release an already locked vnode.  This give the same effects as
2321  * unlock+vrele(), but takes less time and avoids releasing and
2322  * re-aquiring the lock (as vrele() acquires the lock internally.)
2323  */
2324 void
2325 vput(struct vnode *vp)
2326 {
2327
2328         vputx(vp, VPUTX_VPUT);
2329 }
2330
2331 /*
2332  * Release an exclusively locked vnode. Do not unlock the vnode lock.
2333  */
2334 void
2335 vunref(struct vnode *vp)
2336 {
2337
2338         vputx(vp, VPUTX_VUNREF);
2339 }
2340
/*
 * Somebody doesn't want the vnode recycled: take a hold reference.
 * The vnode interlock is acquired and released around vholdl().
 */
void
vhold(struct vnode *vp)
{

        VI_LOCK(vp);
        vholdl(vp);
        VI_UNLOCK(vp);
}
2352
2353 /*
2354  * Increase the hold count and activate if this is the first reference.
2355  */
2356 void
2357 vholdl(struct vnode *vp)
2358 {
2359         struct mount *mp;
2360
2361         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2362 #ifdef INVARIANTS
2363         /* getnewvnode() calls v_incr_usecount() without holding interlock. */
2364         if (vp->v_type != VNON || vp->v_data != NULL) {
2365                 ASSERT_VI_LOCKED(vp, "vholdl");
2366                 VNASSERT(vp->v_holdcnt > 0 || (vp->v_iflag & VI_FREE) != 0,
2367                     vp, ("vholdl: free vnode is held"));
2368         }
2369 #endif
2370         vp->v_holdcnt++;
2371         if ((vp->v_iflag & VI_FREE) == 0)
2372                 return;
2373         VNASSERT(vp->v_holdcnt == 1, vp, ("vholdl: wrong hold count"));
2374         VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed."));
2375         /*
2376          * Remove a vnode from the free list, mark it as in use,
2377          * and put it on the active list.
2378          */
2379         mtx_lock(&vnode_free_list_mtx);
2380         TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
2381         freevnodes--;
2382         vp->v_iflag &= ~(VI_FREE|VI_AGE);
2383         KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
2384             ("Activating already active vnode"));
2385         vp->v_iflag |= VI_ACTIVE;
2386         mp = vp->v_mount;
2387         TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
2388         mp->mnt_activevnodelistsize++;
2389         mtx_unlock(&vnode_free_list_mtx);
2390 }
2391
/*
 * Note that there is one less who cares about this vnode.
 * vdrop() is the opposite of vhold(): it takes the vnode interlock and
 * hands off to vdropl(), which releases the interlock again.
 */
void
vdrop(struct vnode *vp)
{

        VI_LOCK(vp);
        vdropl(vp);
}
2403
2404 /*
2405  * Drop the hold count of the vnode.  If this is the last reference to
2406  * the vnode we place it on the free list unless it has been vgone'd
2407  * (marked VI_DOOMED) in which case we will free it.
2408  */
2409 void
2410 vdropl(struct vnode *vp)
2411 {
2412         struct bufobj *bo;
2413         struct mount *mp;
2414         int active;
2415
2416         ASSERT_VI_LOCKED(vp, "vdropl");
2417         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2418         if (vp->v_holdcnt <= 0)
2419                 panic("vdrop: holdcnt %d", vp->v_holdcnt);
2420         vp->v_holdcnt--;
2421         if (vp->v_holdcnt > 0) {
2422                 VI_UNLOCK(vp);
2423                 return;
2424         }
2425         if ((vp->v_iflag & VI_DOOMED) == 0) {
2426                 /*
2427                  * Mark a vnode as free: remove it from its active list
2428                  * and put it up for recycling on the freelist.
2429                  */
2430                 VNASSERT(vp->v_op != NULL, vp,
2431                     ("vdropl: vnode already reclaimed."));
2432                 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2433                     ("vnode already free"));
2434                 VNASSERT(vp->v_holdcnt == 0, vp,
2435                     ("vdropl: freeing when we shouldn't"));
2436                 active = vp->v_iflag & VI_ACTIVE;
2437                 vp->v_iflag &= ~VI_ACTIVE;
2438                 mp = vp->v_mount;
2439                 mtx_lock(&vnode_free_list_mtx);
2440                 if (active) {
2441                         TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
2442                             v_actfreelist);
2443                         mp->mnt_activevnodelistsize--;
2444                 }
2445                 if (vp->v_iflag & VI_AGE) {
2446                         TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_actfreelist);
2447                 } else {
2448                         TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
2449                 }
2450                 freevnodes++;
2451                 vp->v_iflag &= ~VI_AGE;
2452                 vp->v_iflag |= VI_FREE;
2453                 mtx_unlock(&vnode_free_list_mtx);
2454                 VI_UNLOCK(vp);
2455                 return;
2456         }
2457         /*
2458          * The vnode has been marked for destruction, so free it.
2459          */
2460         CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
2461         atomic_subtract_long(&numvnodes, 1);
2462         bo = &vp->v_bufobj;
2463         VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2464             ("cleaned vnode still on the free list."));
2465         VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
2466         VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
2467         VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
2468         VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
2469         VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
2470         VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
2471         VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
2472             ("clean blk trie not empty"));
2473         VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
2474         VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
2475             ("dirty blk trie not empty"));
2476         VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
2477         VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
2478         VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
2479         VI_UNLOCK(vp);
2480 #ifdef MAC
2481         mac_vnode_destroy(vp);
2482 #endif
2483         if (vp->v_pollinfo != NULL)
2484                 destroy_vpollinfo(vp->v_pollinfo);
2485 #ifdef INVARIANTS
2486         /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
2487         vp->v_op = NULL;
2488 #endif
2489         rangelock_destroy(&vp->v_rl);
2490         lockdestroy(vp->v_vnlock);
2491         mtx_destroy(&vp->v_interlock);
2492         rw_destroy(BO_LOCKPTR(bo));
2493         uma_zfree(vnode_zone, vp);
2494 }
2495
2496 /*
2497  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2498  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2499  * OWEINACT tracks whether a vnode missed a call to inactive due to a
2500  * failed lock upgrade.
2501  */
2502 void
2503 vinactive(struct vnode *vp, struct thread *td)
2504 {
2505         struct vm_object *obj;
2506
2507         ASSERT_VOP_ELOCKED(vp, "vinactive");
2508         ASSERT_VI_LOCKED(vp, "vinactive");
2509         VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2510             ("vinactive: recursed on VI_DOINGINACT"));
2511         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2512         vp->v_iflag |= VI_DOINGINACT;
2513         vp->v_iflag &= ~VI_OWEINACT;
2514         VI_UNLOCK(vp);
2515         /*
2516          * Before moving off the active list, we must be sure that any
2517          * modified pages are on the vnode's dirty list since these will
2518          * no longer be checked once the vnode is on the inactive list.
2519          * Because the vnode vm object keeps a hold reference on the vnode
2520          * if there is at least one resident non-cached page, the vnode
2521          * cannot leave the active list without the page cleanup done.
2522          */
2523         obj = vp->v_object;
2524         if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
2525                 VM_OBJECT_WLOCK(obj);
2526                 vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
2527                 VM_OBJECT_WUNLOCK(obj);
2528         }
2529         VOP_INACTIVE(vp, td);
2530         VI_LOCK(vp);
2531         VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2532             ("vinactive: lost VI_DOINGINACT"));
2533         vp->v_iflag &= ~VI_DOINGINACT;
2534 }
2535
2536 /*
2537  * Remove any vnodes in the vnode table belonging to mount point mp.
2538  *
2539  * If FORCECLOSE is not specified, there should not be any active ones,
2540  * return error if any are found (nb: this is a user error, not a
2541  * system error). If FORCECLOSE is specified, detach any active vnodes
2542  * that are found.
2543  *
2544  * If WRITECLOSE is set, only flush out regular file vnodes open for
2545  * writing.
2546  *
2547  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2548  *
2549  * `rootrefs' specifies the base reference count for the root vnode
2550  * of this filesystem. The root vnode is considered busy if its
2551  * v_usecount exceeds this value. On a successful return, vflush(, td)
2552  * will call vrele() on the root vnode exactly rootrefs times.
2553  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2554  * be zero.
2555  */
2556 #ifdef DIAGNOSTIC
2557 static int busyprt = 0;         /* print out busy vnodes */
2558 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
2559 #endif
2560
2561 int
2562 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
2563 {
2564         struct vnode *vp, *mvp, *rootvp = NULL;
2565         struct vattr vattr;
2566         int busy = 0, error;
2567
2568         CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
2569             rootrefs, flags);
2570         if (rootrefs > 0) {
2571                 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2572                     ("vflush: bad args"));
2573                 /*
2574                  * Get the filesystem root vnode. We can vput() it
2575                  * immediately, since with rootrefs > 0, it won't go away.
2576                  */
2577                 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
2578                         CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
2579                             __func__, error);
2580                         return (error);
2581                 }
2582                 vput(rootvp);
2583         }
2584 loop:
2585         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2586                 vholdl(vp);
2587                 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2588                 if (error) {
2589                         vdrop(vp);
2590                         MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2591                         goto loop;
2592                 }
2593                 /*
2594                  * Skip over a vnodes marked VV_SYSTEM.
2595                  */
2596                 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2597                         VOP_UNLOCK(vp, 0);
2598                         vdrop(vp);
2599                         continue;
2600                 }
2601                 /*
2602                  * If WRITECLOSE is set, flush out unlinked but still open
2603                  * files (even if open only for reading) and regular file
2604                  * vnodes open for writing.
2605                  */
2606                 if (flags & WRITECLOSE) {
2607                         if (vp->v_object != NULL) {
2608                                 VM_OBJECT_WLOCK(vp->v_object);
2609                                 vm_object_page_clean(vp->v_object, 0, 0, 0);
2610                                 VM_OBJECT_WUNLOCK(vp->v_object);
2611                         }
2612                         error = VOP_FSYNC(vp, MNT_WAIT, td);
2613                         if (error != 0) {
2614                                 VOP_UNLOCK(vp, 0);
2615                                 vdrop(vp);
2616                                 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2617                                 return (error);
2618                         }
2619                         error = VOP_GETATTR(vp, &vattr, td->td_ucred);
2620                         VI_LOCK(vp);
2621
2622                         if ((vp->v_type == VNON ||
2623                             (error == 0 && vattr.va_nlink > 0)) &&
2624                             (vp->v_writecount == 0 || vp->v_type != VREG)) {
2625                                 VOP_UNLOCK(vp, 0);
2626                                 vdropl(vp);
2627                                 continue;
2628                         }
2629                 } else
2630                         VI_LOCK(vp);
2631                 /*
2632                  * With v_usecount == 0, all we need to do is clear out the
2633                  * vnode data structures and we are done.
2634                  *
2635                  * If FORCECLOSE is set, forcibly close the vnode.
2636                  */
2637                 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2638                         VNASSERT(vp->v_usecount == 0 ||
2639                             vp->v_op != &devfs_specops ||
2640                             (vp->v_type != VCHR && vp->v_type != VBLK), vp,
2641                             ("device VNODE %p is FORCECLOSED", vp));
2642                         vgonel(vp);
2643                 } else {
2644                         busy++;
2645 #ifdef DIAGNOSTIC
2646                         if (busyprt)
2647                                 vprint("vflush: busy vnode", vp);
2648 #endif
2649                 }
2650                 VOP_UNLOCK(vp, 0);
2651                 vdropl(vp);
2652         }
2653         if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2654                 /*
2655                  * If just the root vnode is busy, and if its refcount
2656                  * is equal to `rootrefs', then go ahead and kill it.
2657                  */
2658                 VI_LOCK(rootvp);
2659                 KASSERT(busy > 0, ("vflush: not busy"));
2660                 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2661                     ("vflush: usecount %d < rootrefs %d",
2662                      rootvp->v_usecount, rootrefs));
2663                 if (busy == 1 && rootvp->v_usecount == rootrefs) {
2664                         VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2665                         vgone(rootvp);
2666                         VOP_UNLOCK(rootvp, 0);
2667                         busy = 0;
2668                 } else
2669                         VI_UNLOCK(rootvp);
2670         }
2671         if (busy) {
2672                 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
2673                     busy);
2674                 return (EBUSY);
2675         }
2676         for (; rootrefs > 0; rootrefs--)
2677                 vrele(rootvp);
2678         return (0);
2679 }
2680
2681 /*
2682  * Recycle an unused vnode to the front of the free list.
2683  */
2684 int
2685 vrecycle(struct vnode *vp)
2686 {
2687         int recycled;
2688
2689         ASSERT_VOP_ELOCKED(vp, "vrecycle");
2690         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2691         recycled = 0;
2692         VI_LOCK(vp);
2693         if (vp->v_usecount == 0) {
2694                 recycled = 1;
2695                 vgonel(vp);
2696         }
2697         VI_UNLOCK(vp);
2698         return (recycled);
2699 }
2700
/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.
 *
 * This is the entry point for callers that do not already hold the
 * vnode interlock: it takes the interlock, lets vgonel() do the work,
 * and releases the interlock again.
 */
void
vgone(struct vnode *vp)
{

        VI_LOCK(vp);
        vgonel(vp);
        VI_UNLOCK(vp);
}
2712
2713 static void
2714 notify_lowervp_vfs_dummy(struct mount *mp __unused,
2715     struct vnode *lowervp __unused)
2716 {
2717 }
2718
2719 /*
2720  * Notify upper mounts about reclaimed or unlinked vnode.
2721  */
2722 void
2723 vfs_notify_upper(struct vnode *vp, int event)
2724 {
2725         static struct vfsops vgonel_vfsops = {
2726                 .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
2727                 .vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
2728         };
2729         struct mount *mp, *ump, *mmp;
2730
2731         mp = vp->v_mount;
2732         if (mp == NULL)
2733                 return;
2734
2735         MNT_ILOCK(mp);
2736         if (TAILQ_EMPTY(&mp->mnt_uppers))
2737                 goto unlock;
2738         MNT_IUNLOCK(mp);
2739         mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
2740         mmp->mnt_op = &vgonel_vfsops;
2741         mmp->mnt_kern_flag |= MNTK_MARKER;
2742         MNT_ILOCK(mp);
2743         mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
2744         for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
2745                 if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
2746                         ump = TAILQ_NEXT(ump, mnt_upper_link);
2747                         continue;
2748                 }
2749                 TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
2750                 MNT_IUNLOCK(mp);
2751                 switch (event) {
2752                 case VFS_NOTIFY_UPPER_RECLAIM:
2753                         VFS_RECLAIM_LOWERVP(ump, vp);
2754                         break;
2755                 case VFS_NOTIFY_UPPER_UNLINK:
2756                         VFS_UNLINK_LOWERVP(ump, vp);
2757                         break;
2758                 default:
2759                         KASSERT(0, ("invalid event %d", event));
2760                         break;
2761                 }
2762                 MNT_ILOCK(mp);
2763                 ump = TAILQ_NEXT(mmp, mnt_upper_link);
2764                 TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
2765         }
2766         free(mmp, M_TEMP);
2767         mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
2768         if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
2769                 mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
2770                 wakeup(&mp->mnt_uppers);
2771         }
2772 unlock:
2773         MNT_IUNLOCK(mp);
2774 }
2775
2776 /*
2777  * vgone, with the vp interlock held.
2778  */
2779 void
2780 vgonel(struct vnode *vp)
2781 {
2782         struct thread *td;
2783         int oweinact;
2784         int active;
2785         struct mount *mp;
2786
2787         ASSERT_VOP_ELOCKED(vp, "vgonel");
2788         ASSERT_VI_LOCKED(vp, "vgonel");
2789         VNASSERT(vp->v_holdcnt, vp,
2790             ("vgonel: vp %p has no reference.", vp));
2791         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2792         td = curthread;
2793
2794         /*
2795          * Don't vgonel if we're already doomed.
2796          */
2797         if (vp->v_iflag & VI_DOOMED)
2798                 return;
2799         vp->v_iflag |= VI_DOOMED;
2800
2801         /*
2802          * Check to see if the vnode is in use.  If so, we have to call
2803          * VOP_CLOSE() and VOP_INACTIVE().
2804          */
2805         active = vp->v_usecount;
2806         oweinact = (vp->v_iflag & VI_OWEINACT);
2807         VI_UNLOCK(vp);
2808         vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
2809
2810         /*
2811          * If purging an active vnode, it must be closed and
2812          * deactivated before being reclaimed.
2813          */
2814         if (active)
2815                 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2816         if (oweinact || active) {
2817                 VI_LOCK(vp);
2818                 if ((vp->v_iflag & VI_DOINGINACT) == 0)
2819                         vinactive(vp, td);
2820                 VI_UNLOCK(vp);
2821         }
2822         if (vp->v_type == VSOCK)
2823                 vfs_unp_reclaim(vp);
2824
2825         /*
2826          * Clean out any buffers associated with the vnode.
2827          * If the flush fails, just toss the buffers.
2828          */
2829         mp = NULL;
2830         if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2831                 (void) vn_start_secondary_write(vp, &mp, V_WAIT);
2832         if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
2833                 while (vinvalbuf(vp, 0, 0, 0) != 0)
2834                         ;
2835         }
2836 #ifdef INVARIANTS
2837         BO_LOCK(&vp->v_bufobj);
2838         KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
2839             vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
2840             TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
2841             vp->v_bufobj.bo_clean.bv_cnt == 0,
2842             ("vp %p bufobj not invalidated", vp));
2843         vp->v_bufobj.bo_flag |= BO_DEAD;
2844         BO_UNLOCK(&vp->v_bufobj);
2845 #endif
2846
2847         /*
2848          * Reclaim the vnode.
2849          */
2850         if (VOP_RECLAIM(vp, td))
2851                 panic("vgone: cannot reclaim");
2852         if (mp != NULL)
2853                 vn_finished_secondary_write(mp);
2854         VNASSERT(vp->v_object == NULL, vp,
2855             ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2856         /*
2857          * Clear the advisory locks and wake up waiting threads.
2858          */
2859         (void)VOP_ADVLOCKPURGE(vp);
2860         /*
2861          * Delete from old mount point vnode list.
2862          */
2863         delmntque(vp);
2864         cache_purge(vp);
2865         /*
2866          * Done with purge, reset to the standard lock and invalidate
2867          * the vnode.
2868          */
2869         VI_LOCK(vp);
2870         vp->v_vnlock = &vp->v_lock;
2871         vp->v_op = &dead_vnodeops;
2872         vp->v_tag = "none";
2873         vp->v_type = VBAD;
2874 }
2875
2876 /*
2877  * Calculate the total number of references to a special device.
2878  */
2879 int
2880 vcount(struct vnode *vp)
2881 {
2882         int count;
2883
2884         dev_lock();
2885         count = vp->v_rdev->si_usecount;
2886         dev_unlock();
2887         return (count);
2888 }
2889
2890 /*
2891  * Same as above, but using the struct cdev *as argument
2892  */
2893 int
2894 count_dev(struct cdev *dev)
2895 {
2896         int count;
2897
2898         dev_lock();
2899         count = dev->si_usecount;
2900         dev_unlock();
2901         return(count);
2902 }
2903
/*
 * Printable names of the vnode types, indexed by v_type.  vn_printf()
 * uses this table when it prints out a description of a vnode.
 */
static char *typename[] = {
        "VNON",
        "VREG",
        "VDIR",
        "VBLK",
        "VCHR",
        "VLNK",
        "VSOCK",
        "VFIFO",
        "VBAD",
        "VMARKER"
};
2910
2911 void
2912 vn_printf(struct vnode *vp, const char *fmt, ...)
2913 {
2914         va_list ap;
2915         char buf[256], buf2[16];
2916         u_long flags;
2917
2918         va_start(ap, fmt);
2919         vprintf(fmt, ap);
2920         va_end(ap);
2921         printf("%p: ", (void *)vp);
2922         printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2923         printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
2924             vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
2925         buf[0] = '\0';
2926         buf[1] = '\0';
2927         if (vp->v_vflag & VV_ROOT)
2928                 strlcat(buf, "|VV_ROOT", sizeof(buf));
2929         if (vp->v_vflag & VV_ISTTY)
2930                 strlcat(buf, "|VV_ISTTY", sizeof(buf));
2931         if (vp->v_vflag & VV_NOSYNC)
2932                 strlcat(buf, "|VV_NOSYNC", sizeof(buf));
2933         if (vp->v_vflag & VV_ETERNALDEV)
2934                 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
2935         if (vp->v_vflag & VV_CACHEDLABEL)
2936                 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
2937         if (vp->v_vflag & VV_TEXT)
2938                 strlcat(buf, "|VV_TEXT", sizeof(buf));
2939         if (vp->v_vflag & VV_COPYONWRITE)
2940                 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
2941         if (vp->v_vflag & VV_SYSTEM)
2942                 strlcat(buf, "|VV_SYSTEM", sizeof(buf));
2943         if (vp->v_vflag & VV_PROCDEP)
2944                 strlcat(buf, "|VV_PROCDEP", sizeof(buf));
2945         if (vp->v_vflag & VV_NOKNOTE)
2946                 strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
2947         if (vp->v_vflag & VV_DELETED)
2948                 strlcat(buf, "|VV_DELETED", sizeof(buf));
2949         if (vp->v_vflag & VV_MD)
2950                 strlcat(buf, "|VV_MD", sizeof(buf));
2951         if (vp->v_vflag & VV_FORCEINSMQ)
2952                 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
2953         flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
2954             VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
2955             VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
2956         if (flags != 0) {
2957                 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
2958                 strlcat(buf, buf2, sizeof(buf));
2959         }
2960         if (vp->v_iflag & VI_MOUNT)
2961                 strlcat(buf, "|VI_MOUNT", sizeof(buf));
2962         if (vp->v_iflag & VI_AGE)
2963                 strlcat(buf, "|VI_AGE", sizeof(buf));
2964         if (vp->v_iflag & VI_DOOMED)
2965                 strlcat(buf, "|VI_DOOMED", sizeof(buf));
2966         if (vp->v_iflag & VI_FREE)
2967                 strlcat(buf, "|VI_FREE", sizeof(buf));
2968         if (vp->v_iflag & VI_ACTIVE)
2969                 strlcat(buf, "|VI_ACTIVE", sizeof(buf));
2970         if (vp->v_iflag & VI_DOINGINACT)
2971                 strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
2972         if (vp->v_iflag & VI_OWEINACT)
2973                 strlcat(buf, "|VI_OWEINACT", sizeof(buf));
2974         flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
2975             VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
2976         if (flags != 0) {
2977                 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
2978                 strlcat(buf, buf2, sizeof(buf));
2979         }
2980         printf("    flags (%s)\n", buf + 1);
2981         if (mtx_owned(VI_MTX(vp)))
2982                 printf(" VI_LOCKed");
2983         if (vp->v_object != NULL)
2984                 printf("    v_object %p ref %d pages %d "
2985                     "cleanbuf %d dirtybuf %d\n",
2986                     vp->v_object, vp->v_object->ref_count,
2987                     vp->v_object->resident_page_count,
2988                     vp->v_bufobj.bo_dirty.bv_cnt,
2989                     vp->v_bufobj.bo_clean.bv_cnt);
2990         printf("    ");
2991         lockmgr_printinfo(vp->v_vnlock);
2992         if (vp->v_data != NULL)
2993                 VOP_PRINT(vp);
2994 }
2995
2996 #ifdef DDB
2997 /*
2998  * List all of the locked vnodes in the system.
2999  * Called when debugging the kernel.
3000  */
3001 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
3002 {
3003         struct mount *mp;
3004         struct vnode *vp;
3005
3006         /*
3007          * Note: because this is DDB, we can't obey the locking semantics
3008          * for these structures, which means we could catch an inconsistent
3009          * state and dereference a nasty pointer.  Not much to be done
3010          * about that.
3011          */
3012         db_printf("Locked vnodes\n");
3013         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3014                 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3015                         if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
3016                                 vprint("", vp);
3017                 }
3018         }
3019 }
3020
3021 /*
3022  * Show details about the given vnode.
3023  */
3024 DB_SHOW_COMMAND(vnode, db_show_vnode)
3025 {
3026         struct vnode *vp;
3027
3028         if (!have_addr)
3029                 return;
3030         vp = (struct vnode *)addr;
3031         vn_printf(vp, "vnode ");
3032 }
3033
3034 /*
3035  * Show details about the given mount point.
3036  */
3037 DB_SHOW_COMMAND(mount, db_show_mount)
3038 {
3039         struct mount *mp;
3040         struct vfsopt *opt;
3041         struct statfs *sp;
3042         struct vnode *vp;
3043         char buf[512];
3044         uint64_t mflags;
3045         u_int flags;
3046
3047         if (!have_addr) {
3048                 /* No address given, print short info about all mount points. */
3049                 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3050                         db_printf("%p %s on %s (%s)\n", mp,
3051                             mp->mnt_stat.f_mntfromname,
3052                             mp->mnt_stat.f_mntonname,
3053                             mp->mnt_stat.f_fstypename);
3054                         if (db_pager_quit)
3055                                 break;
3056                 }
3057                 db_printf("\nMore info: show mount <addr>\n");
3058                 return;
3059         }
3060
3061         mp = (struct mount *)addr;
3062         db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
3063             mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
3064
3065         buf[0] = '\0';
3066         mflags = mp->mnt_flag;
3067 #define MNT_FLAG(flag)  do {                                            \
3068         if (mflags & (flag)) {                                          \
3069                 if (buf[0] != '\0')                                     \
3070                         strlcat(buf, ", ", sizeof(buf));                \
3071                 strlcat(buf, (#flag) + 4, sizeof(buf));                 \
3072                 mflags &= ~(flag);                                      \
3073         }                                                               \
3074 } while (0)
3075         MNT_FLAG(MNT_RDONLY);
3076         MNT_FLAG(MNT_SYNCHRONOUS);
3077         MNT_FLAG(MNT_NOEXEC);
3078         MNT_FLAG(MNT_NOSUID);
3079         MNT_FLAG(MNT_NFS4ACLS);
3080         MNT_FLAG(MNT_UNION);
3081         MNT_FLAG(MNT_ASYNC);
3082         MNT_FLAG(MNT_SUIDDIR);
3083         MNT_FLAG(MNT_SOFTDEP);
3084         MNT_FLAG(MNT_NOSYMFOLLOW);
3085         MNT_FLAG(MNT_GJOURNAL);
3086         MNT_FLAG(MNT_MULTILABEL);
3087         MNT_FLAG(MNT_ACLS);
3088         MNT_FLAG(MNT_NOATIME);
3089         MNT_FLAG(MNT_NOCLUSTERR);
3090         MNT_FLAG(MNT_NOCLUSTERW);
3091         MNT_FLAG(MNT_SUJ);
3092         MNT_FLAG(MNT_EXRDONLY);
3093         MNT_FLAG(MNT_EXPORTED);
3094         MNT_FLAG(MNT_DEFEXPORTED);
3095         MNT_FLAG(MNT_EXPORTANON);
3096         MNT_FLAG(MNT_EXKERB);
3097         MNT_FLAG(MNT_EXPUBLIC);
3098         MNT_FLAG(MNT_LOCAL);
3099         MNT_FLAG(MNT_QUOTA);
3100         MNT_FLAG(MNT_ROOTFS);
3101         MNT_FLAG(MNT_USER);
3102         MNT_FLAG(MNT_IGNORE);
3103         MNT_FLAG(MNT_UPDATE);
3104         MNT_FLAG(MNT_DELEXPORT);
3105         MNT_FLAG(MNT_RELOAD);
3106         MNT_FLAG(MNT_FORCE);
3107         MNT_FLAG(MNT_SNAPSHOT);
3108         MNT_FLAG(MNT_BYFSID);
3109 #undef MNT_FLAG
3110         if (mflags != 0) {
3111                 if (buf[0] != '\0')
3112                         strlcat(buf, ", ", sizeof(buf));
3113                 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3114                     "0x%016jx", mflags);
3115         }
3116         db_printf("    mnt_flag = %s\n", buf);
3117
3118         buf[0] = '\0';
3119         flags = mp->mnt_kern_flag;
3120 #define MNT_KERN_FLAG(flag)     do {                                    \
3121         if (flags & (flag)) {                                           \
3122                 if (buf[0] != '\0')                                     \
3123                         strlcat(buf, ", ", sizeof(buf));                \
3124                 strlcat(buf, (#flag) + 5, sizeof(buf));                 \
3125                 flags &= ~(flag);                                       \
3126         }                                                               \
3127 } while (0)
3128         MNT_KERN_FLAG(MNTK_UNMOUNTF);
3129         MNT_KERN_FLAG(MNTK_ASYNC);
3130         MNT_KERN_FLAG(MNTK_SOFTDEP);
3131         MNT_KERN_FLAG(MNTK_NOINSMNTQ);
3132         MNT_KERN_FLAG(MNTK_DRAINING);
3133         MNT_KERN_FLAG(MNTK_REFEXPIRE);
3134         MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
3135         MNT_KERN_FLAG(MNTK_SHARED_WRITES);
3136         MNT_KERN_FLAG(MNTK_NO_IOPF);
3137         MNT_KERN_FLAG(MNTK_VGONE_UPPER);
3138         MNT_KERN_FLAG(MNTK_VGONE_WAITER);
3139         MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
3140         MNT_KERN_FLAG(MNTK_MARKER);
3141         MNT_KERN_FLAG(MNTK_USES_BCACHE);
3142         MNT_KERN_FLAG(MNTK_NOASYNC);
3143         MNT_KERN_FLAG(MNTK_UNMOUNT);
3144         MNT_KERN_FLAG(MNTK_MWAIT);
3145         MNT_KERN_FLAG(MNTK_SUSPEND);
3146         MNT_KERN_FLAG(MNTK_SUSPEND2);
3147         MNT_KERN_FLAG(MNTK_SUSPENDED);
3148         MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
3149         MNT_KERN_FLAG(MNTK_NOKNOTE);
3150 #undef MNT_KERN_FLAG
3151         if (flags != 0) {
3152                 if (buf[0] != '\0')
3153                         strlcat(buf, ", ", sizeof(buf));
3154                 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3155                     "0x%08x", flags);
3156         }
3157         db_printf("    mnt_kern_flag = %s\n", buf);
3158
3159         db_printf("    mnt_opt = ");
3160         opt = TAILQ_FIRST(mp->mnt_opt);
3161         if (opt != NULL) {
3162                 db_printf("%s", opt->name);
3163                 opt = TAILQ_NEXT(opt, link);
3164                 while (opt != NULL) {
3165                         db_printf(", %s", opt->name);
3166                         opt = TAILQ_NEXT(opt, link);
3167                 }
3168         }
3169         db_printf("\n");
3170
3171         sp = &mp->mnt_stat;
3172         db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
3173             "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
3174             "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
3175             "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
3176             (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
3177             (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
3178             (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
3179             (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
3180             (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
3181             (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
3182             (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
3183             (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
3184
3185         db_printf("    mnt_cred = { uid=%u ruid=%u",
3186             (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
3187         if (jailed(mp->mnt_cred))
3188                 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
3189         db_printf(" }\n");
3190         db_printf("    mnt_ref = %d\n", mp->mnt_ref);
3191         db_printf("    mnt_gen = %d\n", mp->mnt_gen);
3192         db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
3193         db_printf("    mnt_activevnodelistsize = %d\n",
3194             mp->mnt_activevnodelistsize);
3195         db_printf("    mnt_writeopcount = %d\n", mp->mnt_writeopcount);
3196         db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
3197         db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
3198         db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
3199         db_printf("    mnt_lockref = %d\n", mp->mnt_lockref);
3200         db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
3201         db_printf("    mnt_secondary_accwrites = %d\n",
3202             mp->mnt_secondary_accwrites);
3203         db_printf("    mnt_gjprovider = %s\n",
3204             mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
3205
3206         db_printf("\n\nList of active vnodes\n");
3207         TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
3208                 if (vp->v_type != VMARKER) {
3209                         vn_printf(vp, "vnode ");
3210                         if (db_pager_quit)
3211                                 break;
3212                 }
3213         }
3214         db_printf("\n\nList of inactive vnodes\n");
3215         TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3216                 if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
3217                         vn_printf(vp, "vnode ");
3218                         if (db_pager_quit)
3219                                 break;
3220                 }
3221         }
3222 }
3223 #endif  /* DDB */
3224
3225 /*
3226  * Fill in a struct xvfsconf based on a struct vfsconf.
3227  */
3228 static int
3229 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
3230 {
3231         struct xvfsconf xvfsp;
3232
3233         bzero(&xvfsp, sizeof(xvfsp));
3234         strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3235         xvfsp.vfc_typenum = vfsp->vfc_typenum;
3236         xvfsp.vfc_refcount = vfsp->vfc_refcount;
3237         xvfsp.vfc_flags = vfsp->vfc_flags;
3238         /*
3239          * These are unused in userland, we keep them
3240          * to not break binary compatibility.
3241          */
3242         xvfsp.vfc_vfsops = NULL;
3243         xvfsp.vfc_next = NULL;
3244         return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3245 }
3246
3247 #ifdef COMPAT_FREEBSD32
3248 struct xvfsconf32 {
3249         uint32_t        vfc_vfsops;
3250         char            vfc_name[MFSNAMELEN];
3251         int32_t         vfc_typenum;
3252         int32_t         vfc_refcount;
3253         int32_t         vfc_flags;
3254         uint32_t        vfc_next;
3255 };
3256
3257 static int
3258 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
3259 {
3260         struct xvfsconf32 xvfsp;
3261
3262         strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3263         xvfsp.vfc_typenum = vfsp->vfc_typenum;
3264         xvfsp.vfc_refcount = vfsp->vfc_refcount;
3265         xvfsp.vfc_flags = vfsp->vfc_flags;
3266         xvfsp.vfc_vfsops = 0;
3267         xvfsp.vfc_next = 0;
3268         return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3269 }
3270 #endif
3271
3272 /*
3273  * Top level filesystem related information gathering.
3274  */
3275 static int
3276 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
3277 {
3278         struct vfsconf *vfsp;
3279         int error;
3280
3281         error = 0;
3282         vfsconf_slock();
3283         TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3284 #ifdef COMPAT_FREEBSD32
3285                 if (req->flags & SCTL_MASK32)
3286                         error = vfsconf2x32(req, vfsp);
3287                 else
3288 #endif
3289                         error = vfsconf2x(req, vfsp);
3290                 if (error)
3291                         break;
3292         }
3293         vfsconf_sunlock();
3294         return (error);
3295 }
3296
3297 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
3298     CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
3299     "S,xvfsconf", "List of all configured filesystems");
3300
3301 #ifndef BURN_BRIDGES
3302 static int      sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
3303
3304 static int
3305 vfs_sysctl(SYSCTL_HANDLER_ARGS)
3306 {
3307         int *name = (int *)arg1 - 1;    /* XXX */
3308         u_int namelen = arg2 + 1;       /* XXX */
3309         struct vfsconf *vfsp;
3310
3311         log(LOG_WARNING, "userland calling deprecated sysctl, "
3312             "please rebuild world\n");
3313
3314 #if 1 || defined(COMPAT_PRELITE2)
3315         /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
3316         if (namelen == 1)
3317                 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
3318 #endif
3319
3320         switch (name[1]) {
3321         case VFS_MAXTYPENUM:
3322                 if (namelen != 2)
3323                         return (ENOTDIR);
3324                 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
3325         case VFS_CONF:
3326                 if (namelen != 3)
3327                         return (ENOTDIR);       /* overloaded */
3328                 vfsconf_slock();
3329                 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3330                         if (vfsp->vfc_typenum == name[2])
3331                                 break;
3332                 }
3333                 vfsconf_sunlock();
3334                 if (vfsp == NULL)
3335                         return (EOPNOTSUPP);
3336 #ifdef COMPAT_FREEBSD32
3337                 if (req->flags & SCTL_MASK32)
3338                         return (vfsconf2x32(req, vfsp));
3339                 else
3340 #endif
3341                         return (vfsconf2x(req, vfsp));
3342         }
3343         return (EOPNOTSUPP);
3344 }
3345
3346 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
3347     CTLFLAG_MPSAFE, vfs_sysctl,
3348     "Generic filesystem");
3349
3350 #if 1 || defined(COMPAT_PRELITE2)
3351
3352 static int
3353 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3354 {
3355         int error;
3356         struct vfsconf *vfsp;
3357         struct ovfsconf ovfs;
3358
3359         vfsconf_slock();
3360         TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3361                 bzero(&ovfs, sizeof(ovfs));
3362                 ovfs.vfc_vfsops = vfsp->vfc_vfsops;     /* XXX used as flag */
3363                 strcpy(ovfs.vfc_name, vfsp->vfc_name);
3364                 ovfs.vfc_index = vfsp->vfc_typenum;
3365                 ovfs.vfc_refcount = vfsp->vfc_refcount;
3366                 ovfs.vfc_flags = vfsp->vfc_flags;
3367                 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3368                 if (error != 0) {
3369                         vfsconf_sunlock();
3370                         return (error);
3371                 }
3372         }
3373         vfsconf_sunlock();
3374         return (0);
3375 }
3376
3377 #endif /* 1 || COMPAT_PRELITE2 */
3378 #endif /* !BURN_BRIDGES */
3379
3380 #define KINFO_VNODESLOP         10
3381 #ifdef notyet
3382 /*
3383  * Dump vnode list (via sysctl).
3384  */
3385 /* ARGSUSED */
3386 static int
3387 sysctl_vnode(SYSCTL_HANDLER_ARGS)
3388 {
3389         struct xvnode *xvn;
3390         struct mount *mp;
3391         struct vnode *vp;
3392         int error, len, n;
3393
3394         /*
3395          * Stale numvnodes access is not fatal here.
3396          */
3397         req->lock = 0;
3398         len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
3399         if (!req->oldptr)
3400                 /* Make an estimate */
3401                 return (SYSCTL_OUT(req, 0, len));
3402
3403         error = sysctl_wire_old_buffer(req, 0);
3404         if (error != 0)
3405                 return (error);
3406         xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
3407         n = 0;
3408         mtx_lock(&mountlist_mtx);
3409         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3410                 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
3411                         continue;
3412                 MNT_ILOCK(mp);
3413                 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3414                         if (n == len)
3415                                 break;
3416                         vref(vp);
3417                         xvn[n].xv_size = sizeof *xvn;
3418                         xvn[n].xv_vnode = vp;
3419                         xvn[n].xv_id = 0;       /* XXX compat */
3420 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3421                         XV_COPY(usecount);
3422                         XV_COPY(writecount);
3423                         XV_COPY(holdcnt);
3424                         XV_COPY(mount);
3425                         XV_COPY(numoutput);
3426                         XV_COPY(type);
3427 #undef XV_COPY
3428                         xvn[n].xv_flag = vp->v_vflag;
3429
3430                         switch (vp->v_type) {
3431                         case VREG:
3432                         case VDIR:
3433                         case VLNK:
3434                                 break;
3435                         case VBLK:
3436                         case VCHR:
3437                                 if (vp->v_rdev == NULL) {
3438                                         vrele(vp);
3439                                         continue;
3440                                 }
3441                                 xvn[n].xv_dev = dev2udev(vp->v_rdev);
3442                                 break;
3443                         case VSOCK:
3444                                 xvn[n].xv_socket = vp->v_socket;
3445                                 break;
3446                         case VFIFO:
3447                                 xvn[n].xv_fifo = vp->v_fifoinfo;
3448                                 break;
3449                         case VNON:
3450                         case VBAD:
3451                         default:
3452                                 /* shouldn't happen? */
3453                                 vrele(vp);
3454                                 continue;
3455                         }
3456                         vrele(vp);
3457                         ++n;
3458                 }
3459                 MNT_IUNLOCK(mp);
3460                 mtx_lock(&mountlist_mtx);
3461                 vfs_unbusy(mp);
3462                 if (n == len)
3463                         break;
3464         }
3465         mtx_unlock(&mountlist_mtx);
3466
3467         error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3468         free(xvn, M_TEMP);
3469         return (error);
3470 }
3471
3472 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
3473     CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
3474     "");
3475 #endif
3476
3477 /*
3478  * Unmount all filesystems. The list is traversed in reverse order
3479  * of mounting to avoid dependencies.
3480  */
3481 void
3482 vfs_unmountall(void)
3483 {
3484         struct mount *mp;
3485         struct thread *td;
3486         int error;
3487
3488         CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
3489         td = curthread;
3490
3491         /*
3492          * Since this only runs when rebooting, it is not interlocked.
3493          */
3494         while(!TAILQ_EMPTY(&mountlist)) {
3495                 mp = TAILQ_LAST(&mountlist, mntlist);
3496                 error = dounmount(mp, MNT_FORCE, td);
3497                 if (error) {
3498                         TAILQ_REMOVE(&mountlist, mp, mnt_list);
3499                         /*
3500                          * XXX: Due to the way in which we mount the root
3501                          * file system off of devfs, devfs will generate a
3502                          * "busy" warning when we try to unmount it before
3503                          * the root.  Don't print a warning as a result in
3504                          * order to avoid false positive errors that may
3505                          * cause needless upset.
3506                          */
3507                         if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
3508                                 printf("unmount of %s failed (",
3509                                     mp->mnt_stat.f_mntonname);
3510                                 if (error == EBUSY)
3511                                         printf("BUSY)\n");
3512                                 else
3513                                         printf("%d)\n", error);
3514                         }
3515                 } else {
3516                         /* The unmount has removed mp from the mountlist */
3517                 }
3518         }
3519 }
3520
3521 /*
3522  * perform msync on all vnodes under a mount point
3523  * the mount point must be locked.
3524  */
3525 void
3526 vfs_msync(struct mount *mp, int flags)
3527 {
3528         struct vnode *vp, *mvp;
3529         struct vm_object *obj;
3530
3531         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
3532         MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
3533                 obj = vp->v_object;
3534                 if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
3535                     (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3536                         if (!vget(vp,
3537                             LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3538                             curthread)) {
3539                                 if (vp->v_vflag & VV_NOSYNC) {  /* unlinked */
3540                                         vput(vp);
3541                                         continue;
3542                                 }
3543
3544                                 obj = vp->v_object;
3545                                 if (obj != NULL) {
3546                                         VM_OBJECT_WLOCK(obj);
3547                                         vm_object_page_clean(obj, 0, 0,
3548                                             flags == MNT_WAIT ?
3549                                             OBJPC_SYNC : OBJPC_NOSYNC);
3550                                         VM_OBJECT_WUNLOCK(obj);
3551                                 }
3552                                 vput(vp);
3553                         }
3554                 } else
3555                         VI_UNLOCK(vp);
3556         }
3557 }
3558
3559 static void
3560 destroy_vpollinfo_free(struct vpollinfo *vi)
3561 {
3562
3563         knlist_destroy(&vi->vpi_selinfo.si_note);
3564         mtx_destroy(&vi->vpi_lock);
3565         uma_zfree(vnodepoll_zone, vi);
3566 }
3567
3568 static void
3569 destroy_vpollinfo(struct vpollinfo *vi)
3570 {
3571
3572         knlist_clear(&vi->vpi_selinfo.si_note, 1);
3573         seldrain(&vi->vpi_selinfo);
3574         destroy_vpollinfo_free(vi);
3575 }
3576
3577 /*
3578  * Initalize per-vnode helper structure to hold poll-related state.
3579  */
3580 void
3581 v_addpollinfo(struct vnode *vp)
3582 {
3583         struct vpollinfo *vi;
3584
3585         if (vp->v_pollinfo != NULL)
3586                 return;
3587         vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
3588         mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3589         knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
3590             vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
3591         VI_LOCK(vp);
3592         if (vp->v_pollinfo != NULL) {
3593                 VI_UNLOCK(vp);
3594                 destroy_vpollinfo_free(vi);
3595                 return;
3596         }
3597         vp->v_pollinfo = vi;
3598         VI_UNLOCK(vp);
3599 }
3600
3601 /*
3602  * Record a process's interest in events which might happen to
3603  * a vnode.  Because poll uses the historic select-style interface
3604  * internally, this routine serves as both the ``check for any
3605  * pending events'' and the ``record my interest in future events''
3606  * functions.  (These are done together, while the lock is held,
3607  * to avoid race conditions.)
3608  */
3609 int
3610 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3611 {
3612
3613         v_addpollinfo(vp);
3614         mtx_lock(&vp->v_pollinfo->vpi_lock);
3615         if (vp->v_pollinfo->vpi_revents & events) {
3616                 /*
3617                  * This leaves events we are not interested
3618                  * in available for the other process which
3619                  * which presumably had requested them
3620                  * (otherwise they would never have been
3621                  * recorded).
3622                  */
3623                 events &= vp->v_pollinfo->vpi_revents;
3624                 vp->v_pollinfo->vpi_revents &= ~events;
3625
3626                 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3627                 return (events);
3628         }
3629         vp->v_pollinfo->vpi_events |= events;
3630         selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3631         mtx_unlock(&vp->v_pollinfo->vpi_lock);
3632         return (0);
3633 }
3634
3635 /*
3636  * Routine to create and manage a filesystem syncer vnode.
3637  */
3638 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
3639 static int      sync_fsync(struct  vop_fsync_args *);
3640 static int      sync_inactive(struct  vop_inactive_args *);
3641 static int      sync_reclaim(struct  vop_reclaim_args *);
3642
3643 static struct vop_vector sync_vnodeops = {
3644         .vop_bypass =   VOP_EOPNOTSUPP,
3645         .vop_close =    sync_close,             /* close */
3646         .vop_fsync =    sync_fsync,             /* fsync */
3647         .vop_inactive = sync_inactive,  /* inactive */
3648         .vop_reclaim =  sync_reclaim,   /* reclaim */
3649         .vop_lock1 =    vop_stdlock,    /* lock */
3650         .vop_unlock =   vop_stdunlock,  /* unlock */
3651         .vop_islocked = vop_stdislocked,        /* islocked */
3652 };
3653
3654 /*
3655  * Create a new filesystem syncer vnode for the specified mount point.
3656  */
3657 void
3658 vfs_allocate_syncvnode(struct mount *mp)
3659 {
3660         struct vnode *vp;
3661         struct bufobj *bo;
3662         static long start, incr, next;
3663         int error;
3664
3665         /* Allocate a new vnode */
3666         error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
3667         if (error != 0)
3668                 panic("vfs_allocate_syncvnode: getnewvnode() failed");
3669         vp->v_type = VNON;
3670         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3671         vp->v_vflag |= VV_FORCEINSMQ;
3672         error = insmntque(vp, mp);
3673         if (error != 0)
3674                 panic("vfs_allocate_syncvnode: insmntque() failed");
3675         vp->v_vflag &= ~VV_FORCEINSMQ;
3676         VOP_UNLOCK(vp, 0);
3677         /*
3678          * Place the vnode onto the syncer worklist. We attempt to
3679          * scatter them about on the list so that they will go off
3680          * at evenly distributed times even if all the filesystems
3681          * are mounted at once.
3682          */
3683         next += incr;
3684         if (next == 0 || next > syncer_maxdelay) {
3685                 start /= 2;
3686                 incr /= 2;
3687                 if (start == 0) {
3688                         start = syncer_maxdelay / 2;
3689                         incr = syncer_maxdelay;
3690                 }
3691                 next = start;
3692         }
3693         bo = &vp->v_bufobj;
3694         BO_LOCK(bo);
3695         vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3696         /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3697         mtx_lock(&sync_mtx);
3698         sync_vnode_count++;
3699         if (mp->mnt_syncer == NULL) {
3700                 mp->mnt_syncer = vp;
3701                 vp = NULL;
3702         }
3703         mtx_unlock(&sync_mtx);
3704         BO_UNLOCK(bo);
3705         if (vp != NULL) {
3706                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3707                 vgone(vp);
3708                 vput(vp);
3709         }
3710 }
3711
3712 void
3713 vfs_deallocate_syncvnode(struct mount *mp)
3714 {
3715         struct vnode *vp;
3716
3717         mtx_lock(&sync_mtx);
3718         vp = mp->mnt_syncer;
3719         if (vp != NULL)
3720                 mp->mnt_syncer = NULL;
3721         mtx_unlock(&sync_mtx);
3722         if (vp != NULL)
3723                 vrele(vp);
3724 }
3725
3726 /*
3727  * Do a lazy sync of the filesystem.
3728  */
3729 static int
3730 sync_fsync(struct vop_fsync_args *ap)
3731 {
3732         struct vnode *syncvp = ap->a_vp;
3733         struct mount *mp = syncvp->v_mount;
3734         int error, save;
3735         struct bufobj *bo;
3736
3737         /*
3738          * We only need to do something if this is a lazy evaluation.
3739          */
3740         if (ap->a_waitfor != MNT_LAZY)
3741                 return (0);
3742
3743         /*
3744          * Move ourselves to the back of the sync list.
3745          */
3746         bo = &syncvp->v_bufobj;
3747         BO_LOCK(bo);
3748         vn_syncer_add_to_worklist(bo, syncdelay);
3749         BO_UNLOCK(bo);
3750
3751         /*
3752          * Walk the list of vnodes pushing all that are dirty and
3753          * not already on the sync list.
3754          */
3755         if (vfs_busy(mp, MBF_NOWAIT) != 0)
3756                 return (0);
3757         if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3758                 vfs_unbusy(mp);
3759                 return (0);
3760         }
3761         save = curthread_pflags_set(TDP_SYNCIO);
3762         vfs_msync(mp, MNT_NOWAIT);
3763         error = VFS_SYNC(mp, MNT_LAZY);
3764         curthread_pflags_restore(save);
3765         vn_finished_write(mp);
3766         vfs_unbusy(mp);
3767         return (error);
3768 }
3769
3770 /*
3771  * The syncer vnode is no referenced.
3772  */
3773 static int
3774 sync_inactive(struct vop_inactive_args *ap)
3775 {
3776
3777         vgone(ap->a_vp);
3778         return (0);
3779 }
3780
3781 /*
3782  * The syncer vnode is no longer needed and is being decommissioned.
3783  *
3784  * Modifications to the worklist must be protected by sync_mtx.
3785  */
3786 static int
3787 sync_reclaim(struct vop_reclaim_args *ap)
3788 {
3789         struct vnode *vp = ap->a_vp;
3790         struct bufobj *bo;
3791
3792         bo = &vp->v_bufobj;
3793         BO_LOCK(bo);
3794         mtx_lock(&sync_mtx);
3795         if (vp->v_mount->mnt_syncer == vp)
3796                 vp->v_mount->mnt_syncer = NULL;
3797         if (bo->bo_flag & BO_ONWORKLST) {
3798                 LIST_REMOVE(bo, bo_synclist);
3799                 syncer_worklist_len--;
3800                 sync_vnode_count--;
3801                 bo->bo_flag &= ~BO_ONWORKLST;
3802         }
3803         mtx_unlock(&sync_mtx);
3804         BO_UNLOCK(bo);
3805
3806         return (0);
3807 }
3808
3809 /*
3810  * Check if vnode represents a disk device
3811  */
3812 int
3813 vn_isdisk(struct vnode *vp, int *errp)
3814 {
3815         int error;
3816
3817         error = 0;
3818         dev_lock();
3819         if (vp->v_type != VCHR)
3820                 error = ENOTBLK;
3821         else if (vp->v_rdev == NULL)
3822                 error = ENXIO;
3823         else if (vp->v_rdev->si_devsw == NULL)
3824                 error = ENXIO;
3825         else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3826                 error = ENOTBLK;
3827         dev_unlock();
3828         if (errp != NULL)
3829                 *errp = error;
3830         return (error == 0);
3831 }
3832
3833 /*
3834  * Common filesystem object access control check routine.  Accepts a
3835  * vnode's type, "mode", uid and gid, requested access mode, credentials,
3836  * and optional call-by-reference privused argument allowing vaccess()
3837  * to indicate to the caller whether privilege was used to satisfy the
3838  * request (obsoleted).  Returns 0 on success, or an errno on failure.
3839  */
3840 int
3841 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
3842     accmode_t accmode, struct ucred *cred, int *privused)
3843 {
3844         accmode_t dac_granted;
3845         accmode_t priv_granted;
3846
3847         KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
3848             ("invalid bit in accmode"));
3849         KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
3850             ("VAPPEND without VWRITE"));
3851
3852         /*
3853          * Look for a normal, non-privileged way to access the file/directory
3854          * as requested.  If it exists, go with that.
3855          */
3856
3857         if (privused != NULL)
3858                 *privused = 0;
3859
3860         dac_granted = 0;
3861
3862         /* Check the owner. */
3863         if (cred->cr_uid == file_uid) {
3864                 dac_granted |= VADMIN;
3865                 if (file_mode & S_IXUSR)
3866                         dac_granted |= VEXEC;
3867                 if (file_mode & S_IRUSR)
3868                         dac_granted |= VREAD;
3869                 if (file_mode & S_IWUSR)
3870                         dac_granted |= (VWRITE | VAPPEND);
3871
3872                 if ((accmode & dac_granted) == accmode)
3873                         return (0);
3874
3875                 goto privcheck;
3876         }
3877
3878         /* Otherwise, check the groups (first match) */
3879         if (groupmember(file_gid, cred)) {
3880                 if (file_mode & S_IXGRP)
3881                         dac_granted |= VEXEC;
3882                 if (file_mode & S_IRGRP)
3883                         dac_granted |= VREAD;
3884                 if (file_mode & S_IWGRP)
3885                         dac_granted |= (VWRITE | VAPPEND);
3886
3887                 if ((accmode & dac_granted) == accmode)
3888                         return (0);
3889
3890                 goto privcheck;
3891         }
3892
3893         /* Otherwise, check everyone else. */
3894         if (file_mode & S_IXOTH)
3895                 dac_granted |= VEXEC;
3896         if (file_mode & S_IROTH)
3897                 dac_granted |= VREAD;
3898         if (file_mode & S_IWOTH)
3899                 dac_granted |= (VWRITE | VAPPEND);
3900         if ((accmode & dac_granted) == accmode)
3901                 return (0);
3902
3903 privcheck:
3904         /*
3905          * Build a privilege mask to determine if the set of privileges
3906          * satisfies the requirements when combined with the granted mask
3907          * from above.  For each privilege, if the privilege is required,
3908          * bitwise or the request type onto the priv_granted mask.
3909          */
3910         priv_granted = 0;
3911
3912         if (type == VDIR) {
3913                 /*
3914                  * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
3915                  * requests, instead of PRIV_VFS_EXEC.
3916                  */
3917                 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3918                     !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
3919                         priv_granted |= VEXEC;
3920         } else {
3921                 /*
3922                  * Ensure that at least one execute bit is on. Otherwise,
3923                  * a privileged user will always succeed, and we don't want
3924                  * this to happen unless the file really is executable.
3925                  */
3926                 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3927                     (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
3928                     !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
3929                         priv_granted |= VEXEC;
3930         }
3931
3932         if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
3933             !priv_check_cred(cred, PRIV_VFS_READ, 0))
3934                 priv_granted |= VREAD;
3935
3936         if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3937             !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
3938                 priv_granted |= (VWRITE | VAPPEND);
3939
3940         if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3941             !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
3942                 priv_granted |= VADMIN;
3943
3944         if ((accmode & (priv_granted | dac_granted)) == accmode) {
3945                 /* XXX audit: privilege used */
3946                 if (privused != NULL)
3947                         *privused = 1;
3948                 return (0);
3949         }
3950
3951         return ((accmode & VADMIN) ? EPERM : EACCES);
3952 }
3953
3954 /*
3955  * Credential check based on process requesting service, and per-attribute
3956  * permissions.
3957  */
3958 int
3959 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
3960     struct thread *td, accmode_t accmode)
3961 {
3962
3963         /*
3964          * Kernel-invoked always succeeds.
3965          */
3966         if (cred == NOCRED)
3967                 return (0);
3968
3969         /*
3970          * Do not allow privileged processes in jail to directly manipulate
3971          * system attributes.
3972          */
3973         switch (attrnamespace) {
3974         case EXTATTR_NAMESPACE_SYSTEM:
3975                 /* Potentially should be: return (EPERM); */
3976                 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
3977         case EXTATTR_NAMESPACE_USER:
3978                 return (VOP_ACCESS(vp, accmode, cred, td));
3979         default:
3980                 return (EPERM);
3981         }
3982 }
3983
3984 #ifdef DEBUG_VFS_LOCKS
/*
 * This only exists to suppress warnings from unlocked specfs accesses.  It is
 * no longer ok to have an unlocked VFS.
 */
3989 #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||            \
3990         (vp)->v_type == VCHR || (vp)->v_type == VBAD)
3991
3992 int vfs_badlock_ddb = 1;        /* Drop into debugger on violation. */
3993 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
3994     "Drop into debugger on lock violation");
3995
3996 int vfs_badlock_mutex = 1;      /* Check for interlock across VOPs. */
3997 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
3998     0, "Check for interlock across VOPs");
3999
4000 int vfs_badlock_print = 1;      /* Print lock violations. */
4001 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
4002     0, "Print lock violations");
4003
4004 #ifdef KDB
4005 int vfs_badlock_backtrace = 1;  /* Print backtrace at lock violations. */
4006 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
4007     &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
4008 #endif
4009
4010 static void
4011 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
4012 {
4013
4014 #ifdef KDB
4015         if (vfs_badlock_backtrace)
4016                 kdb_backtrace();
4017 #endif
4018         if (vfs_badlock_print)
4019                 printf("%s: %p %s\n", str, (void *)vp, msg);
4020         if (vfs_badlock_ddb)
4021                 kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4022 }
4023
4024 void
4025 assert_vi_locked(struct vnode *vp, const char *str)
4026 {
4027
4028         if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
4029                 vfs_badlock("interlock is not locked but should be", str, vp);
4030 }
4031
4032 void
4033 assert_vi_unlocked(struct vnode *vp, const char *str)
4034 {
4035
4036         if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
4037                 vfs_badlock("interlock is locked but should not be", str, vp);
4038 }
4039
4040 void
4041 assert_vop_locked(struct vnode *vp, const char *str)
4042 {
4043         int locked;
4044
4045         if (!IGNORE_LOCK(vp)) {
4046                 locked = VOP_ISLOCKED(vp);
4047                 if (locked == 0 || locked == LK_EXCLOTHER)
4048                         vfs_badlock("is not locked but should be", str, vp);
4049         }
4050 }
4051
4052 void
4053 assert_vop_unlocked(struct vnode *vp, const char *str)
4054 {
4055
4056         if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
4057                 vfs_badlock("is locked but should not be", str, vp);
4058 }
4059
4060 void
4061 assert_vop_elocked(struct vnode *vp, const char *str)
4062 {
4063
4064         if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
4065                 vfs_badlock("is not exclusive locked but should be", str, vp);
4066 }
4067
4068 #if 0
4069 void
4070 assert_vop_elocked_other(struct vnode *vp, const char *str)
4071 {
4072
4073         if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
4074                 vfs_badlock("is not exclusive locked by another thread",
4075                     str, vp);
4076 }
4077
4078 void
4079 assert_vop_slocked(struct vnode *vp, const char *str)
4080 {
4081
4082         if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
4083                 vfs_badlock("is not locked shared but should be", str, vp);
4084 }
4085 #endif /* 0 */
4086 #endif /* DEBUG_VFS_LOCKS */
4087
4088 void
4089 vop_rename_fail(struct vop_rename_args *ap)
4090 {
4091
4092         if (ap->a_tvp != NULL)
4093                 vput(ap->a_tvp);
4094         if (ap->a_tdvp == ap->a_tvp)
4095                 vrele(ap->a_tdvp);
4096         else
4097                 vput(ap->a_tdvp);
4098         vrele(ap->a_fdvp);
4099         vrele(ap->a_fvp);
4100 }
4101
4102 void
4103 vop_rename_pre(void *ap)
4104 {
4105         struct vop_rename_args *a = ap;
4106
4107 #ifdef DEBUG_VFS_LOCKS
4108         if (a->a_tvp)
4109                 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
4110         ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
4111         ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
4112         ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
4113
4114         /* Check the source (from). */
4115         if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
4116             (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
4117                 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
4118         if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
4119                 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
4120
4121         /* Check the target. */
4122         if (a->a_tvp)
4123                 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
4124         ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
4125 #endif
4126         if (a->a_tdvp != a->a_fdvp)
4127                 vhold(a->a_fdvp);
4128         if (a->a_tvp != a->a_fvp)
4129                 vhold(a->a_fvp);
4130         vhold(a->a_tdvp);
4131         if (a->a_tvp)
4132                 vhold(a->a_tvp);
4133 }
4134
/*
 * Hook run before VOP_STRATEGY.  Only does work with DEBUG_VFS_LOCKS:
 * reports a buffer that is not locked, optionally printing a message
 * and/or entering the debugger depending on vfs_badlock_print and
 * vfs_badlock_ddb.
 */
void
vop_strategy_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
	struct buf *bp = ((struct vop_strategy_args *)ap)->a_bp;

	/*
	 * Cluster ops lock their component buffers but not the IO container.
	 */
	if ((bp->b_flags & B_CLUSTER) != 0)
		return;

	/* Nothing to report once panicking, or when the buffer is locked. */
	if (panicstr != NULL || BUF_ISLOCKED(bp))
		return;

	if (vfs_badlock_print)
		printf("VOP_STRATEGY: bp is not locked but should be\n");
	if (vfs_badlock_ddb)
		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
#endif
}
4160
/*
 * Hook run before VOP_LOCK.  With DEBUG_VFS_LOCKS, asserts that the vnode
 * interlock is held exactly when the caller passed LK_INTERLOCK.
 */
void
vop_lock_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_lock1_args *args = ap;

	if ((args->a_flags & LK_INTERLOCK) != 0) {
		ASSERT_VI_LOCKED(args->a_vp, "VOP_LOCK");
	} else {
		ASSERT_VI_UNLOCKED(args->a_vp, "VOP_LOCK");
	}
#endif
}
4173
/*
 * Hook run after VOP_LOCK.  With DEBUG_VFS_LOCKS, asserts that the
 * interlock is no longer held and, when the operation succeeded (rc == 0)
 * and LK_EXCLOTHER was not among the flags, that the vnode is locked.
 */
void
vop_lock_post(void *ap, int rc)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_lock1_args *args = ap;

	ASSERT_VI_UNLOCKED(args->a_vp, "VOP_LOCK");
	if (rc != 0 || (args->a_flags & LK_EXCLOTHER) != 0)
		return;
	ASSERT_VOP_LOCKED(args->a_vp, "VOP_LOCK");
#endif
}
4185
/*
 * Hook run before VOP_UNLOCK.  With DEBUG_VFS_LOCKS, asserts that the
 * interlock is held if LK_INTERLOCK was passed and that the vnode lock
 * itself is held.
 */
void
vop_unlock_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_unlock_args *args = ap;

	if ((args->a_flags & LK_INTERLOCK) != 0) {
		ASSERT_VI_LOCKED(args->a_vp, "VOP_UNLOCK");
	}
	ASSERT_VOP_LOCKED(args->a_vp, "VOP_UNLOCK");
#endif
}
4197
/*
 * Hook run after VOP_UNLOCK.  With DEBUG_VFS_LOCKS, asserts that the
 * interlock is not held on return when LK_INTERLOCK was passed.  The
 * result code 'rc' is not examined.
 */
void
vop_unlock_post(void *ap, int rc)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_unlock_args *args = ap;

	if ((args->a_flags & LK_INTERLOCK) != 0) {
		ASSERT_VI_UNLOCKED(args->a_vp, "VOP_UNLOCK");
	}
#endif
}
4208
4209 void
4210 vop_create_post(void *ap, int rc)
4211 {
4212         struct vop_create_args *a = ap;
4213
4214         if (!rc)
4215                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4216 }
4217
4218 void
4219 vop_deleteextattr_post(void *ap, int rc)
4220 {
4221         struct vop_deleteextattr_args *a = ap;
4222
4223         if (!rc)
4224                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4225 }
4226
4227 void
4228 vop_link_post(void *ap, int rc)
4229 {
4230         struct vop_link_args *a = ap;
4231
4232         if (!rc) {
4233                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
4234                 VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
4235         }
4236 }
4237
4238 void
4239 vop_mkdir_post(void *ap, int rc)
4240 {
4241         struct vop_mkdir_args *a = ap;
4242
4243         if (!rc)
4244                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4245 }
4246
4247 void
4248 vop_mknod_post(void *ap, int rc)
4249 {
4250         struct vop_mknod_args *a = ap;
4251
4252         if (!rc)
4253                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4254 }
4255
4256 void
4257 vop_remove_post(void *ap, int rc)
4258 {
4259         struct vop_remove_args *a = ap;
4260
4261         if (!rc) {
4262                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4263                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4264         }
4265 }
4266
4267 void
4268 vop_rename_post(void *ap, int rc)
4269 {
4270         struct vop_rename_args *a = ap;
4271
4272         if (!rc) {
4273                 VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
4274                 VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
4275                 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
4276                 if (a->a_tvp)
4277                         VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
4278         }
4279         if (a->a_tdvp != a->a_fdvp)
4280                 vdrop(a->a_fdvp);
4281         if (a->a_tvp != a->a_fvp)
4282                 vdrop(a->a_fvp);
4283         vdrop(a->a_tdvp);
4284         if (a->a_tvp)
4285                 vdrop(a->a_tvp);
4286 }
4287
4288 void
4289 vop_rmdir_post(void *ap, int rc)
4290 {
4291         struct vop_rmdir_args *a = ap;
4292
4293         if (!rc) {
4294                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4295                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4296         }
4297 }
4298
4299 void
4300 vop_setattr_post(void *ap, int rc)
4301 {
4302         struct vop_setattr_args *a = ap;
4303
4304         if (!rc)
4305                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4306 }
4307
4308 void
4309 vop_setextattr_post(void *ap, int rc)
4310 {
4311         struct vop_setextattr_args *a = ap;
4312
4313         if (!rc)
4314                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4315 }
4316
4317 void
4318 vop_symlink_post(void *ap, int rc)
4319 {
4320         struct vop_symlink_args *a = ap;
4321
4322         if (!rc)
4323                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4324 }
4325
4326 static struct knlist fs_knlist;
4327
4328 static void
4329 vfs_event_init(void *arg)
4330 {
4331         knlist_init_mtx(&fs_knlist, NULL);
4332 }
4333 /* XXX - correct order? */
4334 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
4335
4336 void
4337 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
4338 {
4339
4340         KNOTE_UNLOCKED(&fs_knlist, event);
4341 }
4342
4343 static int      filt_fsattach(struct knote *kn);
4344 static void     filt_fsdetach(struct knote *kn);
4345 static int      filt_fsevent(struct knote *kn, long hint);
4346
4347 struct filterops fs_filtops = {
4348         .f_isfd = 0,
4349         .f_attach = filt_fsattach,
4350         .f_detach = filt_fsdetach,
4351         .f_event = filt_fsevent
4352 };
4353
4354 static int
4355 filt_fsattach(struct knote *kn)
4356 {
4357
4358         kn->kn_flags |= EV_CLEAR;
4359         knlist_add(&fs_knlist, kn, 0);
4360         return (0);
4361 }
4362
4363 static void
4364 filt_fsdetach(struct knote *kn)
4365 {
4366
4367         knlist_remove(&fs_knlist, kn, 0);
4368 }
4369
4370 static int
4371 filt_fsevent(struct knote *kn, long hint)
4372 {
4373
4374         kn->kn_fflags |= hint;
4375         return (kn->kn_fflags != 0);
4376 }
4377
4378 static int
4379 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
4380 {
4381         struct vfsidctl vc;
4382         int error;
4383         struct mount *mp;
4384
4385         error = SYSCTL_IN(req, &vc, sizeof(vc));
4386         if (error)
4387                 return (error);
4388         if (vc.vc_vers != VFS_CTL_VERS1)
4389                 return (EINVAL);
4390         mp = vfs_getvfs(&vc.vc_fsid);
4391         if (mp == NULL)
4392                 return (ENOENT);
4393         /* ensure that a specific sysctl goes to the right filesystem. */
4394         if (strcmp(vc.vc_fstypename, "*") != 0 &&
4395             strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
4396                 vfs_rel(mp);
4397                 return (EINVAL);
4398         }
4399         VCTLTOREQ(&vc, req);
4400         error = VFS_SYSCTL(mp, vc.vc_op, req);
4401         vfs_rel(mp);
4402         return (error);
4403 }
4404
4405 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
4406     NULL, 0, sysctl_vfs_ctl, "",
4407     "Sysctl by fsid");
4408
4409 /*
4410  * Function to initialize a va_filerev field sensibly.
4411  * XXX: Wouldn't a random number make a lot more sense ??
4412  */
4413 u_quad_t
4414 init_va_filerev(void)
4415 {
4416         struct bintime bt;
4417
4418         getbinuptime(&bt);
4419         return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4420 }
4421
4422 static int      filt_vfsread(struct knote *kn, long hint);
4423 static int      filt_vfswrite(struct knote *kn, long hint);
4424 static int      filt_vfsvnode(struct knote *kn, long hint);
4425 static void     filt_vfsdetach(struct knote *kn);
4426 static struct filterops vfsread_filtops = {
4427         .f_isfd = 1,
4428         .f_detach = filt_vfsdetach,
4429         .f_event = filt_vfsread
4430 };
4431 static struct filterops vfswrite_filtops = {
4432         .f_isfd = 1,
4433         .f_detach = filt_vfsdetach,
4434         .f_event = filt_vfswrite
4435 };
4436 static struct filterops vfsvnode_filtops = {
4437         .f_isfd = 1,
4438         .f_detach = filt_vfsdetach,
4439         .f_event = filt_vfsvnode
4440 };
4441
4442 static void
4443 vfs_knllock(void *arg)
4444 {
4445         struct vnode *vp = arg;
4446
4447         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4448 }
4449
/*
 * Knlist unlock callback: release the vnode lock.  'arg' is the vnode.
 */
static void
vfs_knlunlock(void *arg)
{
	struct vnode *vp;

	vp = arg;
	VOP_UNLOCK(vp, 0);
}
4457
/*
 * Knlist callback asserting that the vnode passed as 'arg' is locked.
 * Compiles to an empty function without DEBUG_VFS_LOCKS.
 */
static void
vfs_knl_assert_locked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
	struct vnode *vp;

	vp = arg;
	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
#endif
}
4467
/*
 * Knlist callback asserting that the vnode passed as 'arg' is unlocked.
 * Compiles to an empty function without DEBUG_VFS_LOCKS.
 */
static void
vfs_knl_assert_unlocked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
	struct vnode *vp;

	vp = arg;
	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
#endif
}
4477
4478 int
4479 vfs_kqfilter(struct vop_kqfilter_args *ap)
4480 {
4481         struct vnode *vp = ap->a_vp;
4482         struct knote *kn = ap->a_kn;
4483         struct knlist *knl;
4484
4485         switch (kn->kn_filter) {
4486         case EVFILT_READ:
4487                 kn->kn_fop = &vfsread_filtops;
4488                 break;
4489         case EVFILT_WRITE:
4490                 kn->kn_fop = &vfswrite_filtops;
4491                 break;
4492         case EVFILT_VNODE:
4493                 kn->kn_fop = &vfsvnode_filtops;
4494                 break;
4495         default:
4496                 return (EINVAL);
4497         }
4498
4499         kn->kn_hook = (caddr_t)vp;
4500
4501         v_addpollinfo(vp);
4502         if (vp->v_pollinfo == NULL)
4503                 return (ENOMEM);
4504         knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4505         vhold(vp);
4506         knlist_add(knl, kn, 0);
4507
4508         return (0);
4509 }
4510
4511 /*
4512  * Detach knote from vnode
4513  */
4514 static void
4515 filt_vfsdetach(struct knote *kn)
4516 {
4517         struct vnode *vp = (struct vnode *)kn->kn_hook;
4518
4519         KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4520         knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4521         vdrop(vp);
4522 }
4523
4524 /*ARGSUSED*/
4525 static int
4526 filt_vfsread(struct knote *kn, long hint)
4527 {
4528         struct vnode *vp = (struct vnode *)kn->kn_hook;
4529         struct vattr va;
4530         int res;
4531
4532         /*
4533          * filesystem is gone, so set the EOF flag and schedule
4534          * the knote for deletion.
4535          */
4536         if (hint == NOTE_REVOKE) {
4537                 VI_LOCK(vp);
4538                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4539                 VI_UNLOCK(vp);
4540                 return (1);
4541         }
4542
4543         if (VOP_GETATTR(vp, &va, curthread->td_ucred))
4544                 return (0);
4545
4546         VI_LOCK(vp);
4547         kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4548         res = (kn->kn_data != 0);
4549         VI_UNLOCK(vp);
4550         return (res);
4551 }
4552
4553 /*ARGSUSED*/
4554 static int
4555 filt_vfswrite(struct knote *kn, long hint)
4556 {
4557         struct vnode *vp = (struct vnode *)kn->kn_hook;
4558
4559         VI_LOCK(vp);
4560
4561         /*
4562          * filesystem is gone, so set the EOF flag and schedule
4563          * the knote for deletion.
4564          */
4565         if (hint == NOTE_REVOKE)
4566                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4567
4568         kn->kn_data = 0;
4569         VI_UNLOCK(vp);
4570         return (1);
4571 }
4572
4573 static int
4574 filt_vfsvnode(struct knote *kn, long hint)
4575 {
4576         struct vnode *vp = (struct vnode *)kn->kn_hook;
4577         int res;
4578
4579         VI_LOCK(vp);
4580         if (kn->kn_sfflags & hint)
4581                 kn->kn_fflags |= hint;
4582         if (hint == NOTE_REVOKE) {
4583                 kn->kn_flags |= EV_EOF;
4584                 VI_UNLOCK(vp);
4585                 return (1);
4586         }
4587         res = (kn->kn_fflags != 0);
4588         VI_UNLOCK(vp);
4589         return (res);
4590 }
4591
4592 int
4593 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4594 {
4595         int error;
4596
4597         if (dp->d_reclen > ap->a_uio->uio_resid)
4598                 return (ENAMETOOLONG);
4599         error = uiomove(dp, dp->d_reclen, ap->a_uio);
4600         if (error) {
4601                 if (ap->a_ncookies != NULL) {
4602                         if (ap->a_cookies != NULL)
4603                                 free(ap->a_cookies, M_TEMP);
4604                         ap->a_cookies = NULL;
4605                         *ap->a_ncookies = 0;
4606                 }
4607                 return (error);
4608         }
4609         if (ap->a_ncookies == NULL)
4610                 return (0);
4611
4612         KASSERT(ap->a_cookies,
4613             ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4614
4615         *ap->a_cookies = realloc(*ap->a_cookies,
4616             (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4617         (*ap->a_cookies)[*ap->a_ncookies] = off;
4618         return (0);
4619 }
4620
4621 /*
4622  * Mark for update the access time of the file if the filesystem
4623  * supports VOP_MARKATIME.  This functionality is used by execve and
4624  * mmap, so we want to avoid the I/O implied by directly setting
4625  * va_atime for the sake of efficiency.
4626  */
4627 void
4628 vfs_mark_atime(struct vnode *vp, struct ucred *cred)
4629 {
4630         struct mount *mp;
4631
4632         mp = vp->v_mount;
4633         ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
4634         if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
4635                 (void)VOP_MARKATIME(vp);
4636 }
4637
4638 /*
4639  * The purpose of this routine is to remove granularity from accmode_t,
4640  * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
4641  * VADMIN and VAPPEND.
4642  *
4643  * If it returns 0, the caller is supposed to continue with the usual
4644  * access checks using 'accmode' as modified by this routine.  If it
4645  * returns nonzero value, the caller is supposed to return that value
4646  * as errno.
4647  *
4648  * Note that after this routine runs, accmode may be zero.
4649  */
4650 int
4651 vfs_unixify_accmode(accmode_t *accmode)
4652 {
4653         /*
4654          * There is no way to specify explicit "deny" rule using
4655          * file mode or POSIX.1e ACLs.
4656          */
4657         if (*accmode & VEXPLICIT_DENY) {
4658                 *accmode = 0;
4659                 return (0);
4660         }
4661
4662         /*
4663          * None of these can be translated into usual access bits.
4664          * Also, the common case for NFSv4 ACLs is to not contain
4665          * either of these bits. Caller should check for VWRITE
4666          * on the containing directory instead.
4667          */
4668         if (*accmode & (VDELETE_CHILD | VDELETE))
4669                 return (EPERM);
4670
4671         if (*accmode & VADMIN_PERMS) {
4672                 *accmode &= ~VADMIN_PERMS;
4673                 *accmode |= VADMIN;
4674         }
4675
4676         /*
4677          * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
4678          * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
4679          */
4680         *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
4681
4682         return (0);
4683 }
4684
4685 /*
4686  * These are helper functions for filesystems to traverse all
4687  * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
4688  *
4689  * This interface replaces MNT_VNODE_FOREACH.
4690  */
4691
4692 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
4693
4694 struct vnode *
4695 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
4696 {
4697         struct vnode *vp;
4698
4699         if (should_yield())
4700                 kern_yield(PRI_USER);
4701         MNT_ILOCK(mp);
4702         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4703         vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
4704         while (vp != NULL && (vp->v_type == VMARKER ||
4705             (vp->v_iflag & VI_DOOMED) != 0))
4706                 vp = TAILQ_NEXT(vp, v_nmntvnodes);
4707
4708         /* Check if we are done */
4709         if (vp == NULL) {
4710                 __mnt_vnode_markerfree_all(mvp, mp);
4711                 /* MNT_IUNLOCK(mp); -- done in above function */
4712                 mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
4713                 return (NULL);
4714         }
4715         TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4716         TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4717         VI_LOCK(vp);
4718         MNT_IUNLOCK(mp);
4719         return (vp);
4720 }
4721
4722 struct vnode *
4723 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
4724 {
4725         struct vnode *vp;
4726
4727         *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4728         MNT_ILOCK(mp);
4729         MNT_REF(mp);
4730         (*mvp)->v_type = VMARKER;
4731
4732         vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
4733         while (vp != NULL && (vp->v_type == VMARKER ||
4734             (vp->v_iflag & VI_DOOMED) != 0))
4735                 vp = TAILQ_NEXT(vp, v_nmntvnodes);
4736
4737         /* Check if we are done */
4738         if (vp == NULL) {
4739                 MNT_REL(mp);
4740                 MNT_IUNLOCK(mp);
4741                 free(*mvp, M_VNODE_MARKER);
4742                 *mvp = NULL;
4743                 return (NULL);
4744         }
4745         (*mvp)->v_mount = mp;
4746         TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4747         VI_LOCK(vp);
4748         MNT_IUNLOCK(mp);
4749         return (vp);
4750 }
4751
4752
4753 void
4754 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
4755 {
4756
4757         if (*mvp == NULL) {
4758                 MNT_IUNLOCK(mp);
4759                 return;
4760         }
4761
4762         mtx_assert(MNT_MTX(mp), MA_OWNED);
4763
4764         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4765         TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4766         MNT_REL(mp);
4767         MNT_IUNLOCK(mp);
4768         free(*mvp, M_VNODE_MARKER);
4769         *mvp = NULL;
4770 }
4771
4772 /*
4773  * These are helper functions for filesystems to traverse their
4774  * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
4775  */
4776 static void
4777 mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4778 {
4779
4780         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4781
4782         MNT_ILOCK(mp);
4783         MNT_REL(mp);
4784         MNT_IUNLOCK(mp);
4785         free(*mvp, M_VNODE_MARKER);
4786         *mvp = NULL;
4787 }
4788
4789 static struct vnode *
4790 mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4791 {
4792         struct vnode *vp, *nvp;
4793
4794         mtx_assert(&vnode_free_list_mtx, MA_OWNED);
4795         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4796 restart:
4797         vp = TAILQ_NEXT(*mvp, v_actfreelist);
4798         TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4799         while (vp != NULL) {
4800                 if (vp->v_type == VMARKER) {
4801                         vp = TAILQ_NEXT(vp, v_actfreelist);
4802                         continue;
4803                 }
4804                 if (!VI_TRYLOCK(vp)) {
4805                         if (mp_ncpus == 1 || should_yield()) {
4806                                 TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4807                                 mtx_unlock(&vnode_free_list_mtx);
4808                                 pause("vnacti", 1);
4809                                 mtx_lock(&vnode_free_list_mtx);
4810                                 goto restart;
4811                         }
4812                         continue;
4813                 }
4814                 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
4815                 KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
4816                     ("alien vnode on the active list %p %p", vp, mp));
4817                 if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
4818                         break;
4819                 nvp = TAILQ_NEXT(vp, v_actfreelist);
4820                 VI_UNLOCK(vp);
4821                 vp = nvp;
4822         }
4823
4824         /* Check if we are done */
4825         if (vp == NULL) {
4826                 mtx_unlock(&vnode_free_list_mtx);
4827                 mnt_vnode_markerfree_active(mvp, mp);
4828                 return (NULL);
4829         }
4830         TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
4831         mtx_unlock(&vnode_free_list_mtx);
4832         ASSERT_VI_LOCKED(vp, "active iter");
4833         KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
4834         return (vp);
4835 }
4836
4837 struct vnode *
4838 __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4839 {
4840
4841         if (should_yield())
4842                 kern_yield(PRI_USER);
4843         mtx_lock(&vnode_free_list_mtx);
4844         return (mnt_vnode_next_active(mvp, mp));
4845 }
4846
4847 struct vnode *
4848 __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
4849 {
4850         struct vnode *vp;
4851
4852         *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4853         MNT_ILOCK(mp);
4854         MNT_REF(mp);
4855         MNT_IUNLOCK(mp);
4856         (*mvp)->v_type = VMARKER;
4857         (*mvp)->v_mount = mp;
4858
4859         mtx_lock(&vnode_free_list_mtx);
4860         vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
4861         if (vp == NULL) {
4862                 mtx_unlock(&vnode_free_list_mtx);
4863                 mnt_vnode_markerfree_active(mvp, mp);
4864                 return (NULL);
4865         }
4866         TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4867         return (mnt_vnode_next_active(mvp, mp));
4868 }
4869
4870 void
4871 __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4872 {
4873
4874         if (*mvp == NULL)
4875                 return;
4876
4877         mtx_lock(&vnode_free_list_mtx);
4878         TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4879         mtx_unlock(&vnode_free_list_mtx);
4880         mnt_vnode_markerfree_active(mvp, mp);
4881 }