sys/kern/vfs_subr.c

   1 /*-
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 4. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
  35  */
  36
  37 /*
  38  * External virtual filesystem routines
  39  */
  40
  41 #include <sys/cdefs.h>
  42 __FBSDID("$FreeBSD$");
  43
  44 #include "opt_ddb.h"
  45 #include "opt_mac.h"
  46
  47 #include <sys/param.h>
  48 #include <sys/systm.h>
  49 #include <sys/bio.h>
  50 #include <sys/buf.h>
  51 #include <sys/conf.h>
  52 #include <sys/dirent.h>
  53 #include <sys/event.h>
  54 #include <sys/eventhandler.h>
  55 #include <sys/extattr.h>
  56 #include <sys/file.h>
  57 #include <sys/fcntl.h>
  58 #include <sys/jail.h>
  59 #include <sys/kdb.h>
  60 #include <sys/kernel.h>
  61 #include <sys/kthread.h>
  62 #include <sys/lockf.h>
  63 #include <sys/malloc.h>
  64 #include <sys/mount.h>
  65 #include <sys/namei.h>
  66 #include <sys/priv.h>
  67 #include <sys/reboot.h>
  68 #include <sys/sleepqueue.h>
  69 #include <sys/stat.h>
  70 #include <sys/sysctl.h>
  71 #include <sys/syslog.h>
  72 #include <sys/vmmeter.h>
  73 #include <sys/vnode.h>
  74
  75 #include <machine/stdarg.h>
  76
  77 #include <security/mac/mac_framework.h>
  78
  79 #include <vm/vm.h>
  80 #include <vm/vm_object.h>
  81 #include <vm/vm_extern.h>
  82 #include <vm/pmap.h>
  83 #include <vm/vm_map.h>
  84 #include <vm/vm_page.h>
  85 #include <vm/vm_kern.h>
  86 #include <vm/uma.h>
  87
  88 #ifdef DDB
  89 #include <ddb/ddb.h>
  90 #endif
  91
  92 #define WI_MPSAFEQ      0       /* index into syncer_workitem_pending[]; presumably the queue for MPSAFE filesystems */
  93 #define WI_GIANTQ       1       /* index into syncer_workitem_pending[]; presumably the queue for Giant-locked filesystems */
  94
  95 static MALLOC_DEFINE(M_NETADDR, "subr_export_host", "Export host address structure");
  96 /* Forward declarations of helpers private to this file. */
  97 static void     delmntque(struct vnode *vp);
  98 static int      flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
  99                     int slpflag, int slptimeo);
 100 static void     syncer_shutdown(void *arg, int howto);
 101 static int      vtryrecycle(struct vnode *vp);
 102 static void     vbusy(struct vnode *vp);
 103 static void     vinactive(struct vnode *, struct thread *);
 104 static void     v_incr_usecount(struct vnode *);
 105 static void     v_decr_usecount(struct vnode *);
 106 static void     v_decr_useonly(struct vnode *);
 107 static void     v_upgrade_usecount(struct vnode *);
 108 static void     vfree(struct vnode *);
 109 static void     vnlru_free(int);
 110 static void     vdestroy(struct vnode *);
 111 static void     vgonel(struct vnode *);
 112 static void     vfs_knllock(void *arg);
 113 static void     vfs_knlunlock(void *arg);
 114 static int      vfs_knllocked(void *arg);
 115
 116
 117 /*
 118  * Enable Giant pushdown based on whether or not the vm is mpsafe in this
 119  * build.  Without mpsafevm the buffer cache can not run Giant free.
      *
      * Defaults to 1.  The value can be preset through the debug.mpsafevfs
      * tunable and is exported read-only as the debug.mpsafevfs sysctl.
 120  */
 121 int mpsafe_vfs = 1;
 122 TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs);
 123 SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0,
 124     "MPSAFE VFS");
 125
 126 /*
 127  * Number of vnodes in existence.  Increased whenever getnewvnode()
 128  * allocates a new vnode, decreased when vdestroy() is called on a
 129  * VI_DOOMED vnode.
      *
      * Protected by vnode_free_list_mtx and exported read-only as the
      * vfs.numvnodes sysctl.
 130  */
 131 static unsigned long    numvnodes;
 132
 133 SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
 134
 135 /*
 136  * Conversion tables for conversion from vnode types to inode formats
 137  * and back.
      *
      * iftovt_tab has 16 entries yielding an enum vtype; vttoif_tab has 10
      * entries yielding S_IF* file-format bits.
      * NOTE(review): presumably iftovt_tab is indexed by the S_IFMT bits of a
      * mode shifted down to 0..15 and vttoif_tab by enum vtype -- confirm
      * against the macros that use these tables.
 138  */
 139 enum vtype iftovt_tab[16] = {
 140         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 141         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
 142 };
 143 int vttoif_tab[10] = {
 144         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 145         S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
 146 };
 147
 148 /*
 149  * List of vnodes that are ready for recycling.
      * Initialized in vntblinit() and protected by vnode_free_list_mtx.
 150  */
 151 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
 152
 153 /*
 154  * Free vnode target.  Free vnodes may simply be files which have been stat'd
 155  * but not read.  This is somewhat common, and a small cache of such files
 156  * should be kept to avoid recreation costs.
      *
      * vntblinit() sets this to desiredvnodes / 4.  It is writable as the
      * vfs.wantfreevnodes sysctl and is also published later in this file
      * under the legacy name kern.minvnodes.
 157  */
 158 static u_long wantfreevnodes;
 159 SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
 160 /* Number of vnodes in the free list; protected by vnode_free_list_mtx. */
 161 static u_long freevnodes;
 162 SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
 163
 164 /*
 165  * Various variables used for debugging the new implementation of
 166  * reassignbuf().
 167  * XXX these are probably of (very) limited utility now.
 168  */
 169 static int reassignbufcalls;
 170 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
 171
 172 /*
 173  * Cache for the mount type id assigned to NFS.  This is used for
 174  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
      * Starts out as -1.
 175  */
 176 int     nfs_mount_type = -1;
 177
 178 /* Keeps more than one thread at a time from running vfs_getnewfsid(). */
 179 static struct mtx mntid_mtx;
 180
 181 /*
 182  * Lock for any access to the following:
 183  *      vnode_free_list
 184  *      numvnodes
 185  *      freevnodes
 186  */
 187 static struct mtx vnode_free_list_mtx;
 188
 189 /* Publicly exported FS */
 190 struct nfs_public nfs_pub;
 191
 192 /*
      * Zones created in vntblinit().  vnode_zone backs struct vnode (new
      * vnodes are allocated from it by getnewvnode() and returned to it by
      * vdestroy()); vnodepoll_zone backs struct vpollinfo.
      */
 193 static uma_zone_t vnode_zone;
 194 static uma_zone_t vnodepoll_zone;
 195
 196 /* Set to 1 to print out reclaim of active vnodes */
 197 int     prtactive;
 198
 199 /*
 200  * The workitem queue.
 201  *
 202  * It is useful to delay writes of file data and filesystem metadata
 203  * for tens of seconds so that quickly created and deleted files need
 204  * not waste disk bandwidth being created and removed. To realize this,
 205  * we append vnodes to a "workitem" queue. When running with a soft
 206  * updates implementation, most pending metadata dependencies should
 207  * not wait for more than a few seconds. Thus, updates to mounted block
 208  * devices are delayed only about half the time that file data is delayed.
 209  * Similarly, directory updates are more critical, so are only delayed
 210  * about a third the time that file data is delayed. Thus, there are
 211  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 212  * one each second (driven off the filesystem syncer process). The
 213  * syncer_delayno variable indicates the next queue that is to be processed.
 214  * Items that need to be processed soon are placed in this queue:
 215  *
 216  *      syncer_workitem_pending[syncer_delayno]
 217  *
 218  * A delay of fifteen seconds is done by placing the request fifteen
 219  * entries later in the queue:
 220  *
 221  *      syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 222  *
      * In this file syncer_workitem_pending[] itself has two entries, indexed
      * by WI_MPSAFEQ and WI_GIANTQ; each points to its own table of lists of
      * struct bufobj allocated by hashinit() in vntblinit(), so the slot
      * expressions above select a list within one of those tables.
      *
      * NOTE(review): the "half" and "a third" ratios above do not match the
      * default filedelay/dirdelay/metadelay values defined below (30, 29 and
      * 28) -- confirm which is intended.
 223  */
 224 static int syncer_delayno;
 225 static long syncer_mask;
 226 LIST_HEAD(synclist, bufobj);
 227 static struct synclist *syncer_workitem_pending[2];
 228 /*
 229  * The sync_mtx protects:
 230  *      bo->bo_synclist
 231  *      sync_vnode_count
 232  *      syncer_delayno
 233  *      syncer_state
 234  *      syncer_workitem_pending
 235  *      syncer_worklist_len
 236  *      rushjob
 237  */
 238 static struct mtx sync_mtx;
 239
 240 #define SYNCER_MAXDELAY         32
 241 static int syncer_maxdelay = SYNCER_MAXDELAY;   /* maximum delay time; reset to syncer_mask + 1 in vntblinit() */
 242 static int syncdelay = 30;              /* max time to delay syncing data */
 243 static int filedelay = 30;              /* time to delay syncing files */
 244 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
 245 static int dirdelay = 29;               /* time to delay syncing directories */
 246 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
 247 static int metadelay = 28;              /* time to delay syncing metadata */
 248 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
 249 static int rushjob;             /* number of slots to run ASAP */
 250 static int stat_rush_requests;  /* number of times I/O speeded up */
 251 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
 252
 253 /*
 254  * When shutting down the syncer, run it at four times normal speed.
 255  */
 256 #define SYNCER_SHUTDOWN_SPEEDUP         4
 257 static int sync_vnode_count;
 258 static int syncer_worklist_len;
 259 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
 260     syncer_state;
 261
 262 /*
 263  * Number of vnodes we want to exist at any one time.  This is mostly used
 264  * to size hash tables in vnode-related code.  It is normally not used in
 265  * getnewvnode(), as wantfreevnodes is normally nonzero.
 266  *
 267  * XXX desiredvnodes is historical cruft and should not exist.
      *
      * Computed in vntblinit() and writable afterwards as kern.maxvnodes.
      *
      * NOTE(review): kern.minvnodes below is declared with SYSCTL_INT but
      * points at wantfreevnodes, which is declared u_long earlier in this
      * file; on platforms where long is wider than int the handler would
      * access only part of the variable -- confirm and consider a long-sized
      * sysctl.
 268  */
 269 int desiredvnodes;
 270 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
 271     &desiredvnodes, 0, "Maximum number of vnodes");
 272 SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
 273     &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
 274 static int vnlru_nowhere;       /* bumped by vnlru_proc() after a pass that reclaimed nothing */
 275 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
 276     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
 277
 278 /*
 279  * Macros to control when a vnode is freed and recycled.  All require
 280  * the vnode interlock.
      *
      *      VCANRECYCLE: VI_FREE is set and the hold count is zero.
      *      VSHOULDFREE: VI_FREE is clear and the hold count is zero.
      *      VSHOULDBUSY: VI_FREE is set and the hold count is nonzero.
 281  */
 282 #define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
 283 #define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
 284 #define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
 285
 286
 287 /*
 288  * Initialize the vnode management data structures.
 289  */
 290 #ifndef MAXVNODES_MAX
 291 #define MAXVNODES_MAX   100000
 292 #endif
 293 static void
 294 vntblinit(void *dummy __unused)
 295 {
 296
 297         /*
 298          * Desiredvnodes is a function of the physical memory size and
 299          * the kernel's heap size.  Specifically, desiredvnodes scales
 300          * in proportion to the physical memory size until two fifths
 301          * of the kernel's heap size is consumed by vnodes and vm
 302          * objects.
 303          */
 304         desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
 305             (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
 306         if (desiredvnodes > MAXVNODES_MAX) {
 307                 if (bootverbose)
 308                         printf("Reducing kern.maxvnodes %d -> %d\n",
 309                             desiredvnodes, MAXVNODES_MAX);
 310                 desiredvnodes = MAXVNODES_MAX;
 311         }
 312         wantfreevnodes = desiredvnodes / 4;
 313         mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
 314         TAILQ_INIT(&vnode_free_list);
 315         mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
 316         vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
 317             NULL, NULL, UMA_ALIGN_PTR, 0);
 318         vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 319             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 320         /*
 321          * Initialize the filesystem syncer.
 322          */
 323         syncer_workitem_pending[WI_MPSAFEQ] = hashinit(syncer_maxdelay, M_VNODE,
 324             &syncer_mask);
 325         syncer_workitem_pending[WI_GIANTQ] = hashinit(syncer_maxdelay, M_VNODE,
 326             &syncer_mask);
 327         syncer_maxdelay = syncer_mask + 1;
 328         mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
 329 }
 330 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
 331
 332
 333 /*
 334  * Mark a mount point as busy. Used to synchronize access and to delay
 335  * unmounting. Interlock is not released on failure.
 336  */
 337 int
 338 vfs_busy(struct mount *mp, int flags, struct mtx *interlkp,
 339     struct thread *td)
 340 {
 341         int lkflags;
 342
 343         MNT_ILOCK(mp);
 344         MNT_REF(mp);
 345         if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 346                 if (flags & LK_NOWAIT) {
 347                         MNT_REL(mp);
 348                         MNT_IUNLOCK(mp);
 349                         return (ENOENT);
 350                 }
 351                 if (interlkp)
 352                         mtx_unlock(interlkp);
 353                 mp->mnt_kern_flag |= MNTK_MWAIT;
 354                 /*
 355                  * Since all busy locks are shared except the exclusive
 356                  * lock granted when unmounting, the only place that a
 357                  * wakeup needs to be done is at the release of the
 358                  * exclusive lock at the end of dounmount.
 359                  */
 360                 msleep(mp, MNT_MTX(mp), PVFS, "vfs_busy", 0);
 361                 MNT_REL(mp);
 362                 MNT_IUNLOCK(mp);
 363                 if (interlkp)
 364                         mtx_lock(interlkp);
 365                 return (ENOENT);
 366         }
 367         if (interlkp)
 368                 mtx_unlock(interlkp);
 369         lkflags = LK_SHARED | LK_INTERLOCK;
 370         if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp)))
 371                 panic("vfs_busy: unexpected lock failure");
 372         return (0);
 373 }
 374
 375 /*
 376  * Free a busy filesystem.
 377  */
 378 void
 379 vfs_unbusy(struct mount *mp, struct thread *td)
 380 {
 381
 382         lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
 383         vfs_rel(mp);
 384 }
 385
 386 /*
 387  * Lookup a mount point by filesystem identifier.
 388  */
 389 struct mount *
 390 vfs_getvfs(fsid_t *fsid)
 391 {
 392         struct mount *mp;
 393
 394         mtx_lock(&mountlist_mtx);
 395         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 396                 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
 397                     mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
 398                         vfs_ref(mp);
 399                         mtx_unlock(&mountlist_mtx);
 400                         return (mp);
 401                 }
 402         }
 403         mtx_unlock(&mountlist_mtx);
 404         return ((struct mount *) 0);
 405 }
 406
 407 /*
 408  * Check if a user can access privileged mount options.
 409  */
 410 int
 411 vfs_suser(struct mount *mp, struct thread *td)
 412 {
 413         int error;
 414
 415         /*
 416          * If the thread is jailed, but this is not a jail-friendly file
 417          * system, deny immediately.
 418          */
 419         if (jailed(td->td_ucred) && !(mp->mnt_vfc->vfc_flags & VFCF_JAIL))
 420                 return (EPERM);
 421
 422         /*
 423          * If the file system was mounted outside a jail and a jailed thread
 424          * tries to access it, deny immediately.
 425          */
 426         if (!jailed(mp->mnt_cred) && jailed(td->td_ucred))
 427                 return (EPERM);
 428
 429         /*
 430          * If the file system was mounted inside different jail that the jail of
 431          * the calling thread, deny immediately.
 432          */
 433         if (jailed(mp->mnt_cred) && jailed(td->td_ucred) &&
 434             mp->mnt_cred->cr_prison != td->td_ucred->cr_prison) {
 435                 return (EPERM);
 436         }
 437
 438         if ((mp->mnt_flag & MNT_USER) == 0 ||
 439             mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
 440                 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
 441                         return (error);
 442         }
 443         return (0);
 444 }
 445
 446 /*
 447  * Get a new unique fsid.  Try to make its val[0] unique, since this value
 448  * will be used to create fake device numbers for stat().  Also try (but
 449  * not so hard) make its val[0] unique mod 2^16, since some emulators only
 450  * support 16-bit device numbers.  We end up with unique val[0]'s for the
 451  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 452  *
 453  * Keep in mind that several mounts may be running in parallel.  Starting
 454  * the search one past where the previous search terminated is both a
 455  * micro-optimization and a defense against returning the same fsid to
 456  * different mounts.
 457  */
 458 void
 459 vfs_getnewfsid(struct mount *mp)
 460 {
 461         static u_int16_t mntid_base;
 462         struct mount *nmp;
 463         fsid_t tfsid;
 464         int mtype;
 465
 466         mtx_lock(&mntid_mtx);
 467         mtype = mp->mnt_vfc->vfc_typenum;
 468         tfsid.val[1] = mtype;
 469         mtype = (mtype & 0xFF) << 24;
 470         for (;;) {
 471                 tfsid.val[0] = makedev(255,
 472                     mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
 473                 mntid_base++;
 474                 if ((nmp = vfs_getvfs(&tfsid)) == NULL)
 475                         break;
 476                 vfs_rel(nmp);
 477         }
 478         mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
 479         mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
 480         mtx_unlock(&mntid_mtx);
 481 }
 482
 483 /*
 484  * Knob to control the precision of file timestamps:
 485  *
 486  *   0 = seconds only; nanoseconds zeroed.
 487  *   1 = seconds and nanoseconds, accurate within 1/HZ.
 488  *   2 = seconds and nanoseconds, truncated to microseconds.
 489  * >=3 = seconds and nanoseconds, maximum precision.
 490  */
 491 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 492
 493 static int timestamp_precision = TSP_SEC;
 494 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
 495     &timestamp_precision, 0, "");
 496
 497 /*
 498  * Get a current timestamp.
 499  */
 500 void
 501 vfs_timestamp(struct timespec *tsp)
 502 {
 503         struct timeval tv;
 504
 505         switch (timestamp_precision) {
 506         case TSP_SEC:
 507                 tsp->tv_sec = time_second;
 508                 tsp->tv_nsec = 0;
 509                 break;
 510         case TSP_HZ:
 511                 getnanotime(tsp);
 512                 break;
 513         case TSP_USEC:
 514                 microtime(&tv);
 515                 TIMEVAL_TO_TIMESPEC(&tv, tsp);
 516                 break;
 517         case TSP_NSEC:
 518         default:
 519                 nanotime(tsp);
 520                 break;
 521         }
 522 }
 523
 524 /*
 525  * Set vnode attributes to VNOVAL
 526  */
 527 void
 528 vattr_null(struct vattr *vap)
 529 {
 530
 531         vap->va_type = VNON;
 532         vap->va_size = VNOVAL;
 533         vap->va_bytes = VNOVAL;
 534         vap->va_mode = VNOVAL;
 535         vap->va_nlink = VNOVAL;
 536         vap->va_uid = VNOVAL;
 537         vap->va_gid = VNOVAL;
 538         vap->va_fsid = VNOVAL;
 539         vap->va_fileid = VNOVAL;
 540         vap->va_blocksize = VNOVAL;
 541         vap->va_rdev = VNOVAL;
 542         vap->va_atime.tv_sec = VNOVAL;
 543         vap->va_atime.tv_nsec = VNOVAL;
 544         vap->va_mtime.tv_sec = VNOVAL;
 545         vap->va_mtime.tv_nsec = VNOVAL;
 546         vap->va_ctime.tv_sec = VNOVAL;
 547         vap->va_ctime.tv_nsec = VNOVAL;
 548         vap->va_birthtime.tv_sec = VNOVAL;
 549         vap->va_birthtime.tv_nsec = VNOVAL;
 550         vap->va_flags = VNOVAL;
 551         vap->va_gen = VNOVAL;
 552         vap->va_vaflags = 0;
 553 }
 554
 555 /*
 556  * This routine is called when we have too many vnodes.  It attempts
 557  * to free <count> vnodes and will potentially free vnodes that still
 558  * have VM backing store (VM backing store is typically the cause
 559  * of a vnode blowout so we want to do this).  Therefore, this operation
 560  * is not considered cheap.
 561  *
 562  * A number of conditions may prevent a vnode from being reclaimed.
 563  * the buffer cache may have references on the vnode, a directory
 564  * vnode may still have references due to the namei cache representing
 565  * underlying files, or the vnode may be in active use.   It is not
 566  * desireable to reuse such vnodes.  These conditions may cause the
 567  * number of vnodes to reach some minimum value regardless of what
 568  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 569  */
 570 static int
 571 vlrureclaim(struct mount *mp)
 572 {
 573         struct thread *td;
 574         struct vnode *vp;
 575         int done;
 576         int trigger;
 577         int usevnodes;
 578         int count;
 579
 580         /*
 581          * Calculate the trigger point, don't allow user
 582          * screwups to blow us up.   This prevents us from
 583          * recycling vnodes with lots of resident pages.  We
 584          * aren't trying to free memory, we are trying to
 585          * free vnodes.
 586          */
 587         usevnodes = desiredvnodes;
 588         if (usevnodes <= 0)
 589                 usevnodes = 1;
 590         trigger = cnt.v_page_count * 2 / usevnodes;
 591         done = 0;
 592         td = curthread;
 593         vn_start_write(NULL, &mp, V_WAIT);
 594         MNT_ILOCK(mp);
 595         count = mp->mnt_nvnodelistsize / 10 + 1;
 596         while (count != 0) {
 597                 vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
 598                 while (vp != NULL && vp->v_type == VMARKER)
 599                         vp = TAILQ_NEXT(vp, v_nmntvnodes);
 600                 if (vp == NULL)
 601                         break;
 602                 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 603                 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 604                 --count;
 605                 if (!VI_TRYLOCK(vp))
 606                         goto next_iter;
 607                 /*
 608                  * If it's been deconstructed already, it's still
 609                  * referenced, or it exceeds the trigger, skip it.
 610                  */
 611                 if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
 612                     (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
 613                     vp->v_object->resident_page_count > trigger)) {
 614                         VI_UNLOCK(vp);
 615                         goto next_iter;
 616                 }
 617                 MNT_IUNLOCK(mp);
 618                 vholdl(vp);
 619                 if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
 620                         vdrop(vp);
 621                         goto next_iter_mntunlocked;
 622                 }
 623                 VI_LOCK(vp);
 624                 /*
 625                  * v_usecount may have been bumped after VOP_LOCK() dropped
 626                  * the vnode interlock and before it was locked again.
 627                  *
 628                  * It is not necessary to recheck VI_DOOMED because it can
 629                  * only be set by another thread that holds both the vnode
 630                  * lock and vnode interlock.  If another thread has the
 631                  * vnode lock before we get to VOP_LOCK() and obtains the
 632                  * vnode interlock after VOP_LOCK() drops the vnode
 633                  * interlock, the other thread will be unable to drop the
 634                  * vnode lock before our VOP_LOCK() call fails.
 635                  */
 636                 if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
 637                     (vp->v_object != NULL &&
 638                     vp->v_object->resident_page_count > trigger)) {
 639                         VOP_UNLOCK(vp, LK_INTERLOCK);
 640                         goto next_iter_mntunlocked;
 641                 }
 642                 KASSERT((vp->v_iflag & VI_DOOMED) == 0,
 643                     ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
 644                 vgonel(vp);
 645                 VOP_UNLOCK(vp, 0);
 646                 vdropl(vp);
 647                 done++;
 648 next_iter_mntunlocked:
 649                 if ((count % 256) != 0)
 650                         goto relock_mnt;
 651                 goto yield;
 652 next_iter:
 653                 if ((count % 256) != 0)
 654                         continue;
 655                 MNT_IUNLOCK(mp);
 656 yield:
 657                 uio_yield();
 658 relock_mnt:
 659                 MNT_ILOCK(mp);
 660         }
 661         MNT_IUNLOCK(mp);
 662         vn_finished_write(mp);
 663         return done;
 664 }
 665
 666 /*
 667  * Attempt to keep the free list at wantfreevnodes length.
 668  */
 669 static void
 670 vnlru_free(int count)
 671 {
 672         struct vnode *vp;
 673         int vfslocked;
 674
 675         mtx_assert(&vnode_free_list_mtx, MA_OWNED);
 676         for (; count > 0; count--) {
 677                 vp = TAILQ_FIRST(&vnode_free_list);
 678                 /*
 679                  * The list can be modified while the free_list_mtx
 680                  * has been dropped and vp could be NULL here.
 681                  */
 682                 if (!vp)
 683                         break;
 684                 VNASSERT(vp->v_op != NULL, vp,
 685                     ("vnlru_free: vnode already reclaimed."));
 686                 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 687                 /*
 688                  * Don't recycle if we can't get the interlock.
 689                  */
 690                 if (!VI_TRYLOCK(vp)) {
 691                         TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 692                         continue;
 693                 }
 694                 VNASSERT(VCANRECYCLE(vp), vp,
 695                     ("vp inconsistent on freelist"));
 696                 freevnodes--;
 697                 vp->v_iflag &= ~VI_FREE;
 698                 vholdl(vp);
 699                 mtx_unlock(&vnode_free_list_mtx);
 700                 VI_UNLOCK(vp);
 701                 vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 702                 vtryrecycle(vp);
 703                 VFS_UNLOCK_GIANT(vfslocked);
 704                 /*
 705                  * If the recycled succeeded this vdrop will actually free
 706                  * the vnode.  If not it will simply place it back on
 707                  * the free list.
 708                  */
 709                 vdrop(vp);
 710                 mtx_lock(&vnode_free_list_mtx);
 711         }
 712 }
 713 /*
 714  * Attempt to recycle vnodes in a context that is always safe to block.
 715  * Calling vlrureclaim() from the bowels of filesystem code has some
 716  * interesting deadlock problems.
      *
      * vnlru_proc() is the body of the "vnlru" kernel process started through
      * vnlru_kp below.  It never returns.  Each pass it:
      *   1. trims the free list down to wantfreevnodes with vnlru_free();
      *   2. if numvnodes is at or below 9/10 of desiredvnodes, clears
      *      vnlruproc_sig, wakes anything sleeping on &vnlruproc_sig and
      *      sleeps for up to hz ticks before starting over;
      *   3. otherwise runs vlrureclaim() on every mount it can busy without
      *      waiting;
      *   4. if nothing was reclaimed, invokes the vfs_lowvnodes event
      *      handlers, counts the failure in vnlru_nowhere and sleeps for up
      *      to 3 * hz ticks; otherwise it just yields the CPU.
      *
      * NOTE(review): the code that sets vnlruproc_sig and sleeps on it is not
      * in this part of the file -- presumably getnewvnode(); confirm.
 717  */
 718 static struct proc *vnlruproc;
 719 static int vnlruproc_sig;
 720
 721 static void
 722 vnlru_proc(void)
 723 {
 724         struct mount *mp, *nmp;
 725         int done;
 726         struct proc *p = vnlruproc;
 727         struct thread *td = curthread;
 728
 729         EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
 730             SHUTDOWN_PRI_FIRST);
 731
             /*
              * Giant is held for the whole loop and only released around
              * vlrureclaim() for mounts that do not need it.
              */
 732         mtx_lock(&Giant);
 733
 734         for (;;) {
 735                 kproc_suspend_check(p);
 736                 mtx_lock(&vnode_free_list_mtx);
 737                 if (freevnodes > wantfreevnodes)
 738                         vnlru_free(freevnodes - wantfreevnodes);
 739                 if (numvnodes <= desiredvnodes * 9 / 10) {
 740                         vnlruproc_sig = 0;
 741                         wakeup(&vnlruproc_sig);
                             /* PDROP: vnode_free_list_mtx is not retaken after the sleep. */
 742                         msleep(vnlruproc, &vnode_free_list_mtx,
 743                             PVFS|PDROP, "vlruwt", hz);
 744                         continue;
 745                 }
 746                 mtx_unlock(&vnode_free_list_mtx);
 747                 done = 0;
 748                 mtx_lock(&mountlist_mtx);
 749                 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 750                         int vfsunlocked;
                             /*
                              * vfs_busy() is passed mountlist_mtx as its interlock:
                              * a failure leaves the mutex held, a success releases it.
                              */
 751                         if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
 752                                 nmp = TAILQ_NEXT(mp, mnt_list);
 753                                 continue;
 754                         }
 755                         if (!VFS_NEEDSGIANT(mp)) {
 756                                 mtx_unlock(&Giant);
 757                                 vfsunlocked = 1;
 758                         } else
 759                                 vfsunlocked = 0;
 760                         done += vlrureclaim(mp);
 761                         if (vfsunlocked)
 762                                 mtx_lock(&Giant);
                             /*
                              * Retake the list lock and pick the successor while mp
                              * is still busied, then let go of mp.
                              */
 763                         mtx_lock(&mountlist_mtx);
 764                         nmp = TAILQ_NEXT(mp, mnt_list);
 765                         vfs_unbusy(mp, td);
 766                 }
 767                 mtx_unlock(&mountlist_mtx);
 768                 if (done == 0) {
 769                         EVENTHANDLER_INVOKE(vfs_lowvnodes, desiredvnodes / 10);
 770 #if 0
 771                         /* These messages are temporary debugging aids */
 772                         if (vnlru_nowhere < 5)
 773                                 printf("vnlru process getting nowhere..\n");
 774                         else if (vnlru_nowhere == 5)
 775                                 printf("vnlru process messages stopped.\n");
 776 #endif
 777                         vnlru_nowhere++;
 778                         tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 779                 } else
 780                         uio_yield();
 781         }
 782 }
 783
     /* Descriptor handed to kproc_start(): process name, entry point, and where to store the proc pointer. */
 784 static struct kproc_desc vnlru_kp = {
 785         "vnlru",
 786         vnlru_proc,
 787         &vnlruproc
 788 };
 789 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
 790     &vnlru_kp);
 791
 792 /*
 793  * Routines having to do with the management of the vnode table.
 794  */
 795
 796 static void
 797 vdestroy(struct vnode *vp)
 798 {
 799         struct bufobj *bo;
 800
 801         CTR1(KTR_VFS, "vdestroy vp %p", vp);
 802         mtx_lock(&vnode_free_list_mtx);
 803         numvnodes--;
 804         mtx_unlock(&vnode_free_list_mtx);
 805         bo = &vp->v_bufobj;
 806         VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
 807             ("cleaned vnode still on the free list."));
 808         VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
 809         VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
 810         VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
 811         VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
 812         VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
 813         VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
 814         VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
 815         VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
 816         VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
 817         VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
 818         VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
 819         VI_UNLOCK(vp);
 820 #ifdef MAC
 821         mac_vnode_destroy(vp);
 822 #endif
 823         if (vp->v_pollinfo != NULL) {
 824                 knlist_destroy(&vp->v_pollinfo->vpi_selinfo.si_note);
 825                 mtx_destroy(&vp->v_pollinfo->vpi_lock);
 826                 uma_zfree(vnodepoll_zone, vp->v_pollinfo);
 827         }
 828 #ifdef INVARIANTS
 829         /* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */
 830         vp->v_op = NULL;
 831 #endif
 832         lockdestroy(vp->v_vnlock);
 833         mtx_destroy(&vp->v_interlock);
 834         mtx_destroy(BO_MTX(bo));
 835         uma_zfree(vnode_zone, vp);
 836 }
 837
 838 /*
 839  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
 840  * before we actually vgone().  This function must be called with the vnode
 841  * held to prevent the vnode from being returned to the free list midway
 842  * through vgone().
 843  */
 844 static int
 845 vtryrecycle(struct vnode *vp)
 846 {
 847         struct mount *vnmp;
 848
 849         CTR1(KTR_VFS, "vtryrecycle: trying vp %p", vp);
 850         VNASSERT(vp->v_holdcnt, vp,
 851             ("vtryrecycle: Recycling vp %p without a reference.", vp));
 852         /*
 853          * This vnode may found and locked via some other list, if so we
 854          * can't recycle it yet.
 855          */
 856         if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
 857                 return (EWOULDBLOCK);
 858         /*
 859          * Don't recycle if its filesystem is being suspended.
 860          */
 861         if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
 862                 VOP_UNLOCK(vp, 0);
 863                 return (EBUSY);
 864         }
 865         /*
 866          * If we got this far, we need to acquire the interlock and see if
 867          * anyone picked up this vnode from another list.  If not, we will
 868          * mark it with DOOMED via vgonel() so that anyone who does find it
 869          * will skip over it.
 870          */
 871         VI_LOCK(vp);
 872         if (vp->v_usecount) {
 873                 VOP_UNLOCK(vp, LK_INTERLOCK);
 874                 vn_finished_write(vnmp);
 875                 return (EBUSY);
 876         }
 877         if ((vp->v_iflag & VI_DOOMED) == 0)
 878                 vgonel(vp);
 879         VOP_UNLOCK(vp, LK_INTERLOCK);
 880         vn_finished_write(vnmp);
 881         CTR1(KTR_VFS, "vtryrecycle: recycled vp %p", vp);
 882         return (0);
 883 }
 884
 885 /*
 886  * Return the next vnode from the free list.
 887  */
 888 int
 889 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
 890     struct vnode **vpp)
 891 {
 892         struct vnode *vp = NULL;
 893         struct bufobj *bo;
 894
 895         mtx_lock(&vnode_free_list_mtx);
 896         /*
 897          * Lend our context to reclaim vnodes if they've exceeded the max.
 898          */
 899         if (freevnodes > wantfreevnodes)
 900                 vnlru_free(1);
 901         /*
 902          * Wait for available vnodes.
 903          */
 904         if (numvnodes > desiredvnodes) {
 905                 if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) {
 906                         /*
 907                          * File system is beeing suspended, we cannot risk a
 908                          * deadlock here, so allocate new vnode anyway.
 909                          */
 910                         if (freevnodes > wantfreevnodes)
 911                                 vnlru_free(freevnodes - wantfreevnodes);
 912                         goto alloc;
 913                 }
 914                 if (vnlruproc_sig == 0) {
 915                         vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
 916                         wakeup(vnlruproc);
 917                 }
 918                 msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
 919                     "vlruwk", hz);
 920 #if 0   /* XXX Not all VFS_VGET/ffs_vget callers check returns. */
 921                 if (numvnodes > desiredvnodes) {
 922                         mtx_unlock(&vnode_free_list_mtx);
 923                         return (ENFILE);
 924                 }
 925 #endif
 926         }
 927 alloc:
 928         numvnodes++;
 929         mtx_unlock(&vnode_free_list_mtx);
 930         vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
 931         /*
 932          * Setup locks.
 933          */
 934         vp->v_vnlock = &vp->v_lock;
 935         mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
 936         /*
 937          * By default, don't allow shared locks unless filesystems
 938          * opt-in.
 939          */
 940         lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
 941         /*
 942          * Initialize bufobj.
 943          */
 944         bo = &vp->v_bufobj;
 945         bo->__bo_vnode = vp;
 946         mtx_init(BO_MTX(bo), "bufobj interlock", NULL, MTX_DEF);
 947         bo->bo_ops = &buf_ops_bio;
 948         bo->bo_private = vp;
 949         TAILQ_INIT(&bo->bo_clean.bv_hd);
 950         TAILQ_INIT(&bo->bo_dirty.bv_hd);
 951         /*
 952          * Initialize namecache.
 953          */
 954         LIST_INIT(&vp->v_cache_src);
 955         TAILQ_INIT(&vp->v_cache_dst);
 956         /*
 957          * Finalize various vnode identity bits.
 958          */
 959         vp->v_type = VNON;
 960         vp->v_tag = tag;
 961         vp->v_op = vops;
 962         v_incr_usecount(vp);
 963         vp->v_data = 0;
 964 #ifdef MAC
 965         mac_vnode_init(vp);
 966         if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
 967                 mac_vnode_associate_singlelabel(mp, vp);
 968         else if (mp == NULL)
 969                 printf("NULL mp in getnewvnode()\n");
 970 #endif
 971         if (mp != NULL) {
 972                 bo->bo_bsize = mp->mnt_stat.f_iosize;
 973                 if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
 974                         vp->v_vflag |= VV_NOKNOTE;
 975         }
 976
 977         CTR2(KTR_VFS, "getnewvnode: mp %p vp %p", mp, vp);
 978         *vpp = vp;
 979         return (0);
 980 }
 981
 982 /*
 983  * Delete from old mount point vnode list, if on one.
 984  */
 985 static void
 986 delmntque(struct vnode *vp)
 987 {
 988         struct mount *mp;
 989
 990         mp = vp->v_mount;
 991         if (mp == NULL)
 992                 return;
 993         MNT_ILOCK(mp);
 994         vp->v_mount = NULL;
 995         VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
 996                 ("bad mount point vnode list size"));
 997         TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 998         mp->mnt_nvnodelistsize--;
 999         MNT_REL(mp);
1000         MNT_IUNLOCK(mp);
1001 }
1002
1003 static void
1004 insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1005 {
1006
1007         vp->v_data = NULL;
1008         vp->v_op = &dead_vnodeops;
1009         /* XXX non mp-safe fs may still call insmntque with vnode
1010            unlocked */
1011         if (!VOP_ISLOCKED(vp))
1012                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1013         vgone(vp);
1014         vput(vp);
1015 }
1016
1017 /*
1018  * Insert into list of vnodes for the new mount point, if available.
1019  */
1020 int
1021 insmntque1(struct vnode *vp, struct mount *mp,
1022         void (*dtr)(struct vnode *, void *), void *dtr_arg)
1023 {
1024
1025         KASSERT(vp->v_mount == NULL,
1026                 ("insmntque: vnode already on per mount vnode list"));
1027         VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1028         MNT_ILOCK(mp);
1029         if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1030             mp->mnt_nvnodelistsize == 0) {
1031                 MNT_IUNLOCK(mp);
1032                 if (dtr != NULL)
1033                         dtr(vp, dtr_arg);
1034                 return (EBUSY);
1035         }
1036         vp->v_mount = mp;
1037         MNT_REF(mp);
1038         TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1039         VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1040                 ("neg mount point vnode list size"));
1041         mp->mnt_nvnodelistsize++;
1042         MNT_IUNLOCK(mp);
1043         return (0);
1044 }
1045
1046 int
1047 insmntque(struct vnode *vp, struct mount *mp)
1048 {
1049
1050         return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1051 }
1052
1053 /*
1054  * Flush out and invalidate all buffers associated with a bufobj
1055  * Called with the underlying object locked.
1056  */
1057 int
1058 bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag,
1059     int slptimeo)
1060 {
1061         int error;
1062
1063         BO_LOCK(bo);
1064         if (flags & V_SAVE) {
1065                 error = bufobj_wwait(bo, slpflag, slptimeo);
1066                 if (error) {
1067                         BO_UNLOCK(bo);
1068                         return (error);
1069                 }
1070                 if (bo->bo_dirty.bv_cnt > 0) {
1071                         BO_UNLOCK(bo);
1072                         if ((error = BO_SYNC(bo, MNT_WAIT, td)) != 0)
1073                                 return (error);
1074                         /*
1075                          * XXX We could save a lock/unlock if this was only
1076                          * enabled under INVARIANTS
1077                          */
1078                         BO_LOCK(bo);
1079                         if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1080                                 panic("vinvalbuf: dirty bufs");
1081                 }
1082         }
1083         /*
1084          * If you alter this loop please notice that interlock is dropped and
1085          * reacquired in flushbuflist.  Special care is needed to ensure that
1086          * no race conditions occur from this.
1087          */
1088         do {
1089                 error = flushbuflist(&bo->bo_clean,
1090                     flags, bo, slpflag, slptimeo);
1091                 if (error == 0)
1092                         error = flushbuflist(&bo->bo_dirty,
1093                             flags, bo, slpflag, slptimeo);
1094                 if (error != 0 && error != EAGAIN) {
1095                         BO_UNLOCK(bo);
1096                         return (error);
1097                 }
1098         } while (error != 0);
1099
1100         /*
1101          * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1102          * have write I/O in-progress but if there is a VM object then the
1103          * VM object can also have read-I/O in-progress.
1104          */
1105         do {
1106                 bufobj_wwait(bo, 0, 0);
1107                 BO_UNLOCK(bo);
1108                 if (bo->bo_object != NULL) {
1109                         VM_OBJECT_LOCK(bo->bo_object);
1110                         vm_object_pip_wait(bo->bo_object, "bovlbx");
1111                         VM_OBJECT_UNLOCK(bo->bo_object);
1112                 }
1113                 BO_LOCK(bo);
1114         } while (bo->bo_numoutput > 0);
1115         BO_UNLOCK(bo);
1116
1117         /*
1118          * Destroy the copy in the VM cache, too.
1119          */
1120         if (bo->bo_object != NULL) {
1121                 VM_OBJECT_LOCK(bo->bo_object);
1122                 vm_object_page_remove(bo->bo_object, 0, 0,
1123                         (flags & V_SAVE) ? TRUE : FALSE);
1124                 VM_OBJECT_UNLOCK(bo->bo_object);
1125         }
1126
1127 #ifdef INVARIANTS
1128         BO_LOCK(bo);
1129         if ((flags & (V_ALT | V_NORMAL)) == 0 &&
1130             (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1131                 panic("vinvalbuf: flush failed");
1132         BO_UNLOCK(bo);
1133 #endif
1134         return (0);
1135 }
1136
1137 /*
1138  * Flush out and invalidate all buffers associated with a vnode.
1139  * Called with the underlying object locked.
1140  */
1141 int
1142 vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag,
1143     int slptimeo)
1144 {
1145
1146         CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags);
1147         ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1148         return (bufobj_invalbuf(&vp->v_bufobj, flags, td, slpflag, slptimeo));
1149 }
1150
1151 /*
1152  * Flush out buffers on the specified list.
1153  *
1154  */
1155 static int
1156 flushbuflist( struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
1157     int slptimeo)
1158 {
1159         struct buf *bp, *nbp;
1160         int retval, error;
1161         daddr_t lblkno;
1162         b_xflags_t xflags;
1163
1164         ASSERT_BO_LOCKED(bo);
1165
1166         retval = 0;
1167         TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1168                 if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1169                     ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1170                         continue;
1171                 }
1172                 lblkno = 0;
1173                 xflags = 0;
1174                 if (nbp != NULL) {
1175                         lblkno = nbp->b_lblkno;
1176                         xflags = nbp->b_xflags &
1177                                 (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN);
1178                 }
1179                 retval = EAGAIN;
1180                 error = BUF_TIMELOCK(bp,
1181                     LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
1182                     "flushbuf", slpflag, slptimeo);
1183                 if (error) {
1184                         BO_LOCK(bo);
1185                         return (error != ENOLCK ? error : EAGAIN);
1186                 }
1187                 KASSERT(bp->b_bufobj == bo,
1188                     ("bp %p wrong b_bufobj %p should be %p",
1189                     bp, bp->b_bufobj, bo));
1190                 if (bp->b_bufobj != bo) {       /* XXX: necessary ? */
1191                         BUF_UNLOCK(bp);
1192                         BO_LOCK(bo);
1193                         return (EAGAIN);
1194                 }
1195                 /*
1196                  * XXX Since there are no node locks for NFS, I
1197                  * believe there is a slight chance that a delayed
1198                  * write will occur while sleeping just above, so
1199                  * check for it.
1200                  */
1201                 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1202                     (flags & V_SAVE)) {
1203                         bremfree(bp);
1204                         bp->b_flags |= B_ASYNC;
1205                         bwrite(bp);
1206                         BO_LOCK(bo);
1207                         return (EAGAIN);        /* XXX: why not loop ? */
1208                 }
1209                 bremfree(bp);
1210                 bp->b_flags |= (B_INVAL | B_RELBUF);
1211                 bp->b_flags &= ~B_ASYNC;
1212                 brelse(bp);
1213                 BO_LOCK(bo);
1214                 if (nbp != NULL &&
1215                     (nbp->b_bufobj != bo ||
1216                      nbp->b_lblkno != lblkno ||
1217                      (nbp->b_xflags &
1218                       (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1219                         break;                  /* nbp invalid */
1220         }
1221         return (retval);
1222 }
1223
1224 /*
1225  * Truncate a file's buffer and pages to a specified length.  This
1226  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1227  * sync activity.
1228  */
1229 int
1230 vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
1231     off_t length, int blksize)
1232 {
1233         struct buf *bp, *nbp;
1234         int anyfreed;
1235         int trunclbn;
1236         struct bufobj *bo;
1237
1238         CTR2(KTR_VFS, "vtruncbuf vp %p length %jd", vp, length);
1239         /*
1240          * Round up to the *next* lbn.
1241          */
1242         trunclbn = (length + blksize - 1) / blksize;
1243
1244         ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1245 restart:
1246         bo = &vp->v_bufobj;
1247         BO_LOCK(bo);
1248         anyfreed = 1;
1249         for (;anyfreed;) {
1250                 anyfreed = 0;
1251                 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1252                         if (bp->b_lblkno < trunclbn)
1253                                 continue;
1254                         if (BUF_LOCK(bp,
1255                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1256                             BO_MTX(bo)) == ENOLCK)
1257                                 goto restart;
1258
1259                         bremfree(bp);
1260                         bp->b_flags |= (B_INVAL | B_RELBUF);
1261                         bp->b_flags &= ~B_ASYNC;
1262                         brelse(bp);
1263                         anyfreed = 1;
1264
1265                         if (nbp != NULL &&
1266                             (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1267                             (nbp->b_vp != vp) ||
1268                             (nbp->b_flags & B_DELWRI))) {
1269                                 goto restart;
1270                         }
1271                         BO_LOCK(bo);
1272                 }
1273
1274                 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1275                         if (bp->b_lblkno < trunclbn)
1276                                 continue;
1277                         if (BUF_LOCK(bp,
1278                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1279                             BO_MTX(bo)) == ENOLCK)
1280                                 goto restart;
1281                         bremfree(bp);
1282                         bp->b_flags |= (B_INVAL | B_RELBUF);
1283                         bp->b_flags &= ~B_ASYNC;
1284                         brelse(bp);
1285                         anyfreed = 1;
1286                         if (nbp != NULL &&
1287                             (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1288                             (nbp->b_vp != vp) ||
1289                             (nbp->b_flags & B_DELWRI) == 0)) {
1290                                 goto restart;
1291                         }
1292                         BO_LOCK(bo);
1293                 }
1294         }
1295
1296         if (length > 0) {
1297 restartsync:
1298                 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1299                         if (bp->b_lblkno > 0)
1300                                 continue;
1301                         /*
1302                          * Since we hold the vnode lock this should only
1303                          * fail if we're racing with the buf daemon.
1304                          */
1305                         if (BUF_LOCK(bp,
1306                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1307                             BO_MTX(bo)) == ENOLCK) {
1308                                 goto restart;
1309                         }
1310                         VNASSERT((bp->b_flags & B_DELWRI), vp,
1311                             ("buf(%p) on dirty queue without DELWRI", bp));
1312
1313                         bremfree(bp);
1314                         bawrite(bp);
1315                         BO_LOCK(bo);
1316                         goto restartsync;
1317                 }
1318         }
1319
1320         bufobj_wwait(bo, 0, 0);
1321         BO_UNLOCK(bo);
1322         vnode_pager_setsize(vp, length);
1323
1324         return (0);
1325 }
1326
1327 /*
1328  * buf_splay() - splay tree core for the clean/dirty list of buffers in
1329  *               a vnode.
1330  *
1331  *      NOTE: We have to deal with the special case of a background bitmap
1332  *      buffer, a situation where two buffers will have the same logical
1333  *      block offset.  We want (1) only the foreground buffer to be accessed
1334  *      in a lookup and (2) must differentiate between the foreground and
1335  *      background buffer in the splay tree algorithm because the splay
1336  *      tree cannot normally handle multiple entities with the same 'index'.
1337  *      We accomplish this by adding differentiating flags to the splay tree's
1338  *      numerical domain.
1339  */
1340 static
1341 struct buf *
1342 buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
1343 {
1344         struct buf dummy;
1345         struct buf *lefttreemax, *righttreemin, *y;
1346
1347         if (root == NULL)
1348                 return (NULL);
1349         lefttreemax = righttreemin = &dummy;
1350         for (;;) {
1351                 if (lblkno < root->b_lblkno ||
1352                     (lblkno == root->b_lblkno &&
1353                     (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1354                         if ((y = root->b_left) == NULL)
1355                                 break;
1356                         if (lblkno < y->b_lblkno) {
1357                                 /* Rotate right. */
1358                                 root->b_left = y->b_right;
1359                                 y->b_right = root;
1360                                 root = y;
1361                                 if ((y = root->b_left) == NULL)
1362                                         break;
1363                         }
1364                         /* Link into the new root's right tree. */
1365                         righttreemin->b_left = root;
1366                         righttreemin = root;
1367                 } else if (lblkno > root->b_lblkno ||
1368                     (lblkno == root->b_lblkno &&
1369                     (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
1370                         if ((y = root->b_right) == NULL)
1371                                 break;
1372                         if (lblkno > y->b_lblkno) {
1373                                 /* Rotate left. */
1374                                 root->b_right = y->b_left;
1375                                 y->b_left = root;
1376                                 root = y;
1377                                 if ((y = root->b_right) == NULL)
1378                                         break;
1379                         }
1380                         /* Link into the new root's left tree. */
1381                         lefttreemax->b_right = root;
1382                         lefttreemax = root;
1383                 } else {
1384                         break;
1385                 }
1386                 root = y;
1387         }
1388         /* Assemble the new root. */
1389         lefttreemax->b_right = root->b_left;
1390         righttreemin->b_left = root->b_right;
1391         root->b_left = dummy.b_right;
1392         root->b_right = dummy.b_left;
1393         return (root);
1394 }
1395
1396 static void
1397 buf_vlist_remove(struct buf *bp)
1398 {
1399         struct buf *root;
1400         struct bufv *bv;
1401
1402         KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1403         ASSERT_BO_LOCKED(bp->b_bufobj);
1404         KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1405             (BX_VNDIRTY|BX_VNCLEAN),
1406             ("buf_vlist_remove: Buf %p is on two lists", bp));
1407         if (bp->b_xflags & BX_VNDIRTY)
1408                 bv = &bp->b_bufobj->bo_dirty;
1409         else
1410                 bv = &bp->b_bufobj->bo_clean;
1411         if (bp != bv->bv_root) {
1412                 root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1413                 KASSERT(root == bp, ("splay lookup failed in remove"));
1414         }
1415         if (bp->b_left == NULL) {
1416                 root = bp->b_right;
1417         } else {
1418                 root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1419                 root->b_right = bp->b_right;
1420         }
1421         bv->bv_root = root;
1422         TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1423         bv->bv_cnt--;
1424         bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1425 }
1426
1427 /*
1428  * Add the buffer to the sorted clean or dirty block list using a
1429  * splay tree algorithm.
1430  *
1431  * NOTE: xflags is passed as a constant, optimizing this inline function!
1432  */
1433 static void
1434 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1435 {
1436         struct buf *root;
1437         struct bufv *bv;
1438
1439         ASSERT_BO_LOCKED(bo);
1440         KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1441             ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1442         bp->b_xflags |= xflags;
1443         if (xflags & BX_VNDIRTY)
1444                 bv = &bo->bo_dirty;
1445         else
1446                 bv = &bo->bo_clean;
1447
1448         root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1449         if (root == NULL) {
1450                 bp->b_left = NULL;
1451                 bp->b_right = NULL;
1452                 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1453         } else if (bp->b_lblkno < root->b_lblkno ||
1454             (bp->b_lblkno == root->b_lblkno &&
1455             (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1456                 bp->b_left = root->b_left;
1457                 bp->b_right = root;
1458                 root->b_left = NULL;
1459                 TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
1460         } else {
1461                 bp->b_right = root->b_right;
1462                 bp->b_left = root;
1463                 root->b_right = NULL;
1464                 TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
1465         }
1466         bv->bv_cnt++;
1467         bv->bv_root = bp;
1468 }
1469
1470 /*
1471  * Lookup a buffer using the splay tree.  Note that we specifically avoid
1472  * shadow buffers used in background bitmap writes.
1473  *
1474  * This code isn't quite efficient as it could be because we are maintaining
1475  * two sorted lists and do not know which list the block resides in.
1476  *
1477  * During a "make buildworld" the desired buffer is found at one of
1478  * the roots more than 60% of the time.  Thus, checking both roots
1479  * before performing either splay eliminates unnecessary splays on the
1480  * first tree splayed.
1481  */
1482 struct buf *
1483 gbincore(struct bufobj *bo, daddr_t lblkno)
1484 {
1485         struct buf *bp;
1486
1487         ASSERT_BO_LOCKED(bo);
1488         if ((bp = bo->bo_clean.bv_root) != NULL &&
1489             bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1490                 return (bp);
1491         if ((bp = bo->bo_dirty.bv_root) != NULL &&
1492             bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1493                 return (bp);
1494         if ((bp = bo->bo_clean.bv_root) != NULL) {
1495                 bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
1496                 if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1497                         return (bp);
1498         }
1499         if ((bp = bo->bo_dirty.bv_root) != NULL) {
1500                 bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
1501                 if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1502                         return (bp);
1503         }
1504         return (NULL);
1505 }
1506
1507 /*
1508  * Associate a buffer with a vnode.
1509  */
1510 void
1511 bgetvp(struct vnode *vp, struct buf *bp)
1512 {
1513         struct bufobj *bo;
1514
1515         bo = &vp->v_bufobj;
1516         ASSERT_BO_LOCKED(bo);
1517         VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1518
1519         CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1520         VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1521             ("bgetvp: bp already attached! %p", bp));
1522
1523         vhold(vp);
1524         if (VFS_NEEDSGIANT(vp->v_mount) || bo->bo_flag & BO_NEEDSGIANT)
1525                 bp->b_flags |= B_NEEDSGIANT;
1526         bp->b_vp = vp;
1527         bp->b_bufobj = bo;
1528         /*
1529          * Insert onto list for new vnode.
1530          */
1531         buf_vlist_add(bp, bo, BX_VNCLEAN);
1532 }
1533
1534 /*
1535  * Disassociate a buffer from a vnode.
1536  */
1537 void
1538 brelvp(struct buf *bp)
1539 {
1540         struct bufobj *bo;
1541         struct vnode *vp;
1542
1543         CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1544         KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1545
1546         /*
1547          * Delete from old vnode list, if on one.
1548          */
1549         vp = bp->b_vp;          /* XXX */
1550         bo = bp->b_bufobj;
1551         BO_LOCK(bo);
1552         if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1553                 buf_vlist_remove(bp);
1554         else
1555                 panic("brelvp: Buffer %p not on queue.", bp);
1556         if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1557                 bo->bo_flag &= ~BO_ONWORKLST;
1558                 mtx_lock(&sync_mtx);
1559                 LIST_REMOVE(bo, bo_synclist);
1560                 syncer_worklist_len--;
1561                 mtx_unlock(&sync_mtx);
1562         }
1563         bp->b_flags &= ~B_NEEDSGIANT;
1564         bp->b_vp = NULL;
1565         bp->b_bufobj = NULL;
1566         BO_UNLOCK(bo);
1567         vdrop(vp);
1568 }
1569
1570 /*
1571  * Add an item to the syncer work queue.
1572  */
1573 static void
1574 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1575 {
1576         int queue, slot;
1577
1578         ASSERT_BO_LOCKED(bo);
1579
1580         mtx_lock(&sync_mtx);
1581         if (bo->bo_flag & BO_ONWORKLST)
1582                 LIST_REMOVE(bo, bo_synclist);
1583         else {
1584                 bo->bo_flag |= BO_ONWORKLST;
1585                 syncer_worklist_len++;
1586         }
1587
1588         if (delay > syncer_maxdelay - 2)
1589                 delay = syncer_maxdelay - 2;
1590         slot = (syncer_delayno + delay) & syncer_mask;
1591
1592         queue = VFS_NEEDSGIANT(bo->__bo_vnode->v_mount) ? WI_GIANTQ :
1593             WI_MPSAFEQ;
1594         LIST_INSERT_HEAD(&syncer_workitem_pending[queue][slot], bo,
1595             bo_synclist);
1596         mtx_unlock(&sync_mtx);
1597 }
1598
1599 static int
1600 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1601 {
1602         int error, len;
1603
1604         mtx_lock(&sync_mtx);
1605         len = syncer_worklist_len - sync_vnode_count;
1606         mtx_unlock(&sync_mtx);
1607         error = SYSCTL_OUT(req, &len, sizeof(len));
1608         return (error);
1609 }
1610
1611 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1612     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1613
1614 static struct proc *updateproc;
1615 static void sched_sync(void);
1616 static struct kproc_desc up_kp = {
1617         "syncer",
1618         sched_sync,
1619         &updateproc
1620 };
1621 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1622
1623 static int
1624 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1625 {
1626         struct vnode *vp;
1627         struct mount *mp;
1628
1629         *bo = LIST_FIRST(slp);
1630         if (*bo == NULL)
1631                 return (0);
1632         vp = (*bo)->__bo_vnode; /* XXX */
1633         if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
1634                 return (1);
1635         /*
1636          * We use vhold in case the vnode does not
1637          * successfully sync.  vhold prevents the vnode from
1638          * going away when we unlock the sync_mtx so that
1639          * we can acquire the vnode interlock.
1640          */
1641         vholdl(vp);
1642         mtx_unlock(&sync_mtx);
1643         VI_UNLOCK(vp);
1644         if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1645                 vdrop(vp);
1646                 mtx_lock(&sync_mtx);
1647                 return (*bo == LIST_FIRST(slp));
1648         }
1649         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1650         (void) VOP_FSYNC(vp, MNT_LAZY, td);
1651         VOP_UNLOCK(vp, 0);
1652         vn_finished_write(mp);
1653         BO_LOCK(*bo);
1654         if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1655                 /*
1656                  * Put us back on the worklist.  The worklist
1657                  * routine will remove us from our current
1658                  * position and then add us back in at a later
1659                  * position.
1660                  */
1661                 vn_syncer_add_to_worklist(*bo, syncdelay);
1662         }
1663         BO_UNLOCK(*bo);
1664         vdrop(vp);
1665         mtx_lock(&sync_mtx);
1666         return (0);
1667 }
1668
1669 /*
1670  * System filesystem synchronizer daemon.
1671  */
1672 static void
1673 sched_sync(void)
1674 {
1675         struct synclist *gnext, *next;
1676         struct synclist *gslp, *slp;
1677         struct bufobj *bo;
1678         long starttime;
1679         struct thread *td = curthread;
1680         static int dummychan;
1681         int last_work_seen;
1682         int net_worklist_len;
1683         int syncer_final_iter;
1684         int first_printf;
1685         int error;
1686
1687         last_work_seen = 0;
1688         syncer_final_iter = 0;
1689         first_printf = 1;
1690         syncer_state = SYNCER_RUNNING;
1691         starttime = time_uptime;
1692         td->td_pflags |= TDP_NORUNNINGBUF;
1693
1694         EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1695             SHUTDOWN_PRI_LAST);
1696
1697         mtx_lock(&sync_mtx);
1698         for (;;) {
1699                 if (syncer_state == SYNCER_FINAL_DELAY &&
1700                     syncer_final_iter == 0) {
1701                         mtx_unlock(&sync_mtx);
1702                         kproc_suspend_check(td->td_proc);
1703                         mtx_lock(&sync_mtx);
1704                 }
1705                 net_worklist_len = syncer_worklist_len - sync_vnode_count;
1706                 if (syncer_state != SYNCER_RUNNING &&
1707                     starttime != time_uptime) {
1708                         if (first_printf) {
1709                                 printf("\nSyncing disks, vnodes remaining...");
1710                                 first_printf = 0;
1711                         }
1712                         printf("%d ", net_worklist_len);
1713                 }
1714                 starttime = time_uptime;
1715
1716                 /*
1717                  * Push files whose dirty time has expired.  Be careful
1718                  * of interrupt race on slp queue.
1719                  *
1720                  * Skip over empty worklist slots when shutting down.
1721                  */
1722                 do {
1723                         slp = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
1724                         gslp = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
1725                         syncer_delayno += 1;
1726                         if (syncer_delayno == syncer_maxdelay)
1727                                 syncer_delayno = 0;
1728                         next = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
1729                         gnext = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
1730                         /*
1731                          * If the worklist has wrapped since the
1732                          * it was emptied of all but syncer vnodes,
1733                          * switch to the FINAL_DELAY state and run
1734                          * for one more second.
1735                          */
1736                         if (syncer_state == SYNCER_SHUTTING_DOWN &&
1737                             net_worklist_len == 0 &&
1738                             last_work_seen == syncer_delayno) {
1739                                 syncer_state = SYNCER_FINAL_DELAY;
1740                                 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1741                         }
1742                 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1743                     LIST_EMPTY(gslp) && syncer_worklist_len > 0);
1744
1745                 /*
1746                  * Keep track of the last time there was anything
1747                  * on the worklist other than syncer vnodes.
1748                  * Return to the SHUTTING_DOWN state if any
1749                  * new work appears.
1750                  */
1751                 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1752                         last_work_seen = syncer_delayno;
1753                 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1754                         syncer_state = SYNCER_SHUTTING_DOWN;
1755                 while (!LIST_EMPTY(slp)) {
1756                         error = sync_vnode(slp, &bo, td);
1757                         if (error == 1) {
1758                                 LIST_REMOVE(bo, bo_synclist);
1759                                 LIST_INSERT_HEAD(next, bo, bo_synclist);
1760                                 continue;
1761                         }
1762                 }
1763                 if (!LIST_EMPTY(gslp)) {
1764                         mtx_unlock(&sync_mtx);
1765                         mtx_lock(&Giant);
1766                         mtx_lock(&sync_mtx);
1767                         while (!LIST_EMPTY(gslp)) {
1768                                 error = sync_vnode(gslp, &bo, td);
1769                                 if (error == 1) {
1770                                         LIST_REMOVE(bo, bo_synclist);
1771                                         LIST_INSERT_HEAD(gnext, bo,
1772                                             bo_synclist);
1773                                         continue;
1774                                 }
1775                         }
1776                         mtx_unlock(&Giant);
1777                 }
1778                 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1779                         syncer_final_iter--;
1780                 /*
1781                  * The variable rushjob allows the kernel to speed up the
1782                  * processing of the filesystem syncer process. A rushjob
1783                  * value of N tells the filesystem syncer to process the next
1784                  * N seconds worth of work on its queue ASAP. Currently rushjob
1785                  * is used by the soft update code to speed up the filesystem
1786                  * syncer process when the incore state is getting so far
1787                  * ahead of the disk that the kernel memory pool is being
1788                  * threatened with exhaustion.
1789                  */
1790                 if (rushjob > 0) {
1791                         rushjob -= 1;
1792                         continue;
1793                 }
1794                 /*
1795                  * Just sleep for a short period of time between
1796                  * iterations when shutting down to allow some I/O
1797                  * to happen.
1798                  *
1799                  * If it has taken us less than a second to process the
1800                  * current work, then wait. Otherwise start right over
1801                  * again. We can still lose time if any single round
1802                  * takes more than two seconds, but it does not really
1803                  * matter as we are just trying to generally pace the
1804                  * filesystem activity.
1805                  */
1806                 if (syncer_state != SYNCER_RUNNING)
1807                         msleep(&dummychan, &sync_mtx, PPAUSE, "syncfnl",
1808                             hz / SYNCER_SHUTDOWN_SPEEDUP);
1809                 else if (time_uptime == starttime)
1810                         msleep(&lbolt, &sync_mtx, PPAUSE, "syncer", 0);
1811         }
1812 }
1813
1814 /*
1815  * Request the syncer daemon to speed up its work.
1816  * We never push it to speed up more than half of its
1817  * normal turn time, otherwise it could take over the cpu.
1818  */
1819 int
1820 speedup_syncer(void)
1821 {
1822         struct thread *td;
1823         int ret = 0;
1824
1825         td = FIRST_THREAD_IN_PROC(updateproc);
1826         mtx_lock(&sync_mtx);
1827         if (rushjob < syncdelay / 2) {
1828                 rushjob += 1;
1829                 stat_rush_requests += 1;
1830                 ret = 1;
1831         }
1832         mtx_unlock(&sync_mtx);
1833         sleepq_remove(td, &lbolt);
1834         return (ret);
1835 }
1836
1837 /*
1838  * Tell the syncer to speed up its work and run though its work
1839  * list several times, then tell it to shut down.
1840  */
1841 static void
1842 syncer_shutdown(void *arg, int howto)
1843 {
1844         struct thread *td;
1845
1846         if (howto & RB_NOSYNC)
1847                 return;
1848         td = FIRST_THREAD_IN_PROC(updateproc);
1849         mtx_lock(&sync_mtx);
1850         syncer_state = SYNCER_SHUTTING_DOWN;
1851         rushjob = 0;
1852         mtx_unlock(&sync_mtx);
1853         sleepq_remove(td, &lbolt);
1854         kproc_shutdown(arg, howto);
1855 }
1856
1857 /*
1858  * Reassign a buffer from one vnode to another.
1859  * Used to assign file specific control information
1860  * (indirect blocks) to the vnode to which they belong.
1861  */
1862 void
1863 reassignbuf(struct buf *bp)
1864 {
1865         struct vnode *vp;
1866         struct bufobj *bo;
1867         int delay;
1868 #ifdef INVARIANTS
1869         struct bufv *bv;
1870 #endif
1871
1872         vp = bp->b_vp;
1873         bo = bp->b_bufobj;
1874         ++reassignbufcalls;
1875
1876         CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
1877             bp, bp->b_vp, bp->b_flags);
1878         /*
1879          * B_PAGING flagged buffers cannot be reassigned because their vp
1880          * is not fully linked in.
1881          */
1882         if (bp->b_flags & B_PAGING)
1883                 panic("cannot reassign paging buffer");
1884
1885         /*
1886          * Delete from old vnode list, if on one.
1887          */
1888         BO_LOCK(bo);
1889         if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1890                 buf_vlist_remove(bp);
1891         else
1892                 panic("reassignbuf: Buffer %p not on queue.", bp);
1893         /*
1894          * If dirty, put on list of dirty buffers; otherwise insert onto list
1895          * of clean buffers.
1896          */
1897         if (bp->b_flags & B_DELWRI) {
1898                 if ((bo->bo_flag & BO_ONWORKLST) == 0) {
1899                         switch (vp->v_type) {
1900                         case VDIR:
1901                                 delay = dirdelay;
1902                                 break;
1903                         case VCHR:
1904                                 delay = metadelay;
1905                                 break;
1906                         default:
1907                                 delay = filedelay;
1908                         }
1909                         vn_syncer_add_to_worklist(bo, delay);
1910                 }
1911                 buf_vlist_add(bp, bo, BX_VNDIRTY);
1912         } else {
1913                 buf_vlist_add(bp, bo, BX_VNCLEAN);
1914
1915                 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1916                         mtx_lock(&sync_mtx);
1917                         LIST_REMOVE(bo, bo_synclist);
1918                         syncer_worklist_len--;
1919                         mtx_unlock(&sync_mtx);
1920                         bo->bo_flag &= ~BO_ONWORKLST;
1921                 }
1922         }
1923 #ifdef INVARIANTS
1924         bv = &bo->bo_clean;
1925         bp = TAILQ_FIRST(&bv->bv_hd);
1926         KASSERT(bp == NULL || bp->b_bufobj == bo,
1927             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1928         bp = TAILQ_LAST(&bv->bv_hd, buflists);
1929         KASSERT(bp == NULL || bp->b_bufobj == bo,
1930             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1931         bv = &bo->bo_dirty;
1932         bp = TAILQ_FIRST(&bv->bv_hd);
1933         KASSERT(bp == NULL || bp->b_bufobj == bo,
1934             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1935         bp = TAILQ_LAST(&bv->bv_hd, buflists);
1936         KASSERT(bp == NULL || bp->b_bufobj == bo,
1937             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1938 #endif
1939         BO_UNLOCK(bo);
1940 }
1941
1942 /*
1943  * Increment the use and hold counts on the vnode, taking care to reference
1944  * the driver's usecount if this is a chardev.  The vholdl() will remove
1945  * the vnode from the free list if it is presently free.  Requires the
1946  * vnode interlock and returns with it held.
1947  */
1948 static void
1949 v_incr_usecount(struct vnode *vp)
1950 {
1951
1952         CTR3(KTR_VFS, "v_incr_usecount: vp %p holdcnt %d usecount %d\n",
1953             vp, vp->v_holdcnt, vp->v_usecount);
1954         vp->v_usecount++;
1955         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1956                 dev_lock();
1957                 vp->v_rdev->si_usecount++;
1958                 dev_unlock();
1959         }
1960         vholdl(vp);
1961 }
1962
1963 /*
1964  * Turn a holdcnt into a use+holdcnt such that only one call to
1965  * v_decr_usecount is needed.
1966  */
1967 static void
1968 v_upgrade_usecount(struct vnode *vp)
1969 {
1970
1971         CTR3(KTR_VFS, "v_upgrade_usecount: vp %p holdcnt %d usecount %d\n",
1972             vp, vp->v_holdcnt, vp->v_usecount);
1973         vp->v_usecount++;
1974         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1975                 dev_lock();
1976                 vp->v_rdev->si_usecount++;
1977                 dev_unlock();
1978         }
1979 }
1980
1981 /*
1982  * Decrement the vnode use and hold count along with the driver's usecount
1983  * if this is a chardev.  The vdropl() below releases the vnode interlock
1984  * as it may free the vnode.
1985  */
1986 static void
1987 v_decr_usecount(struct vnode *vp)
1988 {
1989
1990         CTR3(KTR_VFS, "v_decr_usecount: vp %p holdcnt %d usecount %d\n",
1991             vp, vp->v_holdcnt, vp->v_usecount);
1992         ASSERT_VI_LOCKED(vp, __FUNCTION__);
1993         VNASSERT(vp->v_usecount > 0, vp,
1994             ("v_decr_usecount: negative usecount"));
1995         vp->v_usecount--;
1996         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1997                 dev_lock();
1998                 vp->v_rdev->si_usecount--;
1999                 dev_unlock();
2000         }
2001         vdropl(vp);
2002 }
2003
2004 /*
2005  * Decrement only the use count and driver use count.  This is intended to
2006  * be paired with a follow on vdropl() to release the remaining hold count.
2007  * In this way we may vgone() a vnode with a 0 usecount without risk of
2008  * having it end up on a free list because the hold count is kept above 0.
2009  */
2010 static void
2011 v_decr_useonly(struct vnode *vp)
2012 {
2013
2014         CTR3(KTR_VFS, "v_decr_useonly: vp %p holdcnt %d usecount %d\n",
2015             vp, vp->v_holdcnt, vp->v_usecount);
2016         ASSERT_VI_LOCKED(vp, __FUNCTION__);
2017         VNASSERT(vp->v_usecount > 0, vp,
2018             ("v_decr_useonly: negative usecount"));
2019         vp->v_usecount--;
2020         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2021                 dev_lock();
2022                 vp->v_rdev->si_usecount--;
2023                 dev_unlock();
2024         }
2025 }
2026
2027 /*
2028  * Grab a particular vnode from the free list, increment its
2029  * reference count and lock it.  VI_DOOMED is set if the vnode
2030  * is being destroyed.  Only callers who specify LK_RETRY will
2031  * see doomed vnodes.  If inactive processing was delayed in
2032  * vput try to do it here.
2033  */
2034 int
2035 vget(struct vnode *vp, int flags, struct thread *td)
2036 {
2037         int error;
2038
2039         error = 0;
2040         VFS_ASSERT_GIANT(vp->v_mount);
2041         if ((flags & LK_INTERLOCK) == 0)
2042                 VI_LOCK(vp);
2043         vholdl(vp);
2044         if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
2045                 vdrop(vp);
2046                 return (error);
2047         }
2048         if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2049                 panic("vget: vn_lock failed to return ENOENT\n");
2050         VI_LOCK(vp);
2051         /* Upgrade our holdcnt to a usecount. */
2052         v_upgrade_usecount(vp);
2053         /*
2054          * We don't guarantee that any particular close will
2055          * trigger inactive processing so just make a best effort
2056          * here at preventing a reference to a removed file.  If
2057          * we don't succeed no harm is done.
2058          */
2059         if (vp->v_iflag & VI_OWEINACT) {
2060                 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2061                     (flags & LK_NOWAIT) == 0)
2062                         vinactive(vp, td);
2063                 vp->v_iflag &= ~VI_OWEINACT;
2064         }
2065         VI_UNLOCK(vp);
2066         return (0);
2067 }
2068
/*
 * Take an additional reference on a vnode: v_incr_usecount() is applied
 * with the vnode interlock held around it.
 */
void
vref(struct vnode *vp)
{

	VI_LOCK(vp);
	v_incr_usecount(vp);
	VI_UNLOCK(vp);
}
2080
2081 /*
2082  * Return reference count of a vnode.
2083  *
2084  * The results of this call are only guaranteed when some mechanism other
2085  * than the VI lock is used to stop other processes from gaining references
2086  * to the vnode.  This may be the case if the caller holds the only reference.
2087  * This is also useful when stale data is acceptable as race conditions may
2088  * be accounted for by some other means.
2089  */
2090 int
2091 vrefcnt(struct vnode *vp)
2092 {
2093         int usecnt;
2094
2095         VI_LOCK(vp);
2096         usecnt = vp->v_usecount;
2097         VI_UNLOCK(vp);
2098
2099         return (usecnt);
2100 }
2101
2102
2103 /*
2104  * Vnode put/release.
2105  * If count drops to zero, call inactive routine and return to freelist.
2106  */
2107 void
2108 vrele(struct vnode *vp)
2109 {
2110         struct thread *td = curthread;  /* XXX */
2111
2112         KASSERT(vp != NULL, ("vrele: null vp"));
2113         VFS_ASSERT_GIANT(vp->v_mount);
2114
2115         VI_LOCK(vp);
2116
2117         /* Skip this v_writecount check if we're going to panic below. */
2118         VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2119             ("vrele: missed vn_close"));
2120
2121         if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2122             vp->v_usecount == 1)) {
2123                 v_decr_usecount(vp);
2124                 return;
2125         }
2126         if (vp->v_usecount != 1) {
2127 #ifdef DIAGNOSTIC
2128                 vprint("vrele: negative ref count", vp);
2129 #endif
2130                 VI_UNLOCK(vp);
2131                 panic("vrele: negative ref cnt");
2132         }
2133         /*
2134          * We want to hold the vnode until the inactive finishes to
2135          * prevent vgone() races.  We drop the use count here and the
2136          * hold count below when we're done.
2137          */
2138         v_decr_useonly(vp);
2139         /*
2140          * We must call VOP_INACTIVE with the node locked. Mark
2141          * as VI_DOINGINACT to avoid recursion.
2142          */
2143         vp->v_iflag |= VI_OWEINACT;
2144         if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) {
2145                 VI_LOCK(vp);
2146                 if (vp->v_usecount > 0)
2147                         vp->v_iflag &= ~VI_OWEINACT;
2148                 if (vp->v_iflag & VI_OWEINACT)
2149                         vinactive(vp, td);
2150                 VOP_UNLOCK(vp, 0);
2151         } else {
2152                 VI_LOCK(vp);
2153                 if (vp->v_usecount > 0)
2154                         vp->v_iflag &= ~VI_OWEINACT;
2155         }
2156         vdropl(vp);
2157 }
2158
2159 /*
2160  * Release an already locked vnode.  This give the same effects as
2161  * unlock+vrele(), but takes less time and avoids releasing and
2162  * re-aquiring the lock (as vrele() acquires the lock internally.)
2163  */
2164 void
2165 vput(struct vnode *vp)
2166 {
2167         struct thread *td = curthread;  /* XXX */
2168         int error;
2169
2170         KASSERT(vp != NULL, ("vput: null vp"));
2171         ASSERT_VOP_LOCKED(vp, "vput");
2172         VFS_ASSERT_GIANT(vp->v_mount);
2173         VI_LOCK(vp);
2174         /* Skip this v_writecount check if we're going to panic below. */
2175         VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2176             ("vput: missed vn_close"));
2177         error = 0;
2178
2179         if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2180             vp->v_usecount == 1)) {
2181                 VOP_UNLOCK(vp, 0);
2182                 v_decr_usecount(vp);
2183                 return;
2184         }
2185
2186         if (vp->v_usecount != 1) {
2187 #ifdef DIAGNOSTIC
2188                 vprint("vput: negative ref count", vp);
2189 #endif
2190                 panic("vput: negative ref cnt");
2191         }
2192         /*
2193          * We want to hold the vnode until the inactive finishes to
2194          * prevent vgone() races.  We drop the use count here and the
2195          * hold count below when we're done.
2196          */
2197         v_decr_useonly(vp);
2198         vp->v_iflag |= VI_OWEINACT;
2199         if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2200                 error = VOP_LOCK(vp, LK_UPGRADE|LK_INTERLOCK|LK_NOWAIT);
2201                 VI_LOCK(vp);
2202                 if (error) {
2203                         if (vp->v_usecount > 0)
2204                                 vp->v_iflag &= ~VI_OWEINACT;
2205                         goto done;
2206                 }
2207         }
2208         if (vp->v_usecount > 0)
2209                 vp->v_iflag &= ~VI_OWEINACT;
2210         if (vp->v_iflag & VI_OWEINACT)
2211                 vinactive(vp, td);
2212         VOP_UNLOCK(vp, 0);
2213 done:
2214         vdropl(vp);
2215 }
2216
/*
 * Somebody doesn't want the vnode recycled: take a hold reference.
 * This is vholdl() wrapped in the vnode interlock, for callers that do
 * not already hold it.
 */
void
vhold(struct vnode *vp)
{

	VI_LOCK(vp);
	vholdl(vp);
	VI_UNLOCK(vp);
}
2228
2229 void
2230 vholdl(struct vnode *vp)
2231 {
2232
2233         vp->v_holdcnt++;
2234         if (VSHOULDBUSY(vp))
2235                 vbusy(vp);
2236 }
2237
/*
 * Note that there is one less who cares about this vnode.  vdrop() is the
 * opposite of vhold(): it takes the vnode interlock and passes it on to
 * vdropl(), which releases it.
 */
void
vdrop(struct vnode *vp)
{

	VI_LOCK(vp);
	vdropl(vp);
}
2249
2250 /*
2251  * Drop the hold count of the vnode.  If this is the last reference to
2252  * the vnode we will free it if it has been vgone'd otherwise it is
2253  * placed on the free list.
2254  */
2255 void
2256 vdropl(struct vnode *vp)
2257 {
2258
2259         ASSERT_VI_LOCKED(vp, "vdropl");
2260         if (vp->v_holdcnt <= 0)
2261                 panic("vdrop: holdcnt %d", vp->v_holdcnt);
2262         vp->v_holdcnt--;
2263         if (vp->v_holdcnt == 0) {
2264                 if (vp->v_iflag & VI_DOOMED) {
2265                         vdestroy(vp);
2266                         return;
2267                 } else
2268                         vfree(vp);
2269         }
2270         VI_UNLOCK(vp);
2271 }
2272
2273 /*
2274  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2275  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2276  * OWEINACT tracks whether a vnode missed a call to inactive due to a
2277  * failed lock upgrade.
2278  */
2279 static void
2280 vinactive(struct vnode *vp, struct thread *td)
2281 {
2282
2283         ASSERT_VOP_LOCKED(vp, "vinactive");
2284         ASSERT_VI_LOCKED(vp, "vinactive");
2285         VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2286             ("vinactive: recursed on VI_DOINGINACT"));
2287         vp->v_iflag |= VI_DOINGINACT;
2288         vp->v_iflag &= ~VI_OWEINACT;
2289         VI_UNLOCK(vp);
2290         VOP_INACTIVE(vp, td);
2291         VI_LOCK(vp);
2292         VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2293             ("vinactive: lost VI_DOINGINACT"));
2294         vp->v_iflag &= ~VI_DOINGINACT;
2295 }
2296
2297 /*
2298  * Remove any vnodes in the vnode table belonging to mount point mp.
2299  *
2300  * If FORCECLOSE is not specified, there should not be any active ones,
2301  * return error if any are found (nb: this is a user error, not a
2302  * system error). If FORCECLOSE is specified, detach any active vnodes
2303  * that are found.
2304  *
2305  * If WRITECLOSE is set, only flush out regular file vnodes open for
2306  * writing.
2307  *
2308  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2309  *
2310  * `rootrefs' specifies the base reference count for the root vnode
2311  * of this filesystem. The root vnode is considered busy if its
2312  * v_usecount exceeds this value. On a successful return, vflush(, td)
2313  * will call vrele() on the root vnode exactly rootrefs times.
2314  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2315  * be zero.
2316  */
2317 #ifdef DIAGNOSTIC
2318 static int busyprt = 0;         /* print out busy vnodes */
2319 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
2320 #endif
2321
2322 int
2323 vflush( struct mount *mp, int rootrefs, int flags, struct thread *td)
2324 {
2325         struct vnode *vp, *mvp, *rootvp = NULL;
2326         struct vattr vattr;
2327         int busy = 0, error;
2328
2329         CTR1(KTR_VFS, "vflush: mp %p", mp);
2330         if (rootrefs > 0) {
2331                 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2332                     ("vflush: bad args"));
2333                 /*
2334                  * Get the filesystem root vnode. We can vput() it
2335                  * immediately, since with rootrefs > 0, it won't go away.
2336                  */
2337                 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0)
2338                         return (error);
2339                 vput(rootvp);
2340
2341         }
2342         MNT_ILOCK(mp);
2343 loop:
2344         MNT_VNODE_FOREACH(vp, mp, mvp) {
2345
2346                 VI_LOCK(vp);
2347                 vholdl(vp);
2348                 MNT_IUNLOCK(mp);
2349                 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2350                 if (error) {
2351                         vdrop(vp);
2352                         MNT_ILOCK(mp);
2353                         MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
2354                         goto loop;
2355                 }
2356                 /*
2357                  * Skip over a vnodes marked VV_SYSTEM.
2358                  */
2359                 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2360                         VOP_UNLOCK(vp, 0);
2361                         vdrop(vp);
2362                         MNT_ILOCK(mp);
2363                         continue;
2364                 }
2365                 /*
2366                  * If WRITECLOSE is set, flush out unlinked but still open
2367                  * files (even if open only for reading) and regular file
2368                  * vnodes open for writing.
2369                  */
2370                 if (flags & WRITECLOSE) {
2371                         error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
2372                         VI_LOCK(vp);
2373
2374                         if ((vp->v_type == VNON ||
2375                             (error == 0 && vattr.va_nlink > 0)) &&
2376                             (vp->v_writecount == 0 || vp->v_type != VREG)) {
2377                                 VOP_UNLOCK(vp, 0);
2378                                 vdropl(vp);
2379                                 MNT_ILOCK(mp);
2380                                 continue;
2381                         }
2382                 } else
2383                         VI_LOCK(vp);
2384                 /*
2385                  * With v_usecount == 0, all we need to do is clear out the
2386                  * vnode data structures and we are done.
2387                  *
2388                  * If FORCECLOSE is set, forcibly close the vnode.
2389                  */
2390                 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2391                         VNASSERT(vp->v_usecount == 0 ||
2392                             (vp->v_type != VCHR && vp->v_type != VBLK), vp,
2393                             ("device VNODE %p is FORCECLOSED", vp));
2394                         vgonel(vp);
2395                 } else {
2396                         busy++;
2397 #ifdef DIAGNOSTIC
2398                         if (busyprt)
2399                                 vprint("vflush: busy vnode", vp);
2400 #endif
2401                 }
2402                 VOP_UNLOCK(vp, 0);
2403                 vdropl(vp);
2404                 MNT_ILOCK(mp);
2405         }
2406         MNT_IUNLOCK(mp);
2407         if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2408                 /*
2409                  * If just the root vnode is busy, and if its refcount
2410                  * is equal to `rootrefs', then go ahead and kill it.
2411                  */
2412                 VI_LOCK(rootvp);
2413                 KASSERT(busy > 0, ("vflush: not busy"));
2414                 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2415                     ("vflush: usecount %d < rootrefs %d",
2416                      rootvp->v_usecount, rootrefs));
2417                 if (busy == 1 && rootvp->v_usecount == rootrefs) {
2418                         VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2419                         vgone(rootvp);
2420                         VOP_UNLOCK(rootvp, 0);
2421                         busy = 0;
2422                 } else
2423                         VI_UNLOCK(rootvp);
2424         }
2425         if (busy)
2426                 return (EBUSY);
2427         for (; rootrefs > 0; rootrefs--)
2428                 vrele(rootvp);
2429         return (0);
2430 }
2431
2432 /*
2433  * Recycle an unused vnode to the front of the free list.
2434  */
2435 int
2436 vrecycle(struct vnode *vp, struct thread *td)
2437 {
2438         int recycled;
2439
2440         ASSERT_VOP_LOCKED(vp, "vrecycle");
2441         recycled = 0;
2442         VI_LOCK(vp);
2443         if (vp->v_usecount == 0) {
2444                 recycled = 1;
2445                 vgonel(vp);
2446         }
2447         VI_UNLOCK(vp);
2448         return (recycled);
2449 }
2450
/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.
 *
 * This is vgonel() for callers that do not already own the vnode
 * interlock: the interlock is taken here and released before returning.
 */
void
vgone(struct vnode *vp)
{

	VI_LOCK(vp);
	vgonel(vp);
	VI_UNLOCK(vp);
}
2462
2463 /*
2464  * vgone, with the vp interlock held.
2465  */
2466 void
2467 vgonel(struct vnode *vp)
2468 {
2469         struct thread *td;
2470         int oweinact;
2471         int active;
2472         struct mount *mp;
2473
2474         CTR1(KTR_VFS, "vgonel: vp %p", vp);
2475         ASSERT_VOP_LOCKED(vp, "vgonel");
2476         ASSERT_VI_LOCKED(vp, "vgonel");
2477         VNASSERT(vp->v_holdcnt, vp,
2478             ("vgonel: vp %p has no reference.", vp));
2479         td = curthread;
2480
2481         /*
2482          * Don't vgonel if we're already doomed.
2483          */
2484         if (vp->v_iflag & VI_DOOMED)
2485                 return;
2486         vp->v_iflag |= VI_DOOMED;
2487         /*
2488          * Check to see if the vnode is in use.  If so, we have to call
2489          * VOP_CLOSE() and VOP_INACTIVE().
2490          */
2491         active = vp->v_usecount;
2492         oweinact = (vp->v_iflag & VI_OWEINACT);
2493         VI_UNLOCK(vp);
2494         /*
2495          * Clean out any buffers associated with the vnode.
2496          * If the flush fails, just toss the buffers.
2497          */
2498         mp = NULL;
2499         if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2500                 (void) vn_start_secondary_write(vp, &mp, V_WAIT);
2501         if (vinvalbuf(vp, V_SAVE, td, 0, 0) != 0)
2502                 vinvalbuf(vp, 0, td, 0, 0);
2503
2504         /*
2505          * If purging an active vnode, it must be closed and
2506          * deactivated before being reclaimed.
2507          */
2508         if (active)
2509                 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2510         if (oweinact || active) {
2511                 VI_LOCK(vp);
2512                 if ((vp->v_iflag & VI_DOINGINACT) == 0)
2513                         vinactive(vp, td);
2514                 VI_UNLOCK(vp);
2515         }
2516         /*
2517          * Reclaim the vnode.
2518          */
2519         if (VOP_RECLAIM(vp, td))
2520                 panic("vgone: cannot reclaim");
2521         if (mp != NULL)
2522                 vn_finished_secondary_write(mp);
2523         VNASSERT(vp->v_object == NULL, vp,
2524             ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2525         /*
2526          * Clear the advisory locks and wake up waiting threads.
2527          */
2528         lf_purgelocks(vp, &(vp->v_lockf));
2529         /*
2530          * Delete from old mount point vnode list.
2531          */
2532         delmntque(vp);
2533         cache_purge(vp);
2534         /*
2535          * Done with purge, reset to the standard lock and invalidate
2536          * the vnode.
2537          */
2538         VI_LOCK(vp);
2539         vp->v_vnlock = &vp->v_lock;
2540         vp->v_op = &dead_vnodeops;
2541         vp->v_tag = "none";
2542         vp->v_type = VBAD;
2543 }
2544
2545 /*
2546  * Calculate the total number of references to a special device.
2547  */
2548 int
2549 vcount(struct vnode *vp)
2550 {
2551         int count;
2552
2553         dev_lock();
2554         count = vp->v_rdev->si_usecount;
2555         dev_unlock();
2556         return (count);
2557 }
2558
2559 /*
2560  * Same as above, but using the struct cdev *as argument
2561  */
2562 int
2563 count_dev(struct cdev *dev)
2564 {
2565         int count;
2566
2567         dev_lock();
2568         count = dev->si_usecount;
2569         dev_unlock();
2570         return(count);
2571 }
2572
2573 /*
2574  * Print out a description of a vnode.
2575  */
2576 static char *typename[] =
2577 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
2578  "VMARKER"};
2579
2580 void
2581 vn_printf(struct vnode *vp, const char *fmt, ...)
2582 {
2583         va_list ap;
2584         char buf[256], buf2[16];
2585         u_long flags;
2586
2587         va_start(ap, fmt);
2588         vprintf(fmt, ap);
2589         va_end(ap);
2590         printf("%p: ", (void *)vp);
2591         printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2592         printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
2593             vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
2594         buf[0] = '\0';
2595         buf[1] = '\0';
2596         if (vp->v_vflag & VV_ROOT)
2597                 strlcat(buf, "|VV_ROOT", sizeof(buf));
2598         if (vp->v_vflag & VV_ISTTY)
2599                 strlcat(buf, "|VV_ISTTY", sizeof(buf));
2600         if (vp->v_vflag & VV_NOSYNC)
2601                 strlcat(buf, "|VV_NOSYNC", sizeof(buf));
2602         if (vp->v_vflag & VV_CACHEDLABEL)
2603                 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
2604         if (vp->v_vflag & VV_TEXT)
2605                 strlcat(buf, "|VV_TEXT", sizeof(buf));
2606         if (vp->v_vflag & VV_COPYONWRITE)
2607                 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
2608         if (vp->v_vflag & VV_SYSTEM)
2609                 strlcat(buf, "|VV_SYSTEM", sizeof(buf));
2610         if (vp->v_vflag & VV_PROCDEP)
2611                 strlcat(buf, "|VV_PROCDEP", sizeof(buf));
2612         if (vp->v_vflag & VV_NOKNOTE)
2613                 strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
2614         if (vp->v_vflag & VV_DELETED)
2615                 strlcat(buf, "|VV_DELETED", sizeof(buf));
2616         if (vp->v_vflag & VV_MD)
2617                 strlcat(buf, "|VV_MD", sizeof(buf));
2618         flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC |
2619             VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
2620             VV_NOKNOTE | VV_DELETED | VV_MD);
2621         if (flags != 0) {
2622                 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
2623                 strlcat(buf, buf2, sizeof(buf));
2624         }
2625         if (vp->v_iflag & VI_MOUNT)
2626                 strlcat(buf, "|VI_MOUNT", sizeof(buf));
2627         if (vp->v_iflag & VI_AGE)
2628                 strlcat(buf, "|VI_AGE", sizeof(buf));
2629         if (vp->v_iflag & VI_DOOMED)
2630                 strlcat(buf, "|VI_DOOMED", sizeof(buf));
2631         if (vp->v_iflag & VI_FREE)
2632                 strlcat(buf, "|VI_FREE", sizeof(buf));
2633         if (vp->v_iflag & VI_OBJDIRTY)
2634                 strlcat(buf, "|VI_OBJDIRTY", sizeof(buf));
2635         if (vp->v_iflag & VI_DOINGINACT)
2636                 strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
2637         if (vp->v_iflag & VI_OWEINACT)
2638                 strlcat(buf, "|VI_OWEINACT", sizeof(buf));
2639         flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
2640             VI_OBJDIRTY | VI_DOINGINACT | VI_OWEINACT);
2641         if (flags != 0) {
2642                 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
2643                 strlcat(buf, buf2, sizeof(buf));
2644         }
2645         printf("    flags (%s)\n", buf + 1);
2646         if (mtx_owned(VI_MTX(vp)))
2647                 printf(" VI_LOCKed");
2648         if (vp->v_object != NULL)
2649                 printf("    v_object %p ref %d pages %d\n",
2650                     vp->v_object, vp->v_object->ref_count,
2651                     vp->v_object->resident_page_count);
2652         printf("    ");
2653         lockmgr_printinfo(vp->v_vnlock);
2654         printf("\n");
2655         if (vp->v_data != NULL)
2656                 VOP_PRINT(vp);
2657 }
2658
2659 #ifdef DDB
2660 /*
2661  * List all of the locked vnodes in the system.
2662  * Called when debugging the kernel.
2663  */
2664 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2665 {
2666         struct mount *mp, *nmp;
2667         struct vnode *vp;
2668
2669         /*
2670          * Note: because this is DDB, we can't obey the locking semantics
2671          * for these structures, which means we could catch an inconsistent
2672          * state and dereference a nasty pointer.  Not much to be done
2673          * about that.
2674          */
2675         db_printf("Locked vnodes\n");
2676         for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2677                 nmp = TAILQ_NEXT(mp, mnt_list);
2678                 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2679                         if (vp->v_type != VMARKER &&
2680                             VOP_ISLOCKED(vp))
2681                                 vprint("", vp);
2682                 }
2683                 nmp = TAILQ_NEXT(mp, mnt_list);
2684         }
2685 }
2686
2687 /*
2688  * Show details about the given vnode.
2689  */
2690 DB_SHOW_COMMAND(vnode, db_show_vnode)
2691 {
2692         struct vnode *vp;
2693
2694         if (!have_addr)
2695                 return;
2696         vp = (struct vnode *)addr;
2697         vn_printf(vp, "vnode ");
2698 }
2699
2700 /*
2701  * Show details about the given mount point.
2702  */
2703 DB_SHOW_COMMAND(mount, db_show_mount)
2704 {
2705         struct mount *mp;
2706         struct statfs *sp;
2707         struct vnode *vp;
2708         char buf[512];
2709         u_int flags;
2710
2711         if (!have_addr) {
2712                 /* No address given, print short info about all mount points. */
2713                 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2714                         db_printf("%p %s on %s (%s)\n", mp,
2715                             mp->mnt_stat.f_mntfromname,
2716                             mp->mnt_stat.f_mntonname,
2717                             mp->mnt_stat.f_fstypename);
2718                         if (db_pager_quit)
2719                                 break;
2720                 }
2721                 db_printf("\nMore info: show mount <addr>\n");
2722                 return;
2723         }
2724
2725         mp = (struct mount *)addr;
2726         db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
2727             mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
2728
2729         buf[0] = '\0';
2730         flags = mp->mnt_flag;
2731 #define MNT_FLAG(flag)  do {                                            \
2732         if (flags & (flag)) {                                           \
2733                 if (buf[0] != '\0')                                     \
2734                         strlcat(buf, ", ", sizeof(buf));                \
2735                 strlcat(buf, (#flag) + 4, sizeof(buf));                 \
2736                 flags &= ~(flag);                                       \
2737         }                                                               \
2738 } while (0)
2739         MNT_FLAG(MNT_RDONLY);
2740         MNT_FLAG(MNT_SYNCHRONOUS);
2741         MNT_FLAG(MNT_NOEXEC);
2742         MNT_FLAG(MNT_NOSUID);
2743         MNT_FLAG(MNT_UNION);
2744         MNT_FLAG(MNT_ASYNC);
2745         MNT_FLAG(MNT_SUIDDIR);
2746         MNT_FLAG(MNT_SOFTDEP);
2747         MNT_FLAG(MNT_NOSYMFOLLOW);
2748         MNT_FLAG(MNT_GJOURNAL);
2749         MNT_FLAG(MNT_MULTILABEL);
2750         MNT_FLAG(MNT_ACLS);
2751         MNT_FLAG(MNT_NOATIME);
2752         MNT_FLAG(MNT_NOCLUSTERR);
2753         MNT_FLAG(MNT_NOCLUSTERW);
2754         MNT_FLAG(MNT_EXRDONLY);
2755         MNT_FLAG(MNT_EXPORTED);
2756         MNT_FLAG(MNT_DEFEXPORTED);
2757         MNT_FLAG(MNT_EXPORTANON);
2758         MNT_FLAG(MNT_EXKERB);
2759         MNT_FLAG(MNT_EXPUBLIC);
2760         MNT_FLAG(MNT_LOCAL);
2761         MNT_FLAG(MNT_QUOTA);
2762         MNT_FLAG(MNT_ROOTFS);
2763         MNT_FLAG(MNT_USER);
2764         MNT_FLAG(MNT_IGNORE);
2765         MNT_FLAG(MNT_UPDATE);
2766         MNT_FLAG(MNT_DELEXPORT);
2767         MNT_FLAG(MNT_RELOAD);
2768         MNT_FLAG(MNT_FORCE);
2769         MNT_FLAG(MNT_SNAPSHOT);
2770         MNT_FLAG(MNT_BYFSID);
2771 #undef MNT_FLAG
2772         if (flags != 0) {
2773                 if (buf[0] != '\0')
2774                         strlcat(buf, ", ", sizeof(buf));
2775                 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
2776                     "0x%08x", flags);
2777         }
2778         db_printf("    mnt_flag = %s\n", buf);
2779
2780         buf[0] = '\0';
2781         flags = mp->mnt_kern_flag;
2782 #define MNT_KERN_FLAG(flag)     do {                                    \
2783         if (flags & (flag)) {                                           \
2784                 if (buf[0] != '\0')                                     \
2785                         strlcat(buf, ", ", sizeof(buf));                \
2786                 strlcat(buf, (#flag) + 5, sizeof(buf));                 \
2787                 flags &= ~(flag);                                       \
2788         }                                                               \
2789 } while (0)
2790         MNT_KERN_FLAG(MNTK_UNMOUNTF);
2791         MNT_KERN_FLAG(MNTK_ASYNC);
2792         MNT_KERN_FLAG(MNTK_SOFTDEP);
2793         MNT_KERN_FLAG(MNTK_NOINSMNTQ);
2794         MNT_KERN_FLAG(MNTK_UNMOUNT);
2795         MNT_KERN_FLAG(MNTK_MWAIT);
2796         MNT_KERN_FLAG(MNTK_SUSPEND);
2797         MNT_KERN_FLAG(MNTK_SUSPEND2);
2798         MNT_KERN_FLAG(MNTK_SUSPENDED);
2799         MNT_KERN_FLAG(MNTK_MPSAFE);
2800         MNT_KERN_FLAG(MNTK_NOKNOTE);
2801         MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
2802 #undef MNT_KERN_FLAG
2803         if (flags != 0) {
2804                 if (buf[0] != '\0')
2805                         strlcat(buf, ", ", sizeof(buf));
2806                 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
2807                     "0x%08x", flags);
2808         }
2809         db_printf("    mnt_kern_flag = %s\n", buf);
2810
2811         sp = &mp->mnt_stat;
2812         db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
2813             "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
2814             "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
2815             "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
2816             (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
2817             (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
2818             (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
2819             (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
2820             (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
2821             (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
2822             (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
2823             (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
2824
2825         db_printf("    mnt_cred = { uid=%u ruid=%u",
2826             (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
2827         if (mp->mnt_cred->cr_prison != NULL)
2828                 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
2829         db_printf(" }\n");
2830         db_printf("    mnt_ref = %d\n", mp->mnt_ref);
2831         db_printf("    mnt_gen = %d\n", mp->mnt_gen);
2832         db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
2833         db_printf("    mnt_writeopcount = %d\n", mp->mnt_writeopcount);
2834         db_printf("    mnt_noasync = %u\n", mp->mnt_noasync);
2835         db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
2836         db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
2837         db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
2838         db_printf("    mnt_markercnt = %d\n", mp->mnt_markercnt);
2839         db_printf("    mnt_holdcnt = %d\n", mp->mnt_holdcnt);
2840         db_printf("    mnt_holdcntwaiters = %d\n", mp->mnt_holdcntwaiters);
2841         db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
2842         db_printf("    mnt_secondary_accwrites = %d\n",
2843             mp->mnt_secondary_accwrites);
2844         db_printf("    mnt_gjprovider = %s\n",
2845             mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
2846         db_printf("\n");
2847
2848         TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2849                 if (vp->v_type != VMARKER) {
2850                         vn_printf(vp, "vnode ");
2851                         if (db_pager_quit)
2852                                 break;
2853                 }
2854         }
2855 }
2856 #endif  /* DDB */
2857
2858 /*
2859  * Fill in a struct xvfsconf based on a struct vfsconf.
2860  */
2861 static void
2862 vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2863 {
2864
2865         strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2866         xvfsp->vfc_typenum = vfsp->vfc_typenum;
2867         xvfsp->vfc_refcount = vfsp->vfc_refcount;
2868         xvfsp->vfc_flags = vfsp->vfc_flags;
2869         /*
2870          * These are unused in userland, we keep them
2871          * to not break binary compatibility.
2872          */
2873         xvfsp->vfc_vfsops = NULL;
2874         xvfsp->vfc_next = NULL;
2875 }
2876
2877 /*
2878  * Top level filesystem related information gathering.
2879  */
2880 static int
2881 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2882 {
2883         struct vfsconf *vfsp;
2884         struct xvfsconf xvfsp;
2885         int error;
2886
2887         error = 0;
2888         TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
2889                 bzero(&xvfsp, sizeof(xvfsp));
2890                 vfsconf2x(vfsp, &xvfsp);
2891                 error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp);
2892                 if (error)
2893                         break;
2894         }
2895         return (error);
2896 }
2897
2898 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
2899     "S,xvfsconf", "List of all configured filesystems");
2900
2901 #ifndef BURN_BRIDGES
2902 static int      sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
2903
2904 static int
2905 vfs_sysctl(SYSCTL_HANDLER_ARGS)
2906 {
2907         int *name = (int *)arg1 - 1;    /* XXX */
2908         u_int namelen = arg2 + 1;       /* XXX */
2909         struct vfsconf *vfsp;
2910         struct xvfsconf xvfsp;
2911
2912         printf("WARNING: userland calling deprecated sysctl, "
2913             "please rebuild world\n");
2914
2915 #if 1 || defined(COMPAT_PRELITE2)
2916         /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2917         if (namelen == 1)
2918                 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2919 #endif
2920
2921         switch (name[1]) {
2922         case VFS_MAXTYPENUM:
2923                 if (namelen != 2)
2924                         return (ENOTDIR);
2925                 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2926         case VFS_CONF:
2927                 if (namelen != 3)
2928                         return (ENOTDIR);       /* overloaded */
2929                 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
2930                         if (vfsp->vfc_typenum == name[2])
2931                                 break;
2932                 if (vfsp == NULL)
2933                         return (EOPNOTSUPP);
2934                 bzero(&xvfsp, sizeof(xvfsp));
2935                 vfsconf2x(vfsp, &xvfsp);
2936                 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
2937         }
2938         return (EOPNOTSUPP);
2939 }
2940
2941 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
2942         vfs_sysctl, "Generic filesystem");
2943
2944 #if 1 || defined(COMPAT_PRELITE2)
2945
2946 static int
2947 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2948 {
2949         int error;
2950         struct vfsconf *vfsp;
2951         struct ovfsconf ovfs;
2952
2953         TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
2954                 bzero(&ovfs, sizeof(ovfs));
2955                 ovfs.vfc_vfsops = vfsp->vfc_vfsops;     /* XXX used as flag */
2956                 strcpy(ovfs.vfc_name, vfsp->vfc_name);
2957                 ovfs.vfc_index = vfsp->vfc_typenum;
2958                 ovfs.vfc_refcount = vfsp->vfc_refcount;
2959                 ovfs.vfc_flags = vfsp->vfc_flags;
2960                 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2961                 if (error)
2962                         return error;
2963         }
2964         return 0;
2965 }
2966
2967 #endif /* 1 || COMPAT_PRELITE2 */
2968 #endif /* !BURN_BRIDGES */
2969
2970 #define KINFO_VNODESLOP         10
2971 #ifdef notyet
2972 /*
2973  * Dump vnode list (via sysctl).
2974  */
2975 /* ARGSUSED */
2976 static int
2977 sysctl_vnode(SYSCTL_HANDLER_ARGS)
2978 {
2979         struct xvnode *xvn;
2980         struct thread *td = req->td;
2981         struct mount *mp;
2982         struct vnode *vp;
2983         int error, len, n;
2984
2985         /*
2986          * Stale numvnodes access is not fatal here.
2987          */
2988         req->lock = 0;
2989         len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
2990         if (!req->oldptr)
2991                 /* Make an estimate */
2992                 return (SYSCTL_OUT(req, 0, len));
2993
2994         error = sysctl_wire_old_buffer(req, 0);
2995         if (error != 0)
2996                 return (error);
2997         xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
2998         n = 0;
2999         mtx_lock(&mountlist_mtx);
3000         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3001                 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
3002                         continue;
3003                 MNT_ILOCK(mp);
3004                 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3005                         if (n == len)
3006                                 break;
3007                         vref(vp);
3008                         xvn[n].xv_size = sizeof *xvn;
3009                         xvn[n].xv_vnode = vp;
3010                         xvn[n].xv_id = 0;       /* XXX compat */
3011 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3012                         XV_COPY(usecount);
3013                         XV_COPY(writecount);
3014                         XV_COPY(holdcnt);
3015                         XV_COPY(mount);
3016                         XV_COPY(numoutput);
3017                         XV_COPY(type);
3018 #undef XV_COPY
3019                         xvn[n].xv_flag = vp->v_vflag;
3020
3021                         switch (vp->v_type) {
3022                         case VREG:
3023                         case VDIR:
3024                         case VLNK:
3025                                 break;
3026                         case VBLK:
3027                         case VCHR:
3028                                 if (vp->v_rdev == NULL) {
3029                                         vrele(vp);
3030                                         continue;
3031                                 }
3032                                 xvn[n].xv_dev = dev2udev(vp->v_rdev);
3033                                 break;
3034                         case VSOCK:
3035                                 xvn[n].xv_socket = vp->v_socket;
3036                                 break;
3037                         case VFIFO:
3038                                 xvn[n].xv_fifo = vp->v_fifoinfo;
3039                                 break;
3040                         case VNON:
3041                         case VBAD:
3042                         default:
3043                                 /* shouldn't happen? */
3044                                 vrele(vp);
3045                                 continue;
3046                         }
3047                         vrele(vp);
3048                         ++n;
3049                 }
3050                 MNT_IUNLOCK(mp);
3051                 mtx_lock(&mountlist_mtx);
3052                 vfs_unbusy(mp, td);
3053                 if (n == len)
3054                         break;
3055         }
3056         mtx_unlock(&mountlist_mtx);
3057
3058         error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3059         free(xvn, M_TEMP);
3060         return (error);
3061 }
3062
3063 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
3064         0, 0, sysctl_vnode, "S,xvnode", "");
3065 #endif
3066
3067 /*
3068  * Unmount all filesystems. The list is traversed in reverse order
3069  * of mounting to avoid dependencies.
3070  */
3071 void
3072 vfs_unmountall(void)
3073 {
3074         struct mount *mp;
3075         struct thread *td;
3076         int error;
3077
3078         KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread"));
3079         td = curthread;
3080         /*
3081          * Since this only runs when rebooting, it is not interlocked.
3082          */
3083         while(!TAILQ_EMPTY(&mountlist)) {
3084                 mp = TAILQ_LAST(&mountlist, mntlist);
3085                 error = dounmount(mp, MNT_FORCE, td);
3086                 if (error) {
3087                         TAILQ_REMOVE(&mountlist, mp, mnt_list);
3088                         /*
3089                          * XXX: Due to the way in which we mount the root
3090                          * file system off of devfs, devfs will generate a
3091                          * "busy" warning when we try to unmount it before
3092                          * the root.  Don't print a warning as a result in
3093                          * order to avoid false positive errors that may
3094                          * cause needless upset.
3095                          */
3096                         if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
3097                                 printf("unmount of %s failed (",
3098                                     mp->mnt_stat.f_mntonname);
3099                                 if (error == EBUSY)
3100                                         printf("BUSY)\n");
3101                                 else
3102                                         printf("%d)\n", error);
3103                         }
3104                 } else {
3105                         /* The unmount has removed mp from the mountlist */
3106                 }
3107         }
3108 }
3109
3110 /*
3111  * perform msync on all vnodes under a mount point
3112  * the mount point must be locked.
3113  */
3114 void
3115 vfs_msync(struct mount *mp, int flags)
3116 {
3117         struct vnode *vp, *mvp;
3118         struct vm_object *obj;
3119
3120         MNT_ILOCK(mp);
3121         MNT_VNODE_FOREACH(vp, mp, mvp) {
3122                 VI_LOCK(vp);
3123                 if ((vp->v_iflag & VI_OBJDIRTY) &&
3124                     (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3125                         MNT_IUNLOCK(mp);
3126                         if (!vget(vp,
3127                             LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3128                             curthread)) {
3129                                 if (vp->v_vflag & VV_NOSYNC) {  /* unlinked */
3130                                         vput(vp);
3131                                         MNT_ILOCK(mp);
3132                                         continue;
3133                                 }
3134
3135                                 obj = vp->v_object;
3136                                 if (obj != NULL) {
3137                                         VM_OBJECT_LOCK(obj);
3138                                         vm_object_page_clean(obj, 0, 0,
3139                                             flags == MNT_WAIT ?
3140                                             OBJPC_SYNC : OBJPC_NOSYNC);
3141                                         VM_OBJECT_UNLOCK(obj);
3142                                 }
3143                                 vput(vp);
3144                         }
3145                         MNT_ILOCK(mp);
3146                 } else
3147                         VI_UNLOCK(vp);
3148         }
3149         MNT_IUNLOCK(mp);
3150 }
3151
3152 /*
3153  * Mark a vnode as free, putting it up for recycling.
3154  */
3155 static void
3156 vfree(struct vnode *vp)
3157 {
3158
3159         CTR1(KTR_VFS, "vfree vp %p", vp);
3160         ASSERT_VI_LOCKED(vp, "vfree");
3161         mtx_lock(&vnode_free_list_mtx);
3162         VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed."));
3163         VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free"));
3164         VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't"));
3165         VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp,
3166             ("vfree: Freeing doomed vnode"));
3167         if (vp->v_iflag & VI_AGE) {
3168                 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
3169         } else {
3170                 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
3171         }
3172         freevnodes++;
3173         vp->v_iflag &= ~VI_AGE;
3174         vp->v_iflag |= VI_FREE;
3175         mtx_unlock(&vnode_free_list_mtx);
3176 }
3177
3178 /*
3179  * Opposite of vfree() - mark a vnode as in use.
3180  */
3181 static void
3182 vbusy(struct vnode *vp)
3183 {
3184         CTR1(KTR_VFS, "vbusy vp %p", vp);
3185         ASSERT_VI_LOCKED(vp, "vbusy");
3186         VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
3187         VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed."));
3188
3189         mtx_lock(&vnode_free_list_mtx);
3190         TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
3191         freevnodes--;
3192         vp->v_iflag &= ~(VI_FREE|VI_AGE);
3193         mtx_unlock(&vnode_free_list_mtx);
3194 }
3195
3196 /*
3197  * Initalize per-vnode helper structure to hold poll-related state.
3198  */
3199 void
3200 v_addpollinfo(struct vnode *vp)
3201 {
3202         struct vpollinfo *vi;
3203
3204         vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
3205         if (vp->v_pollinfo != NULL) {
3206                 uma_zfree(vnodepoll_zone, vi);
3207                 return;
3208         }
3209         vp->v_pollinfo = vi;
3210         mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3211         knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note, vp, vfs_knllock,
3212             vfs_knlunlock, vfs_knllocked);
3213 }
3214
3215 /*
3216  * Record a process's interest in events which might happen to
3217  * a vnode.  Because poll uses the historic select-style interface
3218  * internally, this routine serves as both the ``check for any
3219  * pending events'' and the ``record my interest in future events''
3220  * functions.  (These are done together, while the lock is held,
3221  * to avoid race conditions.)
3222  */
3223 int
3224 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3225 {
3226
3227         if (vp->v_pollinfo == NULL)
3228                 v_addpollinfo(vp);
3229         mtx_lock(&vp->v_pollinfo->vpi_lock);
3230         if (vp->v_pollinfo->vpi_revents & events) {
3231                 /*
3232                  * This leaves events we are not interested
3233                  * in available for the other process which
3234                  * which presumably had requested them
3235                  * (otherwise they would never have been
3236                  * recorded).
3237                  */
3238                 events &= vp->v_pollinfo->vpi_revents;
3239                 vp->v_pollinfo->vpi_revents &= ~events;
3240
3241                 mtx_unlock(&vp->v_pollinfo->vpi_lock);
3242                 return events;
3243         }
3244         vp->v_pollinfo->vpi_events |= events;
3245         selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3246         mtx_unlock(&vp->v_pollinfo->vpi_lock);
3247         return 0;
3248 }
3249
3250 /*
3251  * Routine to create and manage a filesystem syncer vnode.
3252  */
3253 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
3254 static int      sync_fsync(struct  vop_fsync_args *);
3255 static int      sync_inactive(struct  vop_inactive_args *);
3256 static int      sync_reclaim(struct  vop_reclaim_args *);
3257
3258 static struct vop_vector sync_vnodeops = {
3259         .vop_bypass =   VOP_EOPNOTSUPP,
3260         .vop_close =    sync_close,             /* close */
3261         .vop_fsync =    sync_fsync,             /* fsync */
3262         .vop_inactive = sync_inactive,  /* inactive */
3263         .vop_reclaim =  sync_reclaim,   /* reclaim */
3264         .vop_lock1 =    vop_stdlock,    /* lock */
3265         .vop_unlock =   vop_stdunlock,  /* unlock */
3266         .vop_islocked = vop_stdislocked,        /* islocked */
3267 };
3268
3269 /*
3270  * Create a new filesystem syncer vnode for the specified mount point.
3271  */
3272 int
3273 vfs_allocate_syncvnode(struct mount *mp)
3274 {
3275         struct vnode *vp;
3276         struct bufobj *bo;
3277         static long start, incr, next;
3278         int error;
3279
3280         /* Allocate a new vnode */
3281         if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) {
3282                 mp->mnt_syncer = NULL;
3283                 return (error);
3284         }
3285         vp->v_type = VNON;
3286         error = insmntque(vp, mp);
3287         if (error != 0)
3288                 panic("vfs_allocate_syncvnode: insmntque failed");
3289         /*
3290          * Place the vnode onto the syncer worklist. We attempt to
3291          * scatter them about on the list so that they will go off
3292          * at evenly distributed times even if all the filesystems
3293          * are mounted at once.
3294          */
3295         next += incr;
3296         if (next == 0 || next > syncer_maxdelay) {
3297                 start /= 2;
3298                 incr /= 2;
3299                 if (start == 0) {
3300                         start = syncer_maxdelay / 2;
3301                         incr = syncer_maxdelay;
3302                 }
3303                 next = start;
3304         }
3305         bo = &vp->v_bufobj;
3306         BO_LOCK(bo);
3307         vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3308         /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3309         mtx_lock(&sync_mtx);
3310         sync_vnode_count++;
3311         mtx_unlock(&sync_mtx);
3312         BO_UNLOCK(bo);
3313         mp->mnt_syncer = vp;
3314         return (0);
3315 }
3316
3317 /*
3318  * Do a lazy sync of the filesystem.
3319  */
3320 static int
3321 sync_fsync(struct vop_fsync_args *ap)
3322 {
3323         struct vnode *syncvp = ap->a_vp;
3324         struct mount *mp = syncvp->v_mount;
3325         struct thread *td = ap->a_td;
3326         int error;
3327         struct bufobj *bo;
3328
3329         /*
3330          * We only need to do something if this is a lazy evaluation.
3331          */
3332         if (ap->a_waitfor != MNT_LAZY)
3333                 return (0);
3334
3335         /*
3336          * Move ourselves to the back of the sync list.
3337          */
3338         bo = &syncvp->v_bufobj;
3339         BO_LOCK(bo);
3340         vn_syncer_add_to_worklist(bo, syncdelay);
3341         BO_UNLOCK(bo);
3342
3343         /*
3344          * Walk the list of vnodes pushing all that are dirty and
3345          * not already on the sync list.
3346          */
3347         mtx_lock(&mountlist_mtx);
3348         if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
3349                 mtx_unlock(&mountlist_mtx);
3350                 return (0);
3351         }
3352         if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3353                 vfs_unbusy(mp, td);
3354                 return (0);
3355         }
3356         MNT_ILOCK(mp);
3357         mp->mnt_noasync++;
3358         mp->mnt_kern_flag &= ~MNTK_ASYNC;
3359         MNT_IUNLOCK(mp);
3360         vfs_msync(mp, MNT_NOWAIT);
3361         error = VFS_SYNC(mp, MNT_LAZY, td);
3362         MNT_ILOCK(mp);
3363         mp->mnt_noasync--;
3364         if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
3365                 mp->mnt_kern_flag |= MNTK_ASYNC;
3366         MNT_IUNLOCK(mp);
3367         vn_finished_write(mp);
3368         vfs_unbusy(mp, td);
3369         return (error);
3370 }
3371
3372 /*
3373  * The syncer vnode is no referenced.
3374  */
3375 static int
3376 sync_inactive(struct vop_inactive_args *ap)
3377 {
3378
3379         vgone(ap->a_vp);
3380         return (0);
3381 }
3382
3383 /*
3384  * The syncer vnode is no longer needed and is being decommissioned.
3385  *
3386  * Modifications to the worklist must be protected by sync_mtx.
3387  */
3388 static int
3389 sync_reclaim(struct vop_reclaim_args *ap)
3390 {
3391         struct vnode *vp = ap->a_vp;
3392         struct bufobj *bo;
3393
3394         bo = &vp->v_bufobj;
3395         BO_LOCK(bo);
3396         vp->v_mount->mnt_syncer = NULL;
3397         if (bo->bo_flag & BO_ONWORKLST) {
3398                 mtx_lock(&sync_mtx);
3399                 LIST_REMOVE(bo, bo_synclist);
3400                 syncer_worklist_len--;
3401                 sync_vnode_count--;
3402                 mtx_unlock(&sync_mtx);
3403                 bo->bo_flag &= ~BO_ONWORKLST;
3404         }
3405         BO_UNLOCK(bo);
3406
3407         return (0);
3408 }
3409
3410 /*
3411  * Check if vnode represents a disk device
3412  */
3413 int
3414 vn_isdisk(struct vnode *vp, int *errp)
3415 {
3416         int error;
3417
3418         error = 0;
3419         dev_lock();
3420         if (vp->v_type != VCHR)
3421                 error = ENOTBLK;
3422         else if (vp->v_rdev == NULL)
3423                 error = ENXIO;
3424         else if (vp->v_rdev->si_devsw == NULL)
3425                 error = ENXIO;
3426         else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3427                 error = ENOTBLK;
3428         dev_unlock();
3429         if (errp != NULL)
3430                 *errp = error;
3431         return (error == 0);
3432 }
3433
3434 /*
3435  * Common filesystem object access control check routine.  Accepts a
3436  * vnode's type, "mode", uid and gid, requested access mode, credentials,
3437  * and optional call-by-reference privused argument allowing vaccess()
3438  * to indicate to the caller whether privilege was used to satisfy the
3439  * request (obsoleted).  Returns 0 on success, or an errno on failure.
3440  *
3441  * The ifdef'd CAPABILITIES version is here for reference, but is not
3442  * actually used.
3443  */
3444 int
3445 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
3446     mode_t acc_mode, struct ucred *cred, int *privused)
3447 {
3448         mode_t dac_granted;
3449         mode_t priv_granted;
3450
3451         /*
3452          * Look for a normal, non-privileged way to access the file/directory
3453          * as requested.  If it exists, go with that.
3454          */
3455
3456         if (privused != NULL)
3457                 *privused = 0;
3458
3459         dac_granted = 0;
3460
3461         /* Check the owner. */
3462         if (cred->cr_uid == file_uid) {
3463                 dac_granted |= VADMIN;
3464                 if (file_mode & S_IXUSR)
3465                         dac_granted |= VEXEC;
3466                 if (file_mode & S_IRUSR)
3467                         dac_granted |= VREAD;
3468                 if (file_mode & S_IWUSR)
3469                         dac_granted |= (VWRITE | VAPPEND);
3470
3471                 if ((acc_mode & dac_granted) == acc_mode)
3472                         return (0);
3473
3474                 goto privcheck;
3475         }
3476
3477         /* Otherwise, check the groups (first match) */
3478         if (groupmember(file_gid, cred)) {
3479                 if (file_mode & S_IXGRP)
3480                         dac_granted |= VEXEC;
3481                 if (file_mode & S_IRGRP)
3482                         dac_granted |= VREAD;
3483                 if (file_mode & S_IWGRP)
3484                         dac_granted |= (VWRITE | VAPPEND);
3485
3486                 if ((acc_mode & dac_granted) == acc_mode)
3487                         return (0);
3488
3489                 goto privcheck;
3490         }
3491
3492         /* Otherwise, check everyone else. */
3493         if (file_mode & S_IXOTH)
3494                 dac_granted |= VEXEC;
3495         if (file_mode & S_IROTH)
3496                 dac_granted |= VREAD;
3497         if (file_mode & S_IWOTH)
3498                 dac_granted |= (VWRITE | VAPPEND);
3499         if ((acc_mode & dac_granted) == acc_mode)
3500                 return (0);
3501
3502 privcheck:
3503         /*
3504          * Build a privilege mask to determine if the set of privileges
3505          * satisfies the requirements when combined with the granted mask
3506          * from above.  For each privilege, if the privilege is required,
3507          * bitwise or the request type onto the priv_granted mask.
3508          */
3509         priv_granted = 0;
3510
3511         if (type == VDIR) {
3512                 /*
3513                  * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
3514                  * requests, instead of PRIV_VFS_EXEC.
3515                  */
3516                 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3517                     !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
3518                         priv_granted |= VEXEC;
3519         } else {
3520                 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3521                     !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
3522                         priv_granted |= VEXEC;
3523         }
3524
3525         if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3526             !priv_check_cred(cred, PRIV_VFS_READ, 0))
3527                 priv_granted |= VREAD;
3528
3529         if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3530             !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
3531                 priv_granted |= (VWRITE | VAPPEND);
3532
3533         if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3534             !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
3535                 priv_granted |= VADMIN;
3536
3537         if ((acc_mode & (priv_granted | dac_granted)) == acc_mode) {
3538                 /* XXX audit: privilege used */
3539                 if (privused != NULL)
3540                         *privused = 1;
3541                 return (0);
3542         }
3543
3544         return ((acc_mode & VADMIN) ? EPERM : EACCES);
3545 }
3546
3547 /*
3548  * Credential check based on process requesting service, and per-attribute
3549  * permissions.
3550  */
3551 int
3552 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
3553     struct thread *td, int access)
3554 {
3555
3556         /*
3557          * Kernel-invoked always succeeds.
3558          */
3559         if (cred == NOCRED)
3560                 return (0);
3561
3562         /*
3563          * Do not allow privileged processes in jail to directly manipulate
3564          * system attributes.
3565          */
3566         switch (attrnamespace) {
3567         case EXTATTR_NAMESPACE_SYSTEM:
3568                 /* Potentially should be: return (EPERM); */
3569                 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
3570         case EXTATTR_NAMESPACE_USER:
3571                 return (VOP_ACCESS(vp, access, cred, td));
3572         default:
3573                 return (EPERM);
3574         }
3575 }
3576
3577 #ifdef DEBUG_VFS_LOCKS
3578 /*
3579  * This only exists to supress warnings from unlocked specfs accesses.  It is
3580  * no longer ok to have an unlocked VFS.
3581  */
3582 #define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)
3583
3584 int vfs_badlock_ddb = 1;        /* Drop into debugger on violation. */
3585 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "");
3586
3587 int vfs_badlock_mutex = 1;      /* Check for interlock across VOPs. */
3588 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "");
3589
3590 int vfs_badlock_print = 1;      /* Print lock violations. */
3591 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "");
3592
3593 #ifdef KDB
3594 int vfs_badlock_backtrace = 1;  /* Print backtrace at lock violations. */
3595 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "");
3596 #endif
3597
3598 static void
3599 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
3600 {
3601
3602 #ifdef KDB
3603         if (vfs_badlock_backtrace)
3604                 kdb_backtrace();
3605 #endif
3606         if (vfs_badlock_print)
3607                 printf("%s: %p %s\n", str, (void *)vp, msg);
3608         if (vfs_badlock_ddb)
3609                 kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
3610 }
3611
3612 void
3613 assert_vi_locked(struct vnode *vp, const char *str)
3614 {
3615
3616         if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
3617                 vfs_badlock("interlock is not locked but should be", str, vp);
3618 }
3619
3620 void
3621 assert_vi_unlocked(struct vnode *vp, const char *str)
3622 {
3623
3624         if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
3625                 vfs_badlock("interlock is locked but should not be", str, vp);
3626 }
3627
3628 void
3629 assert_vop_locked(struct vnode *vp, const char *str)
3630 {
3631
3632         if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == 0)
3633                 vfs_badlock("is not locked but should be", str, vp);
3634 }
3635
3636 void
3637 assert_vop_unlocked(struct vnode *vp, const char *str)
3638 {
3639
3640         if (vp && !IGNORE_LOCK(vp) &&
3641             VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
3642                 vfs_badlock("is locked but should not be", str, vp);
3643 }
3644
3645 void
3646 assert_vop_elocked(struct vnode *vp, const char *str)
3647 {
3648
3649         if (vp && !IGNORE_LOCK(vp) &&
3650             VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
3651                 vfs_badlock("is not exclusive locked but should be", str, vp);
3652 }
3653
3654 #if 0
3655 void
3656 assert_vop_elocked_other(struct vnode *vp, const char *str)
3657 {
3658
3659         if (vp && !IGNORE_LOCK(vp) &&
3660             VOP_ISLOCKED(vp) != LK_EXCLOTHER)
3661                 vfs_badlock("is not exclusive locked by another thread",
3662                     str, vp);
3663 }
3664
3665 void
3666 assert_vop_slocked(struct vnode *vp, const char *str)
3667 {
3668
3669         if (vp && !IGNORE_LOCK(vp) &&
3670             VOP_ISLOCKED(vp) != LK_SHARED)
3671                 vfs_badlock("is not locked shared but should be", str, vp);
3672 }
3673 #endif /* 0 */
3674 #endif /* DEBUG_VFS_LOCKS */
3675
3676 void
3677 vop_rename_pre(void *ap)
3678 {
3679         struct vop_rename_args *a = ap;
3680
3681 #ifdef DEBUG_VFS_LOCKS
3682         if (a->a_tvp)
3683                 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
3684         ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
3685         ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
3686         ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
3687
3688         /* Check the source (from). */
3689         if (a->a_tdvp != a->a_fdvp && a->a_tvp != a->a_fdvp)
3690                 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
3691         if (a->a_tvp != a->a_fvp)
3692                 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
3693
3694         /* Check the target. */
3695         if (a->a_tvp)
3696                 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
3697         ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
3698 #endif
3699         if (a->a_tdvp != a->a_fdvp)
3700                 vhold(a->a_fdvp);
3701         if (a->a_tvp != a->a_fvp)
3702                 vhold(a->a_fvp);
3703         vhold(a->a_tdvp);
3704         if (a->a_tvp)
3705                 vhold(a->a_tvp);
3706 }
3707
/*
 * Pre-operation hook for VOP_STRATEGY.  With DEBUG_VFS_LOCKS it reports
 * a buffer that is handed to the strategy routine without being locked;
 * otherwise it does nothing.
 */
void
vop_strategy_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_strategy_args *a;
	struct buf *bp;

	a = ap;
	bp = a->a_bp;

	/*
	 * Cluster ops lock their component buffers but not the IO
	 * container, so cluster buffers are exempt from the check.
	 */
	if ((bp->b_flags & B_CLUSTER) != 0 || BUF_ISLOCKED(bp))
		return;

	if (vfs_badlock_print)
		printf("VOP_STRATEGY: bp is not locked but should be\n");
	if (vfs_badlock_ddb)
		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
#endif
}
3733
/*
 * Pre-operation hook for VOP_LOOKUP.  With DEBUG_VFS_LOCKS it asserts
 * that the directory vnode is locked and its interlock is not held;
 * otherwise it does nothing.
 */
void
vop_lookup_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_lookup_args *a = ap;
	struct vnode *dvp = a->a_dvp;

	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
#endif
}
3747
/*
 * Post-operation hook for VOP_LOOKUP.  With DEBUG_VFS_LOCKS it asserts
 * that the directory vnode is still locked with its interlock released
 * and, when the lookup succeeded (rc == 0), that the resulting vnode is
 * locked as well; otherwise it does nothing.
 */
void
vop_lookup_post(void *ap, int rc)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_lookup_args *a = ap;
	struct vnode *dvp = a->a_dvp;
	struct vnode *vp = *(a->a_vpp);

	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");

	if (rc == 0)
		ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)");
#endif
}
3767
/*
 * Pre-operation hook for VOP_LOCK.  With DEBUG_VFS_LOCKS it asserts
 * that the vnode interlock is held exactly when LK_INTERLOCK was
 * passed; otherwise it does nothing.
 */
void
vop_lock_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_lock1_args *a;

	a = ap;
	if ((a->a_flags & LK_INTERLOCK) != 0)
		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
	else
		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
#endif
}
3780
/*
 * Post-operation hook for VOP_LOCK.  With DEBUG_VFS_LOCKS it asserts
 * that the interlock is no longer held and, when the lock request
 * succeeded (rc == 0), that the vnode is locked; otherwise it does
 * nothing.
 */
void
vop_lock_post(void *ap, int rc)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_lock1_args *a;

	a = ap;
	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
	if (rc != 0)
		return;
	ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
#endif
}
3792
/*
 * Pre-operation hook for VOP_UNLOCK.  With DEBUG_VFS_LOCKS it asserts
 * that the vnode is locked and, if LK_INTERLOCK was passed, that the
 * interlock is held; otherwise it does nothing.
 */
void
vop_unlock_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_unlock_args *a;

	a = ap;
	if ((a->a_flags & LK_INTERLOCK) != 0)
		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
#endif
}
3804
/*
 * Post-operation hook for VOP_UNLOCK.  With DEBUG_VFS_LOCKS it asserts
 * that an interlock passed in with LK_INTERLOCK has been released;
 * otherwise it does nothing.  `rc' is not examined.
 */
void
vop_unlock_post(void *ap, int rc)
{
#ifdef DEBUG_VFS_LOCKS
	struct vop_unlock_args *a;

	a = ap;
	if ((a->a_flags & LK_INTERLOCK) != 0)
		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
#endif
}
3815
3816 void
3817 vop_create_post(void *ap, int rc)
3818 {
3819         struct vop_create_args *a = ap;
3820
3821         if (!rc)
3822                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3823 }
3824
3825 void
3826 vop_link_post(void *ap, int rc)
3827 {
3828         struct vop_link_args *a = ap;
3829
3830         if (!rc) {
3831                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
3832                 VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
3833         }
3834 }
3835
3836 void
3837 vop_mkdir_post(void *ap, int rc)
3838 {
3839         struct vop_mkdir_args *a = ap;
3840
3841         if (!rc)
3842                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
3843 }
3844
3845 void
3846 vop_mknod_post(void *ap, int rc)
3847 {
3848         struct vop_mknod_args *a = ap;
3849
3850         if (!rc)
3851                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3852 }
3853
3854 void
3855 vop_remove_post(void *ap, int rc)
3856 {
3857         struct vop_remove_args *a = ap;
3858
3859         if (!rc) {
3860                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3861                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
3862         }
3863 }
3864
3865 void
3866 vop_rename_post(void *ap, int rc)
3867 {
3868         struct vop_rename_args *a = ap;
3869
3870         if (!rc) {
3871                 VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
3872                 VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
3873                 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
3874                 if (a->a_tvp)
3875                         VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
3876         }
3877         if (a->a_tdvp != a->a_fdvp)
3878                 vdrop(a->a_fdvp);
3879         if (a->a_tvp != a->a_fvp)
3880                 vdrop(a->a_fvp);
3881         vdrop(a->a_tdvp);
3882         if (a->a_tvp)
3883                 vdrop(a->a_tvp);
3884 }
3885
3886 void
3887 vop_rmdir_post(void *ap, int rc)
3888 {
3889         struct vop_rmdir_args *a = ap;
3890
3891         if (!rc) {
3892                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
3893                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
3894         }
3895 }
3896
3897 void
3898 vop_setattr_post(void *ap, int rc)
3899 {
3900         struct vop_setattr_args *a = ap;
3901
3902         if (!rc)
3903                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
3904 }
3905
3906 void
3907 vop_symlink_post(void *ap, int rc)
3908 {
3909         struct vop_symlink_args *a = ap;
3910
3911         if (!rc)
3912                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3913 }
3914
3915 static struct knlist fs_knlist;
3916
3917 static void
3918 vfs_event_init(void *arg)
3919 {
3920         knlist_init(&fs_knlist, NULL, NULL, NULL, NULL);
3921 }
3922 /* XXX - correct order? */
3923 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
3924
3925 void
3926 vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused)
3927 {
3928
3929         KNOTE_UNLOCKED(&fs_knlist, event);
3930 }
3931
3932 static int      filt_fsattach(struct knote *kn);
3933 static void     filt_fsdetach(struct knote *kn);
3934 static int      filt_fsevent(struct knote *kn, long hint);
3935
3936 struct filterops fs_filtops =
3937         { 0, filt_fsattach, filt_fsdetach, filt_fsevent };
3938
3939 static int
3940 filt_fsattach(struct knote *kn)
3941 {
3942
3943         kn->kn_flags |= EV_CLEAR;
3944         knlist_add(&fs_knlist, kn, 0);
3945         return (0);
3946 }
3947
3948 static void
3949 filt_fsdetach(struct knote *kn)
3950 {
3951
3952         knlist_remove(&fs_knlist, kn, 0);
3953 }
3954
3955 static int
3956 filt_fsevent(struct knote *kn, long hint)
3957 {
3958
3959         kn->kn_fflags |= hint;
3960         return (kn->kn_fflags != 0);
3961 }
3962
3963 static int
3964 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
3965 {
3966         struct vfsidctl vc;
3967         int error;
3968         struct mount *mp;
3969
3970         error = SYSCTL_IN(req, &vc, sizeof(vc));
3971         if (error)
3972                 return (error);
3973         if (vc.vc_vers != VFS_CTL_VERS1)
3974                 return (EINVAL);
3975         mp = vfs_getvfs(&vc.vc_fsid);
3976         if (mp == NULL)
3977                 return (ENOENT);
3978         /* ensure that a specific sysctl goes to the right filesystem. */
3979         if (strcmp(vc.vc_fstypename, "*") != 0 &&
3980             strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
3981                 vfs_rel(mp);
3982                 return (EINVAL);
3983         }
3984         VCTLTOREQ(&vc, req);
3985         error = VFS_SYSCTL(mp, vc.vc_op, req);
3986         vfs_rel(mp);
3987         return (error);
3988 }
3989
3990 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "",
3991     "Sysctl by fsid");
3992
3993 /*
3994  * Function to initialize a va_filerev field sensibly.
3995  * XXX: Wouldn't a random number make a lot more sense ??
3996  */
3997 u_quad_t
3998 init_va_filerev(void)
3999 {
4000         struct bintime bt;
4001
4002         getbinuptime(&bt);
4003         return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4004 }
4005
4006 static int      filt_vfsread(struct knote *kn, long hint);
4007 static int      filt_vfswrite(struct knote *kn, long hint);
4008 static int      filt_vfsvnode(struct knote *kn, long hint);
4009 static void     filt_vfsdetach(struct knote *kn);
4010 static struct filterops vfsread_filtops =
4011         { 1, NULL, filt_vfsdetach, filt_vfsread };
4012 static struct filterops vfswrite_filtops =
4013         { 1, NULL, filt_vfsdetach, filt_vfswrite };
4014 static struct filterops vfsvnode_filtops =
4015         { 1, NULL, filt_vfsdetach, filt_vfsvnode };
4016
4017 static void
4018 vfs_knllock(void *arg)
4019 {
4020         struct vnode *vp = arg;
4021
4022         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4023 }
4024
/*
 * Unlock callback taking a vnode as its opaque argument: release the
 * vnode lock.
 */
static void
vfs_knlunlock(void *arg)
{
	struct vnode *vp;

	vp = arg;
	VOP_UNLOCK(vp, 0);
}
4032
4033 static int
4034 vfs_knllocked(void *arg)
4035 {
4036         struct vnode *vp = arg;
4037
4038         return (VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
4039 }
4040
4041 int
4042 vfs_kqfilter(struct vop_kqfilter_args *ap)
4043 {
4044         struct vnode *vp = ap->a_vp;
4045         struct knote *kn = ap->a_kn;
4046         struct knlist *knl;
4047
4048         switch (kn->kn_filter) {
4049         case EVFILT_READ:
4050                 kn->kn_fop = &vfsread_filtops;
4051                 break;
4052         case EVFILT_WRITE:
4053                 kn->kn_fop = &vfswrite_filtops;
4054                 break;
4055         case EVFILT_VNODE:
4056                 kn->kn_fop = &vfsvnode_filtops;
4057                 break;
4058         default:
4059                 return (EINVAL);
4060         }
4061
4062         kn->kn_hook = (caddr_t)vp;
4063
4064         if (vp->v_pollinfo == NULL)
4065                 v_addpollinfo(vp);
4066         if (vp->v_pollinfo == NULL)
4067                 return (ENOMEM);
4068         knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4069         knlist_add(knl, kn, 0);
4070
4071         return (0);
4072 }
4073
4074 /*
4075  * Detach knote from vnode
4076  */
4077 static void
4078 filt_vfsdetach(struct knote *kn)
4079 {
4080         struct vnode *vp = (struct vnode *)kn->kn_hook;
4081
4082         KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4083         knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4084 }
4085
4086 /*ARGSUSED*/
4087 static int
4088 filt_vfsread(struct knote *kn, long hint)
4089 {
4090         struct vnode *vp = (struct vnode *)kn->kn_hook;
4091         struct vattr va;
4092
4093         /*
4094          * filesystem is gone, so set the EOF flag and schedule
4095          * the knote for deletion.
4096          */
4097         if (hint == NOTE_REVOKE) {
4098                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4099                 return (1);
4100         }
4101
4102         if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread))
4103                 return (0);
4104
4105         kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4106         return (kn->kn_data != 0);
4107 }
4108
4109 /*ARGSUSED*/
4110 static int
4111 filt_vfswrite(struct knote *kn, long hint)
4112 {
4113         /*
4114          * filesystem is gone, so set the EOF flag and schedule
4115          * the knote for deletion.
4116          */
4117         if (hint == NOTE_REVOKE)
4118                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4119
4120         kn->kn_data = 0;
4121         return (1);
4122 }
4123
4124 static int
4125 filt_vfsvnode(struct knote *kn, long hint)
4126 {
4127         if (kn->kn_sfflags & hint)
4128                 kn->kn_fflags |= hint;
4129         if (hint == NOTE_REVOKE) {
4130                 kn->kn_flags |= EV_EOF;
4131                 return (1);
4132         }
4133         return (kn->kn_fflags != 0);
4134 }
4135
4136 int
4137 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4138 {
4139         int error;
4140
4141         if (dp->d_reclen > ap->a_uio->uio_resid)
4142                 return (ENAMETOOLONG);
4143         error = uiomove(dp, dp->d_reclen, ap->a_uio);
4144         if (error) {
4145                 if (ap->a_ncookies != NULL) {
4146                         if (ap->a_cookies != NULL)
4147                                 free(ap->a_cookies, M_TEMP);
4148                         ap->a_cookies = NULL;
4149                         *ap->a_ncookies = 0;
4150                 }
4151                 return (error);
4152         }
4153         if (ap->a_ncookies == NULL)
4154                 return (0);
4155
4156         KASSERT(ap->a_cookies,
4157             ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4158
4159         *ap->a_cookies = realloc(*ap->a_cookies,
4160             (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4161         (*ap->a_cookies)[*ap->a_ncookies] = off;
4162         return (0);
4163 }
4164
4165 /*
4166  * Mark for update the access time of the file if the filesystem
4167  * supports VA_MARK_ATIME.  This functionality is used by execve
4168  * and mmap, so we want to avoid the synchronous I/O implied by
4169  * directly setting va_atime for the sake of efficiency.
4170  */
4171 void
4172 vfs_mark_atime(struct vnode *vp, struct thread *td)
4173 {
4174         struct vattr atimeattr;
4175
4176         if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) {
4177                 VATTR_NULL(&atimeattr);
4178                 atimeattr.va_vaflags |= VA_MARK_ATIME;
4179                 (void)VOP_SETATTR(vp, &atimeattr, td->td_ucred, td);
4180         }
4181 }