sys/kern/vfs_export.c

   1 /*
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. All advertising materials mentioning features or use of this software
  19  *    must display the following acknowledgement:
  20  *      This product includes software developed by the University of
  21  *      California, Berkeley and its contributors.
  22  * 4. Neither the name of the University nor the names of its contributors
  23  *    may be used to endorse or promote products derived from this software
  24  *    without specific prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
  39  * $FreeBSD$
  40  */
  41
  42 /*
  43  * External virtual filesystem routines
  44  */
  45 #include "opt_ddb.h"
  46
  47 #include <sys/param.h>
  48 #include <sys/systm.h>
  49 #include <sys/fcntl.h>
  50 #include <sys/kernel.h>
  51 #include <sys/proc.h>
  52 #include <sys/kthread.h>
  53 #include <sys/malloc.h>
  54 #include <sys/mount.h>
  55 #include <sys/socket.h>
  56 #include <sys/vnode.h>
  57 #include <sys/stat.h>
  58 #include <sys/buf.h>
  59 #include <sys/domain.h>
  60 #include <sys/dirent.h>
  61 #include <sys/vmmeter.h>
  62 #include <sys/conf.h>
  63
  64 #include <machine/limits.h>
  65
  66 #include <vm/vm.h>
  67 #include <vm/vm_object.h>
  68 #include <vm/vm_extern.h>
  69 #include <vm/pmap.h>
  70 #include <vm/vm_map.h>
  71 #include <vm/vm_page.h>
  72 #include <vm/vm_pager.h>
  73 #include <vm/vnode_pager.h>
  74 #include <vm/vm_zone.h>
  75 #include <sys/sysctl.h>
  76
/* Malloc type whose description strings label it "Export host address structure". */
static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

/* Forward declarations of file-local vnode helpers. */
static void     insmntque __P((struct vnode *vp, struct mount *mp));
static void     vclean __P((struct vnode *vp, int flags, struct proc *p));
static void     vfree __P((struct vnode *));
static void     vgonel __P((struct vnode *vp, struct proc *p));
/*
 * Bumped by getnewvnode() each time it allocates a brand new vnode;
 * exported read-only as debug.numvnodes.
 * NOTE(review): the variable is unsigned long but is exported with
 * SYSCTL_INT -- confirm the widths agree on every supported platform.
 */
static unsigned long    numvnodes;
SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

/*
 * 16-entry table of vnode types.
 * Presumably indexed by the file-type bits of a mode word -- confirm
 * against the macro that consumes it.
 */
enum vtype iftovt_tab[16] = {
        VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
        VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
/*
 * 9-entry table of S_IF* file-type constants.
 * Presumably indexed by enum vtype -- confirm against the macro that
 * consumes it.
 */
int vttoif_tab[9] = {
        0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
        S_IFSOCK, S_IFIFO, S_IFMT,
};
  94
static TAILQ_HEAD(freelst, vnode) vnode_free_list;      /* vnode free list */
/* Vnodes that getnewvnode() will move onto vnode_free_list on its next call. */
struct tobefreelist vnode_tobefree_list;

/*
 * While freevnodes is below this (non-zero) value, getnewvnode()
 * allocates a new vnode instead of recycling one from the free list.
 * NOTE(review): u_long exported with SYSCTL_INT -- confirm the widths
 * agree on every supported platform.
 */
static u_long wantfreevnodes = 25;
SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
/*
 * Number of vnodes on vnode_free_list as maintained by getnewvnode().
 * NOTE(review): u_long exported with SYSCTL_INT, as above.
 */
static u_long freevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

/*
 * Counters and a method selector exported under vfs.*; the code that
 * updates and reads them is outside this part of the file.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");

#ifdef ENABLE_VFS_IOOPT
/* Tunable exported as vfs.ioopt; only compiled in with ENABLE_VFS_IOOPT. */
int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif

struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
struct simplelock mountlist_slock;      /* held by vfs_getvfs() while walking mountlist */
struct simplelock mntvnode_slock;       /* held by insmntque() while editing per-mount vnode lists */
int     nfs_mount_type = -1;
#ifndef NULL_SIMPLELOCKS
static struct simplelock mntid_slock;   /* held by vfs_getnewfsid() while choosing an fsid */
static struct simplelock vnode_free_list_slock; /* held by getnewvnode() while editing the free lists */
static struct simplelock spechash_slock;
#endif
struct nfs_public nfs_pub;      /* publicly exported FS */
static vm_zone_t vnode_zone;    /* zone created in vntblinit(); getnewvnode() allocates vnodes from it */
 130
/*
 * The workitem queue.
 *
 * vntblinit() sizes the syncer_workitem_pending hash table from
 * syncer_maxdelay and then resets syncer_maxdelay to syncer_mask + 1.
 */
#define SYNCER_MAXDELAY         32
static int syncer_maxdelay = SYNCER_MAXDELAY;   /* maximum delay time */
time_t syncdelay = 30;          /* max time to delay syncing data */
/*
 * NOTE(review): the three delays below are time_t objects exported with
 * SYSCTL_INT -- confirm time_t and int have the same width here.
 */
time_t filedelay = 30;          /* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
time_t dirdelay = 29;           /* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
time_t metadelay = 28;          /* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;                     /* number of slots to run ASAP */
static int stat_rush_requests;  /* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

static int syncer_delayno = 0;
static long syncer_mask;        /* hash mask filled in by hashinit() in vntblinit() */
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;        /* table allocated in vntblinit() */

/*
 * Vnode limit, initialised in vntblinit() and exported read/write as
 * kern.maxvnodes.
 */
int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");

/* Forward declarations of file-local export-list helpers. */
static void     vfs_free_addrlist __P((struct netexport *nep));
static int      vfs_free_netcred __P((struct radix_node *rn, void *w));
static int      vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
                                       struct export_args *argp));
 160
 161 /*
 162  * Initialize the vnode management data structures.
 163  */
 164 void
 165 vntblinit()
 166 {
 167
 168         desiredvnodes = maxproc + cnt.v_page_count / 4;
 169         simple_lock_init(&mntvnode_slock);
 170         simple_lock_init(&mntid_slock);
 171         simple_lock_init(&spechash_slock);
 172         TAILQ_INIT(&vnode_free_list);
 173         TAILQ_INIT(&vnode_tobefree_list);
 174         simple_lock_init(&vnode_free_list_slock);
 175         vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
 176         /*
 177          * Initialize the filesystem syncer.
 178          */
 179         syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
 180                 &syncer_mask);
 181         syncer_maxdelay = syncer_mask + 1;
 182 }
 183
 184 /*
 185  * Mark a mount point as busy. Used to synchronize access and to delay
 186  * unmounting. Interlock is not released on failure.
 187  */
 188 int
 189 vfs_busy(mp, flags, interlkp, p)
 190         struct mount *mp;
 191         int flags;
 192         struct simplelock *interlkp;
 193         struct proc *p;
 194 {
 195         int lkflags;
 196
 197         if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 198                 if (flags & LK_NOWAIT)
 199                         return (ENOENT);
 200                 mp->mnt_kern_flag |= MNTK_MWAIT;
 201                 if (interlkp) {
 202                         simple_unlock(interlkp);
 203                 }
 204                 /*
 205                  * Since all busy locks are shared except the exclusive
 206                  * lock granted when unmounting, the only place that a
 207                  * wakeup needs to be done is at the release of the
 208                  * exclusive lock at the end of dounmount.
 209                  */
 210                 tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
 211                 if (interlkp) {
 212                         simple_lock(interlkp);
 213                 }
 214                 return (ENOENT);
 215         }
 216         lkflags = LK_SHARED | LK_NOPAUSE;
 217         if (interlkp)
 218                 lkflags |= LK_INTERLOCK;
 219         if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
 220                 panic("vfs_busy: unexpected lock failure");
 221         return (0);
 222 }
 223
 224 /*
 225  * Free a busy filesystem.
 226  */
 227 void
 228 vfs_unbusy(mp, p)
 229         struct mount *mp;
 230         struct proc *p;
 231 {
 232
 233         lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
 234 }
 235
 236 /*
 237  * Lookup a filesystem type, and if found allocate and initialize
 238  * a mount structure for it.
 239  *
 240  * Devname is usually updated by mount(8) after booting.
 241  */
 242 int
 243 vfs_rootmountalloc(fstypename, devname, mpp)
 244         char *fstypename;
 245         char *devname;
 246         struct mount **mpp;
 247 {
 248         struct proc *p = curproc;       /* XXX */
 249         struct vfsconf *vfsp;
 250         struct mount *mp;
 251
 252         if (fstypename == NULL)
 253                 return (ENODEV);
 254         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
 255                 if (!strcmp(vfsp->vfc_name, fstypename))
 256                         break;
 257         if (vfsp == NULL)
 258                 return (ENODEV);
 259         mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
 260         bzero((char *)mp, (u_long)sizeof(struct mount));
 261         lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
 262         (void)vfs_busy(mp, LK_NOWAIT, 0, p);
 263         LIST_INIT(&mp->mnt_vnodelist);
 264         mp->mnt_vfc = vfsp;
 265         mp->mnt_op = vfsp->vfc_vfsops;
 266         mp->mnt_flag = MNT_RDONLY;
 267         mp->mnt_vnodecovered = NULLVP;
 268         vfsp->vfc_refcount++;
 269         mp->mnt_iosize_max = DFLTPHYS;
 270         mp->mnt_stat.f_type = vfsp->vfc_typenum;
 271         mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
 272         strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
 273         mp->mnt_stat.f_mntonname[0] = '/';
 274         mp->mnt_stat.f_mntonname[1] = 0;
 275         (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
 276         *mpp = mp;
 277         return (0);
 278 }
 279
 280 /*
 281  * Find an appropriate filesystem to use for the root. If a filesystem
 282  * has not been preselected, walk through the list of known filesystems
 283  * trying those that have mountroot routines, and try them until one
 284  * works or we have tried them all.
 285  */
 286 #ifdef notdef   /* XXX JH */
 287 int
 288 lite2_vfs_mountroot()
 289 {
 290         struct vfsconf *vfsp;
 291         extern int (*lite2_mountroot) __P((void));
 292         int error;
 293
 294         if (lite2_mountroot != NULL)
 295                 return ((*lite2_mountroot)());
 296         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
 297                 if (vfsp->vfc_mountroot == NULL)
 298                         continue;
 299                 if ((error = (*vfsp->vfc_mountroot)()) == 0)
 300                         return (0);
 301                 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
 302         }
 303         return (ENODEV);
 304 }
 305 #endif
 306
 307 /*
 308  * Lookup a mount point by filesystem identifier.
 309  */
 310 struct mount *
 311 vfs_getvfs(fsid)
 312         fsid_t *fsid;
 313 {
 314         register struct mount *mp;
 315
 316         simple_lock(&mountlist_slock);
 317         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 318                 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
 319                     mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
 320                         simple_unlock(&mountlist_slock);
 321                         return (mp);
 322             }
 323         }
 324         simple_unlock(&mountlist_slock);
 325         return ((struct mount *) 0);
 326 }
 327
 328 /*
 329  * Get a new unique fsid
 330  *
 331  * Keep in mind that several mounts may be running in parallel,
 332  * so always increment mntid_base even if lower numbers are available.
 333  */
 334
 335 static u_short mntid_base;
 336
 337 void
 338 vfs_getnewfsid(mp)
 339         struct mount *mp;
 340 {
 341         fsid_t tfsid;
 342         int mtype;
 343
 344         simple_lock(&mntid_slock);
 345
 346         mtype = mp->mnt_vfc->vfc_typenum;
 347         for (;;) {
 348                 tfsid.val[0] = makeudev(255, mtype + (mntid_base << 16));
 349                 tfsid.val[1] = mtype;
 350                 ++mntid_base;
 351                 if (vfs_getvfs(&tfsid) == NULL)
 352                         break;
 353         }
 354
 355         mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
 356         mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
 357
 358         simple_unlock(&mntid_slock);
 359 }
 360
 361 /*
 362  * Get what should become the root fsid.
 363  *
 364  * This is somewhat of a hack.  If the rootdev is not known we
 365  * assume that vfs_getnewfsid() will be called momentarily to
 366  * allocate it, and we return what vfs_getnewfsid() will return.
 367  */
 368
 369 dev_t
 370 vfs_getrootfsid(struct mount *mp)
 371 {
 372         int mtype;
 373
 374         mtype = mp->mnt_vfc->vfc_typenum;
 375         return(makedev(255, mtype + (mntid_base << 16)));
 376 }
 377
 378 /*
 379  * Knob to control the precision of file timestamps:
 380  *
 381  *   0 = seconds only; nanoseconds zeroed.
 382  *   1 = seconds and nanoseconds, accurate within 1/HZ.
 383  *   2 = seconds and nanoseconds, truncated to microseconds.
 384  * >=3 = seconds and nanoseconds, maximum precision.
 385  */
 386 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 387
 388 static int timestamp_precision = TSP_SEC;
 389 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
 390     &timestamp_precision, 0, "");
 391
 392 /*
 393  * Get a current timestamp.
 394  */
 395 void
 396 vfs_timestamp(tsp)
 397         struct timespec *tsp;
 398 {
 399         struct timeval tv;
 400
 401         switch (timestamp_precision) {
 402         case TSP_SEC:
 403                 tsp->tv_sec = time_second;
 404                 tsp->tv_nsec = 0;
 405                 break;
 406         case TSP_HZ:
 407                 getnanotime(tsp);
 408                 break;
 409         case TSP_USEC:
 410                 microtime(&tv);
 411                 TIMEVAL_TO_TIMESPEC(&tv, tsp);
 412                 break;
 413         case TSP_NSEC:
 414         default:
 415                 nanotime(tsp);
 416                 break;
 417         }
 418 }
 419
 420 /*
 421  * Set vnode attributes to VNOVAL
 422  */
 423 void
 424 vattr_null(vap)
 425         register struct vattr *vap;
 426 {
 427
 428         vap->va_type = VNON;
 429         vap->va_size = VNOVAL;
 430         vap->va_bytes = VNOVAL;
 431         vap->va_mode = VNOVAL;
 432         vap->va_nlink = VNOVAL;
 433         vap->va_uid = VNOVAL;
 434         vap->va_gid = VNOVAL;
 435         vap->va_fsid = VNOVAL;
 436         vap->va_fileid = VNOVAL;
 437         vap->va_blocksize = VNOVAL;
 438         vap->va_rdev = VNOVAL;
 439         vap->va_atime.tv_sec = VNOVAL;
 440         vap->va_atime.tv_nsec = VNOVAL;
 441         vap->va_mtime.tv_sec = VNOVAL;
 442         vap->va_mtime.tv_nsec = VNOVAL;
 443         vap->va_ctime.tv_sec = VNOVAL;
 444         vap->va_ctime.tv_nsec = VNOVAL;
 445         vap->va_flags = VNOVAL;
 446         vap->va_gen = VNOVAL;
 447         vap->va_vaflags = 0;
 448 }
 449
 450 /*
 451  * Routines having to do with the management of the vnode table.
 452  */
 453 extern vop_t **dead_vnodeop_p;
 454
/*
 * Return the next vnode from the free list.
 *
 * Either recycles a vnode taken from vnode_free_list or allocates a new
 * one from vnode_zone.  The result is stored in *vpp with a use count of
 * 1, type VNON, tag `tag', operations vector `vops' and no private data,
 * and is placed on mp's vnode list by insmntque().  Always returns 0.
 */
int
getnewvnode(tag, mp, vops, vpp)
        enum vtagtype tag;
        struct mount *mp;
        vop_t **vops;
        struct vnode **vpp;
{
        int s;
        struct proc *p = curproc;       /* XXX */
        struct vnode *vp, *tvp, *nvp;
        vm_object_t object;
        TAILQ_HEAD(freelst, vnode) vnode_tmp_list;

        /*
         * We take the least recently used vnode from the freelist
         * if we can get it and it has no cached pages, and no
         * namecache entries refer to it as their source.
         * Otherwise we allocate a new vnode
         */

        s = splbio();
        simple_lock(&vnode_free_list_slock);
        TAILQ_INIT(&vnode_tmp_list);

        /*
         * Drain the to-be-freed list onto the free list.  Vnodes flagged
         * VAGE go to the head, all others to the tail.
         */
        for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
                nvp = TAILQ_NEXT(vp, v_freelist);
                TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
                if (vp->v_flag & VAGE) {
                        TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
                } else {
                        TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
                }
                vp->v_flag &= ~(VTBFREE|VAGE);
                vp->v_flag |= VFREE;
                if (vp->v_usecount)
                        panic("tobe free vnode isn't");
                freevnodes++;
        }

        /*
         * Decide whether to recycle.  vp == NULL after this chain means
         * "allocate a new vnode below".
         */
        if (wantfreevnodes && freevnodes < wantfreevnodes) {
                vp = NULL;
        } else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
                /*
                 * XXX: this is only here to be backwards compatible
                 */
                vp = NULL;
        } else {
                /*
                 * Scan the free list for a recyclable vnode.  The loop
                 * exits via break with vp's interlock held, or runs off
                 * the end with vp == NULL.
                 */
                for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
                        nvp = TAILQ_NEXT(vp, v_freelist);
                        /* Skip vnodes whose interlock is not immediately available. */
                        if (!simple_lock_try(&vp->v_interlock))
                                continue;
                        if (vp->v_usecount)
                                panic("free vnode isn't");

                        object = vp->v_object;
                        if (object && (object->resident_page_count || object->ref_count)) {
                                printf("object inconsistant state: RPC: %d, RC: %d\n",
                                        object->resident_page_count, object->ref_count);
                                /* Don't recycle if it's caching some pages */
                                /*
                                 * Park it on the temporary list, interlock
                                 * still held; it is put back and unlocked
                                 * after the scan.
                                 */
                                TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
                                TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
                                continue;
                        } else if (LIST_FIRST(&vp->v_cache_src)) {
                                /* Don't recycle if active in the namecache */
                                simple_unlock(&vp->v_interlock);
                                continue;
                        } else {
                                break;
                        }
                }
        }

        /*
         * Return the parked vnodes to the tail of the free list and drop
         * the interlocks taken during the scan.
         */
        for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
                nvp = TAILQ_NEXT(tvp, v_freelist);
                TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
                TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
                simple_unlock(&tvp->v_interlock);
        }

        if (vp) {
                /* Recycle path: vp's interlock is still held from the scan. */
                vp->v_flag |= VDOOMED;
                TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
                freevnodes--;
                simple_unlock(&vnode_free_list_slock);
                cache_purge(vp);
                vp->v_lease = NULL;
                /*
                 * vgonel() is entered with the interlock held and nothing
                 * here unlocks it afterwards; a VBAD vnode skips vgonel(),
                 * so its interlock is dropped directly.
                 */
                if (vp->v_type != VBAD) {
                        vgonel(vp, p);
                } else {
                        simple_unlock(&vp->v_interlock);
                }

#ifdef INVARIANTS
                {
                        int s;  /* shadows the outer s; local to this check */

                        if (vp->v_data)
                                panic("cleaned vnode isn't");
                        s = splbio();
                        if (vp->v_numoutput)
                                panic("Clean vnode has pending I/O's");
                        splx(s);
                }
#endif
                /* Reset the per-use state left over from the previous owner. */
                vp->v_flag = 0;
                vp->v_lastw = 0;
                vp->v_lasta = 0;
                vp->v_cstart = 0;
                vp->v_clen = 0;
                vp->v_socket = 0;
                vp->v_writecount = 0;   /* XXX */
        } else {
                /* Allocation path: build a zero-filled vnode from the zone. */
                simple_unlock(&vnode_free_list_slock);
                vp = (struct vnode *) zalloc(vnode_zone);
                bzero((char *) vp, sizeof *vp);
                simple_lock_init(&vp->v_interlock);
                vp->v_dd = vp;
                cache_purge(vp);
                LIST_INIT(&vp->v_cache_src);
                TAILQ_INIT(&vp->v_cache_dst);
                numvnodes++;
        }

        /* Common initialisation for recycled and new vnodes alike. */
        TAILQ_INIT(&vp->v_cleanblkhd);
        TAILQ_INIT(&vp->v_dirtyblkhd);
        vp->v_type = VNON;
        vp->v_tag = tag;
        vp->v_op = vops;
        insmntque(vp, mp);
        *vpp = vp;
        vp->v_usecount = 1;
        vp->v_data = 0;
        splx(s);

        vfs_object_create(vp, p, p->p_ucred);
        return (0);
}
 595
 596 /*
 597  * Move a vnode from one mount queue to another.
 598  */
 599 static void
 600 insmntque(vp, mp)
 601         register struct vnode *vp;
 602         register struct mount *mp;
 603 {
 604
 605         simple_lock(&mntvnode_slock);
 606         /*
 607          * Delete from old mount point vnode list, if on one.
 608          */
 609         if (vp->v_mount != NULL)
 610                 LIST_REMOVE(vp, v_mntvnodes);
 611         /*
 612          * Insert into list of vnodes for the new mount point, if available.
 613          */
 614         if ((vp->v_mount = mp) == NULL) {
 615                 simple_unlock(&mntvnode_slock);
 616                 return;
 617         }
 618         LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
 619         simple_unlock(&mntvnode_slock);
 620 }
 621
 622 /*
 623  * Update outstanding I/O count and do wakeup if requested.
 624  */
 625 void
 626 vwakeup(bp)
 627         register struct buf *bp;
 628 {
 629         register struct vnode *vp;
 630
 631         bp->b_flags &= ~B_WRITEINPROG;
 632         if ((vp = bp->b_vp)) {
 633                 vp->v_numoutput--;
 634                 if (vp->v_numoutput < 0)
 635                         panic("vwakeup: neg numoutput");
 636                 if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
 637                         vp->v_flag &= ~VBWAIT;
 638                         wakeup((caddr_t) &vp->v_numoutput);
 639                 }
 640         }
 641 }
 642
/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 *
 * With V_SAVE set in flags, pending output is waited for and dirty
 * buffers are written with VOP_FSYNC() before anything is invalidated.
 * slpflag and slptimeo are handed to the sleeps that wait for output and
 * for busy buffers.
 *
 * Returns 0 on success, or the error from tsleep(), VOP_FSYNC() or
 * BUF_TIMELOCK() that cut the operation short.  Panics if buffers remain
 * on either list at the end.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
        register struct vnode *vp;
        int flags;
        struct ucred *cred;
        struct proc *p;
        int slpflag, slptimeo;
{
        register struct buf *bp;
        struct buf *nbp, *blist;
        int s, error;
        vm_object_t object;

        if (flags & V_SAVE) {
                s = splbio();
                /* Let writes already in flight complete first. */
                while (vp->v_numoutput) {
                        vp->v_flag |= VBWAIT;
                        error = tsleep((caddr_t)&vp->v_numoutput,
                            slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
                        if (error) {
                                splx(s);
                                return (error);
                        }
                }
                if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
                        /* VOP_FSYNC() is called with the spl level restored. */
                        splx(s);
                        if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
                                return (error);
                        s = splbio();
                        if (vp->v_numoutput > 0 ||
                            !TAILQ_EMPTY(&vp->v_dirtyblkhd))
                                panic("vinvalbuf: dirty bufs");
                }
                splx(s);
        }
        s = splbio();
        /*
         * Repeat until both buffer lists are empty.  Each pass works on
         * the clean list, or on the dirty list once the clean list is
         * empty.
         */
        for (;;) {
                blist = TAILQ_FIRST(&vp->v_cleanblkhd);
                if (!blist)
                        blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
                if (!blist)
                        break;

                for (bp = blist; bp; bp = nbp) {
                        nbp = TAILQ_NEXT(bp, b_vnbufs);
                        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                                /*
                                 * The buffer is busy: wait for it.  ENOLCK
                                 * restarts the scan from the list heads; any
                                 * other result is returned to the caller
                                 * as-is.
                                 */
                                error = BUF_TIMELOCK(bp,
                                    LK_EXCLUSIVE | LK_SLEEPFAIL,
                                    "vinvalbuf", slpflag, slptimeo);
                                if (error == ENOLCK)
                                        break;
                                splx(s);
                                return (error);
                        }
                        /*
                         * XXX Since there are no node locks for NFS, I
                         * believe there is a slight chance that a delayed
                         * write will occur while sleeping just above, so
                         * check for it.  Note that vfs_bio_awrite expects
                         * buffers to reside on a queue, while VOP_BWRITE and
                         * brelse do not.
                         */
                        if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
                                (flags & V_SAVE)) {

                                if (bp->b_vp == vp) {
                                        if (bp->b_flags & B_CLUSTEROK) {
                                                BUF_UNLOCK(bp);
                                                vfs_bio_awrite(bp);
                                        } else {
                                                bremfree(bp);
                                                bp->b_flags |= B_ASYNC;
                                                VOP_BWRITE(bp->b_vp, bp);
                                        }
                                } else {
                                        bremfree(bp);
                                        (void) VOP_BWRITE(bp->b_vp, bp);
                                }
                                /* A write was started; rescan from the list heads. */
                                break;
                        }
                        /* Discard the buffer: mark it invalid and release it. */
                        bremfree(bp);
                        bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
                        bp->b_flags &= ~B_ASYNC;
                        brelse(bp);
                }
        }

        /* Wait for any output still pending on the vnode to drain. */
        while (vp->v_numoutput > 0) {
                vp->v_flag |= VBWAIT;
                tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
        }

        splx(s);

        /*
         * Destroy the copy in the VM cache, too.
         */
        simple_lock(&vp->v_interlock);
        object = vp->v_object;
        if (object != NULL) {
                /*
                 * The last argument is TRUE only for V_SAVE callers;
                 * presumably it limits removal to clean pages -- confirm
                 * against vm_object_page_remove().
                 */
                vm_object_page_remove(object, 0, 0,
                        (flags & V_SAVE) ? TRUE : FALSE);
        }
        simple_unlock(&vp->v_interlock);

        if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
                panic("vinvalbuf: flush failed");
        return (0);
}
 756
 757 /*
 758  * Truncate a file's buffer and pages to a specified length.  This
 759  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 760  * sync activity.
 761  */
 762 int
 763 vtruncbuf(vp, cred, p, length, blksize)
 764         register struct vnode *vp;
 765         struct ucred *cred;
 766         struct proc *p;
 767         off_t length;
 768         int blksize;
 769 {
 770         register struct buf *bp;
 771         struct buf *nbp;
 772         int s, anyfreed;
 773         int trunclbn;
 774
 775         /*
 776          * Round up to the *next* lbn.
 777          */
 778         trunclbn = (length + blksize - 1) / blksize;
 779
 780         s = splbio();
 781 restart:
 782         anyfreed = 1;
 783         for (;anyfreed;) {
 784                 anyfreed = 0;
 785                 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
 786                         nbp = TAILQ_NEXT(bp, b_vnbufs);
 787                         if (bp->b_lblkno >= trunclbn) {
 788                                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 789                                         BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
 790                                         goto restart;
 791                                 } else {
 792                                         bremfree(bp);
 793                                         bp->b_flags |= (B_INVAL | B_RELBUF);
 794                                         bp->b_flags &= ~B_ASYNC;
 795                                         brelse(bp);
 796                                         anyfreed = 1;
 797                                 }
 798                                 if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)||
 799                                          (nbp->b_vp != vp) ||
 800                                          (nbp->b_flags & B_DELWRI))) {
 801                                         goto restart;
 802                                 }
 803                         }
 804                 }
 805
 806                 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 807                         nbp = TAILQ_NEXT(bp, b_vnbufs);
 808                         if (bp->b_lblkno >= trunclbn) {
 809                                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 810                                         BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
 811                                         goto restart;
 812                                 } else {
 813                                         bremfree(bp);
 814                                         bp->b_flags |= (B_INVAL | B_RELBUF);
 815                                         bp->b_flags &= ~B_ASYNC;
 816                                         brelse(bp);
 817                                         anyfreed = 1;
 818                                 }
 819                                 if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)||
 820                                          (nbp->b_vp != vp) ||
 821                                          (nbp->b_flags & B_DELWRI) == 0)) {
 822                                         goto restart;
 823                                 }
 824                         }
 825                 }
 826         }
 827
 828         if (length > 0) {
 829 restartsync:
 830                 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 831                         nbp = TAILQ_NEXT(bp, b_vnbufs);
 832                         if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
 833                                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 834                                         BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
 835                                         goto restart;
 836                                 } else {
 837                                         bremfree(bp);
 838                                         if (bp->b_vp == vp) {
 839                                                 bp->b_flags |= B_ASYNC;
 840                                         } else {
 841                                                 bp->b_flags &= ~B_ASYNC;
 842                                         }
 843                                         VOP_BWRITE(bp->b_vp, bp);
 844                                 }
 845                                 goto restartsync;
 846                         }
 847
 848                 }
 849         }
 850
 851         while (vp->v_numoutput > 0) {
 852                 vp->v_flag |= VBWAIT;
 853                 tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
 854         }
 855
 856         splx(s);
 857
 858         vnode_pager_setsize(vp, length);
 859
 860         return (0);
 861 }
 862
 863 /*
 864  * Associate a buffer with a vnode.
 865  */
 866 void
 867 bgetvp(vp, bp)
 868         register struct vnode *vp;
 869         register struct buf *bp;
 870 {
 871         int s;
 872
 873         KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
 874
 875         vhold(vp);
 876         bp->b_vp = vp;
 877         bp->b_dev = vn_todev(vp);
 878         /*
 879          * Insert onto list for new vnode.
 880          */
 881         s = splbio();
 882         bp->b_xflags |= B_VNCLEAN;
 883         bp->b_xflags &= ~B_VNDIRTY;
 884         TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
 885         splx(s);
 886 }
 887
 888 /*
 889  * Disassociate a buffer from a vnode.
 890  */
 891 void
 892 brelvp(bp)
 893         register struct buf *bp;
 894 {
 895         struct vnode *vp;
 896         struct buflists *listheadp;
 897         int s;
 898
 899         KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
 900
 901         /*
 902          * Delete from old vnode list, if on one.
 903          */
 904         vp = bp->b_vp;
 905         s = splbio();
 906         if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
 907                 if (bp->b_xflags & B_VNDIRTY)
 908                         listheadp = &vp->v_dirtyblkhd;
 909                 else
 910                         listheadp = &vp->v_cleanblkhd;
 911                 TAILQ_REMOVE(listheadp, bp, b_vnbufs);
 912                 bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
 913         }
 914         if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
 915                 vp->v_flag &= ~VONWORKLST;
 916                 LIST_REMOVE(vp, v_synclist);
 917         }
 918         splx(s);
 919         bp->b_vp = (struct vnode *) 0;
 920         vdrop(vp);
 921 }
 922
 923 /*
 924  * The workitem queue.
 925  *
 926  * It is useful to delay writes of file data and filesystem metadata
 927  * for tens of seconds so that quickly created and deleted files need
 928  * not waste disk bandwidth being created and removed. To realize this,
 929  * we append vnodes to a "workitem" queue. When running with a soft
 930  * updates implementation, most pending metadata dependencies should
 931  * not wait for more than a few seconds. Thus, metadata on mounted block
 932  * devices is delayed only about half the time that file data is delayed.
 933  * Similarly, directory updates are more critical, so are only delayed
 934  * about a third the time that file data is delayed. Thus, there are
 935  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 936  * one each second (driven off the filesystem syncer process). The
 937  * syncer_delayno variable indicates the next queue that is to be processed.
 938  * Items that need to be processed soon are placed in this queue:
 939  *
 940  *      syncer_workitem_pending[syncer_delayno]
 941  *
 942  * A delay of fifteen seconds is done by placing the request fifteen
 943  * entries later in the queue:
 944  *
 945  *      syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 946  *
 947  */
 948
 949 /*
 950  * Add an item to the syncer work queue.
 951  */
 952 static void
 953 vn_syncer_add_to_worklist(struct vnode *vp, int delay)
 954 {
 955         int s, slot;
 956
 957         s = splbio();
 958
 959         if (vp->v_flag & VONWORKLST) {
 960                 LIST_REMOVE(vp, v_synclist);
 961         }
 962
 963         if (delay > syncer_maxdelay - 2)
 964                 delay = syncer_maxdelay - 2;
 965         slot = (syncer_delayno + delay) & syncer_mask;
 966
 967         LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
 968         vp->v_flag |= VONWORKLST;
 969         splx(s);
 970 }
 971
 972 struct  proc *updateproc;
 973 static void sched_sync __P((void));
 974 static struct kproc_desc up_kp = {
 975         "syncer",
 976         sched_sync,
 977         &updateproc
 978 };
 979 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
 980
 981 /*
 982  * System filesystem synchronizer daemon.
 983  */
 984 void
 985 sched_sync(void)
 986 {
 987         struct synclist *slp;
 988         struct vnode *vp;
 989         long starttime;
 990         int s;
 991         struct proc *p = updateproc;
 992
 993         p->p_flag |= P_BUFEXHAUST;
 994
 995         for (;;) {
 996                 starttime = time_second;
 997
 998                 /*
 999                  * Push files whose dirty time has expired.  Be careful
1000                  * of interrupt race on slp queue.
1001                  */
1002                 s = splbio();
1003                 slp = &syncer_workitem_pending[syncer_delayno];
1004                 syncer_delayno += 1;
1005                 if (syncer_delayno == syncer_maxdelay)
1006                         syncer_delayno = 0;
1007                 splx(s);
1008
1009                 while ((vp = LIST_FIRST(slp)) != NULL) {
1010                         if (VOP_ISLOCKED(vp) == 0) {
1011                                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
1012                                 (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
1013                                 VOP_UNLOCK(vp, 0, p);
1014                         }
1015                         s = splbio();
1016                         if (LIST_FIRST(slp) == vp) {
1017                                 /*
1018                                  * Note: v_tag VT_VFS vps can remain on the
1019                                  * worklist too with no dirty blocks, but
1020                                  * since sync_fsync() moves it to a different
1021                                  * slot we are safe.
1022                                  */
1023                                 if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
1024                                     !vn_isdisk(vp))
1025                                         panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
1026                                 /*
1027                                  * Put us back on the worklist.  The worklist
1028                                  * routine will remove us from our current
1029                                  * position and then add us back in at a later
1030                                  * position.
1031                                  */
1032                                 vn_syncer_add_to_worklist(vp, syncdelay);
1033                         }
1034                         splx(s);
1035                 }
1036
1037                 /*
1038                  * Do soft update processing.
1039                  */
1040                 if (bioops.io_sync)
1041                         (*bioops.io_sync)(NULL);
1042
1043                 /*
1044                  * The variable rushjob allows the kernel to speed up the
1045                  * processing of the filesystem syncer process. A rushjob
1046                  * value of N tells the filesystem syncer to process the next
1047                  * N seconds worth of work on its queue ASAP. Currently rushjob
1048                  * is used by the soft update code to speed up the filesystem
1049                  * syncer process when the incore state is getting so far
1050                  * ahead of the disk that the kernel memory pool is being
1051                  * threatened with exhaustion.
1052                  */
1053                 if (rushjob > 0) {
1054                         rushjob -= 1;
1055                         continue;
1056                 }
1057                 /*
1058                  * If it has taken us less than a second to process the
1059                  * current work, then wait. Otherwise start right over
1060                  * again. We can still lose time if any single round
1061                  * takes more than two seconds, but it does not really
1062                  * matter as we are just trying to generally pace the
1063                  * filesystem activity.
1064                  */
1065                 if (time_second == starttime)
1066                         tsleep(&lbolt, PPAUSE, "syncer", 0);
1067         }
1068 }
1069
1070 /*
1071  * Request the syncer daemon to speed up its work.
1072  * We never push it to speed up more than half of its
1073  * normal turn time, otherwise it could take over the cpu.
1074  */
1075 int
1076 speedup_syncer()
1077 {
1078         int s;
1079
1080         s = splhigh();
1081         if (updateproc->p_wchan == &lbolt)
1082                 setrunnable(updateproc);
1083         splx(s);
1084         if (rushjob < syncdelay / 2) {
1085                 rushjob += 1;
1086                 stat_rush_requests += 1;
1087                 return (1);
1088         }
1089         return(0);
1090 }
1091
1092 /*
1093  * Associate a p-buffer with a vnode.
1094  *
1095  * Also sets B_PAGING flag to indicate that vnode is not fully associated
1096  * with the buffer.  i.e. the bp has not been linked into the vnode or
1097  * ref-counted.
1098  */
1099 void
1100 pbgetvp(vp, bp)
1101         register struct vnode *vp;
1102         register struct buf *bp;
1103 {
1104
1105         KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1106
1107         bp->b_vp = vp;
1108         bp->b_flags |= B_PAGING;
1109         bp->b_dev = vn_todev(vp);
1110 }
1111
1112 /*
1113  * Disassociate a p-buffer from a vnode.
1114  */
1115 void
1116 pbrelvp(bp)
1117         register struct buf *bp;
1118 {
1119
1120         KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1121
1122 #if !defined(MAX_PERF)
1123         /* XXX REMOVE ME */
1124         if (bp->b_vnbufs.tqe_next != NULL) {
1125                 panic(
1126                     "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1127                     bp,
1128                     (int)bp->b_flags
1129                 );
1130         }
1131 #endif
1132         bp->b_vp = (struct vnode *) 0;
1133         bp->b_flags &= ~B_PAGING;
1134 }
1135
1136 void
1137 pbreassignbuf(bp, newvp)
1138         struct buf *bp;
1139         struct vnode *newvp;
1140 {
1141 #if !defined(MAX_PERF)
1142         if ((bp->b_flags & B_PAGING) == 0) {
1143                 panic(
1144                     "pbreassignbuf() on non phys bp %p",
1145                     bp
1146                 );
1147         }
1148 #endif
1149         bp->b_vp = newvp;
1150 }
1151
1152 /*
1153  * Reassign a buffer from one vnode to another.
1154  * Used to assign file specific control information
1155  * (indirect blocks) to the vnode to which they belong.
1156  */
1157 void
1158 reassignbuf(bp, newvp)
1159         register struct buf *bp;
1160         register struct vnode *newvp;
1161 {
1162         struct buflists *listheadp;
1163         int delay;
1164         int s;
1165
1166         if (newvp == NULL) {
1167                 printf("reassignbuf: NULL");
1168                 return;
1169         }
1170         ++reassignbufcalls;
1171
1172 #if !defined(MAX_PERF)
1173         /*
1174          * B_PAGING flagged buffers cannot be reassigned because their vp
1175          * is not fully linked in.
1176          */
1177         if (bp->b_flags & B_PAGING)
1178                 panic("cannot reassign paging buffer");
1179 #endif
1180
1181         s = splbio();
1182         /*
1183          * Delete from old vnode list, if on one.
1184          */
1185         if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
1186                 if (bp->b_xflags & B_VNDIRTY)
1187                         listheadp = &bp->b_vp->v_dirtyblkhd;
1188                 else
1189                         listheadp = &bp->b_vp->v_cleanblkhd;
1190                 TAILQ_REMOVE(listheadp, bp, b_vnbufs);
1191                 bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
1192                 if (bp->b_vp != newvp) {
1193                         vdrop(bp->b_vp);
1194                         bp->b_vp = NULL;        /* for clarification */
1195                 }
1196         }
1197         /*
1198          * If dirty, put on list of dirty buffers; otherwise insert onto list
1199          * of clean buffers.
1200          */
1201         if (bp->b_flags & B_DELWRI) {
1202                 struct buf *tbp;
1203
1204                 listheadp = &newvp->v_dirtyblkhd;
1205                 if ((newvp->v_flag & VONWORKLST) == 0) {
1206                         switch (newvp->v_type) {
1207                         case VDIR:
1208                                 delay = dirdelay;
1209                                 break;
1210                         case VCHR:
1211                         case VBLK:
1212                                 if (newvp->v_specmountpoint != NULL) {
1213                                         delay = metadelay;
1214                                         break;
1215                                 }
1216                                 /* fall through */
1217                         default:
1218                                 delay = filedelay;
1219                         }
1220                         vn_syncer_add_to_worklist(newvp, delay);
1221                 }
1222                 bp->b_xflags |= B_VNDIRTY;
1223                 tbp = TAILQ_FIRST(listheadp);
1224                 if (tbp == NULL ||
1225                     bp->b_lblkno == 0 ||
1226                     (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
1227                         TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1228                         ++reassignbufsortgood;
1229                 } else if (bp->b_lblkno < 0) {
1230                         TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
1231                         ++reassignbufsortgood;
1232                 } else if (reassignbufmethod == 1) {
1233                         /*
1234                          * New sorting algorithm, only handle sequential case,
1235                          * otherwise guess.
1236                          */
1237                         if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
1238                             (tbp->b_xflags & B_VNDIRTY)) {
1239                                 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1240                                 ++reassignbufsortgood;
1241                         } else {
1242                                 TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1243                                 ++reassignbufsortbad;
1244                         }
1245                 } else {
1246                         /*
1247                          * Old sorting algorithm, scan queue and insert
1248                          */
1249                         struct buf *ttbp;
1250                         while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
1251                             (ttbp->b_lblkno < bp->b_lblkno)) {
1252                                 ++reassignbufloops;
1253                                 tbp = ttbp;
1254                         }
1255                         TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1256                 }
1257         } else {
1258                 bp->b_xflags |= B_VNCLEAN;
1259                 TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
1260                 if ((newvp->v_flag & VONWORKLST) &&
1261                     TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1262                         newvp->v_flag &= ~VONWORKLST;
1263                         LIST_REMOVE(newvp, v_synclist);
1264                 }
1265         }
1266         if (bp->b_vp != newvp) {
1267                 bp->b_vp = newvp;
1268                 vhold(bp->b_vp);
1269         }
1270         splx(s);
1271 }
1272
1273 /*
1274  * Create a vnode for a block device.
1275  * Used for mounting the root file system.
1276  */
1277 int
1278 bdevvp(dev, vpp)
1279         dev_t dev;
1280         struct vnode **vpp;
1281 {
1282         register struct vnode *vp;
1283         struct vnode *nvp;
1284         int error;
1285
1286         if (dev == NODEV) {
1287                 *vpp = NULLVP;
1288                 return (ENXIO);
1289         }
1290         error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1291         if (error) {
1292                 *vpp = NULLVP;
1293                 return (error);
1294         }
1295         vp = nvp;
1296         vp->v_type = VBLK;
1297         addalias(vp, dev);
1298         *vpp = vp;
1299         return (0);
1300 }
1301
1302 /*
1303  * Add vnode to the alias list hung off the dev_t.
1304  *
1305  * The reason for this gunk is that multiple vnodes can reference
1306  * the same physical device, so checking vp->v_usecount to see
1307  * how many users there are is inadequate; the v_usecount for
1308  * the vnodes need to be accumulated.  vcount() does that.
1309  */
1310 void
1311 addaliasu(nvp, nvp_rdev)
1312         struct vnode *nvp;
1313         udev_t nvp_rdev;
1314 {
1315
1316         if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1317                 panic("addaliasu on non-special vnode");
1318         addalias(nvp, udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0));
1319 }
1320
1321 void
1322 addalias(nvp, dev)
1323         struct vnode *nvp;
1324         dev_t dev;
1325 {
1326
1327         if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1328                 panic("addalias on non-special vnode");
1329
1330         nvp->v_rdev = dev;
1331         simple_lock(&spechash_slock);
1332         SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1333         simple_unlock(&spechash_slock);
1334 }
1335
1336 /*
1337  * Grab a particular vnode from the free list, increment its
1338  * reference count and lock it. The vnode lock bit is set if the
1339  * vnode is being eliminated in vgone. The process is awakened
1340  * when the transition is completed, and an error returned to
1341  * indicate that the vnode is no longer usable (possibly having
1342  * been changed to a new file system type).
1343  */
1344 int
1345 vget(vp, flags, p)
1346         register struct vnode *vp;
1347         int flags;
1348         struct proc *p;
1349 {
1350         int error;
1351
1352         /*
1353          * If the vnode is in the process of being cleaned out for
1354          * another use, we wait for the cleaning to finish and then
1355          * return failure. Cleaning is determined by checking that
1356          * the VXLOCK flag is set.
1357          */
1358         if ((flags & LK_INTERLOCK) == 0) {
1359                 simple_lock(&vp->v_interlock);
1360         }
1361         if (vp->v_flag & VXLOCK) {
1362                 vp->v_flag |= VXWANT;
1363                 simple_unlock(&vp->v_interlock);
1364                 tsleep((caddr_t)vp, PINOD, "vget", 0);
1365                 return (ENOENT);
1366         }
1367
1368         vp->v_usecount++;
1369
1370         if (VSHOULDBUSY(vp))
1371                 vbusy(vp);
1372         if (flags & LK_TYPE_MASK) {
1373                 if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
1374                         /*
1375                          * must expand vrele here because we do not want
1376                          * to call VOP_INACTIVE if the reference count
1377                          * drops back to zero since it was never really
1378                          * active. We must remove it from the free list
1379                          * before sleeping so that multiple processes do
1380                          * not try to recycle it.
1381                          */
1382                         simple_lock(&vp->v_interlock);
1383                         vp->v_usecount--;
1384                         if (VSHOULDFREE(vp))
1385                                 vfree(vp);
1386                         simple_unlock(&vp->v_interlock);
1387                 }
1388                 return (error);
1389         }
1390         simple_unlock(&vp->v_interlock);
1391         return (0);
1392 }
1393
1394 void
1395 vref(struct vnode *vp)
1396 {
1397         simple_lock(&vp->v_interlock);
1398         vp->v_usecount++;
1399         simple_unlock(&vp->v_interlock);
1400 }
1401
1402 /*
1403  * Vnode put/release.
1404  * If count drops to zero, call inactive routine and return to freelist.
1405  */
1406 void
1407 vrele(vp)
1408         struct vnode *vp;
1409 {
1410         struct proc *p = curproc;       /* XXX */
1411
1412         KASSERT(vp != NULL, ("vrele: null vp"));
1413
1414         simple_lock(&vp->v_interlock);
1415
1416         if (vp->v_usecount > 1) {
1417
1418                 vp->v_usecount--;
1419                 simple_unlock(&vp->v_interlock);
1420
1421                 return;
1422         }
1423
1424         if (vp->v_usecount == 1) {
1425
1426                 vp->v_usecount--;
1427                 if (VSHOULDFREE(vp))
1428                         vfree(vp);
1429         /*
1430          * If we are doing a vput, the node is already locked, and we must
1431          * call VOP_INACTIVE with the node locked.  So, in the case of
1432          * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1433          */
1434                 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1435                         VOP_INACTIVE(vp, p);
1436                 }
1437
1438         } else {
1439 #ifdef DIAGNOSTIC
1440                 vprint("vrele: negative ref count", vp);
1441                 simple_unlock(&vp->v_interlock);
1442 #endif
1443                 panic("vrele: negative ref cnt");
1444         }
1445 }
1446
1447 void
1448 vput(vp)
1449         struct vnode *vp;
1450 {
1451         struct proc *p = curproc;       /* XXX */
1452
1453         KASSERT(vp != NULL, ("vput: null vp"));
1454
1455         simple_lock(&vp->v_interlock);
1456
1457         if (vp->v_usecount > 1) {
1458
1459                 vp->v_usecount--;
1460                 VOP_UNLOCK(vp, LK_INTERLOCK, p);
1461                 return;
1462
1463         }
1464
1465         if (vp->v_usecount == 1) {
1466
1467                 vp->v_usecount--;
1468                 if (VSHOULDFREE(vp))
1469                         vfree(vp);
1470         /*
1471          * If we are doing a vput, the node is already locked, and we must
1472          * call VOP_INACTIVE with the node locked.  So, in the case of
1473          * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1474          */
1475                 simple_unlock(&vp->v_interlock);
1476                 VOP_INACTIVE(vp, p);
1477
1478         } else {
1479 #ifdef DIAGNOSTIC
1480                 vprint("vput: negative ref count", vp);
1481 #endif
1482                 panic("vput: negative ref cnt");
1483         }
1484 }
1485
1486 /*
1487  * Somebody doesn't want the vnode recycled.
1488  */
1489 void
1490 vhold(vp)
1491         register struct vnode *vp;
1492 {
1493         int s;
1494
1495         s = splbio();
1496         vp->v_holdcnt++;
1497         if (VSHOULDBUSY(vp))
1498                 vbusy(vp);
1499         splx(s);
1500 }
1501
1502 /*
1503  * One less who cares about this vnode.
1504  */
1505 void
1506 vdrop(vp)
1507         register struct vnode *vp;
1508 {
1509         int s;
1510
1511         s = splbio();
1512         if (vp->v_holdcnt <= 0)
1513                 panic("vdrop: holdcnt");
1514         vp->v_holdcnt--;
1515         if (VSHOULDFREE(vp))
1516                 vfree(vp);
1517         splx(s);
1518 }
1519
1520 /*
1521  * Remove any vnodes in the vnode table belonging to mount point mp.
1522  *
1523  * If MNT_NOFORCE is specified, there should not be any active ones,
1524  * return error if any are found (nb: this is a user error, not a
1525  * system error). If MNT_FORCE is specified, detach any active vnodes
1526  * that are found.
1527  */
1528 #ifdef DIAGNOSTIC
1529 static int busyprt = 0;         /* print out busy vnodes */
1530 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1531 #endif
1532
1533 int
1534 vflush(mp, skipvp, flags)
1535         struct mount *mp;
1536         struct vnode *skipvp;
1537         int flags;
1538 {
1539         struct proc *p = curproc;       /* XXX */
1540         struct vnode *vp, *nvp;
1541         int busy = 0;
1542
1543         simple_lock(&mntvnode_slock);
1544 loop:
1545         for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
1546                 /*
1547                  * Make sure this vnode wasn't reclaimed in getnewvnode().
1548                  * Start over if it has (it won't be on the list anymore).
1549                  */
1550                 if (vp->v_mount != mp)
1551                         goto loop;
1552                 nvp = LIST_NEXT(vp, v_mntvnodes);
1553                 /*
1554                  * Skip over a selected vnode.
1555                  */
1556                 if (vp == skipvp)
1557                         continue;
1558
1559                 simple_lock(&vp->v_interlock);
1560                 /*
1561                  * Skip over a vnodes marked VSYSTEM.
1562                  */
1563                 if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1564                         simple_unlock(&vp->v_interlock);
1565                         continue;
1566                 }
1567                 /*
1568                  * If WRITECLOSE is set, only flush out regular file vnodes
1569                  * open for writing.
1570                  */
1571                 if ((flags & WRITECLOSE) &&
1572                     (vp->v_writecount == 0 || vp->v_type != VREG)) {
1573                         simple_unlock(&vp->v_interlock);
1574                         continue;
1575                 }
1576
1577                 /*
1578                  * With v_usecount == 0, all we need to do is clear out the
1579                  * vnode data structures and we are done.
1580                  */
1581                 if (vp->v_usecount == 0) {
1582                         simple_unlock(&mntvnode_slock);
1583                         vgonel(vp, p);
1584                         simple_lock(&mntvnode_slock);
1585                         continue;
1586                 }
1587
1588                 /*
1589                  * If FORCECLOSE is set, forcibly close the vnode. For block
1590                  * or character devices, revert to an anonymous device. For
1591                  * all other files, just kill them.
1592                  */
1593                 if (flags & FORCECLOSE) {
1594                         simple_unlock(&mntvnode_slock);
1595                         if (vp->v_type != VBLK && vp->v_type != VCHR) {
1596                                 vgonel(vp, p);
1597                         } else {
1598                                 vclean(vp, 0, p);
1599                                 vp->v_op = spec_vnodeop_p;
1600                                 insmntque(vp, (struct mount *) 0);
1601                         }
1602                         simple_lock(&mntvnode_slock);
1603                         continue;
1604                 }
1605 #ifdef DIAGNOSTIC
1606                 if (busyprt)
1607                         vprint("vflush: busy vnode", vp);
1608 #endif
1609                 simple_unlock(&vp->v_interlock);
1610                 busy++;
1611         }
1612         simple_unlock(&mntvnode_slock);
1613         if (busy)
1614                 return (EBUSY);
1615         return (0);
1616 }
1617
1618 /*
1619  * Disassociate the underlying file system from a vnode.
1620  */
1621 static void
1622 vclean(vp, flags, p)
1623         struct vnode *vp;
1624         int flags;
1625         struct proc *p;
1626 {
1627         int active;
1628         vm_object_t obj;
1629
1630         /*
1631          * Check to see if the vnode is in use. If so we have to reference it
1632          * before we clean it out so that its count cannot fall to zero and
1633          * generate a race against ourselves to recycle it.
1634          */
1635         if ((active = vp->v_usecount))
1636                 vp->v_usecount++;
1637
1638         /*
1639          * Prevent the vnode from being recycled or brought into use while we
1640          * clean it out.
1641          */
1642         if (vp->v_flag & VXLOCK)
1643                 panic("vclean: deadlock");
1644         vp->v_flag |= VXLOCK;
1645         /*
1646          * Even if the count is zero, the VOP_INACTIVE routine may still
1647          * have the object locked while it cleans it out. The VOP_LOCK
1648          * ensures that the VOP_INACTIVE routine is done with its work.
1649          * For active vnodes, it ensures that no other activity can
1650          * occur while the underlying object is being cleaned out.
1651          */
1652         VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1653
1654         /*
1655          * Clean out any buffers associated with the vnode.
1656          */
1657         vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
1658         if ((obj = vp->v_object) != NULL) {
1659                 if (obj->ref_count == 0) {
1660                         /*
1661                          * vclean() may be called twice.  The first time removes the
1662                          * primary reference to the object, the second time goes
1663                          * one further and is a special-case to terminate the object.
1664                          */
1665                         vm_object_terminate(obj);
1666                 } else {
1667                         /*
1668                          * Woe to the process that tries to page now :-).
1669                          */
1670                         vm_pager_deallocate(obj);
1671                 }
1672         }
1673
1674         /*
1675          * If purging an active vnode, it must be closed and
1676          * deactivated before being reclaimed. Note that the
1677          * VOP_INACTIVE will unlock the vnode.
1678          */
1679         if (active) {
1680                 if (flags & DOCLOSE)
1681                         VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
1682                 VOP_INACTIVE(vp, p);
1683         } else {
1684                 /*
1685                  * Any other processes trying to obtain this lock must first
1686                  * wait for VXLOCK to clear, then call the new lock operation.
1687                  */
1688                 VOP_UNLOCK(vp, 0, p);
1689         }
1690         /*
1691          * Reclaim the vnode.
1692          */
1693         if (VOP_RECLAIM(vp, p))
1694                 panic("vclean: cannot reclaim");
1695
1696         if (active)
1697                 vrele(vp);
1698
1699         cache_purge(vp);
1700         if (vp->v_vnlock) {
1701                 FREE(vp->v_vnlock, M_VNODE);
1702                 vp->v_vnlock = NULL;
1703         }
1704
1705         if (VSHOULDFREE(vp))
1706                 vfree(vp);
1707
1708         /*
1709          * Done with purge, notify sleepers of the grim news.
1710          */
1711         vp->v_op = dead_vnodeop_p;
1712         vn_pollgone(vp);
1713         vp->v_tag = VT_NON;
1714         vp->v_flag &= ~VXLOCK;
1715         if (vp->v_flag & VXWANT) {
1716                 vp->v_flag &= ~VXWANT;
1717                 wakeup((caddr_t) vp);
1718         }
1719 }
1720
1721 /*
1722  * Eliminate all activity associated with the requested vnode
1723  * and with all vnodes aliased to the requested vnode.
1724  */
1725 int
1726 vop_revoke(ap)
1727         struct vop_revoke_args /* {
1728                 struct vnode *a_vp;
1729                 int a_flags;
1730         } */ *ap;
1731 {
1732         struct vnode *vp, *vq;
1733         dev_t dev;
1734
1735         KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
1736
1737         vp = ap->a_vp;
1738         /*
1739          * If a vgone (or vclean) is already in progress,
1740          * wait until it is done and return.
1741          */
1742         if (vp->v_flag & VXLOCK) {
1743                 vp->v_flag |= VXWANT;
1744                 simple_unlock(&vp->v_interlock);
1745                 tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1746                 return (0);
1747         }
1748         dev = vp->v_rdev;
1749         for (;;) {
1750                 simple_lock(&spechash_slock);
1751                 vq = SLIST_FIRST(&dev->si_hlist);
1752                 simple_unlock(&spechash_slock);
1753                 if (!vq)
1754                         break;
1755                 vgone(vq);
1756         }
1757         return (0);
1758 }
1759
1760 /*
1761  * Recycle an unused vnode to the front of the free list.
1762  * Release the passed interlock if the vnode will be recycled.
1763  */
1764 int
1765 vrecycle(vp, inter_lkp, p)
1766         struct vnode *vp;
1767         struct simplelock *inter_lkp;
1768         struct proc *p;
1769 {
1770
1771         simple_lock(&vp->v_interlock);
1772         if (vp->v_usecount == 0) {
1773                 if (inter_lkp) {
1774                         simple_unlock(inter_lkp);
1775                 }
1776                 vgonel(vp, p);
1777                 return (1);
1778         }
1779         simple_unlock(&vp->v_interlock);
1780         return (0);
1781 }
1782
1783 /*
1784  * Eliminate all activity associated with a vnode
1785  * in preparation for reuse.
1786  */
1787 void
1788 vgone(vp)
1789         register struct vnode *vp;
1790 {
1791         struct proc *p = curproc;       /* XXX */
1792
1793         simple_lock(&vp->v_interlock);
1794         vgonel(vp, p);
1795 }
1796
1797 /*
1798  * vgone, with the vp interlock held.
1799  */
1800 static void
1801 vgonel(vp, p)
1802         struct vnode *vp;
1803         struct proc *p;
1804 {
1805         int s;
1806
1807         /*
1808          * If a vgone (or vclean) is already in progress,
1809          * wait until it is done and return.
1810          */
1811         if (vp->v_flag & VXLOCK) {
1812                 vp->v_flag |= VXWANT;
1813                 simple_unlock(&vp->v_interlock);
1814                 tsleep((caddr_t)vp, PINOD, "vgone", 0);
1815                 return;
1816         }
1817
1818         /*
1819          * Clean out the filesystem specific data.
1820          */
1821         vclean(vp, DOCLOSE, p);
1822         simple_lock(&vp->v_interlock);
1823
1824         /*
1825          * Delete from old mount point vnode list, if on one.
1826          */
1827         if (vp->v_mount != NULL)
1828                 insmntque(vp, (struct mount *)0);
1829         /*
1830          * If special device, remove it from special device alias list
1831          * if it is on one.
1832          */
1833         if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
1834                 simple_lock(&spechash_slock);
1835                 SLIST_REMOVE(&vp->v_hashchain, vp, vnode, v_specnext);
1836                 freedev(vp->v_rdev);
1837                 simple_unlock(&spechash_slock);
1838                 vp->v_rdev = NULL;
1839         }
1840
1841         /*
1842          * If it is on the freelist and not already at the head,
1843          * move it to the head of the list. The test of the back
1844          * pointer and the reference count of zero is because
1845          * it will be removed from the free list by getnewvnode,
1846          * but will not have its reference count incremented until
1847          * after calling vgone. If the reference count were
1848          * incremented first, vgone would (incorrectly) try to
1849          * close the previous instance of the underlying object.
1850          */
1851         if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
1852                 s = splbio();
1853                 simple_lock(&vnode_free_list_slock);
1854                 if (vp->v_flag & VFREE) {
1855                         TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1856                 } else if (vp->v_flag & VTBFREE) {
1857                         TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
1858                         vp->v_flag &= ~VTBFREE;
1859                         freevnodes++;
1860                 } else
1861                         freevnodes++;
1862                 vp->v_flag |= VFREE;
1863                 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1864                 simple_unlock(&vnode_free_list_slock);
1865                 splx(s);
1866         }
1867
1868         vp->v_type = VBAD;
1869         simple_unlock(&vp->v_interlock);
1870 }
1871
1872 /*
1873  * Lookup a vnode by device number.
1874  */
1875 int
1876 vfinddev(dev, type, vpp)
1877         dev_t dev;
1878         enum vtype type;
1879         struct vnode **vpp;
1880 {
1881         struct vnode *vp;
1882
1883         simple_lock(&spechash_slock);
1884         SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
1885                 if (type == vp->v_type) {
1886                         *vpp = vp;
1887                         simple_unlock(&spechash_slock);
1888                         return (1);
1889                 }
1890         }
1891         simple_unlock(&spechash_slock);
1892         return (0);
1893 }
1894
1895 /*
1896  * Calculate the total number of references to a special device.
1897  */
1898 int
1899 vcount(vp)
1900         struct vnode *vp;
1901 {
1902         struct vnode *vq;
1903         int count;
1904
1905         count = 0;
1906         simple_lock(&spechash_slock);
1907         SLIST_FOREACH(vq, &vp->v_hashchain, v_specnext)
1908                 count += vq->v_usecount;
1909         simple_unlock(&spechash_slock);
1910         return (count);
1911 }
1912
1913 /*
1914  * Print out a description of a vnode.
1915  */
1916 static char *typename[] =
1917 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
1918
1919 void
1920 vprint(label, vp)
1921         char *label;
1922         struct vnode *vp;
1923 {
1924         char buf[96];
1925
1926         if (label != NULL)
1927                 printf("%s: %p: ", label, (void *)vp);
1928         else
1929                 printf("%p: ", (void *)vp);
1930         printf("type %s, usecount %d, writecount %d, refcount %d,",
1931             typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1932             vp->v_holdcnt);
1933         buf[0] = '\0';
1934         if (vp->v_flag & VROOT)
1935                 strcat(buf, "|VROOT");
1936         if (vp->v_flag & VTEXT)
1937                 strcat(buf, "|VTEXT");
1938         if (vp->v_flag & VSYSTEM)
1939                 strcat(buf, "|VSYSTEM");
1940         if (vp->v_flag & VXLOCK)
1941                 strcat(buf, "|VXLOCK");
1942         if (vp->v_flag & VXWANT)
1943                 strcat(buf, "|VXWANT");
1944         if (vp->v_flag & VBWAIT)
1945                 strcat(buf, "|VBWAIT");
1946         if (vp->v_flag & VDOOMED)
1947                 strcat(buf, "|VDOOMED");
1948         if (vp->v_flag & VFREE)
1949                 strcat(buf, "|VFREE");
1950         if (vp->v_flag & VOBJBUF)
1951                 strcat(buf, "|VOBJBUF");
1952         if (buf[0] != '\0')
1953                 printf(" flags (%s)", &buf[1]);
1954         if (vp->v_data == NULL) {
1955                 printf("\n");
1956         } else {
1957                 printf("\n\t");
1958                 VOP_PRINT(vp);
1959         }
1960 }
1961
#ifdef DDB
#include <ddb/ddb.h>
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 *
 * Mount points that cannot be busied without waiting are skipped.
 */
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
{
        struct proc *p = curproc;       /* XXX */
        struct mount *mp, *nmp;
        struct vnode *vp;

        printf("Locked vnodes\n");
        simple_lock(&mountlist_slock);
        mp = TAILQ_FIRST(&mountlist);
        while (mp != NULL) {
                if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p) == 0) {
                        LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
                                if (VOP_ISLOCKED(vp))
                                        vprint((char *)0, vp);
                        }
                        /*
                         * Retake the list lock before reading the
                         * successor and releasing this mount.
                         */
                        simple_lock(&mountlist_slock);
                        nmp = TAILQ_NEXT(mp, mnt_list);
                        vfs_unbusy(mp, p);
                } else {
                        /* Could not busy it: move on to the next one. */
                        nmp = TAILQ_NEXT(mp, mnt_list);
                }
                mp = nmp;
        }
        simple_unlock(&mountlist_slock);
}
#endif
1992
1993 /*
1994  * Top level filesystem related information gathering.
1995  */
1996 static int      sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);
1997
1998 static int
1999 vfs_sysctl SYSCTL_HANDLER_ARGS
2000 {
2001         int *name = (int *)arg1 - 1;    /* XXX */
2002         u_int namelen = arg2 + 1;       /* XXX */
2003         struct vfsconf *vfsp;
2004
2005 #if 1 || defined(COMPAT_PRELITE2)
2006         /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2007         if (namelen == 1)
2008                 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2009 #endif
2010
2011 #ifdef notyet
2012         /* all sysctl names at this level are at least name and field */
2013         if (namelen < 2)
2014                 return (ENOTDIR);               /* overloaded */
2015         if (name[0] != VFS_GENERIC) {
2016                 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2017                         if (vfsp->vfc_typenum == name[0])
2018                                 break;
2019                 if (vfsp == NULL)
2020                         return (EOPNOTSUPP);
2021                 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2022                     oldp, oldlenp, newp, newlen, p));
2023         }
2024 #endif
2025         switch (name[1]) {
2026         case VFS_MAXTYPENUM:
2027                 if (namelen != 2)
2028                         return (ENOTDIR);
2029                 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2030         case VFS_CONF:
2031                 if (namelen != 3)
2032                         return (ENOTDIR);       /* overloaded */
2033                 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2034                         if (vfsp->vfc_typenum == name[2])
2035                                 break;
2036                 if (vfsp == NULL)
2037                         return (EOPNOTSUPP);
2038                 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
2039         }
2040         return (EOPNOTSUPP);
2041 }
2042
2043 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
2044         "Generic filesystem");
2045
2046 #if 1 || defined(COMPAT_PRELITE2)
2047
2048 static int
2049 sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
2050 {
2051         int error;
2052         struct vfsconf *vfsp;
2053         struct ovfsconf ovfs;
2054
2055         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2056                 ovfs.vfc_vfsops = vfsp->vfc_vfsops;     /* XXX used as flag */
2057                 strcpy(ovfs.vfc_name, vfsp->vfc_name);
2058                 ovfs.vfc_index = vfsp->vfc_typenum;
2059                 ovfs.vfc_refcount = vfsp->vfc_refcount;
2060                 ovfs.vfc_flags = vfsp->vfc_flags;
2061                 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2062                 if (error)
2063                         return error;
2064         }
2065         return 0;
2066 }
2067
2068 #endif /* 1 || COMPAT_PRELITE2 */
2069
#if 0
#define KINFO_VNODESLOP 10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 *
 * Without an output buffer only a size estimate is reported.  Mount
 * points that cannot be busied without waiting are skipped.  Returns 0
 * or the first error from SYSCTL_OUT().
 */
/* ARGSUSED */
static int
sysctl_vnode SYSCTL_HANDLER_ARGS
{
        struct proc *p = curproc;       /* XXX */
        struct mount *mp, *nmp;
        struct vnode *nvp, *vp;
        int error;

#define VPTRSZ  sizeof (struct vnode *)
#define VNODESZ sizeof (struct vnode)

        req->lock = 0;
        if (!req->oldptr) /* Make an estimate */
                return (SYSCTL_OUT(req, 0,
                        (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));

        simple_lock(&mountlist_slock);
        for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
                if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
                        nmp = TAILQ_NEXT(mp, mnt_list);
                        continue;
                }
again:
                simple_lock(&mntvnode_slock);
                for (vp = LIST_FIRST(&mp->mnt_vnodelist);
                     vp != NULL;
                     vp = nvp) {
                        /*
                         * Check that the vp is still associated with
                         * this filesystem.  RACE: could have been
                         * recycled onto the same filesystem.
                         */
                        if (vp->v_mount != mp) {
                                simple_unlock(&mntvnode_slock);
                                goto again;
                        }
                        nvp = LIST_NEXT(vp, v_mntvnodes);
                        simple_unlock(&mntvnode_slock);
                        if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
                            (error = SYSCTL_OUT(req, vp, VNODESZ))) {
                                /*
                                 * Do not leave the mount point busied
                                 * when bailing out in mid-walk.
                                 */
                                vfs_unbusy(mp, p);
                                return (error);
                        }
                        simple_lock(&mntvnode_slock);
                }
                simple_unlock(&mntvnode_slock);
                simple_lock(&mountlist_slock);
                nmp = TAILQ_NEXT(mp, mnt_list);
                vfs_unbusy(mp, p);
        }
        simple_unlock(&mountlist_slock);

        return (0);
}
#endif
2130
2131 /*
2132  * XXX
2133  * Exporting the vnode list on large systems causes them to crash.
2134  * Exporting the vnode list on medium systems causes sysctl to coredump.
2135  */
2136 #if 0
2137 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2138         0, 0, sysctl_vnode, "S,vnode", "");
2139 #endif
2140
2141 /*
2142  * Check to see if a filesystem is mounted on a block device.
2143  */
2144 int
2145 vfs_mountedon(vp)
2146         struct vnode *vp;
2147 {
2148
2149         if (vp->v_specmountpoint != NULL)
2150                 return (EBUSY);
2151         return (0);
2152 }
2153
2154 /*
2155  * Unmount all filesystems. The list is traversed in reverse order
2156  * of mounting to avoid dependencies.
2157  */
2158 void
2159 vfs_unmountall()
2160 {
2161         struct mount *mp;
2162         struct proc *p;
2163         int error;
2164
2165         if (curproc != NULL)
2166                 p = curproc;
2167         else
2168                 p = initproc;   /* XXX XXX should this be proc0? */
2169         /*
2170          * Since this only runs when rebooting, it is not interlocked.
2171          */
2172         while(!TAILQ_EMPTY(&mountlist)) {
2173                 mp = TAILQ_LAST(&mountlist, mntlist);
2174                 error = dounmount(mp, MNT_FORCE, p);
2175                 if (error) {
2176                         TAILQ_REMOVE(&mountlist, mp, mnt_list);
2177                         printf("unmount of %s failed (",
2178                             mp->mnt_stat.f_mntonname);
2179                         if (error == EBUSY)
2180                                 printf("BUSY)\n");
2181                         else
2182                                 printf("%d)\n", error);
2183                 } else {
2184                         /* The unmount has removed mp from the mountlist */
2185                 }
2186         }
2187 }
2188
2189 /*
2190  * Build hash lists of net addresses and hang them off the mount point.
2191  * Called by ufs_mount() to set up the lists of export addresses.
2192  */
2193 static int
2194 vfs_hang_addrlist(mp, nep, argp)
2195         struct mount *mp;
2196         struct netexport *nep;
2197         struct export_args *argp;
2198 {
2199         register struct netcred *np;
2200         register struct radix_node_head *rnh;
2201         register int i;
2202         struct radix_node *rn;
2203         struct sockaddr *saddr, *smask = 0;
2204         struct domain *dom;
2205         int error;
2206
2207         if (argp->ex_addrlen == 0) {
2208                 if (mp->mnt_flag & MNT_DEFEXPORTED)
2209                         return (EPERM);
2210                 np = &nep->ne_defexported;
2211                 np->netc_exflags = argp->ex_flags;
2212                 np->netc_anon = argp->ex_anon;
2213                 np->netc_anon.cr_ref = 1;
2214                 mp->mnt_flag |= MNT_DEFEXPORTED;
2215                 return (0);
2216         }
2217         i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2218         np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
2219         bzero((caddr_t) np, i);
2220         saddr = (struct sockaddr *) (np + 1);
2221         if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
2222                 goto out;
2223         if (saddr->sa_len > argp->ex_addrlen)
2224                 saddr->sa_len = argp->ex_addrlen;
2225         if (argp->ex_masklen) {
2226                 smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
2227                 error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
2228                 if (error)
2229                         goto out;
2230                 if (smask->sa_len > argp->ex_masklen)
2231                         smask->sa_len = argp->ex_masklen;
2232         }
2233         i = saddr->sa_family;
2234         if ((rnh = nep->ne_rtable[i]) == 0) {
2235                 /*
2236                  * Seems silly to initialize every AF when most are not used,
2237                  * do so on demand here
2238                  */
2239                 for (dom = domains; dom; dom = dom->dom_next)
2240                         if (dom->dom_family == i && dom->dom_rtattach) {
2241                                 dom->dom_rtattach((void **) &nep->ne_rtable[i],
2242                                     dom->dom_rtoffset);
2243                                 break;
2244                         }
2245                 if ((rnh = nep->ne_rtable[i]) == 0) {
2246                         error = ENOBUFS;
2247                         goto out;
2248                 }
2249         }
2250         rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
2251             np->netc_rnodes);
2252         if (rn == 0 || np != (struct netcred *) rn) {   /* already exists */
2253                 error = EPERM;
2254                 goto out;
2255         }
2256         np->netc_exflags = argp->ex_flags;
2257         np->netc_anon = argp->ex_anon;
2258         np->netc_anon.cr_ref = 1;
2259         return (0);
2260 out:
2261         free(np, M_NETADDR);
2262         return (error);
2263 }
2264
2265 /* ARGSUSED */
2266 static int
2267 vfs_free_netcred(rn, w)
2268         struct radix_node *rn;
2269         void *w;
2270 {
2271         register struct radix_node_head *rnh = (struct radix_node_head *) w;
2272
2273         (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
2274         free((caddr_t) rn, M_NETADDR);
2275         return (0);
2276 }
2277
2278 /*
2279  * Free the net address hash lists that are hanging off the mount points.
2280  */
2281 static void
2282 vfs_free_addrlist(nep)
2283         struct netexport *nep;
2284 {
2285         register int i;
2286         register struct radix_node_head *rnh;
2287
2288         for (i = 0; i <= AF_MAX; i++)
2289                 if ((rnh = nep->ne_rtable[i])) {
2290                         (*rnh->rnh_walktree) (rnh, vfs_free_netcred,
2291                             (caddr_t) rnh);
2292                         free((caddr_t) rnh, M_RTABLE);
2293                         nep->ne_rtable[i] = 0;
2294                 }
2295 }
2296
2297 int
2298 vfs_export(mp, nep, argp)
2299         struct mount *mp;
2300         struct netexport *nep;
2301         struct export_args *argp;
2302 {
2303         int error;
2304
2305         if (argp->ex_flags & MNT_DELEXPORT) {
2306                 if (mp->mnt_flag & MNT_EXPUBLIC) {
2307                         vfs_setpublicfs(NULL, NULL, NULL);
2308                         mp->mnt_flag &= ~MNT_EXPUBLIC;
2309                 }
2310                 vfs_free_addrlist(nep);
2311                 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2312         }
2313         if (argp->ex_flags & MNT_EXPORTED) {
2314                 if (argp->ex_flags & MNT_EXPUBLIC) {
2315                         if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2316                                 return (error);
2317                         mp->mnt_flag |= MNT_EXPUBLIC;
2318                 }
2319                 if ((error = vfs_hang_addrlist(mp, nep, argp)))
2320                         return (error);
2321                 mp->mnt_flag |= MNT_EXPORTED;
2322         }
2323         return (0);
2324 }
2325
2326
2327 /*
2328  * Set the publicly exported filesystem (WebNFS). Currently, only
2329  * one public filesystem is possible in the spec (RFC 2054 and 2055)
2330  */
2331 int
2332 vfs_setpublicfs(mp, nep, argp)
2333         struct mount *mp;
2334         struct netexport *nep;
2335         struct export_args *argp;
2336 {
2337         int error;
2338         struct vnode *rvp;
2339         char *cp;
2340
2341         /*
2342          * mp == NULL -> invalidate the current info, the FS is
2343          * no longer exported. May be called from either vfs_export
2344          * or unmount, so check if it hasn't already been done.
2345          */
2346         if (mp == NULL) {
2347                 if (nfs_pub.np_valid) {
2348                         nfs_pub.np_valid = 0;
2349                         if (nfs_pub.np_index != NULL) {
2350                                 FREE(nfs_pub.np_index, M_TEMP);
2351                                 nfs_pub.np_index = NULL;
2352                         }
2353                 }
2354                 return (0);
2355         }
2356
2357         /*
2358          * Only one allowed at a time.
2359          */
2360         if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2361                 return (EBUSY);
2362
2363         /*
2364          * Get real filehandle for root of exported FS.
2365          */
2366         bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
2367         nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2368
2369         if ((error = VFS_ROOT(mp, &rvp)))
2370                 return (error);
2371
2372         if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
2373                 return (error);
2374
2375         vput(rvp);
2376
2377         /*
2378          * If an indexfile was specified, pull it in.
2379          */
2380         if (argp->ex_indexfile != NULL) {
2381                 MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2382                     M_WAITOK);
2383                 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2384                     MAXNAMLEN, (size_t *)0);
2385                 if (!error) {
2386                         /*
2387                          * Check for illegal filenames.
2388                          */
2389                         for (cp = nfs_pub.np_index; *cp; cp++) {
2390                                 if (*cp == '/') {
2391                                         error = EINVAL;
2392                                         break;
2393                                 }
2394                         }
2395                 }
2396                 if (error) {
2397                         FREE(nfs_pub.np_index, M_TEMP);
2398                         return (error);
2399                 }
2400         }
2401
2402         nfs_pub.np_mount = mp;
2403         nfs_pub.np_valid = 1;
2404         return (0);
2405 }
2406
2407 struct netcred *
2408 vfs_export_lookup(mp, nep, nam)
2409         register struct mount *mp;
2410         struct netexport *nep;
2411         struct sockaddr *nam;
2412 {
2413         register struct netcred *np;
2414         register struct radix_node_head *rnh;
2415         struct sockaddr *saddr;
2416
2417         np = NULL;
2418         if (mp->mnt_flag & MNT_EXPORTED) {
2419                 /*
2420                  * Lookup in the export list first.
2421                  */
2422                 if (nam != NULL) {
2423                         saddr = nam;
2424                         rnh = nep->ne_rtable[saddr->sa_family];
2425                         if (rnh != NULL) {
2426                                 np = (struct netcred *)
2427                                         (*rnh->rnh_matchaddr)((caddr_t)saddr,
2428                                                               rnh);
2429                                 if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2430                                         np = NULL;
2431                         }
2432                 }
2433                 /*
2434                  * If no address match, use the default if it exists.
2435                  */
2436                 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2437                         np = &nep->ne_defexported;
2438         }
2439         return (np);
2440 }
2441
2442 /*
2443  * perform msync on all vnodes under a mount point
2444  * the mount point must be locked.
2445  */
2446 void
2447 vfs_msync(struct mount *mp, int flags) {
2448         struct vnode *vp, *nvp;
2449         struct vm_object *obj;
2450         int anyio, tries;
2451
2452         tries = 5;
2453 loop:
2454         anyio = 0;
2455         for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) {
2456
2457                 nvp = LIST_NEXT(vp, v_mntvnodes);
2458
2459                 if (vp->v_mount != mp) {
2460                         goto loop;
2461                 }
2462
2463                 if (vp->v_flag & VXLOCK)        /* XXX: what if MNT_WAIT? */
2464                         continue;
2465
2466                 if (flags != MNT_WAIT) {
2467                         obj = vp->v_object;
2468                         if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
2469                                 continue;
2470                         if (VOP_ISLOCKED(vp))
2471                                 continue;
2472                 }
2473
2474                 simple_lock(&vp->v_interlock);
2475                 if (vp->v_object &&
2476                    (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
2477                         if (!vget(vp,
2478                                 LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
2479                                 if (vp->v_object) {
2480                                         vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0);
2481                                         anyio = 1;
2482                                 }
2483                                 vput(vp);
2484                         }
2485                 } else {
2486                         simple_unlock(&vp->v_interlock);
2487                 }
2488         }
2489         if (anyio && (--tries > 0))
2490                 goto loop;
2491 }
2492
2493 /*
2494  * Create the VM object needed for VMIO and mmap support.  This
2495  * is done for all VREG files in the system.  Some filesystems might
2496  * afford the additional metadata buffering capability of the
2497  * VMIO code by making the device node be VMIO mode also.
2498  *
2499  * vp must be locked when vfs_object_create is called.
2500  */
2501 int
2502 vfs_object_create(vp, p, cred)
2503         struct vnode *vp;
2504         struct proc *p;
2505         struct ucred *cred;
2506 {
2507         struct vattr vat;
2508         vm_object_t object;
2509         int error = 0;
2510
2511         if (!vn_isdisk(vp) && vn_canvmio(vp) == FALSE)
2512                 return 0;
2513
2514 retry:
2515         if ((object = vp->v_object) == NULL) {
2516                 if (vp->v_type == VREG || vp->v_type == VDIR) {
2517                         if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
2518                                 goto retn;
2519                         object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
2520                 } else if (devsw(vp->v_rdev) != NULL) {
2521                         /*
2522                          * This simply allocates the biggest object possible
2523                          * for a disk vnode.  This should be fixed, but doesn't
2524                          * cause any problems (yet).
2525                          */
2526                         object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
2527                 } else {
2528                         goto retn;
2529                 }
2530                 /*
2531                  * Dereference the reference we just created.  This assumes
2532                  * that the object is associated with the vp.
2533                  */
2534                 object->ref_count--;
2535                 vp->v_usecount--;
2536         } else {
2537                 if (object->flags & OBJ_DEAD) {
2538                         VOP_UNLOCK(vp, 0, p);
2539                         tsleep(object, PVM, "vodead", 0);
2540                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
2541                         goto retry;
2542                 }
2543         }
2544
2545         KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
2546         vp->v_flag |= VOBJBUF;
2547
2548 retn:
2549         return error;
2550 }
2551
2552 static void
2553 vfree(vp)
2554         struct vnode *vp;
2555 {
2556         int s;
2557
2558         s = splbio();
2559         simple_lock(&vnode_free_list_slock);
2560         if (vp->v_flag & VTBFREE) {
2561                 TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2562                 vp->v_flag &= ~VTBFREE;
2563         }
2564         if (vp->v_flag & VAGE) {
2565                 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2566         } else {
2567                 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2568         }
2569         freevnodes++;
2570         simple_unlock(&vnode_free_list_slock);
2571         vp->v_flag &= ~VAGE;
2572         vp->v_flag |= VFREE;
2573         splx(s);
2574 }
2575
2576 void
2577 vbusy(vp)
2578         struct vnode *vp;
2579 {
2580         int s;
2581
2582         s = splbio();
2583         simple_lock(&vnode_free_list_slock);
2584         if (vp->v_flag & VTBFREE) {
2585                 TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2586                 vp->v_flag &= ~VTBFREE;
2587         } else {
2588                 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2589                 freevnodes--;
2590         }
2591         simple_unlock(&vnode_free_list_slock);
2592         vp->v_flag &= ~(VFREE|VAGE);
2593         splx(s);
2594 }
2595
2596 /*
2597  * Record a process's interest in events which might happen to
2598  * a vnode.  Because poll uses the historic select-style interface
2599  * internally, this routine serves as both the ``check for any
2600  * pending events'' and the ``record my interest in future events''
2601  * functions.  (These are done together, while the lock is held,
2602  * to avoid race conditions.)
2603  */
2604 int
2605 vn_pollrecord(vp, p, events)
2606         struct vnode *vp;
2607         struct proc *p;
2608         short events;
2609 {
2610         simple_lock(&vp->v_pollinfo.vpi_lock);
2611         if (vp->v_pollinfo.vpi_revents & events) {
2612                 /*
2613                  * This leaves events we are not interested
2614                  * in available for the other process which
2615                  * which presumably had requested them
2616                  * (otherwise they would never have been
2617                  * recorded).
2618                  */
2619                 events &= vp->v_pollinfo.vpi_revents;
2620                 vp->v_pollinfo.vpi_revents &= ~events;
2621
2622                 simple_unlock(&vp->v_pollinfo.vpi_lock);
2623                 return events;
2624         }
2625         vp->v_pollinfo.vpi_events |= events;
2626         selrecord(p, &vp->v_pollinfo.vpi_selinfo);
2627         simple_unlock(&vp->v_pollinfo.vpi_lock);
2628         return 0;
2629 }
2630
2631 /*
2632  * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
2633  * it is possible for us to miss an event due to race conditions, but
2634  * that condition is expected to be rare, so for the moment it is the
2635  * preferred interface.
2636  */
2637 void
2638 vn_pollevent(vp, events)
2639         struct vnode *vp;
2640         short events;
2641 {
2642         simple_lock(&vp->v_pollinfo.vpi_lock);
2643         if (vp->v_pollinfo.vpi_events & events) {
2644                 /*
2645                  * We clear vpi_events so that we don't
2646                  * call selwakeup() twice if two events are
2647                  * posted before the polling process(es) is
2648                  * awakened.  This also ensures that we take at
2649                  * most one selwakeup() if the polling process
2650                  * is no longer interested.  However, it does
2651                  * mean that only one event can be noticed at
2652                  * a time.  (Perhaps we should only clear those
2653                  * event bits which we note?) XXX
2654                  */
2655                 vp->v_pollinfo.vpi_events = 0;  /* &= ~events ??? */
2656                 vp->v_pollinfo.vpi_revents |= events;
2657                 selwakeup(&vp->v_pollinfo.vpi_selinfo);
2658         }
2659         simple_unlock(&vp->v_pollinfo.vpi_lock);
2660 }
2661
2662 /*
2663  * Wake up anyone polling on vp because it is being revoked.
2664  * This depends on dead_poll() returning POLLHUP for correct
2665  * behavior.
2666  */
2667 void
2668 vn_pollgone(vp)
2669         struct vnode *vp;
2670 {
2671         simple_lock(&vp->v_pollinfo.vpi_lock);
2672         if (vp->v_pollinfo.vpi_events) {
2673                 vp->v_pollinfo.vpi_events = 0;
2674                 selwakeup(&vp->v_pollinfo.vpi_selinfo);
2675         }
2676         simple_unlock(&vp->v_pollinfo.vpi_lock);
2677 }
2678
2679
2680
2681 /*
2682  * Routine to create and manage a filesystem syncer vnode.
2683  */
2684 #define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
2685 static int      sync_fsync __P((struct  vop_fsync_args *));
2686 static int      sync_inactive __P((struct  vop_inactive_args *));
2687 static int      sync_reclaim  __P((struct  vop_reclaim_args *));
2688 #define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
2689 #define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
2690 static int      sync_print __P((struct vop_print_args *));
2691 #define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
2692
2693 static vop_t **sync_vnodeop_p;
2694 static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
2695         { &vop_default_desc,    (vop_t *) vop_eopnotsupp },
2696         { &vop_close_desc,      (vop_t *) sync_close },         /* close */
2697         { &vop_fsync_desc,      (vop_t *) sync_fsync },         /* fsync */
2698         { &vop_inactive_desc,   (vop_t *) sync_inactive },      /* inactive */
2699         { &vop_reclaim_desc,    (vop_t *) sync_reclaim },       /* reclaim */
2700         { &vop_lock_desc,       (vop_t *) sync_lock },          /* lock */
2701         { &vop_unlock_desc,     (vop_t *) sync_unlock },        /* unlock */
2702         { &vop_print_desc,      (vop_t *) sync_print },         /* print */
2703         { &vop_islocked_desc,   (vop_t *) sync_islocked },      /* islocked */
2704         { NULL, NULL }
2705 };
2706 static struct vnodeopv_desc sync_vnodeop_opv_desc =
2707         { &sync_vnodeop_p, sync_vnodeop_entries };
2708
2709 VNODEOP_SET(sync_vnodeop_opv_desc);
2710
2711 /*
2712  * Create a new filesystem syncer vnode for the specified mount point.
2713  */
2714 int
2715 vfs_allocate_syncvnode(mp)
2716         struct mount *mp;
2717 {
2718         struct vnode *vp;
2719         static long start, incr, next;
2720         int error;
2721
2722         /* Allocate a new vnode */
2723         if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
2724                 mp->mnt_syncer = NULL;
2725                 return (error);
2726         }
2727         vp->v_type = VNON;
2728         /*
2729          * Place the vnode onto the syncer worklist. We attempt to
2730          * scatter them about on the list so that they will go off
2731          * at evenly distributed times even if all the filesystems
2732          * are mounted at once.
2733          */
2734         next += incr;
2735         if (next == 0 || next > syncer_maxdelay) {
2736                 start /= 2;
2737                 incr /= 2;
2738                 if (start == 0) {
2739                         start = syncer_maxdelay / 2;
2740                         incr = syncer_maxdelay;
2741                 }
2742                 next = start;
2743         }
2744         vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
2745         mp->mnt_syncer = vp;
2746         return (0);
2747 }
2748
2749 /*
2750  * Do a lazy sync of the filesystem.
2751  */
2752 static int
2753 sync_fsync(ap)
2754         struct vop_fsync_args /* {
2755                 struct vnode *a_vp;
2756                 struct ucred *a_cred;
2757                 int a_waitfor;
2758                 struct proc *a_p;
2759         } */ *ap;
2760 {
2761         struct vnode *syncvp = ap->a_vp;
2762         struct mount *mp = syncvp->v_mount;
2763         struct proc *p = ap->a_p;
2764         int asyncflag;
2765
2766         /*
2767          * We only need to do something if this is a lazy evaluation.
2768          */
2769         if (ap->a_waitfor != MNT_LAZY)
2770                 return (0);
2771
2772         /*
2773          * Move ourselves to the back of the sync list.
2774          */
2775         vn_syncer_add_to_worklist(syncvp, syncdelay);
2776
2777         /*
2778          * Walk the list of vnodes pushing all that are dirty and
2779          * not already on the sync list.
2780          */
2781         simple_lock(&mountlist_slock);
2782         if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
2783                 simple_unlock(&mountlist_slock);
2784                 return (0);
2785         }
2786         asyncflag = mp->mnt_flag & MNT_ASYNC;
2787         mp->mnt_flag &= ~MNT_ASYNC;
2788         vfs_msync(mp, MNT_NOWAIT);
2789         VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
2790         if (asyncflag)
2791                 mp->mnt_flag |= MNT_ASYNC;
2792         vfs_unbusy(mp, p);
2793         return (0);
2794 }
2795
2796 /*
 * The syncer vnode is no longer referenced.
2798  */
2799 static int
2800 sync_inactive(ap)
2801         struct vop_inactive_args /* {
2802                 struct vnode *a_vp;
2803                 struct proc *a_p;
2804         } */ *ap;
2805 {
2806
2807         vgone(ap->a_vp);
2808         return (0);
2809 }
2810
2811 /*
2812  * The syncer vnode is no longer needed and is being decommissioned.
2813  *
2814  * Modifications to the worklist must be protected at splbio().
2815  */
2816 static int
2817 sync_reclaim(ap)
2818         struct vop_reclaim_args /* {
2819                 struct vnode *a_vp;
2820         } */ *ap;
2821 {
2822         struct vnode *vp = ap->a_vp;
2823         int s;
2824
2825         s = splbio();
2826         vp->v_mount->mnt_syncer = NULL;
2827         if (vp->v_flag & VONWORKLST) {
2828                 LIST_REMOVE(vp, v_synclist);
2829                 vp->v_flag &= ~VONWORKLST;
2830         }
2831         splx(s);
2832
2833         return (0);
2834 }
2835
2836 /*
2837  * Print out a syncer vnode.
2838  */
2839 static int
2840 sync_print(ap)
2841         struct vop_print_args /* {
2842                 struct vnode *a_vp;
2843         } */ *ap;
2844 {
2845         struct vnode *vp = ap->a_vp;
2846
2847         printf("syncer vnode");
2848         if (vp->v_vnlock != NULL)
2849                 lockmgr_printinfo(vp->v_vnlock);
2850         printf("\n");
2851         return (0);
2852 }
2853
2854 /*
2855  * extract the dev_t from a VBLK or VCHR
2856  */
2857 dev_t
2858 vn_todev(vp)
2859         struct vnode *vp;
2860 {
2861         if (vp->v_type != VBLK && vp->v_type != VCHR)
2862                 return (NODEV);
2863         return (vp->v_rdev);
2864 }
2865
2866 /*
2867  * Check if vnode represents a disk device
2868  */
2869 int
2870 vn_isdisk(vp)
2871         struct vnode *vp;
2872 {
2873         if (vp->v_type != VBLK && vp->v_type != VCHR)
2874                 return (0);
2875         if (!devsw(vp->v_rdev))
2876                 return (0);
2877         if (!(devsw(vp->v_rdev)->d_flags & D_DISK))
2878                 return (0);
2879         return (1);
2880 }
2881