sys/kern/vfs_subr.c

   1 /*
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. All advertising materials mentioning features or use of this software
  19  *    must display the following acknowledgement:
  20  *      This product includes software developed by the University of
  21  *      California, Berkeley and its contributors.
  22  * 4. Neither the name of the University nor the names of its contributors
  23  *    may be used to endorse or promote products derived from this software
  24  *    without specific prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
  39  * $FreeBSD$
  40  */
  41
  42 /*
  43  * External virtual filesystem routines
  44  */
  45 #include "opt_ddb.h"
  46
  47 #include <sys/param.h>
  48 #include <sys/systm.h>
  49 #include <sys/bio.h>
  50 #include <sys/buf.h>
  51 #include <sys/conf.h>
  52 #include <sys/dirent.h>
  53 #include <sys/domain.h>
  54 #include <sys/eventhandler.h>
  55 #include <sys/fcntl.h>
  56 #include <sys/kernel.h>
  57 #include <sys/kthread.h>
  58 #include <sys/malloc.h>
  59 #include <sys/mount.h>
  60 #include <sys/namei.h>
  61 #include <sys/proc.h>
  62 #include <sys/reboot.h>
  63 #include <sys/socket.h>
  64 #include <sys/stat.h>
  65 #include <sys/sysctl.h>
  66 #include <sys/vmmeter.h>
  67 #include <sys/vnode.h>
  68
  69 #include <machine/limits.h>
  70
  71 #include <vm/vm.h>
  72 #include <vm/vm_object.h>
  73 #include <vm/vm_extern.h>
  74 #include <vm/pmap.h>
  75 #include <vm/vm_map.h>
  76 #include <vm/vm_page.h>
  77 #include <vm/vm_pager.h>
  78 #include <vm/vnode_pager.h>
  79 #include <vm/vm_zone.h>
  80
   81 static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
   82
   83 static void     insmntque __P((struct vnode *vp, struct mount *mp));
   84 static void     vclean __P((struct vnode *vp, int flags, struct proc *p));
   85 static void     vfree __P((struct vnode *));
      /*
       * Number of vnodes allocated so far; getnewvnode() increments it each
       * time it takes a fresh vnode from the zone instead of recycling one.
       * NOTE(review): the variable is unsigned long but is exported with
       * SYSCTL_INT -- confirm the sizes agree on LP64 platforms.
       */
   86 static unsigned long    numvnodes;
   87 SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
   88
      /*
       * File-type translation tables.
       * NOTE(review): presumably iftovt_tab is indexed by the file-type bits
       * of a mode reduced to 0..15 and vttoif_tab by enum vtype -- confirm
       * against the macros that use them; they are not visible here.
       */
   89 enum vtype iftovt_tab[16] = {
   90         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
   91         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
   92 };
   93 int vttoif_tab[9] = {
   94         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
   95         S_IFSOCK, S_IFIFO, S_IFMT,
   96 };
  97
   98 static TAILQ_HEAD(freelst, struct vnode) vnode_free_list; /* vnode free list */
   99 struct tobefreelist vnode_tobefree_list;        /* vnodes waiting to be moved onto the free list */
  100
      /*
       * While wantfreevnodes is non-zero, getnewvnode() allocates a fresh
       * vnode instead of recycling one as long as freevnodes is below it.
       * NOTE(review): both counters are u_long but exported with SYSCTL_INT
       * -- confirm the sizes agree on LP64 platforms.
       */
  101 static u_long wantfreevnodes = 25;
  102 SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
      /* Adjusted by getnewvnode() as vnodes enter and leave vnode_free_list. */
  103 static u_long freevnodes = 0;
  104 SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
  105
      /*
       * reassignbuf counters and method selector, all writable through the
       * vfs sysctl tree.  Their users are outside this section of the file.
       */
  106 static int reassignbufcalls;
  107 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
  108 static int reassignbufloops;
  109 SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
  110 static int reassignbufsortgood;
  111 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
  112 static int reassignbufsortbad;
  113 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
  114 static int reassignbufmethod = 1;
  115 SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
  116
      /* vfs_ioopt exists only in kernels built with ENABLE_VFS_IOOPT; off (0) by default. */
  117 #ifdef ENABLE_VFS_IOOPT
  118 int vfs_ioopt = 0;
  119 SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
  120 #endif
 121
  122 struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
  123 struct simplelock mountlist_slock;      /* held by vfs_getvfs() while walking mountlist */
  124 struct simplelock mntvnode_slock;       /* held by insmntque() while changing a vnode's mount list */
  125 int     nfs_mount_type = -1;
      /*
       * These three locks are only declared when simplelocks are real:
       *   mntid_slock            held by vfs_getnewfsid() while picking an fsid
       *   vnode_free_list_slock  held by getnewvnode() while it works on the
       *                          free lists
       *   spechash_slock         initialised in vntblinit(); its users are
       *                          outside this section of the file
       */
  126 #ifndef NULL_SIMPLELOCKS
  127 static struct simplelock mntid_slock;
  128 static struct simplelock vnode_free_list_slock;
  129 static struct simplelock spechash_slock;
  130 #endif
  131 struct nfs_public nfs_pub;      /* publicly exported FS */
  132 static vm_zone_t vnode_zone;    /* created in vntblinit(); getnewvnode() allocates vnodes from it */
 133
  134 /*
  135  * The workitem queue.
       *
       * syncer_maxdelay starts out as SYNCER_MAXDELAY and is recomputed in
       * vntblinit() as syncer_mask + 1 once the hash table backing
       * syncer_workitem_pending has been allocated.
       *
       * NOTE(review): syncdelay, filedelay, dirdelay and metadelay are time_t,
       * yet three of them are exported with SYSCTL_INT -- confirm that time_t
       * and int have the same size on every supported platform.
  136  */
  137 #define SYNCER_MAXDELAY         32
  138 static int syncer_maxdelay = SYNCER_MAXDELAY;   /* maximum delay time */
  139 time_t syncdelay = 30;          /* max time to delay syncing data */
  140 time_t filedelay = 30;          /* time to delay syncing files */
  141 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
  142 time_t dirdelay = 29;           /* time to delay syncing directories */
  143 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
  144 time_t metadelay = 28;          /* time to delay syncing metadata */
  145 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
  146 static int rushjob;                     /* number of slots to run ASAP */
  147 static int stat_rush_requests;  /* number of times I/O speeded up */
  148 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
  149
  150 static int syncer_delayno = 0;
  151 static long syncer_mask;        /* mask handed back by hashinit() in vntblinit() */
  152 LIST_HEAD(synclist, struct vnode);
  153 static struct synclist *syncer_workitem_pending;        /* hash table allocated in vntblinit() */
  154
      /*
       * Set in vntblinit() to maxproc + cnt.v_page_count / 4 and writable
       * through sysctl.  getnewvnode() compares freevnodes against it only
       * when wantfreevnodes is zero.
       */
  155 int desiredvnodes;
  156 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
  157     &desiredvnodes, 0, "Maximum number of vnodes");
 158
 159 static void     vfs_free_addrlist __P((struct netexport *nep));
 160 static int      vfs_free_netcred __P((struct radix_node *rn, void *w));
 161 static int      vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
 162                                        struct export_args *argp));
 163
 164 /*
 165  * Initialize the vnode management data structures.
 166  */
 167 void
 168 vntblinit()
 169 {
 170
 171         desiredvnodes = maxproc + cnt.v_page_count / 4;
 172         simple_lock_init(&mntvnode_slock);
 173         simple_lock_init(&mntid_slock);
 174         simple_lock_init(&spechash_slock);
 175         TAILQ_INIT(&vnode_free_list);
 176         TAILQ_INIT(&vnode_tobefree_list);
 177         simple_lock_init(&vnode_free_list_slock);
 178         vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
 179         /*
 180          * Initialize the filesystem syncer.
 181          */
 182         syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
 183                 &syncer_mask);
 184         syncer_maxdelay = syncer_mask + 1;
 185 }
 186
 187 /*
 188  * Mark a mount point as busy. Used to synchronize access and to delay
 189  * unmounting. Interlock is not released on failure.
 190  */
 191 int
 192 vfs_busy(mp, flags, interlkp, p)
 193         struct mount *mp;
 194         int flags;
 195         struct simplelock *interlkp;
 196         struct proc *p;
 197 {
 198         int lkflags;
 199
 200         if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 201                 if (flags & LK_NOWAIT)
 202                         return (ENOENT);
 203                 mp->mnt_kern_flag |= MNTK_MWAIT;
 204                 if (interlkp) {
 205                         simple_unlock(interlkp);
 206                 }
 207                 /*
 208                  * Since all busy locks are shared except the exclusive
 209                  * lock granted when unmounting, the only place that a
 210                  * wakeup needs to be done is at the release of the
 211                  * exclusive lock at the end of dounmount.
 212                  */
 213                 tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
 214                 if (interlkp) {
 215                         simple_lock(interlkp);
 216                 }
 217                 return (ENOENT);
 218         }
 219         lkflags = LK_SHARED | LK_NOPAUSE;
 220         if (interlkp)
 221                 lkflags |= LK_INTERLOCK;
 222         if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
 223                 panic("vfs_busy: unexpected lock failure");
 224         return (0);
 225 }
 226
 227 /*
 228  * Free a busy filesystem.
 229  */
 230 void
 231 vfs_unbusy(mp, p)
 232         struct mount *mp;
 233         struct proc *p;
 234 {
 235
 236         lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
 237 }
 238
 239 /*
 240  * Lookup a filesystem type, and if found allocate and initialize
 241  * a mount structure for it.
 242  *
 243  * Devname is usually updated by mount(8) after booting.
 244  */
 245 int
 246 vfs_rootmountalloc(fstypename, devname, mpp)
 247         char *fstypename;
 248         char *devname;
 249         struct mount **mpp;
 250 {
 251         struct proc *p = curproc;       /* XXX */
 252         struct vfsconf *vfsp;
 253         struct mount *mp;
 254
 255         if (fstypename == NULL)
 256                 return (ENODEV);
 257         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
 258                 if (!strcmp(vfsp->vfc_name, fstypename))
 259                         break;
 260         if (vfsp == NULL)
 261                 return (ENODEV);
 262         mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
 263         bzero((char *)mp, (u_long)sizeof(struct mount));
 264         lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
 265         (void)vfs_busy(mp, LK_NOWAIT, 0, p);
 266         LIST_INIT(&mp->mnt_vnodelist);
 267         mp->mnt_vfc = vfsp;
 268         mp->mnt_op = vfsp->vfc_vfsops;
 269         mp->mnt_flag = MNT_RDONLY;
 270         mp->mnt_vnodecovered = NULLVP;
 271         vfsp->vfc_refcount++;
 272         mp->mnt_iosize_max = DFLTPHYS;
 273         mp->mnt_stat.f_type = vfsp->vfc_typenum;
 274         mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
 275         strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
 276         mp->mnt_stat.f_mntonname[0] = '/';
 277         mp->mnt_stat.f_mntonname[1] = 0;
 278         (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
 279         *mpp = mp;
 280         return (0);
 281 }
 282
 283 /*
 284  * Find an appropriate filesystem to use for the root. If a filesystem
 285  * has not been preselected, walk through the list of known filesystems
 286  * trying those that have mountroot routines, and try them until one
 287  * works or we have tried them all.
 288  */
 289 #ifdef notdef   /* XXX JH */
 290 int
 291 lite2_vfs_mountroot()
 292 {
 293         struct vfsconf *vfsp;
 294         extern int (*lite2_mountroot) __P((void));
 295         int error;
 296
 297         if (lite2_mountroot != NULL)
 298                 return ((*lite2_mountroot)());
 299         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
 300                 if (vfsp->vfc_mountroot == NULL)
 301                         continue;
 302                 if ((error = (*vfsp->vfc_mountroot)()) == 0)
 303                         return (0);
 304                 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
 305         }
 306         return (ENODEV);
 307 }
 308 #endif
 309
 310 /*
 311  * Lookup a mount point by filesystem identifier.
 312  */
 313 struct mount *
 314 vfs_getvfs(fsid)
 315         fsid_t *fsid;
 316 {
 317         register struct mount *mp;
 318
 319         simple_lock(&mountlist_slock);
 320         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 321                 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
 322                     mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
 323                         simple_unlock(&mountlist_slock);
 324                         return (mp);
 325             }
 326         }
 327         simple_unlock(&mountlist_slock);
 328         return ((struct mount *) 0);
 329 }
 330
 331 /*
 332  * Get a new unique fsid.  Try to make its val[0] unique, since this value
 333  * will be used to create fake device numbers for stat().  Also try (but
 334  * not so hard) make its val[0] unique mod 2^16, since some emulators only
 335  * support 16-bit device numbers.  We end up with unique val[0]'s for the
 336  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 337  *
 338  * Keep in mind that several mounts may be running in parallel.  Starting
 339  * the search one past where the previous search terminated is both a
 340  * micro-optimization and a defense against returning the same fsid to
 341  * different mounts.
 342  */
 343 void
 344 vfs_getnewfsid(mp)
 345         struct mount *mp;
 346 {
 347         static u_int16_t mntid_base;
 348         fsid_t tfsid;
 349         int mtype;
 350
 351         simple_lock(&mntid_slock);
 352         mtype = mp->mnt_vfc->vfc_typenum;
 353         tfsid.val[1] = mtype;
 354         mtype = (mtype & 0xFF) << 16;
 355         for (;;) {
 356                 tfsid.val[0] = makeudev(255, mtype | mntid_base++);
 357                 if (vfs_getvfs(&tfsid) == NULL)
 358                         break;
 359         }
 360         mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
 361         mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
 362         simple_unlock(&mntid_slock);
 363 }
 364
 365 /*
 366  * Knob to control the precision of file timestamps:
 367  *
 368  *   0 = seconds only; nanoseconds zeroed.
 369  *   1 = seconds and nanoseconds, accurate within 1/HZ.
 370  *   2 = seconds and nanoseconds, truncated to microseconds.
 371  * >=3 = seconds and nanoseconds, maximum precision.
 372  */
 373 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 374
 375 static int timestamp_precision = TSP_SEC;
 376 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
 377     &timestamp_precision, 0, "");
 378
 379 /*
 380  * Get a current timestamp.
 381  */
 382 void
 383 vfs_timestamp(tsp)
 384         struct timespec *tsp;
 385 {
 386         struct timeval tv;
 387
 388         switch (timestamp_precision) {
 389         case TSP_SEC:
 390                 tsp->tv_sec = time_second;
 391                 tsp->tv_nsec = 0;
 392                 break;
 393         case TSP_HZ:
 394                 getnanotime(tsp);
 395                 break;
 396         case TSP_USEC:
 397                 microtime(&tv);
 398                 TIMEVAL_TO_TIMESPEC(&tv, tsp);
 399                 break;
 400         case TSP_NSEC:
 401         default:
 402                 nanotime(tsp);
 403                 break;
 404         }
 405 }
 406
 407 /*
 408  * Set vnode attributes to VNOVAL
 409  */
 410 void
 411 vattr_null(vap)
 412         register struct vattr *vap;
 413 {
 414
 415         vap->va_type = VNON;
 416         vap->va_size = VNOVAL;
 417         vap->va_bytes = VNOVAL;
 418         vap->va_mode = VNOVAL;
 419         vap->va_nlink = VNOVAL;
 420         vap->va_uid = VNOVAL;
 421         vap->va_gid = VNOVAL;
 422         vap->va_fsid = VNOVAL;
 423         vap->va_fileid = VNOVAL;
 424         vap->va_blocksize = VNOVAL;
 425         vap->va_rdev = VNOVAL;
 426         vap->va_atime.tv_sec = VNOVAL;
 427         vap->va_atime.tv_nsec = VNOVAL;
 428         vap->va_mtime.tv_sec = VNOVAL;
 429         vap->va_mtime.tv_nsec = VNOVAL;
 430         vap->va_ctime.tv_sec = VNOVAL;
 431         vap->va_ctime.tv_nsec = VNOVAL;
 432         vap->va_flags = VNOVAL;
 433         vap->va_gen = VNOVAL;
 434         vap->va_vaflags = 0;
 435 }
 436
 437 /*
 438  * Routines having to do with the management of the vnode table.
 439  */
 440 extern vop_t **dead_vnodeop_p;
 441
  442 /*
  443  * Return the next vnode from the free list.
       *
       * Hands back a vnode through *vpp, either recycled from
       * vnode_free_list or freshly allocated from vnode_zone.  The vnode
       * comes back with v_usecount 1, v_type VNON, v_data cleared, v_tag and
       * v_op taken from `tag' and `vops', and queued on mount `mp' through
       * insmntque().  The only return statement returns 0.
  444  */
  445 int
  446 getnewvnode(tag, mp, vops, vpp)
  447         enum vtagtype tag;
  448         struct mount *mp;
  449         vop_t **vops;
  450         struct vnode **vpp;
  451 {
  452         int s;
  453         struct proc *p = curproc;       /* XXX */
  454         struct vnode *vp, *tvp, *nvp;
  455         vm_object_t object;
  456         TAILQ_HEAD(freelst, struct vnode) vnode_tmp_list;
  457
  458         /*
  459          * We take the least recently used vnode from the freelist
  460          * if we can get it and it has no cached pages, and no
  461          * namecache entries are relative to it.
  462          * Otherwise we allocate a new vnode
  463          */
  464
  465         s = splbio();
  466         simple_lock(&vnode_free_list_slock);
  467         TAILQ_INIT(&vnode_tmp_list);
  468
              /*
               * Drain vnode_tobefree_list onto the free list: VAGE vnodes go
               * to the head, all others to the tail.  Each one becomes VFREE
               * and is counted in freevnodes.
               */
  469         for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
  470                 nvp = TAILQ_NEXT(vp, v_freelist);
  471                 TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
  472                 if (vp->v_flag & VAGE) {
  473                         TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
  474                 } else {
  475                         TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
  476                 }
  477                 vp->v_flag &= ~(VTBFREE|VAGE);
  478                 vp->v_flag |= VFREE;
  479                 if (vp->v_usecount)
  480                         panic("tobe free vnode isn't");
  481                 freevnodes++;
  482         }
  483
              /*
               * Choose between recycling and allocating.  Leaving vp NULL
               * selects the allocation path further down; that also happens
               * when the scan below finds no usable vnode.
               */
  484         if (wantfreevnodes && freevnodes < wantfreevnodes) {
  485                 vp = NULL;
  486         } else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
  487                 /*
  488                  * XXX: this is only here to be backwards compatible
  489                  */
  490                 vp = NULL;
  491         } else {
  492                 for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
  493                         nvp = TAILQ_NEXT(vp, v_freelist);
                              /* Skip vnodes whose interlock is busy. */
  494                         if (!simple_lock_try(&vp->v_interlock))
  495                                 continue;
  496                         if (vp->v_usecount)
  497                                 panic("free vnode isn't");
  498
  499                         object = vp->v_object;
  500                         if (object && (object->resident_page_count || object->ref_count)) {
  501                                 printf("object inconsistant state: RPC: %d, RC: %d\n",
  502                                         object->resident_page_count, object->ref_count);
  503                                 /* Don't recycle if it's caching some pages */
                                      /*
                                       * Parked on vnode_tmp_list with its
                                       * interlock still held; the interlock
                                       * is dropped when the list is put back
                                       * below.
                                       */
  504                                 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
  505                                 TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
  506                                 continue;
  507                         } else if (LIST_FIRST(&vp->v_cache_src)) {
  508                                 /* Don't recycle if active in the namecache */
  509                                 simple_unlock(&vp->v_interlock);
  510                                 continue;
  511                         } else {
                                      /* Usable: leave the loop with vp's interlock held. */
  512                                 break;
  513                         }
  514                 }
  515         }
  516
              /*
               * Move the parked vnodes back to the tail of the free list and
               * release their interlocks.
               */
  517         for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
  518                 nvp = TAILQ_NEXT(tvp, v_freelist);
  519                 TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
  520                 TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
  521                 simple_unlock(&tvp->v_interlock);
  522         }
  523
  524         if (vp) {
                      /* Recycle path: take vp off the free list and clean it. */
  525                 vp->v_flag |= VDOOMED;
  526                 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
  527                 freevnodes--;
  528                 simple_unlock(&vnode_free_list_slock);
  529                 cache_purge(vp);
  530                 vp->v_lease = NULL;
                      /*
                       * NOTE(review): vgonel() is entered with the interlock
                       * held and the VBAD branch unlocks it by hand, so
                       * vgonel() presumably releases it -- confirm.
                       */
  531                 if (vp->v_type != VBAD) {
  532                         vgonel(vp, p);
  533                 } else {
  534                         simple_unlock(&vp->v_interlock);
  535                 }
  536
  537 #ifdef INVARIANTS
  538                 {
                              /* This `s' shadows the function-level one. */
  539                         int s;
  540
  541                         if (vp->v_data)
  542                                 panic("cleaned vnode isn't");
  543                         s = splbio();
  544                         if (vp->v_numoutput)
  545                                 panic("Clean vnode has pending I/O's");
  546                         splx(s);
  547                 }
  548 #endif
  549                 vp->v_flag = 0;
  550                 vp->v_lastw = 0;
  551                 vp->v_lasta = 0;
  552                 vp->v_cstart = 0;
  553                 vp->v_clen = 0;
  554                 vp->v_socket = 0;
  555                 vp->v_writecount = 0;   /* XXX */
  556         } else {
                      /* Allocation path: a zeroed vnode fresh from the zone. */
  557                 simple_unlock(&vnode_free_list_slock);
  558                 vp = (struct vnode *) zalloc(vnode_zone);
  559                 bzero((char *) vp, sizeof *vp);
  560                 simple_lock_init(&vp->v_interlock);
  561                 vp->v_dd = vp;
  562                 cache_purge(vp);
  563                 LIST_INIT(&vp->v_cache_src);
  564                 TAILQ_INIT(&vp->v_cache_dst);
  565                 numvnodes++;
  566         }
  567
              /* Initialisation common to recycled and new vnodes. */
  568         TAILQ_INIT(&vp->v_cleanblkhd);
  569         TAILQ_INIT(&vp->v_dirtyblkhd);
  570         vp->v_type = VNON;
  571         vp->v_tag = tag;
  572         vp->v_op = vops;
  573         insmntque(vp, mp);
  574         *vpp = vp;
  575         vp->v_usecount = 1;
  576         vp->v_data = 0;
  577         splx(s);
  578
  579         vfs_object_create(vp, p, p->p_ucred);
  580         return (0);
  581 }
 582
 583 /*
 584  * Move a vnode from one mount queue to another.
 585  */
 586 static void
 587 insmntque(vp, mp)
 588         register struct vnode *vp;
 589         register struct mount *mp;
 590 {
 591
 592         simple_lock(&mntvnode_slock);
 593         /*
 594          * Delete from old mount point vnode list, if on one.
 595          */
 596         if (vp->v_mount != NULL)
 597                 LIST_REMOVE(vp, v_mntvnodes);
 598         /*
 599          * Insert into list of vnodes for the new mount point, if available.
 600          */
 601         if ((vp->v_mount = mp) == NULL) {
 602                 simple_unlock(&mntvnode_slock);
 603                 return;
 604         }
 605         LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
 606         simple_unlock(&mntvnode_slock);
 607 }
 608
 609 /*
 610  * Update outstanding I/O count and do wakeup if requested.
 611  */
 612 void
 613 vwakeup(bp)
 614         register struct buf *bp;
 615 {
 616         register struct vnode *vp;
 617
 618         bp->b_flags &= ~B_WRITEINPROG;
 619         if ((vp = bp->b_vp)) {
 620                 vp->v_numoutput--;
 621                 if (vp->v_numoutput < 0)
 622                         panic("vwakeup: neg numoutput");
 623                 if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
 624                         vp->v_flag &= ~VBWAIT;
 625                         wakeup((caddr_t) &vp->v_numoutput);
 626                 }
 627         }
 628 }
 629
  630 /*
  631  * Flush out and invalidate all buffers associated with a vnode.
  632  * Called with the underlying object locked.
       *
       * With V_SAVE in `flags', dirty data is written out first: the routine
       * waits for pending writes, calls VOP_FSYNC() if the dirty list is not
       * empty, and writes any delayed-write buffer it still meets while
       * invalidating.  Without V_SAVE, buffers are simply invalidated.
       *
       * `slpflag' and `slptimeo' are passed to the sleeps taken while waiting
       * for output and for busy buffers.  Returns 0 on success, or the error
       * from tsleep(), VOP_FSYNC() or BUF_TIMELOCK().  Panics if buffers are
       * still attached to the vnode at the end.
  633  */
  634 int
  635 vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
  636         register struct vnode *vp;
  637         int flags;
  638         struct ucred *cred;
  639         struct proc *p;
  640         int slpflag, slptimeo;
  641 {
  642         register struct buf *bp;
  643         struct buf *nbp, *blist;
  644         int s, error;
  645         vm_object_t object;
  646
  647         if (flags & V_SAVE) {
  648                 s = splbio();
                      /* Wait until every write already in flight has finished. */
  649                 while (vp->v_numoutput) {
  650                         vp->v_flag |= VBWAIT;
  651                         error = tsleep((caddr_t)&vp->v_numoutput,
  652                             slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
  653                         if (error) {
  654                                 splx(s);
  655                                 return (error);
  656                         }
  657                 }
  658                 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
                              /* VOP_FSYNC() is called with the spl level restored. */
  659                         splx(s);
  660                         if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
  661                                 return (error);
  662                         s = splbio();
  663                         if (vp->v_numoutput > 0 ||
  664                             !TAILQ_EMPTY(&vp->v_dirtyblkhd))
  665                                 panic("vinvalbuf: dirty bufs");
  666                 }
  667                 splx(s);
  668         }
  669         s = splbio();
              /*
               * Keep rescanning from the head of the clean list, or of the
               * dirty list once the clean list is empty, until both are empty.
               * Every `break' out of the inner loop restarts the scan here.
               */
  670         for (;;) {
  671                 blist = TAILQ_FIRST(&vp->v_cleanblkhd);
  672                 if (!blist)
  673                         blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
  674                 if (!blist)
  675                         break;
  676
  677                 for (bp = blist; bp; bp = nbp) {
  678                         nbp = TAILQ_NEXT(bp, b_vnbufs);
  679                         if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                                      /*
                                       * The buffer is busy.  Sleep for it:
                                       * ENOLCK restarts the scan, any other
                                       * result is returned to the caller.
                                       */
  680                                 error = BUF_TIMELOCK(bp,
  681                                     LK_EXCLUSIVE | LK_SLEEPFAIL,
  682                                     "vinvalbuf", slpflag, slptimeo);
  683                                 if (error == ENOLCK)
  684                                         break;
  685                                 splx(s);
  686                                 return (error);
  687                         }
  688                         /*
  689                          * XXX Since there are no node locks for NFS, I
  690                          * believe there is a slight chance that a delayed
  691                          * write will occur while sleeping just above, so
  692                          * check for it.  Note that vfs_bio_awrite expects
  693                          * buffers to reside on a queue, while VOP_BWRITE and
  694                          * brelse do not.
  695                          */
  696                         if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
  697                                 (flags & V_SAVE)) {
  698
  699                                 if (bp->b_vp == vp) {
  700                                         if (bp->b_flags & B_CLUSTEROK) {
  701                                                 BUF_UNLOCK(bp);
  702                                                 vfs_bio_awrite(bp);
  703                                         } else {
  704                                                 bremfree(bp);
  705                                                 bp->b_flags |= B_ASYNC;
  706                                                 BUF_WRITE(bp);
  707                                         }
  708                                 } else {
  709                                         bremfree(bp);
  710                                         (void) BUF_WRITE(bp);
  711                                 }
                                      /* A write was started; rescan the lists. */
  712                                 break;
  713                         }
                              /* Nothing to save: invalidate and release the buffer. */
  714                         bremfree(bp);
  715                         bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
  716                         bp->b_flags &= ~B_ASYNC;
  717                         brelse(bp);
  718                 }
  719         }
  720
              /* Wait for the writes started above; this sleep has no timeout. */
  721         while (vp->v_numoutput > 0) {
  722                 vp->v_flag |= VBWAIT;
  723                 tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
  724         }
  725
  726         splx(s);
  727
  728         /*
  729          * Destroy the copy in the VM cache, too.
  730          */
  731         simple_lock(&vp->v_interlock);
  732         object = vp->v_object;
  733         if (object != NULL) {
                      /* The last argument is TRUE only when V_SAVE was requested. */
  734                 vm_object_page_remove(object, 0, 0,
  735                         (flags & V_SAVE) ? TRUE : FALSE);
  736         }
  737         simple_unlock(&vp->v_interlock);
  738
  739         if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
  740                 panic("vinvalbuf: flush failed");
  741         return (0);
  742 }
 743
 744 /*
 745  * Truncate a file's buffer and pages to a specified length.  This
 746  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 747  * sync activity.
 748  */
 749 int
 750 vtruncbuf(vp, cred, p, length, blksize)
 751         register struct vnode *vp;
 752         struct ucred *cred;
 753         struct proc *p;
 754         off_t length;
 755         int blksize;
 756 {
 757         register struct buf *bp;
 758         struct buf *nbp;
 759         int s, anyfreed;
 760         int trunclbn;
 761
 762         /*
 763          * Round up to the *next* lbn.
 764          */
 765         trunclbn = (length + blksize - 1) / blksize;
 766
 767         s = splbio();
 768 restart:
 769         anyfreed = 1;
 770         for (;anyfreed;) {
 771                 anyfreed = 0;
 772                 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
 773                         nbp = TAILQ_NEXT(bp, b_vnbufs);
 774                         if (bp->b_lblkno >= trunclbn) {
 775                                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 776                                         BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
 777                                         goto restart;
 778                                 } else {
 779                                         bremfree(bp);
 780                                         bp->b_flags |= (B_INVAL | B_RELBUF);
 781                                         bp->b_flags &= ~B_ASYNC;
 782                                         brelse(bp);
 783                                         anyfreed = 1;
 784                                 }
 785                                 if (nbp &&
 786                                     (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
 787                                     (nbp->b_vp != vp) ||
 788                                     (nbp->b_flags & B_DELWRI))) {
 789                                         goto restart;
 790                                 }
 791                         }
 792                 }
 793
 794                 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 795                         nbp = TAILQ_NEXT(bp, b_vnbufs);
 796                         if (bp->b_lblkno >= trunclbn) {
 797                                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 798                                         BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
 799                                         goto restart;
 800                                 } else {
 801                                         bremfree(bp);
 802                                         bp->b_flags |= (B_INVAL | B_RELBUF);
 803                                         bp->b_flags &= ~B_ASYNC;
 804                                         brelse(bp);
 805                                         anyfreed = 1;
 806                                 }
 807                                 if (nbp &&
 808                                     (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
 809                                     (nbp->b_vp != vp) ||
 810                                     (nbp->b_flags & B_DELWRI) == 0)) {
 811                                         goto restart;
 812                                 }
 813                         }
 814                 }
 815         }
 816
 817         if (length > 0) {
 818 restartsync:
 819                 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
 820                         nbp = TAILQ_NEXT(bp, b_vnbufs);
 821                         if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
 822                                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 823                                         BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
 824                                         goto restart;
 825                                 } else {
 826                                         bremfree(bp);
 827                                         if (bp->b_vp == vp) {
 828                                                 bp->b_flags |= B_ASYNC;
 829                                         } else {
 830                                                 bp->b_flags &= ~B_ASYNC;
 831                                         }
 832                                         BUF_WRITE(bp);
 833                                 }
 834                                 goto restartsync;
 835                         }
 836
 837                 }
 838         }
 839
 840         while (vp->v_numoutput > 0) {
 841                 vp->v_flag |= VBWAIT;
 842                 tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
 843         }
 844
 845         splx(s);
 846
 847         vnode_pager_setsize(vp, length);
 848
 849         return (0);
 850 }
 851
 852 /*
 853  * Associate a buffer with a vnode.
 854  */
 855 void
 856 bgetvp(vp, bp)
 857         register struct vnode *vp;
 858         register struct buf *bp;
 859 {
 860         int s;
 861
 862         KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
 863
 864         vhold(vp);
 865         bp->b_vp = vp;
 866         bp->b_dev = vn_todev(vp);
 867         /*
 868          * Insert onto list for new vnode.
 869          */
 870         s = splbio();
 871         bp->b_xflags |= BX_VNCLEAN;
 872         bp->b_xflags &= ~BX_VNDIRTY;
 873         TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
 874         splx(s);
 875 }
 876
 877 /*
 878  * Disassociate a buffer from a vnode.
 879  */
 880 void
 881 brelvp(bp)
 882         register struct buf *bp;
 883 {
 884         struct vnode *vp;
 885         struct buflists *listheadp;
 886         int s;
 887
 888         KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
 889
 890         /*
 891          * Delete from old vnode list, if on one.
 892          */
 893         vp = bp->b_vp;
 894         s = splbio();
 895         if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
 896                 if (bp->b_xflags & BX_VNDIRTY)
 897                         listheadp = &vp->v_dirtyblkhd;
 898                 else
 899                         listheadp = &vp->v_cleanblkhd;
 900                 TAILQ_REMOVE(listheadp, bp, b_vnbufs);
 901                 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 902         }
 903         if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
 904                 vp->v_flag &= ~VONWORKLST;
 905                 LIST_REMOVE(vp, v_synclist);
 906         }
 907         splx(s);
 908         bp->b_vp = (struct vnode *) 0;
 909         vdrop(vp);
 910 }
 911
 912 /*
 913  * The workitem queue.
 914  *
 915  * It is useful to delay writes of file data and filesystem metadata
 916  * for tens of seconds so that quickly created and deleted files need
 917  * not waste disk bandwidth being created and removed. To realize this,
 918  * we append vnodes to a "workitem" queue. When running with a soft
 919  * updates implementation, most pending metadata dependencies should
 920  * not wait for more than a few seconds. Thus, metadata on mounted block
 921  * devices is delayed only about half the time that file data is delayed.
 922  * Similarly, directory updates are more critical, so are only delayed
 923  * about a third the time that file data is delayed. Thus, there are
 924  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 925  * one each second (driven off the filesystem syncer process). The
 926  * syncer_delayno variable indicates the next queue that is to be processed.
 927  * Items that need to be processed soon are placed in this queue:
 928  *
 929  *      syncer_workitem_pending[syncer_delayno]
 930  *
 931  * A delay of fifteen seconds is done by placing the request fifteen
 932  * entries later in the queue:
 933  *
 934  *      syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 935  *
 936  */
 937
 938 /*
 939  * Add an item to the syncer work queue.
 940  */
 941 static void
 942 vn_syncer_add_to_worklist(struct vnode *vp, int delay)
 943 {
 944         int s, slot;
 945
 946         s = splbio();
 947
 948         if (vp->v_flag & VONWORKLST) {
 949                 LIST_REMOVE(vp, v_synclist);
 950         }
 951
 952         if (delay > syncer_maxdelay - 2)
 953                 delay = syncer_maxdelay - 2;
 954         slot = (syncer_delayno + delay) & syncer_mask;
 955
 956         LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
 957         vp->v_flag |= VONWORKLST;
 958         splx(s);
 959 }
 960
 961 struct  proc *updateproc;
 962 static void sched_sync __P((void));
 963 static struct kproc_desc up_kp = {
 964         "syncer",
 965         sched_sync,
 966         &updateproc
 967 };
 968 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
 969
 970 /*
 971  * System filesystem synchronizer daemon.
 972  */
 973 void
 974 sched_sync(void)
 975 {
 976         struct synclist *slp;
 977         struct vnode *vp;
 978         long starttime;
 979         int s;
 980         struct proc *p = updateproc;
 981
 982         EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
 983             SHUTDOWN_PRI_LAST);
 984
 985         for (;;) {
 986                 kproc_suspend_loop(p);
 987
 988                 starttime = time_second;
 989
 990                 /*
 991                  * Push files whose dirty time has expired.  Be careful
 992                  * of interrupt race on slp queue.
 993                  */
 994                 s = splbio();
 995                 slp = &syncer_workitem_pending[syncer_delayno];
 996                 syncer_delayno += 1;
 997                 if (syncer_delayno == syncer_maxdelay)
 998                         syncer_delayno = 0;
 999                 splx(s);
1000
1001                 while ((vp = LIST_FIRST(slp)) != NULL) {
1002                         if (VOP_ISLOCKED(vp, NULL) == 0) {
1003                                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
1004                                 (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
1005                                 VOP_UNLOCK(vp, 0, p);
1006                         }
1007                         s = splbio();
1008                         if (LIST_FIRST(slp) == vp) {
1009                                 /*
1010                                  * Note: v_tag VT_VFS vps can remain on the
1011                                  * worklist too with no dirty blocks, but
1012                                  * since sync_fsync() moves it to a different
1013                                  * slot we are safe.
1014                                  */
1015                                 if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
1016                                     !vn_isdisk(vp, NULL))
1017                                         panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
1018                                 /*
1019                                  * Put us back on the worklist.  The worklist
1020                                  * routine will remove us from our current
1021                                  * position and then add us back in at a later
1022                                  * position.
1023                                  */
1024                                 vn_syncer_add_to_worklist(vp, syncdelay);
1025                         }
1026                         splx(s);
1027                 }
1028
1029                 /*
1030                  * Do soft update processing.
1031                  */
1032                 if (bioops.io_sync)
1033                         (*bioops.io_sync)(NULL);
1034
1035                 /*
1036                  * The variable rushjob allows the kernel to speed up the
1037                  * processing of the filesystem syncer process. A rushjob
1038                  * value of N tells the filesystem syncer to process the next
1039                  * N seconds worth of work on its queue ASAP. Currently rushjob
1040                  * is used by the soft update code to speed up the filesystem
1041                  * syncer process when the incore state is getting so far
1042                  * ahead of the disk that the kernel memory pool is being
1043                  * threatened with exhaustion.
1044                  */
1045                 if (rushjob > 0) {
1046                         rushjob -= 1;
1047                         continue;
1048                 }
1049                 /*
1050                  * If it has taken us less than a second to process the
1051                  * current work, then wait. Otherwise start right over
1052                  * again. We can still lose time if any single round
1053                  * takes more than two seconds, but it does not really
1054                  * matter as we are just trying to generally pace the
1055                  * filesystem activity.
1056                  */
1057                 if (time_second == starttime)
1058                         tsleep(&lbolt, PPAUSE, "syncer", 0);
1059         }
1060 }
1061
1062 /*
1063  * Request the syncer daemon to speed up its work.
1064  * We never push it to speed up more than half of its
1065  * normal turn time, otherwise it could take over the cpu.
1066  */
1067 int
1068 speedup_syncer()
1069 {
1070         int s;
1071
1072         s = splhigh();
1073         if (updateproc->p_wchan == &lbolt)
1074                 setrunnable(updateproc);
1075         splx(s);
1076         if (rushjob < syncdelay / 2) {
1077                 rushjob += 1;
1078                 stat_rush_requests += 1;
1079                 return (1);
1080         }
1081         return(0);
1082 }
1083
1084 /*
1085  * Associate a p-buffer with a vnode.
1086  *
1087  * Also sets B_PAGING flag to indicate that vnode is not fully associated
1088  * with the buffer.  i.e. the bp has not been linked into the vnode or
1089  * ref-counted.
1090  */
1091 void
1092 pbgetvp(vp, bp)
1093         register struct vnode *vp;
1094         register struct buf *bp;
1095 {
1096
1097         KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1098
1099         bp->b_vp = vp;
1100         bp->b_flags |= B_PAGING;
1101         bp->b_dev = vn_todev(vp);
1102 }
1103
1104 /*
1105  * Disassociate a p-buffer from a vnode.
1106  */
1107 void
1108 pbrelvp(bp)
1109         register struct buf *bp;
1110 {
1111
1112         KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1113
1114         /* XXX REMOVE ME */
1115         if (bp->b_vnbufs.tqe_next != NULL) {
1116                 panic(
1117                     "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1118                     bp,
1119                     (int)bp->b_flags
1120                 );
1121         }
1122         bp->b_vp = (struct vnode *) 0;
1123         bp->b_flags &= ~B_PAGING;
1124 }
1125
1126 void
1127 pbreassignbuf(bp, newvp)
1128         struct buf *bp;
1129         struct vnode *newvp;
1130 {
1131         if ((bp->b_flags & B_PAGING) == 0) {
1132                 panic(
1133                     "pbreassignbuf() on non phys bp %p",
1134                     bp
1135                 );
1136         }
1137         bp->b_vp = newvp;
1138 }
1139
1140 /*
1141  * Reassign a buffer from one vnode to another.
1142  * Used to assign file specific control information
1143  * (indirect blocks) to the vnode to which they belong.
1144  */
1145 void
1146 reassignbuf(bp, newvp)
1147         register struct buf *bp;
1148         register struct vnode *newvp;
1149 {
1150         struct buflists *listheadp;
1151         int delay;
1152         int s;
1153
1154         if (newvp == NULL) {
1155                 printf("reassignbuf: NULL");
1156                 return;
1157         }
1158         ++reassignbufcalls;
1159
1160         /*
1161          * B_PAGING flagged buffers cannot be reassigned because their vp
1162          * is not fully linked in.
1163          */
1164         if (bp->b_flags & B_PAGING)
1165                 panic("cannot reassign paging buffer");
1166
1167         s = splbio();
1168         /*
1169          * Delete from old vnode list, if on one.
1170          */
1171         if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1172                 if (bp->b_xflags & BX_VNDIRTY)
1173                         listheadp = &bp->b_vp->v_dirtyblkhd;
1174                 else
1175                         listheadp = &bp->b_vp->v_cleanblkhd;
1176                 TAILQ_REMOVE(listheadp, bp, b_vnbufs);
1177                 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1178                 if (bp->b_vp != newvp) {
1179                         vdrop(bp->b_vp);
1180                         bp->b_vp = NULL;        /* for clarification */
1181                 }
1182         }
1183         /*
1184          * If dirty, put on list of dirty buffers; otherwise insert onto list
1185          * of clean buffers.
1186          */
1187         if (bp->b_flags & B_DELWRI) {
1188                 struct buf *tbp;
1189
1190                 listheadp = &newvp->v_dirtyblkhd;
1191                 if ((newvp->v_flag & VONWORKLST) == 0) {
1192                         switch (newvp->v_type) {
1193                         case VDIR:
1194                                 delay = dirdelay;
1195                                 break;
1196                         case VCHR:
1197                         case VBLK:
1198                                 if (newvp->v_specmountpoint != NULL) {
1199                                         delay = metadelay;
1200                                         break;
1201                                 }
1202                                 /* fall through */
1203                         default:
1204                                 delay = filedelay;
1205                         }
1206                         vn_syncer_add_to_worklist(newvp, delay);
1207                 }
1208                 bp->b_xflags |= BX_VNDIRTY;
1209                 tbp = TAILQ_FIRST(listheadp);
1210                 if (tbp == NULL ||
1211                     bp->b_lblkno == 0 ||
1212                     (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
1213                     (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
1214                         TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1215                         ++reassignbufsortgood;
1216                 } else if (bp->b_lblkno < 0) {
1217                         TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
1218                         ++reassignbufsortgood;
1219                 } else if (reassignbufmethod == 1) {
1220                         /*
1221                          * New sorting algorithm, only handle sequential case,
1222                          * otherwise append to end (but before metadata)
1223                          */
1224                         if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
1225                             (tbp->b_xflags & BX_VNDIRTY)) {
1226                                 /*
1227                                  * Found the best place to insert the buffer
1228                                  */
1229                                 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1230                                 ++reassignbufsortgood;
1231                         } else {
1232                                 /*
1233                                  * Missed, append to end, but before meta-data.
1234                                  * We know that the head buffer in the list is
1235                                  * not meta-data due to prior conditionals.
1236                                  *
1237                                  * Indirect effects:  NFS second stage write
1238                                  * tends to wind up here, giving maximum
1239                                  * distance between the unstable write and the
1240                                  * commit rpc.
1241                                  */
1242                                 tbp = TAILQ_LAST(listheadp, buflists);
1243                                 while (tbp && tbp->b_lblkno < 0)
1244                                         tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
1245                                 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1246                                 ++reassignbufsortbad;
1247                         }
1248                 } else {
1249                         /*
1250                          * Old sorting algorithm, scan queue and insert
1251                          */
1252                         struct buf *ttbp;
1253                         while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
1254                             (ttbp->b_lblkno < bp->b_lblkno)) {
1255                                 ++reassignbufloops;
1256                                 tbp = ttbp;
1257                         }
1258                         TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1259                 }
1260         } else {
1261                 bp->b_xflags |= BX_VNCLEAN;
1262                 TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
1263                 if ((newvp->v_flag & VONWORKLST) &&
1264                     TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1265                         newvp->v_flag &= ~VONWORKLST;
1266                         LIST_REMOVE(newvp, v_synclist);
1267                 }
1268         }
1269         if (bp->b_vp != newvp) {
1270                 bp->b_vp = newvp;
1271                 vhold(bp->b_vp);
1272         }
1273         splx(s);
1274 }
1275
1276 /*
1277  * Create a vnode for a block device.
1278  * Used for mounting the root file system.
1279  * XXX: This now changed to a VCHR due to the block/char merging.
1280  */
1281 int
1282 bdevvp(dev, vpp)
1283         dev_t dev;
1284         struct vnode **vpp;
1285 {
1286         register struct vnode *vp;
1287         struct vnode *nvp;
1288         int error;
1289
1290         if (dev == NODEV) {
1291                 *vpp = NULLVP;
1292                 return (ENXIO);
1293         }
1294         error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1295         if (error) {
1296                 *vpp = NULLVP;
1297                 return (error);
1298         }
1299         vp = nvp;
1300         vp->v_type = VCHR;
1301         addalias(vp, dev);
1302         *vpp = vp;
1303         return (0);
1304 }
1305
1306 /*
1307  * Add vnode to the alias list hung off the dev_t.
1308  *
1309  * The reason for this gunk is that multiple vnodes can reference
1310  * the same physical device, so checking vp->v_usecount to see
1311  * how many users there are is inadequate; the v_usecount for
1312  * the vnodes need to be accumulated.  vcount() does that.
1313  */
1314 void
1315 addaliasu(nvp, nvp_rdev)
1316         struct vnode *nvp;
1317         udev_t nvp_rdev;
1318 {
1319
1320         if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1321                 panic("addaliasu on non-special vnode");
1322         addalias(nvp, udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0));
1323 }
1324
1325 void
1326 addalias(nvp, dev)
1327         struct vnode *nvp;
1328         dev_t dev;
1329 {
1330
1331         if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1332                 panic("addalias on non-special vnode");
1333
1334         nvp->v_rdev = dev;
1335         simple_lock(&spechash_slock);
1336         SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1337         simple_unlock(&spechash_slock);
1338 }
1339
1340 /*
1341  * Grab a particular vnode from the free list, increment its
1342  * reference count and lock it. The vnode lock bit is set if the
1343  * vnode is being eliminated in vgone. The process is awakened
1344  * when the transition is completed, and an error returned to
1345  * indicate that the vnode is no longer usable (possibly having
1346  * been changed to a new file system type).
1347  */
1348 int
1349 vget(vp, flags, p)
1350         register struct vnode *vp;
1351         int flags;
1352         struct proc *p;
1353 {
1354         int error;
1355
1356         /*
1357          * If the vnode is in the process of being cleaned out for
1358          * another use, we wait for the cleaning to finish and then
1359          * return failure. Cleaning is determined by checking that
1360          * the VXLOCK flag is set.
1361          */
1362         if ((flags & LK_INTERLOCK) == 0) {
1363                 simple_lock(&vp->v_interlock);
1364         }
1365         if (vp->v_flag & VXLOCK) {
1366                 vp->v_flag |= VXWANT;
1367                 simple_unlock(&vp->v_interlock);
1368                 tsleep((caddr_t)vp, PINOD, "vget", 0);
1369                 return (ENOENT);
1370         }
1371
1372         vp->v_usecount++;
1373
1374         if (VSHOULDBUSY(vp))
1375                 vbusy(vp);
1376         if (flags & LK_TYPE_MASK) {
1377                 if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
1378                         /*
1379                          * must expand vrele here because we do not want
1380                          * to call VOP_INACTIVE if the reference count
1381                          * drops back to zero since it was never really
1382                          * active. We must remove it from the free list
1383                          * before sleeping so that multiple processes do
1384                          * not try to recycle it.
1385                          */
1386                         simple_lock(&vp->v_interlock);
1387                         vp->v_usecount--;
1388                         if (VSHOULDFREE(vp))
1389                                 vfree(vp);
1390                         simple_unlock(&vp->v_interlock);
1391                 }
1392                 return (error);
1393         }
1394         simple_unlock(&vp->v_interlock);
1395         return (0);
1396 }
1397
1398 void
1399 vref(struct vnode *vp)
1400 {
1401         simple_lock(&vp->v_interlock);
1402         vp->v_usecount++;
1403         simple_unlock(&vp->v_interlock);
1404 }
1405
1406 /*
1407  * Vnode put/release.
1408  * If count drops to zero, call inactive routine and return to freelist.
1409  */
1410 void
1411 vrele(vp)
1412         struct vnode *vp;
1413 {
1414         struct proc *p = curproc;       /* XXX */
1415
1416         KASSERT(vp != NULL, ("vrele: null vp"));
1417
1418         simple_lock(&vp->v_interlock);
1419
1420         if (vp->v_usecount > 1) {
1421
1422                 vp->v_usecount--;
1423                 simple_unlock(&vp->v_interlock);
1424
1425                 return;
1426         }
1427
1428         if (vp->v_usecount == 1) {
1429
1430                 vp->v_usecount--;
1431                 if (VSHOULDFREE(vp))
1432                         vfree(vp);
1433         /*
1434          * If we are doing a vput, the node is already locked, and we must
1435          * call VOP_INACTIVE with the node locked.  So, in the case of
1436          * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1437          */
1438                 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1439                         VOP_INACTIVE(vp, p);
1440                 }
1441
1442         } else {
1443 #ifdef DIAGNOSTIC
1444                 vprint("vrele: negative ref count", vp);
1445                 simple_unlock(&vp->v_interlock);
1446 #endif
1447                 panic("vrele: negative ref cnt");
1448         }
1449 }
1450
1451 void
1452 vput(vp)
1453         struct vnode *vp;
1454 {
1455         struct proc *p = curproc;       /* XXX */
1456
1457         KASSERT(vp != NULL, ("vput: null vp"));
1458
1459         simple_lock(&vp->v_interlock);
1460
1461         if (vp->v_usecount > 1) {
1462
1463                 vp->v_usecount--;
1464                 VOP_UNLOCK(vp, LK_INTERLOCK, p);
1465                 return;
1466
1467         }
1468
1469         if (vp->v_usecount == 1) {
1470
1471                 vp->v_usecount--;
1472                 if (VSHOULDFREE(vp))
1473                         vfree(vp);
1474         /*
1475          * If we are doing a vput, the node is already locked, and we must
1476          * call VOP_INACTIVE with the node locked.  So, in the case of
1477          * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1478          */
1479                 simple_unlock(&vp->v_interlock);
1480                 VOP_INACTIVE(vp, p);
1481
1482         } else {
1483 #ifdef DIAGNOSTIC
1484                 vprint("vput: negative ref count", vp);
1485 #endif
1486                 panic("vput: negative ref cnt");
1487         }
1488 }
1489
1490 /*
1491  * Somebody doesn't want the vnode recycled.
1492  */
1493 void
1494 vhold(vp)
1495         register struct vnode *vp;
1496 {
1497         int s;
1498
1499         s = splbio();
1500         vp->v_holdcnt++;
1501         if (VSHOULDBUSY(vp))
1502                 vbusy(vp);
1503         splx(s);
1504 }
1505
1506 /*
1507  * One less who cares about this vnode.
1508  */
1509 void
1510 vdrop(vp)
1511         register struct vnode *vp;
1512 {
1513         int s;
1514
1515         s = splbio();
1516         if (vp->v_holdcnt <= 0)
1517                 panic("vdrop: holdcnt");
1518         vp->v_holdcnt--;
1519         if (VSHOULDFREE(vp))
1520                 vfree(vp);
1521         splx(s);
1522 }
1523
1524 /*
1525  * Remove any vnodes in the vnode table belonging to mount point mp.
1526  *
1527  * If MNT_NOFORCE is specified, there should not be any active ones,
1528  * return error if any are found (nb: this is a user error, not a
1529  * system error). If MNT_FORCE is specified, detach any active vnodes
1530  * that are found.
1531  */
1532 #ifdef DIAGNOSTIC
1533 static int busyprt = 0;         /* print out busy vnodes */
1534 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1535 #endif
1536
1537 int
1538 vflush(mp, skipvp, flags)
1539         struct mount *mp;
1540         struct vnode *skipvp;
1541         int flags;
1542 {
1543         struct proc *p = curproc;       /* XXX */
1544         struct vnode *vp, *nvp;
1545         int busy = 0;
1546
1547         simple_lock(&mntvnode_slock);
1548 loop:
1549         for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
1550                 /*
1551                  * Make sure this vnode wasn't reclaimed in getnewvnode().
1552                  * Start over if it has (it won't be on the list anymore).
1553                  */
1554                 if (vp->v_mount != mp)
1555                         goto loop;
1556                 nvp = LIST_NEXT(vp, v_mntvnodes);
1557                 /*
1558                  * Skip over a selected vnode.
1559                  */
1560                 if (vp == skipvp)
1561                         continue;
1562
1563                 simple_lock(&vp->v_interlock);
1564                 /*
1565                  * Skip over a vnodes marked VSYSTEM.
1566                  */
1567                 if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1568                         simple_unlock(&vp->v_interlock);
1569                         continue;
1570                 }
1571                 /*
1572                  * If WRITECLOSE is set, only flush out regular file vnodes
1573                  * open for writing.
1574                  */
1575                 if ((flags & WRITECLOSE) &&
1576                     (vp->v_writecount == 0 || vp->v_type != VREG)) {
1577                         simple_unlock(&vp->v_interlock);
1578                         continue;
1579                 }
1580
1581                 /*
1582                  * With v_usecount == 0, all we need to do is clear out the
1583                  * vnode data structures and we are done.
1584                  */
1585                 if (vp->v_usecount == 0) {
1586                         simple_unlock(&mntvnode_slock);
1587                         vgonel(vp, p);
1588                         simple_lock(&mntvnode_slock);
1589                         continue;
1590                 }
1591
1592                 /*
1593                  * If FORCECLOSE is set, forcibly close the vnode. For block
1594                  * or character devices, revert to an anonymous device. For
1595                  * all other files, just kill them.
1596                  */
1597                 if (flags & FORCECLOSE) {
1598                         simple_unlock(&mntvnode_slock);
1599                         if (vp->v_type != VBLK && vp->v_type != VCHR) {
1600                                 vgonel(vp, p);
1601                         } else {
1602                                 vclean(vp, 0, p);
1603                                 vp->v_op = spec_vnodeop_p;
1604                                 insmntque(vp, (struct mount *) 0);
1605                         }
1606                         simple_lock(&mntvnode_slock);
1607                         continue;
1608                 }
1609 #ifdef DIAGNOSTIC
1610                 if (busyprt)
1611                         vprint("vflush: busy vnode", vp);
1612 #endif
1613                 simple_unlock(&vp->v_interlock);
1614                 busy++;
1615         }
1616         simple_unlock(&mntvnode_slock);
1617         if (busy)
1618                 return (EBUSY);
1619         return (0);
1620 }
1621
1622 /*
1623  * Disassociate the underlying file system from a vnode.
      *
      * vp:    vnode to be cleaned out.
      * flags: if DOCLOSE is set and the vnode was in use on entry,
      *        VOP_CLOSE is issued before the vnode is deactivated.
      * p:     process handed to the VOP_* calls and to vinvalbuf().
      *
      * VXLOCK is set for the duration of the clean (finding it already
      * set is a panic).  On return the vnode's operations vector is
      * dead_vnodeop_p, its tag is VT_NON, VXLOCK is clear and any
      * thread that had set VXWANT has been woken up.
      *
      * NOTE(review): VOP_LOCK is called with LK_INTERLOCK, so callers
      * presumably enter with vp->v_interlock held -- confirm against
      * the callers.
1624  */
1625 static void
1626 vclean(vp, flags, p)
1627         struct vnode *vp;
1628         int flags;
1629         struct proc *p;
1630 {
1631         int active;
1632         vm_object_t obj;
1633
1634         /*
1635          * Check to see if the vnode is in use. If so we have to reference it
1636          * before we clean it out so that its count cannot fall to zero and
1637          * generate a race against ourselves to recycle it.
1638          */
1639         if ((active = vp->v_usecount))
1640                 vp->v_usecount++;
1641
1642         /*
1643          * Prevent the vnode from being recycled or brought into use while we
1644          * clean it out.
1645          */
1646         if (vp->v_flag & VXLOCK)
1647                 panic("vclean: deadlock");
1648         vp->v_flag |= VXLOCK;
1649         /*
1650          * Even if the count is zero, the VOP_INACTIVE routine may still
1651          * have the object locked while it cleans it out. The VOP_LOCK
1652          * ensures that the VOP_INACTIVE routine is done with its work.
1653          * For active vnodes, it ensures that no other activity can
1654          * occur while the underlying object is being cleaned out.
1655          */
1656         VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1657
1658         /*
1659          * Clean out any buffers associated with the vnode.
1660          */
1661         vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
1662         if ((obj = vp->v_object) != NULL) {
1663                 if (obj->ref_count == 0) {
1664                         /*
1665                          * vclean() may be called twice.  The first time removes the
1666                          * primary reference to the object, the second time goes
1667                          * one further and is a special-case to terminate the object.
1668                          */
1669                         vm_object_terminate(obj);
1670                 } else {
1671                         /*
1672                          * Woe to the process that tries to page now :-).
1673                          */
1674                         vm_pager_deallocate(obj);
1675                 }
1676         }
1677
1678         /*
1679          * If purging an active vnode, it must be closed and
1680          * deactivated before being reclaimed. Note that the
1681          * VOP_INACTIVE will unlock the vnode.
1682          */
1683         if (active) {
1684                 if (flags & DOCLOSE)
1685                         VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
1686                 VOP_INACTIVE(vp, p);
1687         } else {
1688                 /*
1689                  * Any other processes trying to obtain this lock must first
1690                  * wait for VXLOCK to clear, then call the new lock operation.
1691                  */
1692                 VOP_UNLOCK(vp, 0, p);
1693         }
1694         /*
1695          * Reclaim the vnode.
               * A non-zero return from VOP_RECLAIM is treated as fatal.
1696          */
1697         if (VOP_RECLAIM(vp, p))
1698                 panic("vclean: cannot reclaim");
1699
1700         if (active) {
1701                 /*
1702                  * Inline copy of vrele() since VOP_INACTIVE
1703                  * has already been called.
                       * This drops the extra use count taken at the top of
                       * the function.
1704                  */
1705                 simple_lock(&vp->v_interlock);
1706                 if (--vp->v_usecount <= 0) {
1707 #ifdef DIAGNOSTIC
1708                         if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1709                                 vprint("vclean: bad ref count", vp);
1710                                 panic("vclean: ref cnt");
1711                         }
1712 #endif
1713                         vfree(vp);
1714                 }
1715                 simple_unlock(&vp->v_interlock);
1716         }
1717
1718         cache_purge(vp);
              /* Release the separately allocated vnode lock, if there is one. */
1719         if (vp->v_vnlock) {
1720                 FREE(vp->v_vnlock, M_VNODE);
1721                 vp->v_vnlock = NULL;
1722         }
1723
              /* Hand the vnode to vfree() if VSHOULDFREE says it qualifies. */
1724         if (VSHOULDFREE(vp))
1725                 vfree(vp);
1726
1727         /*
1728          * Done with purge, notify sleepers of the grim news.
1729          */
1730         vp->v_op = dead_vnodeop_p;
1731         vn_pollgone(vp);
1732         vp->v_tag = VT_NON;
1733         vp->v_flag &= ~VXLOCK;
1734         if (vp->v_flag & VXWANT) {
1735                 vp->v_flag &= ~VXWANT;
1736                 wakeup((caddr_t) vp);
1737         }
1738 }
1739
1740 /*
1741  * Eliminate all activity associated with the requested vnode
1742  * and with all vnodes aliased to the requested vnode.
1743  */
1744 int
1745 vop_revoke(ap)
1746         struct vop_revoke_args /* {
1747                 struct vnode *a_vp;
1748                 int a_flags;
1749         } */ *ap;
1750 {
1751         struct vnode *vp, *vq;
1752         dev_t dev;
1753
1754         KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
1755
1756         vp = ap->a_vp;
1757         /*
1758          * If a vgone (or vclean) is already in progress,
1759          * wait until it is done and return.
1760          */
1761         if (vp->v_flag & VXLOCK) {
1762                 vp->v_flag |= VXWANT;
1763                 simple_unlock(&vp->v_interlock);
1764                 tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1765                 return (0);
1766         }
1767         dev = vp->v_rdev;
1768         for (;;) {
1769                 simple_lock(&spechash_slock);
1770                 vq = SLIST_FIRST(&dev->si_hlist);
1771                 simple_unlock(&spechash_slock);
1772                 if (!vq)
1773                         break;
1774                 vgone(vq);
1775         }
1776         return (0);
1777 }
1778
1779 /*
1780  * Recycle an unused vnode to the front of the free list.
1781  * Release the passed interlock if the vnode will be recycled.
1782  */
1783 int
1784 vrecycle(vp, inter_lkp, p)
1785         struct vnode *vp;
1786         struct simplelock *inter_lkp;
1787         struct proc *p;
1788 {
1789
1790         simple_lock(&vp->v_interlock);
1791         if (vp->v_usecount == 0) {
1792                 if (inter_lkp) {
1793                         simple_unlock(inter_lkp);
1794                 }
1795                 vgonel(vp, p);
1796                 return (1);
1797         }
1798         simple_unlock(&vp->v_interlock);
1799         return (0);
1800 }
1801
1802 /*
1803  * Eliminate all activity associated with a vnode
1804  * in preparation for reuse.
1805  */
1806 void
1807 vgone(vp)
1808         register struct vnode *vp;
1809 {
1810         struct proc *p = curproc;       /* XXX */
1811
1812         simple_lock(&vp->v_interlock);
1813         vgonel(vp, p);
1814 }
1815
1816 /*
1817  * vgone, with the vp interlock held.
      *
      * vp: vnode to eliminate.
      * p:  process passed through to vclean().
      *
      * The interlock is released on every return path.  If another
      * vgone/vclean already owns the vnode (VXLOCK set), this call only
      * requests a wakeup, sleeps on the vnode, and returns without doing
      * any of the work itself.
1818  */
1819 void
1820 vgonel(vp, p)
1821         struct vnode *vp;
1822         struct proc *p;
1823 {
1824         int s;
1825
1826         /*
1827          * If a vgone (or vclean) is already in progress,
1828          * wait until it is done and return.
1829          */
1830         if (vp->v_flag & VXLOCK) {
1831                 vp->v_flag |= VXWANT;
1832                 simple_unlock(&vp->v_interlock);
1833                 tsleep((caddr_t)vp, PINOD, "vgone", 0);
1834                 return;
1835         }
1836
1837         /*
1838          * Clean out the filesystem specific data.
               * The interlock is taken again afterwards for the list
               * manipulation below (vclean presumably dropped it through
               * LK_INTERLOCK -- confirm).
1839          */
1840         vclean(vp, DOCLOSE, p);
1841         simple_lock(&vp->v_interlock);
1842
1843         /*
1844          * Delete from old mount point vnode list, if on one.
1845          */
1846         if (vp->v_mount != NULL)
1847                 insmntque(vp, (struct mount *)0);
1848         /*
1849          * If special device, remove it from special device alias list
1850          * if it is on one.
1851          */
1852         if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
1853                 simple_lock(&spechash_slock);
1854                 SLIST_REMOVE(&vp->v_hashchain, vp, struct vnode, v_specnext);
1855                 freedev(vp->v_rdev);
1856                 simple_unlock(&spechash_slock);
1857                 vp->v_rdev = NULL;
1858         }
1859
1860         /*
1861          * If it is on the freelist and not already at the head,
1862          * move it to the head of the list. The test of the back
1863          * pointer and the reference count of zero is because
1864          * it will be removed from the free list by getnewvnode,
1865          * but will not have its reference count incremented until
1866          * after calling vgone. If the reference count were
1867          * incremented first, vgone would (incorrectly) try to
1868          * close the previous instance of the underlying object.
               *
               * Three cases are handled: already on the free list (just
               * unlink), on the to-be-freed list (unlink, clear VTBFREE and
               * count it as free), or on neither list (count it as free).
               * In every case the vnode ends up flagged VFREE at the head
               * of vnode_free_list.
1869          */
1870         if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
1871                 s = splbio();
1872                 simple_lock(&vnode_free_list_slock);
1873                 if (vp->v_flag & VFREE) {
1874                         TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1875                 } else if (vp->v_flag & VTBFREE) {
1876                         TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
1877                         vp->v_flag &= ~VTBFREE;
1878                         freevnodes++;
1879                 } else
1880                         freevnodes++;
1881                 vp->v_flag |= VFREE;
1882                 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1883                 simple_unlock(&vnode_free_list_slock);
1884                 splx(s);
1885         }
1886
              /* Stamp the vnode with type VBAD and drop the interlock. */
1887         vp->v_type = VBAD;
1888         simple_unlock(&vp->v_interlock);
1889 }
1890
1891 /*
1892  * Lookup a vnode by device number.
1893  */
1894 int
1895 vfinddev(dev, type, vpp)
1896         dev_t dev;
1897         enum vtype type;
1898         struct vnode **vpp;
1899 {
1900         struct vnode *vp;
1901
1902         simple_lock(&spechash_slock);
1903         SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
1904                 if (type == vp->v_type) {
1905                         *vpp = vp;
1906                         simple_unlock(&spechash_slock);
1907                         return (1);
1908                 }
1909         }
1910         simple_unlock(&spechash_slock);
1911         return (0);
1912 }
1913
1914 /*
1915  * Calculate the total number of references to a special device.
1916  */
1917 int
1918 vcount(vp)
1919         struct vnode *vp;
1920 {
1921         struct vnode *vq;
1922         int count;
1923
1924         count = 0;
1925         simple_lock(&spechash_slock);
1926         SLIST_FOREACH(vq, &vp->v_hashchain, v_specnext)
1927                 count += vq->v_usecount;
1928         simple_unlock(&spechash_slock);
1929         return (count);
1930 }
1931
1932 /*
1933  * Same as above, but using the dev_t as argument
1934  */
1935
1936 int
1937 count_dev(dev)
1938         dev_t dev;
1939 {
1940         struct vnode *vp;
1941
1942         vp = SLIST_FIRST(&dev->si_hlist);
1943         if (vp == NULL)
1944                 return (0);
1945         return(vcount(vp));
1946 }
1947
1948 /*
1949  * Print out a description of a vnode.
1950  */
1951 static char *typename[] =
1952 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
1953
1954 void
1955 vprint(label, vp)
1956         char *label;
1957         struct vnode *vp;
1958 {
1959         char buf[96];
1960
1961         if (label != NULL)
1962                 printf("%s: %p: ", label, (void *)vp);
1963         else
1964                 printf("%p: ", (void *)vp);
1965         printf("type %s, usecount %d, writecount %d, refcount %d,",
1966             typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1967             vp->v_holdcnt);
1968         buf[0] = '\0';
1969         if (vp->v_flag & VROOT)
1970                 strcat(buf, "|VROOT");
1971         if (vp->v_flag & VTEXT)
1972                 strcat(buf, "|VTEXT");
1973         if (vp->v_flag & VSYSTEM)
1974                 strcat(buf, "|VSYSTEM");
1975         if (vp->v_flag & VXLOCK)
1976                 strcat(buf, "|VXLOCK");
1977         if (vp->v_flag & VXWANT)
1978                 strcat(buf, "|VXWANT");
1979         if (vp->v_flag & VBWAIT)
1980                 strcat(buf, "|VBWAIT");
1981         if (vp->v_flag & VDOOMED)
1982                 strcat(buf, "|VDOOMED");
1983         if (vp->v_flag & VFREE)
1984                 strcat(buf, "|VFREE");
1985         if (vp->v_flag & VOBJBUF)
1986                 strcat(buf, "|VOBJBUF");
1987         if (buf[0] != '\0')
1988                 printf(" flags (%s)", &buf[1]);
1989         if (vp->v_data == NULL) {
1990                 printf("\n");
1991         } else {
1992                 printf("\n\t");
1993                 VOP_PRINT(vp);
1994         }
1995 }
1996
1997 #ifdef DDB
1998 #include <ddb/ddb.h>
1999 /*
2000  * List all of the locked vnodes in the system.
2001  * Called when debugging the kernel.
      *
      * Walks the mount list; every mount that can be busied without
      * waiting has its vnode list scanned, and each vnode for which
      * VOP_ISLOCKED reports true is printed with vprint().  Mounts for
      * which vfs_busy() fails are skipped.
2002  */
2003 DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
2004 {
2005         struct proc *p = curproc;       /* XXX */
2006         struct mount *mp, *nmp;
2007         struct vnode *vp;
2008
2009         printf("Locked vnodes\n");
2010         simple_lock(&mountlist_slock);
2011         for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
                      /* LK_NOWAIT: never sleep for a busy mount, just skip it. */
2012                 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2013                         nmp = TAILQ_NEXT(mp, mnt_list);
2014                         continue;
2015                 }
2016                 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
2017                         if (VOP_ISLOCKED(vp, NULL))
2018                                 vprint((char *)0, vp);
2019                 }
                      /*
                       * Take the list lock again before stepping to the next
                       * mount, and only then release this one.
                       */
2020                 simple_lock(&mountlist_slock);
2021                 nmp = TAILQ_NEXT(mp, mnt_list);
2022                 vfs_unbusy(mp, p);
2023         }
2024         simple_unlock(&mountlist_slock);
2025 }
2026 #endif
2027
2028 /*
2029  * Top level filesystem related information gathering.
2030  */
2031 static int      sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);
2032
2033 static int
2034 vfs_sysctl SYSCTL_HANDLER_ARGS
2035 {
2036         int *name = (int *)arg1 - 1;    /* XXX */
2037         u_int namelen = arg2 + 1;       /* XXX */
2038         struct vfsconf *vfsp;
2039
2040 #if 1 || defined(COMPAT_PRELITE2)
2041         /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2042         if (namelen == 1)
2043                 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2044 #endif
2045
2046 #ifdef notyet
2047         /* all sysctl names at this level are at least name and field */
2048         if (namelen < 2)
2049                 return (ENOTDIR);               /* overloaded */
2050         if (name[0] != VFS_GENERIC) {
2051                 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2052                         if (vfsp->vfc_typenum == name[0])
2053                                 break;
2054                 if (vfsp == NULL)
2055                         return (EOPNOTSUPP);
2056                 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2057                     oldp, oldlenp, newp, newlen, p));
2058         }
2059 #endif
2060         switch (name[1]) {
2061         case VFS_MAXTYPENUM:
2062                 if (namelen != 2)
2063                         return (ENOTDIR);
2064                 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2065         case VFS_CONF:
2066                 if (namelen != 3)
2067                         return (ENOTDIR);       /* overloaded */
2068                 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2069                         if (vfsp->vfc_typenum == name[2])
2070                                 break;
2071                 if (vfsp == NULL)
2072                         return (EOPNOTSUPP);
2073                 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
2074         }
2075         return (EOPNOTSUPP);
2076 }
2077
2078 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
2079         "Generic filesystem");
2080
2081 #if 1 || defined(COMPAT_PRELITE2)
2082
2083 static int
2084 sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
2085 {
2086         int error;
2087         struct vfsconf *vfsp;
2088         struct ovfsconf ovfs;
2089
2090         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2091                 ovfs.vfc_vfsops = vfsp->vfc_vfsops;     /* XXX used as flag */
2092                 strcpy(ovfs.vfc_name, vfsp->vfc_name);
2093                 ovfs.vfc_index = vfsp->vfc_typenum;
2094                 ovfs.vfc_refcount = vfsp->vfc_refcount;
2095                 ovfs.vfc_flags = vfsp->vfc_flags;
2096                 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2097                 if (error)
2098                         return error;
2099         }
2100         return 0;
2101 }
2102
2103 #endif /* 1 || COMPAT_PRELITE2 */
2104
2105 #if 0
2106 #define KINFO_VNODESLOP 10
2107 /*
2108  * Dump vnode list (via sysctl).
2109  * Copyout address of vnode followed by vnode.
      *
      * This block is compiled out (#if 0).  When no output buffer is
      * supplied it only reports a size estimate: room for numvnodes
      * plus KINFO_VNODESLOP entries, each a vnode pointer followed by
      * the vnode itself.
      *
      * NOTE(review): the early "return (error)" inside the vnode loop
      * leaves the function after vfs_busy() succeeded but without the
      * matching vfs_unbusy() -- check before ever re-enabling this code.
2110  */
2111 /* ARGSUSED */
2112 static int
2113 sysctl_vnode SYSCTL_HANDLER_ARGS
2114 {
2115         struct proc *p = curproc;       /* XXX */
2116         struct mount *mp, *nmp;
2117         struct vnode *nvp, *vp;
2118         int error;
2119
2120 #define VPTRSZ  sizeof (struct vnode *)
2121 #define VNODESZ sizeof (struct vnode)
2122
2123         req->lock = 0;
2124         if (!req->oldptr) /* Make an estimate */
2125                 return (SYSCTL_OUT(req, 0,
2126                         (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
2127
2128         simple_lock(&mountlist_slock);
2129         for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
                      /* Mounts that cannot be busied without waiting are skipped. */
2130                 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2131                         nmp = TAILQ_NEXT(mp, mnt_list);
2132                         continue;
2133                 }
2134 again:
2135                 simple_lock(&mntvnode_slock);
2136                 for (vp = LIST_FIRST(&mp->mnt_vnodelist);
2137                      vp != NULL;
2138                      vp = nvp) {
2139                         /*
2140                          * Check that the vp is still associated with
2141                          * this filesystem.  RACE: could have been
2142                          * recycled onto the same filesystem.
2143                          */
2144                         if (vp->v_mount != mp) {
2145                                 simple_unlock(&mntvnode_slock);
2146                                 goto again;
2147                         }
2148                         nvp = LIST_NEXT(vp, v_mntvnodes);
                              /* The list lock is dropped around the copy-out. */
2149                         simple_unlock(&mntvnode_slock);
2150                         if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
2151                             (error = SYSCTL_OUT(req, vp, VNODESZ)))
2152                                 return (error);
2153                         simple_lock(&mntvnode_slock);
2154                 }
2155                 simple_unlock(&mntvnode_slock);
2156                 simple_lock(&mountlist_slock);
2157                 nmp = TAILQ_NEXT(mp, mnt_list);
2158                 vfs_unbusy(mp, p);
2159         }
2160         simple_unlock(&mountlist_slock);
2161
2162         return (0);
2163 }
2164 #endif
2165
2166 /*
2167  * XXX
2168  * Exporting the vnode list on large systems causes them to crash.
2169  * Exporting the vnode list on medium systems causes sysctl to coredump.
2170  */
2171 #if 0
2172 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2173         0, 0, sysctl_vnode, "S,vnode", "");
2174 #endif
2175
2176 /*
2177  * Check to see if a filesystem is mounted on a block device.
2178  */
2179 int
2180 vfs_mountedon(vp)
2181         struct vnode *vp;
2182 {
2183
2184         if (vp->v_specmountpoint != NULL)
2185                 return (EBUSY);
2186         return (0);
2187 }
2188
2189 /*
2190  * Unmount all filesystems. The list is traversed in reverse order
2191  * of mounting to avoid dependencies.
2192  */
2193 void
2194 vfs_unmountall()
2195 {
2196         struct mount *mp;
2197         struct proc *p;
2198         int error;
2199
2200         if (curproc != NULL)
2201                 p = curproc;
2202         else
2203                 p = initproc;   /* XXX XXX should this be proc0? */
2204         /*
2205          * Since this only runs when rebooting, it is not interlocked.
2206          */
2207         while(!TAILQ_EMPTY(&mountlist)) {
2208                 mp = TAILQ_LAST(&mountlist, mntlist);
2209                 error = dounmount(mp, MNT_FORCE, p);
2210                 if (error) {
2211                         TAILQ_REMOVE(&mountlist, mp, mnt_list);
2212                         printf("unmount of %s failed (",
2213                             mp->mnt_stat.f_mntonname);
2214                         if (error == EBUSY)
2215                                 printf("BUSY)\n");
2216                         else
2217                                 printf("%d)\n", error);
2218                 } else {
2219                         /* The unmount has removed mp from the mountlist */
2220                 }
2221         }
2222 }
2223
2224 /*
2225  * Build hash lists of net addresses and hang them off the mount point.
2226  * Called by ufs_mount() to set up the lists of export addresses.
2227  */
2228 static int
2229 vfs_hang_addrlist(mp, nep, argp)
2230         struct mount *mp;
2231         struct netexport *nep;
2232         struct export_args *argp;
2233 {
2234         register struct netcred *np;
2235         register struct radix_node_head *rnh;
2236         register int i;
2237         struct radix_node *rn;
2238         struct sockaddr *saddr, *smask = 0;
2239         struct domain *dom;
2240         int error;
2241
2242         if (argp->ex_addrlen == 0) {
2243                 if (mp->mnt_flag & MNT_DEFEXPORTED)
2244                         return (EPERM);
2245                 np = &nep->ne_defexported;
2246                 np->netc_exflags = argp->ex_flags;
2247                 np->netc_anon = argp->ex_anon;
2248                 np->netc_anon.cr_ref = 1;
2249                 mp->mnt_flag |= MNT_DEFEXPORTED;
2250                 return (0);
2251         }
2252         i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2253         np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
2254         bzero((caddr_t) np, i);
2255         saddr = (struct sockaddr *) (np + 1);
2256         if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
2257                 goto out;
2258         if (saddr->sa_len > argp->ex_addrlen)
2259                 saddr->sa_len = argp->ex_addrlen;
2260         if (argp->ex_masklen) {
2261                 smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
2262                 error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
2263                 if (error)
2264                         goto out;
2265                 if (smask->sa_len > argp->ex_masklen)
2266                         smask->sa_len = argp->ex_masklen;
2267         }
2268         i = saddr->sa_family;
2269         if ((rnh = nep->ne_rtable[i]) == 0) {
2270                 /*
2271                  * Seems silly to initialize every AF when most are not used,
2272                  * do so on demand here
2273                  */
2274                 for (dom = domains; dom; dom = dom->dom_next)
2275                         if (dom->dom_family == i && dom->dom_rtattach) {
2276                                 dom->dom_rtattach((void **) &nep->ne_rtable[i],
2277                                     dom->dom_rtoffset);
2278                                 break;
2279                         }
2280                 if ((rnh = nep->ne_rtable[i]) == 0) {
2281                         error = ENOBUFS;
2282                         goto out;
2283                 }
2284         }
2285         rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
2286             np->netc_rnodes);
2287         if (rn == 0 || np != (struct netcred *) rn) {   /* already exists */
2288                 error = EPERM;
2289                 goto out;
2290         }
2291         np->netc_exflags = argp->ex_flags;
2292         np->netc_anon = argp->ex_anon;
2293         np->netc_anon.cr_ref = 1;
2294         return (0);
2295 out:
2296         free(np, M_NETADDR);
2297         return (error);
2298 }
2299
2300 /* ARGSUSED */
2301 static int
2302 vfs_free_netcred(rn, w)
2303         struct radix_node *rn;
2304         void *w;
2305 {
2306         register struct radix_node_head *rnh = (struct radix_node_head *) w;
2307
2308         (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
2309         free((caddr_t) rn, M_NETADDR);
2310         return (0);
2311 }
2312
2313 /*
2314  * Free the net address hash lists that are hanging off the mount points.
2315  */
2316 static void
2317 vfs_free_addrlist(nep)
2318         struct netexport *nep;
2319 {
2320         register int i;
2321         register struct radix_node_head *rnh;
2322
2323         for (i = 0; i <= AF_MAX; i++)
2324                 if ((rnh = nep->ne_rtable[i])) {
2325                         (*rnh->rnh_walktree) (rnh, vfs_free_netcred,
2326                             (caddr_t) rnh);
2327                         free((caddr_t) rnh, M_RTABLE);
2328                         nep->ne_rtable[i] = 0;
2329                 }
2330 }
2331
2332 int
2333 vfs_export(mp, nep, argp)
2334         struct mount *mp;
2335         struct netexport *nep;
2336         struct export_args *argp;
2337 {
2338         int error;
2339
2340         if (argp->ex_flags & MNT_DELEXPORT) {
2341                 if (mp->mnt_flag & MNT_EXPUBLIC) {
2342                         vfs_setpublicfs(NULL, NULL, NULL);
2343                         mp->mnt_flag &= ~MNT_EXPUBLIC;
2344                 }
2345                 vfs_free_addrlist(nep);
2346                 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2347         }
2348         if (argp->ex_flags & MNT_EXPORTED) {
2349                 if (argp->ex_flags & MNT_EXPUBLIC) {
2350                         if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2351                                 return (error);
2352                         mp->mnt_flag |= MNT_EXPUBLIC;
2353                 }
2354                 if ((error = vfs_hang_addrlist(mp, nep, argp)))
2355                         return (error);
2356                 mp->mnt_flag |= MNT_EXPORTED;
2357         }
2358         return (0);
2359 }
2360
2361
2362 /*
2363  * Set the publicly exported filesystem (WebNFS). Currently, only
2364  * one public filesystem is possible in the spec (RFC 2054 and 2055)
2365  */
2366 int
2367 vfs_setpublicfs(mp, nep, argp)
2368         struct mount *mp;
2369         struct netexport *nep;
2370         struct export_args *argp;
2371 {
2372         int error;
2373         struct vnode *rvp;
2374         char *cp;
2375
2376         /*
2377          * mp == NULL -> invalidate the current info, the FS is
2378          * no longer exported. May be called from either vfs_export
2379          * or unmount, so check if it hasn't already been done.
2380          */
2381         if (mp == NULL) {
2382                 if (nfs_pub.np_valid) {
2383                         nfs_pub.np_valid = 0;
2384                         if (nfs_pub.np_index != NULL) {
2385                                 FREE(nfs_pub.np_index, M_TEMP);
2386                                 nfs_pub.np_index = NULL;
2387                         }
2388                 }
2389                 return (0);
2390         }
2391
2392         /*
2393          * Only one allowed at a time.
2394          */
2395         if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2396                 return (EBUSY);
2397
2398         /*
2399          * Get real filehandle for root of exported FS.
2400          */
2401         bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
2402         nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2403
2404         if ((error = VFS_ROOT(mp, &rvp)))
2405                 return (error);
2406
2407         if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
2408                 return (error);
2409
2410         vput(rvp);
2411
2412         /*
2413          * If an indexfile was specified, pull it in.
2414          */
2415         if (argp->ex_indexfile != NULL) {
2416                 MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2417                     M_WAITOK);
2418                 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2419                     MAXNAMLEN, (size_t *)0);
2420                 if (!error) {
2421                         /*
2422                          * Check for illegal filenames.
2423                          */
2424                         for (cp = nfs_pub.np_index; *cp; cp++) {
2425                                 if (*cp == '/') {
2426                                         error = EINVAL;
2427                                         break;
2428                                 }
2429                         }
2430                 }
2431                 if (error) {
2432                         FREE(nfs_pub.np_index, M_TEMP);
2433                         return (error);
2434                 }
2435         }
2436
2437         nfs_pub.np_mount = mp;
2438         nfs_pub.np_valid = 1;
2439         return (0);
2440 }
2441
2442 struct netcred *
2443 vfs_export_lookup(mp, nep, nam)
2444         register struct mount *mp;
2445         struct netexport *nep;
2446         struct sockaddr *nam;
2447 {
2448         register struct netcred *np;
2449         register struct radix_node_head *rnh;
2450         struct sockaddr *saddr;
2451
2452         np = NULL;
2453         if (mp->mnt_flag & MNT_EXPORTED) {
2454                 /*
2455                  * Lookup in the export list first.
2456                  */
2457                 if (nam != NULL) {
2458                         saddr = nam;
2459                         rnh = nep->ne_rtable[saddr->sa_family];
2460                         if (rnh != NULL) {
2461                                 np = (struct netcred *)
2462                                         (*rnh->rnh_matchaddr)((caddr_t)saddr,
2463                                                               rnh);
2464                                 if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2465                                         np = NULL;
2466                         }
2467                 }
2468                 /*
2469                  * If no address match, use the default if it exists.
2470                  */
2471                 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2472                         np = &nep->ne_defexported;
2473         }
2474         return (np);
2475 }
2476
2477 /*
2478  * perform msync on all vnodes under a mount point
2479  * the mount point must be locked.
      *
      * mp:    mount point whose vnode list is scanned.
      * flags: MNT_WAIT selects synchronous page cleaning (OBJPC_SYNC);
      *        any other value selects OBJPC_NOSYNC and additionally
      *        skips vnodes that are locked or whose object is not
      *        marked OBJ_MIGHTBEDIRTY.
      *
      * The whole list is rescanned while a pass still cleaned something,
      * for at most 5 passes in total.
2480  */
2481 void
2482 vfs_msync(struct mount *mp, int flags) {
2483         struct vnode *vp, *nvp;
2484         struct vm_object *obj;
2485         int anyio, tries;
2486
2487         tries = 5;
2488 loop:
2489         anyio = 0;
2490         for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) {
2491
2492                 nvp = LIST_NEXT(vp, v_mntvnodes);
2493
                      /*
                       * The vnode no longer belongs to this mount: restart
                       * the scan from the head (the pass budget in 'tries'
                       * is not consumed by this restart).
                       */
2494                 if (vp->v_mount != mp) {
2495                         goto loop;
2496                 }
2497
2498                 if (vp->v_flag & VXLOCK)        /* XXX: what if MNT_WAIT? */
2499                         continue;
2500
                      /* Cheap unlocked pre-checks, only when not waiting. */
2501                 if (flags != MNT_WAIT) {
2502                         obj = vp->v_object;
2503                         if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
2504                                 continue;
2505                         if (VOP_ISLOCKED(vp, NULL))
2506                                 continue;
2507                 }
2508
                      /*
                       * Re-check under the interlock.  On the dirty path the
                       * interlock is handed to vget() via LK_INTERLOCK; on the
                       * other path it is released explicitly.
                       */
2509                 simple_lock(&vp->v_interlock);
2510                 if (vp->v_object &&
2511                    (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
2512                         if (!vget(vp,
2513                                 LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
                                      /* The object is tested again after vget(). */
2514                                 if (vp->v_object) {
2515                                         vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
2516                                         anyio = 1;
2517                                 }
2518                                 vput(vp);
2519                         }
2520                 } else {
2521                         simple_unlock(&vp->v_interlock);
2522                 }
2523         }
2524         if (anyio && (--tries > 0))
2525                 goto loop;
2526 }
2527
2528 /*
2529  * Create the VM object needed for VMIO and mmap support.  This
2530  * is done for all VREG files in the system.  Some filesystems might
2531  * afford the additional metadata buffering capability of the
2532  * VMIO code by making the device node be VMIO mode also.
2533  *
2534  * vp must be locked when vfs_object_create is called.
      *
      * vp:   vnode that should get an object.
      * p:    process used for the VOP_GETATTR / lock calls.
      * cred: credentials used for VOP_GETATTR.
      *
      * Returns 0, or the error from VOP_GETATTR.  Vnodes that are
      * neither disks nor VMIO-capable are left untouched, and so are
      * vnodes that are neither VREG/VDIR nor backed by a device switch
      * entry; both cases return 0.  On the success path with an object
      * present, VOBJBUF is set in the vnode flags.
2535  */
2536 int
2537 vfs_object_create(vp, p, cred)
2538         struct vnode *vp;
2539         struct proc *p;
2540         struct ucred *cred;
2541 {
2542         struct vattr vat;
2543         vm_object_t object;
2544         int error = 0;
2545
2546         if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE)
2547                 return 0;
2548
2549 retry:
2550         if ((object = vp->v_object) == NULL) {
2551                 if (vp->v_type == VREG || vp->v_type == VDIR) {
                              /* Size the object from the file's current size. */
2552                         if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
2553                                 goto retn;
2554                         object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
2555                 } else if (devsw(vp->v_rdev) != NULL) {
2556                         /*
2557                          * This simply allocates the biggest object possible
2558                          * for a disk vnode.  This should be fixed, but doesn't
2559                          * cause any problems (yet).
2560                          */
2561                         object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
2562                 } else {
2563                         goto retn;
2564                 }
2565                 /*
2566                  * Dereference the reference we just created.  This assumes
2567                  * that the object is associated with the vp.
2568                  */
2569                 object->ref_count--;
2570                 vp->v_usecount--;
2571         } else {
                      /*
                       * An object exists but is flagged dead: drop the vnode
                       * lock, sleep on the object, take the lock again and
                       * start over.
                       */
2572                 if (object->flags & OBJ_DEAD) {
2573                         VOP_UNLOCK(vp, 0, p);
2574                         tsleep(object, PVM, "vodead", 0);
2575                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
2576                         goto retry;
2577                 }
2578         }
2579
2580         KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
2581         vp->v_flag |= VOBJBUF;
2582
2583 retn:
2584         return error;
2585 }
2586
2587 static void
2588 vfree(vp)
2589         struct vnode *vp;
2590 {
2591         int s;
2592
2593         s = splbio();
2594         simple_lock(&vnode_free_list_slock);
2595         if (vp->v_flag & VTBFREE) {
2596                 TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2597                 vp->v_flag &= ~VTBFREE;
2598         }
2599         if (vp->v_flag & VAGE) {
2600                 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2601         } else {
2602                 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2603         }
2604         freevnodes++;
2605         simple_unlock(&vnode_free_list_slock);
2606         vp->v_flag &= ~VAGE;
2607         vp->v_flag |= VFREE;
2608         splx(s);
2609 }
2610
2611 void
2612 vbusy(vp)
2613         struct vnode *vp;
2614 {
2615         int s;
2616
2617         s = splbio();
2618         simple_lock(&vnode_free_list_slock);
2619         if (vp->v_flag & VTBFREE) {
2620                 TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2621                 vp->v_flag &= ~VTBFREE;
2622         } else {
2623                 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2624                 freevnodes--;
2625         }
2626         simple_unlock(&vnode_free_list_slock);
2627         vp->v_flag &= ~(VFREE|VAGE);
2628         splx(s);
2629 }
2630
2631 /*
2632  * Record a process's interest in events which might happen to
2633  * a vnode.  Because poll uses the historic select-style interface
2634  * internally, this routine serves as both the ``check for any
2635  * pending events'' and the ``record my interest in future events''
2636  * functions.  (These are done together, while the lock is held,
2637  * to avoid race conditions.)
2638  */
2639 int
2640 vn_pollrecord(vp, p, events)
2641         struct vnode *vp;
2642         struct proc *p;
2643         short events;
2644 {
2645         simple_lock(&vp->v_pollinfo.vpi_lock);
2646         if (vp->v_pollinfo.vpi_revents & events) {
2647                 /*
2648                  * This leaves events we are not interested
2649                  * in available for the other process which
2650                  * which presumably had requested them
2651                  * (otherwise they would never have been
2652                  * recorded).
2653                  */
2654                 events &= vp->v_pollinfo.vpi_revents;
2655                 vp->v_pollinfo.vpi_revents &= ~events;
2656
2657                 simple_unlock(&vp->v_pollinfo.vpi_lock);
2658                 return events;
2659         }
2660         vp->v_pollinfo.vpi_events |= events;
2661         selrecord(p, &vp->v_pollinfo.vpi_selinfo);
2662         simple_unlock(&vp->v_pollinfo.vpi_lock);
2663         return 0;
2664 }
2665
2666 /*
2667  * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
2668  * it is possible for us to miss an event due to race conditions, but
2669  * that condition is expected to be rare, so for the moment it is the
2670  * preferred interface.
2671  */
2672 void
2673 vn_pollevent(vp, events)
2674         struct vnode *vp;
2675         short events;
2676 {
2677         simple_lock(&vp->v_pollinfo.vpi_lock);
2678         if (vp->v_pollinfo.vpi_events & events) {
2679                 /*
2680                  * We clear vpi_events so that we don't
2681                  * call selwakeup() twice if two events are
2682                  * posted before the polling process(es) is
2683                  * awakened.  This also ensures that we take at
2684                  * most one selwakeup() if the polling process
2685                  * is no longer interested.  However, it does
2686                  * mean that only one event can be noticed at
2687                  * a time.  (Perhaps we should only clear those
2688                  * event bits which we note?) XXX
2689                  */
2690                 vp->v_pollinfo.vpi_events = 0;  /* &= ~events ??? */
2691                 vp->v_pollinfo.vpi_revents |= events;
2692                 selwakeup(&vp->v_pollinfo.vpi_selinfo);
2693         }
2694         simple_unlock(&vp->v_pollinfo.vpi_lock);
2695 }
2696
2697 /*
2698  * Wake up anyone polling on vp because it is being revoked.
2699  * This depends on dead_poll() returning POLLHUP for correct
2700  * behavior.
2701  */
2702 void
2703 vn_pollgone(vp)
2704         struct vnode *vp;
2705 {
2706         simple_lock(&vp->v_pollinfo.vpi_lock);
2707         if (vp->v_pollinfo.vpi_events) {
2708                 vp->v_pollinfo.vpi_events = 0;
2709                 selwakeup(&vp->v_pollinfo.vpi_selinfo);
2710         }
2711         simple_unlock(&vp->v_pollinfo.vpi_lock);
2712 }
2713
2714
2715
/*
 * Routine to create and manage a filesystem syncer vnode.
 *
 * The syncer vnode implements only the operations listed in
 * sync_vnodeop_entries below; any other operation reaches the default
 * entry, vop_eopnotsupp.  close is mapped onto nullop, and lock,
 * unlock and islocked onto vop_nolock, vop_nounlock and
 * vop_noislocked respectively, through the casting macros that follow.
 */
#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
static int      sync_fsync __P((struct  vop_fsync_args *));
static int      sync_inactive __P((struct  vop_inactive_args *));
static int      sync_reclaim  __P((struct  vop_reclaim_args *));
#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
static int      sync_print __P((struct vop_print_args *));
#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)

/* Operations vector for syncer vnodes; it is passed to getnewvnode(). */
static vop_t **sync_vnodeop_p;
/* Operation descriptor -> implementation pairs, terminated by a NULL pair. */
static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
        { &vop_default_desc,    (vop_t *) vop_eopnotsupp },
        { &vop_close_desc,      (vop_t *) sync_close },         /* close */
        { &vop_fsync_desc,      (vop_t *) sync_fsync },         /* fsync */
        { &vop_inactive_desc,   (vop_t *) sync_inactive },      /* inactive */
        { &vop_reclaim_desc,    (vop_t *) sync_reclaim },       /* reclaim */
        { &vop_lock_desc,       (vop_t *) sync_lock },          /* lock */
        { &vop_unlock_desc,     (vop_t *) sync_unlock },        /* unlock */
        { &vop_print_desc,      (vop_t *) sync_print },         /* print */
        { &vop_islocked_desc,   (vop_t *) sync_islocked },      /* islocked */
        { NULL, NULL }
};
/* Ties the entry table above to the sync_vnodeop_p vector pointer. */
static struct vnodeopv_desc sync_vnodeop_opv_desc =
        { &sync_vnodeop_p, sync_vnodeop_entries };

VNODEOP_SET(sync_vnodeop_opv_desc);
2745
2746 /*
2747  * Create a new filesystem syncer vnode for the specified mount point.
2748  */
2749 int
2750 vfs_allocate_syncvnode(mp)
2751         struct mount *mp;
2752 {
2753         struct vnode *vp;
2754         static long start, incr, next;
2755         int error;
2756
2757         /* Allocate a new vnode */
2758         if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
2759                 mp->mnt_syncer = NULL;
2760                 return (error);
2761         }
2762         vp->v_type = VNON;
2763         /*
2764          * Place the vnode onto the syncer worklist. We attempt to
2765          * scatter them about on the list so that they will go off
2766          * at evenly distributed times even if all the filesystems
2767          * are mounted at once.
2768          */
2769         next += incr;
2770         if (next == 0 || next > syncer_maxdelay) {
2771                 start /= 2;
2772                 incr /= 2;
2773                 if (start == 0) {
2774                         start = syncer_maxdelay / 2;
2775                         incr = syncer_maxdelay;
2776                 }
2777                 next = start;
2778         }
2779         vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
2780         mp->mnt_syncer = vp;
2781         return (0);
2782 }
2783
2784 /*
2785  * Do a lazy sync of the filesystem.
2786  */
2787 static int
2788 sync_fsync(ap)
2789         struct vop_fsync_args /* {
2790                 struct vnode *a_vp;
2791                 struct ucred *a_cred;
2792                 int a_waitfor;
2793                 struct proc *a_p;
2794         } */ *ap;
2795 {
2796         struct vnode *syncvp = ap->a_vp;
2797         struct mount *mp = syncvp->v_mount;
2798         struct proc *p = ap->a_p;
2799         int asyncflag;
2800
2801         /*
2802          * We only need to do something if this is a lazy evaluation.
2803          */
2804         if (ap->a_waitfor != MNT_LAZY)
2805                 return (0);
2806
2807         /*
2808          * Move ourselves to the back of the sync list.
2809          */
2810         vn_syncer_add_to_worklist(syncvp, syncdelay);
2811
2812         /*
2813          * Walk the list of vnodes pushing all that are dirty and
2814          * not already on the sync list.
2815          */
2816         simple_lock(&mountlist_slock);
2817         if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
2818                 simple_unlock(&mountlist_slock);
2819                 return (0);
2820         }
2821         asyncflag = mp->mnt_flag & MNT_ASYNC;
2822         mp->mnt_flag &= ~MNT_ASYNC;
2823         vfs_msync(mp, MNT_NOWAIT);
2824         VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
2825         if (asyncflag)
2826                 mp->mnt_flag |= MNT_ASYNC;
2827         vfs_unbusy(mp, p);
2828         return (0);
2829 }
2830
2831 /*
2832  * The syncer vnode is no referenced.
2833  */
2834 static int
2835 sync_inactive(ap)
2836         struct vop_inactive_args /* {
2837                 struct vnode *a_vp;
2838                 struct proc *a_p;
2839         } */ *ap;
2840 {
2841
2842         vgone(ap->a_vp);
2843         return (0);
2844 }
2845
2846 /*
2847  * The syncer vnode is no longer needed and is being decommissioned.
2848  *
2849  * Modifications to the worklist must be protected at splbio().
2850  */
2851 static int
2852 sync_reclaim(ap)
2853         struct vop_reclaim_args /* {
2854                 struct vnode *a_vp;
2855         } */ *ap;
2856 {
2857         struct vnode *vp = ap->a_vp;
2858         int s;
2859
2860         s = splbio();
2861         vp->v_mount->mnt_syncer = NULL;
2862         if (vp->v_flag & VONWORKLST) {
2863                 LIST_REMOVE(vp, v_synclist);
2864                 vp->v_flag &= ~VONWORKLST;
2865         }
2866         splx(s);
2867
2868         return (0);
2869 }
2870
2871 /*
2872  * Print out a syncer vnode.
2873  */
2874 static int
2875 sync_print(ap)
2876         struct vop_print_args /* {
2877                 struct vnode *a_vp;
2878         } */ *ap;
2879 {
2880         struct vnode *vp = ap->a_vp;
2881
2882         printf("syncer vnode");
2883         if (vp->v_vnlock != NULL)
2884                 lockmgr_printinfo(vp->v_vnlock);
2885         printf("\n");
2886         return (0);
2887 }
2888
2889 /*
2890  * extract the dev_t from a VBLK or VCHR
2891  */
2892 dev_t
2893 vn_todev(vp)
2894         struct vnode *vp;
2895 {
2896         if (vp->v_type != VBLK && vp->v_type != VCHR)
2897                 return (NODEV);
2898         return (vp->v_rdev);
2899 }
2900
2901 /*
2902  * Check if vnode represents a disk device
2903  */
2904 int
2905 vn_isdisk(vp, errp)
2906         struct vnode *vp;
2907         int *errp;
2908 {
2909         if (vp->v_type != VBLK && vp->v_type != VCHR) {
2910                 if (errp != NULL)
2911                         *errp = ENOTBLK;
2912                 return (0);
2913         }
2914         if (vp->v_rdev == NULL) {
2915                 if (errp != NULL)
2916                         *errp = ENXIO;
2917                 return (0);
2918         }
2919         if (!devsw(vp->v_rdev)) {
2920                 if (errp != NULL)
2921                         *errp = ENXIO;
2922                 return (0);
2923         }
2924         if (!(devsw(vp->v_rdev)->d_flags & D_DISK)) {
2925                 if (errp != NULL)
2926                         *errp = ENOTBLK;
2927                 return (0);
2928         }
2929         if (errp != NULL)
2930                 *errp = 0;
2931         return (1);
2932 }
2933
2934 void
2935 NDFREE(ndp, flags)
2936      struct nameidata *ndp;
2937      const uint flags;
2938 {
2939         if (!(flags & NDF_NO_FREE_PNBUF) &&
2940             (ndp->ni_cnd.cn_flags & HASBUF)) {
2941                 zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
2942                 ndp->ni_cnd.cn_flags &= ~HASBUF;
2943         }
2944         if (!(flags & NDF_NO_DVP_UNLOCK) &&
2945             (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
2946             ndp->ni_dvp != ndp->ni_vp)
2947                 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc);
2948         if (!(flags & NDF_NO_DVP_RELE) &&
2949             (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
2950                 vrele(ndp->ni_dvp);
2951                 ndp->ni_dvp = NULL;
2952         }
2953         if (!(flags & NDF_NO_VP_UNLOCK) &&
2954             (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
2955                 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc);
2956         if (!(flags & NDF_NO_VP_RELE) &&
2957             ndp->ni_vp) {
2958                 vrele(ndp->ni_vp);
2959                 ndp->ni_vp = NULL;
2960         }
2961         if (!(flags & NDF_NO_STARTDIR_RELE) &&
2962             (ndp->ni_cnd.cn_flags & SAVESTART)) {
2963                 vrele(ndp->ni_startdir);
2964                 ndp->ni_startdir = NULL;
2965         }
2966 }