sys/kern/vfs_bio.c

   1 /*-
   2  * Copyright (c) 2004 Poul-Henning Kamp
   3  * Copyright (c) 1994,1997 John S. Dyson
   4  * Copyright (c) 2013 The FreeBSD Foundation
   5  * All rights reserved.
   6  *
   7  * Portions of this software were developed by Konstantin Belousov
   8  * under sponsorship from the FreeBSD Foundation.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  29  * SUCH DAMAGE.
  30  */
  31
  32 /*
  33  * this file contains a new buffer I/O scheme implementing a coherent
  34  * VM object and buffer cache scheme.  Pains have been taken to make
  35  * sure that the performance degradation associated with schemes such
  36  * as this is not realized.
  37  *
  38  * Author:  John S. Dyson
  39  * Significant help during the development and debugging phases
  40  * had been provided by David Greenman, also of the FreeBSD core team.
  41  *
  42  * see man buf(9) for more info.
  43  */
  44
  45 #include <sys/cdefs.h>
  46 __FBSDID("$FreeBSD$");
  47
  48 #include <sys/param.h>
  49 #include <sys/systm.h>
  50 #include <sys/bio.h>
  51 #include <sys/conf.h>
  52 #include <sys/buf.h>
  53 #include <sys/devicestat.h>
  54 #include <sys/eventhandler.h>
  55 #include <sys/fail.h>
  56 #include <sys/limits.h>
  57 #include <sys/lock.h>
  58 #include <sys/malloc.h>
  59 #include <sys/mount.h>
  60 #include <sys/mutex.h>
  61 #include <sys/kernel.h>
  62 #include <sys/kthread.h>
  63 #include <sys/proc.h>
  64 #include <sys/resourcevar.h>
  65 #include <sys/rwlock.h>
  66 #include <sys/sysctl.h>
  67 #include <sys/vmem.h>
  68 #include <sys/vmmeter.h>
  69 #include <sys/vnode.h>
  70 #include <geom/geom.h>
  71 #include <vm/vm.h>
  72 #include <vm/vm_param.h>
  73 #include <vm/vm_kern.h>
  74 #include <vm/vm_pageout.h>
  75 #include <vm/vm_page.h>
  76 #include <vm/vm_object.h>
  77 #include <vm/vm_extern.h>
  78 #include <vm/vm_map.h>
  79 #include "opt_compat.h"
  80 #include "opt_swap.h"
  81
/* malloc(9) type "biobuf" ("BIO buffer"), private to this file. */
static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");

struct  bio_ops bioops;         /* I/O operation notification */

/*
 * Buffer operations vector named "buf_ops_bio".  Its write, strategy,
 * sync and bdflush hooks point at bufwrite(), bufstrategy(), bufsync()
 * and bufbdflush() respectively.
 */
struct  buf_ops buf_ops_bio = {
        .bop_name       =       "buf_ops_bio",
        .bop_write      =       bufwrite,
        .bop_strategy   =       bufstrategy,
        .bop_sync       =       bufsync,
        .bop_bdflush    =       bufbdflush,
};
  93
/*
 * XXX buf is global because kern_shutdown.c and ffs_checkoverlap have
 * carnal knowledge of buffers.  This knowledge should be moved to vfs_bio.c.
 *
 * buf is pointed at its storage by kern_vfs_bio_buffer_alloc() and the
 * nbuf headers are initialized by bufinit().
 */
struct buf *buf;                /* buffer header pool */
/*
 * KVA range of MAXPHYS bytes obtained with kva_alloc() in bufinit().
 * vfs_buf_check_mapped() asserts that a mapped buffer's b_kvabase and
 * b_data are never equal to this address.
 */
caddr_t unmapped_buf;
 100
static struct proc *bufdaemonproc;

/* Forward declarations of file-local routines. */
static int inmem(struct vnode *vp, daddr_t blkno);
static void vm_hold_free_pages(struct buf *bp, int newbsize);
static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
                vm_offset_t to);
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
                vm_page_t m);
static void vfs_clean_pages_dirty_buf(struct buf *bp);
static void vfs_setdirty_locked_object(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
static int vfs_bio_clcheck(struct vnode *vp, int size,
                daddr_t lblkno, daddr_t blkno);
static int buf_flush(int);
static int flushbufqueues(int, int);
static void buf_daemon(void);
static void bremfreel(struct buf *bp);
static __inline void bd_wakeup(void);
/*
 * The vfs.bufspace handler is only built when one of the
 * COMPAT_FREEBSD4..7 options is configured; the same guard wraps its
 * definition and the SYSCTL_PROC that references it.
 */
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
#endif
 124
/*
 * Buffer space accounting, exported under the vfs sysctl tree.  The
 * limits maxbufspace, hibufspace, lobufspace and maxbufmallocspace are
 * computed in bufinit().
 */
int vmiodirenable = TRUE;
SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
    "Use the VM system for directory writes");
/*
 * Decremented atomically in runningbufwakeup() and compared against
 * hirunningspace in waitrunningbufspace().
 */
long runningbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
    "Amount of presently outstanding async buffer io");
static long bufspace;
/*
 * With any of the COMPAT_FREEBSD4..7 options the value is served by
 * sysctl_bufspace(), which also handles requests whose output buffer is
 * smaller than a long.
 */
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
    &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
#else
SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
    "Virtual memory used for buffers");
#endif
static long unmapped_bufspace;
SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD,
    &unmapped_bufspace, 0,
    "Amount of unmapped buffers, inclusive in the bufspace");
static long maxbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
    "Maximum allowed value of bufspace (including buf_daemon)");
static long bufmallocspace;
SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
    "Amount of malloced memory for buffers");
/* Note: the sysctl node is spelled maxmallocbufspace, unlike the variable. */
static long maxbufmallocspace;
SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
    "Maximum amount of malloced memory for buffers");
static long lobufspace;
SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
    "Minimum amount of buffers we want to have");
long hibufspace;
SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
    "Maximum allowed value of bufspace (excluding buf_daemon)");
/* Event counters; writable so that they can be reset from userland. */
static int bufreusecnt;
SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
    "Number of times we have reused a buffer");
static int buffreekvacnt;
SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
    "Number of times we have freed the KVA space from some buffer");
static int bufdefragcnt;
SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
    "Number of times we have had to repeat buffer allocation to defragment");
/*
 * Low/high watermarks for runningbufspace; initialized in bufinit() and
 * consulted by runningbufwakeup() and waitrunningbufspace().
 */
static long lorunningspace;
SYSCTL_LONG(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
    "Minimum preferred space used for in-progress I/O");
static long hirunningspace;
SYSCTL_LONG(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
    "Maximum amount of space to use for in-progress I/O");
int dirtybufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
    0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
int bdwriteskip;
SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
    0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
int altbufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
    0, "Number of fsync flushes to limit dirty buffers");
static int recursiveflushes;
SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
    0, "Number of flushes skipped due to being recursive");
/*
 * Dirty buffer count and its thresholds.  numdirtybuffers is updated with
 * atomic_fetchadd_int() in bdirtyadd() and bdirtysub(); lodirtybuffers,
 * hidirtybuffers and dirtybufthresh are initialized in bufinit().
 */
static int numdirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
    "Number of buffers that are dirty (has unwritten changes) at the moment");
static int lodirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
    "How many buffers we want to have free before bufdaemon can sleep");
static int hidirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
    "When the number of dirty buffers is considered severe");
int dirtybufthresh;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
    0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
/*
 * Free buffer count and its thresholds.  numfreebuffers is updated with
 * atomic_fetchadd_int() in bufcountadd() and bufcountsub(); lofreebuffers
 * and hifreebuffers are initialized in bufinit(), and hifreebuffers is
 * read by bufcountadd().
 */
static int numfreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
    "Number of free buffers");
static int lofreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
   "XXX Unused");
static int hifreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
   "XXX Complicatedly unused");
static int getnewbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
   "Number of calls to getnewbuf");
 210 static int getnewbufrestarts;
 211 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
 212     "Number of times getnewbuf has had to restart a buffer aquisition");
static int mappingrestarts;
SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
    "Number of times getblk has had to restart a buffer mapping for "
    "unmapped buffer");
/* Defaults to 100; tunable at run time through vfs.flushbufqtarget. */
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
    "Amount of work to do in flushbufqueues when helping bufdaemon");
static long notbufdflushes;
SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
    "Number of dirty buffer flushes done by the bufdaemon helpers");
static long barrierwrites;
SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
    "Number of barrier writes");
/*
 * unmapped_buf_allowed gates the sizing of the transient bio map in
 * kern_vfs_bio_buffer_alloc(); it is exported read-only here.
 */
SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
    &unmapped_buf_allowed, 0,
    "Permit the use of the unmapped i/o");
 229
/*
 * Lock for the non-dirty bufqueues.  bqlock() returns it for every queue
 * index other than QUEUE_DIRTY.
 */
static struct mtx_padalign bqclean;

/*
 * Lock for the dirty queue.  bqlock() returns it for QUEUE_DIRTY.
 */
static struct mtx_padalign bqdirty;

/*
 * This lock synchronizes access to bd_request and bd_speedupreq (see
 * bd_wakeup() and bd_speedup()).
 */
static struct mtx_padalign bdlock;

/*
 * This lock protects the runningbufreq and synchronizes runningbufwakeup and
 * waitrunningbufspace().
 */
static struct mtx_padalign rbreqlock;

/*
 * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
 */
static struct mtx_padalign nblock;

/*
 * Lock that protects bdirtywait.
 */
static struct mtx_padalign bdirtylock;

/*
 * Wakeup point for bufdaemon, as well as indicator of whether it is already
 * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
 * is idling.
 */
static int bd_request;

/*
 * Request for the buf daemon to write more buffers than is indicated by
 * lodirtybuffers.  This may be necessary to push out excess dependencies or
 * defragment the address space where a simple count of the number of dirty
 * buffers is insufficient to characterize the demand for flushing them.
 */
static int bd_speedupreq;

/*
 * bogus page -- for I/O to/from partially complete buffers
 * this is a temporary solution to the problem, but it is not
 * really that bad.  it would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 *
 * The page is allocated (wired, without an object) in bufinit().
 */
vm_page_t bogus_page;

/*
 * Synchronization (sleep/wakeup) variable for active buffer space requests.
 * Set when wait starts, cleared prior to wakeup().
 * Used in runningbufwakeup() and waitrunningbufspace().
 */
static int runningbufreq;

/*
 * Synchronization (sleep/wakeup) variable for buffer requests.
 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
 * by and/or.
 * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
 * getnewbuf(), and getblk().
 */
static int needsbuffer;

/*
 * Synchronization for bwillwrite() waiters.  Cleared, with a wakeup on
 * its address, by bdirtywakeup().
 */
static int bdirtywait;
 305
/*
 * Definitions for the buffer free lists.  The QUEUE_* values other than
 * QUEUE_SENTINEL index bufqueues[] and are stored in a buffer's b_qindex.
 */
#define BUFFER_QUEUES   5       /* number of free buffer queues */

#define QUEUE_NONE      0       /* on no queue */
#define QUEUE_CLEAN     1       /* non-B_DELWRI buffers */
#define QUEUE_DIRTY     2       /* B_DELWRI buffers */
#define QUEUE_EMPTYKVA  3       /* empty buffer headers w/KVA assignment */
#define QUEUE_EMPTY     4       /* empty buffer headers */
#define QUEUE_SENTINEL  1024    /* not a queue index, but mark for sentinel */

/* Queues for free buffers with various properties */
static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
/*
 * Per-queue length counters, kept only under INVARIANTS.  bufinit() bumps
 * bq_len[QUEUE_EMPTY] for every header it places on the empty queue.
 */
#ifdef INVARIANTS
static int bq_len[BUFFER_QUEUES];
#endif

/*
 * Single global constant for BUF_WMESG, to avoid getting multiple references.
 * buf_wmesg is referred from macros.
 */
const char *buf_wmesg = BUF_WMESG;

/* Flag bits kept in needsbuffer. */
#define VFS_BIO_NEED_ANY        0x01    /* any freeable buffer */
#define VFS_BIO_NEED_FREE       0x04    /* wait for free bufs, hi hysteresis */
#define VFS_BIO_NEED_BUFSPACE   0x08    /* wait for buf space, lo hysteresis */
 333
 334 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
 335     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 336 static int
 337 sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 338 {
 339         long lvalue;
 340         int ivalue;
 341
 342         if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
 343                 return (sysctl_handle_long(oidp, arg1, arg2, req));
 344         lvalue = *(long *)arg1;
 345         if (lvalue > INT_MAX)
 346                 /* On overflow, still write out a long to trigger ENOMEM. */
 347                 return (sysctl_handle_long(oidp, &lvalue, 0, req));
 348         ivalue = lvalue;
 349         return (sysctl_handle_int(oidp, &ivalue, 0, req));
 350 }
 351 #endif
 352
 353 /*
 354  *      bqlock:
 355  *
 356  *      Return the appropriate queue lock based on the index.
 357  */
 358 static inline struct mtx *
 359 bqlock(int qindex)
 360 {
 361
 362         if (qindex == QUEUE_DIRTY)
 363                 return (struct mtx *)(&bqdirty);
 364         return (struct mtx *)(&bqclean);
 365 }
 366
 367 /*
 368  *      bdirtywakeup:
 369  *
 370  *      Wakeup any bwillwrite() waiters.
 371  */
 372 static void
 373 bdirtywakeup(void)
 374 {
 375         mtx_lock(&bdirtylock);
 376         if (bdirtywait) {
 377                 bdirtywait = 0;
 378                 wakeup(&bdirtywait);
 379         }
 380         mtx_unlock(&bdirtylock);
 381 }
 382
 383 /*
 384  *      bdirtysub:
 385  *
 386  *      Decrement the numdirtybuffers count by one and wakeup any
 387  *      threads blocked in bwillwrite().
 388  */
 389 static void
 390 bdirtysub(void)
 391 {
 392
 393         if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
 394             (lodirtybuffers + hidirtybuffers) / 2)
 395                 bdirtywakeup();
 396 }
 397
 398 /*
 399  *      bdirtyadd:
 400  *
 401  *      Increment the numdirtybuffers count by one and wakeup the buf
 402  *      daemon if needed.
 403  */
 404 static void
 405 bdirtyadd(void)
 406 {
 407
 408         /*
 409          * Only do the wakeup once as we cross the boundary.  The
 410          * buf daemon will keep running until the condition clears.
 411          */
 412         if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
 413             (lodirtybuffers + hidirtybuffers) / 2)
 414                 bd_wakeup();
 415 }
 416
 417 /*
 418  *      bufspacewakeup:
 419  *
 420  *      Called when buffer space is potentially available for recovery.
 421  *      getnewbuf() will block on this flag when it is unable to free
 422  *      sufficient buffer space.  Buffer space becomes recoverable when
 423  *      bp's get placed back in the queues.
 424  */
 425
 426 static __inline void
 427 bufspacewakeup(void)
 428 {
 429
 430         /*
 431          * If someone is waiting for BUF space, wake them up.  Even
 432          * though we haven't freed the kva space yet, the waiting
 433          * process will be able to now.
 434          */
 435         mtx_lock(&nblock);
 436         if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
 437                 needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
 438                 wakeup(&needsbuffer);
 439         }
 440         mtx_unlock(&nblock);
 441 }
 442
 443 /*
 444  *      runningwakeup:
 445  *
 446  *      Wake up processes that are waiting on asynchronous writes to fall
 447  *      below lorunningspace.
 448  */
 449 static void
 450 runningwakeup(void)
 451 {
 452
 453         mtx_lock(&rbreqlock);
 454         if (runningbufreq) {
 455                 runningbufreq = 0;
 456                 wakeup(&runningbufreq);
 457         }
 458         mtx_unlock(&rbreqlock);
 459 }
 460
 461 /*
 462  *      runningbufwakeup:
 463  *
 464  *      Decrement the outstanding write count according.
 465  */
 466 void
 467 runningbufwakeup(struct buf *bp)
 468 {
 469         long space, bspace;
 470
 471         bspace = bp->b_runningbufspace;
 472         if (bspace == 0)
 473                 return;
 474         space = atomic_fetchadd_long(&runningbufspace, -bspace);
 475         KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
 476             space, bspace));
 477         bp->b_runningbufspace = 0;
 478         /*
 479          * Only acquire the lock and wakeup on the transition from exceeding
 480          * the threshold to falling below it.
 481          */
 482         if (space < lorunningspace)
 483                 return;
 484         if (space - bspace > lorunningspace)
 485                 return;
 486         runningwakeup();
 487 }
 488
 489 /*
 490  *      bufcountadd:
 491  *
 492  *      Called when a buffer has been added to one of the free queues to
 493  *      account for the buffer and to wakeup anyone waiting for free buffers.
 494  *      This typically occurs when large amounts of metadata are being handled
 495  *      by the buffer cache ( else buffer space runs out first, usually ).
 496  */
 497 static __inline void
 498 bufcountadd(struct buf *bp)
 499 {
 500         int old;
 501
 502         KASSERT((bp->b_flags & B_INFREECNT) == 0,
 503             ("buf %p already counted as free", bp));
 504         bp->b_flags |= B_INFREECNT;
 505         old = atomic_fetchadd_int(&numfreebuffers, 1);
 506         KASSERT(old >= 0 && old < nbuf,
 507             ("numfreebuffers climbed to %d", old + 1));
 508         mtx_lock(&nblock);
 509         if (needsbuffer) {
 510                 needsbuffer &= ~VFS_BIO_NEED_ANY;
 511                 if (numfreebuffers >= hifreebuffers)
 512                         needsbuffer &= ~VFS_BIO_NEED_FREE;
 513                 wakeup(&needsbuffer);
 514         }
 515         mtx_unlock(&nblock);
 516 }
 517
 518 /*
 519  *      bufcountsub:
 520  *
 521  *      Decrement the numfreebuffers count as needed.
 522  */
 523 static void
 524 bufcountsub(struct buf *bp)
 525 {
 526         int old;
 527
 528         /*
 529          * Fixup numfreebuffers count.  If the buffer is invalid or not
 530          * delayed-write, the buffer was free and we must decrement
 531          * numfreebuffers.
 532          */
 533         if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
 534                 KASSERT((bp->b_flags & B_INFREECNT) != 0,
 535                     ("buf %p not counted in numfreebuffers", bp));
 536                 bp->b_flags &= ~B_INFREECNT;
 537                 old = atomic_fetchadd_int(&numfreebuffers, -1);
 538                 KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
 539         }
 540 }
 541
 542 /*
 543  *      waitrunningbufspace()
 544  *
 545  *      runningbufspace is a measure of the amount of I/O currently
 546  *      running.  This routine is used in async-write situations to
 547  *      prevent creating huge backups of pending writes to a device.
 548  *      Only asynchronous writes are governed by this function.
 549  *
 550  *      This does NOT turn an async write into a sync write.  It waits
 551  *      for earlier writes to complete and generally returns before the
 552  *      caller's write has reached the device.
 553  */
 554 void
 555 waitrunningbufspace(void)
 556 {
 557
 558         mtx_lock(&rbreqlock);
 559         while (runningbufspace > hirunningspace) {
 560                 runningbufreq = 1;
 561                 msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
 562         }
 563         mtx_unlock(&rbreqlock);
 564 }
 565
 566
 567 /*
 568  *      vfs_buf_test_cache:
 569  *
 570  *      Called when a buffer is extended.  This function clears the B_CACHE
 571  *      bit if the newly extended portion of the buffer does not contain
 572  *      valid data.
 573  */
 574 static __inline
 575 void
 576 vfs_buf_test_cache(struct buf *bp,
 577                   vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
 578                   vm_page_t m)
 579 {
 580
 581         VM_OBJECT_ASSERT_LOCKED(m->object);
 582         if (bp->b_flags & B_CACHE) {
 583                 int base = (foff + off) & PAGE_MASK;
 584                 if (vm_page_is_valid(m, base, size) == 0)
 585                         bp->b_flags &= ~B_CACHE;
 586         }
 587 }
 588
 589 /* Wake up the buffer daemon if necessary */
 590 static __inline void
 591 bd_wakeup(void)
 592 {
 593
 594         mtx_lock(&bdlock);
 595         if (bd_request == 0) {
 596                 bd_request = 1;
 597                 wakeup(&bd_request);
 598         }
 599         mtx_unlock(&bdlock);
 600 }
 601
 602 /*
 603  * bd_speedup - speedup the buffer cache flushing code
 604  */
 605 void
 606 bd_speedup(void)
 607 {
 608         int needwake;
 609
 610         mtx_lock(&bdlock);
 611         needwake = 0;
 612         if (bd_speedupreq == 0 || bd_request == 0)
 613                 needwake = 1;
 614         bd_speedupreq = 1;
 615         bd_request = 1;
 616         if (needwake)
 617                 wakeup(&bd_request);
 618         mtx_unlock(&bdlock);
 619 }
 620
/*
 * Denominator of the share of the buffer map that
 * kern_vfs_bio_buffer_alloc() hands to the transient bio map when the
 * buffer map spans all affordable KVA: 1/5 (20%) on i386, 1/10 (10%)
 * elsewhere.
 */
#ifdef __i386__
#define TRANSIENT_DENOM 5
#else
#define TRANSIENT_DENOM 10
#endif
 626
/*
 * Calculating buffer cache scaling values and reserve space for buffer
 * headers.  This is called during low level kernel initialization and
 * may be called more than once.  We CANNOT write to the memory area
 * being reserved at this time.
 *
 * v is the start of the area to carve the headers from and physmem_est
 * is the estimated amount of physical memory, in pages.  nbuf is
 * auto-sized when it is zero on entry and clipped otherwise; nswbuf is
 * derived from nbuf; bio_transient_maxcnt is sized when it is zero and
 * unmapped buffers are allowed.  swbuf and buf are pointed into the
 * area and the address just past buf[nbuf] is returned.
 */
caddr_t
kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
{
        int tuned_nbuf;
        long maxbuf, maxbuf_sz, buf_sz, biotmap_sz;

        /*
         * physmem_est is in pages.  Convert it to kilobytes (assumes
         * PAGE_SIZE is >= 1K)
         */
        physmem_est = physmem_est * (PAGE_SIZE / 1024);

        /*
         * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
         * For the first 64MB of ram nominally allocate sufficient buffers to
         * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
         * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
         * the buffer cache we limit the eventual kva reservation to
         * maxbcache bytes.
         *
         * factor represents the 1/4 x ram conversion.
         */
        if (nbuf == 0) {
                int factor = 4 * BKVASIZE / 1024;

                nbuf = 50;
                if (physmem_est > 4096)
                        nbuf += min((physmem_est - 4096) / factor,
                            65536 / factor);
                if (physmem_est > 65536)
                        nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
                            32 * 1024 * 1024 / (factor * 5));

                if (maxbcache && nbuf > maxbcache / BKVASIZE)
                        nbuf = maxbcache / BKVASIZE;
                tuned_nbuf = 1;
        } else
                tuned_nbuf = 0;

        /* XXX Avoid unsigned long overflows later on with maxbufspace. */
        maxbuf = (LONG_MAX / 3) / BKVASIZE;
        if (nbuf > maxbuf) {
                /* Only warn when an explicitly configured nbuf is lowered. */
                if (!tuned_nbuf)
                        printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
                            maxbuf);
                nbuf = maxbuf;
        }

        /*
         * Ideal allocation size for the transient bio submap is 10%
         * of the maximal space buffer map.  This roughly corresponds
         * to the amount of the buffer mapped for typical UFS load.
         *
         * Clip the buffer map to reserve space for the transient
         * BIOs, if its extent is bigger than 90% (80% on i386) of the
         * maximum buffer map extent on the platform.
         *
         * The fall-back to the maxbuf in case of maxbcache unset,
         * allows to not trim the buffer KVA for the architectures
         * with ample KVA space.
         */
        if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
                maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
                buf_sz = (long)nbuf * BKVASIZE;
                if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
                    (TRANSIENT_DENOM - 1)) {
                        /*
                         * There is more KVA than memory.  Do not
                         * adjust buffer map size, and assign the rest
                         * of maxbuf to transient map.
                         */
                        biotmap_sz = maxbuf_sz - buf_sz;
                } else {
                        /*
                         * Buffer map spans all KVA we could afford on
                         * this platform.  Give 10% (20% on i386) of
                         * the buffer map to the transient bio map.
                         */
                        biotmap_sz = buf_sz / TRANSIENT_DENOM;
                        buf_sz -= biotmap_sz;
                }
                /*
                 * Saturate at INT_MAX when biotmap_sz / MAXPHYS would
                 * exceed it.
                 */
                if (biotmap_sz / INT_MAX > MAXPHYS)
                        bio_transient_maxcnt = INT_MAX;
                else
                        bio_transient_maxcnt = biotmap_sz / MAXPHYS;
                /*
                 * Artificially limit to 1024 simultaneous in-flight I/Os
                 * using the transient mapping.
                 */
                if (bio_transient_maxcnt > 1024)
                        bio_transient_maxcnt = 1024;
                /*
                 * Recompute nbuf from the (possibly reduced) buffer map
                 * size only when it was auto-sized above.
                 */
                if (tuned_nbuf)
                        nbuf = buf_sz / BKVASIZE;
        }

        /*
         * swbufs are used as temporary holders for I/O, such as paging I/O.
         * We have no less than 16 and no more than 256.
         */
        nswbuf = max(min(nbuf/4, 256), 16);
#ifdef NSWBUF_MIN
        if (nswbuf < NSWBUF_MIN)
                nswbuf = NSWBUF_MIN;
#endif

        /*
         * Reserve space for the buffer cache buffers: nswbuf swbuf
         * headers first, followed by nbuf buf headers.  Only pointers
         * are computed here; the memory itself is not touched.
         */
        swbuf = (void *)v;
        v = (caddr_t)(swbuf + nswbuf);
        buf = (void *)v;
        v = (caddr_t)(buf + nbuf);

        return(v);
}
 748
 749 /* Initialize the buffer subsystem.  Called before use of any buffers. */
 750 void
 751 bufinit(void)
 752 {
 753         struct buf *bp;
 754         int i;
 755
 756         mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
 757         mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
 758         mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
 759         mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
 760         mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
 761         mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
 762
 763         /* next, make a null set of free lists */
 764         for (i = 0; i < BUFFER_QUEUES; i++)
 765                 TAILQ_INIT(&bufqueues[i]);
 766
 767         /* finally, initialize each buffer header and stick on empty q */
 768         for (i = 0; i < nbuf; i++) {
 769                 bp = &buf[i];
 770                 bzero(bp, sizeof *bp);
 771                 bp->b_flags = B_INVAL | B_INFREECNT;
 772                 bp->b_rcred = NOCRED;
 773                 bp->b_wcred = NOCRED;
 774                 bp->b_qindex = QUEUE_EMPTY;
 775                 bp->b_xflags = 0;
 776                 LIST_INIT(&bp->b_dep);
 777                 BUF_LOCKINIT(bp);
 778                 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
 779 #ifdef INVARIANTS
 780                 bq_len[QUEUE_EMPTY]++;
 781 #endif
 782         }
 783
 784         /*
 785          * maxbufspace is the absolute maximum amount of buffer space we are
 786          * allowed to reserve in KVM and in real terms.  The absolute maximum
 787          * is nominally used by buf_daemon.  hibufspace is the nominal maximum
 788          * used by most other processes.  The differential is required to
 789          * ensure that buf_daemon is able to run when other processes might
 790          * be blocked waiting for buffer space.
 791          *
 792          * maxbufspace is based on BKVASIZE.  Allocating buffers larger then
 793          * this may result in KVM fragmentation which is not handled optimally
 794          * by the system.
 795          */
 796         maxbufspace = (long)nbuf * BKVASIZE;
 797         hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
 798         lobufspace = hibufspace - MAXBSIZE;
 799
 800         /*
 801          * Note: The 16 MiB upper limit for hirunningspace was chosen
 802          * arbitrarily and may need further tuning. It corresponds to
 803          * 128 outstanding write IO requests (if IO size is 128 KiB),
 804          * which fits with many RAID controllers' tagged queuing limits.
 805          * The lower 1 MiB limit is the historical upper limit for
 806          * hirunningspace.
 807          */
 808         hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBSIZE),
 809             16 * 1024 * 1024), 1024 * 1024);
 810         lorunningspace = roundup((hirunningspace * 2) / 3, MAXBSIZE);
 811
 812 /*
 813  * Limit the amount of malloc memory since it is wired permanently into
 814  * the kernel space.  Even though this is accounted for in the buffer
 815  * allocation, we don't want the malloced region to grow uncontrolled.
 816  * The malloc scheme improves memory utilization significantly on average
 817  * (small) directories.
 818  */
 819         maxbufmallocspace = hibufspace / 20;
 820
 821 /*
 * Reduce the chance of a deadlock occurring by limiting the number
 823  * of delayed-write dirty buffers we allow to stack up.
 824  */
 825         hidirtybuffers = nbuf / 4 + 20;
 826         dirtybufthresh = hidirtybuffers * 9 / 10;
 827         numdirtybuffers = 0;
 828 /*
 829  * To support extreme low-memory systems, make sure hidirtybuffers cannot
 830  * eat up all available buffer space.  This occurs when our minimum cannot
 831  * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
 832  * BKVASIZE'd buffers.
 833  */
 834         while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
 835                 hidirtybuffers >>= 1;
 836         }
 837         lodirtybuffers = hidirtybuffers / 2;
 838
 839 /*
 840  * Try to keep the number of free buffers in the specified range,
 841  * and give special processes (e.g. like buf_daemon) access to an
 842  * emergency reserve.
 843  */
 844         lofreebuffers = nbuf / 18 + 5;
 845         hifreebuffers = 2 * lofreebuffers;
 846         numfreebuffers = nbuf;
 847
 848         bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
 849             VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
 850         unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
 851 }
 852
 853 #ifdef INVARIANTS
 854 static inline void
 855 vfs_buf_check_mapped(struct buf *bp)
 856 {
 857
 858         KASSERT((bp->b_flags & B_UNMAPPED) == 0,
 859             ("mapped buf %p %x", bp, bp->b_flags));
 860         KASSERT(bp->b_kvabase != unmapped_buf,
 861             ("mapped buf: b_kvabase was not updated %p", bp));
 862         KASSERT(bp->b_data != unmapped_buf,
 863             ("mapped buf: b_data was not updated %p", bp));
 864 }
 865
 866 static inline void
 867 vfs_buf_check_unmapped(struct buf *bp)
 868 {
 869
 870         KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED,
 871             ("unmapped buf %p %x", bp, bp->b_flags));
 872         KASSERT(bp->b_kvabase == unmapped_buf,
 873             ("unmapped buf: corrupted b_kvabase %p", bp));
 874         KASSERT(bp->b_data == unmapped_buf,
 875             ("unmapped buf: corrupted b_data %p", bp));
 876 }
 877
/*
 * With INVARIANTS the BUF_CHECK_* macros run the assertion helpers
 * above; otherwise they expand to empty statements.
 */
#define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
#define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
#else
/* No-op variants; do {} while (0) keeps them usable as one statement. */
#define BUF_CHECK_MAPPED(bp) do {} while (0)
#define BUF_CHECK_UNMAPPED(bp) do {} while (0)
#endif
 884
 885 static void
 886 bpmap_qenter(struct buf *bp)
 887 {
 888
 889         BUF_CHECK_MAPPED(bp);
 890
 891         /*
 892          * bp->b_data is relative to bp->b_offset, but
 893          * bp->b_offset may be offset into the first page.
 894          */
 895         bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
 896         pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
 897         bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
 898             (vm_offset_t)(bp->b_offset & PAGE_MASK));
 899 }
 900
 901 /*
 902  * bfreekva() - free the kva allocation for a buffer.
 903  *
 904  *      Since this call frees up buffer space, we call bufspacewakeup().
 905  */
 906 static void
 907 bfreekva(struct buf *bp)
 908 {
 909
 910         if (bp->b_kvasize == 0)
 911                 return;
 912
 913         atomic_add_int(&buffreekvacnt, 1);
 914         atomic_subtract_long(&bufspace, bp->b_kvasize);
 915         if ((bp->b_flags & B_UNMAPPED) == 0) {
 916                 BUF_CHECK_MAPPED(bp);
 917                 vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase,
 918                     bp->b_kvasize);
 919         } else {
 920                 BUF_CHECK_UNMAPPED(bp);
 921                 if ((bp->b_flags & B_KVAALLOC) != 0) {
 922                         vmem_free(buffer_arena, (vm_offset_t)bp->b_kvaalloc,
 923                             bp->b_kvasize);
 924                 }
 925                 atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
 926                 bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
 927         }
 928         bp->b_kvasize = 0;
 929         bufspacewakeup();
 930 }
 931
/*
 *      binsfree:
 *
 *      Insert the buffer into the appropriate free list.
 *
 *      bp must be exclusively locked by the caller.  qindex selects the
 *      destination queue in bufqueues[].  Any removal deferred by
 *      bremfree() (B_REMFREE) is completed here first.  Buffers flagged
 *      B_AGE are inserted at the head of the queue, all others at the
 *      tail.  Panics if bp is still on a queue after the deferred
 *      removal has been handled.
 */
static void
binsfree(struct buf *bp, int qindex)
{
        struct mtx *olock, *nlock;

        BUF_ASSERT_XLOCKED(bp);

        /*
         * olock covers the queue bp is currently recorded on, nlock the
         * destination queue; they may be the same mutex.
         */
        olock = bqlock(bp->b_qindex);
        nlock = bqlock(qindex);
        mtx_lock(olock);
        /* Handle delayed bremfree() processing. */
        if (bp->b_flags & B_REMFREE)
                bremfreel(bp);

        if (bp->b_qindex != QUEUE_NONE)
                panic("binsfree: free buffer onto another queue???");

        bp->b_qindex = qindex;
        /* Switch to the destination queue's lock if it is a different one. */
        if (olock != nlock) {
                mtx_unlock(olock);
                mtx_lock(nlock);
        }
        if (bp->b_flags & B_AGE)
                TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
        else
                TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
#ifdef INVARIANTS
        bq_len[bp->b_qindex]++;
#endif
        mtx_unlock(nlock);

        /*
         * Something we can maybe free or reuse.
         */
        if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
                bufspacewakeup();

        /* Invalid buffers and non-delayed-write buffers are counted here. */
        if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
                bufcountadd(bp);
}
 977
 978 /*
 979  *      bremfree:
 980  *
 981  *      Mark the buffer for removal from the appropriate free list.
 982  *
 983  */
 984 void
 985 bremfree(struct buf *bp)
 986 {
 987
 988         CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 989         KASSERT((bp->b_flags & B_REMFREE) == 0,
 990             ("bremfree: buffer %p already marked for delayed removal.", bp));
 991         KASSERT(bp->b_qindex != QUEUE_NONE,
 992             ("bremfree: buffer %p not on a queue.", bp));
 993         BUF_ASSERT_XLOCKED(bp);
 994
 995         bp->b_flags |= B_REMFREE;
 996         bufcountsub(bp);
 997 }
 998
 999 /*
1000  *      bremfreef:
1001  *
1002  *      Force an immediate removal from a free list.  Used only in nfs when
1003  *      it abuses the b_freelist pointer.
1004  */
1005 void
1006 bremfreef(struct buf *bp)
1007 {
1008         struct mtx *qlock;
1009
1010         qlock = bqlock(bp->b_qindex);
1011         mtx_lock(qlock);
1012         bremfreel(bp);
1013         mtx_unlock(qlock);
1014 }
1015
1016 /*
1017  *      bremfreel:
1018  *
1019  *      Removes a buffer from the free list, must be called with the
1020  *      correct qlock held.
1021  */
1022 static void
1023 bremfreel(struct buf *bp)
1024 {
1025
1026         CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
1027             bp, bp->b_vp, bp->b_flags);
1028         KASSERT(bp->b_qindex != QUEUE_NONE,
1029             ("bremfreel: buffer %p not on a queue.", bp));
1030         BUF_ASSERT_XLOCKED(bp);
1031         mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
1032
1033         TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
1034 #ifdef INVARIANTS
1035         KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
1036             bp->b_qindex));
1037         bq_len[bp->b_qindex]--;
1038 #endif
1039         bp->b_qindex = QUEUE_NONE;
1040         /*
1041          * If this was a delayed bremfree() we only need to remove the buffer
1042          * from the queue and return the stats are already done.
1043          */
1044         if (bp->b_flags & B_REMFREE) {
1045                 bp->b_flags &= ~B_REMFREE;
1046                 return;
1047         }
1048         bufcountsub(bp);
1049 }
1050
1051 /*
1052  * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
1053  * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set,
1054  * the buffer is valid and we do not have to do anything.
1055  */
1056 void
1057 breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
1058     int cnt, struct ucred * cred)
1059 {
1060         struct buf *rabp;
1061         int i;
1062
1063         for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
1064                 if (inmem(vp, *rablkno))
1065                         continue;
1066                 rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
1067
1068                 if ((rabp->b_flags & B_CACHE) == 0) {
1069                         if (!TD_IS_IDLETHREAD(curthread))
1070                                 curthread->td_ru.ru_inblock++;
1071                         rabp->b_flags |= B_ASYNC;
1072                         rabp->b_flags &= ~B_INVAL;
1073                         rabp->b_ioflags &= ~BIO_ERROR;
1074                         rabp->b_iocmd = BIO_READ;
1075                         if (rabp->b_rcred == NOCRED && cred != NOCRED)
1076                                 rabp->b_rcred = crhold(cred);
1077                         vfs_busy_pages(rabp, 0);
1078                         BUF_KERNPROC(rabp);
1079                         rabp->b_iooffset = dbtob(rabp->b_blkno);
1080                         bstrategy(rabp);
1081                 } else {
1082                         brelse(rabp);
1083                 }
1084         }
1085 }
1086
1087 /*
1088  * Entry point for bread() and breadn() via #defines in sys/buf.h.
1089  *
1090  * Get a buffer with the specified data.  Look in the cache first.  We
1091  * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
1092  * is set, the buffer is valid and we do not have to do anything, see
1093  * getblk(). Also starts asynchronous I/O on read-ahead blocks.
1094  */
1095 int
1096 breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
1097     int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp)
1098 {
1099         struct buf *bp;
1100         int rv = 0, readwait = 0;
1101
1102         CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
1103         /*
1104          * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
1105          */
1106         *bpp = bp = getblk(vp, blkno, size, 0, 0, flags);
1107         if (bp == NULL)
1108                 return (EBUSY);
1109
1110         /* if not found in cache, do some I/O */
1111         if ((bp->b_flags & B_CACHE) == 0) {
1112                 if (!TD_IS_IDLETHREAD(curthread))
1113                         curthread->td_ru.ru_inblock++;
1114                 bp->b_iocmd = BIO_READ;
1115                 bp->b_flags &= ~B_INVAL;
1116                 bp->b_ioflags &= ~BIO_ERROR;
1117                 if (bp->b_rcred == NOCRED && cred != NOCRED)
1118                         bp->b_rcred = crhold(cred);
1119                 vfs_busy_pages(bp, 0);
1120                 bp->b_iooffset = dbtob(bp->b_blkno);
1121                 bstrategy(bp);
1122                 ++readwait;
1123         }
1124
1125         breada(vp, rablkno, rabsize, cnt, cred);
1126
1127         if (readwait) {
1128                 rv = bufwait(bp);
1129         }
1130         return (rv);
1131 }
1132
/*
 * Write, release buffer on completion.  (Done by iodone
 * if async).  Do not bother writing anything if the buffer
 * is invalid.
 *
 * Note that we set B_CACHE here, indicating that buffer is
 * fully valid and thus cacheable.  This is true even of NFS
 * now so we set it generally.  This could be set either here
 * or in biodone() since the I/O is synchronous.  We put it
 * here.
 *
 * Returns 0 for an invalid buffer (which is just released) and for
 * asynchronous (B_ASYNC) writes.  For synchronous writes the result of
 * bufwait() is returned after the buffer has been released.
 */
int
bufwrite(struct buf *bp)
{
        int oldflags;
        struct vnode *vp;
        long space;
        int vp_md;

        CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
        if (bp->b_flags & B_INVAL) {
                brelse(bp);
                return (0);
        }

        if (bp->b_flags & B_BARRIER)
                barrierwrites++;

        /*
         * Snapshot the flags now; the sync/async decisions after
         * bstrategy() use this copy rather than re-reading bp.
         */
        oldflags = bp->b_flags;

        BUF_ASSERT_HELD(bp);

        if (bp->b_pin_count > 0)
                bunpin_wait(bp);

        KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
            ("FFS background buffer should not get here %p", bp));

        /* Remember whether the vnode carries VV_MD; used for throttling below. */
        vp = bp->b_vp;
        if (vp)
                vp_md = vp->v_vflag & VV_MD;
        else
                vp_md = 0;

        /*
         * Mark the buffer clean.  Increment the bufobj write count
         * before bundirty() call, to prevent other thread from seeing
         * empty dirty list and zero counter for writes in progress,
         * falsely indicating that the bufobj is clean.
         */
        bufobj_wref(bp->b_bufobj);
        bundirty(bp);

        bp->b_flags &= ~B_DONE;
        bp->b_ioflags &= ~BIO_ERROR;
        bp->b_flags |= B_CACHE;
        bp->b_iocmd = BIO_WRITE;

        vfs_busy_pages(bp, 1);

        /*
         * Normal bwrites pipeline writes
         *
         * space receives the value of runningbufspace from before this
         * buffer's size was added.
         */
        bp->b_runningbufspace = bp->b_bufsize;
        space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);

        if (!TD_IS_IDLETHREAD(curthread))
                curthread->td_ru.ru_oublock++;
        if (oldflags & B_ASYNC)
                BUF_KERNPROC(bp);
        bp->b_iooffset = dbtob(bp->b_blkno);
        bstrategy(bp);

        if ((oldflags & B_ASYNC) == 0) {
                /* Synchronous write: wait, release, report the result. */
                int rtval = bufwait(bp);
                brelse(bp);
                return (rtval);
        } else if (space > hirunningspace) {
                /*
                 * don't allow the async write to saturate the I/O
                 * system.  We will not deadlock here because
                 * we are blocking waiting for I/O that is already in-progress
                 * to complete. We do not block here if it is the update
                 * or syncer daemon trying to clean up as that can lead
                 * to deadlock.
                 */
                if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
                        waitrunningbufspace();
        }

        return (0);
}
1225
/*
 * Limit the number of dirty buffers on bufobj bo before bp is added to
 * them.  If the dirty count exceeds dirtybufthresh by more than 10, a
 * non-waiting VOP_FSYNC() is issued on bp's vnode.  If it merely exceeds
 * dirtybufthresh, one other dirty buffer that can be locked without
 * sleeping is written out asynchronously.  Otherwise nothing is done.
 */
void
bufbdflush(struct bufobj *bo, struct buf *bp)
{
        struct buf *nbp;

        if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
                (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
                altbufferflushes++;
        } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
                BO_LOCK(bo);
                /*
                 * Try to find a buffer to flush.
                 */
                TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
                        /* Skip buffers with a background write or a busy lock. */
                        if ((nbp->b_vflags & BV_BKGRDINPROG) ||
                            BUF_LOCK(nbp,
                                     LK_EXCLUSIVE | LK_NOWAIT, NULL))
                                continue;
                        if (bp == nbp)
                                panic("bdwrite: found ourselves");
                        BO_UNLOCK(bo);
                        /* Don't countdeps with the bo lock held. */
                        if (buf_countdeps(nbp, 0)) {
                                /* Retake the bo lock before resuming the scan. */
                                BO_LOCK(bo);
                                BUF_UNLOCK(nbp);
                                continue;
                        }
                        if (nbp->b_flags & B_CLUSTEROK) {
                                vfs_bio_awrite(nbp);
                        } else {
                                bremfree(nbp);
                                bawrite(nbp);
                        }
                        dirtybufferflushes++;
                        /* One buffer flushed; the bo lock is already dropped. */
                        break;
                }
                /*
                 * nbp is NULL only when the scan ran to the end of the
                 * list, in which case the bo lock is still held.
                 */
                if (nbp == NULL)
                        BO_UNLOCK(bo);
        }
}
1266
/*
 * Delayed write. (Buffer is marked dirty).  Do not bother writing
 * anything if the buffer is marked invalid.
 *
 * Note that since the buffer must be completely valid, we can safely
 * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
 * biodone() in order to prevent getblk from writing the buffer
 * out synchronously.
 *
 * The buffer must be held by the caller and must not carry B_BARRIER.
 * On return it has been released, via brelse() if it was invalid and
 * via bqrelse() otherwise.
 */
void
bdwrite(struct buf *bp)
{
        struct thread *td = curthread;
        struct vnode *vp;
        struct bufobj *bo;

        CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
        KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
        KASSERT((bp->b_flags & B_BARRIER) == 0,
            ("Barrier request in delayed write %p", bp));
        BUF_ASSERT_HELD(bp);

        if (bp->b_flags & B_INVAL) {
                brelse(bp);
                return;
        }

        /*
         * If we have too many dirty buffers, don't create any more.
         * If we are wildly over our limit, then force a complete
         * cleanup. Otherwise, just keep the situation from getting
         * out of control. Note that we have to avoid a recursive
         * disaster and not try to clean up after our own cleanup!
         *
         * TDP_INBDFLUSH is set around the flush so that a nested
         * bdwrite() on this thread takes the else branch instead.
         */
        vp = bp->b_vp;
        bo = bp->b_bufobj;
        if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
                td->td_pflags |= TDP_INBDFLUSH;
                BO_BDFLUSH(bo, bp);
                td->td_pflags &= ~TDP_INBDFLUSH;
        } else
                recursiveflushes++;

        bdirty(bp);
        /*
         * Set B_CACHE, indicating that the buffer is fully valid.  This is
         * true even of NFS now.
         */
        bp->b_flags |= B_CACHE;

        /*
         * This bmap keeps the system from needing to do the bmap later,
         * perhaps when the system is attempting to do a sync.  Since it
         * is likely that the indirect block -- or whatever other datastructure
         * that the filesystem needs is still in memory now, it is a good
         * thing to do this.  Note also, that if the pageout daemon is
         * requesting a sync -- there might not be enough memory to do
         * the bmap then...  So, this is important to do.
         *
         * The mapping is only requested while b_blkno still equals
         * b_lblkno, and never for VCHR vnodes.
         */
        if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
                VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
        }

        /*
         * Set the *dirty* buffer range based upon the VM system dirty
         * pages.
         *
         * Mark the buffer pages as clean.  We need to do this here to
         * satisfy the vnode_pager and the pageout daemon, so that it
         * thinks that the pages have been "cleaned".  Note that since
         * the pages are in a delayed write buffer -- the VFS layer
         * "will" see that the pages get written out on the next sync,
         * or perhaps the cluster will be completed.
         */
        vfs_clean_pages_dirty_buf(bp);
        bqrelse(bp);

        /*
         * note: we cannot initiate I/O from a bdwrite even if we wanted to,
         * due to the softdep code.
         */
}
1349
1350 /*
1351  *      bdirty:
1352  *
1353  *      Turn buffer into delayed write request.  We must clear BIO_READ and
1354  *      B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
1355  *      itself to properly update it in the dirty/clean lists.  We mark it
1356  *      B_DONE to ensure that any asynchronization of the buffer properly
1357  *      clears B_DONE ( else a panic will occur later ).
1358  *
1359  *      bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
1360  *      might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
1361  *      should only be called if the buffer is known-good.
1362  *
1363  *      Since the buffer is not on a queue, we do not update the numfreebuffers
1364  *      count.
1365  *
1366  *      The buffer must be on QUEUE_NONE.
1367  */
1368 void
1369 bdirty(struct buf *bp)
1370 {
1371
1372         CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
1373             bp, bp->b_vp, bp->b_flags);
1374         KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1375         KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
1376             ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
1377         BUF_ASSERT_HELD(bp);
1378         bp->b_flags &= ~(B_RELBUF);
1379         bp->b_iocmd = BIO_WRITE;
1380
1381         if ((bp->b_flags & B_DELWRI) == 0) {
1382                 bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
1383                 reassignbuf(bp);
1384                 bdirtyadd();
1385         }
1386 }
1387
1388 /*
1389  *      bundirty:
1390  *
1391  *      Clear B_DELWRI for buffer.
1392  *
1393  *      Since the buffer is not on a queue, we do not update the numfreebuffers
1394  *      count.
1395  *
1396  *      The buffer must be on QUEUE_NONE.
1397  */
1398
1399 void
1400 bundirty(struct buf *bp)
1401 {
1402
1403         CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1404         KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1405         KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
1406             ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
1407         BUF_ASSERT_HELD(bp);
1408
1409         if (bp->b_flags & B_DELWRI) {
1410                 bp->b_flags &= ~B_DELWRI;
1411                 reassignbuf(bp);
1412                 bdirtysub();
1413         }
1414         /*
1415          * Since it is now being written, we can clear its deferred write flag.
1416          */
1417         bp->b_flags &= ~B_DEFERRED;
1418 }
1419
1420 /*
1421  *      bawrite:
1422  *
1423  *      Asynchronous write.  Start output on a buffer, but do not wait for
1424  *      it to complete.  The buffer is released when the output completes.
1425  *
1426  *      bwrite() ( or the VOP routine anyway ) is responsible for handling
1427  *      B_INVAL buffers.  Not us.
1428  */
1429 void
1430 bawrite(struct buf *bp)
1431 {
1432
1433         bp->b_flags |= B_ASYNC;
1434         (void) bwrite(bp);
1435 }
1436
1437 /*
1438  *      babarrierwrite:
1439  *
1440  *      Asynchronous barrier write.  Start output on a buffer, but do not
1441  *      wait for it to complete.  Place a write barrier after this write so
1442  *      that this buffer and all buffers written before it are committed to
1443  *      the disk before any buffers written after this write are committed
1444  *      to the disk.  The buffer is released when the output completes.
1445  */
1446 void
1447 babarrierwrite(struct buf *bp)
1448 {
1449
1450         bp->b_flags |= B_ASYNC | B_BARRIER;
1451         (void) bwrite(bp);
1452 }
1453
1454 /*
1455  *      bbarrierwrite:
1456  *
1457  *      Synchronous barrier write.  Start output on a buffer and wait for
1458  *      it to complete.  Place a write barrier after this write so that
1459  *      this buffer and all buffers written before it are committed to
1460  *      the disk before any buffers written after this write are committed
1461  *      to the disk.  The buffer is released when the output completes.
1462  */
1463 int
1464 bbarrierwrite(struct buf *bp)
1465 {
1466
1467         bp->b_flags |= B_BARRIER;
1468         return (bwrite(bp));
1469 }
1470
1471 /*
1472  *      bwillwrite:
1473  *
1474  *      Called prior to the locking of any vnodes when we are expecting to
1475  *      write.  We do not want to starve the buffer cache with too many
1476  *      dirty buffers so we block here.  By blocking prior to the locking
1477  *      of any vnodes we attempt to avoid the situation where a locked vnode
1478  *      prevents the various system daemons from flushing related buffers.
1479  */
1480 void
1481 bwillwrite(void)
1482 {
1483
1484         if (numdirtybuffers >= hidirtybuffers) {
1485                 mtx_lock(&bdirtylock);
1486                 while (numdirtybuffers >= hidirtybuffers) {
1487                         bdirtywait = 1;
1488                         msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
1489                             "flswai", 0);
1490                 }
1491                 mtx_unlock(&bdirtylock);
1492         }
1493 }
1494
1495 /*
1496  * Return true if we have too many dirty buffers.
1497  */
1498 int
1499 buf_dirty_count_severe(void)
1500 {
1501
1502         return(numdirtybuffers >= hidirtybuffers);
1503 }
1504
1505 static __noinline int
1506 buf_vm_page_count_severe(void)
1507 {
1508
1509         KFAIL_POINT_CODE(DEBUG_FP, buf_pressure, return 1);
1510
1511         return vm_page_count_severe();
1512 }
1513
1514 /*
1515  *      brelse:
1516  *
1517  *      Release a busy buffer and, if requested, free its resources.  The
1518  *      buffer will be stashed in the appropriate bufqueue[] allowing it
1519  *      to be accessed later as a cache entity or reused for other purposes.
1520  */
1521 void
1522 brelse(struct buf *bp)
1523 {
1524         int qindex;
1525
1526         CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
1527             bp, bp->b_vp, bp->b_flags);
1528         KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
1529             ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1530
1531         if (BUF_LOCKRECURSED(bp)) {
1532                 /*
1533                  * Do not process, in particular, do not handle the
1534                  * B_INVAL/B_RELBUF and do not release to free list.
1535                  */
1536                 BUF_UNLOCK(bp);
1537                 return;
1538         }
1539
1540         if (bp->b_flags & B_MANAGED) {
1541                 bqrelse(bp);
1542                 return;
1543         }
1544
1545         if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
1546             bp->b_error == EIO && !(bp->b_flags & B_INVAL)) {
1547                 /*
1548                  * Failed write, redirty.  Must clear BIO_ERROR to prevent
1549                  * pages from being scrapped.  If the error is anything
1550                  * other than an I/O error (EIO), assume that retrying
1551                  * is futile.
1552                  */
1553                 bp->b_ioflags &= ~BIO_ERROR;
1554                 bdirty(bp);
1555         } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
1556             (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
1557                 /*
1558                  * Either a failed I/O or we were asked to free or not
1559                  * cache the buffer.
1560                  */
1561                 bp->b_flags |= B_INVAL;
1562                 if (!LIST_EMPTY(&bp->b_dep))
1563                         buf_deallocate(bp);
1564                 if (bp->b_flags & B_DELWRI)
1565                         bdirtysub();
1566                 bp->b_flags &= ~(B_DELWRI | B_CACHE);
1567                 if ((bp->b_flags & B_VMIO) == 0) {
1568                         if (bp->b_bufsize)
1569                                 allocbuf(bp, 0);
1570                         if (bp->b_vp)
1571                                 brelvp(bp);
1572                 }
1573         }
1574
1575         /*
1576          * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
1577          * is called with B_DELWRI set, the underlying pages may wind up
1578          * getting freed causing a previous write (bdwrite()) to get 'lost'
1579          * because pages associated with a B_DELWRI bp are marked clean.
1580          *
1581          * We still allow the B_INVAL case to call vfs_vmio_release(), even
1582          * if B_DELWRI is set.
1583          *
1584          * If B_DELWRI is not set we may have to set B_RELBUF if we are low
1585          * on pages to return pages to the VM page queues.
1586          */
1587         if (bp->b_flags & B_DELWRI)
1588                 bp->b_flags &= ~B_RELBUF;
1589         else if (buf_vm_page_count_severe()) {
1590                 /*
1591                  * BKGRDINPROG can only be set with the buf and bufobj
1592                  * locks both held.  We tolerate a race to clear it here.
1593                  */
1594                 if (!(bp->b_vflags & BV_BKGRDINPROG))
1595                         bp->b_flags |= B_RELBUF;
1596         }
1597
1598         /*
1599          * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
         * constituted, not even NFS buffers now.  Two flags affect this.  If
1601          * B_INVAL, the struct buf is invalidated but the VM object is kept
1602          * around ( i.e. so it is trivial to reconstitute the buffer later ).
1603          *
1604          * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
1605          * invalidated.  BIO_ERROR cannot be set for a failed write unless the
1606          * buffer is also B_INVAL because it hits the re-dirtying code above.
1607          *
1608          * Normally we can do this whether a buffer is B_DELWRI or not.  If
1609          * the buffer is an NFS buffer, it is tracking piecemeal writes or
1610          * the commit state and we cannot afford to lose the buffer. If the
1611          * buffer has a background write in progress, we need to keep it
1612          * around to prevent it from being reconstituted and starting a second
1613          * background write.
1614          */
1615         if ((bp->b_flags & B_VMIO)
1616             && !(bp->b_vp->v_mount != NULL &&
1617                  (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
1618                  !vn_isdisk(bp->b_vp, NULL) &&
1619                  (bp->b_flags & B_DELWRI))
1620             ) {
1621
1622                 int i, j, resid;
1623                 vm_page_t m;
1624                 off_t foff;
1625                 vm_pindex_t poff;
1626                 vm_object_t obj;
1627
1628                 obj = bp->b_bufobj->bo_object;
1629
1630                 /*
1631                  * Get the base offset and length of the buffer.  Note that
1632                  * in the VMIO case if the buffer block size is not
1633                  * page-aligned then b_data pointer may not be page-aligned.
1634                  * But our b_pages[] array *IS* page aligned.
1635                  *
                 * block sizes less than DEV_BSIZE (usually 512) are not
1637                  * supported due to the page granularity bits (m->valid,
1638                  * m->dirty, etc...).
1639                  *
1640                  * See man buf(9) for more information
1641                  */
1642                 resid = bp->b_bufsize;
1643                 foff = bp->b_offset;
1644                 for (i = 0; i < bp->b_npages; i++) {
1645                         int had_bogus = 0;
1646
1647                         m = bp->b_pages[i];
1648
1649                         /*
1650                          * If we hit a bogus page, fixup *all* the bogus pages
1651                          * now.
1652                          */
1653                         if (m == bogus_page) {
1654                                 poff = OFF_TO_IDX(bp->b_offset);
1655                                 had_bogus = 1;
1656
1657                                 VM_OBJECT_RLOCK(obj);
1658                                 for (j = i; j < bp->b_npages; j++) {
1659                                         vm_page_t mtmp;
1660                                         mtmp = bp->b_pages[j];
1661                                         if (mtmp == bogus_page) {
1662                                                 mtmp = vm_page_lookup(obj, poff + j);
1663                                                 if (!mtmp) {
1664                                                         panic("brelse: page missing\n");
1665                                                 }
1666                                                 bp->b_pages[j] = mtmp;
1667                                         }
1668                                 }
1669                                 VM_OBJECT_RUNLOCK(obj);
1670
1671                                 if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) {
1672                                         BUF_CHECK_MAPPED(bp);
1673                                         pmap_qenter(
1674                                             trunc_page((vm_offset_t)bp->b_data),
1675                                             bp->b_pages, bp->b_npages);
1676                                 }
1677                                 m = bp->b_pages[i];
1678                         }
1679                         if ((bp->b_flags & B_NOCACHE) ||
1680                             (bp->b_ioflags & BIO_ERROR &&
1681                              bp->b_iocmd == BIO_READ)) {
1682                                 int poffset = foff & PAGE_MASK;
1683                                 int presid = resid > (PAGE_SIZE - poffset) ?
1684                                         (PAGE_SIZE - poffset) : resid;
1685
1686                                 KASSERT(presid >= 0, ("brelse: extra page"));
1687                                 VM_OBJECT_WLOCK(obj);
1688                                 while (vm_page_xbusied(m)) {
1689                                         vm_page_lock(m);
1690                                         VM_OBJECT_WUNLOCK(obj);
1691                                         vm_page_busy_sleep(m, "mbncsh");
1692                                         VM_OBJECT_WLOCK(obj);
1693                                 }
1694                                 if (pmap_page_wired_mappings(m) == 0)
1695                                         vm_page_set_invalid(m, poffset, presid);
1696                                 VM_OBJECT_WUNLOCK(obj);
1697                                 if (had_bogus)
1698                                         printf("avoided corruption bug in bogus_page/brelse code\n");
1699                         }
1700                         resid -= PAGE_SIZE - (foff & PAGE_MASK);
1701                         foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
1702                 }
1703                 if (bp->b_flags & (B_INVAL | B_RELBUF))
1704                         vfs_vmio_release(bp);
1705
1706         } else if (bp->b_flags & B_VMIO) {
1707
1708                 if (bp->b_flags & (B_INVAL | B_RELBUF)) {
1709                         vfs_vmio_release(bp);
1710                 }
1711
1712         } else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) {
1713                 if (bp->b_bufsize != 0)
1714                         allocbuf(bp, 0);
1715                 if (bp->b_vp != NULL)
1716                         brelvp(bp);
1717         }
1718
1719         /*
1720          * If the buffer has junk contents signal it and eventually
1721          * clean up B_DELWRI and diassociate the vnode so that gbincore()
1722          * doesn't find it.
1723          */
1724         if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
1725             (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
1726                 bp->b_flags |= B_INVAL;
1727         if (bp->b_flags & B_INVAL) {
1728                 if (bp->b_flags & B_DELWRI)
1729                         bundirty(bp);
1730                 if (bp->b_vp)
1731                         brelvp(bp);
1732         }
1733
1734         /* buffers with no memory */
1735         if (bp->b_bufsize == 0) {
1736                 bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
1737                 if (bp->b_vflags & BV_BKGRDINPROG)
1738                         panic("losing buffer 1");
1739                 if (bp->b_kvasize)
1740                         qindex = QUEUE_EMPTYKVA;
1741                 else
1742                         qindex = QUEUE_EMPTY;
1743                 bp->b_flags |= B_AGE;
1744         /* buffers with junk contents */
1745         } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
1746             (bp->b_ioflags & BIO_ERROR)) {
1747                 bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
1748                 if (bp->b_vflags & BV_BKGRDINPROG)
1749                         panic("losing buffer 2");
1750                 qindex = QUEUE_CLEAN;
1751                 bp->b_flags |= B_AGE;
1752         /* remaining buffers */
1753         } else if (bp->b_flags & B_DELWRI)
1754                 qindex = QUEUE_DIRTY;
1755         else
1756                 qindex = QUEUE_CLEAN;
1757
1758         binsfree(bp, qindex);
1759
1760         bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
1761         if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
1762                 panic("brelse: not dirty");
1763         /* unlock */
1764         BUF_UNLOCK(bp);
1765 }
1766
1767 /*
1768  * Release a buffer back to the appropriate queue but do not try to free
1769  * it.  The buffer is expected to be used again soon.
1770  *
1771  * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
1772  * biodone() to requeue an async I/O on completion.  It is also used when
1773  * known good buffers need to be requeued but we think we may need the data
1774  * again soon.
1775  *
1776  * XXX we should be able to leave the B_RELBUF hint set on completion.
1777  */
1778 void
1779 bqrelse(struct buf *bp)
1780 {
1781         int qindex;
1782
1783         CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1784         KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
1785             ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1786
1787         if (BUF_LOCKRECURSED(bp)) {
1788                 /* do not release to free list */
1789                 BUF_UNLOCK(bp);
1790                 return;
1791         }
1792         bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1793
1794         if (bp->b_flags & B_MANAGED) {
1795                 if (bp->b_flags & B_REMFREE)
1796                         bremfreef(bp);
1797                 goto out;
1798         }
1799
1800         /* buffers with stale but valid contents */
1801         if (bp->b_flags & B_DELWRI) {
1802                 qindex = QUEUE_DIRTY;
1803         } else {
1804                 if ((bp->b_flags & B_DELWRI) == 0 &&
1805                     (bp->b_xflags & BX_VNDIRTY))
1806                         panic("bqrelse: not dirty");
1807                 /*
1808                  * BKGRDINPROG can only be set with the buf and bufobj
1809                  * locks both held.  We tolerate a race to clear it here.
1810                  */
1811                 if (buf_vm_page_count_severe() &&
1812                     (bp->b_vflags & BV_BKGRDINPROG) == 0) {
1813                         /*
1814                          * We are too low on memory, we have to try to free
1815                          * the buffer (most importantly: the wired pages
1816                          * making up its backing store) *now*.
1817                          */
1818                         brelse(bp);
1819                         return;
1820                 }
1821                 qindex = QUEUE_CLEAN;
1822         }
1823         binsfree(bp, qindex);
1824
1825 out:
1826         /* unlock */
1827         BUF_UNLOCK(bp);
1828 }
1829
1830 /* Give pages used by the bp back to the VM system (where possible) */
1831 static void
1832 vfs_vmio_release(struct buf *bp)
1833 {
1834         int i;
1835         vm_page_t m;
1836
1837         if ((bp->b_flags & B_UNMAPPED) == 0) {
1838                 BUF_CHECK_MAPPED(bp);
1839                 pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
1840         } else
1841                 BUF_CHECK_UNMAPPED(bp);
1842         VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
1843         for (i = 0; i < bp->b_npages; i++) {
1844                 m = bp->b_pages[i];
1845                 bp->b_pages[i] = NULL;
1846                 /*
1847                  * In order to keep page LRU ordering consistent, put
1848                  * everything on the inactive queue.
1849                  */
1850                 vm_page_lock(m);
1851                 vm_page_unwire(m, 0);
1852
1853                 /*
1854                  * Might as well free the page if we can and it has
1855                  * no valid data.  We also free the page if the
1856                  * buffer was used for direct I/O
1857                  */
1858                 if ((bp->b_flags & B_ASYNC) == 0 && !m->valid) {
1859                         if (m->wire_count == 0 && !vm_page_busied(m))
1860                                 vm_page_free(m);
1861                 } else if (bp->b_flags & B_DIRECT)
1862                         vm_page_try_to_free(m);
1863                 else if (buf_vm_page_count_severe())
1864                         vm_page_try_to_cache(m);
1865                 vm_page_unlock(m);
1866         }
1867         VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
1868
1869         if (bp->b_bufsize) {
1870                 bufspacewakeup();
1871                 bp->b_bufsize = 0;
1872         }
1873         bp->b_npages = 0;
1874         bp->b_flags &= ~B_VMIO;
1875         if (bp->b_vp)
1876                 brelvp(bp);
1877 }
1878
1879 /*
1880  * Check to see if a block at a particular lbn is available for a clustered
1881  * write.
1882  */
1883 static int
1884 vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
1885 {
1886         struct buf *bpa;
1887         int match;
1888
1889         match = 0;
1890
1891         /* If the buf isn't in core skip it */
1892         if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
1893                 return (0);
1894
1895         /* If the buf is busy we don't want to wait for it */
1896         if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
1897                 return (0);
1898
1899         /* Only cluster with valid clusterable delayed write buffers */
1900         if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
1901             (B_DELWRI | B_CLUSTEROK))
1902                 goto done;
1903
1904         if (bpa->b_bufsize != size)
1905                 goto done;
1906
1907         /*
1908          * Check to see if it is in the expected place on disk and that the
1909          * block has been mapped.
1910          */
1911         if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
1912                 match = 1;
1913 done:
1914         BUF_UNLOCK(bpa);
1915         return (match);
1916 }
1917
1918 /*
1919  *      vfs_bio_awrite:
1920  *
1921  *      Implement clustered async writes for clearing out B_DELWRI buffers.
1922  *      This is much better then the old way of writing only one buffer at
1923  *      a time.  Note that we may not be presented with the buffers in the
1924  *      correct order, so we search for the cluster in both directions.
1925  */
1926 int
1927 vfs_bio_awrite(struct buf *bp)
1928 {
1929         struct bufobj *bo;
1930         int i;
1931         int j;
1932         daddr_t lblkno = bp->b_lblkno;
1933         struct vnode *vp = bp->b_vp;
1934         int ncl;
1935         int nwritten;
1936         int size;
1937         int maxcl;
1938         int gbflags;
1939
1940         bo = &vp->v_bufobj;
1941         gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
1942         /*
1943          * right now we support clustered writing only to regular files.  If
1944          * we find a clusterable block we could be in the middle of a cluster
1945          * rather then at the beginning.
1946          */
1947         if ((vp->v_type == VREG) &&
1948             (vp->v_mount != 0) && /* Only on nodes that have the size info */
1949             (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
1950
1951                 size = vp->v_mount->mnt_stat.f_iosize;
1952                 maxcl = MAXPHYS / size;
1953
1954                 BO_RLOCK(bo);
1955                 for (i = 1; i < maxcl; i++)
1956                         if (vfs_bio_clcheck(vp, size, lblkno + i,
1957                             bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
1958                                 break;
1959
1960                 for (j = 1; i + j <= maxcl && j <= lblkno; j++)
1961                         if (vfs_bio_clcheck(vp, size, lblkno - j,
1962                             bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
1963                                 break;
1964                 BO_RUNLOCK(bo);
1965                 --j;
1966                 ncl = i + j;
1967                 /*
1968                  * this is a possible cluster write
1969                  */
1970                 if (ncl != 1) {
1971                         BUF_UNLOCK(bp);
1972                         nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
1973                             gbflags);
1974                         return (nwritten);
1975                 }
1976         }
1977         bremfree(bp);
1978         bp->b_flags |= B_ASYNC;
1979         /*
1980          * default (old) behavior, writing out only one block
1981          *
1982          * XXX returns b_bufsize instead of b_bcount for nwritten?
1983          */
1984         nwritten = bp->b_bufsize;
1985         (void) bwrite(bp);
1986
1987         return (nwritten);
1988 }
1989
1990 static void
1991 setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags)
1992 {
1993
1994         KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
1995             bp->b_kvasize == 0, ("call bfreekva(%p)", bp));
1996         if ((gbflags & GB_UNMAPPED) == 0) {
1997                 bp->b_kvabase = (caddr_t)addr;
1998         } else if ((gbflags & GB_KVAALLOC) != 0) {
1999                 KASSERT((gbflags & GB_UNMAPPED) != 0,
2000                     ("GB_KVAALLOC without GB_UNMAPPED"));
2001                 bp->b_kvaalloc = (caddr_t)addr;
2002                 bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
2003                 atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
2004         }
2005         bp->b_kvasize = maxsize;
2006 }
2007
2008 /*
2009  * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if
2010  * needed.
2011  */
2012 static int
2013 allocbufkva(struct buf *bp, int maxsize, int gbflags)
2014 {
2015         vm_offset_t addr;
2016
2017         bfreekva(bp);
2018         addr = 0;
2019
2020         if (vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr)) {
2021                 /*
2022                  * Buffer map is too fragmented.  Request the caller
2023                  * to defragment the map.
2024                  */
2025                 atomic_add_int(&bufdefragcnt, 1);
2026                 return (1);
2027         }
2028         setbufkva(bp, addr, maxsize, gbflags);
2029         atomic_add_long(&bufspace, bp->b_kvasize);
2030         return (0);
2031 }
2032
/*
 * Ask the bufdaemon for help, or act as bufdaemon itself, when a
 * locked vnode is supplied.
 *
 * Must be called with bqclean held; the lock is released before the
 * function returns.  The kind of shortage is recorded in needsbuffer
 * (VFS_BIO_NEED_BUFSPACE when defragmenting or when bufspace has reached
 * hibufspace, VFS_BIO_NEED_ANY otherwise) and bd_speedup() is called.
 *
 * With GB_NOWAIT_BD in gbflags the function returns right after that.
 * Otherwise it loops until the recorded bits are cleared from
 * needsbuffer or msleep() returns non-zero.  slpflag and slptimeo are
 * passed through to msleep().
 */
static void
getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
    int defrag)
{
        struct thread *td;
        char *waitmsg;
        int cnt, error, flags, norunbuf, wait;

        mtx_assert(&bqclean, MA_OWNED);

        /* Pick the needsbuffer bits to wait on and the sleep message. */
        if (defrag) {
                flags = VFS_BIO_NEED_BUFSPACE;
                waitmsg = "nbufkv";
        } else if (bufspace >= hibufspace) {
                waitmsg = "nbufbs";
                flags = VFS_BIO_NEED_BUFSPACE;
        } else {
                waitmsg = "newbuf";
                flags = VFS_BIO_NEED_ANY;
        }
        /* Publish the request while bqclean is still held, then drop it. */
        mtx_lock(&nblock);
        needsbuffer |= flags;
        mtx_unlock(&nblock);
        mtx_unlock(&bqclean);

        bd_speedup();   /* heeeelp */
        if ((gbflags & GB_NOWAIT_BD) != 0)
                return;

        td = curthread;
        cnt = 0;
        wait = MNT_NOWAIT;
        mtx_lock(&nblock);
        while (needsbuffer & flags) {
                /*
                 * TDP_BUFNEED is set around the VOP_FSYNC() below, so a
                 * thread that gets back here from within that flush
                 * skips this branch and goes straight to sleeping.
                 */
                if (vp != NULL && vp->v_type != VCHR &&
                    (td->td_pflags & TDP_BUFNEED) == 0) {
                        mtx_unlock(&nblock);

                        /*
                         * getblk() is called with a vnode locked, and
                         * some majority of the dirty buffers may as
                         * well belong to the vnode.  Flushing the
                         * buffers there would make a progress that
                         * cannot be achieved by the buf_daemon, that
                         * cannot lock the vnode.
                         */
                        /* The first three attempts use MNT_NOWAIT. */
                        if (cnt++ > 2)
                                wait = MNT_WAIT;
                        ASSERT_VOP_LOCKED(vp, "bufd_helper");
                        /*
                         * Flush only if the vnode lock is exclusive
                         * already or LK_TRYUPGRADE succeeds.
                         */
                        error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
                            vn_lock(vp, LK_TRYUPGRADE);
                        if (error == 0) {
                                /* play bufdaemon */
                                norunbuf = curthread_pflags_set(TDP_BUFNEED |
                                    TDP_NORUNNINGBUF);
                                VOP_FSYNC(vp, wait, td);
                                atomic_add_long(&notbufdflushes, 1);
                                curthread_pflags_restore(norunbuf);
                        }
                        mtx_lock(&nblock);
                        /* Recheck: nblock was dropped during the flush. */
                        if ((needsbuffer & flags) == 0)
                                break;
                }
                /* A non-zero msleep() return ends the wait. */
                if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag,
                    waitmsg, slptimeo))
                        break;
        }
        mtx_unlock(&nblock);
}
2106
2107 static void
2108 getnewbuf_reuse_bp(struct buf *bp, int qindex)
2109 {
2110
2111         CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
2112             "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
2113              bp->b_kvasize, bp->b_bufsize, qindex);
2114         mtx_assert(&bqclean, MA_NOTOWNED);
2115
2116         /*
2117          * Note: we no longer distinguish between VMIO and non-VMIO
2118          * buffers.
2119          */
2120         KASSERT((bp->b_flags & B_DELWRI) == 0,
2121             ("delwri buffer %p found in queue %d", bp, qindex));
2122
2123         if (qindex == QUEUE_CLEAN) {
2124                 if (bp->b_flags & B_VMIO) {
2125                         bp->b_flags &= ~B_ASYNC;
2126                         vfs_vmio_release(bp);
2127                 }
2128                 if (bp->b_vp != NULL)
2129                         brelvp(bp);
2130         }
2131
2132         /*
2133          * Get the rest of the buffer freed up.  b_kva* is still valid
2134          * after this operation.
2135          */
2136
2137         if (bp->b_rcred != NOCRED) {
2138                 crfree(bp->b_rcred);
2139                 bp->b_rcred = NOCRED;
2140         }
2141         if (bp->b_wcred != NOCRED) {
2142                 crfree(bp->b_wcred);
2143                 bp->b_wcred = NOCRED;
2144         }
2145         if (!LIST_EMPTY(&bp->b_dep))
2146                 buf_deallocate(bp);
2147         if (bp->b_vflags & BV_BKGRDINPROG)
2148                 panic("losing buffer 3");
2149         KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.  qindex: %d",
2150             bp, bp->b_vp, qindex));
2151         KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
2152             ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
2153
2154         if (bp->b_bufsize)
2155                 allocbuf(bp, 0);
2156
2157         bp->b_flags &= B_UNMAPPED | B_KVAALLOC;
2158         bp->b_ioflags = 0;
2159         bp->b_xflags = 0;
2160         KASSERT((bp->b_flags & B_INFREECNT) == 0,
2161             ("buf %p still counted as free?", bp));
2162         bp->b_vflags = 0;
2163         bp->b_vp = NULL;
2164         bp->b_blkno = bp->b_lblkno = 0;
2165         bp->b_offset = NOOFFSET;
2166         bp->b_iodone = 0;
2167         bp->b_error = 0;
2168         bp->b_resid = 0;
2169         bp->b_bcount = 0;
2170         bp->b_npages = 0;
2171         bp->b_dirtyoff = bp->b_dirtyend = 0;
2172         bp->b_bufobj = NULL;
2173         bp->b_pin_count = 0;
2174         bp->b_fsprivate1 = NULL;
2175         bp->b_fsprivate2 = NULL;
2176         bp->b_fsprivate3 = NULL;
2177
2178         LIST_INIT(&bp->b_dep);
2179 }
2180
/*
 * Set by getnewbuf_scan() once bufspace reaches hibufspace and cleared
 * once it drops below lobufspace.  While set, reclaimed buffers that
 * still own KVA are released instead of being handed out.
 */
static int flushingbufs;

/*
 * Walk the free queues looking for a buffer header to recycle.
 *
 * maxsize:  only consulted to decide whether an EMPTY buffer may be used
 *           without pushing bufspace to hibufspace.
 * defrag:   when non-zero, the first buffer reclaimed has its KVA freed
 *           and is released with B_INVAL, then the scan restarts with
 *           defrag cleared.
 * unmapped: when non-zero (and not defragging), the scan starts on
 *           QUEUE_EMPTY.
 * metadata: when non-zero, EMPTY buffers may be used regardless of
 *           hibufspace, the EMPTY queue gets a second pass once the
 *           other queues are exhausted, and the overcommit handling at
 *           the end of the loop is skipped.
 *
 * Returns a buffer that is exclusively locked, removed from its queue
 * and reset by getnewbuf_reuse_bp(), with bqclean released; or NULL with
 * bqclean still held when no buffer could be taken.
 */
static struct buf *
getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
{
        struct buf *bp, *nbp;
        int nqindex, qindex, pass;

        KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));

        pass = 1;
restart:
        atomic_add_int(&getnewbufrestarts, 1);

        /*
         * Setup for scan.  If we do not have enough free buffers,
         * we setup a degenerate case that immediately fails.  Note
         * that if we are specially marked process, we are allowed to
         * dip into our reserves.
         *
         * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
         * for the allocation of the mapped buffer.  For unmapped, the
         * easiest is to start with EMPTY outright.
         *
         * Otherwise we start with EMPTYKVA, which is also the fallback
         * when the unmapped case finds the EMPTY queue empty.
         * However, there are a number of cases (defragging, reusing, ...)
         * where we cannot backup.
         */
        nbp = NULL;
        mtx_lock(&bqclean);
        if (!defrag && unmapped) {
                nqindex = QUEUE_EMPTY;
                nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
        }
        if (nbp == NULL) {
                nqindex = QUEUE_EMPTYKVA;
                nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
        }

        /*
         * If no EMPTYKVA buffers and we are either defragging or
         * reusing, locate a CLEAN buffer to free or reuse.  If
         * bufspace usage is low skip this step so we can allocate a
         * new buffer.
         */
        if (nbp == NULL && (defrag || bufspace >= lobufspace)) {
                nqindex = QUEUE_CLEAN;
                nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
        }

        /*
         * If we could not find or were not allowed to reuse a CLEAN
         * buffer, check to see if it is ok to use an EMPTY buffer.
         * We can only use an EMPTY buffer if allocating its KVA would
         * not otherwise run us out of buffer space.  No KVA is needed
         * for the unmapped allocation.  Metadata requests are exempt
         * from the space check.
         */
        if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace ||
            metadata)) {
                nqindex = QUEUE_EMPTY;
                nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
        }

        /*
         * All available buffers might be clean, retry ignoring the
         * lobufspace as the last resort.
         */
        if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) {
                nqindex = QUEUE_CLEAN;
                nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
        }

        /*
         * Run scan, possibly freeing data and/or kva mappings on the fly
         * depending.
         */
        while ((bp = nbp) != NULL) {
                qindex = nqindex;

                /*
                 * Calculate next bp (we can only use it if we do not
                 * block or do other fancy things).  When the current
                 * queue is exhausted, move on to the next queue in the
                 * EMPTY -> EMPTYKVA -> CLEAN order.
                 */
                if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
                        switch (qindex) {
                        case QUEUE_EMPTY:
                                nqindex = QUEUE_EMPTYKVA;
                                nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
                                if (nbp != NULL)
                                        break;
                                /* FALLTHROUGH */
                        case QUEUE_EMPTYKVA:
                                nqindex = QUEUE_CLEAN;
                                nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
                                if (nbp != NULL)
                                        break;
                                /* FALLTHROUGH */
                        case QUEUE_CLEAN:
                                /*
                                 * Metadata requests get one extra pass
                                 * over the EMPTY queue.
                                 */
                                if (metadata && pass == 1) {
                                        pass = 2;
                                        nqindex = QUEUE_EMPTY;
                                        nbp = TAILQ_FIRST(
                                            &bufqueues[QUEUE_EMPTY]);
                                }
                                /*
                                 * nbp is NULL here unless the metadata
                                 * second pass above found an EMPTY
                                 * buffer.
                                 */
                                break;
                        }
                }
                /*
                 * If we are defragging then we need a buffer with
                 * b_kvasize != 0.  XXX this situation should no longer
                 * occur, if defrag is non-zero the buffer's b_kvasize
                 * should also be non-zero at this point.  XXX
                 */
                if (defrag && bp->b_kvasize == 0) {
                        printf("Warning: defrag empty buffer %p\n", bp);
                        continue;
                }

                /*
                 * Start freeing the bp.  This is somewhat involved.  nbp
                 * remains valid only for QUEUE_EMPTY[KVA] bp's.
                 * Buffers that cannot be locked without sleeping are
                 * skipped.
                 */
                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
                        continue;
                /*
                 * BKGRDINPROG can only be set with the buf and bufobj
                 * locks both held.  We tolerate a race to clear it here.
                 */
                if (bp->b_vflags & BV_BKGRDINPROG) {
                        BUF_UNLOCK(bp);
                        continue;
                }

                KASSERT(bp->b_qindex == qindex,
                    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));

                bremfreel(bp);
                mtx_unlock(&bqclean);
                /*
                 * NOTE:  nbp is now entirely invalid.  We can only restart
                 * the scan from this point on.
                 */

                getnewbuf_reuse_bp(bp, qindex);
                mtx_assert(&bqclean, MA_NOTOWNED);

                /*
                 * If we are defragging then free the buffer.
                 */
                if (defrag) {
                        bp->b_flags |= B_INVAL;
                        bfreekva(bp);
                        brelse(bp);
                        defrag = 0;
                        goto restart;
                }

                /*
                 * Notify any waiters for the buffer lock about
                 * identity change by freeing the buffer.
                 */
                if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) {
                        bp->b_flags |= B_INVAL;
                        bfreekva(bp);
                        brelse(bp);
                        goto restart;
                }

                /* Metadata requests take the buffer as is. */
                if (metadata)
                        break;

                /*
                 * If we are overcommitted then recover the buffer and its
                 * KVM space.  This occurs in rare situations when multiple
                 * processes are blocked in getnewbuf() or allocbuf().
                 */
                if (bufspace >= hibufspace)
                        flushingbufs = 1;
                if (flushingbufs && bp->b_kvasize != 0) {
                        bp->b_flags |= B_INVAL;
                        bfreekva(bp);
                        brelse(bp);
                        goto restart;
                }
                if (bufspace < lobufspace)
                        flushingbufs = 0;
                break;
        }
        return (bp);
}
2374
2375 /*
2376  *      getnewbuf:
2377  *
2378  *      Find and initialize a new buffer header, freeing up existing buffers
2379  *      in the bufqueues as necessary.  The new buffer is returned locked.
2380  *
2381  *      Important:  B_INVAL is not set.  If the caller wishes to throw the
2382  *      buffer away, the caller must set B_INVAL prior to calling brelse().
2383  *
2384  *      We block if:
2385  *              We have insufficient buffer headers
2386  *              We have insufficient buffer space
2387  *              buffer_arena is too fragmented ( space reservation fails )
2388  *              If we have to flush dirty buffers ( but we try to avoid this )
2389  */
2390 static struct buf *
2391 getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
2392     int gbflags)
2393 {
2394         struct buf *bp;
2395         int defrag, metadata;
2396
2397         KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
2398             ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
2399         if (!unmapped_buf_allowed)
2400                 gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
2401
2402         defrag = 0;
2403         if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
2404             vp->v_type == VCHR)
2405                 metadata = 1;
2406         else
2407                 metadata = 0;
2408         /*
2409          * We can't afford to block since we might be holding a vnode lock,
2410          * which may prevent system daemons from running.  We deal with
2411          * low-memory situations by proactively returning memory and running
2412          * async I/O rather then sync I/O.
2413          */
2414         atomic_add_int(&getnewbufcalls, 1);
2415         atomic_subtract_int(&getnewbufrestarts, 1);
2416 restart:
2417         bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
2418             GB_KVAALLOC)) == GB_UNMAPPED, metadata);
2419         if (bp != NULL)
2420                 defrag = 0;
2421
2422         /*
2423          * If we exhausted our list, sleep as appropriate.  We may have to
2424          * wakeup various daemons and write out some dirty buffers.
2425          *
2426          * Generally we are sleeping due to insufficient buffer space.
2427          */
2428         if (bp == NULL) {
2429                 mtx_assert(&bqclean, MA_OWNED);
2430                 getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
2431                 mtx_assert(&bqclean, MA_NOTOWNED);
2432         } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
2433                 mtx_assert(&bqclean, MA_NOTOWNED);
2434
2435                 bfreekva(bp);
2436                 bp->b_flags |= B_UNMAPPED;
2437                 bp->b_kvabase = bp->b_data = unmapped_buf;
2438                 bp->b_kvasize = maxsize;
2439                 atomic_add_long(&bufspace, bp->b_kvasize);
2440                 atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
2441                 atomic_add_int(&bufreusecnt, 1);
2442         } else {
2443                 mtx_assert(&bqclean, MA_NOTOWNED);
2444
2445                 /*
2446                  * We finally have a valid bp.  We aren't quite out of the
2447                  * woods, we still have to reserve kva space.  In order
2448                  * to keep fragmentation sane we only allocate kva in
2449                  * BKVASIZE chunks.
2450                  */
2451                 maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
2452
2453                 if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED |
2454                     B_KVAALLOC)) == B_UNMAPPED) {
2455                         if (allocbufkva(bp, maxsize, gbflags)) {
2456                                 defrag = 1;
2457                                 bp->b_flags |= B_INVAL;
2458                                 brelse(bp);
2459                                 goto restart;
2460                         }
2461                         atomic_add_int(&bufreusecnt, 1);
2462                 } else if ((bp->b_flags & B_KVAALLOC) != 0 &&
2463                     (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) {
2464                         /*
2465                          * If the reused buffer has KVA allocated,
2466                          * reassign b_kvaalloc to b_kvabase.
2467                          */
2468                         bp->b_kvabase = bp->b_kvaalloc;
2469                         bp->b_flags &= ~B_KVAALLOC;
2470                         atomic_subtract_long(&unmapped_bufspace,
2471                             bp->b_kvasize);
2472                         atomic_add_int(&bufreusecnt, 1);
2473                 } else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
2474                     (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED |
2475                     GB_KVAALLOC)) {
2476                         /*
2477                          * The case of reused buffer already have KVA
2478                          * mapped, but the request is for unmapped
2479                          * buffer with KVA allocated.
2480                          */
2481                         bp->b_kvaalloc = bp->b_kvabase;
2482                         bp->b_data = bp->b_kvabase = unmapped_buf;
2483                         bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
2484                         atomic_add_long(&unmapped_bufspace,
2485                             bp->b_kvasize);
2486                         atomic_add_int(&bufreusecnt, 1);
2487                 }
2488                 if ((gbflags & GB_UNMAPPED) == 0) {
2489                         bp->b_saveaddr = bp->b_kvabase;
2490                         bp->b_data = bp->b_saveaddr;
2491                         bp->b_flags &= ~B_UNMAPPED;
2492                         BUF_CHECK_MAPPED(bp);
2493                 }
2494         }
2495         return (bp);
2496 }
2497
2498 /*
2499  *      buf_daemon:
2500  *
2501  *      buffer flushing daemon.  Buffers are normally flushed by the
2502  *      update daemon but if it cannot keep up this process starts to
2503  *      take the load in an attempt to prevent getnewbuf() from blocking.
2504  */
2505
/*
 * Descriptor for the buffer daemon kernel process.  It carries the string
 * "bufdaemon", the buf_daemon() function and the address of bufdaemonproc
 * (presumably process name, entry point and where the created process
 * pointer is stored -- confirm against struct kproc_desc).
 */
static struct kproc_desc buf_kp = {
	"bufdaemon",
	buf_daemon,
	&bufdaemonproc
};
/* Hand the descriptor to kproc_start() at SI_SUB_KTHREAD_BUF. */
SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
2512
/*
 * Flush up to 'target' dirty buffers and return the count reported by
 * flushbufqueues().  Buffers with rollback dependencies are only
 * considered when a first pass without them flushed nothing.
 */
static int
buf_flush(int target)
{
	int nflushed;

	nflushed = flushbufqueues(target, 0);
	if (nflushed != 0)
		return (nflushed);

	/*
	 * Could not find any buffers without rollback
	 * dependencies, so just write the first one
	 * in the hopes of eventually making progress.
	 */
	return (flushbufqueues(target, 1));
}
2529
2530 static void
2531 buf_daemon()
2532 {
2533         int lodirty;
2534
2535         /*
2536          * This process needs to be suspended prior to shutdown sync.
2537          */
2538         EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
2539             SHUTDOWN_PRI_LAST);
2540
2541         /*
2542          * This process is allowed to take the buffer cache to the limit
2543          */
2544         curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
2545         mtx_lock(&bdlock);
2546         for (;;) {
2547                 bd_request = 0;
2548                 mtx_unlock(&bdlock);
2549
2550                 kproc_suspend_check(bufdaemonproc);
2551                 lodirty = lodirtybuffers;
2552                 if (bd_speedupreq) {
2553                         lodirty = numdirtybuffers / 2;
2554                         bd_speedupreq = 0;
2555                 }
2556                 /*
2557                  * Do the flush.  Limit the amount of in-transit I/O we
2558                  * allow to build up, otherwise we would completely saturate
2559                  * the I/O system.
2560                  */
2561                 while (numdirtybuffers > lodirty) {
2562                         if (buf_flush(numdirtybuffers - lodirty) == 0)
2563                                 break;
2564                         kern_yield(PRI_USER);
2565                 }
2566
2567                 /*
2568                  * Only clear bd_request if we have reached our low water
2569                  * mark.  The buf_daemon normally waits 1 second and
2570                  * then incrementally flushes any dirty buffers that have
2571                  * built up, within reason.
2572                  *
2573                  * If we were unable to hit our low water mark and couldn't
2574                  * find any flushable buffers, we sleep for a short period
2575                  * to avoid endless loops on unlockable buffers.
2576                  */
2577                 mtx_lock(&bdlock);
2578                 if (numdirtybuffers <= lodirtybuffers) {
2579                         /*
2580                          * We reached our low water mark, reset the
2581                          * request and sleep until we are needed again.
2582                          * The sleep is just so the suspend code works.
2583                          */
2584                         bd_request = 0;
2585                         /*
2586                          * Do an extra wakeup in case dirty threshold
2587                          * changed via sysctl and the explicit transition
2588                          * out of shortfall was missed.
2589                          */
2590                         bdirtywakeup();
2591                         if (runningbufspace <= lorunningspace)
2592                                 runningwakeup();
2593                         msleep(&bd_request, &bdlock, PVM, "psleep", hz);
2594                 } else {
2595                         /*
2596                          * We couldn't find any flushable dirty buffers but
2597                          * still have too many dirty buffers, we
2598                          * have to sleep and try again.  (rare)
2599                          */
2600                         msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
2601                 }
2602         }
2603 }
2604
2605 /*
2606  *      flushbufqueues:
2607  *
2608  *      Try to flush a buffer in the dirty queue.  We must be careful to
2609  *      free up B_INVAL buffers instead of writing them, which NFS is
2610  *      particularly sensitive to.
2611  */
2612 static int flushwithdeps = 0;
2613 SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
2614     0, "Number of buffers flushed with dependecies that require rollbacks");
2615
2616 static int
2617 flushbufqueues(int target, int flushdeps)
2618 {
2619         struct buf *sentinel;
2620         struct vnode *vp;
2621         struct mount *mp;
2622         struct buf *bp;
2623         int hasdeps;
2624         int flushed;
2625         int queue;
2626         int error;
2627
2628         flushed = 0;
2629         queue = QUEUE_DIRTY;
2630         bp = NULL;
2631         sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
2632         sentinel->b_qindex = QUEUE_SENTINEL;
2633         mtx_lock(&bqdirty);
2634         TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
2635         mtx_unlock(&bqdirty);
2636         while (flushed != target) {
2637                 maybe_yield();
2638                 mtx_lock(&bqdirty);
2639                 bp = TAILQ_NEXT(sentinel, b_freelist);
2640                 if (bp != NULL) {
2641                         TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
2642                         TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
2643                             b_freelist);
2644                 } else {
2645                         mtx_unlock(&bqdirty);
2646                         break;
2647                 }
2648                 KASSERT(bp->b_qindex != QUEUE_SENTINEL,
2649                     ("parallel calls to flushbufqueues() bp %p", bp));
2650                 error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
2651                 mtx_unlock(&bqdirty);
2652                 if (error != 0)
2653                         continue;
2654                 if (bp->b_pin_count > 0) {
2655                         BUF_UNLOCK(bp);
2656                         continue;
2657                 }
2658                 /*
2659                  * BKGRDINPROG can only be set with the buf and bufobj
2660                  * locks both held.  We tolerate a race to clear it here.
2661                  */
2662                 if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
2663                     (bp->b_flags & B_DELWRI) == 0) {
2664                         BUF_UNLOCK(bp);
2665                         continue;
2666                 }
2667                 if (bp->b_flags & B_INVAL) {
2668                         bremfreef(bp);
2669                         brelse(bp);
2670                         flushed++;
2671                         continue;
2672                 }
2673
2674                 if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
2675                         if (flushdeps == 0) {
2676                                 BUF_UNLOCK(bp);
2677                                 continue;
2678                         }
2679                         hasdeps = 1;
2680                 } else
2681                         hasdeps = 0;
2682                 /*
2683                  * We must hold the lock on a vnode before writing
2684                  * one of its buffers. Otherwise we may confuse, or
2685                  * in the case of a snapshot vnode, deadlock the
2686                  * system.
2687                  *
2688                  * The lock order here is the reverse of the normal
2689                  * of vnode followed by buf lock.  This is ok because
2690                  * the NOWAIT will prevent deadlock.
2691                  */
2692                 vp = bp->b_vp;
2693                 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
2694                         BUF_UNLOCK(bp);
2695                         continue;
2696                 }
2697                 error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
2698                 if (error == 0) {
2699                         CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
2700                             bp, bp->b_vp, bp->b_flags);
2701                         vfs_bio_awrite(bp);
2702                         vn_finished_write(mp);
2703                         VOP_UNLOCK(vp, 0);
2704                         flushwithdeps += hasdeps;
2705                         flushed++;
2706                         if (runningbufspace > hirunningspace)
2707                                 waitrunningbufspace();
2708                         continue;
2709                 }
2710                 vn_finished_write(mp);
2711                 BUF_UNLOCK(bp);
2712         }
2713         mtx_lock(&bqdirty);
2714         TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
2715         mtx_unlock(&bqdirty);
2716         free(sentinel, M_TEMP);
2717         return (flushed);
2718 }
2719
2720 /*
2721  * Check to see if a block is currently memory resident.
2722  */
2723 struct buf *
2724 incore(struct bufobj *bo, daddr_t blkno)
2725 {
2726         struct buf *bp;
2727
2728         BO_RLOCK(bo);
2729         bp = gbincore(bo, blkno);
2730         BO_RUNLOCK(bo);
2731         return (bp);
2732 }
2733
2734 /*
2735  * Returns true if no I/O is needed to access the
2736  * associated VM object.  This is like incore except
2737  * it also hunts around in the VM system for the data.
2738  */
2739
2740 static int
2741 inmem(struct vnode * vp, daddr_t blkno)
2742 {
2743         vm_object_t obj;
2744         vm_offset_t toff, tinc, size;
2745         vm_page_t m;
2746         vm_ooffset_t off;
2747
2748         ASSERT_VOP_LOCKED(vp, "inmem");
2749
2750         if (incore(&vp->v_bufobj, blkno))
2751                 return 1;
2752         if (vp->v_mount == NULL)
2753                 return 0;
2754         obj = vp->v_object;
2755         if (obj == NULL)
2756                 return (0);
2757
2758         size = PAGE_SIZE;
2759         if (size > vp->v_mount->mnt_stat.f_iosize)
2760                 size = vp->v_mount->mnt_stat.f_iosize;
2761         off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
2762
2763         VM_OBJECT_RLOCK(obj);
2764         for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
2765                 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
2766                 if (!m)
2767                         goto notinmem;
2768                 tinc = size;
2769                 if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
2770                         tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
2771                 if (vm_page_is_valid(m,
2772                     (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
2773                         goto notinmem;
2774         }
2775         VM_OBJECT_RUNLOCK(obj);
2776         return 1;
2777
2778 notinmem:
2779         VM_OBJECT_RUNLOCK(obj);
2780         return (0);
2781 }
2782
2783 /*
2784  * Set the dirty range for a buffer based on the status of the dirty
2785  * bits in the pages comprising the buffer.  The range is limited
2786  * to the size of the buffer.
2787  *
2788  * Tell the VM system that the pages associated with this buffer
2789  * are clean.  This is used for delayed writes where the data is
2790  * going to go to disk eventually without additional VM intevention.
2791  *
2792  * Note that while we only really need to clean through to b_bcount, we
2793  * just go ahead and clean through to b_bufsize.
2794  */
2795 static void
2796 vfs_clean_pages_dirty_buf(struct buf *bp)
2797 {
2798         vm_ooffset_t foff, noff, eoff;
2799         vm_page_t m;
2800         int i;
2801
2802         if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
2803                 return;
2804
2805         foff = bp->b_offset;
2806         KASSERT(bp->b_offset != NOOFFSET,
2807             ("vfs_clean_pages_dirty_buf: no buffer offset"));
2808
2809         VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
2810         vfs_drain_busy_pages(bp);
2811         vfs_setdirty_locked_object(bp);
2812         for (i = 0; i < bp->b_npages; i++) {
2813                 noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
2814                 eoff = noff;
2815                 if (eoff > bp->b_offset + bp->b_bufsize)
2816                         eoff = bp->b_offset + bp->b_bufsize;
2817                 m = bp->b_pages[i];
2818                 vfs_page_set_validclean(bp, foff, m);
2819                 /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
2820                 foff = noff;
2821         }
2822         VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
2823 }
2824
2825 static void
2826 vfs_setdirty_locked_object(struct buf *bp)
2827 {
2828         vm_object_t object;
2829         int i;
2830
2831         object = bp->b_bufobj->bo_object;
2832         VM_OBJECT_ASSERT_WLOCKED(object);
2833
2834         /*
2835          * We qualify the scan for modified pages on whether the
2836          * object has been flushed yet.
2837          */
2838         if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
2839                 vm_offset_t boffset;
2840                 vm_offset_t eoffset;
2841
2842                 /*
2843                  * test the pages to see if they have been modified directly
2844                  * by users through the VM system.
2845                  */
2846                 for (i = 0; i < bp->b_npages; i++)
2847                         vm_page_test_dirty(bp->b_pages[i]);
2848
2849                 /*
2850                  * Calculate the encompassing dirty range, boffset and eoffset,
2851                  * (eoffset - boffset) bytes.
2852                  */
2853
2854                 for (i = 0; i < bp->b_npages; i++) {
2855                         if (bp->b_pages[i]->dirty)
2856                                 break;
2857                 }
2858                 boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
2859
2860                 for (i = bp->b_npages - 1; i >= 0; --i) {
2861                         if (bp->b_pages[i]->dirty) {
2862                                 break;
2863                         }
2864                 }
2865                 eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
2866
2867                 /*
2868                  * Fit it to the buffer.
2869                  */
2870
2871                 if (eoffset > bp->b_bcount)
2872                         eoffset = bp->b_bcount;
2873
2874                 /*
2875                  * If we have a good dirty range, merge with the existing
2876                  * dirty range.
2877                  */
2878
2879                 if (boffset < eoffset) {
2880                         if (bp->b_dirtyoff > boffset)
2881                                 bp->b_dirtyoff = boffset;
2882                         if (bp->b_dirtyend < eoffset)
2883                                 bp->b_dirtyend = eoffset;
2884                 }
2885         }
2886 }
2887
2888 /*
2889  * Allocate the KVA mapping for an existing buffer. It handles the
2890  * cases of both B_UNMAPPED buffer, and buffer with the preallocated
2891  * KVA which is not mapped (B_KVAALLOC).
2892  */
2893 static void
2894 bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
2895 {
2896         struct buf *scratch_bp;
2897         int bsize, maxsize, need_mapping, need_kva;
2898         off_t offset;
2899
2900         need_mapping = (bp->b_flags & B_UNMAPPED) != 0 &&
2901             (gbflags & GB_UNMAPPED) == 0;
2902         need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED &&
2903             (gbflags & GB_KVAALLOC) != 0;
2904         if (!need_mapping && !need_kva)
2905                 return;
2906
2907         BUF_CHECK_UNMAPPED(bp);
2908
2909         if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) {
2910                 /*
2911                  * Buffer is not mapped, but the KVA was already
2912                  * reserved at the time of the instantiation.  Use the
2913                  * allocated space.
2914                  */
2915                 bp->b_flags &= ~B_KVAALLOC;
2916                 KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0"));
2917                 bp->b_kvabase = bp->b_kvaalloc;
2918                 atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
2919                 goto has_addr;
2920         }
2921
2922         /*
2923          * Calculate the amount of the address space we would reserve
2924          * if the buffer was mapped.
2925          */
2926         bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
2927         offset = blkno * bsize;
2928         maxsize = size + (offset & PAGE_MASK);
2929         maxsize = imax(maxsize, bsize);
2930
2931 mapping_loop:
2932         if (allocbufkva(bp, maxsize, gbflags)) {
2933                 /*
2934                  * Request defragmentation. getnewbuf() returns us the
2935                  * allocated space by the scratch buffer KVA.
2936                  */
2937                 scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags |
2938                     (GB_UNMAPPED | GB_KVAALLOC));
2939                 if (scratch_bp == NULL) {
2940                         if ((gbflags & GB_NOWAIT_BD) != 0) {
2941                                 /*
2942                                  * XXXKIB: defragmentation cannot
2943                                  * succeed, not sure what else to do.
2944                                  */
2945                                 panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp);
2946                         }
2947                         atomic_add_int(&mappingrestarts, 1);
2948                         goto mapping_loop;
2949                 }
2950                 KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0,
2951                     ("scratch bp !B_KVAALLOC %p", scratch_bp));
2952                 setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc,
2953                     scratch_bp->b_kvasize, gbflags);
2954
2955                 /* Get rid of the scratch buffer. */
2956                 scratch_bp->b_kvasize = 0;
2957                 scratch_bp->b_flags |= B_INVAL;
2958                 scratch_bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
2959                 brelse(scratch_bp);
2960         }
2961         if (!need_mapping)
2962                 return;
2963
2964 has_addr:
2965         bp->b_saveaddr = bp->b_kvabase;
2966         bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */
2967         bp->b_flags &= ~B_UNMAPPED;
2968         BUF_CHECK_MAPPED(bp);
2969         bpmap_qenter(bp);
2970 }
2971
2972 /*
2973  *      getblk:
2974  *
2975  *      Get a block given a specified block and offset into a file/device.
2976  *      The buffers B_DONE bit will be cleared on return, making it almost
2977  *      ready for an I/O initiation.  B_INVAL may or may not be set on
2978  *      return.  The caller should clear B_INVAL prior to initiating a
2979  *      READ.
2980  *
2981  *      For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
2982  *      an existing buffer.
2983  *
2984  *      For a VMIO buffer, B_CACHE is modified according to the backing VM.
2985  *      If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
2986  *      and then cleared based on the backing VM.  If the previous buffer is
2987  *      non-0-sized but invalid, B_CACHE will be cleared.
2988  *
2989  *      If getblk() must create a new buffer, the new buffer is returned with
2990  *      both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
2991  *      case it is returned with B_INVAL clear and B_CACHE set based on the
2992  *      backing VM.
2993  *
2994  *      getblk() also forces a bwrite() for any B_DELWRI buffer whose
2995  *      B_CACHE bit is clear.
2996  *
2997  *      What this means, basically, is that the caller should use B_CACHE to
2998  *      determine whether the buffer is fully valid or not and should clear
2999  *      B_INVAL prior to issuing a read.  If the caller intends to validate
3000  *      the buffer by loading its data area with something, the caller needs
3001  *      to clear B_INVAL.  If the caller does this without issuing an I/O,
3002  *      the caller should set B_CACHE ( as an optimization ), else the caller
3003  *      should issue the I/O and biodone() will set B_CACHE if the I/O was
3004  *      a write attempt or if it was a successful read.  If the caller
3005  *      intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
3006  *      prior to issuing the READ.  biodone() will *not* clear B_INVAL.
3007  */
3008 struct buf *
3009 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
3010     int flags)
3011 {
3012         struct buf *bp;
3013         struct bufobj *bo;
3014         int bsize, error, maxsize, vmio;
3015         off_t offset;
3016
3017         CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
3018         KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
3019             ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
3020         ASSERT_VOP_LOCKED(vp, "getblk");
3021         if (size > MAXBSIZE)
3022                 panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
3023         if (!unmapped_buf_allowed)
3024                 flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
3025
3026         bo = &vp->v_bufobj;
3027 loop:
3028         BO_RLOCK(bo);
3029         bp = gbincore(bo, blkno);
3030         if (bp != NULL) {
3031                 int lockflags;
3032                 /*
3033                  * Buffer is in-core.  If the buffer is not busy nor managed,
3034                  * it must be on a queue.
3035                  */
3036                 lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
3037
3038                 if (flags & GB_LOCK_NOWAIT)
3039                         lockflags |= LK_NOWAIT;
3040
3041                 error = BUF_TIMELOCK(bp, lockflags,
3042                     BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
3043
3044                 /*
3045                  * If we slept and got the lock we have to restart in case
3046                  * the buffer changed identities.
3047                  */
3048                 if (error == ENOLCK)
3049                         goto loop;
3050                 /* We timed out or were interrupted. */
3051                 else if (error)
3052                         return (NULL);
3053                 /* If recursed, assume caller knows the rules. */
3054                 else if (BUF_LOCKRECURSED(bp))
3055                         goto end;
3056
3057                 /*
3058                  * The buffer is locked.  B_CACHE is cleared if the buffer is
3059                  * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
3060                  * and for a VMIO buffer B_CACHE is adjusted according to the
3061                  * backing VM cache.
3062                  */
3063                 if (bp->b_flags & B_INVAL)
3064                         bp->b_flags &= ~B_CACHE;
3065                 else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
3066                         bp->b_flags |= B_CACHE;
3067                 if (bp->b_flags & B_MANAGED)
3068                         MPASS(bp->b_qindex == QUEUE_NONE);
3069                 else
3070                         bremfree(bp);
3071
3072                 /*
3073                  * check for size inconsistencies for non-VMIO case.
3074                  */
3075                 if (bp->b_bcount != size) {
3076                         if ((bp->b_flags & B_VMIO) == 0 ||
3077                             (size > bp->b_kvasize)) {
3078                                 if (bp->b_flags & B_DELWRI) {
3079                                         /*
3080                                          * If buffer is pinned and caller does
3081                                          * not want sleep  waiting for it to be
3082                                          * unpinned, bail out
3083                                          * */
3084                                         if (bp->b_pin_count > 0) {
3085                                                 if (flags & GB_LOCK_NOWAIT) {
3086                                                         bqrelse(bp);
3087                                                         return (NULL);
3088                                                 } else {
3089                                                         bunpin_wait(bp);
3090                                                 }
3091                                         }
3092                                         bp->b_flags |= B_NOCACHE;
3093                                         bwrite(bp);
3094                                 } else {
3095                                         if (LIST_EMPTY(&bp->b_dep)) {
3096                                                 bp->b_flags |= B_RELBUF;
3097                                                 brelse(bp);
3098                                         } else {
3099                                                 bp->b_flags |= B_NOCACHE;
3100                                                 bwrite(bp);
3101                                         }
3102                                 }
3103                                 goto loop;
3104                         }
3105                 }
3106
3107                 /*
3108                  * Handle the case of unmapped buffer which should
3109                  * become mapped, or the buffer for which KVA
3110                  * reservation is requested.
3111                  */
3112                 bp_unmapped_get_kva(bp, blkno, size, flags);
3113
3114                 /*
3115                  * If the size is inconsistant in the VMIO case, we can resize
3116                  * the buffer.  This might lead to B_CACHE getting set or
3117                  * cleared.  If the size has not changed, B_CACHE remains
3118                  * unchanged from its previous state.
3119                  */
3120                 if (bp->b_bcount != size)
3121                         allocbuf(bp, size);
3122
3123                 KASSERT(bp->b_offset != NOOFFSET,
3124                     ("getblk: no buffer offset"));
3125
3126                 /*
3127                  * A buffer with B_DELWRI set and B_CACHE clear must
3128                  * be committed before we can return the buffer in
3129                  * order to prevent the caller from issuing a read
3130                  * ( due to B_CACHE not being set ) and overwriting
3131                  * it.
3132                  *
3133                  * Most callers, including NFS and FFS, need this to
3134                  * operate properly either because they assume they
3135                  * can issue a read if B_CACHE is not set, or because
3136                  * ( for example ) an uncached B_DELWRI might loop due
3137                  * to softupdates re-dirtying the buffer.  In the latter
3138                  * case, B_CACHE is set after the first write completes,
3139                  * preventing further loops.
3140                  * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
3141                  * above while extending the buffer, we cannot allow the
3142                  * buffer to remain with B_CACHE set after the write
3143                  * completes or it will represent a corrupt state.  To
3144                  * deal with this we set B_NOCACHE to scrap the buffer
3145                  * after the write.
3146                  *
3147                  * We might be able to do something fancy, like setting
3148                  * B_CACHE in bwrite() except if B_DELWRI is already set,
3149                  * so the below call doesn't set B_CACHE, but that gets real
3150                  * confusing.  This is much easier.
3151                  */
3152
3153                 if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
3154                         bp->b_flags |= B_NOCACHE;
3155                         bwrite(bp);
3156                         goto loop;
3157                 }
3158                 bp->b_flags &= ~B_DONE;
3159         } else {
3160                 /*
3161                  * Buffer is not in-core, create new buffer.  The buffer
3162                  * returned by getnewbuf() is locked.  Note that the returned
3163                  * buffer is also considered valid (not marked B_INVAL).
3164                  */
3165                 BO_RUNLOCK(bo);
3166                 /*
3167                  * If the user does not want us to create the buffer, bail out
3168                  * here.
3169                  */
3170                 if (flags & GB_NOCREAT)
3171                         return NULL;
3172                 if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
3173                         return NULL;
3174
3175                 bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
3176                 offset = blkno * bsize;
3177                 vmio = vp->v_object != NULL;
3178                 if (vmio) {
3179                         maxsize = size + (offset & PAGE_MASK);
3180                 } else {
3181                         maxsize = size;
3182                         /* Do not allow non-VMIO notmapped buffers. */
3183                         flags &= ~GB_UNMAPPED;
3184                 }
3185                 maxsize = imax(maxsize, bsize);
3186
3187                 bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
3188                 if (bp == NULL) {
3189                         if (slpflag || slptimeo)
3190                                 return NULL;
3191                         goto loop;
3192                 }
3193
3194                 /*
3195                  * This code is used to make sure that a buffer is not
3196                  * created while the getnewbuf routine is blocked.
3197                  * This can be a problem whether the vnode is locked or not.
3198                  * If the buffer is created out from under us, we have to
3199                  * throw away the one we just created.
3200                  *
3201                  * Note: this must occur before we associate the buffer
3202                  * with the vp especially considering limitations in
3203                  * the splay tree implementation when dealing with duplicate
3204                  * lblkno's.
3205                  */
3206                 BO_LOCK(bo);
3207                 if (gbincore(bo, blkno)) {
3208                         BO_UNLOCK(bo);
3209                         bp->b_flags |= B_INVAL;
3210                         brelse(bp);
3211                         goto loop;
3212                 }
3213
3214                 /*
3215                  * Insert the buffer into the hash, so that it can
3216                  * be found by incore.
3217                  */
3218                 bp->b_blkno = bp->b_lblkno = blkno;
3219                 bp->b_offset = offset;
3220                 bgetvp(vp, bp);
3221                 BO_UNLOCK(bo);
3222
3223                 /*
3224                  * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
3225                  * buffer size starts out as 0, B_CACHE will be set by
3226                  * allocbuf() for the VMIO case prior to it testing the
3227                  * backing store for validity.
3228                  */
3229
3230                 if (vmio) {
3231                         bp->b_flags |= B_VMIO;
3232                         KASSERT(vp->v_object == bp->b_bufobj->bo_object,
3233                             ("ARGH! different b_bufobj->bo_object %p %p %p\n",
3234                             bp, vp->v_object, bp->b_bufobj->bo_object));
3235                 } else {
3236                         bp->b_flags &= ~B_VMIO;
3237                         KASSERT(bp->b_bufobj->bo_object == NULL,
3238                             ("ARGH! has b_bufobj->bo_object %p %p\n",
3239                             bp, bp->b_bufobj->bo_object));
3240                         BUF_CHECK_MAPPED(bp);
3241                 }
3242
3243                 allocbuf(bp, size);
3244                 bp->b_flags &= ~B_DONE;
3245         }
3246         CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
3247         BUF_ASSERT_HELD(bp);
3248 end:
3249         KASSERT(bp->b_bufobj == bo,
3250             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
3251         return (bp);
3252 }
3253
3254 /*
3255  * Get an empty, disassociated buffer of given size.  The buffer is initially
3256  * set to B_INVAL.
3257  */
3258 struct buf *
3259 geteblk(int size, int flags)
3260 {
3261         struct buf *bp;
3262         int maxsize;
3263
3264         maxsize = (size + BKVAMASK) & ~BKVAMASK;
3265         while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) {
3266                 if ((flags & GB_NOWAIT_BD) &&
3267                     (curthread->td_pflags & TDP_BUFNEED) != 0)
3268                         return (NULL);
3269         }
3270         allocbuf(bp, size);
3271         bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
3272         BUF_ASSERT_HELD(bp);
3273         return (bp);
3274 }
3275
3276
3277 /*
3278  * This code constitutes the buffer memory from either anonymous system
3279  * memory (in the case of non-VMIO operations) or from an associated
3280  * VM object (in the case of VMIO operations).  This code is able to
3281  * resize a buffer up or down.
3282  *
3283  * Note that this code is tricky, and has many complications to resolve
3284  * deadlock or inconsistent data situations.  Tread lightly!!!
3285  * There are B_CACHE and B_DELWRI interactions that must be dealt with by
3286  * the caller.  Calling this code willy nilly can result in the loss of data.
3287  *
3288  * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
3289  * B_CACHE for the non-VMIO case.
      *
      * Arguments:
      *   bp    buffer to resize; it must be held by the caller, which is
      *         checked with BUF_ASSERT_HELD().
      *   size  new b_bcount in bytes.  The function panics if size is
      *         larger than bp->b_kvasize.
      *
      * The function also panics if a B_VMIO buffer has B_MALLOC set.
      * Every return path in this function returns 1.
3290  */
3291
3292 int
3293 allocbuf(struct buf *bp, int size)
3294 {
3295         int newbsize, mbsize;
3296         int i;
3297
3298         BUF_ASSERT_HELD(bp);
3299
3300         if (bp->b_kvasize < size)
3301                 panic("allocbuf: buffer too small");
3302
3303         if ((bp->b_flags & B_VMIO) == 0) {
3304                 caddr_t origbuf;
3305                 int origbufsize;
3306                 /*
3307                  * Just get anonymous memory from the kernel.  Don't
3308                  * mess with B_CACHE.
                      *
                      * mbsize is size rounded up to a multiple of DEV_BSIZE.
                      * newbsize is mbsize for a B_MALLOC buffer and the
                      * page-rounded size otherwise.
3309                  */
3310                 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
3311                 if (bp->b_flags & B_MALLOC)
3312                         newbsize = mbsize;
3313                 else
3314                         newbsize = round_page(size);
3315
3316                 if (newbsize < bp->b_bufsize) {
3317                         /*
3318                          * malloced buffers are not shrunk
                              * (only b_bcount is lowered), unless the new
                              * size is zero, in which case the malloced
                              * memory is freed and b_data is pointed back
                              * at b_kvabase.
3319                          */
3320                         if (bp->b_flags & B_MALLOC) {
3321                                 if (newbsize) {
3322                                         bp->b_bcount = size;
3323                                 } else {
3324                                         free(bp->b_data, M_BIOBUF);
3325                                         if (bp->b_bufsize) {
3326                                                 atomic_subtract_long(
3327                                                     &bufmallocspace,
3328                                                     bp->b_bufsize);
3329                                                 bufspacewakeup();
3330                                                 bp->b_bufsize = 0;
3331                                         }
3332                                         bp->b_saveaddr = bp->b_kvabase;
3333                                         bp->b_data = bp->b_saveaddr;
3334                                         bp->b_bcount = 0;
3335                                         bp->b_flags &= ~B_MALLOC;
3336                                 }
                                     /*
                                      * Early return: the common b_bufsize and
                                      * b_bcount update at the bottom of the
                                      * function is skipped on this path.
                                      */
3337                                 return 1;
3338                         }
3339                         vm_hold_free_pages(bp, newbsize);
3340                 } else if (newbsize > bp->b_bufsize) {
3341                         /*
3342                          * We only use malloced memory on the first allocation,
3343                          * and revert to page-allocated memory when the buffer
3344                          * grows.
3345                          */
3346                         /*
3347                          * There is a potential smp race here that could lead
3348                          * to bufmallocspace slightly passing the max.  It
3349                          * is probably extremely rare and not worth worrying
3350                          * over.
3351                          */
3352                         if ( (bufmallocspace < maxbufmallocspace) &&
3353                                 (bp->b_bufsize == 0) &&
3354                                 (mbsize <= PAGE_SIZE/2)) {
3355
3356                                 bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
3357                                 bp->b_bufsize = mbsize;
3358                                 bp->b_bcount = size;
3359                                 bp->b_flags |= B_MALLOC;
3360                                 atomic_add_long(&bufmallocspace, mbsize);
                                     /* Sizes were recorded above; done. */
3361                                 return 1;
3362                         }
3363                         origbuf = NULL;
3364                         origbufsize = 0;
3365                         /*
3366                          * If the buffer is growing on its other-than-first allocation,
3367                          * then we revert to the page-allocation scheme.
                              * The malloced data is remembered in origbuf so
                              * it can be copied into the pages loaded below.
3368                          */
3369                         if (bp->b_flags & B_MALLOC) {
3370                                 origbuf = bp->b_data;
3371                                 origbufsize = bp->b_bufsize;
3372                                 bp->b_data = bp->b_kvabase;
3373                                 if (bp->b_bufsize) {
3374                                         atomic_subtract_long(&bufmallocspace,
3375                                             bp->b_bufsize);
3376                                         bufspacewakeup();
3377                                         bp->b_bufsize = 0;
3378                                 }
3379                                 bp->b_flags &= ~B_MALLOC;
                                     /*
                                      * newbsize was the DEV_BSIZE-rounded
                                      * mbsize because B_MALLOC was set; round
                                      * it to whole pages now.
                                      */
3380                                 newbsize = round_page(newbsize);
3381                         }
3382                         vm_hold_load_pages(
3383                             bp,
3384                             (vm_offset_t) bp->b_data + bp->b_bufsize,
3385                             (vm_offset_t) bp->b_data + newbsize);
3386                         if (origbuf) {
                                     /*
                                      * Carry the old malloced contents over
                                      * and release the malloced memory.
                                      */
3387                                 bcopy(origbuf, bp->b_data, origbufsize);
3388                                 free(origbuf, M_BIOBUF);
3389                         }
3390                 }
3391         } else {
3392                 int desiredpages;
3393
                     /*
                      * newbsize is size rounded up to a multiple of DEV_BSIZE.
                      * desiredpages is the page count for newbsize bytes
                      * starting at the buffer's offset within its first page,
                      * or 0 for a zero-length buffer.
                      */
3394                 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
3395                 desiredpages = (size == 0) ? 0 :
3396                         num_pages((bp->b_offset & PAGE_MASK) + newbsize);
3397
3398                 if (bp->b_flags & B_MALLOC)
3399                         panic("allocbuf: VMIO buffer can't be malloced");
3400                 /*
3401                  * Set B_CACHE initially if buffer is 0 length or will become
3402                  * 0-length.
3403                  */
3404                 if (size == 0 || bp->b_bufsize == 0)
3405                         bp->b_flags |= B_CACHE;
3406
3407                 if (newbsize < bp->b_bufsize) {
3408                         /*
3409                          * DEV_BSIZE aligned new buffer size is less than the
3410                          * DEV_BSIZE aligned existing buffer size.  Figure out
3411                          * if we have to remove any pages.
3412                          */
3413                         if (desiredpages < bp->b_npages) {
3414                                 vm_page_t m;
3415
3416                                 if ((bp->b_flags & B_UNMAPPED) == 0) {
3417                                         BUF_CHECK_MAPPED(bp);
3418                                         pmap_qremove((vm_offset_t)trunc_page(
3419                                             (vm_offset_t)bp->b_data) +
3420                                             (desiredpages << PAGE_SHIFT),
3421                                             (bp->b_npages - desiredpages));
3422                                 } else
3423                                         BUF_CHECK_UNMAPPED(bp);
3424                                 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
3425                                 for (i = desiredpages; i < bp->b_npages; i++) {
3426                                         /*
3427                                          * the page is not freed here -- it
3428                                          * is the responsibility of
3429                                          * vnode_pager_setsize
3430                                          */
3431                                         m = bp->b_pages[i];
3432                                         KASSERT(m != bogus_page,
3433                                             ("allocbuf: bogus page found"));
3434                                         while (vm_page_sleep_if_busy(m,
3435                                             "biodep"))
3436                                                 continue;
3437
3438                                         bp->b_pages[i] = NULL;
3439                                         vm_page_lock(m);
3440                                         vm_page_unwire(m, 0);
3441                                         vm_page_unlock(m);
3442                                 }
3443                                 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
3444                                 bp->b_npages = desiredpages;
3445                         }
3446                 } else if (size > bp->b_bcount) {
3447                         /*
3448                          * We are growing the buffer, possibly in a
3449                          * byte-granular fashion.
3450                          */
3451                         vm_object_t obj;
3452                         vm_offset_t toff;
3453                         vm_offset_t tinc;
3454
3455                         /*
3456                          * Step 1, bring in the VM pages from the object,
3457                          * allocating them if necessary.  We must clear
3458                          * B_CACHE if these pages are not valid for the
3459                          * range covered by the buffer.
3460                          */
3461
3462                         obj = bp->b_bufobj->bo_object;
3463
3464                         VM_OBJECT_WLOCK(obj);
3465                         while (bp->b_npages < desiredpages) {
3466                                 vm_page_t m;
3467
3468                                 /*
3469                                  * We must allocate system pages since blocking
3470                                  * here could interfere with paging I/O, no
3471                                  * matter which process we are.
3472                                  *
3473                                  * Only exclusive busy can be tested here.
3474                                  * Blocking on shared busy might lead to
3475                                  * deadlocks once allocbuf() is called after
3476                                  * pages are vfs_busy_pages().
3477                                  */
3478                                 m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) +
3479                                     bp->b_npages, VM_ALLOC_NOBUSY |
3480                                     VM_ALLOC_SYSTEM | VM_ALLOC_WIRED |
3481                                     VM_ALLOC_IGN_SBUSY |
3482                                     VM_ALLOC_COUNT(desiredpages - bp->b_npages));
3483                                 if (m->valid == 0)
3484                                         bp->b_flags &= ~B_CACHE;
3485                                 bp->b_pages[bp->b_npages] = m;
3486                                 ++bp->b_npages;
3487                         }
3488
3489                         /*
3490                          * Step 2.  We've loaded the pages into the buffer,
3491                          * we have to figure out if we can still have B_CACHE
3492                          * set.  Note that B_CACHE is set according to the
3493                          * byte-granular range ( bcount and size ), not the
3494                          * aligned range ( newbsize ).
3495                          *
3496                          * The VM test is against m->valid, which is DEV_BSIZE
3497                          * aligned.  Needless to say, the validity of the data
3498                          * needs to also be DEV_BSIZE aligned.  Note that this
3499                          * fails with NFS if the server or some other client
3500                          * extends the file's EOF.  If our buffer is resized,
3501                          * B_CACHE may remain set! XXX
                              *
                              * The walk starts at the old b_bcount (toff).  The
                              * first step (tinc) only reaches the next page
                              * boundary; later steps are PAGE_SIZE, each one
                              * clamped to the bytes remaining before size.
3502                          */
3503
3504                         toff = bp->b_bcount;
3505                         tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
3506
3507                         while ((bp->b_flags & B_CACHE) && toff < size) {
3508                                 vm_pindex_t pi;
3509
3510                                 if (tinc > (size - toff))
3511                                         tinc = size - toff;
3512
3513                                 pi = ((bp->b_offset & PAGE_MASK) + toff) >>
3514                                     PAGE_SHIFT;
3515
3516                                 vfs_buf_test_cache(
3517                                     bp,
3518                                     bp->b_offset,
3519                                     toff,
3520                                     tinc,
3521                                     bp->b_pages[pi]
3522                                 );
3523                                 toff += tinc;
3524                                 tinc = PAGE_SIZE;
3525                         }
3526                         VM_OBJECT_WUNLOCK(obj);
3527
3528                         /*
3529                          * Step 3, fixup the KVM pmap.
3530                          */
3531                         if ((bp->b_flags & B_UNMAPPED) == 0)
3532                                 bpmap_qenter(bp);
3533                         else
3534                                 BUF_CHECK_UNMAPPED(bp);
3535                 }
3536         }
             /*
              * Common exit for every path that did not return early:
              * bufspacewakeup() is called when the buffer shrank, then the
              * new allocation size and requested size are recorded.
              */
3537         if (newbsize < bp->b_bufsize)
3538                 bufspacewakeup();
3539         bp->b_bufsize = newbsize;       /* actual buffer allocation     */
3540         bp->b_bcount = size;            /* requested buffer size        */
3541         return 1;
3542 }
3543
3544 extern int inflight_transient_maps;
3545
3546 void
3547 biodone(struct bio *bp)
3548 {
3549         struct mtx *mtxp;
3550         void (*done)(struct bio *);
3551         vm_offset_t start, end;
3552
3553         if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
3554                 bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
3555                 bp->bio_flags |= BIO_UNMAPPED;
3556                 start = trunc_page((vm_offset_t)bp->bio_data);
3557                 end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
3558                 pmap_qremove(start, OFF_TO_IDX(end - start));
3559                 vmem_free(transient_arena, start, end - start);
3560                 atomic_add_int(&inflight_transient_maps, -1);
3561         }
3562         done = bp->bio_done;
3563         if (done == NULL) {
3564                 mtxp = mtx_pool_find(mtxpool_sleep, bp);
3565                 mtx_lock(mtxp);
3566                 bp->bio_flags |= BIO_DONE;
3567                 wakeup(bp);
3568                 mtx_unlock(mtxp);
3569         } else {
3570                 bp->bio_flags |= BIO_DONE;
3571                 done(bp);
3572         }
3573 }
3574
3575 /*
3576  * Wait for a BIO to finish.
3577  */
3578 int
3579 biowait(struct bio *bp, const char *wchan)
3580 {
3581         struct mtx *mtxp;
3582
3583         mtxp = mtx_pool_find(mtxpool_sleep, bp);
3584         mtx_lock(mtxp);
3585         while ((bp->bio_flags & BIO_DONE) == 0)
3586                 msleep(bp, mtxp, PRIBIO, wchan, 0);
3587         mtx_unlock(mtxp);
3588         if (bp->bio_error != 0)
3589                 return (bp->bio_error);
3590         if (!(bp->bio_flags & BIO_ERROR))
3591                 return (0);
3592         return (EIO);
3593 }
3594
3595 void
3596 biofinish(struct bio *bp, struct devstat *stat, int error)
3597 {
3598
3599         if (error) {
3600                 bp->bio_error = error;
3601                 bp->bio_flags |= BIO_ERROR;
3602         }
3603         if (stat != NULL)
3604                 devstat_end_transaction_bio(stat, bp);
3605         biodone(bp);
3606 }
3607
3608 /*
3609  *      bufwait:
3610  *
3611  *      Wait for buffer I/O completion, returning error status.  The buffer
3612  *      is left locked and B_DONE on return.  B_EINTR is converted into an EINTR
3613  *      error and cleared.
3614  */
3615 int
3616 bufwait(struct buf *bp)
3617 {
3618         if (bp->b_iocmd == BIO_READ)
3619                 bwait(bp, PRIBIO, "biord");
3620         else
3621                 bwait(bp, PRIBIO, "biowr");
3622         if (bp->b_flags & B_EINTR) {
3623                 bp->b_flags &= ~B_EINTR;
3624                 return (EINTR);
3625         }
3626         if (bp->b_ioflags & BIO_ERROR) {
3627                 return (bp->b_error ? bp->b_error : EIO);
3628         } else {
3629                 return (0);
3630         }
3631 }
3632
3633  /*
3634   * Call back function from struct bio back up to struct buf.
3635   */
3636 static void
3637 bufdonebio(struct bio *bip)
3638 {
3639         struct buf *bp;
3640
3641         bp = bip->bio_caller2;
3642         bp->b_resid = bp->b_bcount - bip->bio_completed;
3643         bp->b_resid = bip->bio_resid;   /* XXX: remove */
3644         bp->b_ioflags = bip->bio_flags;
3645         bp->b_error = bip->bio_error;
3646         if (bp->b_error)
3647                 bp->b_ioflags |= BIO_ERROR;
3648         bufdone(bp);
3649         g_destroy_bio(bip);
3650 }
3651
3652 void
3653 dev_strategy(struct cdev *dev, struct buf *bp)
3654 {
3655         struct cdevsw *csw;
3656         int ref;
3657
3658         KASSERT(dev->si_refcount > 0,
3659             ("dev_strategy on un-referenced struct cdev *(%s) %p",
3660             devtoname(dev), dev));
3661
3662         csw = dev_refthread(dev, &ref);
3663         dev_strategy_csw(dev, csw, bp);
3664         dev_relthread(dev, ref);
3665 }
3666
3667 void
3668 dev_strategy_csw(struct cdev *dev, struct cdevsw *csw, struct buf *bp)
3669 {
3670         struct bio *bip;
3671
3672         KASSERT(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE,
3673             ("b_iocmd botch"));
3674         KASSERT(((dev->si_flags & SI_ETERNAL) != 0 && csw != NULL) ||
3675             dev->si_threadcount > 0,
3676             ("dev_strategy_csw threadcount cdev *(%s) %p", devtoname(dev),
3677             dev));
3678         if (csw == NULL) {
3679                 bp->b_error = ENXIO;
3680                 bp->b_ioflags = BIO_ERROR;
3681                 bufdone(bp);
3682                 return;
3683         }
3684         for (;;) {
3685                 bip = g_new_bio();
3686                 if (bip != NULL)
3687                         break;
3688                 /* Try again later */
3689                 tsleep(&bp, PRIBIO, "dev_strat", hz/10);
3690         }
3691         bip->bio_cmd = bp->b_iocmd;
3692         bip->bio_offset = bp->b_iooffset;
3693         bip->bio_length = bp->b_bcount;
3694         bip->bio_bcount = bp->b_bcount; /* XXX: remove */
3695         bdata2bio(bp, bip);
3696         bip->bio_done = bufdonebio;
3697         bip->bio_caller2 = bp;
3698         bip->bio_dev = dev;
3699         (*csw->d_strategy)(bip);
3700 }
3701
3702 /*
3703  *      bufdone:
3704  *
3705  *      Finish I/O on a buffer, optionally calling a completion function.
3706  *      This is usually called from an interrupt so process blocking is
3707  *      not allowed.
3708  *
3709  *      biodone is also responsible for setting B_CACHE in a B_VMIO bp.
3710  *      In a non-VMIO bp, B_CACHE will be set on the next getblk()
3711  *      assuming B_INVAL is clear.
3712  *
3713  *      For the VMIO case, we set B_CACHE if the op was a read and no
3714  *      read error occured, or if the op was a write.  B_CACHE is never
3715  *      set if the buffer is invalid or otherwise uncacheable.
3716  *
3717  *      biodone does not mess with B_INVAL, allowing the I/O routine or the
3718  *      initiator to leave B_INVAL set to brelse the buffer out of existance
3719  *      in the biodone routine.
3720  */
3721 void
3722 bufdone(struct buf *bp)
3723 {
3724         struct bufobj *dropobj;
3725         void    (*biodone)(struct buf *);
3726
3727         CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
3728         dropobj = NULL;
3729
3730         KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
3731         BUF_ASSERT_HELD(bp);
3732
3733         runningbufwakeup(bp);
3734         if (bp->b_iocmd == BIO_WRITE)
3735                 dropobj = bp->b_bufobj;
3736         /* call optional completion function if requested */
3737         if (bp->b_iodone != NULL) {
3738                 biodone = bp->b_iodone;
3739                 bp->b_iodone = NULL;
3740                 (*biodone) (bp);
3741                 if (dropobj)
3742                         bufobj_wdrop(dropobj);
3743                 return;
3744         }
3745
3746         bufdone_finish(bp);
3747
3748         if (dropobj)
3749                 bufobj_wdrop(dropobj);
3750 }
3751
     /*
      * bufdone_finish:
      *
      *      Default completion processing for a buffer, called by bufdone()
      *      when the buffer has no b_iodone function.  The buffer must be
      *      held by the caller.
      *
      *      Steps, in order:
      *        - run buf_complete() if the buffer has dependencies (b_dep);
      *        - for B_VMIO buffers: set B_CACHE after a successful read,
      *          replace any bogus_page entries with the real pages from the
      *          object, mark the transferred range valid after reads,
      *          sunbusy each page and drop one paging-in-progress count per
      *          page, then re-enter the KVA mappings if a bogus page was
      *          replaced in a mapped buffer;
      *        - release the buffer for B_ASYNC I/O (brelse() or bqrelse()),
      *          or call bdone() for synchronous I/O.
      */
3752 void
3753 bufdone_finish(struct buf *bp)
3754 {
3755         BUF_ASSERT_HELD(bp);
3756
3757         if (!LIST_EMPTY(&bp->b_dep))
3758                 buf_complete(bp);
3759
3760         if (bp->b_flags & B_VMIO) {
3761                 vm_ooffset_t foff;
3762                 vm_page_t m;
3763                 vm_object_t obj;
3764                 struct vnode *vp;
3765                 int bogus, i, iosize;
3766
3767                 obj = bp->b_bufobj->bo_object;
3768                 KASSERT(obj->paging_in_progress >= bp->b_npages,
3769                     ("biodone_finish: paging in progress(%d) < b_npages(%d)",
3770                     obj->paging_in_progress, bp->b_npages));
3771
3772                 vp = bp->b_vp;
3773                 KASSERT(vp->v_holdcnt > 0,
3774                     ("biodone_finish: vnode %p has zero hold count", vp));
3775                 KASSERT(vp->v_object != NULL,
3776                     ("biodone_finish: vnode %p has no vm_object", vp));
3777
3778                 foff = bp->b_offset;
3779                 KASSERT(bp->b_offset != NOOFFSET,
3780                     ("biodone_finish: bp %p has no buffer offset", bp));
3781
3782                 /*
3783                  * Set B_CACHE if the op was a normal read and no error
3784                  * occurred.  B_CACHE is set for writes in the b*write()
3785                  * routines.
                      *
                      * iosize is b_bcount minus the residual b_resid; it is
                      * consumed page by page in the loop below.
3786                  */
3787                 iosize = bp->b_bcount - bp->b_resid;
3788                 if (bp->b_iocmd == BIO_READ &&
3789                     !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
3790                     !(bp->b_ioflags & BIO_ERROR)) {
3791                         bp->b_flags |= B_CACHE;
3792                 }
3793                 bogus = 0;
3794                 VM_OBJECT_WLOCK(obj);
3795                 for (i = 0; i < bp->b_npages; i++) {
3796                         int bogusflag = 0;
3797                         int resid;
3798
                             /*
                              * resid: bytes from foff up to the next page
                              * boundary, capped at what is left of iosize.
                              */
3799                         resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
3800                         if (resid > iosize)
3801                                 resid = iosize;
3802
3803                         /*
3804                          * cleanup bogus pages, restoring the originals
3805                          */
3806                         m = bp->b_pages[i];
3807                         if (m == bogus_page) {
3808                                 bogus = bogusflag = 1;
3809                                 m = vm_page_lookup(obj, OFF_TO_IDX(foff));
3810                                 if (m == NULL)
3811                                         panic("biodone: page disappeared!");
3812                                 bp->b_pages[i] = m;
3813                         }
3814                         KASSERT(OFF_TO_IDX(foff) == m->pindex,
3815                             ("biodone_finish: foff(%jd)/pindex(%ju) mismatch",
3816                             (intmax_t)foff, (uintmax_t)m->pindex));
3817
3818                         /*
3819                          * In the write case, the valid and clean bits are
3820                          * already changed correctly ( see bdwrite() ), so we
3821                          * only need to do this here in the read case.
                              * Pages that were bogus during the I/O, and pages
                              * that received no data (resid == 0), are not
                              * marked valid.
3822                          */
3823                         if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
3824                                 KASSERT((m->dirty & vm_page_bits(foff &
3825                                     PAGE_MASK, resid)) == 0, ("bufdone_finish:"
3826                                     " page %p has unexpected dirty bits", m));
3827                                 vfs_page_set_valid(bp, foff, m);
3828                         }
3829
3830                         vm_page_sunbusy(m);
3831                         vm_object_pip_subtract(obj, 1);
                             /* Advance foff to the start of the next page. */
3832                         foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3833                         iosize -= resid;
3834                 }
3835                 vm_object_pip_wakeupn(obj, 0);
3836                 VM_OBJECT_WUNLOCK(obj);
                     /*
                      * b_pages[] changed if a bogus page was replaced, so
                      * re-enter the mappings of a mapped buffer.
                      */
3837                 if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
3838                         BUF_CHECK_MAPPED(bp);
3839                         pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3840                             bp->b_pages, bp->b_npages);
3841                 }
3842         }
3843
3844         /*
3845          * For asynchronous completions, release the buffer now. The brelse
3846          * will do a wakeup there if necessary - so no need to do a wakeup
3847          * here in the async case. The sync case always needs to do a wakeup.
              *
              * brelse() is used when the buffer is B_NOCACHE, B_INVAL or
              * B_RELBUF, or when the I/O failed; bqrelse() otherwise.
3848          */
3849
3850         if (bp->b_flags & B_ASYNC) {
3851                 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR))
3852                         brelse(bp);
3853                 else
3854                         bqrelse(bp);
3855         } else
3856                 bdone(bp);
3857 }
3858
3859 /*
3860  * This routine is called in lieu of iodone in the case of
3861  * incomplete I/O.  This keeps the busy status for pages
3862  * consistant.
3863  */
3864 void
3865 vfs_unbusy_pages(struct buf *bp)
3866 {
3867         int i;
3868         vm_object_t obj;
3869         vm_page_t m;
3870
3871         runningbufwakeup(bp);
3872         if (!(bp->b_flags & B_VMIO))
3873                 return;
3874
3875         obj = bp->b_bufobj->bo_object;
3876         VM_OBJECT_WLOCK(obj);
3877         for (i = 0; i < bp->b_npages; i++) {
3878                 m = bp->b_pages[i];
3879                 if (m == bogus_page) {
3880                         m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
3881                         if (!m)
3882                                 panic("vfs_unbusy_pages: page missing\n");
3883                         bp->b_pages[i] = m;
3884                         if ((bp->b_flags & B_UNMAPPED) == 0) {
3885                                 BUF_CHECK_MAPPED(bp);
3886                                 pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3887                                     bp->b_pages, bp->b_npages);
3888                         } else
3889                                 BUF_CHECK_UNMAPPED(bp);
3890                 }
3891                 vm_object_pip_subtract(obj, 1);
3892                 vm_page_sunbusy(m);
3893         }
3894         vm_object_pip_wakeupn(obj, 0);
3895         VM_OBJECT_WUNLOCK(obj);
3896 }
3897
3898 /*
3899  * vfs_page_set_valid:
3900  *
3901  *      Set the valid bits in a page based on the supplied offset.   The
3902  *      range is restricted to the buffer's size.
3903  *
3904  *      This routine is typically called after a read completes.
3905  */
3906 static void
3907 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
3908 {
3909         vm_ooffset_t eoff;
3910
3911         /*
3912          * Compute the end offset, eoff, such that [off, eoff) does not span a
3913          * page boundary and eoff is not greater than the end of the buffer.
3914          * The end of the buffer, in this case, is our file EOF, not the
3915          * allocation size of the buffer.
3916          */
3917         eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
3918         if (eoff > bp->b_offset + bp->b_bcount)
3919                 eoff = bp->b_offset + bp->b_bcount;
3920
3921         /*
3922          * Set valid range.  This is typically the entire buffer and thus the
3923          * entire page.
3924          */
3925         if (eoff > off)
3926                 vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
3927 }
3928
3929 /*
3930  * vfs_page_set_validclean:
3931  *
3932  *      Set the valid bits and clear the dirty bits in a page based on the
3933  *      supplied offset.   The range is restricted to the buffer's size.
3934  */
3935 static void
3936 vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
3937 {
3938         vm_ooffset_t soff, eoff;
3939
3940         /*
3941          * Start and end offsets in buffer.  eoff - soff may not cross a
3942          * page boundry or cross the end of the buffer.  The end of the
3943          * buffer, in this case, is our file EOF, not the allocation size
3944          * of the buffer.
3945          */
3946         soff = off;
3947         eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3948         if (eoff > bp->b_offset + bp->b_bcount)
3949                 eoff = bp->b_offset + bp->b_bcount;
3950
3951         /*
3952          * Set valid range.  This is typically the entire buffer and thus the
3953          * entire page.
3954          */
3955         if (eoff > soff) {
3956                 vm_page_set_validclean(
3957                     m,
3958                    (vm_offset_t) (soff & PAGE_MASK),
3959                    (vm_offset_t) (eoff - soff)
3960                 );
3961         }
3962 }
3963
3964 /*
3965  * Ensure that all buffer pages are not exclusive busied.  If any page is
3966  * exclusive busy, drain it.
3967  */
3968 void
3969 vfs_drain_busy_pages(struct buf *bp)
3970 {
3971         vm_page_t m;
3972         int i, last_busied;
3973
3974         VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
3975         last_busied = 0;
3976         for (i = 0; i < bp->b_npages; i++) {
3977                 m = bp->b_pages[i];
3978                 if (vm_page_xbusied(m)) {
3979                         for (; last_busied < i; last_busied++)
3980                                 vm_page_sbusy(bp->b_pages[last_busied]);
3981                         while (vm_page_xbusied(m)) {
3982                                 vm_page_lock(m);
3983                                 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
3984                                 vm_page_busy_sleep(m, "vbpage");
3985                                 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
3986                         }
3987                 }
3988         }
3989         for (i = 0; i < last_busied; i++)
3990                 vm_page_sunbusy(bp->b_pages[i]);
3991 }
3992
3993 /*
3994  * This routine is called before a device strategy routine.
3995  * It is used to tell the VM system that paging I/O is in
3996  * progress, and treat the pages associated with the buffer
3997  * almost as being exclusive busy.  Also the object paging_in_progress
3998  * flag is handled to make sure that the object doesn't become
3999  * inconsistant.
4000  *
4001  * Since I/O has not been initiated yet, certain buffer flags
4002  * such as BIO_ERROR or B_INVAL may be in an inconsistant state
4003  * and should be ignored.
4004  */
4005 void
4006 vfs_busy_pages(struct buf *bp, int clear_modify)
4007 {
4008         int i, bogus;
4009         vm_object_t obj;
4010         vm_ooffset_t foff;
4011         vm_page_t m;
4012
4013         if (!(bp->b_flags & B_VMIO))
4014                 return;
4015
4016         obj = bp->b_bufobj->bo_object;
4017         foff = bp->b_offset;
4018         KASSERT(bp->b_offset != NOOFFSET,
4019             ("vfs_busy_pages: no buffer offset"));
4020         VM_OBJECT_WLOCK(obj);
4021         vfs_drain_busy_pages(bp);
4022         if (bp->b_bufsize != 0)
4023                 vfs_setdirty_locked_object(bp);
4024         bogus = 0;
4025         for (i = 0; i < bp->b_npages; i++) {
4026                 m = bp->b_pages[i];
4027
4028                 if ((bp->b_flags & B_CLUSTER) == 0) {
4029                         vm_object_pip_add(obj, 1);
4030                         vm_page_sbusy(m);
4031                 }
4032                 /*
4033                  * When readying a buffer for a read ( i.e
4034                  * clear_modify == 0 ), it is important to do
4035                  * bogus_page replacement for valid pages in
4036                  * partially instantiated buffers.  Partially
4037                  * instantiated buffers can, in turn, occur when
4038                  * reconstituting a buffer from its VM backing store
4039                  * base.  We only have to do this if B_CACHE is
4040                  * clear ( which causes the I/O to occur in the
4041                  * first place ).  The replacement prevents the read
4042                  * I/O from overwriting potentially dirty VM-backed
4043                  * pages.  XXX bogus page replacement is, uh, bogus.
4044                  * It may not work properly with small-block devices.
4045                  * We need to find a better way.
4046                  */
4047                 if (clear_modify) {
4048                         pmap_remove_write(m);
4049                         vfs_page_set_validclean(bp, foff, m);
4050                 } else if (m->valid == VM_PAGE_BITS_ALL &&
4051                     (bp->b_flags & B_CACHE) == 0) {
4052                         bp->b_pages[i] = bogus_page;
4053                         bogus++;
4054                 }
4055                 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
4056         }
4057         VM_OBJECT_WUNLOCK(obj);
4058         if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
4059                 BUF_CHECK_MAPPED(bp);
4060                 pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
4061                     bp->b_pages, bp->b_npages);
4062         }
4063 }
4064
4065 /*
4066  *      vfs_bio_set_valid:
4067  *
4068  *      Set the range within the buffer to valid.  The range is
4069  *      relative to the beginning of the buffer, b_offset.  Note that
4070  *      b_offset itself may be offset from the beginning of the first
4071  *      page.
4072  */
4073 void
4074 vfs_bio_set_valid(struct buf *bp, int base, int size)
4075 {
4076         int i, n;
4077         vm_page_t m;
4078
4079         if (!(bp->b_flags & B_VMIO))
4080                 return;
4081
4082         /*
4083          * Fixup base to be relative to beginning of first page.
4084          * Set initial n to be the maximum number of bytes in the
4085          * first page that can be validated.
4086          */
4087         base += (bp->b_offset & PAGE_MASK);
4088         n = PAGE_SIZE - (base & PAGE_MASK);
4089
4090         VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4091         for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
4092                 m = bp->b_pages[i];
4093                 if (n > size)
4094                         n = size;
4095                 vm_page_set_valid_range(m, base & PAGE_MASK, n);
4096                 base += n;
4097                 size -= n;
4098                 n = PAGE_SIZE;
4099         }
4100         VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4101 }
4102
4103 /*
4104  *      vfs_bio_clrbuf:
4105  *
4106  *      If the specified buffer is a non-VMIO buffer, clear the entire
4107  *      buffer.  If the specified buffer is a VMIO buffer, clear and
4108  *      validate only the previously invalid portions of the buffer.
4109  *      This routine essentially fakes an I/O, so we need to clear
4110  *      BIO_ERROR and B_INVAL.
4111  *
4112  *      Note that while we only theoretically need to clear through b_bcount,
4113  *      we go ahead and clear through b_bufsize.
4114  */
4115 void
4116 vfs_bio_clrbuf(struct buf *bp)
4117 {
4118         int i, j, mask, sa, ea, slide;
4119
4120         if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
4121                 clrbuf(bp);
4122                 return;
4123         }
4124         bp->b_flags &= ~B_INVAL;
4125         bp->b_ioflags &= ~BIO_ERROR;
4126         VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4127         if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
4128             (bp->b_offset & PAGE_MASK) == 0) {
4129                 if (bp->b_pages[0] == bogus_page)
4130                         goto unlock;
4131                 mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
4132                 VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
4133                 if ((bp->b_pages[0]->valid & mask) == mask)
4134                         goto unlock;
4135                 if ((bp->b_pages[0]->valid & mask) == 0) {
4136                         pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
4137                         bp->b_pages[0]->valid |= mask;
4138                         goto unlock;
4139                 }
4140         }
4141         sa = bp->b_offset & PAGE_MASK;
4142         slide = 0;
4143         for (i = 0; i < bp->b_npages; i++, sa = 0) {
4144                 slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
4145                 ea = slide & PAGE_MASK;
4146                 if (ea == 0)
4147                         ea = PAGE_SIZE;
4148                 if (bp->b_pages[i] == bogus_page)
4149                         continue;
4150                 j = sa / DEV_BSIZE;
4151                 mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
4152                 VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
4153                 if ((bp->b_pages[i]->valid & mask) == mask)
4154                         continue;
4155                 if ((bp->b_pages[i]->valid & mask) == 0)
4156                         pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
4157                 else {
4158                         for (; sa < ea; sa += DEV_BSIZE, j++) {
4159                                 if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
4160                                         pmap_zero_page_area(bp->b_pages[i],
4161                                             sa, DEV_BSIZE);
4162                                 }
4163                         }
4164                 }
4165                 bp->b_pages[i]->valid |= mask;
4166         }
4167 unlock:
4168         VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4169         bp->b_resid = 0;
4170 }
4171
4172 void
4173 vfs_bio_bzero_buf(struct buf *bp, int base, int size)
4174 {
4175         vm_page_t m;
4176         int i, n;
4177
4178         if ((bp->b_flags & B_UNMAPPED) == 0) {
4179                 BUF_CHECK_MAPPED(bp);
4180                 bzero(bp->b_data + base, size);
4181         } else {
4182                 BUF_CHECK_UNMAPPED(bp);
4183                 n = PAGE_SIZE - (base & PAGE_MASK);
4184                 for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
4185                         m = bp->b_pages[i];
4186                         if (n > size)
4187                                 n = size;
4188                         pmap_zero_page_area(m, base & PAGE_MASK, n);
4189                         base += n;
4190                         size -= n;
4191                         n = PAGE_SIZE;
4192                 }
4193         }
4194 }
4195
4196 /*
4197  * vm_hold_load_pages and vm_hold_free_pages get pages into
4198  * a buffers address space.  The pages are anonymous and are
4199  * not associated with a file object.
4200  */
4201 static void
4202 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
4203 {
4204         vm_offset_t pg;
4205         vm_page_t p;
4206         int index;
4207
4208         BUF_CHECK_MAPPED(bp);
4209
4210         to = round_page(to);
4211         from = round_page(from);
4212         index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4213
4214         for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
4215 tryagain:
4216                 /*
4217                  * note: must allocate system pages since blocking here
4218                  * could interfere with paging I/O, no matter which
4219                  * process we are.
4220                  */
4221                 p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
4222                     VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT));
4223                 if (p == NULL) {
4224                         VM_WAIT;
4225                         goto tryagain;
4226                 }
4227                 pmap_qenter(pg, &p, 1);
4228                 bp->b_pages[index] = p;
4229         }
4230         bp->b_npages = index;
4231 }
4232
4233 /* Return pages associated with this buf to the vm system */
4234 static void
4235 vm_hold_free_pages(struct buf *bp, int newbsize)
4236 {
4237         vm_offset_t from;
4238         vm_page_t p;
4239         int index, newnpages;
4240
4241         BUF_CHECK_MAPPED(bp);
4242
4243         from = round_page((vm_offset_t)bp->b_data + newbsize);
4244         newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4245         if (bp->b_npages > newnpages)
4246                 pmap_qremove(from, bp->b_npages - newnpages);
4247         for (index = newnpages; index < bp->b_npages; index++) {
4248                 p = bp->b_pages[index];
4249                 bp->b_pages[index] = NULL;
4250                 if (vm_page_sbusied(p))
4251                         printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
4252                             (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
4253                 p->wire_count--;
4254                 vm_page_free(p);
4255                 atomic_subtract_int(&cnt.v_wire_count, 1);
4256         }
4257         bp->b_npages = newnpages;
4258 }
4259
4260 /*
4261  * Map an IO request into kernel virtual address space.
4262  *
4263  * All requests are (re)mapped into kernel VA space.
4264  * Notice that we use b_bufsize for the size of the buffer
4265  * to be mapped.  b_bcount might be modified by the driver.
4266  *
4267  * Note that even if the caller determines that the address space should
4268  * be valid, a race or a smaller-file mapped into a larger space may
4269  * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
4270  * check the return value.
4271  */
4272 int
4273 vmapbuf(struct buf *bp, int mapbuf)
4274 {
4275         caddr_t kva;
4276         vm_prot_t prot;
4277         int pidx;
4278
4279         if (bp->b_bufsize < 0)
4280                 return (-1);
4281         prot = VM_PROT_READ;
4282         if (bp->b_iocmd == BIO_READ)
4283                 prot |= VM_PROT_WRITE;  /* Less backwards than it looks */
4284         if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
4285             (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
4286             btoc(MAXPHYS))) < 0)
4287                 return (-1);
4288         bp->b_npages = pidx;
4289         if (mapbuf || !unmapped_buf_allowed) {
4290                 pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
4291                 kva = bp->b_saveaddr;
4292                 bp->b_saveaddr = bp->b_data;
4293                 bp->b_data = kva + (((vm_offset_t)bp->b_data) & PAGE_MASK);
4294                 bp->b_flags &= ~B_UNMAPPED;
4295         } else {
4296                 bp->b_flags |= B_UNMAPPED;
4297                 bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
4298                 bp->b_saveaddr = bp->b_data;
4299                 bp->b_data = unmapped_buf;
4300         }
4301         return(0);
4302 }
4303
4304 /*
4305  * Free the io map PTEs associated with this IO operation.
4306  * We also invalidate the TLB entries and restore the original b_addr.
4307  */
4308 void
4309 vunmapbuf(struct buf *bp)
4310 {
4311         int npages;
4312
4313         npages = bp->b_npages;
4314         if (bp->b_flags & B_UNMAPPED)
4315                 bp->b_flags &= ~B_UNMAPPED;
4316         else
4317                 pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
4318         vm_page_unhold_pages(bp->b_pages, npages);
4319
4320         bp->b_data = bp->b_saveaddr;
4321 }
4322
4323 void
4324 bdone(struct buf *bp)
4325 {
4326         struct mtx *mtxp;
4327
4328         mtxp = mtx_pool_find(mtxpool_sleep, bp);
4329         mtx_lock(mtxp);
4330         bp->b_flags |= B_DONE;
4331         wakeup(bp);
4332         mtx_unlock(mtxp);
4333 }
4334
4335 void
4336 bwait(struct buf *bp, u_char pri, const char *wchan)
4337 {
4338         struct mtx *mtxp;
4339
4340         mtxp = mtx_pool_find(mtxpool_sleep, bp);
4341         mtx_lock(mtxp);
4342         while ((bp->b_flags & B_DONE) == 0)
4343                 msleep(bp, mtxp, pri, wchan, 0);
4344         mtx_unlock(mtxp);
4345 }
4346
4347 int
4348 bufsync(struct bufobj *bo, int waitfor)
4349 {
4350
4351         return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread));
4352 }
4353
4354 void
4355 bufstrategy(struct bufobj *bo, struct buf *bp)
4356 {
4357         int i = 0;
4358         struct vnode *vp;
4359
4360         vp = bp->b_vp;
4361         KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
4362         KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
4363             ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
4364         i = VOP_STRATEGY(vp, bp);
4365         KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
4366 }
4367
4368 void
4369 bufobj_wrefl(struct bufobj *bo)
4370 {
4371
4372         KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
4373         ASSERT_BO_WLOCKED(bo);
4374         bo->bo_numoutput++;
4375 }
4376
4377 void
4378 bufobj_wref(struct bufobj *bo)
4379 {
4380
4381         KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
4382         BO_LOCK(bo);
4383         bo->bo_numoutput++;
4384         BO_UNLOCK(bo);
4385 }
4386
4387 void
4388 bufobj_wdrop(struct bufobj *bo)
4389 {
4390
4391         KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
4392         BO_LOCK(bo);
4393         KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
4394         if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) {
4395                 bo->bo_flag &= ~BO_WWAIT;
4396                 wakeup(&bo->bo_numoutput);
4397         }
4398         BO_UNLOCK(bo);
4399 }
4400
4401 int
4402 bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
4403 {
4404         int error;
4405
4406         KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
4407         ASSERT_BO_WLOCKED(bo);
4408         error = 0;
4409         while (bo->bo_numoutput) {
4410                 bo->bo_flag |= BO_WWAIT;
4411                 error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
4412                     slpflag | (PRIBIO + 1), "bo_wwait", timeo);
4413                 if (error)
4414                         break;
4415         }
4416         return (error);
4417 }
4418
4419 void
4420 bpin(struct buf *bp)
4421 {
4422         struct mtx *mtxp;
4423
4424         mtxp = mtx_pool_find(mtxpool_sleep, bp);
4425         mtx_lock(mtxp);
4426         bp->b_pin_count++;
4427         mtx_unlock(mtxp);
4428 }
4429
4430 void
4431 bunpin(struct buf *bp)
4432 {
4433         struct mtx *mtxp;
4434
4435         mtxp = mtx_pool_find(mtxpool_sleep, bp);
4436         mtx_lock(mtxp);
4437         if (--bp->b_pin_count == 0)
4438                 wakeup(bp);
4439         mtx_unlock(mtxp);
4440 }
4441
4442 void
4443 bunpin_wait(struct buf *bp)
4444 {
4445         struct mtx *mtxp;
4446
4447         mtxp = mtx_pool_find(mtxpool_sleep, bp);
4448         mtx_lock(mtxp);
4449         while (bp->b_pin_count > 0)
4450                 msleep(bp, mtxp, PRIBIO, "bwunpin", 0);
4451         mtx_unlock(mtxp);
4452 }
4453
4454 /*
4455  * Set bio_data or bio_ma for struct bio from the struct buf.
4456  */
4457 void
4458 bdata2bio(struct buf *bp, struct bio *bip)
4459 {
4460
4461         if ((bp->b_flags & B_UNMAPPED) != 0) {
4462                 KASSERT(unmapped_buf_allowed, ("unmapped"));
4463                 bip->bio_ma = bp->b_pages;
4464                 bip->bio_ma_n = bp->b_npages;
4465                 bip->bio_data = unmapped_buf;
4466                 bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
4467                 bip->bio_flags |= BIO_UNMAPPED;
4468                 KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
4469                     PAGE_SIZE == bp->b_npages,
4470                     ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
4471                     (long long)bip->bio_length, bip->bio_ma_n));
4472         } else {
4473                 bip->bio_data = bp->b_data;
4474                 bip->bio_ma = NULL;
4475         }
4476 }
4477
4478 #include "opt_ddb.h"
4479 #ifdef DDB
4480 #include <ddb/ddb.h>
4481
4482 /* DDB command to show buffer data */
4483 DB_SHOW_COMMAND(buffer, db_show_buffer)
4484 {
4485         /* get args */
4486         struct buf *bp = (struct buf *)addr;
4487
4488         if (!have_addr) {
4489                 db_printf("usage: show buffer <addr>\n");
4490                 return;
4491         }
4492
4493         db_printf("buf at %p\n", bp);
4494         db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
4495             (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
4496             PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
4497         db_printf(
4498             "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
4499             "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
4500             "b_dep = %p\n",
4501             bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
4502             bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
4503             (intmax_t)bp->b_lblkno, bp->b_dep.lh_first);
4504         if (bp->b_npages) {
4505                 int i;
4506                 db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
4507                 for (i = 0; i < bp->b_npages; i++) {
4508                         vm_page_t m;
4509                         m = bp->b_pages[i];
4510                         db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
4511                             (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
4512                         if ((i + 1) < bp->b_npages)
4513                                 db_printf(",");
4514                 }
4515                 db_printf("\n");
4516         }
4517         db_printf(" ");
4518         BUF_LOCKPRINTINFO(bp);
4519 }
4520
4521 DB_SHOW_COMMAND(lockedbufs, lockedbufs)
4522 {
4523         struct buf *bp;
4524         int i;
4525
4526         for (i = 0; i < nbuf; i++) {
4527                 bp = &buf[i];
4528                 if (BUF_ISLOCKED(bp)) {
4529                         db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4530                         db_printf("\n");
4531                 }
4532         }
4533 }
4534
4535 DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
4536 {
4537         struct vnode *vp;
4538         struct buf *bp;
4539
4540         if (!have_addr) {
4541                 db_printf("usage: show vnodebufs <addr>\n");
4542                 return;
4543         }
4544         vp = (struct vnode *)addr;
4545         db_printf("Clean buffers:\n");
4546         TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
4547                 db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4548                 db_printf("\n");
4549         }
4550         db_printf("Dirty buffers:\n");
4551         TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
4552                 db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4553                 db_printf("\n");
4554         }
4555 }
4556
4557 DB_COMMAND(countfreebufs, db_coundfreebufs)
4558 {
4559         struct buf *bp;
4560         int i, used = 0, nfree = 0;
4561
4562         if (have_addr) {
4563                 db_printf("usage: countfreebufs\n");
4564                 return;
4565         }
4566
4567         for (i = 0; i < nbuf; i++) {
4568                 bp = &buf[i];
4569                 if ((bp->b_flags & B_INFREECNT) != 0)
4570                         nfree++;
4571                 else
4572                         used++;
4573         }
4574
4575         db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
4576             nfree + used);
4577         db_printf("numfreebuffers is %d\n", numfreebuffers);
4578 }
4579 #endif /* DDB */