sys/kern/vfs_bio.c

   1 /*-
   2  * Copyright (c) 2004 Poul-Henning Kamp
   3  * Copyright (c) 1994,1997 John S. Dyson
   4  * Copyright (c) 2013 The FreeBSD Foundation
   5  * All rights reserved.
   6  *
   7  * Portions of this software were developed by Konstantin Belousov
   8  * under sponsorship from the FreeBSD Foundation.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  29  * SUCH DAMAGE.
  30  */
  31
  32 /*
  33  * this file contains a new buffer I/O scheme implementing a coherent
  34  * VM object and buffer cache scheme.  Pains have been taken to make
  35  * sure that the performance degradation associated with schemes such
  36  * as this is not realized.
  37  *
  38  * Author:  John S. Dyson
  39  * Significant help during the development and debugging phases
  40  * had been provided by David Greenman, also of the FreeBSD core team.
  41  *
  42  * see man buf(9) for more info.
  43  */
  44
  45 #include <sys/cdefs.h>
  46 __FBSDID("$FreeBSD$");
  47
  48 #include <sys/param.h>
  49 #include <sys/systm.h>
  50 #include <sys/bio.h>
  51 #include <sys/conf.h>
  52 #include <sys/buf.h>
  53 #include <sys/devicestat.h>
  54 #include <sys/eventhandler.h>
  55 #include <sys/fail.h>
  56 #include <sys/limits.h>
  57 #include <sys/lock.h>
  58 #include <sys/malloc.h>
  59 #include <sys/mount.h>
  60 #include <sys/mutex.h>
  61 #include <sys/kernel.h>
  62 #include <sys/kthread.h>
  63 #include <sys/proc.h>
  64 #include <sys/racct.h>
  65 #include <sys/resourcevar.h>
  66 #include <sys/rwlock.h>
  67 #include <sys/smp.h>
  68 #include <sys/sysctl.h>
  69 #include <sys/sysproto.h>
  70 #include <sys/vmem.h>
  71 #include <sys/vmmeter.h>
  72 #include <sys/vnode.h>
  73 #include <sys/watchdog.h>
  74 #include <geom/geom.h>
  75 #include <vm/vm.h>
  76 #include <vm/vm_param.h>
  77 #include <vm/vm_kern.h>
  78 #include <vm/vm_pageout.h>
  79 #include <vm/vm_page.h>
  80 #include <vm/vm_object.h>
  81 #include <vm/vm_extern.h>
  82 #include <vm/vm_map.h>
  83 #include <vm/swap_pager.h>
  84 #include "opt_compat.h"
  85 #include "opt_swap.h"
  86
/* Declare the M_BIOBUF malloc type (short name "biobuf"). */
static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");

struct  bio_ops bioops;         /* I/O operation notification */

/*
 * Buffer operations vector named "buf_ops_bio".  It binds the write,
 * strategy, sync and bdflush hooks to bufwrite(), bufstrategy(), bufsync()
 * and bufbdflush() respectively.
 */
struct  buf_ops buf_ops_bio = {
        .bop_name       =       "buf_ops_bio",
        .bop_write      =       bufwrite,
        .bop_strategy   =       bufstrategy,
        .bop_sync       =       bufsync,
        .bop_bdflush    =       bufbdflush,
};

static struct buf *buf;         /* buffer header pool */
extern struct buf *swbuf;       /* Swap buffer header pool. */
/*
 * NOTE(review): presumably the address assigned to buffers that have no
 * kernel virtual mapping; its users are outside this excerpt -- confirm.
 */
caddr_t unmapped_buf;

/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
struct proc *bufdaemonproc;
/*
 * Process handle for bufspace_daemon(); it is passed to
 * kproc_suspend_check() by the daemon and referenced from bufspace_kp.
 */
struct proc *bufspacedaemonproc;
 106
/*
 * Forward declarations for routines that are private to this file.
 */
static int inmem(struct vnode *vp, daddr_t blkno);
static void vm_hold_free_pages(struct buf *bp, int newbsize);
static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
                vm_offset_t to);
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
                vm_page_t m);
static void vfs_clean_pages_dirty_buf(struct buf *bp);
static void vfs_setdirty_locked_object(struct buf *bp);
static void vfs_vmio_invalidate(struct buf *bp);
static void vfs_vmio_truncate(struct buf *bp, int npages);
static void vfs_vmio_extend(struct buf *bp, int npages, int size);
static int vfs_bio_clcheck(struct vnode *vp, int size,
                daddr_t lblkno, daddr_t blkno);
static int buf_flush(struct vnode *vp, int);
static int buf_recycle(bool);
static int buf_scan(bool);
static int flushbufqueues(struct vnode *, int, int);
static void buf_daemon(void);
static void bremfreel(struct buf *bp);
static __inline void bd_wakeup(void);
static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
static void bufkva_reclaim(vmem_t *, int);
static void bufkva_free(struct buf *);
static int buf_import(void *, void **, int, int);
static void buf_release(void *, void **, int);

/*
 * The bufspace sysctl handler only exists when one of the FreeBSD 4-7
 * compatibility options is configured.
 */
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
#endif
 138
/*
 * Space accounting variables and limits, each exported under the vfs
 * sysctl tree.
 */
int vmiodirenable = TRUE;
SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
    "Use the VM system for directory writes");
/* Decremented by runningbufwakeup(); polled by waitrunningbufspace(). */
long runningbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
    "Amount of presently outstanding async buffer io");
/*
 * Updated atomically by bufspace_adjust(), bufspace_reserve() and
 * bufspace_release().  With the compat options it is exported through
 * sysctl_bufspace(), which can hand the value to int-sized readers.
 */
static long bufspace;
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
    &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
#else
SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
    "Physical memory used for buffers");
#endif
static long bufkvaspace;
SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
    "Kernel virtual memory used for buffers");
/* Limit applied by bufspace_reserve() to metadata reservations. */
static long maxbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
    "Maximum allowed value of bufspace (including metadata)");
/* Updated atomically by bufmallocadjust(). */
static long bufmallocspace;
SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
    "Amount of malloced memory for buffers");
/* Note: the sysctl node is spelled "maxmallocbufspace", unlike the variable. */
static long maxbufmallocspace;
SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
    0, "Maximum amount of malloced memory for buffers");
/* bufspace_daemon() keeps recycling while bufspace exceeds this value. */
static long lobufspace;
SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0,
    "Minimum amount of buffers we want to have");
/* Limit applied by bufspace_reserve() to non-metadata reservations. */
long hibufspace;
SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0,
    "Maximum allowed value of bufspace (excluding metadata)");
/* Crossing this value upwards triggers bufspace_daemonwakeup(). */
long bufspacethresh;
SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
    0, "Bufspace consumed before waking the daemon to free some");
/*
 * Event counters and buffer-count thresholds, each exported under the vfs
 * sysctl tree.
 */
static int buffreekvacnt;
SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
    "Number of times we have freed the KVA space from some buffer");
static int bufdefragcnt;
SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
    "Number of times we have had to repeat buffer allocation to defragment");
/*
 * lorunningspace and hirunningspace are written through
 * sysctl_runningspace(), which rejects updates that would make the low
 * mark exceed the high mark.
 */
static long lorunningspace;
SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
    CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L",
    "Minimum preferred space used for in-progress I/O");
static long hirunningspace;
SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
    CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L",
    "Maximum amount of space to use for in-progress I/O");
int dirtybufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
    0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
int bdwriteskip;
SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
    0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
int altbufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
    0, "Number of fsync flushes to limit dirty buffers");
static int recursiveflushes;
SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
    0, "Number of flushes skipped due to being recursive");
/* Maintained atomically by bdirtyadd() and bdirtysub(). */
static int numdirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
    "Number of buffers that are dirty (has unwritten changes) at the moment");
/*
 * bdirtyadd() and bdirtysub() issue their wakeups when the dirty count
 * passes the midpoint of lodirtybuffers and hidirtybuffers.
 */
static int lodirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
    "How many buffers we want to have free before bufdaemon can sleep");
static int hidirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
    "When the number of dirty buffers is considered severe");
int dirtybufthresh;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
    0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
/* Compared against lofreebuffers/hifreebuffers by bufspace_daemon(). */
static int numfreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
    "Number of free buffers");
static int lofreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
   "Target number of free buffers");
static int hifreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
   "Threshold for clean buffer recycling");
static int getnewbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
   "Number of calls to getnewbuf");
static int getnewbufrestarts;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
    "Number of times getnewbuf has had to restart a buffer acquisition");
static int mappingrestarts;
SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
    "Number of times getblk has had to restart a buffer mapping for "
    "unmapped buffer");
static int numbufallocfails;
SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
    "Number of times buffer allocations failed");
/* Passed to buf_flush() by bufspace_wait() as the amount of work to do. */
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
    "Amount of work to do in flushbufqueues when helping bufdaemon");
static long notbufdflushes;
SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
    "Number of dirty buffer flushes done by the bufdaemon helpers");
static long barrierwrites;
SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
    "Number of barrier writes");
/* unmapped_buf_allowed is not defined in this excerpt. */
SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
    &unmapped_buf_allowed, 0,
    "Permit the use of the unmapped i/o");
 247
/*
 * This lock synchronizes access to bd_request.  bd_speedup() also updates
 * bd_speedupreq while holding it.
 */
static struct mtx_padalign bdlock;

/*
 * This lock protects runningbufreq and synchronizes runningwakeup() and
 * waitrunningbufspace().  sysctl_runningspace() also holds it while it
 * validates and stores lorunningspace and hirunningspace.
 */
static struct mtx_padalign rbreqlock;

/*
 * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
 * The bufspace daemon's bufspace_request flag is also examined and changed
 * with this lock held.
 */
static struct rwlock_padalign nblock;

/*
 * Lock that protects bdirtywait.
 */
static struct mtx_padalign bdirtylock;

/*
 * Wakeup point for bufdaemon, as well as indicator of whether it is already
 * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
 * is idling.
 */
static int bd_request;

/*
 * Request/wakeup point for the bufspace daemon.  bufspace_daemonwakeup()
 * sets it to 1 and issues the wakeup; bufspace_daemon() clears it just
 * before sleeping on its address.
 */
static int bufspace_request;

/*
 * Request for the buf daemon to write more buffers than is indicated by
 * lodirtybuffers.  This may be necessary to push out excess dependencies or
 * defragment the address space where a simple count of the number of dirty
 * buffers is insufficient to characterize the demand for flushing them.
 */
static int bd_speedupreq;

/*
 * Bogus page -- for I/O to/from partially complete buffers.
 * This is a temporary solution to the problem, but it is not
 * really that bad.  It would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 */
vm_page_t bogus_page;

/*
 * Synchronization (sleep/wakeup) variable for active buffer space requests.
 * Set when wait starts, cleared prior to wakeup().
 * Used in runningwakeup() and waitrunningbufspace().
 */
static int runningbufreq;

/*
 * Synchronization (sleep/wakeup) variable for buffer requests.
 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
 * by and/or.
 * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
 * getnewbuf(), and getblk().
 *
 * NOTE(review): in the code visible here the variable only ever holds 0 or 1
 * (see bufspace_wakeup(), bufspace_wait() and bufspace_daemon()), and no
 * VFS_BIO_NEED flags are defined nearby.  The description above looks
 * stale -- confirm against the rest of the file.
 */
static volatile int needsbuffer;

/*
 * Synchronization for bwillwrite() waiters.  Cleared by bdirtywakeup()
 * before it issues the wakeup.
 */
static int bdirtywait;
 318
/*
 * Definitions for the buffer free lists.  Clean buffers are spread over
 * the index range [QUEUE_CLEAN, QUEUE_CLEAN + CLEAN_QUEUES); see
 * bqisclean() and bqcleanq().
 */
#define QUEUE_NONE      0       /* on no queue */
#define QUEUE_EMPTY     1       /* empty buffer headers */
#define QUEUE_DIRTY     2       /* B_DELWRI buffers */
#define QUEUE_CLEAN     3       /* non-B_DELWRI buffers */
#define QUEUE_SENTINEL  1024    /* not a queue index, but mark for sentinel */

/* Maximum number of clean buffer queues. */
#define CLEAN_QUEUES    16

/*
 * Configured number of clean queues.  bqcleanq() distributes buffers over
 * this many queues.
 */
static int clean_queues;

/* Maximum number of buffer queues. */
#define BUFFER_QUEUES   (QUEUE_CLEAN + CLEAN_QUEUES)

/* Queues for free buffers with various properties */
static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
#ifdef INVARIANTS
/* Per-queue length bookkeeping, only compiled in with INVARIANTS. */
static int bq_len[BUFFER_QUEUES];
#endif

/*
 * Lock for each bufqueue; look one up with bqlock().
 */
static struct mtx_padalign bqlocks[BUFFER_QUEUES];

/*
 * per-cpu empty buffer cache.
 */
uma_zone_t buf_zone;

/*
 * Single global constant for BUF_WMESG, to avoid getting multiple references.
 * buf_wmesg is referred from macros.
 */
const char *buf_wmesg = BUF_WMESG;
 358
 359 static int
 360 sysctl_runningspace(SYSCTL_HANDLER_ARGS)
 361 {
 362         long value;
 363         int error;
 364
 365         value = *(long *)arg1;
 366         error = sysctl_handle_long(oidp, &value, 0, req);
 367         if (error != 0 || req->newptr == NULL)
 368                 return (error);
 369         mtx_lock(&rbreqlock);
 370         if (arg1 == &hirunningspace) {
 371                 if (value < lorunningspace)
 372                         error = EINVAL;
 373                 else
 374                         hirunningspace = value;
 375         } else {
 376                 KASSERT(arg1 == &lorunningspace,
 377                     ("%s: unknown arg1", __func__));
 378                 if (value > hirunningspace)
 379                         error = EINVAL;
 380                 else
 381                         lorunningspace = value;
 382         }
 383         mtx_unlock(&rbreqlock);
 384         return (error);
 385 }
 386
 387 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
 388     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 389 static int
 390 sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 391 {
 392         long lvalue;
 393         int ivalue;
 394
 395         if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
 396                 return (sysctl_handle_long(oidp, arg1, arg2, req));
 397         lvalue = *(long *)arg1;
 398         if (lvalue > INT_MAX)
 399                 /* On overflow, still write out a long to trigger ENOMEM. */
 400                 return (sysctl_handle_long(oidp, &lvalue, 0, req));
 401         ivalue = lvalue;
 402         return (sysctl_handle_int(oidp, &ivalue, 0, req));
 403 }
 404 #endif
 405
 406 static int
 407 bqcleanq(void)
 408 {
 409         static int nextq;
 410
 411         return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN);
 412 }
 413
 414 static int
 415 bqisclean(int qindex)
 416 {
 417
 418         return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES);
 419 }
 420
 421 /*
 422  *      bqlock:
 423  *
 424  *      Return the appropriate queue lock based on the index.
 425  */
 426 static inline struct mtx *
 427 bqlock(int qindex)
 428 {
 429
 430         return (struct mtx *)&bqlocks[qindex];
 431 }
 432
 433 /*
 434  *      bdirtywakeup:
 435  *
 436  *      Wakeup any bwillwrite() waiters.
 437  */
 438 static void
 439 bdirtywakeup(void)
 440 {
 441         mtx_lock(&bdirtylock);
 442         if (bdirtywait) {
 443                 bdirtywait = 0;
 444                 wakeup(&bdirtywait);
 445         }
 446         mtx_unlock(&bdirtylock);
 447 }
 448
 449 /*
 450  *      bdirtysub:
 451  *
 452  *      Decrement the numdirtybuffers count by one and wakeup any
 453  *      threads blocked in bwillwrite().
 454  */
 455 static void
 456 bdirtysub(void)
 457 {
 458
 459         if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
 460             (lodirtybuffers + hidirtybuffers) / 2)
 461                 bdirtywakeup();
 462 }
 463
 464 /*
 465  *      bdirtyadd:
 466  *
 467  *      Increment the numdirtybuffers count by one and wakeup the buf
 468  *      daemon if needed.
 469  */
 470 static void
 471 bdirtyadd(void)
 472 {
 473
 474         /*
 475          * Only do the wakeup once as we cross the boundary.  The
 476          * buf daemon will keep running until the condition clears.
 477          */
 478         if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
 479             (lodirtybuffers + hidirtybuffers) / 2)
 480                 bd_wakeup();
 481 }
 482
 483 /*
 484  *      bufspace_wakeup:
 485  *
 486  *      Called when buffer space is potentially available for recovery.
 487  *      getnewbuf() will block on this flag when it is unable to free
 488  *      sufficient buffer space.  Buffer space becomes recoverable when
 489  *      bp's get placed back in the queues.
 490  */
 491 static void
 492 bufspace_wakeup(void)
 493 {
 494
 495         /*
 496          * If someone is waiting for bufspace, wake them up.
 497          *
 498          * Since needsbuffer is set prior to doing an additional queue
 499          * scan it is safe to check for the flag prior to acquiring the
 500          * lock.  The thread that is preparing to scan again before
 501          * blocking would discover the buf we released.
 502          */
 503         if (needsbuffer) {
 504                 rw_rlock(&nblock);
 505                 if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
 506                         wakeup(__DEVOLATILE(void *, &needsbuffer));
 507                 rw_runlock(&nblock);
 508         }
 509 }
 510
 511 /*
 512  *      bufspace_daemonwakeup:
 513  *
 514  *      Wakeup the daemon responsible for freeing clean bufs.
 515  */
 516 static void
 517 bufspace_daemonwakeup(void)
 518 {
 519         rw_rlock(&nblock);
 520         if (bufspace_request == 0) {
 521                 bufspace_request = 1;
 522                 wakeup(&bufspace_request);
 523         }
 524         rw_runlock(&nblock);
 525 }
 526
 527 /*
 528  *      bufspace_adjust:
 529  *
 530  *      Adjust the reported bufspace for a KVA managed buffer, possibly
 531  *      waking any waiters.
 532  */
 533 static void
 534 bufspace_adjust(struct buf *bp, int bufsize)
 535 {
 536         long space;
 537         int diff;
 538
 539         KASSERT((bp->b_flags & B_MALLOC) == 0,
 540             ("bufspace_adjust: malloc buf %p", bp));
 541         diff = bufsize - bp->b_bufsize;
 542         if (diff < 0) {
 543                 atomic_subtract_long(&bufspace, -diff);
 544                 bufspace_wakeup();
 545         } else {
 546                 space = atomic_fetchadd_long(&bufspace, diff);
 547                 /* Wake up the daemon on the transition. */
 548                 if (space < bufspacethresh && space + diff >= bufspacethresh)
 549                         bufspace_daemonwakeup();
 550         }
 551         bp->b_bufsize = bufsize;
 552 }
 553
 554 /*
 555  *      bufspace_reserve:
 556  *
 557  *      Reserve bufspace before calling allocbuf().  metadata has a
 558  *      different space limit than data.
 559  */
 560 static int
 561 bufspace_reserve(int size, bool metadata)
 562 {
 563         long limit;
 564         long space;
 565
 566         if (metadata)
 567                 limit = maxbufspace;
 568         else
 569                 limit = hibufspace;
 570         do {
 571                 space = bufspace;
 572                 if (space + size > limit)
 573                         return (ENOSPC);
 574         } while (atomic_cmpset_long(&bufspace, space, space + size) == 0);
 575
 576         /* Wake up the daemon on the transition. */
 577         if (space < bufspacethresh && space + size >= bufspacethresh)
 578                 bufspace_daemonwakeup();
 579
 580         return (0);
 581 }
 582
 583 /*
 584  *      bufspace_release:
 585  *
 586  *      Release reserved bufspace after bufspace_adjust() has consumed it.
 587  */
 588 static void
 589 bufspace_release(int size)
 590 {
 591         atomic_subtract_long(&bufspace, size);
 592         bufspace_wakeup();
 593 }
 594
 595 /*
 596  *      bufspace_wait:
 597  *
 598  *      Wait for bufspace, acting as the buf daemon if a locked vnode is
 599  *      supplied.  needsbuffer must be set in a safe fashion prior to
 600  *      polling for space.  The operation must be re-tried on return.
 601  */
 602 static void
 603 bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
 604 {
 605         struct thread *td;
 606         int error, fl, norunbuf;
 607
 608         if ((gbflags & GB_NOWAIT_BD) != 0)
 609                 return;
 610
 611         td = curthread;
 612         rw_wlock(&nblock);
 613         while (needsbuffer != 0) {
 614                 if (vp != NULL && vp->v_type != VCHR &&
 615                     (td->td_pflags & TDP_BUFNEED) == 0) {
 616                         rw_wunlock(&nblock);
 617                         /*
 618                          * getblk() is called with a vnode locked, and
 619                          * some majority of the dirty buffers may as
 620                          * well belong to the vnode.  Flushing the
 621                          * buffers there would make a progress that
 622                          * cannot be achieved by the buf_daemon, that
 623                          * cannot lock the vnode.
 624                          */
 625                         norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
 626                             (td->td_pflags & TDP_NORUNNINGBUF);
 627
 628                         /*
 629                          * Play bufdaemon.  The getnewbuf() function
 630                          * may be called while the thread owns lock
 631                          * for another dirty buffer for the same
 632                          * vnode, which makes it impossible to use
 633                          * VOP_FSYNC() there, due to the buffer lock
 634                          * recursion.
 635                          */
 636                         td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
 637                         fl = buf_flush(vp, flushbufqtarget);
 638                         td->td_pflags &= norunbuf;
 639                         rw_wlock(&nblock);
 640                         if (fl != 0)
 641                                 continue;
 642                         if (needsbuffer == 0)
 643                                 break;
 644                 }
 645                 error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
 646                     (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
 647                 if (error != 0)
 648                         break;
 649         }
 650         rw_wunlock(&nblock);
 651 }
 652
 653
 654 /*
 655  *      bufspace_daemon:
 656  *
 657  *      buffer space management daemon.  Tries to maintain some marginal
 658  *      amount of free buffer space so that requesting processes neither
 659  *      block nor work to reclaim buffers.
 660  */
 661 static void
 662 bufspace_daemon(void)
 663 {
 664         for (;;) {
 665                 kproc_suspend_check(bufspacedaemonproc);
 666
 667                 /*
 668                  * Free buffers from the clean queue until we meet our
 669                  * targets.
 670                  *
 671                  * Theory of operation:  The buffer cache is most efficient
 672                  * when some free buffer headers and space are always
 673                  * available to getnewbuf().  This daemon attempts to prevent
 674                  * the excessive blocking and synchronization associated
 675                  * with shortfall.  It goes through three phases according
 676                  * demand:
 677                  *
 678                  * 1)   The daemon wakes up voluntarily once per-second
 679                  *      during idle periods when the counters are below
 680                  *      the wakeup thresholds (bufspacethresh, lofreebuffers).
 681                  *
 682                  * 2)   The daemon wakes up as we cross the thresholds
 683                  *      ahead of any potential blocking.  This may bounce
 684                  *      slightly according to the rate of consumption and
 685                  *      release.
 686                  *
 687                  * 3)   The daemon and consumers are starved for working
 688                  *      clean buffers.  This is the 'bufspace' sleep below
 689                  *      which will inefficiently trade bufs with bqrelse
 690                  *      until we return to condition 2.
 691                  */
 692                 while (bufspace > lobufspace ||
 693                     numfreebuffers < hifreebuffers) {
 694                         if (buf_recycle(false) != 0) {
 695                                 atomic_set_int(&needsbuffer, 1);
 696                                 if (buf_recycle(false) != 0) {
 697                                         rw_wlock(&nblock);
 698                                         if (needsbuffer)
 699                                                 rw_sleep(__DEVOLATILE(void *,
 700                                                     &needsbuffer), &nblock,
 701                                                     PRIBIO|PDROP, "bufspace",
 702                                                     hz/10);
 703                                         else
 704                                                 rw_wunlock(&nblock);
 705                                 }
 706                         }
 707                         maybe_yield();
 708                 }
 709
 710                 /*
 711                  * Re-check our limits under the exclusive nblock.
 712                  */
 713                 rw_wlock(&nblock);
 714                 if (bufspace < bufspacethresh &&
 715                     numfreebuffers > lofreebuffers) {
 716                         bufspace_request = 0;
 717                         rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
 718                             "-", hz);
 719                 } else
 720                         rw_wunlock(&nblock);
 721         }
 722 }
 723
/*
 * Descriptor handed to kproc_start() at SI_SUB_KTHREAD_BUF: the process
 * name "bufspacedaemon", its entry point bufspace_daemon(), and the address
 * of bufspacedaemonproc (presumably where the new process pointer is
 * recorded -- see kproc(9)).
 */
static struct kproc_desc bufspace_kp = {
        "bufspacedaemon",
        bufspace_daemon,
        &bufspacedaemonproc
};
SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
    &bufspace_kp);
 731
 732 /*
 733  *      bufmallocadjust:
 734  *
 735  *      Adjust the reported bufspace for a malloc managed buffer, possibly
 736  *      waking any waiters.
 737  */
 738 static void
 739 bufmallocadjust(struct buf *bp, int bufsize)
 740 {
 741         int diff;
 742
 743         KASSERT((bp->b_flags & B_MALLOC) != 0,
 744             ("bufmallocadjust: non-malloc buf %p", bp));
 745         diff = bufsize - bp->b_bufsize;
 746         if (diff < 0)
 747                 atomic_subtract_long(&bufmallocspace, -diff);
 748         else
 749                 atomic_add_long(&bufmallocspace, diff);
 750         bp->b_bufsize = bufsize;
 751 }
 752
 753 /*
 754  *      runningwakeup:
 755  *
 756  *      Wake up processes that are waiting on asynchronous writes to fall
 757  *      below lorunningspace.
 758  */
 759 static void
 760 runningwakeup(void)
 761 {
 762
 763         mtx_lock(&rbreqlock);
 764         if (runningbufreq) {
 765                 runningbufreq = 0;
 766                 wakeup(&runningbufreq);
 767         }
 768         mtx_unlock(&rbreqlock);
 769 }
 770
 771 /*
 772  *      runningbufwakeup:
 773  *
 774  *      Decrement the outstanding write count according.
 775  */
 776 void
 777 runningbufwakeup(struct buf *bp)
 778 {
 779         long space, bspace;
 780
 781         bspace = bp->b_runningbufspace;
 782         if (bspace == 0)
 783                 return;
 784         space = atomic_fetchadd_long(&runningbufspace, -bspace);
 785         KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
 786             space, bspace));
 787         bp->b_runningbufspace = 0;
 788         /*
 789          * Only acquire the lock and wakeup on the transition from exceeding
 790          * the threshold to falling below it.
 791          */
 792         if (space < lorunningspace)
 793                 return;
 794         if (space - bspace > lorunningspace)
 795                 return;
 796         runningwakeup();
 797 }
 798
 799 /*
 800  *      waitrunningbufspace()
 801  *
 802  *      runningbufspace is a measure of the amount of I/O currently
 803  *      running.  This routine is used in async-write situations to
 804  *      prevent creating huge backups of pending writes to a device.
 805  *      Only asynchronous writes are governed by this function.
 806  *
 807  *      This does NOT turn an async write into a sync write.  It waits
 808  *      for earlier writes to complete and generally returns before the
 809  *      caller's write has reached the device.
 810  */
 811 void
 812 waitrunningbufspace(void)
 813 {
 814
 815         mtx_lock(&rbreqlock);
 816         while (runningbufspace > hirunningspace) {
 817                 runningbufreq = 1;
 818                 msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
 819         }
 820         mtx_unlock(&rbreqlock);
 821 }
 822
 823
 824 /*
 825  *      vfs_buf_test_cache:
 826  *
 827  *      Called when a buffer is extended.  This function clears the B_CACHE
 828  *      bit if the newly extended portion of the buffer does not contain
 829  *      valid data.
 830  */
 831 static __inline void
 832 vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off,
 833     vm_offset_t size, vm_page_t m)
 834 {
 835
 836         VM_OBJECT_ASSERT_LOCKED(m->object);
 837         if (bp->b_flags & B_CACHE) {
 838                 int base = (foff + off) & PAGE_MASK;
 839                 if (vm_page_is_valid(m, base, size) == 0)
 840                         bp->b_flags &= ~B_CACHE;
 841         }
 842 }
 843
 844 /* Wake up the buffer daemon if necessary */
 845 static __inline void
 846 bd_wakeup(void)
 847 {
 848
 849         mtx_lock(&bdlock);
 850         if (bd_request == 0) {
 851                 bd_request = 1;
 852                 wakeup(&bd_request);
 853         }
 854         mtx_unlock(&bdlock);
 855 }
 856
 857 /*
 858  * bd_speedup - speedup the buffer cache flushing code
 859  */
 860 void
 861 bd_speedup(void)
 862 {
 863         int needwake;
 864
 865         mtx_lock(&bdlock);
 866         needwake = 0;
 867         if (bd_speedupreq == 0 || bd_request == 0)
 868                 needwake = 1;
 869         bd_speedupreq = 1;
 870         bd_request = 1;
 871         if (needwake)
 872                 wakeup(&bd_request);
 873         mtx_unlock(&bdlock);
 874 }
 875
 876 #ifndef NSWBUF_MIN
 877 #define NSWBUF_MIN      16
 878 #endif
 879
 880 #ifdef __i386__
 881 #define TRANSIENT_DENOM 5
 882 #else
 883 #define TRANSIENT_DENOM 10
 884 #endif
 885
 886 /*
 887  * Calculating buffer cache scaling values and reserve space for buffer
 888  * headers.  This is called during low level kernel initialization and
 889  * may be called more then once.  We CANNOT write to the memory area
 890  * being reserved at this time.
 891  */
 892 caddr_t
 893 kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
 894 {
 895         int tuned_nbuf;
 896         long maxbuf, maxbuf_sz, buf_sz, biotmap_sz;
 897
 898         /*
 899          * physmem_est is in pages.  Convert it to kilobytes (assumes
 900          * PAGE_SIZE is >= 1K)
 901          */
 902         physmem_est = physmem_est * (PAGE_SIZE / 1024);
 903
 904         /*
 905          * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
 906          * For the first 64MB of ram nominally allocate sufficient buffers to
 907          * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
 908          * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
 909          * the buffer cache we limit the eventual kva reservation to
 910          * maxbcache bytes.
 911          *
 912          * factor represents the 1/4 x ram conversion.
 913          */
 914         if (nbuf == 0) {
 915                 int factor = 4 * BKVASIZE / 1024;
 916
 917                 nbuf = 50;
 918                 if (physmem_est > 4096)
 919                         nbuf += min((physmem_est - 4096) / factor,
 920                             65536 / factor);
 921                 if (physmem_est > 65536)
 922                         nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
 923                             32 * 1024 * 1024 / (factor * 5));
 924
 925                 if (maxbcache && nbuf > maxbcache / BKVASIZE)
 926                         nbuf = maxbcache / BKVASIZE;
 927                 tuned_nbuf = 1;
 928         } else
 929                 tuned_nbuf = 0;
 930
 931         /* XXX Avoid unsigned long overflows later on with maxbufspace. */
 932         maxbuf = (LONG_MAX / 3) / BKVASIZE;
 933         if (nbuf > maxbuf) {
 934                 if (!tuned_nbuf)
 935                         printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
 936                             maxbuf);
 937                 nbuf = maxbuf;
 938         }
 939
 940         /*
 941          * Ideal allocation size for the transient bio submap is 10%
 942          * of the maximal space buffer map.  This roughly corresponds
 943          * to the amount of the buffer mapped for typical UFS load.
 944          *
 945          * Clip the buffer map to reserve space for the transient
 946          * BIOs, if its extent is bigger than 90% (80% on i386) of the
 947          * maximum buffer map extent on the platform.
 948          *
 949          * The fall-back to the maxbuf in case of maxbcache unset,
 950          * allows to not trim the buffer KVA for the architectures
 951          * with ample KVA space.
 952          */
 953         if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
 954                 maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
 955                 buf_sz = (long)nbuf * BKVASIZE;
 956                 if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
 957                     (TRANSIENT_DENOM - 1)) {
 958                         /*
 959                          * There is more KVA than memory.  Do not
 960                          * adjust buffer map size, and assign the rest
 961                          * of maxbuf to transient map.
 962                          */
 963                         biotmap_sz = maxbuf_sz - buf_sz;
 964                 } else {
 965                         /*
 966                          * Buffer map spans all KVA we could afford on
 967                          * this platform.  Give 10% (20% on i386) of
 968                          * the buffer map to the transient bio map.
 969                          */
 970                         biotmap_sz = buf_sz / TRANSIENT_DENOM;
 971                         buf_sz -= biotmap_sz;
 972                 }
 973                 if (biotmap_sz / INT_MAX > MAXPHYS)
 974                         bio_transient_maxcnt = INT_MAX;
 975                 else
 976                         bio_transient_maxcnt = biotmap_sz / MAXPHYS;
 977                 /*
 978                  * Artificially limit to 1024 simultaneous in-flight I/Os
 979                  * using the transient mapping.
 980                  */
 981                 if (bio_transient_maxcnt > 1024)
 982                         bio_transient_maxcnt = 1024;
 983                 if (tuned_nbuf)
 984                         nbuf = buf_sz / BKVASIZE;
 985         }
 986
 987         /*
 988          * swbufs are used as temporary holders for I/O, such as paging I/O.
 989          * We have no less then 16 and no more then 256.
 990          */
 991         nswbuf = min(nbuf / 4, 256);
 992         TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf);
 993         if (nswbuf < NSWBUF_MIN)
 994                 nswbuf = NSWBUF_MIN;
 995
 996         /*
 997          * Reserve space for the buffer cache buffers
 998          */
 999         swbuf = (void *)v;
1000         v = (caddr_t)(swbuf + nswbuf);
1001         buf = (void *)v;
1002         v = (caddr_t)(buf + nbuf);
1003
1004         return(v);
1005 }
1006
1007 /* Initialize the buffer subsystem.  Called before use of any buffers. */
1008 void
1009 bufinit(void)
1010 {
1011         struct buf *bp;
1012         int i;
1013
1014         CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
1015         mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
1016         mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
1017         for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
1018                 mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
1019         mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
1020         rw_init(&nblock, "needsbuffer lock");
1021         mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
1022         mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
1023
1024         /* next, make a null set of free lists */
1025         for (i = 0; i < BUFFER_QUEUES; i++)
1026                 TAILQ_INIT(&bufqueues[i]);
1027
1028         unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
1029
1030         /* finally, initialize each buffer header and stick on empty q */
1031         for (i = 0; i < nbuf; i++) {
1032                 bp = &buf[i];
1033                 bzero(bp, sizeof *bp);
1034                 bp->b_flags = B_INVAL;
1035                 bp->b_rcred = NOCRED;
1036                 bp->b_wcred = NOCRED;
1037                 bp->b_qindex = QUEUE_EMPTY;
1038                 bp->b_xflags = 0;
1039                 bp->b_data = bp->b_kvabase = unmapped_buf;
1040                 LIST_INIT(&bp->b_dep);
1041                 BUF_LOCKINIT(bp);
1042                 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
1043 #ifdef INVARIANTS
1044                 bq_len[QUEUE_EMPTY]++;
1045 #endif
1046         }
1047
1048         /*
1049          * maxbufspace is the absolute maximum amount of buffer space we are
1050          * allowed to reserve in KVM and in real terms.  The absolute maximum
1051          * is nominally used by metadata.  hibufspace is the nominal maximum
1052          * used by most other requests.  The differential is required to
1053          * ensure that metadata deadlocks don't occur.
1054          *
1055          * maxbufspace is based on BKVASIZE.  Allocating buffers larger then
1056          * this may result in KVM fragmentation which is not handled optimally
1057          * by the system. XXX This is less true with vmem.  We could use
1058          * PAGE_SIZE.
1059          */
1060         maxbufspace = (long)nbuf * BKVASIZE;
1061         hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
1062         lobufspace = (hibufspace / 20) * 19; /* 95% */
1063         bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
1064
1065         /*
1066          * Note: The 16 MiB upper limit for hirunningspace was chosen
1067          * arbitrarily and may need further tuning. It corresponds to
1068          * 128 outstanding write IO requests (if IO size is 128 KiB),
1069          * which fits with many RAID controllers' tagged queuing limits.
1070          * The lower 1 MiB limit is the historical upper limit for
1071          * hirunningspace.
1072          */
1073         hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF),
1074             16 * 1024 * 1024), 1024 * 1024);
1075         lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);
1076
1077         /*
1078          * Limit the amount of malloc memory since it is wired permanently into
1079          * the kernel space.  Even though this is accounted for in the buffer
1080          * allocation, we don't want the malloced region to grow uncontrolled.
1081          * The malloc scheme improves memory utilization significantly on
1082          * average (small) directories.
1083          */
1084         maxbufmallocspace = hibufspace / 20;
1085
1086         /*
1087          * Reduce the chance of a deadlock occurring by limiting the number
1088          * of delayed-write dirty buffers we allow to stack up.
1089          */
1090         hidirtybuffers = nbuf / 4 + 20;
1091         dirtybufthresh = hidirtybuffers * 9 / 10;
1092         numdirtybuffers = 0;
1093         /*
1094          * To support extreme low-memory systems, make sure hidirtybuffers
1095          * cannot eat up all available buffer space.  This occurs when our
1096          * minimum cannot be met.  We try to size hidirtybuffers to 3/4 our
1097          * buffer space assuming BKVASIZE'd buffers.
1098          */
1099         while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
1100                 hidirtybuffers >>= 1;
1101         }
1102         lodirtybuffers = hidirtybuffers / 2;
1103
1104         /*
1105          * lofreebuffers should be sufficient to avoid stalling waiting on
1106          * buf headers under heavy utilization.  The bufs in per-cpu caches
1107          * are counted as free but will be unavailable to threads executing
1108          * on other cpus.
1109          *
1110          * hifreebuffers is the free target for the bufspace daemon.  This
1111          * should be set appropriately to limit work per-iteration.
1112          */
1113         lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
1114         hifreebuffers = (3 * lofreebuffers) / 2;
1115         numfreebuffers = nbuf;
1116
1117         bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
1118             VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
1119
1120         /* Setup the kva and free list allocators. */
1121         vmem_set_reclaim(buffer_arena, bufkva_reclaim);
1122         buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
1123             NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
1124
1125         /*
1126          * Size the clean queue according to the amount of buffer space.
1127          * One queue per-256mb up to the max.  More queues gives better
1128          * concurrency but less accurate LRU.
1129          */
1130         clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
1131
1132 }
1133
#if defined(INVARIANTS)
/*
 * Sanity checks for a buffer that is expected to be mapped: neither
 * b_kvabase nor b_data may still be the unmapped_buf placeholder, and
 * b_data must lie outside [unmapped_buf, unmapped_buf + MAXPHYS).
 */
static inline void
vfs_buf_check_mapped(struct buf *bp)
{

        KASSERT(!(bp->b_kvabase == unmapped_buf),
            ("mapped buf: b_kvabase was not updated %p", bp));
        KASSERT(!(bp->b_data == unmapped_buf),
            ("mapped buf: b_data was not updated %p", bp));
        KASSERT(!(bp->b_data >= unmapped_buf &&
            bp->b_data < unmapped_buf + MAXPHYS),
            ("b_data + b_offset unmapped %p", bp));
}

/*
 * Sanity check for a buffer that is expected to be unmapped: b_data
 * must be exactly the unmapped_buf placeholder.
 */
static inline void
vfs_buf_check_unmapped(struct buf *bp)
{

        KASSERT(!(bp->b_data != unmapped_buf),
            ("unmapped buf: corrupted b_data %p", bp));
}

/* With INVARIANTS the checks are real function calls. */
#define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
#define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
#else
/* Without INVARIANTS the checks expand to empty statements. */
#define BUF_CHECK_MAPPED(bp) do {} while (0)
#define BUF_CHECK_UNMAPPED(bp) do {} while (0)
#endif
1161
1162 static int
1163 isbufbusy(struct buf *bp)
1164 {
1165         if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) ||
1166             ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI))
1167                 return (1);
1168         return (0);
1169 }
1170
1171 /*
1172  * Shutdown the system cleanly to prepare for reboot, halt, or power off.
1173  */
1174 void
1175 bufshutdown(int show_busybufs)
1176 {
1177         static int first_buf_printf = 1;
1178         struct buf *bp;
1179         int iter, nbusy, pbusy;
1180 #ifndef PREEMPTION
1181         int subiter;
1182 #endif
1183
1184         /*
1185          * Sync filesystems for shutdown
1186          */
1187         wdog_kern_pat(WD_LASTVAL);
1188         sys_sync(curthread, NULL);
1189
1190         /*
1191          * With soft updates, some buffers that are
1192          * written will be remarked as dirty until other
1193          * buffers are written.
1194          */
1195         for (iter = pbusy = 0; iter < 20; iter++) {
1196                 nbusy = 0;
1197                 for (bp = &buf[nbuf]; --bp >= buf; )
1198                         if (isbufbusy(bp))
1199                                 nbusy++;
1200                 if (nbusy == 0) {
1201                         if (first_buf_printf)
1202                                 printf("All buffers synced.");
1203                         break;
1204                 }
1205                 if (first_buf_printf) {
1206                         printf("Syncing disks, buffers remaining... ");
1207                         first_buf_printf = 0;
1208                 }
1209                 printf("%d ", nbusy);
1210                 if (nbusy < pbusy)
1211                         iter = 0;
1212                 pbusy = nbusy;
1213
1214                 wdog_kern_pat(WD_LASTVAL);
1215                 sys_sync(curthread, NULL);
1216
1217 #ifdef PREEMPTION
1218                 /*
1219                  * Drop Giant and spin for a while to allow
1220                  * interrupt threads to run.
1221                  */
1222                 DROP_GIANT();
1223                 DELAY(50000 * iter);
1224                 PICKUP_GIANT();
1225 #else
1226                 /*
1227                  * Drop Giant and context switch several times to
1228                  * allow interrupt threads to run.
1229                  */
1230                 DROP_GIANT();
1231                 for (subiter = 0; subiter < 50 * iter; subiter++) {
1232                         thread_lock(curthread);
1233                         mi_switch(SW_VOL, NULL);
1234                         thread_unlock(curthread);
1235                         DELAY(1000);
1236                 }
1237                 PICKUP_GIANT();
1238 #endif
1239         }
1240         printf("\n");
1241         /*
1242          * Count only busy local buffers to prevent forcing
1243          * a fsck if we're just a client of a wedged NFS server
1244          */
1245         nbusy = 0;
1246         for (bp = &buf[nbuf]; --bp >= buf; ) {
1247                 if (isbufbusy(bp)) {
1248 #if 0
1249 /* XXX: This is bogus.  We should probably have a BO_REMOTE flag instead */
1250                         if (bp->b_dev == NULL) {
1251                                 TAILQ_REMOVE(&mountlist,
1252                                     bp->b_vp->v_mount, mnt_list);
1253                                 continue;
1254                         }
1255 #endif
1256                         nbusy++;
1257                         if (show_busybufs > 0) {
1258                                 printf(
1259             "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:",
1260                                     nbusy, bp, bp->b_vp, bp->b_flags,
1261                                     (intmax_t)bp->b_blkno,
1262                                     (intmax_t)bp->b_lblkno);
1263                                 BUF_LOCKPRINTINFO(bp);
1264                                 if (show_busybufs > 1)
1265                                         vn_printf(bp->b_vp,
1266                                             "vnode content: ");
1267                         }
1268                 }
1269         }
1270         if (nbusy) {
1271                 /*
1272                  * Failed to sync all blocks. Indicate this and don't
1273                  * unmount filesystems (thus forcing an fsck on reboot).
1274                  */
1275                 printf("Giving up on %d buffers\n", nbusy);
1276                 DELAY(5000000); /* 5 seconds */
1277         } else {
1278                 if (!first_buf_printf)
1279                         printf("Final sync complete\n");
1280                 /*
1281                  * Unmount filesystems
1282                  */
1283                 if (panicstr == NULL)
1284                         vfs_unmountall();
1285         }
1286         swapoff_all();
1287         DELAY(100000);          /* wait for console output to finish */
1288 }
1289
1290 static void
1291 bpmap_qenter(struct buf *bp)
1292 {
1293
1294         BUF_CHECK_MAPPED(bp);
1295
1296         /*
1297          * bp->b_data is relative to bp->b_offset, but
1298          * bp->b_offset may be offset into the first page.
1299          */
1300         bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
1301         pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
1302         bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
1303             (vm_offset_t)(bp->b_offset & PAGE_MASK));
1304 }
1305
1306 /*
1307  *      binsfree:
1308  *
1309  *      Insert the buffer into the appropriate free list.
1310  */
1311 static void
1312 binsfree(struct buf *bp, int qindex)
1313 {
1314         struct mtx *olock, *nlock;
1315
1316         if (qindex != QUEUE_EMPTY) {
1317                 BUF_ASSERT_XLOCKED(bp);
1318         }
1319
1320         /*
1321          * Stick to the same clean queue for the lifetime of the buf to
1322          * limit locking below.  Otherwise pick ont sequentially.
1323          */
1324         if (qindex == QUEUE_CLEAN) {
1325                 if (bqisclean(bp->b_qindex))
1326                         qindex = bp->b_qindex;
1327                 else
1328                         qindex = bqcleanq();
1329         }
1330
1331         /*
1332          * Handle delayed bremfree() processing.
1333          */
1334         nlock = bqlock(qindex);
1335         if (bp->b_flags & B_REMFREE) {
1336                 olock = bqlock(bp->b_qindex);
1337                 mtx_lock(olock);
1338                 bremfreel(bp);
1339                 if (olock != nlock) {
1340                         mtx_unlock(olock);
1341                         mtx_lock(nlock);
1342                 }
1343         } else
1344                 mtx_lock(nlock);
1345
1346         if (bp->b_qindex != QUEUE_NONE)
1347                 panic("binsfree: free buffer onto another queue???");
1348
1349         bp->b_qindex = qindex;
1350         if (bp->b_flags & B_AGE)
1351                 TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
1352         else
1353                 TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
1354 #ifdef INVARIANTS
1355         bq_len[bp->b_qindex]++;
1356 #endif
1357         mtx_unlock(nlock);
1358 }
1359
1360 /*
1361  * buf_free:
1362  *
1363  *      Free a buffer to the buf zone once it no longer has valid contents.
1364  */
1365 static void
1366 buf_free(struct buf *bp)
1367 {
1368
1369         if (bp->b_flags & B_REMFREE)
1370                 bremfreef(bp);
1371         if (bp->b_vflags & BV_BKGRDINPROG)
1372                 panic("losing buffer 1");
1373         if (bp->b_rcred != NOCRED) {
1374                 crfree(bp->b_rcred);
1375                 bp->b_rcred = NOCRED;
1376         }
1377         if (bp->b_wcred != NOCRED) {
1378                 crfree(bp->b_wcred);
1379                 bp->b_wcred = NOCRED;
1380         }
1381         if (!LIST_EMPTY(&bp->b_dep))
1382                 buf_deallocate(bp);
1383         bufkva_free(bp);
1384         BUF_UNLOCK(bp);
1385         uma_zfree(buf_zone, bp);
1386         atomic_add_int(&numfreebuffers, 1);
1387         bufspace_wakeup();
1388 }
1389
1390 /*
1391  * buf_import:
1392  *
1393  *      Import bufs into the uma cache from the buf list.  The system still
1394  *      expects a static array of bufs and much of the synchronization
1395  *      around bufs assumes type stable storage.  As a result, UMA is used
1396  *      only as a per-cpu cache of bufs still maintained on a global list.
1397  */
1398 static int
1399 buf_import(void *arg, void **store, int cnt, int flags)
1400 {
1401         struct buf *bp;
1402         int i;
1403
1404         mtx_lock(&bqlocks[QUEUE_EMPTY]);
1405         for (i = 0; i < cnt; i++) {
1406                 bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1407                 if (bp == NULL)
1408                         break;
1409                 bremfreel(bp);
1410                 store[i] = bp;
1411         }
1412         mtx_unlock(&bqlocks[QUEUE_EMPTY]);
1413
1414         return (i);
1415 }
1416
1417 /*
1418  * buf_release:
1419  *
1420  *      Release bufs from the uma cache back to the buffer queues.
1421  */
1422 static void
1423 buf_release(void *arg, void **store, int cnt)
1424 {
1425         int i;
1426
1427         for (i = 0; i < cnt; i++)
1428                 binsfree(store[i], QUEUE_EMPTY);
1429 }
1430
1431 /*
1432  * buf_alloc:
1433  *
1434  *      Allocate an empty buffer header.
1435  */
1436 static struct buf *
1437 buf_alloc(void)
1438 {
1439         struct buf *bp;
1440
1441         bp = uma_zalloc(buf_zone, M_NOWAIT);
1442         if (bp == NULL) {
1443                 bufspace_daemonwakeup();
1444                 atomic_add_int(&numbufallocfails, 1);
1445                 return (NULL);
1446         }
1447
1448         /*
1449          * Wake-up the bufspace daemon on transition.
1450          */
1451         if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
1452                 bufspace_daemonwakeup();
1453
1454         if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
1455                 panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
1456
1457         KASSERT(bp->b_vp == NULL,
1458             ("bp: %p still has vnode %p.", bp, bp->b_vp));
1459         KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
1460             ("invalid buffer %p flags %#x", bp, bp->b_flags));
1461         KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
1462             ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
1463         KASSERT(bp->b_npages == 0,
1464             ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
1465         KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
1466         KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
1467
1468         bp->b_flags = 0;
1469         bp->b_ioflags = 0;
1470         bp->b_xflags = 0;
1471         bp->b_vflags = 0;
1472         bp->b_vp = NULL;
1473         bp->b_blkno = bp->b_lblkno = 0;
1474         bp->b_offset = NOOFFSET;
1475         bp->b_iodone = 0;
1476         bp->b_error = 0;
1477         bp->b_resid = 0;
1478         bp->b_bcount = 0;
1479         bp->b_npages = 0;
1480         bp->b_dirtyoff = bp->b_dirtyend = 0;
1481         bp->b_bufobj = NULL;
1482         bp->b_pin_count = 0;
1483         bp->b_data = bp->b_kvabase = unmapped_buf;
1484         bp->b_fsprivate1 = NULL;
1485         bp->b_fsprivate2 = NULL;
1486         bp->b_fsprivate3 = NULL;
1487         LIST_INIT(&bp->b_dep);
1488
1489         return (bp);
1490 }
1491
1492 /*
1493  *      buf_qrecycle:
1494  *
1495  *      Free a buffer from the given bufqueue.  kva controls whether the
1496  *      freed buf must own some kva resources.  This is used for
1497  *      defragmenting.
1498  */
1499 static int
1500 buf_qrecycle(int qindex, bool kva)
1501 {
1502         struct buf *bp, *nbp;
1503
1504         if (kva)
1505                 atomic_add_int(&bufdefragcnt, 1);
1506         nbp = NULL;
1507         mtx_lock(&bqlocks[qindex]);
1508         nbp = TAILQ_FIRST(&bufqueues[qindex]);
1509
1510         /*
1511          * Run scan, possibly freeing data and/or kva mappings on the fly
1512          * depending.
1513          */
1514         while ((bp = nbp) != NULL) {
1515                 /*
1516                  * Calculate next bp (we can only use it if we do not
1517                  * release the bqlock).
1518                  */
1519                 nbp = TAILQ_NEXT(bp, b_freelist);
1520
1521                 /*
1522                  * If we are defragging then we need a buffer with
1523                  * some kva to reclaim.
1524                  */
1525                 if (kva && bp->b_kvasize == 0)
1526                         continue;
1527
1528                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
1529                         continue;
1530
1531                 /*
1532                  * Skip buffers with background writes in progress.
1533                  */
1534                 if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
1535                         BUF_UNLOCK(bp);
1536                         continue;
1537                 }
1538
1539                 KASSERT(bp->b_qindex == qindex,
1540                     ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
1541                 /*
1542                  * NOTE:  nbp is now entirely invalid.  We can only restart
1543                  * the scan from this point on.
1544                  */
1545                 bremfreel(bp);
1546                 mtx_unlock(&bqlocks[qindex]);
1547
1548                 /*
1549                  * Requeue the background write buffer with error and
1550                  * restart the scan.
1551                  */
1552                 if ((bp->b_vflags & BV_BKGRDERR) != 0) {
1553                         bqrelse(bp);
1554                         mtx_lock(&bqlocks[qindex]);
1555                         nbp = TAILQ_FIRST(&bufqueues[qindex]);
1556                         continue;
1557                 }
1558                 bp->b_flags |= B_INVAL;
1559                 brelse(bp);
1560                 return (0);
1561         }
1562         mtx_unlock(&bqlocks[qindex]);
1563
1564         return (ENOBUFS);
1565 }
1566
1567 /*
1568  *      buf_recycle:
1569  *
1570  *      Iterate through all clean queues until we find a buf to recycle or
1571  *      exhaust the search.
1572  */
1573 static int
1574 buf_recycle(bool kva)
1575 {
1576         int qindex, first_qindex;
1577
1578         qindex = first_qindex = bqcleanq();
1579         do {
1580                 if (buf_qrecycle(qindex, kva) == 0)
1581                         return (0);
1582                 if (++qindex == QUEUE_CLEAN + clean_queues)
1583                         qindex = QUEUE_CLEAN;
1584         } while (qindex != first_qindex);
1585
1586         return (ENOBUFS);
1587 }
1588
1589 /*
1590  *      buf_scan:
1591  *
1592  *      Scan the clean queues looking for a buffer to recycle.  needsbuffer
1593  *      is set on failure so that the caller may optionally bufspace_wait()
1594  *      in a race-free fashion.
1595  */
1596 static int
1597 buf_scan(bool defrag)
1598 {
1599         int error;
1600
1601         /*
1602          * To avoid heavy synchronization and wakeup races we set
1603          * needsbuffer and re-poll before failing.  This ensures that
1604          * no frees can be missed between an unsuccessful poll and
1605          * going to sleep in a synchronized fashion.
1606          */
1607         if ((error = buf_recycle(defrag)) != 0) {
1608                 atomic_set_int(&needsbuffer, 1);
1609                 bufspace_daemonwakeup();
1610                 error = buf_recycle(defrag);
1611         }
1612         if (error == 0)
1613                 atomic_add_int(&getnewbufrestarts, 1);
1614         return (error);
1615 }
1616
1617 /*
1618  *      bremfree:
1619  *
1620  *      Mark the buffer for removal from the appropriate free list.
1621  *
1622  */
1623 void
1624 bremfree(struct buf *bp)
1625 {
1626
1627         CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1628         KASSERT((bp->b_flags & B_REMFREE) == 0,
1629             ("bremfree: buffer %p already marked for delayed removal.", bp));
1630         KASSERT(bp->b_qindex != QUEUE_NONE,
1631             ("bremfree: buffer %p not on a queue.", bp));
1632         BUF_ASSERT_XLOCKED(bp);
1633
1634         bp->b_flags |= B_REMFREE;
1635 }
1636
1637 /*
1638  *      bremfreef:
1639  *
1640  *      Force an immediate removal from a free list.  Used only in nfs when
1641  *      it abuses the b_freelist pointer.
1642  */
1643 void
1644 bremfreef(struct buf *bp)
1645 {
1646         struct mtx *qlock;
1647
1648         qlock = bqlock(bp->b_qindex);
1649         mtx_lock(qlock);
1650         bremfreel(bp);
1651         mtx_unlock(qlock);
1652 }
1653
1654 /*
1655  *      bremfreel:
1656  *
1657  *      Removes a buffer from the free list, must be called with the
1658  *      correct qlock held.
1659  */
1660 static void
1661 bremfreel(struct buf *bp)
1662 {
1663
1664         CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
1665             bp, bp->b_vp, bp->b_flags);
1666         KASSERT(bp->b_qindex != QUEUE_NONE,
1667             ("bremfreel: buffer %p not on a queue.", bp));
1668         if (bp->b_qindex != QUEUE_EMPTY) {
1669                 BUF_ASSERT_XLOCKED(bp);
1670         }
1671         mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
1672
1673         TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
1674 #ifdef INVARIANTS
1675         KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
1676             bp->b_qindex));
1677         bq_len[bp->b_qindex]--;
1678 #endif
1679         bp->b_qindex = QUEUE_NONE;
1680         bp->b_flags &= ~B_REMFREE;
1681 }
1682
1683 /*
1684  *      bufkva_free:
1685  *
1686  *      Free the kva allocation for a buffer.
1687  *
1688  */
1689 static void
1690 bufkva_free(struct buf *bp)
1691 {
1692
1693 #ifdef INVARIANTS
1694         if (bp->b_kvasize == 0) {
1695                 KASSERT(bp->b_kvabase == unmapped_buf &&
1696                     bp->b_data == unmapped_buf,
1697                     ("Leaked KVA space on %p", bp));
1698         } else if (buf_mapped(bp))
1699                 BUF_CHECK_MAPPED(bp);
1700         else
1701                 BUF_CHECK_UNMAPPED(bp);
1702 #endif
1703         if (bp->b_kvasize == 0)
1704                 return;
1705
1706         vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize);
1707         atomic_subtract_long(&bufkvaspace, bp->b_kvasize);
1708         atomic_add_int(&buffreekvacnt, 1);
1709         bp->b_data = bp->b_kvabase = unmapped_buf;
1710         bp->b_kvasize = 0;
1711 }
1712
1713 /*
1714  *      bufkva_alloc:
1715  *
1716  *      Allocate the buffer KVA and set b_kvasize and b_kvabase.
1717  */
1718 static int
1719 bufkva_alloc(struct buf *bp, int maxsize, int gbflags)
1720 {
1721         vm_offset_t addr;
1722         int error;
1723
1724         KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0,
1725             ("Invalid gbflags 0x%x in %s", gbflags, __func__));
1726
1727         bufkva_free(bp);
1728
1729         addr = 0;
1730         error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr);
1731         if (error != 0) {
1732                 /*
1733                  * Buffer map is too fragmented.  Request the caller
1734                  * to defragment the map.
1735                  */
1736                 return (error);
1737         }
1738         bp->b_kvabase = (caddr_t)addr;
1739         bp->b_kvasize = maxsize;
1740         atomic_add_long(&bufkvaspace, bp->b_kvasize);
1741         if ((gbflags & GB_UNMAPPED) != 0) {
1742                 bp->b_data = unmapped_buf;
1743                 BUF_CHECK_UNMAPPED(bp);
1744         } else {
1745                 bp->b_data = bp->b_kvabase;
1746                 BUF_CHECK_MAPPED(bp);
1747         }
1748         return (0);
1749 }
1750
1751 /*
1752  *      bufkva_reclaim:
1753  *
1754  *      Reclaim buffer kva by freeing buffers holding kva.  This is a vmem
1755  *      callback that fires to avoid returning failure.
1756  */
1757 static void
1758 bufkva_reclaim(vmem_t *vmem, int flags)
1759 {
1760         int i;
1761
1762         for (i = 0; i < 5; i++)
1763                 if (buf_scan(true) != 0)
1764                         break;
1765         return;
1766 }
1767
1768
1769 /*
1770  * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
1771  * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set,
1772  * the buffer is valid and we do not have to do anything.
1773  */
1774 void
1775 breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
1776     int cnt, struct ucred * cred)
1777 {
1778         struct buf *rabp;
1779         int i;
1780
1781         for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
1782                 if (inmem(vp, *rablkno))
1783                         continue;
1784                 rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
1785
1786                 if ((rabp->b_flags & B_CACHE) == 0) {
1787                         if (!TD_IS_IDLETHREAD(curthread)) {
1788 #ifdef RACCT
1789                                 if (racct_enable) {
1790                                         PROC_LOCK(curproc);
1791                                         racct_add_buf(curproc, rabp, 0);
1792                                         PROC_UNLOCK(curproc);
1793                                 }
1794 #endif /* RACCT */
1795                                 curthread->td_ru.ru_inblock++;
1796                         }
1797                         rabp->b_flags |= B_ASYNC;
1798                         rabp->b_flags &= ~B_INVAL;
1799                         rabp->b_ioflags &= ~BIO_ERROR;
1800                         rabp->b_iocmd = BIO_READ;
1801                         if (rabp->b_rcred == NOCRED && cred != NOCRED)
1802                                 rabp->b_rcred = crhold(cred);
1803                         vfs_busy_pages(rabp, 0);
1804                         BUF_KERNPROC(rabp);
1805                         rabp->b_iooffset = dbtob(rabp->b_blkno);
1806                         bstrategy(rabp);
1807                 } else {
1808                         brelse(rabp);
1809                 }
1810         }
1811 }
1812
1813 /*
1814  * Entry point for bread() and breadn() via #defines in sys/buf.h.
1815  *
1816  * Get a buffer with the specified data.  Look in the cache first.  We
1817  * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
1818  * is set, the buffer is valid and we do not have to do anything, see
1819  * getblk(). Also starts asynchronous I/O on read-ahead blocks.
1820  *
1821  * Always return a NULL buffer pointer (in bpp) when returning an error.
1822  */
1823 int
1824 breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
1825     int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp)
1826 {
1827         struct buf *bp;
1828         int rv = 0, readwait = 0;
1829
1830         CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
1831         /*
1832          * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
1833          */
1834         *bpp = bp = getblk(vp, blkno, size, 0, 0, flags);
1835         if (bp == NULL)
1836                 return (EBUSY);
1837
1838         /* if not found in cache, do some I/O */
1839         if ((bp->b_flags & B_CACHE) == 0) {
1840                 if (!TD_IS_IDLETHREAD(curthread)) {
1841 #ifdef RACCT
1842                         if (racct_enable) {
1843                                 PROC_LOCK(curproc);
1844                                 racct_add_buf(curproc, bp, 0);
1845                                 PROC_UNLOCK(curproc);
1846                         }
1847 #endif /* RACCT */
1848                         curthread->td_ru.ru_inblock++;
1849                 }
1850                 bp->b_iocmd = BIO_READ;
1851                 bp->b_flags &= ~B_INVAL;
1852                 bp->b_ioflags &= ~BIO_ERROR;
1853                 if (bp->b_rcred == NOCRED && cred != NOCRED)
1854                         bp->b_rcred = crhold(cred);
1855                 vfs_busy_pages(bp, 0);
1856                 bp->b_iooffset = dbtob(bp->b_blkno);
1857                 bstrategy(bp);
1858                 ++readwait;
1859         }
1860
1861         breada(vp, rablkno, rabsize, cnt, cred);
1862
1863         if (readwait) {
1864                 rv = bufwait(bp);
1865                 if (rv != 0) {
1866                         brelse(bp);
1867                         *bpp = NULL;
1868                 }
1869         }
1870         return (rv);
1871 }
1872
1873 /*
1874  * Write, release buffer on completion.  (Done by iodone
1875  * if async).  Do not bother writing anything if the buffer
1876  * is invalid.
1877  *
1878  * Note that we set B_CACHE here, indicating that buffer is
1879  * fully valid and thus cacheable.  This is true even of NFS
1880  * now so we set it generally.  This could be set either here
1881  * or in biodone() since the I/O is synchronous.  We put it
1882  * here.
1883  */
1884 int
1885 bufwrite(struct buf *bp)
1886 {
1887         int oldflags;
1888         struct vnode *vp;
1889         long space;
1890         int vp_md;
1891
1892         CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1893         if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) {
1894                 bp->b_flags |= B_INVAL | B_RELBUF;
1895                 bp->b_flags &= ~B_CACHE;
1896                 brelse(bp);
1897                 return (ENXIO);
1898         }
1899         if (bp->b_flags & B_INVAL) {
1900                 brelse(bp);
1901                 return (0);
1902         }
1903
1904         if (bp->b_flags & B_BARRIER)
1905                 barrierwrites++;
1906
1907         oldflags = bp->b_flags;
1908
1909         BUF_ASSERT_HELD(bp);
1910
1911         if (bp->b_pin_count > 0)
1912                 bunpin_wait(bp);
1913
1914         KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
1915             ("FFS background buffer should not get here %p", bp));
1916
1917         vp = bp->b_vp;
1918         if (vp)
1919                 vp_md = vp->v_vflag & VV_MD;
1920         else
1921                 vp_md = 0;
1922
1923         /*
1924          * Mark the buffer clean.  Increment the bufobj write count
1925          * before bundirty() call, to prevent other thread from seeing
1926          * empty dirty list and zero counter for writes in progress,
1927          * falsely indicating that the bufobj is clean.
1928          */
1929         bufobj_wref(bp->b_bufobj);
1930         bundirty(bp);
1931
1932         bp->b_flags &= ~B_DONE;
1933         bp->b_ioflags &= ~BIO_ERROR;
1934         bp->b_flags |= B_CACHE;
1935         bp->b_iocmd = BIO_WRITE;
1936
1937         vfs_busy_pages(bp, 1);
1938
1939         /*
1940          * Normal bwrites pipeline writes
1941          */
1942         bp->b_runningbufspace = bp->b_bufsize;
1943         space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
1944
1945         if (!TD_IS_IDLETHREAD(curthread)) {
1946 #ifdef RACCT
1947                 if (racct_enable) {
1948                         PROC_LOCK(curproc);
1949                         racct_add_buf(curproc, bp, 1);
1950                         PROC_UNLOCK(curproc);
1951                 }
1952 #endif /* RACCT */
1953                 curthread->td_ru.ru_oublock++;
1954         }
1955         if (oldflags & B_ASYNC)
1956                 BUF_KERNPROC(bp);
1957         bp->b_iooffset = dbtob(bp->b_blkno);
1958         bstrategy(bp);
1959
1960         if ((oldflags & B_ASYNC) == 0) {
1961                 int rtval = bufwait(bp);
1962                 brelse(bp);
1963                 return (rtval);
1964         } else if (space > hirunningspace) {
1965                 /*
1966                  * don't allow the async write to saturate the I/O
1967                  * system.  We will not deadlock here because
1968                  * we are blocking waiting for I/O that is already in-progress
1969                  * to complete. We do not block here if it is the update
1970                  * or syncer daemon trying to clean up as that can lead
1971                  * to deadlock.
1972                  */
1973                 if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
1974                         waitrunningbufspace();
1975         }
1976
1977         return (0);
1978 }
1979
1980 void
1981 bufbdflush(struct bufobj *bo, struct buf *bp)
1982 {
1983         struct buf *nbp;
1984
1985         if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
1986                 (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
1987                 altbufferflushes++;
1988         } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
1989                 BO_LOCK(bo);
1990                 /*
1991                  * Try to find a buffer to flush.
1992                  */
1993                 TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
1994                         if ((nbp->b_vflags & BV_BKGRDINPROG) ||
1995                             BUF_LOCK(nbp,
1996                                      LK_EXCLUSIVE | LK_NOWAIT, NULL))
1997                                 continue;
1998                         if (bp == nbp)
1999                                 panic("bdwrite: found ourselves");
2000                         BO_UNLOCK(bo);
2001                         /* Don't countdeps with the bo lock held. */
2002                         if (buf_countdeps(nbp, 0)) {
2003                                 BO_LOCK(bo);
2004                                 BUF_UNLOCK(nbp);
2005                                 continue;
2006                         }
2007                         if (nbp->b_flags & B_CLUSTEROK) {
2008                                 vfs_bio_awrite(nbp);
2009                         } else {
2010                                 bremfree(nbp);
2011                                 bawrite(nbp);
2012                         }
2013                         dirtybufferflushes++;
2014                         break;
2015                 }
2016                 if (nbp == NULL)
2017                         BO_UNLOCK(bo);
2018         }
2019 }
2020
2021 /*
2022  * Delayed write. (Buffer is marked dirty).  Do not bother writing
2023  * anything if the buffer is marked invalid.
2024  *
2025  * Note that since the buffer must be completely valid, we can safely
2026  * set B_CACHE.  In fact, we have to set B_CACHE here rather then in
2027  * biodone() in order to prevent getblk from writing the buffer
2028  * out synchronously.
2029  */
2030 void
2031 bdwrite(struct buf *bp)
2032 {
2033         struct thread *td = curthread;
2034         struct vnode *vp;
2035         struct bufobj *bo;
2036
2037         CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
2038         KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
2039         KASSERT((bp->b_flags & B_BARRIER) == 0,
2040             ("Barrier request in delayed write %p", bp));
2041         BUF_ASSERT_HELD(bp);
2042
2043         if (bp->b_flags & B_INVAL) {
2044                 brelse(bp);
2045                 return;
2046         }
2047
2048         /*
2049          * If we have too many dirty buffers, don't create any more.
2050          * If we are wildly over our limit, then force a complete
2051          * cleanup. Otherwise, just keep the situation from getting
2052          * out of control. Note that we have to avoid a recursive
2053          * disaster and not try to clean up after our own cleanup!
2054          */
2055         vp = bp->b_vp;
2056         bo = bp->b_bufobj;
2057         if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
2058                 td->td_pflags |= TDP_INBDFLUSH;
2059                 BO_BDFLUSH(bo, bp);
2060                 td->td_pflags &= ~TDP_INBDFLUSH;
2061         } else
2062                 recursiveflushes++;
2063
2064         bdirty(bp);
2065         /*
2066          * Set B_CACHE, indicating that the buffer is fully valid.  This is
2067          * true even of NFS now.
2068          */
2069         bp->b_flags |= B_CACHE;
2070
2071         /*
2072          * This bmap keeps the system from needing to do the bmap later,
2073          * perhaps when the system is attempting to do a sync.  Since it
2074          * is likely that the indirect block -- or whatever other datastructure
2075          * that the filesystem needs is still in memory now, it is a good
2076          * thing to do this.  Note also, that if the pageout daemon is
2077          * requesting a sync -- there might not be enough memory to do
2078          * the bmap then...  So, this is important to do.
2079          */
2080         if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
2081                 VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
2082         }
2083
2084         /*
2085          * Set the *dirty* buffer range based upon the VM system dirty
2086          * pages.
2087          *
2088          * Mark the buffer pages as clean.  We need to do this here to
2089          * satisfy the vnode_pager and the pageout daemon, so that it
2090          * thinks that the pages have been "cleaned".  Note that since
2091          * the pages are in a delayed write buffer -- the VFS layer
2092          * "will" see that the pages get written out on the next sync,
2093          * or perhaps the cluster will be completed.
2094          */
2095         vfs_clean_pages_dirty_buf(bp);
2096         bqrelse(bp);
2097
2098         /*
2099          * note: we cannot initiate I/O from a bdwrite even if we wanted to,
2100          * due to the softdep code.
2101          */
2102 }
2103
2104 /*
2105  *      bdirty:
2106  *
2107  *      Turn buffer into delayed write request.  We must clear BIO_READ and
2108  *      B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
2109  *      itself to properly update it in the dirty/clean lists.  We mark it
2110  *      B_DONE to ensure that any asynchronization of the buffer properly
2111  *      clears B_DONE ( else a panic will occur later ).
2112  *
2113  *      bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
2114  *      might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
2115  *      should only be called if the buffer is known-good.
2116  *
2117  *      Since the buffer is not on a queue, we do not update the numfreebuffers
2118  *      count.
2119  *
2120  *      The buffer must be on QUEUE_NONE.
2121  */
2122 void
2123 bdirty(struct buf *bp)
2124 {
2125
2126         CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
2127             bp, bp->b_vp, bp->b_flags);
2128         KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
2129         KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
2130             ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
2131         BUF_ASSERT_HELD(bp);
2132         bp->b_flags &= ~(B_RELBUF);
2133         bp->b_iocmd = BIO_WRITE;
2134
2135         if ((bp->b_flags & B_DELWRI) == 0) {
2136                 bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
2137                 reassignbuf(bp);
2138                 bdirtyadd();
2139         }
2140 }
2141
2142 /*
2143  *      bundirty:
2144  *
2145  *      Clear B_DELWRI for buffer.
2146  *
2147  *      Since the buffer is not on a queue, we do not update the numfreebuffers
2148  *      count.
2149  *
2150  *      The buffer must be on QUEUE_NONE.
2151  */
2152
2153 void
2154 bundirty(struct buf *bp)
2155 {
2156
2157         CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
2158         KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
2159         KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
2160             ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
2161         BUF_ASSERT_HELD(bp);
2162
2163         if (bp->b_flags & B_DELWRI) {
2164                 bp->b_flags &= ~B_DELWRI;
2165                 reassignbuf(bp);
2166                 bdirtysub();
2167         }
2168         /*
2169          * Since it is now being written, we can clear its deferred write flag.
2170          */
2171         bp->b_flags &= ~B_DEFERRED;
2172 }
2173
2174 /*
2175  *      bawrite:
2176  *
2177  *      Asynchronous write.  Start output on a buffer, but do not wait for
2178  *      it to complete.  The buffer is released when the output completes.
2179  *
2180  *      bwrite() ( or the VOP routine anyway ) is responsible for handling
2181  *      B_INVAL buffers.  Not us.
2182  */
2183 void
2184 bawrite(struct buf *bp)
2185 {
2186
2187         bp->b_flags |= B_ASYNC;
2188         (void) bwrite(bp);
2189 }
2190
2191 /*
2192  *      babarrierwrite:
2193  *
2194  *      Asynchronous barrier write.  Start output on a buffer, but do not
2195  *      wait for it to complete.  Place a write barrier after this write so
2196  *      that this buffer and all buffers written before it are committed to
2197  *      the disk before any buffers written after this write are committed
2198  *      to the disk.  The buffer is released when the output completes.
2199  */
2200 void
2201 babarrierwrite(struct buf *bp)
2202 {
2203
2204         bp->b_flags |= B_ASYNC | B_BARRIER;
2205         (void) bwrite(bp);
2206 }
2207
2208 /*
2209  *      bbarrierwrite:
2210  *
2211  *      Synchronous barrier write.  Start output on a buffer and wait for
2212  *      it to complete.  Place a write barrier after this write so that
2213  *      this buffer and all buffers written before it are committed to
2214  *      the disk before any buffers written after this write are committed
2215  *      to the disk.  The buffer is released when the output completes.
2216  */
2217 int
2218 bbarrierwrite(struct buf *bp)
2219 {
2220
2221         bp->b_flags |= B_BARRIER;
2222         return (bwrite(bp));
2223 }
2224
2225 /*
2226  *      bwillwrite:
2227  *
2228  *      Called prior to the locking of any vnodes when we are expecting to
2229  *      write.  We do not want to starve the buffer cache with too many
2230  *      dirty buffers so we block here.  By blocking prior to the locking
2231  *      of any vnodes we attempt to avoid the situation where a locked vnode
2232  *      prevents the various system daemons from flushing related buffers.
2233  */
2234 void
2235 bwillwrite(void)
2236 {
2237
2238         if (numdirtybuffers >= hidirtybuffers) {
2239                 mtx_lock(&bdirtylock);
2240                 while (numdirtybuffers >= hidirtybuffers) {
2241                         bdirtywait = 1;
2242                         msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
2243                             "flswai", 0);
2244                 }
2245                 mtx_unlock(&bdirtylock);
2246         }
2247 }
2248
2249 /*
2250  * Return true if we have too many dirty buffers.
2251  */
2252 int
2253 buf_dirty_count_severe(void)
2254 {
2255
2256         return(numdirtybuffers >= hidirtybuffers);
2257 }
2258
2259 /*
2260  *      brelse:
2261  *
2262  *      Release a busy buffer and, if requested, free its resources.  The
2263  *      buffer will be stashed in the appropriate bufqueue[] allowing it
2264  *      to be accessed later as a cache entity or reused for other purposes.
2265  */
2266 void
2267 brelse(struct buf *bp)
2268 {
2269         int qindex;
2270
2271         /*
2272          * Many functions erroneously call brelse with a NULL bp under rare
2273          * error conditions. Simply return when called with a NULL bp.
2274          */
2275         if (bp == NULL)
2276                 return;
2277         CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
2278             bp, bp->b_vp, bp->b_flags);
2279         KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
2280             ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
2281         KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0,
2282             ("brelse: non-VMIO buffer marked NOREUSE"));
2283
2284         if (BUF_LOCKRECURSED(bp)) {
2285                 /*
2286                  * Do not process, in particular, do not handle the
2287                  * B_INVAL/B_RELBUF and do not release to free list.
2288                  */
2289                 BUF_UNLOCK(bp);
2290                 return;
2291         }
2292
2293         if (bp->b_flags & B_MANAGED) {
2294                 bqrelse(bp);
2295                 return;
2296         }
2297
2298         if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) {
2299                 BO_LOCK(bp->b_bufobj);
2300                 bp->b_vflags &= ~BV_BKGRDERR;
2301                 BO_UNLOCK(bp->b_bufobj);
2302                 bdirty(bp);
2303         }
2304         if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
2305             !(bp->b_flags & B_INVAL)) {
2306                 /*
2307                  * Failed write, redirty.  Must clear BIO_ERROR to prevent
2308                  * pages from being scrapped.
2309                  */
2310                 bp->b_ioflags &= ~BIO_ERROR;
2311                 bdirty(bp);
2312         } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
2313             (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
2314                 /*
2315                  * Either a failed read I/O or we were asked to free or not
2316                  * cache the buffer.
2317                  */
2318                 bp->b_flags |= B_INVAL;
2319                 if (!LIST_EMPTY(&bp->b_dep))
2320                         buf_deallocate(bp);
2321                 if (bp->b_flags & B_DELWRI)
2322                         bdirtysub();
2323                 bp->b_flags &= ~(B_DELWRI | B_CACHE);
2324                 if ((bp->b_flags & B_VMIO) == 0) {
2325                         allocbuf(bp, 0);
2326                         if (bp->b_vp)
2327                                 brelvp(bp);
2328                 }
2329         }
2330
2331         /*
2332          * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_truncate()
2333          * is called with B_DELWRI set, the underlying pages may wind up
2334          * getting freed causing a previous write (bdwrite()) to get 'lost'
2335          * because pages associated with a B_DELWRI bp are marked clean.
2336          *
2337          * We still allow the B_INVAL case to call vfs_vmio_truncate(), even
2338          * if B_DELWRI is set.
2339          */
2340         if (bp->b_flags & B_DELWRI)
2341                 bp->b_flags &= ~B_RELBUF;
2342
2343         /*
2344          * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
2345          * constituted, not even NFS buffers now.  Two flags effect this.  If
2346          * B_INVAL, the struct buf is invalidated but the VM object is kept
2347          * around ( i.e. so it is trivial to reconstitute the buffer later ).
2348          *
2349          * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
2350          * invalidated.  BIO_ERROR cannot be set for a failed write unless the
2351          * buffer is also B_INVAL because it hits the re-dirtying code above.
2352          *
2353          * Normally we can do this whether a buffer is B_DELWRI or not.  If
2354          * the buffer is an NFS buffer, it is tracking piecemeal writes or
2355          * the commit state and we cannot afford to lose the buffer. If the
2356          * buffer has a background write in progress, we need to keep it
2357          * around to prevent it from being reconstituted and starting a second
2358          * background write.
2359          */
2360         if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE ||
2361             (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) &&
2362             !(bp->b_vp->v_mount != NULL &&
2363             (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
2364             !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI))) {
2365                 vfs_vmio_invalidate(bp);
2366                 allocbuf(bp, 0);
2367         }
2368
2369         if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 ||
2370             (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) {
2371                 allocbuf(bp, 0);
2372                 bp->b_flags &= ~B_NOREUSE;
2373                 if (bp->b_vp != NULL)
2374                         brelvp(bp);
2375         }
2376
2377         /*
2378          * If the buffer has junk contents signal it and eventually
2379          * clean up B_DELWRI and diassociate the vnode so that gbincore()
2380          * doesn't find it.
2381          */
2382         if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
2383             (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
2384                 bp->b_flags |= B_INVAL;
2385         if (bp->b_flags & B_INVAL) {
2386                 if (bp->b_flags & B_DELWRI)
2387                         bundirty(bp);
2388                 if (bp->b_vp)
2389                         brelvp(bp);
2390         }
2391
2392         /* buffers with no memory */
2393         if (bp->b_bufsize == 0) {
2394                 buf_free(bp);
2395                 return;
2396         }
2397         /* buffers with junk contents */
2398         if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
2399             (bp->b_ioflags & BIO_ERROR)) {
2400                 bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
2401                 if (bp->b_vflags & BV_BKGRDINPROG)
2402                         panic("losing buffer 2");
2403                 qindex = QUEUE_CLEAN;
2404                 bp->b_flags |= B_AGE;
2405         /* remaining buffers */
2406         } else if (bp->b_flags & B_DELWRI)
2407                 qindex = QUEUE_DIRTY;
2408         else
2409                 qindex = QUEUE_CLEAN;
2410
2411         binsfree(bp, qindex);
2412
2413         bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
2414         if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
2415                 panic("brelse: not dirty");
2416         /* unlock */
2417         BUF_UNLOCK(bp);
2418         if (qindex == QUEUE_CLEAN)
2419                 bufspace_wakeup();
2420 }
2421
2422 /*
2423  * Release a buffer back to the appropriate queue but do not try to free
2424  * it.  The buffer is expected to be used again soon.
2425  *
2426  * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
2427  * biodone() to requeue an async I/O on completion.  It is also used when
2428  * known good buffers need to be requeued but we think we may need the data
2429  * again soon.
2430  *
2431  * XXX we should be able to leave the B_RELBUF hint set on completion.
2432  */
2433 void
2434 bqrelse(struct buf *bp)
2435 {
2436         int qindex;
2437
2438         CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
2439         KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
2440             ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
2441
2442         qindex = QUEUE_NONE;
2443         if (BUF_LOCKRECURSED(bp)) {
2444                 /* do not release to free list */
2445                 BUF_UNLOCK(bp);
2446                 return;
2447         }
2448         bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
2449
2450         if (bp->b_flags & B_MANAGED) {
2451                 if (bp->b_flags & B_REMFREE)
2452                         bremfreef(bp);
2453                 goto out;
2454         }
2455
2456         /* buffers with stale but valid contents */
2457         if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG |
2458             BV_BKGRDERR)) == BV_BKGRDERR) {
2459                 BO_LOCK(bp->b_bufobj);
2460                 bp->b_vflags &= ~BV_BKGRDERR;
2461                 BO_UNLOCK(bp->b_bufobj);
2462                 qindex = QUEUE_DIRTY;
2463         } else {
2464                 if ((bp->b_flags & B_DELWRI) == 0 &&
2465                     (bp->b_xflags & BX_VNDIRTY))
2466                         panic("bqrelse: not dirty");
2467                 if ((bp->b_flags & B_NOREUSE) != 0) {
2468                         brelse(bp);
2469                         return;
2470                 }
2471                 qindex = QUEUE_CLEAN;
2472         }
2473         binsfree(bp, qindex);
2474
2475 out:
2476         /* unlock */
2477         BUF_UNLOCK(bp);
2478         if (qindex == QUEUE_CLEAN)
2479                 bufspace_wakeup();
2480 }
2481
2482 /*
2483  * Complete I/O to a VMIO backed page.  Validate the pages as appropriate,
2484  * restore bogus pages.
2485  */
2486 static void
2487 vfs_vmio_iodone(struct buf *bp)
2488 {
2489         vm_ooffset_t foff;
2490         vm_page_t m;
2491         vm_object_t obj;
2492         struct vnode *vp;
2493         int bogus, i, iosize;
2494
2495         obj = bp->b_bufobj->bo_object;
2496         KASSERT(obj->paging_in_progress >= bp->b_npages,
2497             ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)",
2498             obj->paging_in_progress, bp->b_npages));
2499
2500         vp = bp->b_vp;
2501         KASSERT(vp->v_holdcnt > 0,
2502             ("vfs_vmio_iodone: vnode %p has zero hold count", vp));
2503         KASSERT(vp->v_object != NULL,
2504             ("vfs_vmio_iodone: vnode %p has no vm_object", vp));
2505
2506         foff = bp->b_offset;
2507         KASSERT(bp->b_offset != NOOFFSET,
2508             ("vfs_vmio_iodone: bp %p has no buffer offset", bp));
2509
2510         bogus = 0;
2511         iosize = bp->b_bcount - bp->b_resid;
2512         VM_OBJECT_WLOCK(obj);
2513         for (i = 0; i < bp->b_npages; i++) {
2514                 int resid;
2515
2516                 resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
2517                 if (resid > iosize)
2518                         resid = iosize;
2519
2520                 /*
2521                  * cleanup bogus pages, restoring the originals
2522                  */
2523                 m = bp->b_pages[i];
2524                 if (m == bogus_page) {
2525                         bogus = 1;
2526                         m = vm_page_lookup(obj, OFF_TO_IDX(foff));
2527                         if (m == NULL)
2528                                 panic("biodone: page disappeared!");
2529                         bp->b_pages[i] = m;
2530                 } else if ((bp->b_iocmd == BIO_READ) && resid > 0) {
2531                         /*
2532                          * In the write case, the valid and clean bits are
2533                          * already changed correctly ( see bdwrite() ), so we
2534                          * only need to do this here in the read case.
2535                          */
2536                         KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK,
2537                             resid)) == 0, ("vfs_vmio_iodone: page %p "
2538                             "has unexpected dirty bits", m));
2539                         vfs_page_set_valid(bp, foff, m);
2540                 }
2541                 KASSERT(OFF_TO_IDX(foff) == m->pindex,
2542                     ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch",
2543                     (intmax_t)foff, (uintmax_t)m->pindex));
2544
2545                 vm_page_sunbusy(m);
2546                 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
2547                 iosize -= resid;
2548         }
2549         vm_object_pip_wakeupn(obj, bp->b_npages);
2550         VM_OBJECT_WUNLOCK(obj);
2551         if (bogus && buf_mapped(bp)) {
2552                 BUF_CHECK_MAPPED(bp);
2553                 pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
2554                     bp->b_pages, bp->b_npages);
2555         }
2556 }
2557
2558 /*
2559  * Unwire a page held by a buf and place it on the appropriate vm queue.
2560  */
2561 static void
2562 vfs_vmio_unwire(struct buf *bp, vm_page_t m)
2563 {
2564         bool freed;
2565
2566         vm_page_lock(m);
2567         if (vm_page_unwire(m, PQ_NONE)) {
2568                 /*
2569                  * Determine if the page should be freed before adding
2570                  * it to the inactive queue.
2571                  */
2572                 if (m->valid == 0) {
2573                         freed = !vm_page_busied(m);
2574                         if (freed)
2575                                 vm_page_free(m);
2576                 } else if ((bp->b_flags & B_DIRECT) != 0)
2577                         freed = vm_page_try_to_free(m);
2578                 else
2579                         freed = false;
2580                 if (!freed) {
2581                         /*
2582                          * If the page is unlikely to be reused, let the
2583                          * VM know.  Otherwise, maintain LRU page
2584                          * ordering and put the page at the tail of the
2585                          * inactive queue.
2586                          */
2587                         if ((bp->b_flags & B_NOREUSE) != 0)
2588                                 vm_page_deactivate_noreuse(m);
2589                         else
2590                                 vm_page_deactivate(m);
2591                 }
2592         }
2593         vm_page_unlock(m);
2594 }
2595
2596 /*
2597  * Perform page invalidation when a buffer is released.  The fully invalid
2598  * pages will be reclaimed later in vfs_vmio_truncate().
2599  */
2600 static void
2601 vfs_vmio_invalidate(struct buf *bp)
2602 {
2603         vm_object_t obj;
2604         vm_page_t m;
2605         int i, resid, poffset, presid;
2606
2607         if (buf_mapped(bp)) {
2608                 BUF_CHECK_MAPPED(bp);
2609                 pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
2610         } else
2611                 BUF_CHECK_UNMAPPED(bp);
2612         /*
2613          * Get the base offset and length of the buffer.  Note that
2614          * in the VMIO case if the buffer block size is not
2615          * page-aligned then b_data pointer may not be page-aligned.
2616          * But our b_pages[] array *IS* page aligned.
2617          *
2618          * block sizes less then DEV_BSIZE (usually 512) are not
2619          * supported due to the page granularity bits (m->valid,
2620          * m->dirty, etc...).
2621          *
2622          * See man buf(9) for more information
2623          */
2624         obj = bp->b_bufobj->bo_object;
2625         resid = bp->b_bufsize;
2626         poffset = bp->b_offset & PAGE_MASK;
2627         VM_OBJECT_WLOCK(obj);
2628         for (i = 0; i < bp->b_npages; i++) {
2629                 m = bp->b_pages[i];
2630                 if (m == bogus_page)
2631                         panic("vfs_vmio_invalidate: Unexpected bogus page.");
2632                 bp->b_pages[i] = NULL;
2633
2634                 presid = resid > (PAGE_SIZE - poffset) ?
2635                     (PAGE_SIZE - poffset) : resid;
2636                 KASSERT(presid >= 0, ("brelse: extra page"));
2637                 while (vm_page_xbusied(m)) {
2638                         vm_page_lock(m);
2639                         VM_OBJECT_WUNLOCK(obj);
2640                         vm_page_busy_sleep(m, "mbncsh");
2641                         VM_OBJECT_WLOCK(obj);
2642                 }
2643                 if (pmap_page_wired_mappings(m) == 0)
2644                         vm_page_set_invalid(m, poffset, presid);
2645                 vfs_vmio_unwire(bp, m);
2646                 resid -= presid;
2647                 poffset = 0;
2648         }
2649         VM_OBJECT_WUNLOCK(obj);
2650         bp->b_npages = 0;
2651 }
2652
2653 /*
2654  * Page-granular truncation of an existing VMIO buffer.
2655  */
2656 static void
2657 vfs_vmio_truncate(struct buf *bp, int desiredpages)
2658 {
2659         vm_object_t obj;
2660         vm_page_t m;
2661         int i;
2662
2663         if (bp->b_npages == desiredpages)
2664                 return;
2665
2666         if (buf_mapped(bp)) {
2667                 BUF_CHECK_MAPPED(bp);
2668                 pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) +
2669                     (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages);
2670         } else
2671                 BUF_CHECK_UNMAPPED(bp);
2672         obj = bp->b_bufobj->bo_object;
2673         if (obj != NULL)
2674                 VM_OBJECT_WLOCK(obj);
2675         for (i = desiredpages; i < bp->b_npages; i++) {
2676                 m = bp->b_pages[i];
2677                 KASSERT(m != bogus_page, ("allocbuf: bogus page found"));
2678                 bp->b_pages[i] = NULL;
2679                 vfs_vmio_unwire(bp, m);
2680         }
2681         if (obj != NULL)
2682                 VM_OBJECT_WUNLOCK(obj);
2683         bp->b_npages = desiredpages;
2684 }
2685
2686 /*
2687  * Byte granular extension of VMIO buffers.
2688  */
2689 static void
2690 vfs_vmio_extend(struct buf *bp, int desiredpages, int size)
2691 {
2692         /*
2693          * We are growing the buffer, possibly in a
2694          * byte-granular fashion.
2695          */
2696         vm_object_t obj;
2697         vm_offset_t toff;
2698         vm_offset_t tinc;
2699         vm_page_t m;
2700
2701         /*
2702          * Step 1, bring in the VM pages from the object, allocating
2703          * them if necessary.  We must clear B_CACHE if these pages
2704          * are not valid for the range covered by the buffer.
2705          */
2706         obj = bp->b_bufobj->bo_object;
2707         VM_OBJECT_WLOCK(obj);
2708         while (bp->b_npages < desiredpages) {
2709                 /*
2710                  * We must allocate system pages since blocking
2711                  * here could interfere with paging I/O, no
2712                  * matter which process we are.
2713                  *
2714                  * Only exclusive busy can be tested here.
2715                  * Blocking on shared busy might lead to
2716                  * deadlocks once allocbuf() is called after
2717                  * pages are vfs_busy_pages().
2718                  */
2719                 m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages,
2720                     VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM |
2721                     VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
2722                     VM_ALLOC_COUNT(desiredpages - bp->b_npages));
2723                 if (m->valid == 0)
2724                         bp->b_flags &= ~B_CACHE;
2725                 bp->b_pages[bp->b_npages] = m;
2726                 ++bp->b_npages;
2727         }
2728
2729         /*
2730          * Step 2.  We've loaded the pages into the buffer,
2731          * we have to figure out if we can still have B_CACHE
2732          * set.  Note that B_CACHE is set according to the
2733          * byte-granular range ( bcount and size ), not the
2734          * aligned range ( newbsize ).
2735          *
2736          * The VM test is against m->valid, which is DEV_BSIZE
2737          * aligned.  Needless to say, the validity of the data
2738          * needs to also be DEV_BSIZE aligned.  Note that this
2739          * fails with NFS if the server or some other client
2740          * extends the file's EOF.  If our buffer is resized,
2741          * B_CACHE may remain set! XXX
2742          */
2743         toff = bp->b_bcount;
2744         tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
2745         while ((bp->b_flags & B_CACHE) && toff < size) {
2746                 vm_pindex_t pi;
2747
2748                 if (tinc > (size - toff))
2749                         tinc = size - toff;
2750                 pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT;
2751                 m = bp->b_pages[pi];
2752                 vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m);
2753                 toff += tinc;
2754                 tinc = PAGE_SIZE;
2755         }
2756         VM_OBJECT_WUNLOCK(obj);
2757
2758         /*
2759          * Step 3, fixup the KVA pmap.
2760          */
2761         if (buf_mapped(bp))
2762                 bpmap_qenter(bp);
2763         else
2764                 BUF_CHECK_UNMAPPED(bp);
2765 }
2766
2767 /*
2768  * Check to see if a block at a particular lbn is available for a clustered
2769  * write.
2770  */
2771 static int
2772 vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
2773 {
2774         struct buf *bpa;
2775         int match;
2776
2777         match = 0;
2778
2779         /* If the buf isn't in core skip it */
2780         if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
2781                 return (0);
2782
2783         /* If the buf is busy we don't want to wait for it */
2784         if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
2785                 return (0);
2786
2787         /* Only cluster with valid clusterable delayed write buffers */
2788         if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
2789             (B_DELWRI | B_CLUSTEROK))
2790                 goto done;
2791
2792         if (bpa->b_bufsize != size)
2793                 goto done;
2794
2795         /*
2796          * Check to see if it is in the expected place on disk and that the
2797          * block has been mapped.
2798          */
2799         if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
2800                 match = 1;
2801 done:
2802         BUF_UNLOCK(bpa);
2803         return (match);
2804 }
2805
2806 /*
2807  *      vfs_bio_awrite:
2808  *
2809  *      Implement clustered async writes for clearing out B_DELWRI buffers.
2810  *      This is much better then the old way of writing only one buffer at
2811  *      a time.  Note that we may not be presented with the buffers in the
2812  *      correct order, so we search for the cluster in both directions.
2813  */
2814 int
2815 vfs_bio_awrite(struct buf *bp)
2816 {
2817         struct bufobj *bo;
2818         int i;
2819         int j;
2820         daddr_t lblkno = bp->b_lblkno;
2821         struct vnode *vp = bp->b_vp;
2822         int ncl;
2823         int nwritten;
2824         int size;
2825         int maxcl;
2826         int gbflags;
2827
2828         bo = &vp->v_bufobj;
2829         gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0;
2830         /*
2831          * right now we support clustered writing only to regular files.  If
2832          * we find a clusterable block we could be in the middle of a cluster
2833          * rather then at the beginning.
2834          */
2835         if ((vp->v_type == VREG) &&
2836             (vp->v_mount != 0) && /* Only on nodes that have the size info */
2837             (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
2838
2839                 size = vp->v_mount->mnt_stat.f_iosize;
2840                 maxcl = MAXPHYS / size;
2841
2842                 BO_RLOCK(bo);
2843                 for (i = 1; i < maxcl; i++)
2844                         if (vfs_bio_clcheck(vp, size, lblkno + i,
2845                             bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
2846                                 break;
2847
2848                 for (j = 1; i + j <= maxcl && j <= lblkno; j++)
2849                         if (vfs_bio_clcheck(vp, size, lblkno - j,
2850                             bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
2851                                 break;
2852                 BO_RUNLOCK(bo);
2853                 --j;
2854                 ncl = i + j;
2855                 /*
2856                  * this is a possible cluster write
2857                  */
2858                 if (ncl != 1) {
2859                         BUF_UNLOCK(bp);
2860                         nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
2861                             gbflags);
2862                         return (nwritten);
2863                 }
2864         }
2865         bremfree(bp);
2866         bp->b_flags |= B_ASYNC;
2867         /*
2868          * default (old) behavior, writing out only one block
2869          *
2870          * XXX returns b_bufsize instead of b_bcount for nwritten?
2871          */
2872         nwritten = bp->b_bufsize;
2873         (void) bwrite(bp);
2874
2875         return (nwritten);
2876 }
2877
2878 /*
2879  *      getnewbuf_kva:
2880  *
2881  *      Allocate KVA for an empty buf header according to gbflags.
2882  */
2883 static int
2884 getnewbuf_kva(struct buf *bp, int gbflags, int maxsize)
2885 {
2886
2887         if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) {
2888                 /*
2889                  * In order to keep fragmentation sane we only allocate kva
2890                  * in BKVASIZE chunks.  XXX with vmem we can do page size.
2891                  */
2892                 maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
2893
2894                 if (maxsize != bp->b_kvasize &&
2895                     bufkva_alloc(bp, maxsize, gbflags))
2896                         return (ENOSPC);
2897         }
2898         return (0);
2899 }
2900
2901 /*
2902  *      getnewbuf:
2903  *
2904  *      Find and initialize a new buffer header, freeing up existing buffers
2905  *      in the bufqueues as necessary.  The new buffer is returned locked.
2906  *
2907  *      We block if:
2908  *              We have insufficient buffer headers
2909  *              We have insufficient buffer space
2910  *              buffer_arena is too fragmented ( space reservation fails )
2911  *              If we have to flush dirty buffers ( but we try to avoid this )
2912  *
2913  *      The caller is responsible for releasing the reserved bufspace after
2914  *      allocbuf() is called.
2915  */
2916 static struct buf *
2917 getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags)
2918 {
2919         struct buf *bp;
2920         bool metadata, reserved;
2921
2922         bp = NULL;
2923         KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
2924             ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
2925         if (!unmapped_buf_allowed)
2926                 gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
2927
2928         if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
2929             vp->v_type == VCHR)
2930                 metadata = true;
2931         else
2932                 metadata = false;
2933         atomic_add_int(&getnewbufcalls, 1);
2934         reserved = false;
2935         do {
2936                 if (reserved == false &&
2937                     bufspace_reserve(maxsize, metadata) != 0)
2938                         continue;
2939                 reserved = true;
2940                 if ((bp = buf_alloc()) == NULL)
2941                         continue;
2942                 if (getnewbuf_kva(bp, gbflags, maxsize) == 0)
2943                         return (bp);
2944                 break;
2945         } while(buf_scan(false) == 0);
2946
2947         if (reserved)
2948                 atomic_subtract_long(&bufspace, maxsize);
2949         if (bp != NULL) {
2950                 bp->b_flags |= B_INVAL;
2951                 brelse(bp);
2952         }
2953         bufspace_wait(vp, gbflags, slpflag, slptimeo);
2954
2955         return (NULL);
2956 }
2957
2958 /*
2959  *      buf_daemon:
2960  *
2961  *      buffer flushing daemon.  Buffers are normally flushed by the
2962  *      update daemon but if it cannot keep up this process starts to
2963  *      take the load in an attempt to prevent getnewbuf() from blocking.
2964  */
2965 static struct kproc_desc buf_kp = {
2966         "bufdaemon",
2967         buf_daemon,
2968         &bufdaemonproc
2969 };
2970 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
2971
2972 static int
2973 buf_flush(struct vnode *vp, int target)
2974 {
2975         int flushed;
2976
2977         flushed = flushbufqueues(vp, target, 0);
2978         if (flushed == 0) {
2979                 /*
2980                  * Could not find any buffers without rollback
2981                  * dependencies, so just write the first one
2982                  * in the hopes of eventually making progress.
2983                  */
2984                 if (vp != NULL && target > 2)
2985                         target /= 2;
2986                 flushbufqueues(vp, target, 1);
2987         }
2988         return (flushed);
2989 }
2990
2991 static void
2992 buf_daemon()
2993 {
2994         int lodirty;
2995
2996         /*
2997          * This process needs to be suspended prior to shutdown sync.
2998          */
2999         EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
3000             SHUTDOWN_PRI_LAST);
3001
3002         /*
3003          * This process is allowed to take the buffer cache to the limit
3004          */
3005         curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
3006         mtx_lock(&bdlock);
3007         for (;;) {
3008                 bd_request = 0;
3009                 mtx_unlock(&bdlock);
3010
3011                 kproc_suspend_check(bufdaemonproc);
3012                 lodirty = lodirtybuffers;
3013                 if (bd_speedupreq) {
3014                         lodirty = numdirtybuffers / 2;
3015                         bd_speedupreq = 0;
3016                 }
3017                 /*
3018                  * Do the flush.  Limit the amount of in-transit I/O we
3019                  * allow to build up, otherwise we would completely saturate
3020                  * the I/O system.
3021                  */
3022                 while (numdirtybuffers > lodirty) {
3023                         if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
3024                                 break;
3025                         kern_yield(PRI_USER);
3026                 }
3027
3028                 /*
3029                  * Only clear bd_request if we have reached our low water
3030                  * mark.  The buf_daemon normally waits 1 second and
3031                  * then incrementally flushes any dirty buffers that have
3032                  * built up, within reason.
3033                  *
3034                  * If we were unable to hit our low water mark and couldn't
3035                  * find any flushable buffers, we sleep for a short period
3036                  * to avoid endless loops on unlockable buffers.
3037                  */
3038                 mtx_lock(&bdlock);
3039                 if (numdirtybuffers <= lodirtybuffers) {
3040                         /*
3041                          * We reached our low water mark, reset the
3042                          * request and sleep until we are needed again.
3043                          * The sleep is just so the suspend code works.
3044                          */
3045                         bd_request = 0;
3046                         /*
3047                          * Do an extra wakeup in case dirty threshold
3048                          * changed via sysctl and the explicit transition
3049                          * out of shortfall was missed.
3050                          */
3051                         bdirtywakeup();
3052                         if (runningbufspace <= lorunningspace)
3053                                 runningwakeup();
3054                         msleep(&bd_request, &bdlock, PVM, "psleep", hz);
3055                 } else {
3056                         /*
3057                          * We couldn't find any flushable dirty buffers but
3058                          * still have too many dirty buffers, we
3059                          * have to sleep and try again.  (rare)
3060                          */
3061                         msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
3062                 }
3063         }
3064 }
3065
3066 /*
3067  *      flushbufqueues:
3068  *
3069  *      Try to flush a buffer in the dirty queue.  We must be careful to
3070  *      free up B_INVAL buffers instead of write them, which NFS is
3071  *      particularly sensitive to.
3072  */
3073 static int flushwithdeps = 0;
3074 SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
3075     0, "Number of buffers flushed with dependecies that require rollbacks");
3076
3077 static int
3078 flushbufqueues(struct vnode *lvp, int target, int flushdeps)
3079 {
3080         struct buf *sentinel;
3081         struct vnode *vp;
3082         struct mount *mp;
3083         struct buf *bp;
3084         int hasdeps;
3085         int flushed;
3086         int queue;
3087         int error;
3088         bool unlock;
3089
3090         flushed = 0;
3091         queue = QUEUE_DIRTY;
3092         bp = NULL;
3093         sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
3094         sentinel->b_qindex = QUEUE_SENTINEL;
3095         mtx_lock(&bqlocks[queue]);
3096         TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
3097         mtx_unlock(&bqlocks[queue]);
3098         while (flushed != target) {
3099                 maybe_yield();
3100                 mtx_lock(&bqlocks[queue]);
3101                 bp = TAILQ_NEXT(sentinel, b_freelist);
3102                 if (bp != NULL) {
3103                         TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
3104                         TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
3105                             b_freelist);
3106                 } else {
3107                         mtx_unlock(&bqlocks[queue]);
3108                         break;
3109                 }
3110                 /*
3111                  * Skip sentinels inserted by other invocations of the
3112                  * flushbufqueues(), taking care to not reorder them.
3113                  *
3114                  * Only flush the buffers that belong to the
3115                  * vnode locked by the curthread.
3116                  */
3117                 if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
3118                     bp->b_vp != lvp)) {
3119                         mtx_unlock(&bqlocks[queue]);
3120                         continue;
3121                 }
3122                 error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
3123                 mtx_unlock(&bqlocks[queue]);
3124                 if (error != 0)
3125                         continue;
3126                 if (bp->b_pin_count > 0) {
3127                         BUF_UNLOCK(bp);
3128                         continue;
3129                 }
3130                 /*
3131                  * BKGRDINPROG can only be set with the buf and bufobj
3132                  * locks both held.  We tolerate a race to clear it here.
3133                  */
3134                 if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
3135                     (bp->b_flags & B_DELWRI) == 0) {
3136                         BUF_UNLOCK(bp);
3137                         continue;
3138                 }
3139                 if (bp->b_flags & B_INVAL) {
3140                         bremfreef(bp);
3141                         brelse(bp);
3142                         flushed++;
3143                         continue;
3144                 }
3145
3146                 if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
3147                         if (flushdeps == 0) {
3148                                 BUF_UNLOCK(bp);
3149                                 continue;
3150                         }
3151                         hasdeps = 1;
3152                 } else
3153                         hasdeps = 0;
3154                 /*
3155                  * We must hold the lock on a vnode before writing
3156                  * one of its buffers. Otherwise we may confuse, or
3157                  * in the case of a snapshot vnode, deadlock the
3158                  * system.
3159                  *
3160                  * The lock order here is the reverse of the normal
3161                  * of vnode followed by buf lock.  This is ok because
3162                  * the NOWAIT will prevent deadlock.
3163                  */
3164                 vp = bp->b_vp;
3165                 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
3166                         BUF_UNLOCK(bp);
3167                         continue;
3168                 }
3169                 if (lvp == NULL) {
3170                         unlock = true;
3171                         error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
3172                 } else {
3173                         ASSERT_VOP_LOCKED(vp, "getbuf");
3174                         unlock = false;
3175                         error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
3176                             vn_lock(vp, LK_TRYUPGRADE);
3177                 }
3178                 if (error == 0) {
3179                         CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
3180                             bp, bp->b_vp, bp->b_flags);
3181                         if (curproc == bufdaemonproc) {
3182                                 vfs_bio_awrite(bp);
3183                         } else {
3184                                 bremfree(bp);
3185                                 bwrite(bp);
3186                                 notbufdflushes++;
3187                         }
3188                         vn_finished_write(mp);
3189                         if (unlock)
3190                                 VOP_UNLOCK(vp, 0);
3191                         flushwithdeps += hasdeps;
3192                         flushed++;
3193
3194                         /*
3195                          * Sleeping on runningbufspace while holding
3196                          * vnode lock leads to deadlock.
3197                          */
3198                         if (curproc == bufdaemonproc &&
3199                             runningbufspace > hirunningspace)
3200                                 waitrunningbufspace();
3201                         continue;
3202                 }
3203                 vn_finished_write(mp);
3204                 BUF_UNLOCK(bp);
3205         }
3206         mtx_lock(&bqlocks[queue]);
3207         TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
3208         mtx_unlock(&bqlocks[queue]);
3209         free(sentinel, M_TEMP);
3210         return (flushed);
3211 }
3212
3213 /*
3214  * Check to see if a block is currently memory resident.
3215  */
3216 struct buf *
3217 incore(struct bufobj *bo, daddr_t blkno)
3218 {
3219         struct buf *bp;
3220
3221         BO_RLOCK(bo);
3222         bp = gbincore(bo, blkno);
3223         BO_RUNLOCK(bo);
3224         return (bp);
3225 }
3226
3227 /*
3228  * Returns true if no I/O is needed to access the
3229  * associated VM object.  This is like incore except
3230  * it also hunts around in the VM system for the data.
3231  */
3232
3233 static int
3234 inmem(struct vnode * vp, daddr_t blkno)
3235 {
3236         vm_object_t obj;
3237         vm_offset_t toff, tinc, size;
3238         vm_page_t m;
3239         vm_ooffset_t off;
3240
3241         ASSERT_VOP_LOCKED(vp, "inmem");
3242
3243         if (incore(&vp->v_bufobj, blkno))
3244                 return 1;
3245         if (vp->v_mount == NULL)
3246                 return 0;
3247         obj = vp->v_object;
3248         if (obj == NULL)
3249                 return (0);
3250
3251         size = PAGE_SIZE;
3252         if (size > vp->v_mount->mnt_stat.f_iosize)
3253                 size = vp->v_mount->mnt_stat.f_iosize;
3254         off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
3255
3256         VM_OBJECT_RLOCK(obj);
3257         for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
3258                 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
3259                 if (!m)
3260                         goto notinmem;
3261                 tinc = size;
3262                 if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
3263                         tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
3264                 if (vm_page_is_valid(m,
3265                     (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
3266                         goto notinmem;
3267         }
3268         VM_OBJECT_RUNLOCK(obj);
3269         return 1;
3270
3271 notinmem:
3272         VM_OBJECT_RUNLOCK(obj);
3273         return (0);
3274 }
3275
3276 /*
3277  * Set the dirty range for a buffer based on the status of the dirty
3278  * bits in the pages comprising the buffer.  The range is limited
3279  * to the size of the buffer.
3280  *
3281  * Tell the VM system that the pages associated with this buffer
3282  * are clean.  This is used for delayed writes where the data is
3283  * going to go to disk eventually without additional VM intevention.
3284  *
3285  * Note that while we only really need to clean through to b_bcount, we
3286  * just go ahead and clean through to b_bufsize.
3287  */
3288 static void
3289 vfs_clean_pages_dirty_buf(struct buf *bp)
3290 {
3291         vm_ooffset_t foff, noff, eoff;
3292         vm_page_t m;
3293         int i;
3294
3295         if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
3296                 return;
3297
3298         foff = bp->b_offset;
3299         KASSERT(bp->b_offset != NOOFFSET,
3300             ("vfs_clean_pages_dirty_buf: no buffer offset"));
3301
3302         VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
3303         vfs_drain_busy_pages(bp);
3304         vfs_setdirty_locked_object(bp);
3305         for (i = 0; i < bp->b_npages; i++) {
3306                 noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3307                 eoff = noff;
3308                 if (eoff > bp->b_offset + bp->b_bufsize)
3309                         eoff = bp->b_offset + bp->b_bufsize;
3310                 m = bp->b_pages[i];
3311                 vfs_page_set_validclean(bp, foff, m);
3312                 /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
3313                 foff = noff;
3314         }
3315         VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
3316 }
3317
3318 static void
3319 vfs_setdirty_locked_object(struct buf *bp)
3320 {
3321         vm_object_t object;
3322         int i;
3323
3324         object = bp->b_bufobj->bo_object;
3325         VM_OBJECT_ASSERT_WLOCKED(object);
3326
3327         /*
3328          * We qualify the scan for modified pages on whether the
3329          * object has been flushed yet.
3330          */
3331         if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
3332                 vm_offset_t boffset;
3333                 vm_offset_t eoffset;
3334
3335                 /*
3336                  * test the pages to see if they have been modified directly
3337                  * by users through the VM system.
3338                  */
3339                 for (i = 0; i < bp->b_npages; i++)
3340                         vm_page_test_dirty(bp->b_pages[i]);
3341
3342                 /*
3343                  * Calculate the encompassing dirty range, boffset and eoffset,
3344                  * (eoffset - boffset) bytes.
3345                  */
3346
3347                 for (i = 0; i < bp->b_npages; i++) {
3348                         if (bp->b_pages[i]->dirty)
3349                                 break;
3350                 }
3351                 boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
3352
3353                 for (i = bp->b_npages - 1; i >= 0; --i) {
3354                         if (bp->b_pages[i]->dirty) {
3355                                 break;
3356                         }
3357                 }
3358                 eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
3359
3360                 /*
3361                  * Fit it to the buffer.
3362                  */
3363
3364                 if (eoffset > bp->b_bcount)
3365                         eoffset = bp->b_bcount;
3366
3367                 /*
3368                  * If we have a good dirty range, merge with the existing
3369                  * dirty range.
3370                  */
3371
3372                 if (boffset < eoffset) {
3373                         if (bp->b_dirtyoff > boffset)
3374                                 bp->b_dirtyoff = boffset;
3375                         if (bp->b_dirtyend < eoffset)
3376                                 bp->b_dirtyend = eoffset;
3377                 }
3378         }
3379 }
3380
3381 /*
3382  * Allocate the KVA mapping for an existing buffer.
3383  * If an unmapped buffer is provided but a mapped buffer is requested, take
3384  * also care to properly setup mappings between pages and KVA.
3385  */
3386 static void
3387 bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
3388 {
3389         int bsize, maxsize, need_mapping, need_kva;
3390         off_t offset;
3391
3392         need_mapping = bp->b_data == unmapped_buf &&
3393             (gbflags & GB_UNMAPPED) == 0;
3394         need_kva = bp->b_kvabase == unmapped_buf &&
3395             bp->b_data == unmapped_buf &&
3396             (gbflags & GB_KVAALLOC) != 0;
3397         if (!need_mapping && !need_kva)
3398                 return;
3399
3400         BUF_CHECK_UNMAPPED(bp);
3401
3402         if (need_mapping && bp->b_kvabase != unmapped_buf) {
3403                 /*
3404                  * Buffer is not mapped, but the KVA was already
3405                  * reserved at the time of the instantiation.  Use the
3406                  * allocated space.
3407                  */
3408                 goto has_addr;
3409         }
3410
3411         /*
3412          * Calculate the amount of the address space we would reserve
3413          * if the buffer was mapped.
3414          */
3415         bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
3416         KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
3417         offset = blkno * bsize;
3418         maxsize = size + (offset & PAGE_MASK);
3419         maxsize = imax(maxsize, bsize);
3420
3421         while (bufkva_alloc(bp, maxsize, gbflags) != 0) {
3422                 if ((gbflags & GB_NOWAIT_BD) != 0) {
3423                         /*
3424                          * XXXKIB: defragmentation cannot
3425                          * succeed, not sure what else to do.
3426                          */
3427                         panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
3428                 }
3429                 atomic_add_int(&mappingrestarts, 1);
3430                 bufspace_wait(bp->b_vp, gbflags, 0, 0);
3431         }
3432 has_addr:
3433         if (need_mapping) {
3434                 /* b_offset is handled by bpmap_qenter. */
3435                 bp->b_data = bp->b_kvabase;
3436                 BUF_CHECK_MAPPED(bp);
3437                 bpmap_qenter(bp);
3438         }
3439 }
3440
3441 /*
3442  *      getblk:
3443  *
3444  *      Get a block given a specified block and offset into a file/device.
3445  *      The buffers B_DONE bit will be cleared on return, making it almost
3446  *      ready for an I/O initiation.  B_INVAL may or may not be set on
3447  *      return.  The caller should clear B_INVAL prior to initiating a
3448  *      READ.
3449  *
3450  *      For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
3451  *      an existing buffer.
3452  *
3453  *      For a VMIO buffer, B_CACHE is modified according to the backing VM.
3454  *      If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
3455  *      and then cleared based on the backing VM.  If the previous buffer is
3456  *      non-0-sized but invalid, B_CACHE will be cleared.
3457  *
3458  *      If getblk() must create a new buffer, the new buffer is returned with
3459  *      both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
3460  *      case it is returned with B_INVAL clear and B_CACHE set based on the
3461  *      backing VM.
3462  *
 *      getblk() also forces a bwrite() for any B_DELWRI buffer whose
3464  *      B_CACHE bit is clear.
3465  *
3466  *      What this means, basically, is that the caller should use B_CACHE to
3467  *      determine whether the buffer is fully valid or not and should clear
3468  *      B_INVAL prior to issuing a read.  If the caller intends to validate
3469  *      the buffer by loading its data area with something, the caller needs
3470  *      to clear B_INVAL.  If the caller does this without issuing an I/O,
3471  *      the caller should set B_CACHE ( as an optimization ), else the caller
3472  *      should issue the I/O and biodone() will set B_CACHE if the I/O was
3473  *      a write attempt or if it was a successful read.  If the caller
3474  *      intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
3475  *      prior to issuing the READ.  biodone() will *not* clear B_INVAL.
3476  */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
    int flags)
{
        struct buf *bp;
        struct bufobj *bo;
        int bsize, error, maxsize, vmio;
        off_t offset;

        CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
        KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
            ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
        ASSERT_VOP_LOCKED(vp, "getblk");
        if (size > MAXBCACHEBUF)
                panic("getblk: size(%d) > MAXBCACHEBUF(%d)\n", size,
                    MAXBCACHEBUF);
        /* Unmapped buffers are disabled: drop the related request flags. */
        if (!unmapped_buf_allowed)
                flags &= ~(GB_UNMAPPED | GB_KVAALLOC);

        bo = &vp->v_bufobj;
loop:
        /*
         * Every retry restarts here with a fresh lookup of the block in
         * the vnode's buffer object.
         */
        BO_RLOCK(bo);
        bp = gbincore(bo, blkno);
        if (bp != NULL) {
                int lockflags;
                /*
                 * Buffer is in-core.  If the buffer is not busy nor managed,
                 * it must be on a queue.
                 */
                lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;

                if (flags & GB_LOCK_NOWAIT)
                        lockflags |= LK_NOWAIT;

                /*
                 * NOTE(review): the bufobj lock is handed over as the
                 * interlock (LK_INTERLOCK) and is not unlocked explicitly
                 * on any of the paths below; presumably the lock routine
                 * drops it in all outcomes -- confirm.
                 */
                error = BUF_TIMELOCK(bp, lockflags,
                    BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);

                /*
                 * If we slept and got the lock we have to restart in case
                 * the buffer changed identities.
                 */
                if (error == ENOLCK)
                        goto loop;
                /* We timed out or were interrupted. */
                else if (error)
                        return (NULL);
                /* If recursed, assume caller knows the rules. */
                else if (BUF_LOCKRECURSED(bp))
                        goto end;

                /*
                 * The buffer is locked.  B_CACHE is cleared if the buffer is
                 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
                 * and for a VMIO buffer B_CACHE is adjusted according to the
                 * backing VM cache.
                 */
                if (bp->b_flags & B_INVAL)
                        bp->b_flags &= ~B_CACHE;
                else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
                        bp->b_flags |= B_CACHE;
                /* Managed buffers are on no queue; others are dequeued. */
                if (bp->b_flags & B_MANAGED)
                        MPASS(bp->b_qindex == QUEUE_NONE);
                else
                        bremfree(bp);

                /*
                 * check for size inconsistencies for non-VMIO case.
                 * A VMIO buffer takes this path as well when the request
                 * does not fit into the KVA it already owns.  The old
                 * buffer is written out or released and the lookup is
                 * retried.
                 */
                if (bp->b_bcount != size) {
                        if ((bp->b_flags & B_VMIO) == 0 ||
                            (size > bp->b_kvasize)) {
                                if (bp->b_flags & B_DELWRI) {
                                        /*
                                         * If the buffer is pinned and the
                                         * caller does not want to sleep
                                         * waiting for it to be unpinned,
                                         * bail out.
                                         */
                                        if (bp->b_pin_count > 0) {
                                                if (flags & GB_LOCK_NOWAIT) {
                                                        bqrelse(bp);
                                                        return (NULL);
                                                } else {
                                                        bunpin_wait(bp);
                                                }
                                        }
                                        bp->b_flags |= B_NOCACHE;
                                        bwrite(bp);
                                } else {
                                        /*
                                         * Clean buffer: release it when it
                                         * has no dependencies attached,
                                         * otherwise write it out.
                                         */
                                        if (LIST_EMPTY(&bp->b_dep)) {
                                                bp->b_flags |= B_RELBUF;
                                                brelse(bp);
                                        } else {
                                                bp->b_flags |= B_NOCACHE;
                                                bwrite(bp);
                                        }
                                }
                                goto loop;
                        }
                }

                /*
                 * Handle the case of unmapped buffer which should
                 * become mapped, or the buffer for which KVA
                 * reservation is requested.
                 */
                bp_unmapped_get_kva(bp, blkno, size, flags);

                /*
                 * If the size is inconsistent in the VMIO case, we can resize
                 * the buffer.  This might lead to B_CACHE getting set or
                 * cleared.  If the size has not changed, B_CACHE remains
                 * unchanged from its previous state.
                 */
                allocbuf(bp, size);

                KASSERT(bp->b_offset != NOOFFSET,
                    ("getblk: no buffer offset"));

                /*
                 * A buffer with B_DELWRI set and B_CACHE clear must
                 * be committed before we can return the buffer in
                 * order to prevent the caller from issuing a read
                 * ( due to B_CACHE not being set ) and overwriting
                 * it.
                 *
                 * Most callers, including NFS and FFS, need this to
                 * operate properly either because they assume they
                 * can issue a read if B_CACHE is not set, or because
                 * ( for example ) an uncached B_DELWRI might loop due
                 * to softupdates re-dirtying the buffer.  In the latter
                 * case, B_CACHE is set after the first write completes,
                 * preventing further loops.
                 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
                 * above while extending the buffer, we cannot allow the
                 * buffer to remain with B_CACHE set after the write
                 * completes or it will represent a corrupt state.  To
                 * deal with this we set B_NOCACHE to scrap the buffer
                 * after the write.
                 *
                 * We might be able to do something fancy, like setting
                 * B_CACHE in bwrite() except if B_DELWRI is already set,
                 * so the below call doesn't set B_CACHE, but that gets real
                 * confusing.  This is much easier.
                 */

                if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
                        bp->b_flags |= B_NOCACHE;
                        bwrite(bp);
                        goto loop;
                }
                bp->b_flags &= ~B_DONE;
        } else {
                /*
                 * Buffer is not in-core, create new buffer.  The buffer
                 * returned by getnewbuf() is locked.  Note that the returned
                 * buffer is also considered valid (not marked B_INVAL).
                 */
                BO_RUNLOCK(bo);
                /*
                 * If the user does not want us to create the buffer, bail out
                 * here.
                 */
                if (flags & GB_NOCREAT)
                        return NULL;
                /* The idle thread gives up when there are no free buffers. */
                if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
                        return NULL;

                /*
                 * Byte offset of the block: device vnodes are addressed in
                 * DEV_BSIZE units, everything else in bo_bsize units.
                 */
                bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
                KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
                offset = blkno * bsize;
                vmio = vp->v_object != NULL;
                if (vmio) {
                        /* Account for the offset within the first page. */
                        maxsize = size + (offset & PAGE_MASK);
                } else {
                        maxsize = size;
                        /* Do not allow non-VMIO notmapped buffers. */
                        flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
                }
                maxsize = imax(maxsize, bsize);

                bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
                if (bp == NULL) {
                        /* Callers that allowed a sleep get the failure. */
                        if (slpflag || slptimeo)
                                return NULL;
                        /*
                         * XXX This is here until the sleep path is diagnosed
                         * enough to work under very low memory conditions.
                         *
                         * There's an issue on low memory, 4BSD+non-preempt
                         * systems (eg MIPS routers with 32MB RAM) where buffer
                         * exhaustion occurs without sleeping for buffer
                         * reclamation.  This just sticks in a loop and
                         * constantly attempts to allocate a buffer, which
                         * hits exhaustion and tries to wakeup bufdaemon.
                         * This never happens because we never yield.
                         *
                         * The real solution is to identify and fix these cases
                         * so we aren't effectively busy-waiting in a loop
                         * until the reclamation path has cycles to run.
                         */
                        kern_yield(PRI_USER);
                        goto loop;
                }

                /*
                 * This code is used to make sure that a buffer is not
                 * created while the getnewbuf routine is blocked.
                 * This can be a problem whether the vnode is locked or not.
                 * If the buffer is created out from under us, we have to
                 * throw away the one we just created.
                 *
                 * Note: this must occur before we associate the buffer
                 * with the vp especially considering limitations in
                 * the splay tree implementation when dealing with duplicate
                 * lblkno's.
                 */
                BO_LOCK(bo);
                if (gbincore(bo, blkno)) {
                        BO_UNLOCK(bo);
                        bp->b_flags |= B_INVAL;
                        brelse(bp);
                        /*
                         * NOTE(review): presumably hands back the space
                         * reserved by getnewbuf() for maxsize -- confirm.
                         */
                        bufspace_release(maxsize);
                        goto loop;
                }

                /*
                 * Insert the buffer into the hash, so that it can
                 * be found by incore.
                 */
                bp->b_blkno = bp->b_lblkno = blkno;
                bp->b_offset = offset;
                bgetvp(vp, bp);
                BO_UNLOCK(bo);

                /*
                 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
                 * buffer size starts out as 0, B_CACHE will be set by
                 * allocbuf() for the VMIO case prior to it testing the
                 * backing store for validity.
                 */

                if (vmio) {
                        bp->b_flags |= B_VMIO;
                        KASSERT(vp->v_object == bp->b_bufobj->bo_object,
                            ("ARGH! different b_bufobj->bo_object %p %p %p\n",
                            bp, vp->v_object, bp->b_bufobj->bo_object));
                } else {
                        bp->b_flags &= ~B_VMIO;
                        KASSERT(bp->b_bufobj->bo_object == NULL,
                            ("ARGH! has b_bufobj->bo_object %p %p\n",
                            bp, bp->b_bufobj->bo_object));
                        BUF_CHECK_MAPPED(bp);
                }

                allocbuf(bp, size);
                bufspace_release(maxsize);
                bp->b_flags &= ~B_DONE;
        }
        CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
        BUF_ASSERT_HELD(bp);
end:
        /*
         * The recursed-lock path jumps straight here, skipping the trace
         * record and the held assertion above.
         */
        KASSERT(bp->b_bufobj == bo,
            ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
        return (bp);
}
3742
3743 /*
3744  * Get an empty, disassociated buffer of given size.  The buffer is initially
3745  * set to B_INVAL.
3746  */
3747 struct buf *
3748 geteblk(int size, int flags)
3749 {
3750         struct buf *bp;
3751         int maxsize;
3752
3753         maxsize = (size + BKVAMASK) & ~BKVAMASK;
3754         while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) {
3755                 if ((flags & GB_NOWAIT_BD) &&
3756                     (curthread->td_pflags & TDP_BUFNEED) != 0)
3757                         return (NULL);
3758         }
3759         allocbuf(bp, size);
3760         bufspace_release(maxsize);
3761         bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
3762         BUF_ASSERT_HELD(bp);
3763         return (bp);
3764 }
3765
3766 /*
3767  * Truncate the backing store for a non-vmio buffer.
3768  */
3769 static void
3770 vfs_nonvmio_truncate(struct buf *bp, int newbsize)
3771 {
3772
3773         if (bp->b_flags & B_MALLOC) {
3774                 /*
3775                  * malloced buffers are not shrunk
3776                  */
3777                 if (newbsize == 0) {
3778                         bufmallocadjust(bp, 0);
3779                         free(bp->b_data, M_BIOBUF);
3780                         bp->b_data = bp->b_kvabase;
3781                         bp->b_flags &= ~B_MALLOC;
3782                 }
3783                 return;
3784         }
3785         vm_hold_free_pages(bp, newbsize);
3786         bufspace_adjust(bp, newbsize);
3787 }
3788
3789 /*
3790  * Extend the backing for a non-VMIO buffer.
3791  */
3792 static void
3793 vfs_nonvmio_extend(struct buf *bp, int newbsize)
3794 {
3795         caddr_t origbuf;
3796         int origbufsize;
3797
3798         /*
3799          * We only use malloced memory on the first allocation.
3800          * and revert to page-allocated memory when the buffer
3801          * grows.
3802          *
3803          * There is a potential smp race here that could lead
3804          * to bufmallocspace slightly passing the max.  It
3805          * is probably extremely rare and not worth worrying
3806          * over.
3807          */
3808         if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 &&
3809             bufmallocspace < maxbufmallocspace) {
3810                 bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK);
3811                 bp->b_flags |= B_MALLOC;
3812                 bufmallocadjust(bp, newbsize);
3813                 return;
3814         }
3815
3816         /*
3817          * If the buffer is growing on its other-than-first
3818          * allocation then we revert to the page-allocation
3819          * scheme.
3820          */
3821         origbuf = NULL;
3822         origbufsize = 0;
3823         if (bp->b_flags & B_MALLOC) {
3824                 origbuf = bp->b_data;
3825                 origbufsize = bp->b_bufsize;
3826                 bp->b_data = bp->b_kvabase;
3827                 bufmallocadjust(bp, 0);
3828                 bp->b_flags &= ~B_MALLOC;
3829                 newbsize = round_page(newbsize);
3830         }
3831         vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize,
3832             (vm_offset_t) bp->b_data + newbsize);
3833         if (origbuf != NULL) {
3834                 bcopy(origbuf, bp->b_data, origbufsize);
3835                 free(origbuf, M_BIOBUF);
3836         }
3837         bufspace_adjust(bp, newbsize);
3838 }
3839
3840 /*
3841  * This code constitutes the buffer memory from either anonymous system
3842  * memory (in the case of non-VMIO operations) or from an associated
3843  * VM object (in the case of VMIO operations).  This code is able to
3844  * resize a buffer up or down.
3845  *
3846  * Note that this code is tricky, and has many complications to resolve
3847  * deadlock or inconsistent data situations.  Tread lightly!!!
3848  * There are B_CACHE and B_DELWRI interactions that must be dealt with by
3849  * the caller.  Calling this code willy nilly can result in the loss of data.
3850  *
3851  * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
3852  * B_CACHE for the non-VMIO case.
3853  */
3854 int
3855 allocbuf(struct buf *bp, int size)
3856 {
3857         int newbsize;
3858
3859         BUF_ASSERT_HELD(bp);
3860
3861         if (bp->b_bcount == size)
3862                 return (1);
3863
3864         if (bp->b_kvasize != 0 && bp->b_kvasize < size)
3865                 panic("allocbuf: buffer too small");
3866
3867         newbsize = roundup2(size, DEV_BSIZE);
3868         if ((bp->b_flags & B_VMIO) == 0) {
3869                 if ((bp->b_flags & B_MALLOC) == 0)
3870                         newbsize = round_page(newbsize);
3871                 /*
3872                  * Just get anonymous memory from the kernel.  Don't
3873                  * mess with B_CACHE.
3874                  */
3875                 if (newbsize < bp->b_bufsize)
3876                         vfs_nonvmio_truncate(bp, newbsize);
3877                 else if (newbsize > bp->b_bufsize)
3878                         vfs_nonvmio_extend(bp, newbsize);
3879         } else {
3880                 int desiredpages;
3881
3882                 desiredpages = (size == 0) ? 0 :
3883                     num_pages((bp->b_offset & PAGE_MASK) + newbsize);
3884
3885                 if (bp->b_flags & B_MALLOC)
3886                         panic("allocbuf: VMIO buffer can't be malloced");
3887                 /*
3888                  * Set B_CACHE initially if buffer is 0 length or will become
3889                  * 0-length.
3890                  */
3891                 if (size == 0 || bp->b_bufsize == 0)
3892                         bp->b_flags |= B_CACHE;
3893
3894                 if (newbsize < bp->b_bufsize)
3895                         vfs_vmio_truncate(bp, desiredpages);
3896                 /* XXX This looks as if it should be newbsize > b_bufsize */
3897                 else if (size > bp->b_bcount)
3898                         vfs_vmio_extend(bp, desiredpages, size);
3899                 bufspace_adjust(bp, newbsize);
3900         }
3901         bp->b_bcount = size;            /* requested buffer size. */
3902         return (1);
3903 }
3904
3905 extern int inflight_transient_maps;
3906
3907 void
3908 biodone(struct bio *bp)
3909 {
3910         struct mtx *mtxp;
3911         void (*done)(struct bio *);
3912         vm_offset_t start, end;
3913
3914         if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
3915                 bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
3916                 bp->bio_flags |= BIO_UNMAPPED;
3917                 start = trunc_page((vm_offset_t)bp->bio_data);
3918                 end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
3919                 bp->bio_data = unmapped_buf;
3920                 pmap_qremove(start, OFF_TO_IDX(end - start));
3921                 vmem_free(transient_arena, start, end - start);
3922                 atomic_add_int(&inflight_transient_maps, -1);
3923         }
3924         done = bp->bio_done;
3925         if (done == NULL) {
3926                 mtxp = mtx_pool_find(mtxpool_sleep, bp);
3927                 mtx_lock(mtxp);
3928                 bp->bio_flags |= BIO_DONE;
3929                 wakeup(bp);
3930                 mtx_unlock(mtxp);
3931         } else {
3932                 bp->bio_flags |= BIO_DONE;
3933                 done(bp);
3934         }
3935 }
3936
3937 /*
3938  * Wait for a BIO to finish.
3939  */
3940 int
3941 biowait(struct bio *bp, const char *wchan)
3942 {
3943         struct mtx *mtxp;
3944
3945         mtxp = mtx_pool_find(mtxpool_sleep, bp);
3946         mtx_lock(mtxp);
3947         while ((bp->bio_flags & BIO_DONE) == 0)
3948                 msleep(bp, mtxp, PRIBIO, wchan, 0);
3949         mtx_unlock(mtxp);
3950         if (bp->bio_error != 0)
3951                 return (bp->bio_error);
3952         if (!(bp->bio_flags & BIO_ERROR))
3953                 return (0);
3954         return (EIO);
3955 }
3956
3957 void
3958 biofinish(struct bio *bp, struct devstat *stat, int error)
3959 {
3960
3961         if (error) {
3962                 bp->bio_error = error;
3963                 bp->bio_flags |= BIO_ERROR;
3964         }
3965         if (stat != NULL)
3966                 devstat_end_transaction_bio(stat, bp);
3967         biodone(bp);
3968 }
3969
3970 /*
3971  *      bufwait:
3972  *
3973  *      Wait for buffer I/O completion, returning error status.  The buffer
3974  *      is left locked and B_DONE on return.  B_EINTR is converted into an EINTR
3975  *      error and cleared.
3976  */
3977 int
3978 bufwait(struct buf *bp)
3979 {
3980         if (bp->b_iocmd == BIO_READ)
3981                 bwait(bp, PRIBIO, "biord");
3982         else
3983                 bwait(bp, PRIBIO, "biowr");
3984         if (bp->b_flags & B_EINTR) {
3985                 bp->b_flags &= ~B_EINTR;
3986                 return (EINTR);
3987         }
3988         if (bp->b_ioflags & BIO_ERROR) {
3989                 return (bp->b_error ? bp->b_error : EIO);
3990         } else {
3991                 return (0);
3992         }
3993 }
3994
3995 /*
3996  *      bufdone:
3997  *
3998  *      Finish I/O on a buffer, optionally calling a completion function.
3999  *      This is usually called from an interrupt so process blocking is
4000  *      not allowed.
4001  *
4002  *      biodone is also responsible for setting B_CACHE in a B_VMIO bp.
4003  *      In a non-VMIO bp, B_CACHE will be set on the next getblk()
4004  *      assuming B_INVAL is clear.
4005  *
4006  *      For the VMIO case, we set B_CACHE if the op was a read and no
4007  *      read error occurred, or if the op was a write.  B_CACHE is never
4008  *      set if the buffer is invalid or otherwise uncacheable.
4009  *
4010  *      biodone does not mess with B_INVAL, allowing the I/O routine or the
4011  *      initiator to leave B_INVAL set to brelse the buffer out of existence
4012  *      in the biodone routine.
4013  */
4014 void
4015 bufdone(struct buf *bp)
4016 {
4017         struct bufobj *dropobj;
4018         void    (*biodone)(struct buf *);
4019
4020         CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
4021         dropobj = NULL;
4022
4023         KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
4024         BUF_ASSERT_HELD(bp);
4025
4026         runningbufwakeup(bp);
4027         if (bp->b_iocmd == BIO_WRITE)
4028                 dropobj = bp->b_bufobj;
4029         /* call optional completion function if requested */
4030         if (bp->b_iodone != NULL) {
4031                 biodone = bp->b_iodone;
4032                 bp->b_iodone = NULL;
4033                 (*biodone) (bp);
4034                 if (dropobj)
4035                         bufobj_wdrop(dropobj);
4036                 return;
4037         }
4038
4039         bufdone_finish(bp);
4040
4041         if (dropobj)
4042                 bufobj_wdrop(dropobj);
4043 }
4044
4045 void
4046 bufdone_finish(struct buf *bp)
4047 {
4048         BUF_ASSERT_HELD(bp);
4049
4050         if (!LIST_EMPTY(&bp->b_dep))
4051                 buf_complete(bp);
4052
4053         if (bp->b_flags & B_VMIO) {
4054                 /*
4055                  * Set B_CACHE if the op was a normal read and no error
4056                  * occurred.  B_CACHE is set for writes in the b*write()
4057                  * routines.
4058                  */
4059                 if (bp->b_iocmd == BIO_READ &&
4060                     !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
4061                     !(bp->b_ioflags & BIO_ERROR))
4062                         bp->b_flags |= B_CACHE;
4063                 vfs_vmio_iodone(bp);
4064         }
4065
4066         /*
4067          * For asynchronous completions, release the buffer now. The brelse
4068          * will do a wakeup there if necessary - so no need to do a wakeup
4069          * here in the async case. The sync case always needs to do a wakeup.
4070          */
4071         if (bp->b_flags & B_ASYNC) {
4072                 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) ||
4073                     (bp->b_ioflags & BIO_ERROR))
4074                         brelse(bp);
4075                 else
4076                         bqrelse(bp);
4077         } else
4078                 bdone(bp);
4079 }
4080
4081 /*
4082  * This routine is called in lieu of iodone in the case of
4083  * incomplete I/O.  This keeps the busy status for pages
4084  * consistent.
4085  */
4086 void
4087 vfs_unbusy_pages(struct buf *bp)
4088 {
4089         int i;
4090         vm_object_t obj;
4091         vm_page_t m;
4092
4093         runningbufwakeup(bp);
4094         if (!(bp->b_flags & B_VMIO))
4095                 return;
4096
4097         obj = bp->b_bufobj->bo_object;
4098         VM_OBJECT_WLOCK(obj);
4099         for (i = 0; i < bp->b_npages; i++) {
4100                 m = bp->b_pages[i];
4101                 if (m == bogus_page) {
4102                         m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
4103                         if (!m)
4104                                 panic("vfs_unbusy_pages: page missing\n");
4105                         bp->b_pages[i] = m;
4106                         if (buf_mapped(bp)) {
4107                                 BUF_CHECK_MAPPED(bp);
4108                                 pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
4109                                     bp->b_pages, bp->b_npages);
4110                         } else
4111                                 BUF_CHECK_UNMAPPED(bp);
4112                 }
4113                 vm_page_sunbusy(m);
4114         }
4115         vm_object_pip_wakeupn(obj, bp->b_npages);
4116         VM_OBJECT_WUNLOCK(obj);
4117 }
4118
4119 /*
4120  * vfs_page_set_valid:
4121  *
4122  *      Set the valid bits in a page based on the supplied offset.   The
4123  *      range is restricted to the buffer's size.
4124  *
4125  *      This routine is typically called after a read completes.
4126  */
4127 static void
4128 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
4129 {
4130         vm_ooffset_t eoff;
4131
4132         /*
4133          * Compute the end offset, eoff, such that [off, eoff) does not span a
4134          * page boundary and eoff is not greater than the end of the buffer.
4135          * The end of the buffer, in this case, is our file EOF, not the
4136          * allocation size of the buffer.
4137          */
4138         eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
4139         if (eoff > bp->b_offset + bp->b_bcount)
4140                 eoff = bp->b_offset + bp->b_bcount;
4141
4142         /*
4143          * Set valid range.  This is typically the entire buffer and thus the
4144          * entire page.
4145          */
4146         if (eoff > off)
4147                 vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
4148 }
4149
4150 /*
4151  * vfs_page_set_validclean:
4152  *
4153  *      Set the valid bits and clear the dirty bits in a page based on the
4154  *      supplied offset.   The range is restricted to the buffer's size.
4155  */
4156 static void
4157 vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
4158 {
4159         vm_ooffset_t soff, eoff;
4160
4161         /*
4162          * Start and end offsets in buffer.  eoff - soff may not cross a
4163          * page boundary or cross the end of the buffer.  The end of the
4164          * buffer, in this case, is our file EOF, not the allocation size
4165          * of the buffer.
4166          */
4167         soff = off;
4168         eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
4169         if (eoff > bp->b_offset + bp->b_bcount)
4170                 eoff = bp->b_offset + bp->b_bcount;
4171
4172         /*
4173          * Set valid range.  This is typically the entire buffer and thus the
4174          * entire page.
4175          */
4176         if (eoff > soff) {
4177                 vm_page_set_validclean(
4178                     m,
4179                    (vm_offset_t) (soff & PAGE_MASK),
4180                    (vm_offset_t) (eoff - soff)
4181                 );
4182         }
4183 }
4184
4185 /*
4186  * Ensure that all buffer pages are not exclusive busied.  If any page is
4187  * exclusive busy, drain it.
4188  */
4189 void
4190 vfs_drain_busy_pages(struct buf *bp)
4191 {
4192         vm_page_t m;
4193         int i, last_busied;
4194
4195         VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
4196         last_busied = 0;
4197         for (i = 0; i < bp->b_npages; i++) {
4198                 m = bp->b_pages[i];
4199                 if (vm_page_xbusied(m)) {
4200                         for (; last_busied < i; last_busied++)
4201                                 vm_page_sbusy(bp->b_pages[last_busied]);
4202                         while (vm_page_xbusied(m)) {
4203                                 vm_page_lock(m);
4204                                 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4205                                 vm_page_busy_sleep(m, "vbpage");
4206                                 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4207                         }
4208                 }
4209         }
4210         for (i = 0; i < last_busied; i++)
4211                 vm_page_sunbusy(bp->b_pages[i]);
4212 }
4213
4214 /*
4215  * This routine is called before a device strategy routine.
4216  * It is used to tell the VM system that paging I/O is in
4217  * progress, and treat the pages associated with the buffer
4218  * almost as being exclusive busy.  Also the object paging_in_progress
4219  * flag is handled to make sure that the object doesn't become
4220  * inconsistent.
4221  *
4222  * Since I/O has not been initiated yet, certain buffer flags
4223  * such as BIO_ERROR or B_INVAL may be in an inconsistent state
4224  * and should be ignored.
4225  */
4226 void
4227 vfs_busy_pages(struct buf *bp, int clear_modify)
4228 {
4229         int i, bogus;
4230         vm_object_t obj;
4231         vm_ooffset_t foff;
4232         vm_page_t m;
4233
4234         if (!(bp->b_flags & B_VMIO))
4235                 return;
4236
4237         obj = bp->b_bufobj->bo_object;
4238         foff = bp->b_offset;
4239         KASSERT(bp->b_offset != NOOFFSET,
4240             ("vfs_busy_pages: no buffer offset"));
4241         VM_OBJECT_WLOCK(obj);
4242         vfs_drain_busy_pages(bp);
4243         if (bp->b_bufsize != 0)
4244                 vfs_setdirty_locked_object(bp);
4245         bogus = 0;
4246         for (i = 0; i < bp->b_npages; i++) {
4247                 m = bp->b_pages[i];
4248
4249                 if ((bp->b_flags & B_CLUSTER) == 0) {
4250                         vm_object_pip_add(obj, 1);
4251                         vm_page_sbusy(m);
4252                 }
4253                 /*
4254                  * When readying a buffer for a read ( i.e
4255                  * clear_modify == 0 ), it is important to do
4256                  * bogus_page replacement for valid pages in
4257                  * partially instantiated buffers.  Partially
4258                  * instantiated buffers can, in turn, occur when
4259                  * reconstituting a buffer from its VM backing store
4260                  * base.  We only have to do this if B_CACHE is
4261                  * clear ( which causes the I/O to occur in the
4262                  * first place ).  The replacement prevents the read
4263                  * I/O from overwriting potentially dirty VM-backed
4264                  * pages.  XXX bogus page replacement is, uh, bogus.
4265                  * It may not work properly with small-block devices.
4266                  * We need to find a better way.
4267                  */
4268                 if (clear_modify) {
4269                         pmap_remove_write(m);
4270                         vfs_page_set_validclean(bp, foff, m);
4271                 } else if (m->valid == VM_PAGE_BITS_ALL &&
4272                     (bp->b_flags & B_CACHE) == 0) {
4273                         bp->b_pages[i] = bogus_page;
4274                         bogus++;
4275                 }
4276                 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
4277         }
4278         VM_OBJECT_WUNLOCK(obj);
4279         if (bogus && buf_mapped(bp)) {
4280                 BUF_CHECK_MAPPED(bp);
4281                 pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
4282                     bp->b_pages, bp->b_npages);
4283         }
4284 }
4285
4286 /*
4287  *      vfs_bio_set_valid:
4288  *
4289  *      Set the range within the buffer to valid.  The range is
4290  *      relative to the beginning of the buffer, b_offset.  Note that
4291  *      b_offset itself may be offset from the beginning of the first
4292  *      page.
4293  */
4294 void
4295 vfs_bio_set_valid(struct buf *bp, int base, int size)
4296 {
4297         int i, n;
4298         vm_page_t m;
4299
4300         if (!(bp->b_flags & B_VMIO))
4301                 return;
4302
4303         /*
4304          * Fixup base to be relative to beginning of first page.
4305          * Set initial n to be the maximum number of bytes in the
4306          * first page that can be validated.
4307          */
4308         base += (bp->b_offset & PAGE_MASK);
4309         n = PAGE_SIZE - (base & PAGE_MASK);
4310
4311         VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4312         for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
4313                 m = bp->b_pages[i];
4314                 if (n > size)
4315                         n = size;
4316                 vm_page_set_valid_range(m, base & PAGE_MASK, n);
4317                 base += n;
4318                 size -= n;
4319                 n = PAGE_SIZE;
4320         }
4321         VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4322 }
4323
4324 /*
4325  *      vfs_bio_clrbuf:
4326  *
4327  *      If the specified buffer is a non-VMIO buffer, clear the entire
4328  *      buffer.  If the specified buffer is a VMIO buffer, clear and
4329  *      validate only the previously invalid portions of the buffer.
4330  *      This routine essentially fakes an I/O, so we need to clear
4331  *      BIO_ERROR and B_INVAL.
4332  *
4333  *      Note that while we only theoretically need to clear through b_bcount,
4334  *      we go ahead and clear through b_bufsize.
4335  */
4336 void
4337 vfs_bio_clrbuf(struct buf *bp)
4338 {
4339         int i, j, mask, sa, ea, slide;
4340
4341         if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
4342                 clrbuf(bp);
4343                 return;
4344         }
4345         bp->b_flags &= ~B_INVAL;
4346         bp->b_ioflags &= ~BIO_ERROR;
4347         VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4348         if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
4349             (bp->b_offset & PAGE_MASK) == 0) {
4350                 if (bp->b_pages[0] == bogus_page)
4351                         goto unlock;
4352                 mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
4353                 VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
4354                 if ((bp->b_pages[0]->valid & mask) == mask)
4355                         goto unlock;
4356                 if ((bp->b_pages[0]->valid & mask) == 0) {
4357                         pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
4358                         bp->b_pages[0]->valid |= mask;
4359                         goto unlock;
4360                 }
4361         }
4362         sa = bp->b_offset & PAGE_MASK;
4363         slide = 0;
4364         for (i = 0; i < bp->b_npages; i++, sa = 0) {
4365                 slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
4366                 ea = slide & PAGE_MASK;
4367                 if (ea == 0)
4368                         ea = PAGE_SIZE;
4369                 if (bp->b_pages[i] == bogus_page)
4370                         continue;
4371                 j = sa / DEV_BSIZE;
4372                 mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
4373                 VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
4374                 if ((bp->b_pages[i]->valid & mask) == mask)
4375                         continue;
4376                 if ((bp->b_pages[i]->valid & mask) == 0)
4377                         pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
4378                 else {
4379                         for (; sa < ea; sa += DEV_BSIZE, j++) {
4380                                 if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
4381                                         pmap_zero_page_area(bp->b_pages[i],
4382                                             sa, DEV_BSIZE);
4383                                 }
4384                         }
4385                 }
4386                 bp->b_pages[i]->valid |= mask;
4387         }
4388 unlock:
4389         VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4390         bp->b_resid = 0;
4391 }
4392
4393 void
4394 vfs_bio_bzero_buf(struct buf *bp, int base, int size)
4395 {
4396         vm_page_t m;
4397         int i, n;
4398
4399         if (buf_mapped(bp)) {
4400                 BUF_CHECK_MAPPED(bp);
4401                 bzero(bp->b_data + base, size);
4402         } else {
4403                 BUF_CHECK_UNMAPPED(bp);
4404                 n = PAGE_SIZE - (base & PAGE_MASK);
4405                 for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
4406                         m = bp->b_pages[i];
4407                         if (n > size)
4408                                 n = size;
4409                         pmap_zero_page_area(m, base & PAGE_MASK, n);
4410                         base += n;
4411                         size -= n;
4412                         n = PAGE_SIZE;
4413                 }
4414         }
4415 }
4416
4417 /*
4418  * vm_hold_load_pages and vm_hold_free_pages get pages into
4419  * a buffers address space.  The pages are anonymous and are
4420  * not associated with a file object.
4421  */
4422 static void
4423 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
4424 {
4425         vm_offset_t pg;
4426         vm_page_t p;
4427         int index;
4428
4429         BUF_CHECK_MAPPED(bp);
4430
4431         to = round_page(to);
4432         from = round_page(from);
4433         index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4434
4435         for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
4436 tryagain:
4437                 /*
4438                  * note: must allocate system pages since blocking here
4439                  * could interfere with paging I/O, no matter which
4440                  * process we are.
4441                  */
4442                 p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
4443                     VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT));
4444                 if (p == NULL) {
4445                         VM_WAIT;
4446                         goto tryagain;
4447                 }
4448                 pmap_qenter(pg, &p, 1);
4449                 bp->b_pages[index] = p;
4450         }
4451         bp->b_npages = index;
4452 }
4453
4454 /* Return pages associated with this buf to the vm system */
4455 static void
4456 vm_hold_free_pages(struct buf *bp, int newbsize)
4457 {
4458         vm_offset_t from;
4459         vm_page_t p;
4460         int index, newnpages;
4461
4462         BUF_CHECK_MAPPED(bp);
4463
4464         from = round_page((vm_offset_t)bp->b_data + newbsize);
4465         newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4466         if (bp->b_npages > newnpages)
4467                 pmap_qremove(from, bp->b_npages - newnpages);
4468         for (index = newnpages; index < bp->b_npages; index++) {
4469                 p = bp->b_pages[index];
4470                 bp->b_pages[index] = NULL;
4471                 if (vm_page_sbusied(p))
4472                         printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
4473                             (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
4474                 p->wire_count--;
4475                 vm_page_free(p);
4476                 atomic_subtract_int(&vm_cnt.v_wire_count, 1);
4477         }
4478         bp->b_npages = newnpages;
4479 }
4480
4481 /*
4482  * Map an IO request into kernel virtual address space.
4483  *
4484  * All requests are (re)mapped into kernel VA space.
4485  * Notice that we use b_bufsize for the size of the buffer
4486  * to be mapped.  b_bcount might be modified by the driver.
4487  *
4488  * Note that even if the caller determines that the address space should
4489  * be valid, a race or a smaller-file mapped into a larger space may
4490  * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
4491  * check the return value.
4492  *
4493  * This function only works with pager buffers.
4494  */
4495 int
4496 vmapbuf(struct buf *bp, int mapbuf)
4497 {
4498         vm_prot_t prot;
4499         int pidx;
4500
4501         if (bp->b_bufsize < 0)
4502                 return (-1);
4503         prot = VM_PROT_READ;
4504         if (bp->b_iocmd == BIO_READ)
4505                 prot |= VM_PROT_WRITE;  /* Less backwards than it looks */
4506         if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
4507             (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
4508             btoc(MAXPHYS))) < 0)
4509                 return (-1);
4510         bp->b_npages = pidx;
4511         bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
4512         if (mapbuf || !unmapped_buf_allowed) {
4513                 pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx);
4514                 bp->b_data = bp->b_kvabase + bp->b_offset;
4515         } else
4516                 bp->b_data = unmapped_buf;
4517         return(0);
4518 }
4519
4520 /*
4521  * Free the io map PTEs associated with this IO operation.
4522  * We also invalidate the TLB entries and restore the original b_addr.
4523  *
4524  * This function only works with pager buffers.
4525  */
4526 void
4527 vunmapbuf(struct buf *bp)
4528 {
4529         int npages;
4530
4531         npages = bp->b_npages;
4532         if (buf_mapped(bp))
4533                 pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
4534         vm_page_unhold_pages(bp->b_pages, npages);
4535
4536         bp->b_data = unmapped_buf;
4537 }
4538
4539 void
4540 bdone(struct buf *bp)
4541 {
4542         struct mtx *mtxp;
4543
4544         mtxp = mtx_pool_find(mtxpool_sleep, bp);
4545         mtx_lock(mtxp);
4546         bp->b_flags |= B_DONE;
4547         wakeup(bp);
4548         mtx_unlock(mtxp);
4549 }
4550
4551 void
4552 bwait(struct buf *bp, u_char pri, const char *wchan)
4553 {
4554         struct mtx *mtxp;
4555
4556         mtxp = mtx_pool_find(mtxpool_sleep, bp);
4557         mtx_lock(mtxp);
4558         while ((bp->b_flags & B_DONE) == 0)
4559                 msleep(bp, mtxp, pri, wchan, 0);
4560         mtx_unlock(mtxp);
4561 }
4562
4563 int
4564 bufsync(struct bufobj *bo, int waitfor)
4565 {
4566
4567         return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread));
4568 }
4569
4570 void
4571 bufstrategy(struct bufobj *bo, struct buf *bp)
4572 {
4573         int i = 0;
4574         struct vnode *vp;
4575
4576         vp = bp->b_vp;
4577         KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
4578         KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
4579             ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
4580         i = VOP_STRATEGY(vp, bp);
4581         KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
4582 }
4583
4584 void
4585 bufobj_wrefl(struct bufobj *bo)
4586 {
4587
4588         KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
4589         ASSERT_BO_WLOCKED(bo);
4590         bo->bo_numoutput++;
4591 }
4592
4593 void
4594 bufobj_wref(struct bufobj *bo)
4595 {
4596
4597         KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
4598         BO_LOCK(bo);
4599         bo->bo_numoutput++;
4600         BO_UNLOCK(bo);
4601 }
4602
4603 void
4604 bufobj_wdrop(struct bufobj *bo)
4605 {
4606
4607         KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
4608         BO_LOCK(bo);
4609         KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
4610         if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) {
4611                 bo->bo_flag &= ~BO_WWAIT;
4612                 wakeup(&bo->bo_numoutput);
4613         }
4614         BO_UNLOCK(bo);
4615 }
4616
4617 int
4618 bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
4619 {
4620         int error;
4621
4622         KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
4623         ASSERT_BO_WLOCKED(bo);
4624         error = 0;
4625         while (bo->bo_numoutput) {
4626                 bo->bo_flag |= BO_WWAIT;
4627                 error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
4628                     slpflag | (PRIBIO + 1), "bo_wwait", timeo);
4629                 if (error)
4630                         break;
4631         }
4632         return (error);
4633 }
4634
4635 void
4636 bpin(struct buf *bp)
4637 {
4638         struct mtx *mtxp;
4639
4640         mtxp = mtx_pool_find(mtxpool_sleep, bp);
4641         mtx_lock(mtxp);
4642         bp->b_pin_count++;
4643         mtx_unlock(mtxp);
4644 }
4645
4646 void
4647 bunpin(struct buf *bp)
4648 {
4649         struct mtx *mtxp;
4650
4651         mtxp = mtx_pool_find(mtxpool_sleep, bp);
4652         mtx_lock(mtxp);
4653         if (--bp->b_pin_count == 0)
4654                 wakeup(bp);
4655         mtx_unlock(mtxp);
4656 }
4657
4658 void
4659 bunpin_wait(struct buf *bp)
4660 {
4661         struct mtx *mtxp;
4662
4663         mtxp = mtx_pool_find(mtxpool_sleep, bp);
4664         mtx_lock(mtxp);
4665         while (bp->b_pin_count > 0)
4666                 msleep(bp, mtxp, PRIBIO, "bwunpin", 0);
4667         mtx_unlock(mtxp);
4668 }
4669
4670 /*
4671  * Set bio_data or bio_ma for struct bio from the struct buf.
4672  */
4673 void
4674 bdata2bio(struct buf *bp, struct bio *bip)
4675 {
4676
4677         if (!buf_mapped(bp)) {
4678                 KASSERT(unmapped_buf_allowed, ("unmapped"));
4679                 bip->bio_ma = bp->b_pages;
4680                 bip->bio_ma_n = bp->b_npages;
4681                 bip->bio_data = unmapped_buf;
4682                 bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
4683                 bip->bio_flags |= BIO_UNMAPPED;
4684                 KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
4685                     PAGE_SIZE == bp->b_npages,
4686                     ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
4687                     (long long)bip->bio_length, bip->bio_ma_n));
4688         } else {
4689                 bip->bio_data = bp->b_data;
4690                 bip->bio_ma = NULL;
4691         }
4692 }
4693
4694 #include "opt_ddb.h"
4695 #ifdef DDB
4696 #include <ddb/ddb.h>
4697
4698 /* DDB command to show buffer data */
4699 DB_SHOW_COMMAND(buffer, db_show_buffer)
4700 {
4701         /* get args */
4702         struct buf *bp = (struct buf *)addr;
4703
4704         if (!have_addr) {
4705                 db_printf("usage: show buffer <addr>\n");
4706                 return;
4707         }
4708
4709         db_printf("buf at %p\n", bp);
4710         db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
4711             (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
4712             PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
4713         db_printf(
4714             "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
4715             "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
4716             "b_dep = %p\n",
4717             bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
4718             bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
4719             (intmax_t)bp->b_lblkno, bp->b_dep.lh_first);
4720         db_printf("b_kvabase = %p, b_kvasize = %d\n",
4721             bp->b_kvabase, bp->b_kvasize);
4722         if (bp->b_npages) {
4723                 int i;
4724                 db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
4725                 for (i = 0; i < bp->b_npages; i++) {
4726                         vm_page_t m;
4727                         m = bp->b_pages[i];
4728                         db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
4729                             (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
4730                         if ((i + 1) < bp->b_npages)
4731                                 db_printf(",");
4732                 }
4733                 db_printf("\n");
4734         }
4735         db_printf(" ");
4736         BUF_LOCKPRINTINFO(bp);
4737 }
4738
4739 DB_SHOW_COMMAND(lockedbufs, lockedbufs)
4740 {
4741         struct buf *bp;
4742         int i;
4743
4744         for (i = 0; i < nbuf; i++) {
4745                 bp = &buf[i];
4746                 if (BUF_ISLOCKED(bp)) {
4747                         db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4748                         db_printf("\n");
4749                 }
4750         }
4751 }
4752
4753 DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
4754 {
4755         struct vnode *vp;
4756         struct buf *bp;
4757
4758         if (!have_addr) {
4759                 db_printf("usage: show vnodebufs <addr>\n");
4760                 return;
4761         }
4762         vp = (struct vnode *)addr;
4763         db_printf("Clean buffers:\n");
4764         TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
4765                 db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4766                 db_printf("\n");
4767         }
4768         db_printf("Dirty buffers:\n");
4769         TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
4770                 db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4771                 db_printf("\n");
4772         }
4773 }
4774
4775 DB_COMMAND(countfreebufs, db_coundfreebufs)
4776 {
4777         struct buf *bp;
4778         int i, used = 0, nfree = 0;
4779
4780         if (have_addr) {
4781                 db_printf("usage: countfreebufs\n");
4782                 return;
4783         }
4784
4785         for (i = 0; i < nbuf; i++) {
4786                 bp = &buf[i];
4787                 if (bp->b_qindex == QUEUE_EMPTY)
4788                         nfree++;
4789                 else
4790                         used++;
4791         }
4792
4793         db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
4794             nfree + used);
4795         db_printf("numfreebuffers is %d\n", numfreebuffers);
4796 }
4797 #endif /* DDB */