sys/vm/vnode_pager.c

   1 /*
   2  * Copyright (c) 1990 University of Utah.
   3  * Copyright (c) 1991 The Regents of the University of California.
   4  * All rights reserved.
   5  * Copyright (c) 1993, 1994 John S. Dyson
   6  * Copyright (c) 1995, David Greenman
   7  *
   8  * This code is derived from software contributed to Berkeley by
   9  * the Systems Programming Group of the University of Utah Computer
  10  * Science Department.
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  * 1. Redistributions of source code must retain the above copyright
  16  *    notice, this list of conditions and the following disclaimer.
  17  * 2. Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in the
  19  *    documentation and/or other materials provided with the distribution.
  20  * 3. All advertising materials mentioning features or use of this software
  21  *    must display the following acknowledgement:
  22  *      This product includes software developed by the University of
  23  *      California, Berkeley and its contributors.
  24  * 4. Neither the name of the University nor the names of its contributors
  25  *    may be used to endorse or promote products derived from this software
  26  *    without specific prior written permission.
  27  *
  28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  38  * SUCH DAMAGE.
  39  *
  40  *      from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91
  41  * $FreeBSD$
  42  */
  43
  44 /*
  45  * Page to/from files (vnodes).
  46  */
  47
  48 /*
  49  * TODO:
  50  *      Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will
  51  *      greatly re-simplify the vnode_pager.
  52  */
  53
  54 #include <sys/param.h>
  55 #include <sys/systm.h>
  56 #include <sys/proc.h>
  57 #include <sys/vnode.h>
  58 #include <sys/mount.h>
  59 #include <sys/bio.h>
  60 #include <sys/buf.h>
  61 #include <sys/vmmeter.h>
  62 #include <sys/conf.h>
  63
  64 #include <vm/vm.h>
  65 #include <vm/vm_object.h>
  66 #include <vm/vm_page.h>
  67 #include <vm/vm_pager.h>
  68 #include <vm/vm_map.h>
  69 #include <vm/vnode_pager.h>
  70 #include <vm/vm_extern.h>
  71
  72 static vm_offset_t vnode_pager_addr __P((struct vnode *vp, vm_ooffset_t address,
  73                                          int *run));
  74 static void vnode_pager_iodone __P((struct buf *bp));
  75 static int vnode_pager_input_smlfs __P((vm_object_t object, vm_page_t m));
  76 static int vnode_pager_input_old __P((vm_object_t object, vm_page_t m));
  77 static void vnode_pager_dealloc __P((vm_object_t));
  78 static int vnode_pager_getpages __P((vm_object_t, vm_page_t *, int, int));
  79 static void vnode_pager_putpages __P((vm_object_t, vm_page_t *, int, boolean_t, int *));
  80 static boolean_t vnode_pager_haspage __P((vm_object_t, vm_pindex_t, int *, int *));
  81
  82 struct pagerops vnodepagerops = {
  83         NULL,
  84         vnode_pager_alloc,
  85         vnode_pager_dealloc,
  86         vnode_pager_getpages,
  87         vnode_pager_putpages,
  88         vnode_pager_haspage,
  89         NULL
  90 };
  91
  92 int vnode_pbuf_freecnt = -1;    /* start out unlimited */
  93
  94
  95 /*
  96  * Allocate (or lookup) pager for a vnode.
  97  * Handle is a vnode pointer.
  98  */
  99 vm_object_t
 100 vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
 101                   vm_ooffset_t offset)
 102 {
 103         vm_object_t object;
 104         struct vnode *vp;
 105
 106         mtx_assert(&Giant, MA_OWNED);
 107         /*
 108          * Pageout to vnode, no can do yet.
 109          */
 110         if (handle == NULL)
 111                 return (NULL);
 112
 113         /*
 114          * XXX hack - This initialization should be put somewhere else.
 115          */
 116         if (vnode_pbuf_freecnt < 0) {
 117             vnode_pbuf_freecnt = nswbuf / 2 + 1;
 118         }
 119
 120         vp = (struct vnode *) handle;
 121
 122         /*
 123          * Prevent race condition when allocating the object. This
 124          * can happen with NFS vnodes since the nfsnode isn't locked.
 125          */
 126         mtx_unlock(&vm_mtx);
 127         while (vp->v_flag & VOLOCK) {
 128                 vp->v_flag |= VOWANT;
 129                 tsleep(vp, PVM, "vnpobj", 0);
 130         }
 131         vp->v_flag |= VOLOCK;
 132         mtx_lock(&vm_mtx);
 133
 134         /*
 135          * If the object is being terminated, wait for it to
 136          * go away.
 137          */
 138         while (((object = vp->v_object) != NULL) &&
 139                 (object->flags & OBJ_DEAD)) {
 140                 msleep(object, &vm_mtx, PVM, "vadead", 0);
 141         }
 142
 143         if (vp->v_usecount == 0)
 144                 panic("vnode_pager_alloc: no vnode reference");
 145
 146         if (object == NULL) {
 147                 /*
 148                  * And an object of the appropriate size
 149                  */
 150                 object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size)));
 151                 object->flags = 0;
 152
 153                 object->un_pager.vnp.vnp_size = size;
 154
 155                 object->handle = handle;
 156                 vp->v_object = object;
 157                 vp->v_usecount++;
 158         } else {
 159                 object->ref_count++;
 160                 vp->v_usecount++;
 161         }
 162
 163         mtx_unlock(&vm_mtx);
 164         vp->v_flag &= ~VOLOCK;
 165         if (vp->v_flag & VOWANT) {
 166                 vp->v_flag &= ~VOWANT;
 167                 wakeup(vp);
 168         }
 169         mtx_lock(&vm_mtx);
 170         return (object);
 171 }
 172
 173 static void
 174 vnode_pager_dealloc(object)
 175         vm_object_t object;
 176 {
 177         register struct vnode *vp = object->handle;
 178
 179         mtx_assert(&Giant, MA_OWNED);
 180         if (vp == NULL)
 181                 panic("vnode_pager_dealloc: pager already dealloced");
 182
 183         vm_object_pip_wait(object, "vnpdea");
 184
 185         object->handle = NULL;
 186         object->type = OBJT_DEAD;
 187         vp->v_object = NULL;
 188         vp->v_flag &= ~(VTEXT | VOBJBUF);
 189 }
 190
 191 static boolean_t
 192 vnode_pager_haspage(object, pindex, before, after)
 193         vm_object_t object;
 194         vm_pindex_t pindex;
 195         int *before;
 196         int *after;
 197 {
 198         struct vnode *vp = object->handle;
 199         daddr_t bn;
 200         int err;
 201         daddr_t reqblock;
 202         int poff;
 203         int bsize;
 204         int pagesperblock, blocksperpage;
 205
 206         mtx_assert(&Giant, MA_OWNED);
 207         /*
 208          * If no vp or vp is doomed or marked transparent to VM, we do not
 209          * have the page.
 210          */
 211         if ((vp == NULL) || (vp->v_flag & VDOOMED))
 212                 return FALSE;
 213
 214         /*
 215          * If filesystem no longer mounted or offset beyond end of file we do
 216          * not have the page.
 217          */
 218         if ((vp->v_mount == NULL) ||
 219                 (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size))
 220                 return FALSE;
 221
 222         bsize = vp->v_mount->mnt_stat.f_iosize;
 223         pagesperblock = bsize / PAGE_SIZE;
 224         blocksperpage = 0;
 225         if (pagesperblock > 0) {
 226                 reqblock = pindex / pagesperblock;
 227         } else {
 228                 blocksperpage = (PAGE_SIZE / bsize);
 229                 reqblock = pindex * blocksperpage;
 230         }
 231         mtx_unlock(&vm_mtx);
 232         err = VOP_BMAP(vp, reqblock, (struct vnode **) 0, &bn,
 233                 after, before);
 234         mtx_lock(&vm_mtx);
 235         if (err)
 236                 return TRUE;
 237         if ( bn == -1)
 238                 return FALSE;
 239         if (pagesperblock > 0) {
 240                 poff = pindex - (reqblock * pagesperblock);
 241                 if (before) {
 242                         *before *= pagesperblock;
 243                         *before += poff;
 244                 }
 245                 if (after) {
 246                         int numafter;
 247                         *after *= pagesperblock;
 248                         numafter = pagesperblock - (poff + 1);
 249                         if (IDX_TO_OFF(pindex + numafter) > object->un_pager.vnp.vnp_size) {
 250                                 numafter = OFF_TO_IDX((object->un_pager.vnp.vnp_size - IDX_TO_OFF(pindex)));
 251                         }
 252                         *after += numafter;
 253                 }
 254         } else {
 255                 if (before) {
 256                         *before /= blocksperpage;
 257                 }
 258
 259                 if (after) {
 260                         *after /= blocksperpage;
 261                 }
 262         }
 263         return TRUE;
 264 }
 265
 266 /*
 267  * Lets the VM system know about a change in size for a file.
 268  * We adjust our own internal size and flush any cached pages in
 269  * the associated object that are affected by the size change.
 270  *
 271  * Note: this routine may be invoked as a result of a pager put
 272  * operation (possibly at object termination time), so we must be careful.
 273  */
 274 void
 275 vnode_pager_setsize(vp, nsize)
 276         struct vnode *vp;
 277         vm_ooffset_t nsize;
 278 {
 279         vm_pindex_t nobjsize;
 280         vm_object_t object = vp->v_object;
 281
 282         if (object == NULL)
 283                 return;
 284
 285         /*
 286          * Hasn't changed size
 287          */
 288         if (nsize == object->un_pager.vnp.vnp_size)
 289                 return;
 290
 291         nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
 292
 293         /*
 294          * File has shrunk. Toss any cached pages beyond the new EOF.
 295          */
 296         if (nsize < object->un_pager.vnp.vnp_size) {
 297                 int hadvmlock;
 298
 299                 hadvmlock = mtx_owned(&vm_mtx);
 300                 if (!hadvmlock)
 301                         mtx_lock(&vm_mtx);
 302                 vm_freeze_copyopts(object, OFF_TO_IDX(nsize), object->size);
 303                 if (nobjsize < object->size) {
 304                         vm_object_page_remove(object, nobjsize, object->size,
 305                                 FALSE);
 306                 }
 307                 /*
 308                  * this gets rid of garbage at the end of a page that is now
 309                  * only partially backed by the vnode...
 310                  */
 311                 if (nsize & PAGE_MASK) {
 312                         vm_offset_t kva;
 313                         vm_page_t m;
 314
 315                         m = vm_page_lookup(object, OFF_TO_IDX(nsize));
 316                         if (m) {
 317                                 int base = (int)nsize & PAGE_MASK;
 318                                 int size = PAGE_SIZE - base;
 319
 320                                 /*
 321                                  * Clear out partial-page garbage in case
 322                                  * the page has been mapped.
 323                                  */
 324                                 kva = vm_pager_map_page(m);
 325                                 bzero((caddr_t)kva + base, size);
 326                                 vm_pager_unmap_page(kva);
 327
 328                                 /*
 329                                  * Clear out partial-page dirty bits.  This
 330                                  * has the side effect of setting the valid
 331                                  * bits, but that is ok.  There are a bunch
 332                                  * of places in the VM system where we expected
 333                                  * m->dirty == VM_PAGE_BITS_ALL.  The file EOF
 334                                  * case is one of them.  If the page is still
 335                                  * partially dirty, make it fully dirty.
 336                                  */
 337                                 vm_page_set_validclean(m, base, size);
 338                                 if (m->dirty != 0)
 339                                         m->dirty = VM_PAGE_BITS_ALL;
 340                         }
 341                 }
 342                 if (!hadvmlock)
 343                         mtx_unlock(&vm_mtx);
 344         }
 345         object->un_pager.vnp.vnp_size = nsize;
 346         object->size = nobjsize;
 347 }
 348
 349 /*
 350  * calculate the linear (byte) disk address of specified virtual
 351  * file address
 352  */
 353 static vm_offset_t
 354 vnode_pager_addr(vp, address, run)
 355         struct vnode *vp;
 356         vm_ooffset_t address;
 357         int *run;
 358 {
 359         int rtaddress;
 360         int bsize;
 361         daddr_t block;
 362         struct vnode *rtvp;
 363         int err;
 364         daddr_t vblock;
 365         int voffset;
 366
 367         mtx_assert(&Giant, MA_OWNED);
 368         if ((int) address < 0)
 369                 return -1;
 370
 371         if (vp->v_mount == NULL)
 372                 return -1;
 373
 374         bsize = vp->v_mount->mnt_stat.f_iosize;
 375         vblock = address / bsize;
 376         voffset = address % bsize;
 377         mtx_unlock(&vm_mtx);
 378
 379         err = VOP_BMAP(vp, vblock, &rtvp, &block, run, NULL);
 380
 381         mtx_lock(&vm_mtx);
 382         if (err || (block == -1))
 383                 rtaddress = -1;
 384         else {
 385                 rtaddress = block + voffset / DEV_BSIZE;
 386                 if( run) {
 387                         *run += 1;
 388                         *run *= bsize/PAGE_SIZE;
 389                         *run -= voffset/PAGE_SIZE;
 390                 }
 391         }
 392
 393         return rtaddress;
 394 }
 395
 396 /*
 397  * interrupt routine for I/O completion
 398  */
 399 static void
 400 vnode_pager_iodone(bp)
 401         struct buf *bp;
 402 {
 403         bp->b_flags |= B_DONE;
 404         wakeup(bp);
 405 }
 406
 407 /*
 408  * small block file system vnode pager input
 409  */
 410 static int
 411 vnode_pager_input_smlfs(object, m)
 412         vm_object_t object;
 413         vm_page_t m;
 414 {
 415         int i;
 416         int s;
 417         struct vnode *dp, *vp;
 418         struct buf *bp;
 419         vm_offset_t kva;
 420         int fileaddr;
 421         vm_offset_t bsize;
 422         int error = 0;
 423
 424         mtx_assert(&Giant, MA_OWNED);
 425         vp = object->handle;
 426         if (vp->v_mount == NULL)
 427                 return VM_PAGER_BAD;
 428
 429         bsize = vp->v_mount->mnt_stat.f_iosize;
 430         mtx_unlock(&vm_mtx);
 431
 432         VOP_BMAP(vp, 0, &dp, 0, NULL, NULL);
 433
 434         mtx_lock(&vm_mtx);
 435         kva = vm_pager_map_page(m);
 436
 437         for (i = 0; i < PAGE_SIZE / bsize; i++) {
 438
 439                 if (vm_page_bits(i * bsize, bsize) & m->valid)
 440                         continue;
 441
 442                 fileaddr = vnode_pager_addr(vp,
 443                         IDX_TO_OFF(m->pindex) + i * bsize, (int *)0);
 444                 if (fileaddr != -1) {
 445                         mtx_unlock(&vm_mtx);
 446                         bp = getpbuf(&vnode_pbuf_freecnt);
 447
 448                         /* build a minimal buffer header */
 449                         bp->b_iocmd = BIO_READ;
 450                         bp->b_iodone = vnode_pager_iodone;
 451                         bp->b_rcred = bp->b_wcred = curproc->p_ucred;
 452                         if (bp->b_rcred != NOCRED)
 453                                 crhold(bp->b_rcred);
 454                         if (bp->b_wcred != NOCRED)
 455                                 crhold(bp->b_wcred);
 456                         bp->b_data = (caddr_t) kva + i * bsize;
 457                         bp->b_blkno = fileaddr;
 458                         pbgetvp(dp, bp);
 459                         bp->b_bcount = bsize;
 460                         bp->b_bufsize = bsize;
 461                         bp->b_runningbufspace = bp->b_bufsize;
 462                         runningbufspace += bp->b_runningbufspace;
 463
 464                         /* do the input */
 465                         BUF_STRATEGY(bp);
 466
 467                         /* we definitely need to be at splvm here */
 468
 469                         s = splvm();
 470                         while ((bp->b_flags & B_DONE) == 0) {
 471                                 tsleep(bp, PVM, "vnsrd", 0);
 472                         }
 473                         splx(s);
 474                         if ((bp->b_ioflags & BIO_ERROR) != 0)
 475                                 error = EIO;
 476
 477                         /*
 478                          * free the buffer header back to the swap buffer pool
 479                          */
 480                         relpbuf(bp, &vnode_pbuf_freecnt);
 481                         mtx_lock(&vm_mtx);
 482                         if (error)
 483                                 break;
 484
 485                         vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
 486                 } else {
 487                         vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
 488                         bzero((caddr_t) kva + i * bsize, bsize);
 489                 }
 490         }
 491         vm_pager_unmap_page(kva);
 492         pmap_clear_modify(m);
 493         vm_page_flag_clear(m, PG_ZERO);
 494         if (error) {
 495                 return VM_PAGER_ERROR;
 496         }
 497         return VM_PAGER_OK;
 498
 499 }
 500
 501
 502 /*
 503  * old style vnode pager output routine
 504  */
 505 static int
 506 vnode_pager_input_old(object, m)
 507         vm_object_t object;
 508         vm_page_t m;
 509 {
 510         struct uio auio;
 511         struct iovec aiov;
 512         int error;
 513         int size;
 514         vm_offset_t kva;
 515         struct vnode *vp;
 516
 517         mtx_assert(&Giant, MA_OWNED);
 518         error = 0;
 519
 520         /*
 521          * Return failure if beyond current EOF
 522          */
 523         if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) {
 524                 return VM_PAGER_BAD;
 525         } else {
 526                 size = PAGE_SIZE;
 527                 if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size)
 528                         size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex);
 529
 530                 /*
 531                  * Allocate a kernel virtual address and initialize so that
 532                  * we can use VOP_READ/WRITE routines.
 533                  */
 534                 kva = vm_pager_map_page(m);
 535
 536                 vp = object->handle;
 537                 mtx_unlock(&vm_mtx);
 538                 aiov.iov_base = (caddr_t) kva;
 539                 aiov.iov_len = size;
 540                 auio.uio_iov = &aiov;
 541                 auio.uio_iovcnt = 1;
 542                 auio.uio_offset = IDX_TO_OFF(m->pindex);
 543                 auio.uio_segflg = UIO_SYSSPACE;
 544                 auio.uio_rw = UIO_READ;
 545                 auio.uio_resid = size;
 546                 auio.uio_procp = curproc;
 547
 548                 error = VOP_READ(vp, &auio, 0, curproc->p_ucred);
 549                 if (!error) {
 550                         register int count = size - auio.uio_resid;
 551
 552                         if (count == 0)
 553                                 error = EINVAL;
 554                         else if (count != PAGE_SIZE)
 555                                 bzero((caddr_t) kva + count, PAGE_SIZE - count);
 556                 }
 557                 mtx_lock(&vm_mtx);
 558                 vm_pager_unmap_page(kva);
 559         }
 560         pmap_clear_modify(m);
 561         vm_page_undirty(m);
 562         vm_page_flag_clear(m, PG_ZERO);
 563         if (!error)
 564                 m->valid = VM_PAGE_BITS_ALL;
 565         return error ? VM_PAGER_ERROR : VM_PAGER_OK;
 566 }
 567
 568 /*
 569  * generic vnode pager input routine
 570  */
 571
 572 /*
 573  * Local media VFS's that do not implement their own VOP_GETPAGES
 574  * should have their VOP_GETPAGES should call to
 575  * vnode_pager_generic_getpages() to implement the previous behaviour.
 576  *
 577  * All other FS's should use the bypass to get to the local media
 578  * backing vp's VOP_GETPAGES.
 579  */
 580 static int
 581 vnode_pager_getpages(object, m, count, reqpage)
 582         vm_object_t object;
 583         vm_page_t *m;
 584         int count;
 585         int reqpage;
 586 {
 587         int rtval;
 588         struct vnode *vp;
 589         int bytes = count * PAGE_SIZE;
 590
 591         mtx_assert(&Giant, MA_OWNED);
 592         vp = object->handle;
 593         rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0);
 594         KASSERT(rtval != EOPNOTSUPP,
 595             ("vnode_pager: FS getpages not implemented\n"));
 596         return rtval;
 597 }
 598
 599
 600 /*
 601  * This is now called from local media FS's to operate against their
 602  * own vnodes if they fail to implement VOP_GETPAGES.
 603  */
 604 int
 605 vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
 606         struct vnode *vp;
 607         vm_page_t *m;
 608         int bytecount;
 609         int reqpage;
 610 {
 611         vm_object_t object;
 612         vm_offset_t kva;
 613         off_t foff, tfoff, nextoff;
 614         int i, size, bsize, first, firstaddr;
 615         struct vnode *dp;
 616         int runpg;
 617         int runend;
 618         struct buf *bp;
 619         int s;
 620         int count;
 621         int error = 0;
 622
 623         mtx_assert(&Giant, MA_OWNED);
 624         object = vp->v_object;
 625         count = bytecount / PAGE_SIZE;
 626
 627         if (vp->v_mount == NULL)
 628                 return VM_PAGER_BAD;
 629
 630         bsize = vp->v_mount->mnt_stat.f_iosize;
 631
 632         /* get the UNDERLYING device for the file with VOP_BMAP() */
 633
 634         /*
 635          * originally, we did not check for an error return value -- assuming
 636          * an fs always has a bmap entry point -- that assumption is wrong!!!
 637          */
 638         foff = IDX_TO_OFF(m[reqpage]->pindex);
 639
 640         /*
 641          * if we can't bmap, use old VOP code
 642          */
 643         mtx_unlock(&vm_mtx);
 644         if (VOP_BMAP(vp, 0, &dp, 0, NULL, NULL)) {
 645                 mtx_lock(&vm_mtx);
 646                 for (i = 0; i < count; i++) {
 647                         if (i != reqpage) {
 648                                 vm_page_free(m[i]);
 649                         }
 650                 }
 651                 cnt.v_vnodein++;
 652                 cnt.v_vnodepgsin++;
 653                 return vnode_pager_input_old(object, m[reqpage]);
 654
 655                 /*
 656                  * if the blocksize is smaller than a page size, then use
 657                  * special small filesystem code.  NFS sometimes has a small
 658                  * blocksize, but it can handle large reads itself.
 659                  */
 660         } else if ((PAGE_SIZE / bsize) > 1 &&
 661             (vp->v_mount->mnt_stat.f_type != nfs_mount_type)) {
 662                 mtx_lock(&vm_mtx);
 663                 for (i = 0; i < count; i++) {
 664                         if (i != reqpage) {
 665                                 vm_page_free(m[i]);
 666                         }
 667                 }
 668                 cnt.v_vnodein++;
 669                 cnt.v_vnodepgsin++;
 670                 return vnode_pager_input_smlfs(object, m[reqpage]);
 671         }
 672         mtx_lock(&vm_mtx);
 673
 674         /*
 675          * If we have a completely valid page available to us, we can
 676          * clean up and return.  Otherwise we have to re-read the
 677          * media.
 678          */
 679
 680         if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
 681                 for (i = 0; i < count; i++) {
 682                         if (i != reqpage)
 683                                 vm_page_free(m[i]);
 684                 }
 685                 return VM_PAGER_OK;
 686         }
 687         m[reqpage]->valid = 0;
 688
 689         /*
 690          * here on direct device I/O
 691          */
 692
 693         firstaddr = -1;
 694         /*
 695          * calculate the run that includes the required page
 696          */
 697         for(first = 0, i = 0; i < count; i = runend) {
 698                 firstaddr = vnode_pager_addr(vp,
 699                         IDX_TO_OFF(m[i]->pindex), &runpg);
 700                 if (firstaddr == -1) {
 701                         if (i == reqpage && foff < object->un_pager.vnp.vnp_size) {
 702                                 /* XXX no %qd in kernel. */
 703                                 panic("vnode_pager_getpages: unexpected missing page: firstaddr: %d, foff: 0x%lx%08lx, vnp_size: 0x%lx%08lx",
 704                                  firstaddr, (u_long)(foff >> 32),
 705                                  (u_long)(u_int32_t)foff,
 706                                  (u_long)(u_int32_t)
 707                                  (object->un_pager.vnp.vnp_size >> 32),
 708                                  (u_long)(u_int32_t)
 709                                  object->un_pager.vnp.vnp_size);
 710                         }
 711                         vm_page_free(m[i]);
 712                         runend = i + 1;
 713                         first = runend;
 714                         continue;
 715                 }
 716                 runend = i + runpg;
 717                 if (runend <= reqpage) {
 718                         int j;
 719                         for (j = i; j < runend; j++) {
 720                                 vm_page_free(m[j]);
 721                         }
 722                 } else {
 723                         if (runpg < (count - first)) {
 724                                 for (i = first + runpg; i < count; i++)
 725                                         vm_page_free(m[i]);
 726                                 count = first + runpg;
 727                         }
 728                         break;
 729                 }
 730                 first = runend;
 731         }
 732
 733         /*
 734          * the first and last page have been calculated now, move input pages
 735          * to be zero based...
 736          */
 737         if (first != 0) {
 738                 for (i = first; i < count; i++) {
 739                         m[i - first] = m[i];
 740                 }
 741                 count -= first;
 742                 reqpage -= first;
 743         }
 744
 745         /*
 746          * calculate the file virtual address for the transfer
 747          */
 748         foff = IDX_TO_OFF(m[0]->pindex);
 749
 750         /*
 751          * calculate the size of the transfer
 752          */
 753         size = count * PAGE_SIZE;
 754         if ((foff + size) > object->un_pager.vnp.vnp_size)
 755                 size = object->un_pager.vnp.vnp_size - foff;
 756
 757         /*
 758          * round up physical size for real devices.
 759          */
 760         if (dp->v_type == VBLK || dp->v_type == VCHR) {
 761                 int secmask = dp->v_rdev->si_bsize_phys - 1;
 762                 KASSERT(secmask < PAGE_SIZE, ("vnode_pager_generic_getpages: sector size %d too large\n", secmask + 1));
 763                 size = (size + secmask) & ~secmask;
 764         }
 765
 766         bp = getpbuf(&vnode_pbuf_freecnt);
 767         kva = (vm_offset_t) bp->b_data;
 768
 769         /*
 770          * and map the pages to be read into the kva
 771          */
 772         pmap_qenter(kva, m, count);
 773         mtx_unlock(&vm_mtx);
 774
 775         /* build a minimal buffer header */
 776         bp->b_iocmd = BIO_READ;
 777         bp->b_iodone = vnode_pager_iodone;
 778         /* B_PHYS is not set, but it is nice to fill this in */
 779         bp->b_rcred = bp->b_wcred = curproc->p_ucred;
 780         if (bp->b_rcred != NOCRED)
 781                 crhold(bp->b_rcred);
 782         if (bp->b_wcred != NOCRED)
 783                 crhold(bp->b_wcred);
 784         bp->b_blkno = firstaddr;
 785         pbgetvp(dp, bp);
 786         bp->b_bcount = size;
 787         bp->b_bufsize = size;
 788         bp->b_runningbufspace = bp->b_bufsize;
 789         runningbufspace += bp->b_runningbufspace;
 790
 791         cnt.v_vnodein++;
 792         cnt.v_vnodepgsin += count;
 793
 794         /* do the input */
 795         BUF_STRATEGY(bp);
 796
 797         s = splvm();
 798         /* we definitely need to be at splvm here */
 799
 800         while ((bp->b_flags & B_DONE) == 0) {
 801                 tsleep(bp, PVM, "vnread", 0);
 802         }
 803         splx(s);
 804         if ((bp->b_ioflags & BIO_ERROR) != 0)
 805                 error = EIO;
 806
 807         if (!error) {
 808                 if (size != count * PAGE_SIZE)
 809                         bzero((caddr_t) kva + size, PAGE_SIZE * count - size);
 810         }
 811         mtx_lock(&vm_mtx);
 812         pmap_qremove(kva, count);
 813
 814         /*
 815          * free the buffer header back to the swap buffer pool
 816          */
 817         relpbuf(bp, &vnode_pbuf_freecnt);
 818
 819         for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) {
 820                 vm_page_t mt;
 821
 822                 nextoff = tfoff + PAGE_SIZE;
 823                 mt = m[i];
 824
 825                 if (nextoff <= object->un_pager.vnp.vnp_size) {
 826                         /*
 827                          * Read filled up entire page.
 828                          */
 829                         mt->valid = VM_PAGE_BITS_ALL;
 830                         vm_page_undirty(mt);    /* should be an assert? XXX */
 831                         pmap_clear_modify(mt);
 832                 } else {
 833                         /*
 834                          * Read did not fill up entire page.  Since this
 835                          * is getpages, the page may be mapped, so we have
 836                          * to zero the invalid portions of the page even
 837                          * though we aren't setting them valid.
 838                          *
 839                          * Currently we do not set the entire page valid,
 840                          * we just try to clear the piece that we couldn't
 841                          * read.
 842                          */
 843                         vm_page_set_validclean(mt, 0,
 844                             object->un_pager.vnp.vnp_size - tfoff);
 845                         /* handled by vm_fault now */
 846                         /* vm_page_zero_invalid(mt, FALSE); */
 847                 }
 848
 849                 vm_page_flag_clear(mt, PG_ZERO);
 850                 if (i != reqpage) {
 851
 852                         /*
 853                          * whether or not to leave the page activated is up in
 854                          * the air, but we should put the page on a page queue
 855                          * somewhere. (it already is in the object). Result:
 856                          * It appears that empirical results show that
 857                          * deactivating pages is best.
 858                          */
 859
 860                         /*
 861                          * just in case someone was asking for this page we
 862                          * now tell them that it is ok to use
 863                          */
 864                         if (!error) {
 865                                 if (mt->flags & PG_WANTED)
 866                                         vm_page_activate(mt);
 867                                 else
 868                                         vm_page_deactivate(mt);
 869                                 vm_page_wakeup(mt);
 870                         } else {
 871                                 vm_page_free(mt);
 872                         }
 873                 }
 874         }
 875         if (error) {
 876                 printf("vnode_pager_getpages: I/O read error\n");
 877         }
 878         return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
 879 }
 880
 881 /*
 882  * EOPNOTSUPP is no longer legal.  For local media VFS's that do not
 883  * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call to
 884  * vnode_pager_generic_putpages() to implement the previous behaviour.
 885  *
 886  * All other FS's should use the bypass to get to the local media
 887  * backing vp's VOP_PUTPAGES.
 888  */
 889 static void
 890 vnode_pager_putpages(object, m, count, sync, rtvals)
 891         vm_object_t object;
 892         vm_page_t *m;
 893         int count;
 894         boolean_t sync;
 895         int *rtvals;
 896 {
 897         int rtval;
 898         struct vnode *vp;
 899         struct mount *mp;
 900         int bytes = count * PAGE_SIZE;
 901
 902         mtx_assert(&Giant, MA_OWNED);
 903         /*
 904          * Force synchronous operation if we are extremely low on memory
 905          * to prevent a low-memory deadlock.  VOP operations often need to
 906          * allocate more memory to initiate the I/O ( i.e. do a BMAP
 907          * operation ).  The swapper handles the case by limiting the amount
 908          * of asynchronous I/O, but that sort of solution doesn't scale well
 909          * for the vnode pager without a lot of work.
 910          *
 911          * Also, the backing vnode's iodone routine may not wake the pageout
 912          * daemon up.  This should be probably be addressed XXX.
 913          */
 914
 915         if ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)
 916                 sync |= OBJPC_SYNC;
 917
 918         /*
 919          * Call device-specific putpages function
 920          */
 921
 922         vp = object->handle;
 923         mtx_unlock(&vm_mtx);
 924         if (vp->v_type != VREG)
 925                 mp = NULL;
 926         (void)vn_start_write(vp, &mp, V_WAIT);
 927         mtx_lock(&vm_mtx);
 928         rtval = VOP_PUTPAGES(vp, m, bytes, sync, rtvals, 0);
 929         KASSERT(rtval != EOPNOTSUPP,
 930             ("vnode_pager: stale FS putpages\n"));
 931         mtx_unlock(&vm_mtx);
 932         vn_finished_write(mp);
 933         mtx_lock(&vm_mtx);
 934 }
 935
 936
 937 /*
 938  * This is now called from local media FS's to operate against their
 939  * own vnodes if they fail to implement VOP_PUTPAGES.
 940  *
 941  * This is typically called indirectly via the pageout daemon and
 942  * clustering has already typically occured, so in general we ask the
 943  * underlying filesystem to write the data out asynchronously rather
 944  * then delayed.
 945  */
 946 int
 947 vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
 948         struct vnode *vp;
 949         vm_page_t *m;
 950         int bytecount;
 951         int flags;
 952         int *rtvals;
 953 {
 954         int i;
 955         vm_object_t object;
 956         int count;
 957
 958         int maxsize, ncount;
 959         vm_ooffset_t poffset;
 960         struct uio auio;
 961         struct iovec aiov;
 962         int error;
 963         int ioflags;
 964
 965         mtx_assert(&Giant, MA_OWNED);
 966         object = vp->v_object;
 967         count = bytecount / PAGE_SIZE;
 968
 969         for (i = 0; i < count; i++)
 970                 rtvals[i] = VM_PAGER_AGAIN;
 971
 972         if ((int) m[0]->pindex < 0) {
 973                 printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%x)\n",
 974                         (long)m[0]->pindex, m[0]->dirty);
 975                 rtvals[0] = VM_PAGER_BAD;
 976                 return VM_PAGER_BAD;
 977         }
 978
 979         maxsize = count * PAGE_SIZE;
 980         ncount = count;
 981
 982         poffset = IDX_TO_OFF(m[0]->pindex);
 983         if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
 984                 if (object->un_pager.vnp.vnp_size > poffset)
 985                         maxsize = object->un_pager.vnp.vnp_size - poffset;
 986                 else
 987                         maxsize = 0;
 988                 ncount = btoc(maxsize);
 989                 if (ncount < count) {
 990                         for (i = ncount; i < count; i++) {
 991                                 rtvals[i] = VM_PAGER_BAD;
 992                         }
 993                 }
 994         }
 995         mtx_unlock(&vm_mtx);
 996
 997         /*
 998          * pageouts are already clustered, use IO_ASYNC t o force a bawrite()
 999          * rather then a bdwrite() to prevent paging I/O from saturating
1000          * the buffer cache.
1001          */
1002         ioflags = IO_VMIO;
1003         ioflags |= (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) ? IO_SYNC: IO_ASYNC;
1004         ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0;
1005
1006         aiov.iov_base = (caddr_t) 0;
1007         aiov.iov_len = maxsize;
1008         auio.uio_iov = &aiov;
1009         auio.uio_iovcnt = 1;
1010         auio.uio_offset = poffset;
1011         auio.uio_segflg = UIO_NOCOPY;
1012         auio.uio_rw = UIO_WRITE;
1013         auio.uio_resid = maxsize;
1014         auio.uio_procp = (struct proc *) 0;
1015         error = VOP_WRITE(vp, &auio, ioflags, curproc->p_ucred);
1016         mtx_lock(&vm_mtx);
1017         cnt.v_vnodeout++;
1018         cnt.v_vnodepgsout += ncount;
1019
1020         if (error) {
1021                 printf("vnode_pager_putpages: I/O error %d\n", error);
1022         }
1023         if (auio.uio_resid) {
1024                 printf("vnode_pager_putpages: residual I/O %d at %lu\n",
1025                     auio.uio_resid, (u_long)m[0]->pindex);
1026         }
1027         for (i = 0; i < ncount; i++) {
1028                 rtvals[i] = VM_PAGER_OK;
1029         }
1030         return rtvals[0];
1031 }
1032
1033 struct vnode *
1034 vnode_pager_lock(object)
1035         vm_object_t object;
1036 {
1037         struct proc *p = curproc;       /* XXX */
1038
1039         mtx_assert(&vm_mtx, MA_NOTOWNED);
1040         mtx_assert(&Giant, MA_OWNED);
1041         mtx_lock(&vm_mtx);
1042         for (; object != NULL; object = object->backing_object) {
1043                 if (object->type != OBJT_VNODE)
1044                         continue;
1045                 if (object->flags & OBJ_DEAD) {
1046                         mtx_unlock(&vm_mtx);
1047                         return NULL;
1048                 }
1049
1050                 mtx_unlock(&vm_mtx);
1051                 /* XXX; If object->handle can change, we need to cache it. */
1052                 while (vget(object->handle,
1053                         LK_NOPAUSE | LK_SHARED | LK_RETRY | LK_CANRECURSE, p)) {
1054                         if ((object->flags & OBJ_DEAD) || (object->type != OBJT_VNODE))
1055                                 return NULL;
1056                         printf("vnode_pager_lock: retrying\n");
1057                 }
1058                 return object->handle;
1059         }
1060         mtx_unlock(&vm_mtx);
1061         return NULL;
1062 }