/*-
 * Copyright (c) 1990 University of Utah.
 * Copyright (c) 1991 The Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1993, 1994 John S. Dyson
 * Copyright (c) 1995, David Greenman
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91
 */

/*
 * Page to/from files (vnodes).
 */

/*
 * TODO:
 *      Implement VOP_GETPAGES/PUTPAGES interface for filesystems.  This
 *      would greatly simplify the vnode_pager.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vmmeter.h>
#include <sys/limits.h>
#include <sys/conf.h>
#include <sys/sf_buf.h>

#include <machine/atomic.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_map.h>
#include <vm/vnode_pager.h>
#include <vm/vm_extern.h>

static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address,
    daddr_t *rtaddress, int *run);
static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m);
static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
static void vnode_pager_dealloc(vm_object_t);
static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int);
static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t);

struct pagerops vnodepagerops = {
        .pgo_alloc =    vnode_pager_alloc,
        .pgo_dealloc =  vnode_pager_dealloc,
        .pgo_getpages = vnode_pager_getpages,
        .pgo_putpages = vnode_pager_putpages,
        .pgo_haspage =  vnode_pager_haspage,
};

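/*
 * Count of free physical buffers available for vnode pager I/O; the
 * getpbuf()/relpbuf() calls below allocate from and return to this pool.
 */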
int vnode_pbuf_freecnt;

/* Create the VM system backing object for this vnode */
int
vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td)
{
        vm_object_t object;
        vm_ooffset_t size = isize;
        struct vattr va;

        if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE)
                return (0);

        while ((object = vp->v_object) != NULL) {
                VM_OBJECT_LOCK(object);
                if (!(object->flags & OBJ_DEAD)) {
                        VM_OBJECT_UNLOCK(object);
                        return (0);
                }
                VOP_UNLOCK(vp, 0);
                vm_object_set_flag(object, OBJ_DISCONNECTWNT);
                msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "vodead", 0);
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        }

        if (size == 0) {
                if (vn_isdisk(vp, NULL)) {
                        size = IDX_TO_OFF(INT_MAX);
                } else {
                        if (VOP_GETATTR(vp, &va, td->td_ucred, td) != 0)
                                return (0);
                        size = va.va_size;
                }
        }

        object = vnode_pager_alloc(vp, size, 0, 0);
        /*
         * Dereference the reference we just created.  This assumes
         * that the object is associated with the vp.
         */
        VM_OBJECT_LOCK(object);
        object->ref_count--;
        VM_OBJECT_UNLOCK(object);
        vrele(vp);

        KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object"));

        return (0);
}

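/*
 * Tear down the VM object backing this vnode; the vnode must be
 * exclusively locked, as the assertion below checks.
 */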
void
vnode_destroy_vobject(struct vnode *vp)
{
        struct vm_object *obj;

        obj = vp->v_object;
        if (obj == NULL)
                return;
        ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject");
        VM_OBJECT_LOCK(obj);
        if (obj->ref_count == 0) {
                /*
                 * vclean() may be called twice.  The first time
                 * removes the primary reference to the object,
                 * the second time goes one further and is a
                 * special case that terminates the object.
                 *
                 * Don't double-terminate the object.
                 */
                if ((obj->flags & OBJ_DEAD) == 0)
                        vm_object_terminate(obj);
                else
                        VM_OBJECT_UNLOCK(obj);
        } else {
                /*
                 * Woe to the process that tries to page now :-).
                 */
                vm_pager_deallocate(obj);
                VM_OBJECT_UNLOCK(obj);
        }
        vp->v_object = NULL;
}


/*
 * Allocate (or lookup) pager for a vnode.
 * Handle is a vnode pointer.
 *
 * MPSAFE
 */
vm_object_t
vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
                  vm_ooffset_t offset)
{
        vm_object_t object;
        struct vnode *vp;

        /*
         * Pageout to vnode, no can do yet.
         */
        if (handle == NULL)
                return (NULL);

        vp = (struct vnode *) handle;

        ASSERT_VOP_ELOCKED(vp, "vnode_pager_alloc");

        /*
         * If the object is being terminated, wait for it to
         * go away.
         */
        while ((object = vp->v_object) != NULL) {
                VM_OBJECT_LOCK(object);
                if ((object->flags & OBJ_DEAD) == 0)
                        break;
                vm_object_set_flag(object, OBJ_DISCONNECTWNT);
                msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "vadead", 0);
        }

        if (vp->v_usecount == 0)
                panic("vnode_pager_alloc: no vnode reference");

        if (object == NULL) {
                /*
                 * Add an object of the appropriate size.
                 */
                object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size)));

                object->un_pager.vnp.vnp_size = size;

                object->handle = handle;
                if (VFS_NEEDSGIANT(vp->v_mount))
                        vm_object_set_flag(object, OBJ_NEEDGIANT);
                vp->v_object = object;
        } else {
                object->ref_count++;
                VM_OBJECT_UNLOCK(object);
        }
        vref(vp);
        return (object);
}

/*
 *      The object must be locked.
 */
static void
vnode_pager_dealloc(vm_object_t object)
{
        struct vnode *vp = object->handle;

        if (vp == NULL)
                panic("vnode_pager_dealloc: pager already dealloced");

        VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
        vm_object_pip_wait(object, "vnpdea");

        object->handle = NULL;
        object->type = OBJT_DEAD;
        if (object->flags & OBJ_DISCONNECTWNT) {
                vm_object_clear_flag(object, OBJ_DISCONNECTWNT);
                wakeup(object);
        }
        ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc");
        vp->v_object = NULL;
        vp->v_vflag &= ~VV_TEXT;
}

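/*
 * Report whether backing store exists for the page at pindex.  The
 * check is done with VOP_BMAP(); when "before" and "after" are
 * non-NULL they are updated to the number of contiguously allocated
 * pages behind and ahead of pindex, converting between filesystem
 * blocks and VM pages as needed.
 */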
static boolean_t
vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
    int *after)
{
        struct vnode *vp = object->handle;
        daddr_t bn;
        int err;
        daddr_t reqblock;
        int poff;
        int bsize;
        int pagesperblock, blocksperpage;
        int vfslocked;

        VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
        /*
         * If there is no vp, or the vp is doomed, we do not have
         * the page.
         */
        if (vp == NULL || vp->v_iflag & VI_DOOMED)
                return FALSE;
        /*
         * If the offset is beyond end of file we do
         * not have the page.
         */
        if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size)
                return FALSE;

        bsize = vp->v_mount->mnt_stat.f_iosize;
        pagesperblock = bsize / PAGE_SIZE;
        blocksperpage = 0;
        if (pagesperblock > 0) {
                reqblock = pindex / pagesperblock;
        } else {
                blocksperpage = (PAGE_SIZE / bsize);
                reqblock = pindex * blocksperpage;
        }
        VM_OBJECT_UNLOCK(object);
        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
        err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before);
        VFS_UNLOCK_GIANT(vfslocked);
        VM_OBJECT_LOCK(object);
        if (err)
                return TRUE;
        if (bn == -1)
                return FALSE;
        if (pagesperblock > 0) {
                poff = pindex - (reqblock * pagesperblock);
                if (before) {
                        *before *= pagesperblock;
                        *before += poff;
                }
                if (after) {
                        int numafter;
                        *after *= pagesperblock;
                        numafter = pagesperblock - (poff + 1);
                        if (IDX_TO_OFF(pindex + numafter) >
                            object->un_pager.vnp.vnp_size) {
                                numafter =
                                    OFF_TO_IDX(object->un_pager.vnp.vnp_size) -
                                    pindex;
                        }
                        *after += numafter;
                }
        } else {
                if (before) {
                        *before /= blocksperpage;
                }

                if (after) {
                        *after /= blocksperpage;
                }
        }
        return TRUE;
}
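
/*
 * Example of the scaling above, assuming PAGE_SIZE == 4096 and an
 * 8192 byte block size (pagesperblock == 2): for pindex 5, reqblock
 * is 2 and poff is 1, so a VOP_BMAP() report of one block before and
 * one block after becomes 3 pages before (1 * 2 + 1) and 2 pages
 * after (1 * 2 + (2 - (1 + 1))), clipped to the end of the file.
 */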

/*
 * Lets the VM system know about a change in size for a file.
 * We adjust our own internal size and flush any cached pages in
 * the associated object that are affected by the size change.
 *
 * Note: this routine may be invoked as a result of a pager put
 * operation (possibly at object termination time), so we must be careful.
 */
void
vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize)
{
        vm_object_t object;
        vm_page_t m;
        vm_pindex_t nobjsize;

        if ((object = vp->v_object) == NULL)
                return;
        VM_OBJECT_LOCK(object);
        if (nsize == object->un_pager.vnp.vnp_size) {
                /*
                 * Hasn't changed size
                 */
                VM_OBJECT_UNLOCK(object);
                return;
        }
        nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
        if (nsize < object->un_pager.vnp.vnp_size) {
                /*
                 * File has shrunk.  Toss any cached pages beyond the new EOF.
                 */
                if (nobjsize < object->size)
                        vm_object_page_remove(object, nobjsize, object->size,
                            FALSE);
                /*
                 * This gets rid of garbage at the end of a page that is now
                 * only partially backed by the vnode.
                 *
                 * XXX for some reason (I don't know yet), if we take a
                 * completely invalid page and mark it partially valid
                 * it can screw up NFS reads, so we don't allow the case.
                 */
                if ((nsize & PAGE_MASK) &&
                    (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL &&
                    m->valid != 0) {
                        int base = (int)nsize & PAGE_MASK;
                        int size = PAGE_SIZE - base;

                        /*
                         * Clear out partial-page garbage in case
                         * the page has been mapped.
                         */
                        pmap_zero_page_area(m, base, size);

                        /*
                         * Clear out partial-page dirty bits.  This
                         * has the side effect of setting the valid
                         * bits, but that is ok.  There are a bunch
                         * of places in the VM system where we expect
                         * m->dirty == VM_PAGE_BITS_ALL.  The file EOF
                         * case is one of them.  If the page is still
                         * partially dirty, make it fully dirty.
                         *
                         * Note that we do not clear out the valid
                         * bits.  This would prevent bogus_page
                         * replacement from working properly.
                         */
                        vm_page_lock_queues();
                        vm_page_set_validclean(m, base, size);
                        if (m->dirty != 0)
                                m->dirty = VM_PAGE_BITS_ALL;
                        vm_page_unlock_queues();
                } else if ((nsize & PAGE_MASK) &&
                    __predict_false(object->cache != NULL)) {
                        vm_page_cache_free(object, OFF_TO_IDX(nsize),
                            nobjsize);
                }
        }
        object->un_pager.vnp.vnp_size = nsize;
        object->size = nobjsize;
        VM_OBJECT_UNLOCK(object);
}
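
/*
 * Example: truncating a file from 10000 bytes to 6000 with 4096 byte
 * pages leaves nobjsize at 2.  Page index 2 is removed outright, and
 * the tail of page 1 (file bytes 6000-8191) is zeroed and marked
 * clean so a later fault cannot see stale data past the new EOF.
 */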

/*
 * Calculate the disk block address (in DEV_BSIZE units) that backs the
 * given byte offset in the file, and optionally the length of the
 * contiguous run (in pages) starting there.
 */
static int
vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress,
    int *run)
{
        int bsize;
        int err;
        daddr_t vblock;
        daddr_t voffset;

        if (address < 0)
                return -1;

        if (vp->v_iflag & VI_DOOMED)
                return -1;

        bsize = vp->v_mount->mnt_stat.f_iosize;
        vblock = address / bsize;
        voffset = address % bsize;

        err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL);
        if (err == 0) {
                if (*rtaddress != -1)
                        *rtaddress += voffset / DEV_BSIZE;
                if (run) {
                        *run += 1;
                        *run *= bsize/PAGE_SIZE;
                        *run -= voffset/PAGE_SIZE;
                }
        }

        return (err);
}
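
/*
 * Example, assuming DEV_BSIZE == 512, PAGE_SIZE == 4096 and an 8192
 * byte block size: for file offset 12288, vblock is 1 and voffset is
 * 4096, so the result is the device address VOP_BMAP() returns for
 * block 1 plus 8 sectors, and a VOP_BMAP() run of n extra blocks
 * becomes (n + 1) * 2 - 1 pages.
 */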

/*
 * Small block filesystem vnode pager input, used when the filesystem
 * block size is smaller than a page.  Each block in the page is mapped
 * and read individually.
 */
static int
vnode_pager_input_smlfs(vm_object_t object, vm_page_t m)
{
        int i;
        struct vnode *vp;
        struct bufobj *bo;
        struct buf *bp;
        struct sf_buf *sf;
        daddr_t fileaddr;
        vm_offset_t bsize;
        int error = 0;

        vp = object->handle;
        if (vp->v_iflag & VI_DOOMED)
                return VM_PAGER_BAD;

        bsize = vp->v_mount->mnt_stat.f_iosize;

        VOP_BMAP(vp, 0, &bo, NULL, NULL, NULL);

        sf = sf_buf_alloc(m, 0);

        for (i = 0; i < PAGE_SIZE / bsize; i++) {
                vm_ooffset_t address;

                if (vm_page_bits(i * bsize, bsize) & m->valid)
                        continue;

                address = IDX_TO_OFF(m->pindex) + i * bsize;
                if (address >= object->un_pager.vnp.vnp_size) {
                        fileaddr = -1;
                } else {
                        error = vnode_pager_addr(vp, address, &fileaddr, NULL);
                        if (error)
                                break;
                }
                if (fileaddr != -1) {
                        bp = getpbuf(&vnode_pbuf_freecnt);

                        /* build a minimal buffer header */
                        bp->b_iocmd = BIO_READ;
                        bp->b_iodone = bdone;
                        KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
                        KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
                        bp->b_rcred = crhold(curthread->td_ucred);
                        bp->b_wcred = crhold(curthread->td_ucred);
                        bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize;
                        bp->b_blkno = fileaddr;
                        pbgetbo(bo, bp);
                        bp->b_bcount = bsize;
                        bp->b_bufsize = bsize;
                        bp->b_runningbufspace = bp->b_bufsize;
                        atomic_add_int(&runningbufspace, bp->b_runningbufspace);

                        /* do the input */
                        bp->b_iooffset = dbtob(bp->b_blkno);
                        bstrategy(bp);

                        bwait(bp, PVM, "vnsrd");

                        if ((bp->b_ioflags & BIO_ERROR) != 0)
                                error = EIO;

                        /*
                         * free the buffer header back to the swap buffer pool
                         */
                        pbrelbo(bp);
                        relpbuf(bp, &vnode_pbuf_freecnt);
                        if (error)
                                break;

                        VM_OBJECT_LOCK(object);
                        vm_page_lock_queues();
                        vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
                        vm_page_unlock_queues();
                        VM_OBJECT_UNLOCK(object);
                } else {
                        /* hole or past EOF: supply zeroes instead of I/O */
                        VM_OBJECT_LOCK(object);
                        vm_page_lock_queues();
                        vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
                        vm_page_unlock_queues();
                        VM_OBJECT_UNLOCK(object);
                        bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize);
                }
        }
        sf_buf_free(sf);
        vm_page_lock_queues();
        pmap_clear_modify(m);
        vm_page_unlock_queues();
        if (error) {
                return VM_PAGER_ERROR;
        }
        return VM_PAGER_OK;
}


/*
 * Old style vnode pager input routine: read the page with VOP_READ().
 * This is the fallback used when the filesystem does not support
 * VOP_BMAP().
 */
static int
vnode_pager_input_old(vm_object_t object, vm_page_t m)
{
        struct uio auio;
        struct iovec aiov;
        int error;
        int size;
        struct sf_buf *sf;
        struct vnode *vp;

        VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
        error = 0;

        /*
         * Return failure if beyond current EOF
         */
        if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) {
                return VM_PAGER_BAD;
        } else {
                size = PAGE_SIZE;
                if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size)
                        size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex);
                vp = object->handle;
                VM_OBJECT_UNLOCK(object);

                /*
                 * Allocate a kernel virtual address and initialize so that
                 * we can use VOP_READ/WRITE routines.
                 */
                sf = sf_buf_alloc(m, 0);

                aiov.iov_base = (caddr_t)sf_buf_kva(sf);
                aiov.iov_len = size;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_offset = IDX_TO_OFF(m->pindex);
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_rw = UIO_READ;
                auio.uio_resid = size;
                auio.uio_td = curthread;

                error = VOP_READ(vp, &auio, 0, curthread->td_ucred);
                if (!error) {
                        int count = size - auio.uio_resid;

                        if (count == 0)
                                error = EINVAL;
                        else if (count != PAGE_SIZE)
                                bzero((caddr_t)sf_buf_kva(sf) + count,
                                    PAGE_SIZE - count);
                }
                sf_buf_free(sf);

                VM_OBJECT_LOCK(object);
        }
        vm_page_lock_queues();
        pmap_clear_modify(m);
        vm_page_undirty(m);
        vm_page_unlock_queues();
        if (!error)
                m->valid = VM_PAGE_BITS_ALL;
        return error ? VM_PAGER_ERROR : VM_PAGER_OK;
}

/*
 * generic vnode pager input routine
 */

/*
 * Local media VFS's that do not implement their own VOP_GETPAGES
 * should have their VOP_GETPAGES call vnode_pager_generic_getpages()
 * to implement the previous behaviour.
 *
 * All other FS's should use the bypass to get to the local media
 * backing vp's VOP_GETPAGES.
 */
static int
vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
{
        int rtval;
        struct vnode *vp;
        int bytes = count * PAGE_SIZE;
        int vfslocked;

        vp = object->handle;
        VM_OBJECT_UNLOCK(object);
        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
        rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0);
        KASSERT(rtval != EOPNOTSUPP,
            ("vnode_pager: FS getpages not implemented\n"));
        VFS_UNLOCK_GIANT(vfslocked);
        VM_OBJECT_LOCK(object);
        return rtval;
}

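/*
 * A local media filesystem without its own getpages routine would
 * typically forward its VOP_GETPAGES slot to the generic code below,
 * along these lines (sketch for a hypothetical foofs):
 *
 *      static int
 *      foofs_getpages(struct vop_getpages_args *ap)
 *      {
 *
 *              return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
 *                  ap->a_count, ap->a_reqpage));
 *      }
 */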
/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_GETPAGES.
 */
int
vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int bytecount,
    int reqpage)
{
        vm_object_t object;
        vm_offset_t kva;
        off_t foff, tfoff, nextoff;
        int i, j, size, bsize, first;
        daddr_t firstaddr, reqblock;
        struct bufobj *bo;
        int runpg;
        int runend;
        struct buf *bp;
        int count;
        int error;

        object = vp->v_object;
        count = bytecount / PAGE_SIZE;

        KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
            ("vnode_pager_generic_getpages does not support devices"));
        if (vp->v_iflag & VI_DOOMED)
                return VM_PAGER_BAD;

        bsize = vp->v_mount->mnt_stat.f_iosize;

        /* get the UNDERLYING device for the file with VOP_BMAP() */

        /*
         * Originally, we did not check for an error return value --
         * assuming a filesystem always has a bmap entry point -- but
         * that assumption is wrong!
         */
        foff = IDX_TO_OFF(m[reqpage]->pindex);

        /*
         * if we can't bmap, use old VOP code
         */
        error = VOP_BMAP(vp, foff / bsize, &bo, &reqblock, NULL, NULL);
        if (error == EOPNOTSUPP) {
                VM_OBJECT_LOCK(object);
                vm_page_lock_queues();
                for (i = 0; i < count; i++)
                        if (i != reqpage)
                                vm_page_free(m[i]);
                vm_page_unlock_queues();
                PCPU_INC(cnt.v_vnodein);
                PCPU_INC(cnt.v_vnodepgsin);
                error = vnode_pager_input_old(object, m[reqpage]);
                VM_OBJECT_UNLOCK(object);
                return (error);
        } else if (error != 0) {
                VM_OBJECT_LOCK(object);
                vm_page_lock_queues();
                for (i = 0; i < count; i++)
                        if (i != reqpage)
                                vm_page_free(m[i]);
                vm_page_unlock_queues();
                VM_OBJECT_UNLOCK(object);
                return (VM_PAGER_ERROR);

                /*
                 * if the blocksize is smaller than a page size, then use
                 * special small filesystem code.  NFS sometimes has a small
                 * blocksize, but it can handle large reads itself.
                 */
        } else if ((PAGE_SIZE / bsize) > 1 &&
            (vp->v_mount->mnt_stat.f_type != nfs_mount_type)) {
                VM_OBJECT_LOCK(object);
                vm_page_lock_queues();
                for (i = 0; i < count; i++)
                        if (i != reqpage)
                                vm_page_free(m[i]);
                vm_page_unlock_queues();
                VM_OBJECT_UNLOCK(object);
                PCPU_INC(cnt.v_vnodein);
                PCPU_INC(cnt.v_vnodepgsin);
                return vnode_pager_input_smlfs(object, m[reqpage]);
        }

        /*
         * If we have a completely valid page available to us, we can
         * clean up and return.  Otherwise we have to re-read the
         * media.
         */
        VM_OBJECT_LOCK(object);
        if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
                vm_page_lock_queues();
                for (i = 0; i < count; i++)
                        if (i != reqpage)
                                vm_page_free(m[i]);
                vm_page_unlock_queues();
                VM_OBJECT_UNLOCK(object);
                return VM_PAGER_OK;
        } else if (reqblock == -1) {
                pmap_zero_page(m[reqpage]);
                vm_page_undirty(m[reqpage]);
                m[reqpage]->valid = VM_PAGE_BITS_ALL;
                vm_page_lock_queues();
                for (i = 0; i < count; i++)
                        if (i != reqpage)
                                vm_page_free(m[i]);
                vm_page_unlock_queues();
                VM_OBJECT_UNLOCK(object);
                return (VM_PAGER_OK);
        }
        m[reqpage]->valid = 0;
        VM_OBJECT_UNLOCK(object);

        /*
         * here on direct device I/O
         */
        firstaddr = -1;

        /*
         * calculate the run that includes the required page
         */
        for (first = 0, i = 0; i < count; i = runend) {
                if (vnode_pager_addr(vp, IDX_TO_OFF(m[i]->pindex), &firstaddr,
                    &runpg) != 0) {
                        VM_OBJECT_LOCK(object);
                        vm_page_lock_queues();
                        for (; i < count; i++)
                                if (i != reqpage)
                                        vm_page_free(m[i]);
                        vm_page_unlock_queues();
                        VM_OBJECT_UNLOCK(object);
                        return (VM_PAGER_ERROR);
                }
                if (firstaddr == -1) {
                        VM_OBJECT_LOCK(object);
                        if (i == reqpage && foff < object->un_pager.vnp.vnp_size) {
                                panic("vnode_pager_getpages: unexpected missing page: firstaddr: %jd, foff: 0x%jx, vnp_size: 0x%jx",
                                    (intmax_t)firstaddr, (uintmax_t)foff,
                                    (uintmax_t)object->un_pager.vnp.vnp_size);
                        }
                        vm_page_lock_queues();
                        vm_page_free(m[i]);
                        vm_page_unlock_queues();
                        VM_OBJECT_UNLOCK(object);
                        runend = i + 1;
                        first = runend;
                        continue;
                }
                runend = i + runpg;
                if (runend <= reqpage) {
                        VM_OBJECT_LOCK(object);
                        vm_page_lock_queues();
                        for (j = i; j < runend; j++)
                                vm_page_free(m[j]);
                        vm_page_unlock_queues();
                        VM_OBJECT_UNLOCK(object);
                } else {
                        if (runpg < (count - first)) {
                                VM_OBJECT_LOCK(object);
                                vm_page_lock_queues();
                                for (i = first + runpg; i < count; i++)
                                        vm_page_free(m[i]);
                                vm_page_unlock_queues();
                                VM_OBJECT_UNLOCK(object);
                                count = first + runpg;
                        }
                        break;
                }
                first = runend;
        }

        /*
         * the first and last page have been calculated now, move input pages
         * to be zero based...
         */
        if (first != 0) {
                m += first;
                count -= first;
                reqpage -= first;
        }

        /*
         * calculate the file virtual address for the transfer
         */
        foff = IDX_TO_OFF(m[0]->pindex);

        /*
         * calculate the size of the transfer
         */
        size = count * PAGE_SIZE;
        KASSERT(count > 0, ("zero count"));
        if ((foff + size) > object->un_pager.vnp.vnp_size)
                size = object->un_pager.vnp.vnp_size - foff;
        KASSERT(size > 0, ("zero size"));

        /*
         * round up physical size for real devices.
         */
        {
                int secmask = bo->bo_bsize - 1;
                KASSERT(secmask < PAGE_SIZE && secmask > 0,
                    ("vnode_pager_generic_getpages: sector size %d too large",
                    secmask + 1));
                size = (size + secmask) & ~secmask;
        }
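
        /*
         * Example: with a 512 byte sector (secmask == 511), a 6000 byte
         * transfer is rounded up to 6144 bytes so the device sees whole
         * sectors; the remainder of the last page, past the rounded-up
         * transfer, is zeroed below once the read completes.
         */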

        bp = getpbuf(&vnode_pbuf_freecnt);
        kva = (vm_offset_t) bp->b_data;

        /*
         * and map the pages to be read into the kva
         */
        pmap_qenter(kva, m, count);

        /* build a minimal buffer header */
        bp->b_iocmd = BIO_READ;
        bp->b_iodone = bdone;
        KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
        KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
        bp->b_rcred = crhold(curthread->td_ucred);
        bp->b_wcred = crhold(curthread->td_ucred);
        bp->b_blkno = firstaddr;
        pbgetbo(bo, bp);
        bp->b_bcount = size;
        bp->b_bufsize = size;
        bp->b_runningbufspace = bp->b_bufsize;
        atomic_add_int(&runningbufspace, bp->b_runningbufspace);

        PCPU_INC(cnt.v_vnodein);
        PCPU_ADD(cnt.v_vnodepgsin, count);

        /* do the input */
        bp->b_iooffset = dbtob(bp->b_blkno);
        bstrategy(bp);

        bwait(bp, PVM, "vnread");

        if ((bp->b_ioflags & BIO_ERROR) != 0)
                error = EIO;

        if (!error) {
                if (size != count * PAGE_SIZE)
                        bzero((caddr_t) kva + size, PAGE_SIZE * count - size);
        }
        pmap_qremove(kva, count);

        /*
         * free the buffer header back to the swap buffer pool
         */
        pbrelbo(bp);
        relpbuf(bp, &vnode_pbuf_freecnt);

        VM_OBJECT_LOCK(object);
        vm_page_lock_queues();
        for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) {
                vm_page_t mt;

                nextoff = tfoff + PAGE_SIZE;
                mt = m[i];

                if (nextoff <= object->un_pager.vnp.vnp_size) {
                        /*
                         * Read filled up entire page.
                         */
                        mt->valid = VM_PAGE_BITS_ALL;
                        vm_page_undirty(mt);    /* should be an assert? XXX */
                        pmap_clear_modify(mt);
                } else {
                        /*
                         * Read did not fill up entire page.  Since this
                         * is getpages, the page may be mapped, so we have
                         * to zero the invalid portions of the page even
                         * though we aren't setting them valid.
                         *
                         * Currently we do not set the entire page valid,
                         * we just try to clear the piece that we couldn't
                         * read.
                         */
                        vm_page_set_validclean(mt, 0,
                            object->un_pager.vnp.vnp_size - tfoff);
                        /* handled by vm_fault now */
                        /* vm_page_zero_invalid(mt, FALSE); */
                }

                if (i != reqpage) {
                        /*
                         * Whether to leave the page activated is up in
                         * the air, but we should put the page on a page
                         * queue somewhere (it already is in the object).
                         * Empirically, deactivating pages works out best.
                         */

                        /*
                         * Just in case someone was asking for this page we
                         * now tell them that it is ok to use.
                         */
                        if (!error) {
                                if (mt->oflags & VPO_WANTED)
                                        vm_page_activate(mt);
                                else
                                        vm_page_deactivate(mt);
                                vm_page_wakeup(mt);
                        } else {
                                vm_page_free(mt);
                        }
                }
        }
        vm_page_unlock_queues();
        VM_OBJECT_UNLOCK(object);
        if (error) {
                printf("vnode_pager_getpages: I/O read error\n");
        }
        return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
}

/*
 * EOPNOTSUPP is no longer legal.  For local media VFS's that do not
 * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call
 * vnode_pager_generic_putpages() to implement the previous behaviour.
 *
 * All other FS's should use the bypass to get to the local media
 * backing vp's VOP_PUTPAGES.
 */
static void
vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count,
    boolean_t sync, int *rtvals)
{
        int rtval;
        struct vnode *vp;
        int bytes = count * PAGE_SIZE;

        /*
         * Force synchronous operation if we are extremely low on memory
         * to prevent a low-memory deadlock.  VOP operations often need to
         * allocate more memory to initiate the I/O ( i.e. do a BMAP
         * operation ).  The swapper handles the case by limiting the amount
         * of asynchronous I/O, but that sort of solution doesn't scale well
         * for the vnode pager without a lot of work.
         *
         * Also, the backing vnode's iodone routine may not wake the pageout
         * daemon up.  This should probably be addressed.  XXX
         */
        if ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)
                sync |= OBJPC_SYNC;

        /*
         * Call device-specific putpages function
         */
        vp = object->handle;
        VM_OBJECT_UNLOCK(object);
        rtval = VOP_PUTPAGES(vp, m, bytes, sync, rtvals, 0);
        KASSERT(rtval != EOPNOTSUPP,
            ("vnode_pager: stale FS putpages\n"));
        VM_OBJECT_LOCK(object);
}


/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_PUTPAGES.
 *
 * This is typically called indirectly via the pageout daemon and
 * clustering has already typically occurred, so in general we ask the
 * underlying filesystem to write the data out asynchronously rather
 * than delayed.
 */
int
vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *m, int bytecount,
    int flags, int *rtvals)
{
        int i;
        vm_object_t object;
        int count;

        int maxsize, ncount;
        vm_ooffset_t poffset;
        struct uio auio;
        struct iovec aiov;
        int error;
        int ioflags;
        int ppscheck = 0;
        static struct timeval lastfail;
        static int curfail;

        object = vp->v_object;
        count = bytecount / PAGE_SIZE;

        for (i = 0; i < count; i++)
                rtvals[i] = VM_PAGER_AGAIN;

        if ((int64_t)m[0]->pindex < 0) {
                printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%lx)\n",
                        (long)m[0]->pindex, (u_long)m[0]->dirty);
                rtvals[0] = VM_PAGER_BAD;
                return VM_PAGER_BAD;
        }

        maxsize = count * PAGE_SIZE;
        ncount = count;

        poffset = IDX_TO_OFF(m[0]->pindex);

        /*
         * If the page-aligned write is larger than the actual file we
         * have to invalidate pages occurring beyond the file EOF.  However,
         * there is an edge case where a file may not be page-aligned where
         * the last page is partially invalid.  In this case the filesystem
         * may not properly clear the dirty bits for the entire page (which
         * could be VM_PAGE_BITS_ALL due to the page having been mmap()d).
         * With the page locked we are free to fix up the dirty bits here.
         *
         * We do not under any circumstances truncate the valid bits, as
         * this will screw up bogus page replacement.
         */
        if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
                if (object->un_pager.vnp.vnp_size > poffset) {
                        int pgoff;

                        maxsize = object->un_pager.vnp.vnp_size - poffset;
                        ncount = btoc(maxsize);
                        if ((pgoff = (int)maxsize & PAGE_MASK) != 0) {
                                vm_page_lock_queues();
                                vm_page_clear_dirty(m[ncount - 1], pgoff,
                                        PAGE_SIZE - pgoff);
                                vm_page_unlock_queues();
                        }
                } else {
                        maxsize = 0;
                        ncount = 0;
                }
                if (ncount < count) {
                        for (i = ncount; i < count; i++) {
                                rtvals[i] = VM_PAGER_BAD;
                        }
                }
        }
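
        /*
         * Example: with vnp_size == 10000 and a single page being written
         * at offset 8192, maxsize becomes 1808 and ncount stays 1; the
         * dirty bits for bytes 1808-4095 of that page are cleared so the
         * tail past EOF is not written, and any pages lying wholly past
         * EOF would instead have been flagged VM_PAGER_BAD.
         */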

        /*
         * Pageouts are already clustered, so use IO_ASYNC to force a
         * bawrite() rather than a bdwrite() to prevent paging I/O from
         * saturating the buffer cache.  Dummy-up the sequential heuristic
         * to cause large ranges to cluster.  If neither IO_SYNC nor
         * IO_ASYNC is set, the system decides how to cluster.
         */
        ioflags = IO_VMIO;
        if (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL))
                ioflags |= IO_SYNC;
        else if ((flags & VM_PAGER_CLUSTER_OK) == 0)
                ioflags |= IO_ASYNC;
        ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0;
        ioflags |= IO_SEQMAX << IO_SEQSHIFT;

        aiov.iov_base = (caddr_t) 0;
        aiov.iov_len = maxsize;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_offset = poffset;
        auio.uio_segflg = UIO_NOCOPY;
        auio.uio_rw = UIO_WRITE;
        auio.uio_resid = maxsize;
        auio.uio_td = (struct thread *) 0;
        error = VOP_WRITE(vp, &auio, ioflags, curthread->td_ucred);
        PCPU_INC(cnt.v_vnodeout);
        PCPU_ADD(cnt.v_vnodepgsout, ncount);

        if (error) {
                if ((ppscheck = ppsratecheck(&lastfail, &curfail, 1)))
                        printf("vnode_pager_putpages: I/O error %d\n", error);
        }
        if (auio.uio_resid) {
                if (ppscheck || ppsratecheck(&lastfail, &curfail, 1))
                        printf("vnode_pager_putpages: residual I/O %d at %lu\n",
                            auio.uio_resid, (u_long)m[0]->pindex);
        }
        for (i = 0; i < ncount; i++) {
                rtvals[i] = VM_PAGER_OK;
        }
        return rtvals[0];
}

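/*
 * Walk down the shadow chain from first_object looking for a live
 * vnode-backed object and, if one is found, acquire a shared vnode
 * lock on its vnode.  Returns the locked vnode or NULL; first_object
 * must be locked on entry and is left locked on return.
 */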
struct vnode *
vnode_pager_lock(vm_object_t first_object)
{
        struct vnode *vp;
        vm_object_t backing_object, object;

        VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED);
        for (object = first_object; object != NULL; object = backing_object) {
                if (object->type != OBJT_VNODE) {
                        if ((backing_object = object->backing_object) != NULL)
                                VM_OBJECT_LOCK(backing_object);
                        if (object != first_object)
                                VM_OBJECT_UNLOCK(object);
                        continue;
                }
        retry:
                if (object->flags & OBJ_DEAD) {
                        if (object != first_object)
                                VM_OBJECT_UNLOCK(object);
                        return NULL;
                }
                vp = object->handle;
                VI_LOCK(vp);
                VM_OBJECT_UNLOCK(object);
                if (first_object != object)
                        VM_OBJECT_UNLOCK(first_object);
                VFS_ASSERT_GIANT(vp->v_mount);
                if (vget(vp, LK_CANRECURSE | LK_INTERLOCK |
                    LK_RETRY | LK_SHARED, curthread)) {
                        VM_OBJECT_LOCK(first_object);
                        if (object != first_object)
                                VM_OBJECT_LOCK(object);
                        if (object->type != OBJT_VNODE) {
                                if (object != first_object)
                                        VM_OBJECT_UNLOCK(object);
                                return NULL;
                        }
                        printf("vnode_pager_lock: retrying\n");
                        goto retry;
                }
                VM_OBJECT_LOCK(first_object);
                return (vp);
        }
        return NULL;
}