sys/ufs/ffs/ffs_vnops.c

   1 /*-
   2  * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause)
   3  *
   4  * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
   5  * All rights reserved.
   6  *
   7  * This software was developed for the FreeBSD Project by Marshall
   8  * Kirk McKusick and Network Associates Laboratories, the Security
   9  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
  10  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
  11  * research program
  12  *
  13  * Redistribution and use in source and binary forms, with or without
  14  * modification, are permitted provided that the following conditions
  15  * are met:
  16  * 1. Redistributions of source code must retain the above copyright
  17  *    notice, this list of conditions and the following disclaimer.
  18  * 2. Redistributions in binary form must reproduce the above copyright
  19  *    notice, this list of conditions and the following disclaimer in the
  20  *    documentation and/or other materials provided with the distribution.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  * Copyright (c) 1982, 1986, 1989, 1993
  35  *      The Regents of the University of California.  All rights reserved.
  36  *
  37  * Redistribution and use in source and binary forms, with or without
  38  * modification, are permitted provided that the following conditions
  39  * are met:
  40  * 1. Redistributions of source code must retain the above copyright
  41  *    notice, this list of conditions and the following disclaimer.
  42  * 2. Redistributions in binary form must reproduce the above copyright
  43  *    notice, this list of conditions and the following disclaimer in the
  44  *    documentation and/or other materials provided with the distribution.
  45  * 3. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      from: @(#)ufs_readwrite.c       8.11 (Berkeley) 5/8/95
  62  * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
  63  *      @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95
  64  */
  65
  66 #include <sys/cdefs.h>
  67 __FBSDID("$FreeBSD$");
  68
  69 #include <sys/param.h>
  70 #include <sys/bio.h>
  71 #include <sys/systm.h>
  72 #include <sys/buf.h>
  73 #include <sys/conf.h>
  74 #include <sys/extattr.h>
  75 #include <sys/kernel.h>
  76 #include <sys/limits.h>
  77 #include <sys/malloc.h>
  78 #include <sys/mount.h>
  79 #include <sys/priv.h>
  80 #include <sys/rwlock.h>
  81 #include <sys/stat.h>
  82 #include <sys/sysctl.h>
  83 #include <sys/vmmeter.h>
  84 #include <sys/vnode.h>
  85
  86 #include <vm/vm.h>
  87 #include <vm/vm_param.h>
  88 #include <vm/vm_extern.h>
  89 #include <vm/vm_object.h>
  90 #include <vm/vm_page.h>
  91 #include <vm/vm_pager.h>
  92 #include <vm/vnode_pager.h>
  93
  94 #include <ufs/ufs/extattr.h>
  95 #include <ufs/ufs/quota.h>
  96 #include <ufs/ufs/inode.h>
  97 #include <ufs/ufs/ufs_extern.h>
  98 #include <ufs/ufs/ufsmount.h>
  99
 100 #include <ufs/ffs/fs.h>
 101 #include <ufs/ffs/ffs_extern.h>
 102 #include "opt_directio.h"
 103 #include "opt_ffs.h"
 104
 105 #define ALIGNED_TO(ptr, s)      \
 106         (((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0)
 107
 108 #ifdef DIRECTIO
 109 extern int      ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
 110 #endif
 111 static vop_fdatasync_t  ffs_fdatasync;
 112 static vop_fsync_t      ffs_fsync;
 113 static vop_getpages_t   ffs_getpages;
 114 static vop_getpages_async_t     ffs_getpages_async;
 115 static vop_lock1_t      ffs_lock;
 116 static vop_read_t       ffs_read;
 117 static vop_write_t      ffs_write;
 118 static int      ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
 119 static int      ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
 120                     struct ucred *cred);
 121 static vop_strategy_t   ffsext_strategy;
 122 static vop_closeextattr_t       ffs_closeextattr;
 123 static vop_deleteextattr_t      ffs_deleteextattr;
 124 static vop_getextattr_t ffs_getextattr;
 125 static vop_listextattr_t        ffs_listextattr;
 126 static vop_openextattr_t        ffs_openextattr;
 127 static vop_setextattr_t ffs_setextattr;
 128 static vop_vptofh_t     ffs_vptofh;
 129
 130 /* Global vfs data structures for ufs. */
 131 struct vop_vector ffs_vnodeops1 = {
 132         .vop_default =          &ufs_vnodeops,
 133         .vop_fsync =            ffs_fsync,
 134         .vop_fdatasync =        ffs_fdatasync,
 135         .vop_getpages =         ffs_getpages,
 136         .vop_getpages_async =   ffs_getpages_async,
 137         .vop_lock1 =            ffs_lock,
 138         .vop_read =             ffs_read,
 139         .vop_reallocblks =      ffs_reallocblks,
 140         .vop_write =            ffs_write,
 141         .vop_vptofh =           ffs_vptofh,
 142 };
 143
 144 struct vop_vector ffs_fifoops1 = {
 145         .vop_default =          &ufs_fifoops,
 146         .vop_fsync =            ffs_fsync,
 147         .vop_fdatasync =        ffs_fdatasync,
 148         .vop_lock1 =            ffs_lock,
 149         .vop_vptofh =           ffs_vptofh,
 150 };
 151
 152 /* Global vfs data structures for ufs. */
 153 struct vop_vector ffs_vnodeops2 = {
 154         .vop_default =          &ufs_vnodeops,
 155         .vop_fsync =            ffs_fsync,
 156         .vop_fdatasync =        ffs_fdatasync,
 157         .vop_getpages =         ffs_getpages,
 158         .vop_getpages_async =   ffs_getpages_async,
 159         .vop_lock1 =            ffs_lock,
 160         .vop_read =             ffs_read,
 161         .vop_reallocblks =      ffs_reallocblks,
 162         .vop_write =            ffs_write,
 163         .vop_closeextattr =     ffs_closeextattr,
 164         .vop_deleteextattr =    ffs_deleteextattr,
 165         .vop_getextattr =       ffs_getextattr,
 166         .vop_listextattr =      ffs_listextattr,
 167         .vop_openextattr =      ffs_openextattr,
 168         .vop_setextattr =       ffs_setextattr,
 169         .vop_vptofh =           ffs_vptofh,
 170 };
 171
 172 struct vop_vector ffs_fifoops2 = {
 173         .vop_default =          &ufs_fifoops,
 174         .vop_fsync =            ffs_fsync,
 175         .vop_fdatasync =        ffs_fdatasync,
 176         .vop_lock1 =            ffs_lock,
 177         .vop_reallocblks =      ffs_reallocblks,
 178         .vop_strategy =         ffsext_strategy,
 179         .vop_closeextattr =     ffs_closeextattr,
 180         .vop_deleteextattr =    ffs_deleteextattr,
 181         .vop_getextattr =       ffs_getextattr,
 182         .vop_listextattr =      ffs_listextattr,
 183         .vop_openextattr =      ffs_openextattr,
 184         .vop_setextattr =       ffs_setextattr,
 185         .vop_vptofh =           ffs_vptofh,
 186 };
 187
 188 /*
 189  * Synch an open file.
 190  */
 191 /* ARGSUSED */
 192 static int
 193 ffs_fsync(struct vop_fsync_args *ap)
 194 {
 195         struct vnode *vp;
 196         struct bufobj *bo;
 197         int error;
 198
 199         vp = ap->a_vp;
 200         bo = &vp->v_bufobj;
 201 retry:
 202         error = ffs_syncvnode(vp, ap->a_waitfor, 0);
 203         if (error)
 204                 return (error);
 205         if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
 206                 error = softdep_fsync(vp);
 207                 if (error)
 208                         return (error);
 209
 210                 /*
 211                  * The softdep_fsync() function may drop vp lock,
 212                  * allowing for dirty buffers to reappear on the
 213                  * bo_dirty list. Recheck and resync as needed.
 214                  */
 215                 BO_LOCK(bo);
 216                 if ((vp->v_type == VREG || vp->v_type == VDIR) &&
 217                     (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
 218                         BO_UNLOCK(bo);
 219                         goto retry;
 220                 }
 221                 BO_UNLOCK(bo);
 222         }
 223         return (0);
 224 }
 225
 226 int
 227 ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
 228 {
 229         struct inode *ip;
 230         struct bufobj *bo;
 231         struct buf *bp, *nbp;
 232         ufs_lbn_t lbn;
 233         int error, passes;
 234         bool still_dirty, wait;
 235
 236         ip = VTOI(vp);
 237         ip->i_flag &= ~IN_NEEDSYNC;
 238         bo = &vp->v_bufobj;
 239
 240         /*
 241          * When doing MNT_WAIT we must first flush all dependencies
 242          * on the inode.
 243          */
 244         if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
 245             (error = softdep_sync_metadata(vp)) != 0)
 246                 return (error);
 247
 248         /*
 249          * Flush all dirty buffers associated with a vnode.
 250          */
 251         error = 0;
 252         passes = 0;
 253         wait = false;   /* Always do an async pass first. */
 254         lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
 255         BO_LOCK(bo);
 256 loop:
 257         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
 258                 bp->b_vflags &= ~BV_SCANNED;
 259         TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 260                 /*
 261                  * Reasons to skip this buffer: it has already been considered
 262                  * on this pass, the buffer has dependencies that will cause
 263                  * it to be redirtied and it has not already been deferred,
 264                  * or it is already being written.
 265                  */
 266                 if ((bp->b_vflags & BV_SCANNED) != 0)
 267                         continue;
 268                 bp->b_vflags |= BV_SCANNED;
 269                 /*
 270                  * Flush indirects in order, if requested.
 271                  *
 272                  * Note that if only datasync is requested, we can
 273                  * skip indirect blocks when softupdates are not
 274                  * active.  Otherwise we must flush them with data,
 275                  * since dependencies prevent data block writes.
 276                  */
 277                 if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR &&
 278                     (lbn_level(bp->b_lblkno) >= passes ||
 279                     ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
 280                         continue;
 281                 if (bp->b_lblkno > lbn)
 282                         panic("ffs_syncvnode: syncing truncated data.");
 283                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
 284                         BO_UNLOCK(bo);
 285                 } else if (wait) {
 286                         if (BUF_LOCK(bp,
 287                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 288                             BO_LOCKPTR(bo)) != 0) {
 289                                 bp->b_vflags &= ~BV_SCANNED;
 290                                 goto next;
 291                         }
 292                 } else
 293                         continue;
 294                 if ((bp->b_flags & B_DELWRI) == 0)
 295                         panic("ffs_fsync: not dirty");
 296                 /*
 297                  * Check for dependencies and potentially complete them.
 298                  */
 299                 if (!LIST_EMPTY(&bp->b_dep) &&
 300                     (error = softdep_sync_buf(vp, bp,
 301                     wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
 302                         /* I/O error. */
 303                         if (error != EBUSY) {
 304                                 BUF_UNLOCK(bp);
 305                                 return (error);
 306                         }
 307                         /* If we deferred once, don't defer again. */
 308                         if ((bp->b_flags & B_DEFERRED) == 0) {
 309                                 bp->b_flags |= B_DEFERRED;
 310                                 BUF_UNLOCK(bp);
 311                                 goto next;
 312                         }
 313                 }
 314                 if (wait) {
 315                         bremfree(bp);
 316                         if ((error = bwrite(bp)) != 0)
 317                                 return (error);
 318                 } else if ((bp->b_flags & B_CLUSTEROK)) {
 319                         (void) vfs_bio_awrite(bp);
 320                 } else {
 321                         bremfree(bp);
 322                         (void) bawrite(bp);
 323                 }
 324 next:
 325                 /*
 326                  * Since we may have slept during the I/O, we need
 327                  * to start from a known point.
 328                  */
 329                 BO_LOCK(bo);
 330                 nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
 331         }
 332         if (waitfor != MNT_WAIT) {
 333                 BO_UNLOCK(bo);
 334                 if ((flags & NO_INO_UPDT) != 0)
 335                         return (0);
 336                 else
 337                         return (ffs_update(vp, 0));
 338         }
 339         /* Drain IO to see if we're done. */
 340         bufobj_wwait(bo, 0, 0);
 341         /*
 342          * Block devices associated with filesystems may have new I/O
 343          * requests posted for them even if the vnode is locked, so no
 344          * amount of trying will get them clean.  We make several passes
 345          * as a best effort.
 346          *
 347          * Regular files may need multiple passes to flush all dependency
 348          * work as it is possible that we must write once per indirect
 349          * level, once for the leaf, and once for the inode and each of
 350          * these will be done with one sync and one async pass.
 351          */
 352         if (bo->bo_dirty.bv_cnt > 0) {
 353                 if ((flags & DATA_ONLY) == 0) {
 354                         still_dirty = true;
 355                 } else {
 356                         /*
 357                          * For data-only sync, dirty indirect buffers
 358                          * are ignored.
 359                          */
 360                         still_dirty = false;
 361                         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
 362                                 if (bp->b_lblkno > -UFS_NDADDR) {
 363                                         still_dirty = true;
 364                                         break;
 365                                 }
 366                         }
 367                 }
 368
 369                 if (still_dirty) {
 370                         /* Write the inode after sync passes to flush deps. */
 371                         if (wait && DOINGSOFTDEP(vp) &&
 372                             (flags & NO_INO_UPDT) == 0) {
 373                                 BO_UNLOCK(bo);
 374                                 ffs_update(vp, 1);
 375                                 BO_LOCK(bo);
 376                         }
 377                         /* switch between sync/async. */
 378                         wait = !wait;
 379                         if (wait || ++passes < UFS_NIADDR + 2)
 380                                 goto loop;
 381                 }
 382         }
 383         BO_UNLOCK(bo);
 384         error = 0;
 385         if ((flags & DATA_ONLY) == 0) {
 386                 if ((flags & NO_INO_UPDT) == 0)
 387                         error = ffs_update(vp, 1);
 388                 if (DOINGSUJ(vp))
 389                         softdep_journal_fsync(VTOI(vp));
 390         }
 391         return (error);
 392 }
 393
 394 static int
 395 ffs_fdatasync(struct vop_fdatasync_args *ap)
 396 {
 397
 398         return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
 399 }
 400
 401 static int
 402 ffs_lock(ap)
 403         struct vop_lock1_args /* {
 404                 struct vnode *a_vp;
 405                 int a_flags;
 406                 struct thread *a_td;
 407                 char *file;
 408                 int line;
 409         } */ *ap;
 410 {
 411 #ifndef NO_FFS_SNAPSHOT
 412         struct vnode *vp;
 413         int flags;
 414         struct lock *lkp;
 415         int result;
 416
 417         switch (ap->a_flags & LK_TYPE_MASK) {
 418         case LK_SHARED:
 419         case LK_UPGRADE:
 420         case LK_EXCLUSIVE:
 421                 vp = ap->a_vp;
 422                 flags = ap->a_flags;
 423                 for (;;) {
 424 #ifdef DEBUG_VFS_LOCKS
 425                         KASSERT(vp->v_holdcnt != 0,
 426                             ("ffs_lock %p: zero hold count", vp));
 427 #endif
 428                         lkp = vp->v_vnlock;
 429                         result = _lockmgr_args(lkp, flags, VI_MTX(vp),
 430                             LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
 431                             ap->a_file, ap->a_line);
 432                         if (lkp == vp->v_vnlock || result != 0)
 433                                 break;
 434                         /*
 435                          * Apparent success, except that the vnode
 436                          * mutated between snapshot file vnode and
 437                          * regular file vnode while this process
 438                          * slept.  The lock currently held is not the
 439                          * right lock.  Release it, and try to get the
 440                          * new lock.
 441                          */
 442                         (void) _lockmgr_args(lkp, LK_RELEASE, NULL,
 443                             LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
 444                             ap->a_file, ap->a_line);
 445                         if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
 446                             (LK_INTERLOCK | LK_NOWAIT))
 447                                 return (EBUSY);
 448                         if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
 449                                 flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
 450                         flags &= ~LK_INTERLOCK;
 451                 }
 452                 break;
 453         default:
 454                 result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
 455         }
 456         return (result);
 457 #else
 458         return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
 459 #endif
 460 }
 461
 462 static int
 463 ffs_read_hole(struct uio *uio, long xfersize, long *size)
 464 {
 465         ssize_t saved_resid, tlen;
 466         int error;
 467
 468         while (xfersize > 0) {
 469                 tlen = min(xfersize, ZERO_REGION_SIZE);
 470                 saved_resid = uio->uio_resid;
 471                 error = vn_io_fault_uiomove(__DECONST(void *, zero_region),
 472                     tlen, uio);
 473                 if (error != 0)
 474                         return (error);
 475                 tlen = saved_resid - uio->uio_resid;
 476                 xfersize -= tlen;
 477                 *size -= tlen;
 478         }
 479         return (0);
 480 }
 481
 482 /*
 483  * Vnode op for reading.
 484  */
 485 static int
 486 ffs_read(ap)
 487         struct vop_read_args /* {
 488                 struct vnode *a_vp;
 489                 struct uio *a_uio;
 490                 int a_ioflag;
 491                 struct ucred *a_cred;
 492         } */ *ap;
 493 {
 494         struct vnode *vp;
 495         struct inode *ip;
 496         struct uio *uio;
 497         struct fs *fs;
 498         struct buf *bp;
 499         ufs_lbn_t lbn, nextlbn;
 500         off_t bytesinfile;
 501         long size, xfersize, blkoffset;
 502         ssize_t orig_resid;
 503         int bflag, error, ioflag, seqcount;
 504
 505         vp = ap->a_vp;
 506         uio = ap->a_uio;
 507         ioflag = ap->a_ioflag;
 508         if (ap->a_ioflag & IO_EXT)
 509 #ifdef notyet
 510                 return (ffs_extread(vp, uio, ioflag));
 511 #else
 512                 panic("ffs_read+IO_EXT");
 513 #endif
 514 #ifdef DIRECTIO
 515         if ((ioflag & IO_DIRECT) != 0) {
 516                 int workdone;
 517
 518                 error = ffs_rawread(vp, uio, &workdone);
 519                 if (error != 0 || workdone != 0)
 520                         return error;
 521         }
 522 #endif
 523
 524         seqcount = ap->a_ioflag >> IO_SEQSHIFT;
 525         ip = VTOI(vp);
 526
 527 #ifdef INVARIANTS
 528         if (uio->uio_rw != UIO_READ)
 529                 panic("ffs_read: mode");
 530
 531         if (vp->v_type == VLNK) {
 532                 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
 533                         panic("ffs_read: short symlink");
 534         } else if (vp->v_type != VREG && vp->v_type != VDIR)
 535                 panic("ffs_read: type %d",  vp->v_type);
 536 #endif
 537         orig_resid = uio->uio_resid;
 538         KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
 539         if (orig_resid == 0)
 540                 return (0);
 541         KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
 542         fs = ITOFS(ip);
 543         if (uio->uio_offset < ip->i_size &&
 544             uio->uio_offset >= fs->fs_maxfilesize)
 545                 return (EOVERFLOW);
 546
 547         bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE);
 548         for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
 549                 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
 550                         break;
 551                 lbn = lblkno(fs, uio->uio_offset);
 552                 nextlbn = lbn + 1;
 553
 554                 /*
 555                  * size of buffer.  The buffer representing the
 556                  * end of the file is rounded up to the size of
 557                  * the block type ( fragment or full block,
 558                  * depending ).
 559                  */
 560                 size = blksize(fs, ip, lbn);
 561                 blkoffset = blkoff(fs, uio->uio_offset);
 562
 563                 /*
 564                  * The amount we want to transfer in this iteration is
 565                  * one FS block less the amount of the data before
 566                  * our startpoint (duh!)
 567                  */
 568                 xfersize = fs->fs_bsize - blkoffset;
 569
 570                 /*
 571                  * But if we actually want less than the block,
 572                  * or the file doesn't have a whole block more of data,
 573                  * then use the lesser number.
 574                  */
 575                 if (uio->uio_resid < xfersize)
 576                         xfersize = uio->uio_resid;
 577                 if (bytesinfile < xfersize)
 578                         xfersize = bytesinfile;
 579
 580                 if (lblktosize(fs, nextlbn) >= ip->i_size) {
 581                         /*
 582                          * Don't do readahead if this is the end of the file.
 583                          */
 584                         error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
 585                 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 586                         /*
 587                          * Otherwise if we are allowed to cluster,
 588                          * grab as much as we can.
 589                          *
 590                          * XXX  This may not be a win if we are not
 591                          * doing sequential access.
 592                          */
 593                         error = cluster_read(vp, ip->i_size, lbn,
 594                             size, NOCRED, blkoffset + uio->uio_resid,
 595                             seqcount, bflag, &bp);
 596                 } else if (seqcount > 1) {
 597                         /*
 598                          * If we are NOT allowed to cluster, then
 599                          * if we appear to be acting sequentially,
 600                          * fire off a request for a readahead
 601                          * as well as a read. Note that the 4th and 5th
 602                          * arguments point to arrays of the size specified in
 603                          * the 6th argument.
 604                          */
 605                         u_int nextsize = blksize(fs, ip, nextlbn);
 606                         error = breadn_flags(vp, lbn, size, &nextlbn,
 607                             &nextsize, 1, NOCRED, bflag, NULL, &bp);
 608                 } else {
 609                         /*
 610                          * Failing all of the above, just read what the
 611                          * user asked for. Interestingly, the same as
 612                          * the first option above.
 613                          */
 614                         error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
 615                 }
 616                 if (error == EJUSTRETURN) {
 617                         error = ffs_read_hole(uio, xfersize, &size);
 618                         if (error == 0)
 619                                 continue;
 620                 }
 621                 if (error != 0) {
 622                         brelse(bp);
 623                         bp = NULL;
 624                         break;
 625                 }
 626
 627                 /*
 628                  * We should only get non-zero b_resid when an I/O error
 629                  * has occurred, which should cause us to break above.
 630                  * However, if the short read did not cause an error,
 631                  * then we want to ensure that we do not uiomove bad
 632                  * or uninitialized data.
 633                  */
 634                 size -= bp->b_resid;
 635                 if (size < xfersize) {
 636                         if (size == 0)
 637                                 break;
 638                         xfersize = size;
 639                 }
 640
 641                 if (buf_mapped(bp)) {
 642                         error = vn_io_fault_uiomove((char *)bp->b_data +
 643                             blkoffset, (int)xfersize, uio);
 644                 } else {
 645                         error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
 646                             (int)xfersize, uio);
 647                 }
 648                 if (error)
 649                         break;
 650
 651                 vfs_bio_brelse(bp, ioflag);
 652         }
 653
 654         /*
 655          * This can only happen in the case of an error
 656          * because the loop above resets bp to NULL on each iteration
 657          * and on normal completion has not set a new value into it.
 658          * so it must have come from a 'break' statement
 659          */
 660         if (bp != NULL)
 661                 vfs_bio_brelse(bp, ioflag);
 662
 663         if ((error == 0 || uio->uio_resid != orig_resid) &&
 664             (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 &&
 665             (ip->i_flag & IN_ACCESS) == 0) {
 666                 VI_LOCK(vp);
 667                 ip->i_flag |= IN_ACCESS;
 668                 VI_UNLOCK(vp);
 669         }
 670         return (error);
 671 }
 672
 673 /*
 674  * Vnode op for writing.
 675  */
 676 static int
 677 ffs_write(ap)
 678         struct vop_write_args /* {
 679                 struct vnode *a_vp;
 680                 struct uio *a_uio;
 681                 int a_ioflag;
 682                 struct ucred *a_cred;
 683         } */ *ap;
 684 {
 685         struct vnode *vp;
 686         struct uio *uio;
 687         struct inode *ip;
 688         struct fs *fs;
 689         struct buf *bp;
 690         ufs_lbn_t lbn;
 691         off_t osize;
 692         ssize_t resid;
 693         int seqcount;
 694         int blkoffset, error, flags, ioflag, size, xfersize;
 695
 696         vp = ap->a_vp;
 697         uio = ap->a_uio;
 698         ioflag = ap->a_ioflag;
 699         if (ap->a_ioflag & IO_EXT)
 700 #ifdef notyet
 701                 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
 702 #else
 703                 panic("ffs_write+IO_EXT");
 704 #endif
 705
 706         seqcount = ap->a_ioflag >> IO_SEQSHIFT;
 707         ip = VTOI(vp);
 708
 709 #ifdef INVARIANTS
 710         if (uio->uio_rw != UIO_WRITE)
 711                 panic("ffs_write: mode");
 712 #endif
 713
 714         switch (vp->v_type) {
 715         case VREG:
 716                 if (ioflag & IO_APPEND)
 717                         uio->uio_offset = ip->i_size;
 718                 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
 719                         return (EPERM);
 720                 /* FALLTHROUGH */
 721         case VLNK:
 722                 break;
 723         case VDIR:
 724                 panic("ffs_write: dir write");
 725                 break;
 726         default:
 727                 panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
 728                         (int)uio->uio_offset,
 729                         (int)uio->uio_resid
 730                 );
 731         }
 732
 733         KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
 734         KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
 735         fs = ITOFS(ip);
 736         if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
 737                 return (EFBIG);
 738         /*
 739          * Maybe this should be above the vnode op call, but so long as
 740          * file servers have no limits, I don't think it matters.
 741          */
 742         if (vn_rlimit_fsize(vp, uio, uio->uio_td))
 743                 return (EFBIG);
 744
 745         resid = uio->uio_resid;
 746         osize = ip->i_size;
 747         if (seqcount > BA_SEQMAX)
 748                 flags = BA_SEQMAX << BA_SEQSHIFT;
 749         else
 750                 flags = seqcount << BA_SEQSHIFT;
 751         if (ioflag & IO_SYNC)
 752                 flags |= IO_SYNC;
 753         flags |= BA_UNMAPPED;
 754
 755         for (error = 0; uio->uio_resid > 0;) {
 756                 lbn = lblkno(fs, uio->uio_offset);
 757                 blkoffset = blkoff(fs, uio->uio_offset);
 758                 xfersize = fs->fs_bsize - blkoffset;
 759                 if (uio->uio_resid < xfersize)
 760                         xfersize = uio->uio_resid;
 761                 if (uio->uio_offset + xfersize > ip->i_size)
 762                         vnode_pager_setsize(vp, uio->uio_offset + xfersize);
 763
 764                 /*
 765                  * We must perform a read-before-write if the transfer size
 766                  * does not cover the entire buffer.
 767                  */
 768                 if (fs->fs_bsize > xfersize)
 769                         flags |= BA_CLRBUF;
 770                 else
 771                         flags &= ~BA_CLRBUF;
 772 /* XXX is uio->uio_offset the right thing here? */
 773                 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
 774                     ap->a_cred, flags, &bp);
 775                 if (error != 0) {
 776                         vnode_pager_setsize(vp, ip->i_size);
 777                         break;
 778                 }
 779                 if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
 780                         bp->b_flags |= B_NOCACHE;
 781
 782                 if (uio->uio_offset + xfersize > ip->i_size) {
 783                         ip->i_size = uio->uio_offset + xfersize;
 784                         DIP_SET(ip, i_size, ip->i_size);
 785                 }
 786
 787                 size = blksize(fs, ip, lbn) - bp->b_resid;
 788                 if (size < xfersize)
 789                         xfersize = size;
 790
 791                 if (buf_mapped(bp)) {
 792                         error = vn_io_fault_uiomove((char *)bp->b_data +
 793                             blkoffset, (int)xfersize, uio);
 794                 } else {
 795                         error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
 796                             (int)xfersize, uio);
 797                 }
 798                 /*
 799                  * If the buffer is not already filled and we encounter an
 800                  * error while trying to fill it, we have to clear out any
 801                  * garbage data from the pages instantiated for the buffer.
 802                  * If we do not, a failed uiomove() during a write can leave
 803                  * the prior contents of the pages exposed to a userland mmap.
 804                  *
 805                  * Note that we need only clear buffers with a transfer size
 806                  * equal to the block size because buffers with a shorter
 807                  * transfer size were cleared above by the call to UFS_BALLOC()
 808                  * with the BA_CLRBUF flag set.
 809                  *
 810                  * If the source region for uiomove identically mmaps the
 811                  * buffer, uiomove() performed the NOP copy, and the buffer
 812                  * content remains valid because the page fault handler
 813                  * validated the pages.
 814                  */
 815                 if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
 816                     fs->fs_bsize == xfersize)
 817                         vfs_bio_clrbuf(bp);
 818
 819                 vfs_bio_set_flags(bp, ioflag);
 820
 821                 /*
 822                  * If IO_SYNC each buffer is written synchronously.  Otherwise
 823                  * if we have a severe page deficiency write the buffer
 824                  * asynchronously.  Otherwise try to cluster, and if that
 825                  * doesn't do it then either do an async write (if O_DIRECT),
 826                  * or a delayed write (if not).
 827                  */
 828                 if (ioflag & IO_SYNC) {
 829                         (void)bwrite(bp);
 830                 } else if (vm_page_count_severe() ||
 831                             buf_dirty_count_severe() ||
 832                             (ioflag & IO_ASYNC)) {
 833                         bp->b_flags |= B_CLUSTEROK;
 834                         bawrite(bp);
 835                 } else if (xfersize + blkoffset == fs->fs_bsize) {
 836                         if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
 837                                 bp->b_flags |= B_CLUSTEROK;
 838                                 cluster_write(vp, bp, ip->i_size, seqcount,
 839                                     GB_UNMAPPED);
 840                         } else {
 841                                 bawrite(bp);
 842                         }
 843                 } else if (ioflag & IO_DIRECT) {
 844                         bp->b_flags |= B_CLUSTEROK;
 845                         bawrite(bp);
 846                 } else {
 847                         bp->b_flags |= B_CLUSTEROK;
 848                         bdwrite(bp);
 849                 }
 850                 if (error || xfersize == 0)
 851                         break;
 852                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
 853         }
 854         /*
 855          * If we successfully wrote any data, and we are not the superuser
 856          * we clear the setuid and setgid bits as a precaution against
 857          * tampering.
 858          */
 859         if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
 860             ap->a_cred) {
 861                 if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) {
 862                         ip->i_mode &= ~(ISUID | ISGID);
 863                         DIP_SET(ip, i_mode, ip->i_mode);
 864                 }
 865         }
 866         if (error) {
 867                 if (ioflag & IO_UNIT) {
 868                         (void)ffs_truncate(vp, osize,
 869                             IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
 870                         uio->uio_offset -= resid - uio->uio_resid;
 871                         uio->uio_resid = resid;
 872                 }
 873         } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
 874                 error = ffs_update(vp, 1);
 875         return (error);
 876 }
 877
 878 /*
 879  * Extended attribute area reading.
 880  */
 881 static int
 882 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
 883 {
 884         struct inode *ip;
 885         struct ufs2_dinode *dp;
 886         struct fs *fs;
 887         struct buf *bp;
 888         ufs_lbn_t lbn, nextlbn;
 889         off_t bytesinfile;
 890         long size, xfersize, blkoffset;
 891         ssize_t orig_resid;
 892         int error;
 893
 894         ip = VTOI(vp);
 895         fs = ITOFS(ip);
 896         dp = ip->i_din2;
 897
 898 #ifdef INVARIANTS
 899         if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
 900                 panic("ffs_extread: mode");
 901
 902 #endif
 903         orig_resid = uio->uio_resid;
 904         KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
 905         if (orig_resid == 0)
 906                 return (0);
 907         KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));
 908
 909         for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
 910                 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
 911                         break;
 912                 lbn = lblkno(fs, uio->uio_offset);
 913                 nextlbn = lbn + 1;
 914
 915                 /*
 916                  * size of buffer.  The buffer representing the
 917                  * end of the file is rounded up to the size of
 918                  * the block type ( fragment or full block,
 919                  * depending ).
 920                  */
 921                 size = sblksize(fs, dp->di_extsize, lbn);
 922                 blkoffset = blkoff(fs, uio->uio_offset);
 923
 924                 /*
 925                  * The amount we want to transfer in this iteration is
 926                  * one FS block less the amount of the data before
 927                  * our startpoint (duh!)
 928                  */
 929                 xfersize = fs->fs_bsize - blkoffset;
 930
 931                 /*
 932                  * But if we actually want less than the block,
 933                  * or the file doesn't have a whole block more of data,
 934                  * then use the lesser number.
 935                  */
 936                 if (uio->uio_resid < xfersize)
 937                         xfersize = uio->uio_resid;
 938                 if (bytesinfile < xfersize)
 939                         xfersize = bytesinfile;
 940
 941                 if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
 942                         /*
 943                          * Don't do readahead if this is the end of the info.
 944                          */
 945                         error = bread(vp, -1 - lbn, size, NOCRED, &bp);
 946                 } else {
 947                         /*
 948                          * If we have a second block, then
 949                          * fire off a request for a readahead
 950                          * as well as a read. Note that the 4th and 5th
 951                          * arguments point to arrays of the size specified in
 952                          * the 6th argument.
 953                          */
 954                         u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
 955
 956                         nextlbn = -1 - nextlbn;
 957                         error = breadn(vp, -1 - lbn,
 958                             size, &nextlbn, &nextsize, 1, NOCRED, &bp);
 959                 }
 960                 if (error) {
 961                         brelse(bp);
 962                         bp = NULL;
 963                         break;
 964                 }
 965
 966                 /*
 967                  * We should only get non-zero b_resid when an I/O error
 968                  * has occurred, which should cause us to break above.
 969                  * However, if the short read did not cause an error,
 970                  * then we want to ensure that we do not uiomove bad
 971                  * or uninitialized data.
 972                  */
 973                 size -= bp->b_resid;
 974                 if (size < xfersize) {
 975                         if (size == 0)
 976                                 break;
 977                         xfersize = size;
 978                 }
 979
 980                 error = uiomove((char *)bp->b_data + blkoffset,
 981                                         (int)xfersize, uio);
 982                 if (error)
 983                         break;
 984                 vfs_bio_brelse(bp, ioflag);
 985         }
 986
 987         /*
 988          * This can only happen in the case of an error
 989          * because the loop above resets bp to NULL on each iteration
 990          * and on normal completion has not set a new value into it.
 991          * so it must have come from a 'break' statement
 992          */
 993         if (bp != NULL)
 994                 vfs_bio_brelse(bp, ioflag);
 995         return (error);
 996 }
 997
 998 /*
 999  * Extended attribute area writing.
1000  */
1001 static int
1002 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1003 {
1004         struct inode *ip;
1005         struct ufs2_dinode *dp;
1006         struct fs *fs;
1007         struct buf *bp;
1008         ufs_lbn_t lbn;
1009         off_t osize;
1010         ssize_t resid;
1011         int blkoffset, error, flags, size, xfersize;
1012
1013         ip = VTOI(vp);
1014         fs = ITOFS(ip);
1015         dp = ip->i_din2;
1016
1017 #ifdef INVARIANTS
1018         if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1019                 panic("ffs_extwrite: mode");
1020 #endif
1021
1022         if (ioflag & IO_APPEND)
1023                 uio->uio_offset = dp->di_extsize;
1024         KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
1025         KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
1026         if ((uoff_t)uio->uio_offset + uio->uio_resid >
1027             UFS_NXADDR * fs->fs_bsize)
1028                 return (EFBIG);
1029
1030         resid = uio->uio_resid;
1031         osize = dp->di_extsize;
1032         flags = IO_EXT;
1033         if (ioflag & IO_SYNC)
1034                 flags |= IO_SYNC;
1035
1036         for (error = 0; uio->uio_resid > 0;) {
1037                 lbn = lblkno(fs, uio->uio_offset);
1038                 blkoffset = blkoff(fs, uio->uio_offset);
1039                 xfersize = fs->fs_bsize - blkoffset;
1040                 if (uio->uio_resid < xfersize)
1041                         xfersize = uio->uio_resid;
1042
1043                 /*
1044                  * We must perform a read-before-write if the transfer size
1045                  * does not cover the entire buffer.
1046                  */
1047                 if (fs->fs_bsize > xfersize)
1048                         flags |= BA_CLRBUF;
1049                 else
1050                         flags &= ~BA_CLRBUF;
1051                 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1052                     ucred, flags, &bp);
1053                 if (error != 0)
1054                         break;
1055                 /*
1056                  * If the buffer is not valid we have to clear out any
1057                  * garbage data from the pages instantiated for the buffer.
1058                  * If we do not, a failed uiomove() during a write can leave
1059                  * the prior contents of the pages exposed to a userland
1060                  * mmap().  XXX deal with uiomove() errors a better way.
1061                  */
1062                 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1063                         vfs_bio_clrbuf(bp);
1064
1065                 if (uio->uio_offset + xfersize > dp->di_extsize)
1066                         dp->di_extsize = uio->uio_offset + xfersize;
1067
1068                 size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1069                 if (size < xfersize)
1070                         xfersize = size;
1071
1072                 error =
1073                     uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1074
1075                 vfs_bio_set_flags(bp, ioflag);
1076
1077                 /*
1078                  * If IO_SYNC each buffer is written synchronously.  Otherwise
1079                  * if we have a severe page deficiency write the buffer
1080                  * asynchronously.  Otherwise try to cluster, and if that
1081                  * doesn't do it then either do an async write (if O_DIRECT),
1082                  * or a delayed write (if not).
1083                  */
1084                 if (ioflag & IO_SYNC) {
1085                         (void)bwrite(bp);
1086                 } else if (vm_page_count_severe() ||
1087                             buf_dirty_count_severe() ||
1088                             xfersize + blkoffset == fs->fs_bsize ||
1089                             (ioflag & (IO_ASYNC | IO_DIRECT)))
1090                         bawrite(bp);
1091                 else
1092                         bdwrite(bp);
1093                 if (error || xfersize == 0)
1094                         break;
1095                 ip->i_flag |= IN_CHANGE;
1096         }
1097         /*
1098          * If we successfully wrote any data, and we are not the superuser
1099          * we clear the setuid and setgid bits as a precaution against
1100          * tampering.
1101          */
1102         if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
1103                 if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) {
1104                         ip->i_mode &= ~(ISUID | ISGID);
1105                         dp->di_mode = ip->i_mode;
1106                 }
1107         }
1108         if (error) {
1109                 if (ioflag & IO_UNIT) {
1110                         (void)ffs_truncate(vp, osize,
1111                             IO_EXT | (ioflag&IO_SYNC), ucred);
1112                         uio->uio_offset -= resid - uio->uio_resid;
1113                         uio->uio_resid = resid;
1114                 }
1115         } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1116                 error = ffs_update(vp, 1);
1117         return (error);
1118 }
1119
1120
1121 /*
1122  * Vnode operating to retrieve a named extended attribute.
1123  *
1124  * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1125  * the length of the EA, and possibly the pointer to the entry and to the data.
1126  */
1127 static int
1128 ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
1129     struct extattr **eapp, u_char **eac)
1130 {
1131         struct extattr *eap, *eaend;
1132         size_t nlen;
1133
1134         nlen = strlen(name);
1135         KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned"));
1136         eap = (struct extattr *)ptr;
1137         eaend = (struct extattr *)(ptr + length);
1138         for (; eap < eaend; eap = EXTATTR_NEXT(eap)) {
1139                 /* make sure this entry is complete */
1140                 if (EXTATTR_NEXT(eap) > eaend)
1141                         break;
1142                 if (eap->ea_namespace != nspace || eap->ea_namelength != nlen
1143                     || memcmp(eap->ea_name, name, nlen) != 0)
1144                         continue;
1145                 if (eapp != NULL)
1146                         *eapp = eap;
1147                 if (eac != NULL)
1148                         *eac = EXTATTR_CONTENT(eap);
1149                 return (EXTATTR_CONTENT_SIZE(eap));
1150         }
1151         return (-1);
1152 }
1153
1154 static int
1155 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
1156 {
1157         struct inode *ip;
1158         struct ufs2_dinode *dp;
1159         struct fs *fs;
1160         struct uio luio;
1161         struct iovec liovec;
1162         u_int easize;
1163         int error;
1164         u_char *eae;
1165
1166         ip = VTOI(vp);
1167         fs = ITOFS(ip);
1168         dp = ip->i_din2;
1169         easize = dp->di_extsize;
1170         if ((uoff_t)easize + extra > UFS_NXADDR * fs->fs_bsize)
1171                 return (EFBIG);
1172
1173         eae = malloc(easize + extra, M_TEMP, M_WAITOK);
1174
1175         liovec.iov_base = eae;
1176         liovec.iov_len = easize;
1177         luio.uio_iov = &liovec;
1178         luio.uio_iovcnt = 1;
1179         luio.uio_offset = 0;
1180         luio.uio_resid = easize;
1181         luio.uio_segflg = UIO_SYSSPACE;
1182         luio.uio_rw = UIO_READ;
1183         luio.uio_td = td;
1184
1185         error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1186         if (error) {
1187                 free(eae, M_TEMP);
1188                 return(error);
1189         }
1190         *p = eae;
1191         return (0);
1192 }
1193
1194 static void
1195 ffs_lock_ea(struct vnode *vp)
1196 {
1197         struct inode *ip;
1198
1199         ip = VTOI(vp);
1200         VI_LOCK(vp);
1201         while (ip->i_flag & IN_EA_LOCKED) {
1202                 ip->i_flag |= IN_EA_LOCKWAIT;
1203                 msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
1204                     0);
1205         }
1206         ip->i_flag |= IN_EA_LOCKED;
1207         VI_UNLOCK(vp);
1208 }
1209
1210 static void
1211 ffs_unlock_ea(struct vnode *vp)
1212 {
1213         struct inode *ip;
1214
1215         ip = VTOI(vp);
1216         VI_LOCK(vp);
1217         if (ip->i_flag & IN_EA_LOCKWAIT)
1218                 wakeup(&ip->i_ea_refs);
1219         ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
1220         VI_UNLOCK(vp);
1221 }
1222
1223 static int
1224 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1225 {
1226         struct inode *ip;
1227         struct ufs2_dinode *dp;
1228         int error;
1229
1230         ip = VTOI(vp);
1231
1232         ffs_lock_ea(vp);
1233         if (ip->i_ea_area != NULL) {
1234                 ip->i_ea_refs++;
1235                 ffs_unlock_ea(vp);
1236                 return (0);
1237         }
1238         dp = ip->i_din2;
1239         error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
1240         if (error) {
1241                 ffs_unlock_ea(vp);
1242                 return (error);
1243         }
1244         ip->i_ea_len = dp->di_extsize;
1245         ip->i_ea_error = 0;
1246         ip->i_ea_refs++;
1247         ffs_unlock_ea(vp);
1248         return (0);
1249 }
1250
1251 /*
1252  * Vnode extattr transaction commit/abort
1253  */
1254 static int
1255 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1256 {
1257         struct inode *ip;
1258         struct uio luio;
1259         struct iovec liovec;
1260         int error;
1261         struct ufs2_dinode *dp;
1262
1263         ip = VTOI(vp);
1264
1265         ffs_lock_ea(vp);
1266         if (ip->i_ea_area == NULL) {
1267                 ffs_unlock_ea(vp);
1268                 return (EINVAL);
1269         }
1270         dp = ip->i_din2;
1271         error = ip->i_ea_error;
1272         if (commit && error == 0) {
1273                 ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
1274                 if (cred == NOCRED)
1275                         cred =  vp->v_mount->mnt_cred;
1276                 liovec.iov_base = ip->i_ea_area;
1277                 liovec.iov_len = ip->i_ea_len;
1278                 luio.uio_iov = &liovec;
1279                 luio.uio_iovcnt = 1;
1280                 luio.uio_offset = 0;
1281                 luio.uio_resid = ip->i_ea_len;
1282                 luio.uio_segflg = UIO_SYSSPACE;
1283                 luio.uio_rw = UIO_WRITE;
1284                 luio.uio_td = td;
1285                 /* XXX: I'm not happy about truncating to zero size */
1286                 if (ip->i_ea_len < dp->di_extsize)
1287                         error = ffs_truncate(vp, 0, IO_EXT, cred);
1288                 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1289         }
1290         if (--ip->i_ea_refs == 0) {
1291                 free(ip->i_ea_area, M_TEMP);
1292                 ip->i_ea_area = NULL;
1293                 ip->i_ea_len = 0;
1294                 ip->i_ea_error = 0;
1295         }
1296         ffs_unlock_ea(vp);
1297         return (error);
1298 }
1299
1300 /*
1301  * Vnode extattr strategy routine for fifos.
1302  *
1303  * We need to check for a read or write of the external attributes.
1304  * Otherwise we just fall through and do the usual thing.
1305  */
1306 static int
1307 ffsext_strategy(struct vop_strategy_args *ap)
1308 /*
1309 struct vop_strategy_args {
1310         struct vnodeop_desc *a_desc;
1311         struct vnode *a_vp;
1312         struct buf *a_bp;
1313 };
1314 */
1315 {
1316         struct vnode *vp;
1317         daddr_t lbn;
1318
1319         vp = ap->a_vp;
1320         lbn = ap->a_bp->b_lblkno;
1321         if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR)
1322                 return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
1323         if (vp->v_type == VFIFO)
1324                 return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
1325         panic("spec nodes went here");
1326 }
1327
1328 /*
1329  * Vnode extattr transaction commit/abort
1330  */
1331 static int
1332 ffs_openextattr(struct vop_openextattr_args *ap)
1333 /*
1334 struct vop_openextattr_args {
1335         struct vnodeop_desc *a_desc;
1336         struct vnode *a_vp;
1337         IN struct ucred *a_cred;
1338         IN struct thread *a_td;
1339 };
1340 */
1341 {
1342
1343         if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1344                 return (EOPNOTSUPP);
1345
1346         return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1347 }
1348
1349
1350 /*
1351  * Vnode extattr transaction commit/abort
1352  */
1353 static int
1354 ffs_closeextattr(struct vop_closeextattr_args *ap)
1355 /*
1356 struct vop_closeextattr_args {
1357         struct vnodeop_desc *a_desc;
1358         struct vnode *a_vp;
1359         int a_commit;
1360         IN struct ucred *a_cred;
1361         IN struct thread *a_td;
1362 };
1363 */
1364 {
1365
1366         if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1367                 return (EOPNOTSUPP);
1368
1369         if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
1370                 return (EROFS);
1371
1372         return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
1373 }
1374
1375 /*
1376  * Vnode operation to remove a named attribute.
1377  */
1378 static int
1379 ffs_deleteextattr(struct vop_deleteextattr_args *ap)
1380 /*
1381 vop_deleteextattr {
1382         IN struct vnode *a_vp;
1383         IN int a_attrnamespace;
1384         IN const char *a_name;
1385         IN struct ucred *a_cred;
1386         IN struct thread *a_td;
1387 };
1388 */
1389 {
1390         struct inode *ip;
1391         struct extattr *eap;
1392         uint32_t ul;
1393         int olen, error, i, easize;
1394         u_char *eae;
1395         void *tmp;
1396
1397         ip = VTOI(ap->a_vp);
1398
1399         if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1400                 return (EOPNOTSUPP);
1401
1402         if (strlen(ap->a_name) == 0)
1403                 return (EINVAL);
1404
1405         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1406                 return (EROFS);
1407
1408         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1409             ap->a_cred, ap->a_td, VWRITE);
1410         if (error) {
1411
1412                 /*
1413                  * ffs_lock_ea is not needed there, because the vnode
1414                  * must be exclusively locked.
1415                  */
1416                 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1417                         ip->i_ea_error = error;
1418                 return (error);
1419         }
1420
1421         error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1422         if (error)
1423                 return (error);
1424
1425         /* CEM: delete could be done in-place instead */
1426         eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
1427         bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1428         easize = ip->i_ea_len;
1429
1430         olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1431             &eap, NULL);
1432         if (olen == -1) {
1433                 /* delete but nonexistent */
1434                 free(eae, M_TEMP);
1435                 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1436                 return (ENOATTR);
1437         }
1438         ul = eap->ea_length;
1439         i = (u_char *)EXTATTR_NEXT(eap) - eae;
1440         bcopy(EXTATTR_NEXT(eap), eap, easize - i);
1441         easize -= ul;
1442
1443         tmp = ip->i_ea_area;
1444         ip->i_ea_area = eae;
1445         ip->i_ea_len = easize;
1446         free(tmp, M_TEMP);
1447         error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1448         return (error);
1449 }
1450
1451 /*
1452  * Vnode operation to retrieve a named extended attribute.
1453  */
1454 static int
1455 ffs_getextattr(struct vop_getextattr_args *ap)
1456 /*
1457 vop_getextattr {
1458         IN struct vnode *a_vp;
1459         IN int a_attrnamespace;
1460         IN const char *a_name;
1461         INOUT struct uio *a_uio;
1462         OUT size_t *a_size;
1463         IN struct ucred *a_cred;
1464         IN struct thread *a_td;
1465 };
1466 */
1467 {
1468         struct inode *ip;
1469         u_char *eae, *p;
1470         unsigned easize;
1471         int error, ealen;
1472
1473         ip = VTOI(ap->a_vp);
1474
1475         if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1476                 return (EOPNOTSUPP);
1477
1478         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1479             ap->a_cred, ap->a_td, VREAD);
1480         if (error)
1481                 return (error);
1482
1483         error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1484         if (error)
1485                 return (error);
1486
1487         eae = ip->i_ea_area;
1488         easize = ip->i_ea_len;
1489
1490         ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1491             NULL, &p);
1492         if (ealen >= 0) {
1493                 error = 0;
1494                 if (ap->a_size != NULL)
1495                         *ap->a_size = ealen;
1496                 else if (ap->a_uio != NULL)
1497                         error = uiomove(p, ealen, ap->a_uio);
1498         } else
1499                 error = ENOATTR;
1500
1501         ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1502         return (error);
1503 }
1504
1505 /*
1506  * Vnode operation to retrieve extended attributes on a vnode.
1507  */
1508 static int
1509 ffs_listextattr(struct vop_listextattr_args *ap)
1510 /*
1511 vop_listextattr {
1512         IN struct vnode *a_vp;
1513         IN int a_attrnamespace;
1514         INOUT struct uio *a_uio;
1515         OUT size_t *a_size;
1516         IN struct ucred *a_cred;
1517         IN struct thread *a_td;
1518 };
1519 */
1520 {
1521         struct inode *ip;
1522         struct extattr *eap, *eaend;
1523         int error, ealen;
1524
1525         ip = VTOI(ap->a_vp);
1526
1527         if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1528                 return (EOPNOTSUPP);
1529
1530         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1531             ap->a_cred, ap->a_td, VREAD);
1532         if (error)
1533                 return (error);
1534
1535         error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1536         if (error)
1537                 return (error);
1538
1539         error = 0;
1540         if (ap->a_size != NULL)
1541                 *ap->a_size = 0;
1542
1543         KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned"));
1544         eap = (struct extattr *)ip->i_ea_area;
1545         eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len);
1546         for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) {
1547                 /* make sure this entry is complete */
1548                 if (EXTATTR_NEXT(eap) > eaend)
1549                         break;
1550                 if (eap->ea_namespace != ap->a_attrnamespace)
1551                         continue;
1552
1553                 ealen = eap->ea_namelength;
1554                 if (ap->a_size != NULL)
1555                         *ap->a_size += ealen + 1;
1556                 else if (ap->a_uio != NULL)
1557                         error = uiomove(&eap->ea_namelength, ealen + 1,
1558                             ap->a_uio);
1559         }
1560
1561         ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1562         return (error);
1563 }
1564
1565 /*
1566  * Vnode operation to set a named attribute.
1567  */
1568 static int
1569 ffs_setextattr(struct vop_setextattr_args *ap)
1570 /*
1571 vop_setextattr {
1572         IN struct vnode *a_vp;
1573         IN int a_attrnamespace;
1574         IN const char *a_name;
1575         INOUT struct uio *a_uio;
1576         IN struct ucred *a_cred;
1577         IN struct thread *a_td;
1578 };
1579 */
1580 {
1581         struct inode *ip;
1582         struct fs *fs;
1583         struct extattr *eap;
1584         uint32_t ealength, ul;
1585         ssize_t ealen;
1586         int olen, eapad1, eapad2, error, i, easize;
1587         u_char *eae;
1588         void *tmp;
1589
1590         ip = VTOI(ap->a_vp);
1591         fs = ITOFS(ip);
1592
1593         if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1594                 return (EOPNOTSUPP);
1595
1596         if (strlen(ap->a_name) == 0)
1597                 return (EINVAL);
1598
1599         /* XXX Now unsupported API to delete EAs using NULL uio. */
1600         if (ap->a_uio == NULL)
1601                 return (EOPNOTSUPP);
1602
1603         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1604                 return (EROFS);
1605
1606         ealen = ap->a_uio->uio_resid;
1607         if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR))
1608                 return (EINVAL);
1609
1610         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1611             ap->a_cred, ap->a_td, VWRITE);
1612         if (error) {
1613
1614                 /*
1615                  * ffs_lock_ea is not needed there, because the vnode
1616                  * must be exclusively locked.
1617                  */
1618                 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1619                         ip->i_ea_error = error;
1620                 return (error);
1621         }
1622
1623         error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1624         if (error)
1625                 return (error);
1626
1627         ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1628         eapad1 = roundup2(ealength, 8) - ealength;
1629         eapad2 = roundup2(ealen, 8) - ealen;
1630         ealength += eapad1 + ealen + eapad2;
1631
1632         /*
1633          * CEM: rewrites of the same size or smaller could be done in-place
1634          * instead.  (We don't acquire any fine-grained locks in here either,
1635          * so we could also do bigger writes in-place.)
1636          */
1637         eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1638         bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1639         easize = ip->i_ea_len;
1640
1641         olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1642             &eap, NULL);
1643         if (olen == -1) {
1644                 /* new, append at end */
1645                 KASSERT(ALIGNED_TO(eae + easize, struct extattr),
1646                     ("unaligned"));
1647                 eap = (struct extattr *)(eae + easize);
1648                 easize += ealength;
1649         } else {
1650                 ul = eap->ea_length;
1651                 i = (u_char *)EXTATTR_NEXT(eap) - eae;
1652                 if (ul != ealength) {
1653                         bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength,
1654                             easize - i);
1655                         easize += (ealength - ul);
1656                 }
1657         }
1658         if (easize > lblktosize(fs, UFS_NXADDR)) {
1659                 free(eae, M_TEMP);
1660                 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1661                 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1662                         ip->i_ea_error = ENOSPC;
1663                 return (ENOSPC);
1664         }
1665         eap->ea_length = ealength;
1666         eap->ea_namespace = ap->a_attrnamespace;
1667         eap->ea_contentpadlen = eapad2;
1668         eap->ea_namelength = strlen(ap->a_name);
1669         memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name));
1670         bzero(&eap->ea_name[strlen(ap->a_name)], eapad1);
1671         error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio);
1672         if (error) {
1673                 free(eae, M_TEMP);
1674                 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1675                 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1676                         ip->i_ea_error = error;
1677                 return (error);
1678         }
1679         bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2);
1680
1681         tmp = ip->i_ea_area;
1682         ip->i_ea_area = eae;
1683         ip->i_ea_len = easize;
1684         free(tmp, M_TEMP);
1685         error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1686         return (error);
1687 }
1688
1689 /*
1690  * Vnode pointer to File handle
1691  */
1692 static int
1693 ffs_vptofh(struct vop_vptofh_args *ap)
1694 /*
1695 vop_vptofh {
1696         IN struct vnode *a_vp;
1697         IN struct fid *a_fhp;
1698 };
1699 */
1700 {
1701         struct inode *ip;
1702         struct ufid *ufhp;
1703
1704         ip = VTOI(ap->a_vp);
1705         ufhp = (struct ufid *)ap->a_fhp;
1706         ufhp->ufid_len = sizeof(struct ufid);
1707         ufhp->ufid_ino = ip->i_number;
1708         ufhp->ufid_gen = ip->i_gen;
1709         return (0);
1710 }
1711
1712 SYSCTL_DECL(_vfs_ffs);
1713 static int use_buf_pager = 1;
1714 SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
1715     "Always use buffer pager instead of bmap");
1716
1717 static daddr_t
1718 ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
1719 {
1720
1721         return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
1722 }
1723
1724 static int
1725 ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
1726 {
1727
1728         return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn));
1729 }
1730
1731 static int
1732 ffs_getpages(struct vop_getpages_args *ap)
1733 {
1734         struct vnode *vp;
1735         struct ufsmount *um;
1736
1737         vp = ap->a_vp;
1738         um = VFSTOUFS(vp->v_mount);
1739
1740         if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
1741                 return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1742                     ap->a_rbehind, ap->a_rahead, NULL, NULL));
1743         return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
1744             ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
1745 }
1746
1747 static int
1748 ffs_getpages_async(struct vop_getpages_async_args *ap)
1749 {
1750         struct vnode *vp;
1751         struct ufsmount *um;
1752         int error;
1753
1754         vp = ap->a_vp;
1755         um = VFSTOUFS(vp->v_mount);
1756
1757         if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
1758                 return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1759                     ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg));
1760
1761         error = vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
1762             ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz);
1763         ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
1764
1765         return (error);
1766 }
1767