sys/ufs/ffs/ffs_vnops.c

   1 /*-
   2  * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause)
   3  *
   4  * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
   5  * All rights reserved.
   6  *
   7  * This software was developed for the FreeBSD Project by Marshall
   8  * Kirk McKusick and Network Associates Laboratories, the Security
   9  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
  10  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
  11  * research program
  12  *
  13  * Redistribution and use in source and binary forms, with or without
  14  * modification, are permitted provided that the following conditions
  15  * are met:
  16  * 1. Redistributions of source code must retain the above copyright
  17  *    notice, this list of conditions and the following disclaimer.
  18  * 2. Redistributions in binary form must reproduce the above copyright
  19  *    notice, this list of conditions and the following disclaimer in the
  20  *    documentation and/or other materials provided with the distribution.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  * Copyright (c) 1982, 1986, 1989, 1993
  35  *      The Regents of the University of California.  All rights reserved.
  36  *
  37  * Redistribution and use in source and binary forms, with or without
  38  * modification, are permitted provided that the following conditions
  39  * are met:
  40  * 1. Redistributions of source code must retain the above copyright
  41  *    notice, this list of conditions and the following disclaimer.
  42  * 2. Redistributions in binary form must reproduce the above copyright
  43  *    notice, this list of conditions and the following disclaimer in the
  44  *    documentation and/or other materials provided with the distribution.
  45  * 3. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      from: @(#)ufs_readwrite.c       8.11 (Berkeley) 5/8/95
  62  * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
  63  *      @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95
  64  */
  65
  66 #include <sys/cdefs.h>
  67 __FBSDID("$FreeBSD$");
  68
  69 #include "opt_directio.h"
  70 #include "opt_ffs.h"
  71 #include "opt_ufs.h"
  72
  73 #include <sys/param.h>
  74 #include <sys/bio.h>
  75 #include <sys/systm.h>
  76 #include <sys/buf.h>
  77 #include <sys/conf.h>
  78 #include <sys/extattr.h>
  79 #include <sys/kernel.h>
  80 #include <sys/limits.h>
  81 #include <sys/malloc.h>
  82 #include <sys/mount.h>
  83 #include <sys/priv.h>
  84 #include <sys/rwlock.h>
  85 #include <sys/stat.h>
  86 #include <sys/sysctl.h>
  87 #include <sys/vmmeter.h>
  88 #include <sys/vnode.h>
  89
  90 #include <vm/vm.h>
  91 #include <vm/vm_param.h>
  92 #include <vm/vm_extern.h>
  93 #include <vm/vm_object.h>
  94 #include <vm/vm_page.h>
  95 #include <vm/vm_pager.h>
  96 #include <vm/vnode_pager.h>
  97
  98 #include <ufs/ufs/extattr.h>
  99 #include <ufs/ufs/quota.h>
 100 #include <ufs/ufs/inode.h>
 101 #include <ufs/ufs/ufs_extern.h>
 102 #include <ufs/ufs/ufsmount.h>
 103 #include <ufs/ufs/dir.h>
 104 #ifdef UFS_DIRHASH
 105 #include <ufs/ufs/dirhash.h>
 106 #endif
 107
 108 #include <ufs/ffs/fs.h>
 109 #include <ufs/ffs/ffs_extern.h>
 110
 111 #define ALIGNED_TO(ptr, s)      \
 112         (((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0)
 113
 114 #ifdef DIRECTIO
 115 extern int      ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
 116 #endif
 117 static vop_fdatasync_t  ffs_fdatasync;
 118 static vop_fsync_t      ffs_fsync;
 119 static vop_getpages_t   ffs_getpages;
 120 static vop_getpages_async_t     ffs_getpages_async;
 121 static vop_lock1_t      ffs_lock;
 122 #ifdef INVARIANTS
 123 static vop_unlock_t     ffs_unlock_debug;
 124 #endif
 125 static vop_read_t       ffs_read;
 126 static vop_write_t      ffs_write;
 127 static int      ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
 128 static int      ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
 129                     struct ucred *cred);
 130 static vop_strategy_t   ffsext_strategy;
 131 static vop_closeextattr_t       ffs_closeextattr;
 132 static vop_deleteextattr_t      ffs_deleteextattr;
 133 static vop_getextattr_t ffs_getextattr;
 134 static vop_listextattr_t        ffs_listextattr;
 135 static vop_openextattr_t        ffs_openextattr;
 136 static vop_setextattr_t ffs_setextattr;
 137 static vop_vptofh_t     ffs_vptofh;
 138 static vop_vput_pair_t  ffs_vput_pair;
 139
 140 /* Global vfs data structures for ufs. */
 141 struct vop_vector ffs_vnodeops1 = {
 142         .vop_default =          &ufs_vnodeops,
 143         .vop_fsync =            ffs_fsync,
 144         .vop_fdatasync =        ffs_fdatasync,
 145         .vop_getpages =         ffs_getpages,
 146         .vop_getpages_async =   ffs_getpages_async,
 147         .vop_lock1 =            ffs_lock,
 148 #ifdef INVARIANTS
 149         .vop_unlock =           ffs_unlock_debug,
 150 #endif
 151         .vop_read =             ffs_read,
 152         .vop_reallocblks =      ffs_reallocblks,
 153         .vop_write =            ffs_write,
 154         .vop_vptofh =           ffs_vptofh,
 155         .vop_vput_pair =        ffs_vput_pair,
 156 };
 157 VFS_VOP_VECTOR_REGISTER(ffs_vnodeops1);
 158
 159 struct vop_vector ffs_fifoops1 = {
 160         .vop_default =          &ufs_fifoops,
 161         .vop_fsync =            ffs_fsync,
 162         .vop_fdatasync =        ffs_fdatasync,
 163         .vop_lock1 =            ffs_lock,
 164 #ifdef INVARIANTS
 165         .vop_unlock =           ffs_unlock_debug,
 166 #endif
 167         .vop_vptofh =           ffs_vptofh,
 168 };
 169 VFS_VOP_VECTOR_REGISTER(ffs_fifoops1);
 170
 171 /* Global vfs data structures for ufs. */
 172 struct vop_vector ffs_vnodeops2 = {
 173         .vop_default =          &ufs_vnodeops,
 174         .vop_fsync =            ffs_fsync,
 175         .vop_fdatasync =        ffs_fdatasync,
 176         .vop_getpages =         ffs_getpages,
 177         .vop_getpages_async =   ffs_getpages_async,
 178         .vop_lock1 =            ffs_lock,
 179 #ifdef INVARIANTS
 180         .vop_unlock =           ffs_unlock_debug,
 181 #endif
 182         .vop_read =             ffs_read,
 183         .vop_reallocblks =      ffs_reallocblks,
 184         .vop_write =            ffs_write,
 185         .vop_closeextattr =     ffs_closeextattr,
 186         .vop_deleteextattr =    ffs_deleteextattr,
 187         .vop_getextattr =       ffs_getextattr,
 188         .vop_listextattr =      ffs_listextattr,
 189         .vop_openextattr =      ffs_openextattr,
 190         .vop_setextattr =       ffs_setextattr,
 191         .vop_vptofh =           ffs_vptofh,
 192         .vop_vput_pair =        ffs_vput_pair,
 193 };
 194 VFS_VOP_VECTOR_REGISTER(ffs_vnodeops2);
 195
 196 struct vop_vector ffs_fifoops2 = {
 197         .vop_default =          &ufs_fifoops,
 198         .vop_fsync =            ffs_fsync,
 199         .vop_fdatasync =        ffs_fdatasync,
 200         .vop_lock1 =            ffs_lock,
 201 #ifdef INVARIANTS
 202         .vop_unlock =           ffs_unlock_debug,
 203 #endif
 204         .vop_reallocblks =      ffs_reallocblks,
 205         .vop_strategy =         ffsext_strategy,
 206         .vop_closeextattr =     ffs_closeextattr,
 207         .vop_deleteextattr =    ffs_deleteextattr,
 208         .vop_getextattr =       ffs_getextattr,
 209         .vop_listextattr =      ffs_listextattr,
 210         .vop_openextattr =      ffs_openextattr,
 211         .vop_setextattr =       ffs_setextattr,
 212         .vop_vptofh =           ffs_vptofh,
 213 };
 214 VFS_VOP_VECTOR_REGISTER(ffs_fifoops2);
 215
 216 /*
 217  * Synch an open file.
 218  */
 219 /* ARGSUSED */
 220 static int
 221 ffs_fsync(struct vop_fsync_args *ap)
 222 {
 223         struct vnode *vp;
 224         struct bufobj *bo;
 225         int error;
 226
 227         vp = ap->a_vp;
 228         bo = &vp->v_bufobj;
 229 retry:
 230         error = ffs_syncvnode(vp, ap->a_waitfor, 0);
 231         if (error)
 232                 return (error);
 233         if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
 234                 error = softdep_fsync(vp);
 235                 if (error)
 236                         return (error);
 237
 238                 /*
 239                  * The softdep_fsync() function may drop vp lock,
 240                  * allowing for dirty buffers to reappear on the
 241                  * bo_dirty list. Recheck and resync as needed.
 242                  */
 243                 BO_LOCK(bo);
 244                 if ((vp->v_type == VREG || vp->v_type == VDIR) &&
 245                     (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
 246                         BO_UNLOCK(bo);
 247                         goto retry;
 248                 }
 249                 BO_UNLOCK(bo);
 250         }
 251         if (ffs_fsfail_cleanup(VFSTOUFS(vp->v_mount), 0))
 252                 return (ENXIO);
 253         return (0);
 254 }
 255
 256 int
 257 ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
 258 {
 259         struct inode *ip;
 260         struct bufobj *bo;
 261         struct ufsmount *ump;
 262         struct buf *bp, *nbp;
 263         ufs_lbn_t lbn;
 264         int error, passes, wflag;
 265         bool still_dirty, unlocked, wait;
 266
 267         ip = VTOI(vp);
 268         bo = &vp->v_bufobj;
 269         ump = VFSTOUFS(vp->v_mount);
 270 #ifdef WITNESS
 271         wflag = IS_SNAPSHOT(ip) ? LK_NOWITNESS : 0;
 272 #else
 273         wflag = 0;
 274 #endif
 275
 276         /*
 277          * When doing MNT_WAIT we must first flush all dependencies
 278          * on the inode.
 279          */
 280         if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
 281             (error = softdep_sync_metadata(vp)) != 0) {
 282                 if (ffs_fsfail_cleanup(ump, error))
 283                         error = 0;
 284                 return (error);
 285         }
 286
 287         /*
 288          * Flush all dirty buffers associated with a vnode.
 289          */
 290         error = 0;
 291         passes = 0;
 292         wait = false;   /* Always do an async pass first. */
 293         unlocked = false;
 294         lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
 295         BO_LOCK(bo);
 296 loop:
 297         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
 298                 bp->b_vflags &= ~BV_SCANNED;
 299         TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 300                 /*
 301                  * Reasons to skip this buffer: it has already been considered
 302                  * on this pass, the buffer has dependencies that will cause
 303                  * it to be redirtied and it has not already been deferred,
 304                  * or it is already being written.
 305                  */
 306                 if ((bp->b_vflags & BV_SCANNED) != 0)
 307                         continue;
 308                 bp->b_vflags |= BV_SCANNED;
 309                 /*
 310                  * Flush indirects in order, if requested.
 311                  *
 312                  * Note that if only datasync is requested, we can
 313                  * skip indirect blocks when softupdates are not
 314                  * active.  Otherwise we must flush them with data,
 315                  * since dependencies prevent data block writes.
 316                  */
 317                 if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR &&
 318                     (lbn_level(bp->b_lblkno) >= passes ||
 319                     ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
 320                         continue;
 321                 if (bp->b_lblkno > lbn)
 322                         panic("ffs_syncvnode: syncing truncated data.");
 323                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
 324                         BO_UNLOCK(bo);
 325                 } else if (wait) {
 326                         if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
 327                             LK_INTERLOCK | wflag, BO_LOCKPTR(bo)) != 0) {
 328                                 BO_LOCK(bo);
 329                                 bp->b_vflags &= ~BV_SCANNED;
 330                                 goto next_locked;
 331                         }
 332                 } else
 333                         continue;
 334                 if ((bp->b_flags & B_DELWRI) == 0)
 335                         panic("ffs_fsync: not dirty");
 336                 /*
 337                  * Check for dependencies and potentially complete them.
 338                  */
 339                 if (!LIST_EMPTY(&bp->b_dep) &&
 340                     (error = softdep_sync_buf(vp, bp,
 341                     wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
 342                         /*
 343                          * Lock order conflict, buffer was already unlocked,
 344                          * and vnode possibly unlocked.
 345                          */
 346                         if (error == ERELOOKUP) {
 347                                 if (vp->v_data == NULL)
 348                                         return (EBADF);
 349                                 unlocked = true;
 350                                 if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
 351                                     (error = softdep_sync_metadata(vp)) != 0) {
 352                                         if (ffs_fsfail_cleanup(ump, error))
 353                                                 error = 0;
 354                                         return (unlocked && error == 0 ?
 355                                             ERELOOKUP : error);
 356                                 }
 357                                 /* Re-evaluate inode size */
 358                                 lbn = lblkno(ITOFS(ip), (ip->i_size +
 359                                     ITOFS(ip)->fs_bsize - 1));
 360                                 goto next;
 361                         }
 362                         /* I/O error. */
 363                         if (error != EBUSY) {
 364                                 BUF_UNLOCK(bp);
 365                                 return (error);
 366                         }
 367                         /* If we deferred once, don't defer again. */
 368                         if ((bp->b_flags & B_DEFERRED) == 0) {
 369                                 bp->b_flags |= B_DEFERRED;
 370                                 BUF_UNLOCK(bp);
 371                                 goto next;
 372                         }
 373                 }
 374                 if (wait) {
 375                         bremfree(bp);
 376                         error = bwrite(bp);
 377                         if (ffs_fsfail_cleanup(ump, error))
 378                                 error = 0;
 379                         if (error != 0)
 380                                 return (error);
 381                 } else if ((bp->b_flags & B_CLUSTEROK)) {
 382                         (void) vfs_bio_awrite(bp);
 383                 } else {
 384                         bremfree(bp);
 385                         (void) bawrite(bp);
 386                 }
 387 next:
 388                 /*
 389                  * Since we may have slept during the I/O, we need
 390                  * to start from a known point.
 391                  */
 392                 BO_LOCK(bo);
 393 next_locked:
 394                 nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
 395         }
 396         if (waitfor != MNT_WAIT) {
 397                 BO_UNLOCK(bo);
 398                 if ((flags & NO_INO_UPDT) != 0)
 399                         return (unlocked ? ERELOOKUP : 0);
 400                 error = ffs_update(vp, 0);
 401                 if (error == 0 && unlocked)
 402                         error = ERELOOKUP;
 403                 return (error);
 404         }
 405         /* Drain IO to see if we're done. */
 406         bufobj_wwait(bo, 0, 0);
 407         /*
 408          * Block devices associated with filesystems may have new I/O
 409          * requests posted for them even if the vnode is locked, so no
 410          * amount of trying will get them clean.  We make several passes
 411          * as a best effort.
 412          *
 413          * Regular files may need multiple passes to flush all dependency
 414          * work as it is possible that we must write once per indirect
 415          * level, once for the leaf, and once for the inode and each of
 416          * these will be done with one sync and one async pass.
 417          */
 418         if (bo->bo_dirty.bv_cnt > 0) {
 419                 if ((flags & DATA_ONLY) == 0) {
 420                         still_dirty = true;
 421                 } else {
 422                         /*
 423                          * For data-only sync, dirty indirect buffers
 424                          * are ignored.
 425                          */
 426                         still_dirty = false;
 427                         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
 428                                 if (bp->b_lblkno > -UFS_NDADDR) {
 429                                         still_dirty = true;
 430                                         break;
 431                                 }
 432                         }
 433                 }
 434
 435                 if (still_dirty) {
 436                         /* Write the inode after sync passes to flush deps. */
 437                         if (wait && DOINGSOFTDEP(vp) &&
 438                             (flags & NO_INO_UPDT) == 0) {
 439                                 BO_UNLOCK(bo);
 440                                 ffs_update(vp, 1);
 441                                 BO_LOCK(bo);
 442                         }
 443                         /* switch between sync/async. */
 444                         wait = !wait;
 445                         if (wait || ++passes < UFS_NIADDR + 2)
 446                                 goto loop;
 447                 }
 448         }
 449         BO_UNLOCK(bo);
 450         error = 0;
 451         if ((flags & DATA_ONLY) == 0) {
 452                 if ((flags & NO_INO_UPDT) == 0)
 453                         error = ffs_update(vp, 1);
 454                 if (DOINGSUJ(vp))
 455                         softdep_journal_fsync(VTOI(vp));
 456         } else if ((ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA)) != 0) {
 457                 error = ffs_update(vp, 1);
 458         }
 459         if (error == 0 && unlocked)
 460                 error = ERELOOKUP;
 461         if (error == 0)
 462                 ip->i_flag &= ~IN_NEEDSYNC;
 463         return (error);
 464 }
 465
 466 static int
 467 ffs_fdatasync(struct vop_fdatasync_args *ap)
 468 {
 469
 470         return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
 471 }
 472
 473 static int
 474 ffs_lock(ap)
 475         struct vop_lock1_args /* {
 476                 struct vnode *a_vp;
 477                 int a_flags;
 478                 char *file;
 479                 int line;
 480         } */ *ap;
 481 {
 482 #if !defined(NO_FFS_SNAPSHOT) || defined(DIAGNOSTIC)
 483         struct vnode *vp = ap->a_vp;
 484 #endif  /* !NO_FFS_SNAPSHOT || DIAGNOSTIC */
 485 #ifdef DIAGNOSTIC
 486         struct inode *ip;
 487 #endif  /* DIAGNOSTIC */
 488         int result;
 489 #ifndef NO_FFS_SNAPSHOT
 490         int flags;
 491         struct lock *lkp;
 492
 493         /*
 494          * Adaptive spinning mixed with SU leads to trouble. use a giant hammer
 495          * and only use it when LK_NODDLKTREAT is set. Currently this means it
 496          * is only used during path lookup.
 497          */
 498         if ((ap->a_flags & LK_NODDLKTREAT) != 0)
 499                 ap->a_flags |= LK_ADAPTIVE;
 500         switch (ap->a_flags & LK_TYPE_MASK) {
 501         case LK_SHARED:
 502         case LK_UPGRADE:
 503         case LK_EXCLUSIVE:
 504                 flags = ap->a_flags;
 505                 for (;;) {
 506 #ifdef DEBUG_VFS_LOCKS
 507                         VNPASS(vp->v_holdcnt != 0, vp);
 508 #endif  /* DEBUG_VFS_LOCKS */
 509                         lkp = vp->v_vnlock;
 510                         result = lockmgr_lock_flags(lkp, flags,
 511                             &VI_MTX(vp)->lock_object, ap->a_file, ap->a_line);
 512                         if (lkp == vp->v_vnlock || result != 0)
 513                                 break;
 514                         /*
 515                          * Apparent success, except that the vnode
 516                          * mutated between snapshot file vnode and
 517                          * regular file vnode while this process
 518                          * slept.  The lock currently held is not the
 519                          * right lock.  Release it, and try to get the
 520                          * new lock.
 521                          */
 522                         lockmgr_unlock(lkp);
 523                         if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
 524                             (LK_INTERLOCK | LK_NOWAIT))
 525                                 return (EBUSY);
 526                         if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
 527                                 flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
 528                         flags &= ~LK_INTERLOCK;
 529                 }
 530 #ifdef DIAGNOSTIC
 531                 switch (ap->a_flags & LK_TYPE_MASK) {
 532                 case LK_UPGRADE:
 533                 case LK_EXCLUSIVE:
 534                         if (result == 0 && vp->v_vnlock->lk_recurse == 0) {
 535                                 ip = VTOI(vp);
 536                                 if (ip != NULL)
 537                                         ip->i_lock_gen++;
 538                         }
 539                 }
 540 #endif  /* DIAGNOSTIC */
 541                 break;
 542         default:
 543 #ifdef DIAGNOSTIC
 544                 if ((ap->a_flags & LK_TYPE_MASK) == LK_DOWNGRADE) {
 545                         ip = VTOI(vp);
 546                         if (ip != NULL)
 547                                 ufs_unlock_tracker(ip);
 548                 }
 549 #endif  /* DIAGNOSTIC */
 550                 result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
 551                 break;
 552         }
 553 #else   /* NO_FFS_SNAPSHOT */
 554         /*
 555          * See above for an explanation.
 556          */
 557         if ((ap->a_flags & LK_NODDLKTREAT) != 0)
 558                 ap->a_flags |= LK_ADAPTIVE;
 559 #ifdef DIAGNOSTIC
 560         if ((ap->a_flags & LK_TYPE_MASK) == LK_DOWNGRADE) {
 561                 ip = VTOI(vp);
 562                 if (ip != NULL)
 563                         ufs_unlock_tracker(ip);
 564         }
 565 #endif  /* DIAGNOSTIC */
 566         result =  VOP_LOCK1_APV(&ufs_vnodeops, ap);
 567 #endif  /* NO_FFS_SNAPSHOT */
 568 #ifdef DIAGNOSTIC
 569         switch (ap->a_flags & LK_TYPE_MASK) {
 570         case LK_UPGRADE:
 571         case LK_EXCLUSIVE:
 572                 if (result == 0 && vp->v_vnlock->lk_recurse == 0) {
 573                         ip = VTOI(vp);
 574                         if (ip != NULL)
 575                                 ip->i_lock_gen++;
 576                 }
 577         }
 578 #endif  /* DIAGNOSTIC */
 579         return (result);
 580 }
 581
 582 #ifdef INVARIANTS
 583 static int
 584 ffs_unlock_debug(struct vop_unlock_args *ap)
 585 {
 586         struct vnode *vp;
 587         struct inode *ip;
 588
 589         vp = ap->a_vp;
 590         ip = VTOI(vp);
 591         if (ip->i_flag & UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE) {
 592                 if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
 593                         VI_LOCK(vp);
 594                         VNASSERT((vp->v_mflag & VMP_LAZYLIST), vp,
 595                             ("%s: modified vnode (%x) not on lazy list",
 596                             __func__, ip->i_flag));
 597                         VI_UNLOCK(vp);
 598                 }
 599         }
 600         KASSERT(vp->v_type != VDIR || vp->v_vnlock->lk_recurse != 0 ||
 601             (ip->i_flag & IN_ENDOFF) == 0,
 602             ("ufs dir vp %p ip %p flags %#x", vp, ip, ip->i_flag));
 603 #ifdef DIAGNOSTIC
 604         if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && ip != NULL &&
 605             vp->v_vnlock->lk_recurse == 0)
 606                 ufs_unlock_tracker(ip);
 607 #endif
 608         return (VOP_UNLOCK_APV(&ufs_vnodeops, ap));
 609 }
 610 #endif
 611
 612 static int
 613 ffs_read_hole(struct uio *uio, long xfersize, long *size)
 614 {
 615         ssize_t saved_resid, tlen;
 616         int error;
 617
 618         while (xfersize > 0) {
 619                 tlen = min(xfersize, ZERO_REGION_SIZE);
 620                 saved_resid = uio->uio_resid;
 621                 error = vn_io_fault_uiomove(__DECONST(void *, zero_region),
 622                     tlen, uio);
 623                 if (error != 0)
 624                         return (error);
 625                 tlen = saved_resid - uio->uio_resid;
 626                 xfersize -= tlen;
 627                 *size -= tlen;
 628         }
 629         return (0);
 630 }
 631
 632 /*
 633  * Vnode op for reading.
 634  */
 635 static int
 636 ffs_read(ap)
 637         struct vop_read_args /* {
 638                 struct vnode *a_vp;
 639                 struct uio *a_uio;
 640                 int a_ioflag;
 641                 struct ucred *a_cred;
 642         } */ *ap;
 643 {
 644         struct vnode *vp;
 645         struct inode *ip;
 646         struct uio *uio;
 647         struct fs *fs;
 648         struct buf *bp;
 649         ufs_lbn_t lbn, nextlbn;
 650         off_t bytesinfile;
 651         long size, xfersize, blkoffset;
 652         ssize_t orig_resid;
 653         int bflag, error, ioflag, seqcount;
 654
 655         vp = ap->a_vp;
 656         uio = ap->a_uio;
 657         ioflag = ap->a_ioflag;
 658         if (ap->a_ioflag & IO_EXT)
 659 #ifdef notyet
 660                 return (ffs_extread(vp, uio, ioflag));
 661 #else
 662                 panic("ffs_read+IO_EXT");
 663 #endif
 664 #ifdef DIRECTIO
 665         if ((ioflag & IO_DIRECT) != 0) {
 666                 int workdone;
 667
 668                 error = ffs_rawread(vp, uio, &workdone);
 669                 if (error != 0 || workdone != 0)
 670                         return error;
 671         }
 672 #endif
 673
 674         seqcount = ap->a_ioflag >> IO_SEQSHIFT;
 675         ip = VTOI(vp);
 676
 677 #ifdef INVARIANTS
 678         if (uio->uio_rw != UIO_READ)
 679                 panic("ffs_read: mode");
 680
 681         if (vp->v_type == VLNK) {
 682                 if ((int)ip->i_size < VFSTOUFS(vp->v_mount)->um_maxsymlinklen)
 683                         panic("ffs_read: short symlink");
 684         } else if (vp->v_type != VREG && vp->v_type != VDIR)
 685                 panic("ffs_read: type %d",  vp->v_type);
 686 #endif
 687         orig_resid = uio->uio_resid;
 688         KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
 689         if (orig_resid == 0)
 690                 return (0);
 691         KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
 692         fs = ITOFS(ip);
 693         if (uio->uio_offset < ip->i_size &&
 694             uio->uio_offset >= fs->fs_maxfilesize)
 695                 return (EOVERFLOW);
 696
 697         bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE);
 698         for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
 699                 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
 700                         break;
 701                 lbn = lblkno(fs, uio->uio_offset);
 702                 nextlbn = lbn + 1;
 703
 704                 /*
 705                  * size of buffer.  The buffer representing the
 706                  * end of the file is rounded up to the size of
 707                  * the block type ( fragment or full block,
 708                  * depending ).
 709                  */
 710                 size = blksize(fs, ip, lbn);
 711                 blkoffset = blkoff(fs, uio->uio_offset);
 712
 713                 /*
 714                  * The amount we want to transfer in this iteration is
 715                  * one FS block less the amount of the data before
 716                  * our startpoint (duh!)
 717                  */
 718                 xfersize = fs->fs_bsize - blkoffset;
 719
 720                 /*
 721                  * But if we actually want less than the block,
 722                  * or the file doesn't have a whole block more of data,
 723                  * then use the lesser number.
 724                  */
 725                 if (uio->uio_resid < xfersize)
 726                         xfersize = uio->uio_resid;
 727                 if (bytesinfile < xfersize)
 728                         xfersize = bytesinfile;
 729
 730                 if (lblktosize(fs, nextlbn) >= ip->i_size) {
 731                         /*
 732                          * Don't do readahead if this is the end of the file.
 733                          */
 734                         error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
 735                 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 736                         /*
 737                          * Otherwise if we are allowed to cluster,
 738                          * grab as much as we can.
 739                          *
 740                          * XXX  This may not be a win if we are not
 741                          * doing sequential access.
 742                          */
 743                         error = cluster_read(vp, ip->i_size, lbn,
 744                             size, NOCRED, blkoffset + uio->uio_resid,
 745                             seqcount, bflag, &bp);
 746                 } else if (seqcount > 1) {
 747                         /*
 748                          * If we are NOT allowed to cluster, then
 749                          * if we appear to be acting sequentially,
 750                          * fire off a request for a readahead
 751                          * as well as a read. Note that the 4th and 5th
 752                          * arguments point to arrays of the size specified in
 753                          * the 6th argument.
 754                          */
 755                         u_int nextsize = blksize(fs, ip, nextlbn);
 756                         error = breadn_flags(vp, lbn, lbn, size, &nextlbn,
 757                             &nextsize, 1, NOCRED, bflag, NULL, &bp);
 758                 } else {
 759                         /*
 760                          * Failing all of the above, just read what the
 761                          * user asked for. Interestingly, the same as
 762                          * the first option above.
 763                          */
 764                         error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
 765                 }
 766                 if (error == EJUSTRETURN) {
 767                         error = ffs_read_hole(uio, xfersize, &size);
 768                         if (error == 0)
 769                                 continue;
 770                 }
 771                 if (error != 0) {
 772                         brelse(bp);
 773                         bp = NULL;
 774                         break;
 775                 }
 776
 777                 /*
 778                  * We should only get non-zero b_resid when an I/O error
 779                  * has occurred, which should cause us to break above.
 780                  * However, if the short read did not cause an error,
 781                  * then we want to ensure that we do not uiomove bad
 782                  * or uninitialized data.
 783                  */
 784                 size -= bp->b_resid;
 785                 if (size < xfersize) {
 786                         if (size == 0)
 787                                 break;
 788                         xfersize = size;
 789                 }
 790
 791                 if (buf_mapped(bp)) {
 792                         error = vn_io_fault_uiomove((char *)bp->b_data +
 793                             blkoffset, (int)xfersize, uio);
 794                 } else {
 795                         error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
 796                             (int)xfersize, uio);
 797                 }
 798                 if (error)
 799                         break;
 800
 801                 vfs_bio_brelse(bp, ioflag);
 802         }
 803
 804         /*
 805          * This can only happen in the case of an error
 806          * because the loop above resets bp to NULL on each iteration
 807          * and on normal completion has not set a new value into it.
 808          * so it must have come from a 'break' statement
 809          */
 810         if (bp != NULL)
 811                 vfs_bio_brelse(bp, ioflag);
 812
 813         if ((error == 0 || uio->uio_resid != orig_resid) &&
 814             (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
 815                 UFS_INODE_SET_FLAG_SHARED(ip, IN_ACCESS);
 816         return (error);
 817 }
 818
 819 /*
 820  * Vnode op for writing.
 821  */
 822 static int
 823 ffs_write(ap)
 824         struct vop_write_args /* {
 825                 struct vnode *a_vp;
 826                 struct uio *a_uio;
 827                 int a_ioflag;
 828                 struct ucred *a_cred;
 829         } */ *ap;
 830 {
 831         struct vnode *vp;
 832         struct uio *uio;
 833         struct inode *ip;
 834         struct fs *fs;
 835         struct buf *bp;
 836         ufs_lbn_t lbn;
 837         off_t osize;
 838         ssize_t resid;
 839         int seqcount;
 840         int blkoffset, error, flags, ioflag, size, xfersize;
 841
 842         vp = ap->a_vp;
 843         if (DOINGSUJ(vp))
 844                 softdep_prealloc(vp, MNT_WAIT);
 845         if (vp->v_data == NULL)
 846                 return (EBADF);
 847
 848         uio = ap->a_uio;
 849         ioflag = ap->a_ioflag;
 850         if (ap->a_ioflag & IO_EXT)
 851 #ifdef notyet
 852                 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
 853 #else
 854                 panic("ffs_write+IO_EXT");
 855 #endif
 856
 857         seqcount = ap->a_ioflag >> IO_SEQSHIFT;
 858         ip = VTOI(vp);
 859
 860 #ifdef INVARIANTS
 861         if (uio->uio_rw != UIO_WRITE)
 862                 panic("ffs_write: mode");
 863 #endif
 864
 865         switch (vp->v_type) {
 866         case VREG:
 867                 if (ioflag & IO_APPEND)
 868                         uio->uio_offset = ip->i_size;
 869                 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
 870                         return (EPERM);
 871                 /* FALLTHROUGH */
 872         case VLNK:
 873                 break;
 874         case VDIR:
 875                 panic("ffs_write: dir write");
 876                 break;
 877         default:
 878                 panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
 879                         (int)uio->uio_offset,
 880                         (int)uio->uio_resid
 881                 );
 882         }
 883
 884         KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
 885         KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
 886         fs = ITOFS(ip);
 887         if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
 888                 return (EFBIG);
 889         /*
 890          * Maybe this should be above the vnode op call, but so long as
 891          * file servers have no limits, I don't think it matters.
 892          */
 893         if (vn_rlimit_fsize(vp, uio, uio->uio_td))
 894                 return (EFBIG);
 895
 896         resid = uio->uio_resid;
 897         osize = ip->i_size;
 898         if (seqcount > BA_SEQMAX)
 899                 flags = BA_SEQMAX << BA_SEQSHIFT;
 900         else
 901                 flags = seqcount << BA_SEQSHIFT;
 902         if (ioflag & IO_SYNC)
 903                 flags |= IO_SYNC;
 904         flags |= BA_UNMAPPED;
 905
 906         for (error = 0; uio->uio_resid > 0;) {
 907                 lbn = lblkno(fs, uio->uio_offset);
 908                 blkoffset = blkoff(fs, uio->uio_offset);
 909                 xfersize = fs->fs_bsize - blkoffset;
 910                 if (uio->uio_resid < xfersize)
 911                         xfersize = uio->uio_resid;
 912                 if (uio->uio_offset + xfersize > ip->i_size)
 913                         vnode_pager_setsize(vp, uio->uio_offset + xfersize);
 914
 915                 /*
 916                  * We must perform a read-before-write if the transfer size
 917                  * does not cover the entire buffer.
 918                  */
 919                 if (fs->fs_bsize > xfersize)
 920                         flags |= BA_CLRBUF;
 921                 else
 922                         flags &= ~BA_CLRBUF;
 923 /* XXX is uio->uio_offset the right thing here? */
 924                 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
 925                     ap->a_cred, flags, &bp);
 926                 if (error != 0) {
 927                         vnode_pager_setsize(vp, ip->i_size);
 928                         break;
 929                 }
 930                 if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
 931                         bp->b_flags |= B_NOCACHE;
 932
 933                 if (uio->uio_offset + xfersize > ip->i_size) {
 934                         ip->i_size = uio->uio_offset + xfersize;
 935                         DIP_SET(ip, i_size, ip->i_size);
 936                         UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
 937                 }
 938
 939                 size = blksize(fs, ip, lbn) - bp->b_resid;
 940                 if (size < xfersize)
 941                         xfersize = size;
 942
 943                 if (buf_mapped(bp)) {
 944                         error = vn_io_fault_uiomove((char *)bp->b_data +
 945                             blkoffset, (int)xfersize, uio);
 946                 } else {
 947                         error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
 948                             (int)xfersize, uio);
 949                 }
 950                 /*
 951                  * If the buffer is not already filled and we encounter an
 952                  * error while trying to fill it, we have to clear out any
 953                  * garbage data from the pages instantiated for the buffer.
 954                  * If we do not, a failed uiomove() during a write can leave
 955                  * the prior contents of the pages exposed to a userland mmap.
 956                  *
 957                  * Note that we need only clear buffers with a transfer size
 958                  * equal to the block size because buffers with a shorter
 959                  * transfer size were cleared above by the call to UFS_BALLOC()
 960                  * with the BA_CLRBUF flag set.
 961                  *
 962                  * If the source region for uiomove identically mmaps the
 963                  * buffer, uiomove() performed the NOP copy, and the buffer
 964                  * content remains valid because the page fault handler
 965                  * validated the pages.
 966                  */
 967                 if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
 968                     fs->fs_bsize == xfersize)
 969                         vfs_bio_clrbuf(bp);
 970
 971                 vfs_bio_set_flags(bp, ioflag);
 972
 973                 /*
 974                  * If IO_SYNC each buffer is written synchronously.  Otherwise
 975                  * if we have a severe page deficiency write the buffer
 976                  * asynchronously.  Otherwise try to cluster, and if that
 977                  * doesn't do it then either do an async write (if O_DIRECT),
 978                  * or a delayed write (if not).
 979                  */
 980                 if (ioflag & IO_SYNC) {
 981                         (void)bwrite(bp);
 982                 } else if (vm_page_count_severe() ||
 983                             buf_dirty_count_severe() ||
 984                             (ioflag & IO_ASYNC)) {
 985                         bp->b_flags |= B_CLUSTEROK;
 986                         bawrite(bp);
 987                 } else if (xfersize + blkoffset == fs->fs_bsize) {
 988                         if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
 989                                 bp->b_flags |= B_CLUSTEROK;
 990                                 cluster_write(vp, bp, ip->i_size, seqcount,
 991                                     GB_UNMAPPED);
 992                         } else {
 993                                 bawrite(bp);
 994                         }
 995                 } else if (ioflag & IO_DIRECT) {
 996                         bp->b_flags |= B_CLUSTEROK;
 997                         bawrite(bp);
 998                 } else {
 999                         bp->b_flags |= B_CLUSTEROK;
1000                         bdwrite(bp);
1001                 }
1002                 if (error || xfersize == 0)
1003                         break;
1004                 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
1005         }
1006         /*
1007          * If we successfully wrote any data, and we are not the superuser
1008          * we clear the setuid and setgid bits as a precaution against
1009          * tampering.
1010          */
1011         if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
1012             ap->a_cred) {
1013                 if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) {
1014                         vn_seqc_write_begin(vp);
1015                         UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID));
1016                         DIP_SET(ip, i_mode, ip->i_mode);
1017                         vn_seqc_write_end(vp);
1018                 }
1019         }
1020         if (error) {
1021                 if (ioflag & IO_UNIT) {
1022                         (void)ffs_truncate(vp, osize,
1023                             IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
1024                         uio->uio_offset -= resid - uio->uio_resid;
1025                         uio->uio_resid = resid;
1026                 }
1027         } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) {
1028                 if (!(ioflag & IO_DATASYNC) ||
1029                     (ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA)))
1030                         error = ffs_update(vp, 1);
1031                 if (ffs_fsfail_cleanup(VFSTOUFS(vp->v_mount), error))
1032                         error = ENXIO;
1033         }
1034         return (error);
1035 }
1036
1037 /*
1038  * Extended attribute area reading.
1039  */
1040 static int
1041 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
1042 {
1043         struct inode *ip;
1044         struct ufs2_dinode *dp;
1045         struct fs *fs;
1046         struct buf *bp;
1047         ufs_lbn_t lbn, nextlbn;
1048         off_t bytesinfile;
1049         long size, xfersize, blkoffset;
1050         ssize_t orig_resid;
1051         int error;
1052
1053         ip = VTOI(vp);
1054         fs = ITOFS(ip);
1055         dp = ip->i_din2;
1056
1057 #ifdef INVARIANTS
1058         if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
1059                 panic("ffs_extread: mode");
1060
1061 #endif
1062         orig_resid = uio->uio_resid;
1063         KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
1064         if (orig_resid == 0)
1065                 return (0);
1066         KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));
1067
1068         for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
1069                 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
1070                         break;
1071                 lbn = lblkno(fs, uio->uio_offset);
1072                 nextlbn = lbn + 1;
1073
1074                 /*
1075                  * size of buffer.  The buffer representing the
1076                  * end of the file is rounded up to the size of
1077                  * the block type ( fragment or full block,
1078                  * depending ).
1079                  */
1080                 size = sblksize(fs, dp->di_extsize, lbn);
1081                 blkoffset = blkoff(fs, uio->uio_offset);
1082
1083                 /*
1084                  * The amount we want to transfer in this iteration is
1085                  * one FS block less the amount of the data before
1086                  * our startpoint (duh!)
1087                  */
1088                 xfersize = fs->fs_bsize - blkoffset;
1089
1090                 /*
1091                  * But if we actually want less than the block,
1092                  * or the file doesn't have a whole block more of data,
1093                  * then use the lesser number.
1094                  */
1095                 if (uio->uio_resid < xfersize)
1096                         xfersize = uio->uio_resid;
1097                 if (bytesinfile < xfersize)
1098                         xfersize = bytesinfile;
1099
1100                 if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
1101                         /*
1102                          * Don't do readahead if this is the end of the info.
1103                          */
1104                         error = bread(vp, -1 - lbn, size, NOCRED, &bp);
1105                 } else {
1106                         /*
1107                          * If we have a second block, then
1108                          * fire off a request for a readahead
1109                          * as well as a read. Note that the 4th and 5th
1110                          * arguments point to arrays of the size specified in
1111                          * the 6th argument.
1112                          */
1113                         u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
1114
1115                         nextlbn = -1 - nextlbn;
1116                         error = breadn(vp, -1 - lbn,
1117                             size, &nextlbn, &nextsize, 1, NOCRED, &bp);
1118                 }
1119                 if (error) {
1120                         brelse(bp);
1121                         bp = NULL;
1122                         break;
1123                 }
1124
1125                 /*
1126                  * We should only get non-zero b_resid when an I/O error
1127                  * has occurred, which should cause us to break above.
1128                  * However, if the short read did not cause an error,
1129                  * then we want to ensure that we do not uiomove bad
1130                  * or uninitialized data.
1131                  */
1132                 size -= bp->b_resid;
1133                 if (size < xfersize) {
1134                         if (size == 0)
1135                                 break;
1136                         xfersize = size;
1137                 }
1138
1139                 error = uiomove((char *)bp->b_data + blkoffset,
1140                                         (int)xfersize, uio);
1141                 if (error)
1142                         break;
1143                 vfs_bio_brelse(bp, ioflag);
1144         }
1145
1146         /*
1147          * This can only happen in the case of an error
1148          * because the loop above resets bp to NULL on each iteration
1149          * and on normal completion has not set a new value into it.
1150          * so it must have come from a 'break' statement
1151          */
1152         if (bp != NULL)
1153                 vfs_bio_brelse(bp, ioflag);
1154         return (error);
1155 }
1156
1157 /*
1158  * Extended attribute area writing.
1159  */
1160 static int
1161 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1162 {
1163         struct inode *ip;
1164         struct ufs2_dinode *dp;
1165         struct fs *fs;
1166         struct buf *bp;
1167         ufs_lbn_t lbn;
1168         off_t osize;
1169         ssize_t resid;
1170         int blkoffset, error, flags, size, xfersize;
1171
1172         ip = VTOI(vp);
1173         fs = ITOFS(ip);
1174         dp = ip->i_din2;
1175
1176 #ifdef INVARIANTS
1177         if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1178                 panic("ffs_extwrite: mode");
1179 #endif
1180
1181         if (ioflag & IO_APPEND)
1182                 uio->uio_offset = dp->di_extsize;
1183         KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
1184         KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
1185         if ((uoff_t)uio->uio_offset + uio->uio_resid >
1186             UFS_NXADDR * fs->fs_bsize)
1187                 return (EFBIG);
1188
1189         resid = uio->uio_resid;
1190         osize = dp->di_extsize;
1191         flags = IO_EXT;
1192         if (ioflag & IO_SYNC)
1193                 flags |= IO_SYNC;
1194
1195         for (error = 0; uio->uio_resid > 0;) {
1196                 lbn = lblkno(fs, uio->uio_offset);
1197                 blkoffset = blkoff(fs, uio->uio_offset);
1198                 xfersize = fs->fs_bsize - blkoffset;
1199                 if (uio->uio_resid < xfersize)
1200                         xfersize = uio->uio_resid;
1201
1202                 /*
1203                  * We must perform a read-before-write if the transfer size
1204                  * does not cover the entire buffer.
1205                  */
1206                 if (fs->fs_bsize > xfersize)
1207                         flags |= BA_CLRBUF;
1208                 else
1209                         flags &= ~BA_CLRBUF;
1210                 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1211                     ucred, flags, &bp);
1212                 if (error != 0)
1213                         break;
1214                 /*
1215                  * If the buffer is not valid we have to clear out any
1216                  * garbage data from the pages instantiated for the buffer.
1217                  * If we do not, a failed uiomove() during a write can leave
1218                  * the prior contents of the pages exposed to a userland
1219                  * mmap().  XXX deal with uiomove() errors a better way.
1220                  */
1221                 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1222                         vfs_bio_clrbuf(bp);
1223
1224                 if (uio->uio_offset + xfersize > dp->di_extsize) {
1225                         dp->di_extsize = uio->uio_offset + xfersize;
1226                         UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
1227                 }
1228
1229                 size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1230                 if (size < xfersize)
1231                         xfersize = size;
1232
1233                 error =
1234                     uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1235
1236                 vfs_bio_set_flags(bp, ioflag);
1237
1238                 /*
1239                  * If IO_SYNC each buffer is written synchronously.  Otherwise
1240                  * if we have a severe page deficiency write the buffer
1241                  * asynchronously.  Otherwise try to cluster, and if that
1242                  * doesn't do it then either do an async write (if O_DIRECT),
1243                  * or a delayed write (if not).
1244                  */
1245                 if (ioflag & IO_SYNC) {
1246                         (void)bwrite(bp);
1247                 } else if (vm_page_count_severe() ||
1248                             buf_dirty_count_severe() ||
1249                             xfersize + blkoffset == fs->fs_bsize ||
1250                             (ioflag & (IO_ASYNC | IO_DIRECT)))
1251                         bawrite(bp);
1252                 else
1253                         bdwrite(bp);
1254                 if (error || xfersize == 0)
1255                         break;
1256                 UFS_INODE_SET_FLAG(ip, IN_CHANGE);
1257         }
1258         /*
1259          * If we successfully wrote any data, and we are not the superuser
1260          * we clear the setuid and setgid bits as a precaution against
1261          * tampering.
1262          */
1263         if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
1264                 if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) {
1265                         vn_seqc_write_begin(vp);
1266                         UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID));
1267                         dp->di_mode = ip->i_mode;
1268                         vn_seqc_write_end(vp);
1269                 }
1270         }
1271         if (error) {
1272                 if (ioflag & IO_UNIT) {
1273                         (void)ffs_truncate(vp, osize,
1274                             IO_EXT | (ioflag&IO_SYNC), ucred);
1275                         uio->uio_offset -= resid - uio->uio_resid;
1276                         uio->uio_resid = resid;
1277                 }
1278         } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1279                 error = ffs_update(vp, 1);
1280         return (error);
1281 }
1282
1283 /*
1284  * Vnode operating to retrieve a named extended attribute.
1285  *
1286  * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1287  * the length of the EA, and possibly the pointer to the entry and to the data.
1288  */
1289 static int
1290 ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
1291     struct extattr **eapp, u_char **eac)
1292 {
1293         struct extattr *eap, *eaend;
1294         size_t nlen;
1295
1296         nlen = strlen(name);
1297         KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned"));
1298         eap = (struct extattr *)ptr;
1299         eaend = (struct extattr *)(ptr + length);
1300         for (; eap < eaend; eap = EXTATTR_NEXT(eap)) {
1301                 KASSERT(EXTATTR_NEXT(eap) <= eaend,
1302                     ("extattr next %p beyond %p", EXTATTR_NEXT(eap), eaend));
1303                 if (eap->ea_namespace != nspace || eap->ea_namelength != nlen
1304                     || memcmp(eap->ea_name, name, nlen) != 0)
1305                         continue;
1306                 if (eapp != NULL)
1307                         *eapp = eap;
1308                 if (eac != NULL)
1309                         *eac = EXTATTR_CONTENT(eap);
1310                 return (EXTATTR_CONTENT_SIZE(eap));
1311         }
1312         return (-1);
1313 }
1314
1315 static int
1316 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td)
1317 {
1318         const struct extattr *eap, *eaend, *eapnext;
1319         struct inode *ip;
1320         struct ufs2_dinode *dp;
1321         struct fs *fs;
1322         struct uio luio;
1323         struct iovec liovec;
1324         u_int easize;
1325         int error;
1326         u_char *eae;
1327
1328         ip = VTOI(vp);
1329         fs = ITOFS(ip);
1330         dp = ip->i_din2;
1331         easize = dp->di_extsize;
1332         if ((uoff_t)easize > UFS_NXADDR * fs->fs_bsize)
1333                 return (EFBIG);
1334
1335         eae = malloc(easize, M_TEMP, M_WAITOK);
1336
1337         liovec.iov_base = eae;
1338         liovec.iov_len = easize;
1339         luio.uio_iov = &liovec;
1340         luio.uio_iovcnt = 1;
1341         luio.uio_offset = 0;
1342         luio.uio_resid = easize;
1343         luio.uio_segflg = UIO_SYSSPACE;
1344         luio.uio_rw = UIO_READ;
1345         luio.uio_td = td;
1346
1347         error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1348         if (error) {
1349                 free(eae, M_TEMP);
1350                 return (error);
1351         }
1352         /* Validate disk xattrfile contents. */
1353         for (eap = (void *)eae, eaend = (void *)(eae + easize); eap < eaend;
1354             eap = eapnext) {
1355                 /* Detect zeroed out tail */
1356                 if (eap->ea_length < sizeof(*eap) || eap->ea_length == 0) {
1357                         easize = (const u_char *)eap - eae;
1358                         break;
1359                 }
1360
1361                 eapnext = EXTATTR_NEXT(eap);
1362                 /* Bogusly long entry. */
1363                 if (eapnext > eaend) {
1364                         free(eae, M_TEMP);
1365                         return (EINTEGRITY);
1366                 }
1367         }
1368         ip->i_ea_len = easize;
1369         *p = eae;
1370         return (0);
1371 }
1372
1373 static void
1374 ffs_lock_ea(struct vnode *vp)
1375 {
1376         struct inode *ip;
1377
1378         ip = VTOI(vp);
1379         VI_LOCK(vp);
1380         while (ip->i_flag & IN_EA_LOCKED) {
1381                 UFS_INODE_SET_FLAG(ip, IN_EA_LOCKWAIT);
1382                 msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
1383                     0);
1384         }
1385         UFS_INODE_SET_FLAG(ip, IN_EA_LOCKED);
1386         VI_UNLOCK(vp);
1387 }
1388
1389 static void
1390 ffs_unlock_ea(struct vnode *vp)
1391 {
1392         struct inode *ip;
1393
1394         ip = VTOI(vp);
1395         VI_LOCK(vp);
1396         if (ip->i_flag & IN_EA_LOCKWAIT)
1397                 wakeup(&ip->i_ea_refs);
1398         ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
1399         VI_UNLOCK(vp);
1400 }
1401
1402 static int
1403 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1404 {
1405         struct inode *ip;
1406         int error;
1407
1408         ip = VTOI(vp);
1409
1410         ffs_lock_ea(vp);
1411         if (ip->i_ea_area != NULL) {
1412                 ip->i_ea_refs++;
1413                 ffs_unlock_ea(vp);
1414                 return (0);
1415         }
1416         error = ffs_rdextattr(&ip->i_ea_area, vp, td);
1417         if (error) {
1418                 ffs_unlock_ea(vp);
1419                 return (error);
1420         }
1421         ip->i_ea_error = 0;
1422         ip->i_ea_refs++;
1423         ffs_unlock_ea(vp);
1424         return (0);
1425 }
1426
1427 /*
1428  * Vnode extattr transaction commit/abort
1429  */
1430 static int
1431 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1432 {
1433         struct inode *ip;
1434         struct uio luio;
1435         struct iovec *liovec;
1436         struct ufs2_dinode *dp;
1437         size_t ea_len, tlen;
1438         int error, i, lcnt;
1439         bool truncate;
1440
1441         ip = VTOI(vp);
1442
1443         ffs_lock_ea(vp);
1444         if (ip->i_ea_area == NULL) {
1445                 ffs_unlock_ea(vp);
1446                 return (EINVAL);
1447         }
1448         dp = ip->i_din2;
1449         error = ip->i_ea_error;
1450         truncate = false;
1451         if (commit && error == 0) {
1452                 ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
1453                 if (cred == NOCRED)
1454                         cred =  vp->v_mount->mnt_cred;
1455
1456                 ea_len = MAX(ip->i_ea_len, dp->di_extsize);
1457                 for (lcnt = 1, tlen = ea_len - ip->i_ea_len; tlen > 0;) {
1458                         tlen -= MIN(ZERO_REGION_SIZE, tlen);
1459                         lcnt++;
1460                 }
1461
1462                 liovec = __builtin_alloca(lcnt * sizeof(struct iovec));
1463                 luio.uio_iovcnt = lcnt;
1464
1465                 liovec[0].iov_base = ip->i_ea_area;
1466                 liovec[0].iov_len = ip->i_ea_len;
1467                 for (i = 1, tlen = ea_len - ip->i_ea_len; i < lcnt; i++) {
1468                         liovec[i].iov_base = __DECONST(void *, zero_region);
1469                         liovec[i].iov_len = MIN(ZERO_REGION_SIZE, tlen);
1470                         tlen -= liovec[i].iov_len;
1471                 }
1472                 MPASS(tlen == 0);
1473
1474                 luio.uio_iov = liovec;
1475                 luio.uio_offset = 0;
1476                 luio.uio_resid = ea_len;
1477                 luio.uio_segflg = UIO_SYSSPACE;
1478                 luio.uio_rw = UIO_WRITE;
1479                 luio.uio_td = td;
1480                 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1481                 if (error == 0 && ip->i_ea_len == 0)
1482                         truncate = true;
1483         }
1484         if (--ip->i_ea_refs == 0) {
1485                 free(ip->i_ea_area, M_TEMP);
1486                 ip->i_ea_area = NULL;
1487                 ip->i_ea_len = 0;
1488                 ip->i_ea_error = 0;
1489         }
1490         ffs_unlock_ea(vp);
1491
1492         if (truncate)
1493                 ffs_truncate(vp, 0, IO_EXT, cred);
1494         return (error);
1495 }
1496
1497 /*
1498  * Vnode extattr strategy routine for fifos.
1499  *
1500  * We need to check for a read or write of the external attributes.
1501  * Otherwise we just fall through and do the usual thing.
1502  */
1503 static int
1504 ffsext_strategy(struct vop_strategy_args *ap)
1505 /*
1506 struct vop_strategy_args {
1507         struct vnodeop_desc *a_desc;
1508         struct vnode *a_vp;
1509         struct buf *a_bp;
1510 };
1511 */
1512 {
1513         struct vnode *vp;
1514         daddr_t lbn;
1515
1516         vp = ap->a_vp;
1517         lbn = ap->a_bp->b_lblkno;
1518         if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR)
1519                 return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
1520         if (vp->v_type == VFIFO)
1521                 return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
1522         panic("spec nodes went here");
1523 }
1524
1525 /*
1526  * Vnode extattr transaction commit/abort
1527  */
1528 static int
1529 ffs_openextattr(struct vop_openextattr_args *ap)
1530 /*
1531 struct vop_openextattr_args {
1532         struct vnodeop_desc *a_desc;
1533         struct vnode *a_vp;
1534         IN struct ucred *a_cred;
1535         IN struct thread *a_td;
1536 };
1537 */
1538 {
1539
1540         if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1541                 return (EOPNOTSUPP);
1542
1543         return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1544 }
1545
1546 /*
1547  * Vnode extattr transaction commit/abort
1548  */
1549 static int
1550 ffs_closeextattr(struct vop_closeextattr_args *ap)
1551 /*
1552 struct vop_closeextattr_args {
1553         struct vnodeop_desc *a_desc;
1554         struct vnode *a_vp;
1555         int a_commit;
1556         IN struct ucred *a_cred;
1557         IN struct thread *a_td;
1558 };
1559 */
1560 {
1561         struct vnode *vp;
1562
1563         vp = ap->a_vp;
1564         if (vp->v_type == VCHR || vp->v_type == VBLK)
1565                 return (EOPNOTSUPP);
1566         if (ap->a_commit && (vp->v_mount->mnt_flag & MNT_RDONLY) != 0)
1567                 return (EROFS);
1568
1569         if (ap->a_commit && DOINGSUJ(vp)) {
1570                 ASSERT_VOP_ELOCKED(vp, "ffs_closeextattr commit");
1571                 softdep_prealloc(vp, MNT_WAIT);
1572                 if (vp->v_data == NULL)
1573                         return (EBADF);
1574         }
1575         return (ffs_close_ea(vp, ap->a_commit, ap->a_cred, ap->a_td));
1576 }
1577
1578 /*
1579  * Vnode operation to remove a named attribute.
1580  */
1581 static int
1582 ffs_deleteextattr(struct vop_deleteextattr_args *ap)
1583 /*
1584 vop_deleteextattr {
1585         IN struct vnode *a_vp;
1586         IN int a_attrnamespace;
1587         IN const char *a_name;
1588         IN struct ucred *a_cred;
1589         IN struct thread *a_td;
1590 };
1591 */
1592 {
1593         struct vnode *vp;
1594         struct inode *ip;
1595         struct extattr *eap;
1596         uint32_t ul;
1597         int olen, error, i, easize;
1598         u_char *eae;
1599         void *tmp;
1600
1601         vp = ap->a_vp;
1602         ip = VTOI(vp);
1603
1604         if (vp->v_type == VCHR || vp->v_type == VBLK)
1605                 return (EOPNOTSUPP);
1606         if (strlen(ap->a_name) == 0)
1607                 return (EINVAL);
1608         if (vp->v_mount->mnt_flag & MNT_RDONLY)
1609                 return (EROFS);
1610
1611         error = extattr_check_cred(vp, ap->a_attrnamespace,
1612             ap->a_cred, ap->a_td, VWRITE);
1613         if (error) {
1614                 /*
1615                  * ffs_lock_ea is not needed there, because the vnode
1616                  * must be exclusively locked.
1617                  */
1618                 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1619                         ip->i_ea_error = error;
1620                 return (error);
1621         }
1622
1623         if (DOINGSUJ(vp)) {
1624                 ASSERT_VOP_ELOCKED(vp, "ffs_deleteextattr");
1625                 softdep_prealloc(vp, MNT_WAIT);
1626                 if (vp->v_data == NULL)
1627                         return (EBADF);
1628         }
1629
1630         error = ffs_open_ea(vp, ap->a_cred, ap->a_td);
1631         if (error)
1632                 return (error);
1633
1634         /* CEM: delete could be done in-place instead */
1635         eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
1636         bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1637         easize = ip->i_ea_len;
1638
1639         olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1640             &eap, NULL);
1641         if (olen == -1) {
1642                 /* delete but nonexistent */
1643                 free(eae, M_TEMP);
1644                 ffs_close_ea(vp, 0, ap->a_cred, ap->a_td);
1645                 return (ENOATTR);
1646         }
1647         ul = eap->ea_length;
1648         i = (u_char *)EXTATTR_NEXT(eap) - eae;
1649         bcopy(EXTATTR_NEXT(eap), eap, easize - i);
1650         easize -= ul;
1651
1652         tmp = ip->i_ea_area;
1653         ip->i_ea_area = eae;
1654         ip->i_ea_len = easize;
1655         free(tmp, M_TEMP);
1656         error = ffs_close_ea(vp, 1, ap->a_cred, ap->a_td);
1657         return (error);
1658 }
1659
1660 /*
1661  * Vnode operation to retrieve a named extended attribute.
1662  */
1663 static int
1664 ffs_getextattr(struct vop_getextattr_args *ap)
1665 /*
1666 vop_getextattr {
1667         IN struct vnode *a_vp;
1668         IN int a_attrnamespace;
1669         IN const char *a_name;
1670         INOUT struct uio *a_uio;
1671         OUT size_t *a_size;
1672         IN struct ucred *a_cred;
1673         IN struct thread *a_td;
1674 };
1675 */
1676 {
1677         struct inode *ip;
1678         u_char *eae, *p;
1679         unsigned easize;
1680         int error, ealen;
1681
1682         ip = VTOI(ap->a_vp);
1683
1684         if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1685                 return (EOPNOTSUPP);
1686
1687         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1688             ap->a_cred, ap->a_td, VREAD);
1689         if (error)
1690                 return (error);
1691
1692         error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1693         if (error)
1694                 return (error);
1695
1696         eae = ip->i_ea_area;
1697         easize = ip->i_ea_len;
1698
1699         ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1700             NULL, &p);
1701         if (ealen >= 0) {
1702                 error = 0;
1703                 if (ap->a_size != NULL)
1704                         *ap->a_size = ealen;
1705                 else if (ap->a_uio != NULL)
1706                         error = uiomove(p, ealen, ap->a_uio);
1707         } else
1708                 error = ENOATTR;
1709
1710         ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1711         return (error);
1712 }
1713
1714 /*
1715  * Vnode operation to retrieve extended attributes on a vnode.
1716  */
1717 static int
1718 ffs_listextattr(struct vop_listextattr_args *ap)
1719 /*
1720 vop_listextattr {
1721         IN struct vnode *a_vp;
1722         IN int a_attrnamespace;
1723         INOUT struct uio *a_uio;
1724         OUT size_t *a_size;
1725         IN struct ucred *a_cred;
1726         IN struct thread *a_td;
1727 };
1728 */
1729 {
1730         struct inode *ip;
1731         struct extattr *eap, *eaend;
1732         int error, ealen;
1733
1734         ip = VTOI(ap->a_vp);
1735
1736         if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1737                 return (EOPNOTSUPP);
1738
1739         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1740             ap->a_cred, ap->a_td, VREAD);
1741         if (error)
1742                 return (error);
1743
1744         error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1745         if (error)
1746                 return (error);
1747
1748         error = 0;
1749         if (ap->a_size != NULL)
1750                 *ap->a_size = 0;
1751
1752         KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned"));
1753         eap = (struct extattr *)ip->i_ea_area;
1754         eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len);
1755         for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) {
1756                 KASSERT(EXTATTR_NEXT(eap) <= eaend,
1757                     ("extattr next %p beyond %p", EXTATTR_NEXT(eap), eaend));
1758                 if (eap->ea_namespace != ap->a_attrnamespace)
1759                         continue;
1760
1761                 ealen = eap->ea_namelength;
1762                 if (ap->a_size != NULL)
1763                         *ap->a_size += ealen + 1;
1764                 else if (ap->a_uio != NULL)
1765                         error = uiomove(&eap->ea_namelength, ealen + 1,
1766                             ap->a_uio);
1767         }
1768
1769         ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1770         return (error);
1771 }
1772
1773 /*
1774  * Vnode operation to set a named attribute.
1775  */
1776 static int
1777 ffs_setextattr(struct vop_setextattr_args *ap)
1778 /*
1779 vop_setextattr {
1780         IN struct vnode *a_vp;
1781         IN int a_attrnamespace;
1782         IN const char *a_name;
1783         INOUT struct uio *a_uio;
1784         IN struct ucred *a_cred;
1785         IN struct thread *a_td;
1786 };
1787 */
1788 {
1789         struct vnode *vp;
1790         struct inode *ip;
1791         struct fs *fs;
1792         struct extattr *eap;
1793         uint32_t ealength, ul;
1794         ssize_t ealen;
1795         int olen, eapad1, eapad2, error, i, easize;
1796         u_char *eae;
1797         void *tmp;
1798
1799         vp = ap->a_vp;
1800         ip = VTOI(vp);
1801         fs = ITOFS(ip);
1802
1803         if (vp->v_type == VCHR || vp->v_type == VBLK)
1804                 return (EOPNOTSUPP);
1805         if (strlen(ap->a_name) == 0)
1806                 return (EINVAL);
1807
1808         /* XXX Now unsupported API to delete EAs using NULL uio. */
1809         if (ap->a_uio == NULL)
1810                 return (EOPNOTSUPP);
1811
1812         if (vp->v_mount->mnt_flag & MNT_RDONLY)
1813                 return (EROFS);
1814
1815         ealen = ap->a_uio->uio_resid;
1816         if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR))
1817                 return (EINVAL);
1818
1819         error = extattr_check_cred(vp, ap->a_attrnamespace,
1820             ap->a_cred, ap->a_td, VWRITE);
1821         if (error) {
1822                 /*
1823                  * ffs_lock_ea is not needed there, because the vnode
1824                  * must be exclusively locked.
1825                  */
1826                 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1827                         ip->i_ea_error = error;
1828                 return (error);
1829         }
1830
1831         if (DOINGSUJ(vp)) {
1832                 ASSERT_VOP_ELOCKED(vp, "ffs_deleteextattr");
1833                 softdep_prealloc(vp, MNT_WAIT);
1834                 if (vp->v_data == NULL)
1835                         return (EBADF);
1836         }
1837
1838         error = ffs_open_ea(vp, ap->a_cred, ap->a_td);
1839         if (error)
1840                 return (error);
1841
1842         ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1843         eapad1 = roundup2(ealength, 8) - ealength;
1844         eapad2 = roundup2(ealen, 8) - ealen;
1845         ealength += eapad1 + ealen + eapad2;
1846
1847         /*
1848          * CEM: rewrites of the same size or smaller could be done in-place
1849          * instead.  (We don't acquire any fine-grained locks in here either,
1850          * so we could also do bigger writes in-place.)
1851          */
1852         eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1853         bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1854         easize = ip->i_ea_len;
1855
1856         olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1857             &eap, NULL);
1858         if (olen == -1) {
1859                 /* new, append at end */
1860                 KASSERT(ALIGNED_TO(eae + easize, struct extattr),
1861                     ("unaligned"));
1862                 eap = (struct extattr *)(eae + easize);
1863                 easize += ealength;
1864         } else {
1865                 ul = eap->ea_length;
1866                 i = (u_char *)EXTATTR_NEXT(eap) - eae;
1867                 if (ul != ealength) {
1868                         bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength,
1869                             easize - i);
1870                         easize += (ealength - ul);
1871                 }
1872         }
1873         if (easize > lblktosize(fs, UFS_NXADDR)) {
1874                 free(eae, M_TEMP);
1875                 ffs_close_ea(vp, 0, ap->a_cred, ap->a_td);
1876                 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1877                         ip->i_ea_error = ENOSPC;
1878                 return (ENOSPC);
1879         }
1880         eap->ea_length = ealength;
1881         eap->ea_namespace = ap->a_attrnamespace;
1882         eap->ea_contentpadlen = eapad2;
1883         eap->ea_namelength = strlen(ap->a_name);
1884         memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name));
1885         bzero(&eap->ea_name[strlen(ap->a_name)], eapad1);
1886         error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio);
1887         if (error) {
1888                 free(eae, M_TEMP);
1889                 ffs_close_ea(vp, 0, ap->a_cred, ap->a_td);
1890                 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1891                         ip->i_ea_error = error;
1892                 return (error);
1893         }
1894         bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2);
1895
1896         tmp = ip->i_ea_area;
1897         ip->i_ea_area = eae;
1898         ip->i_ea_len = easize;
1899         free(tmp, M_TEMP);
1900         error = ffs_close_ea(vp, 1, ap->a_cred, ap->a_td);
1901         return (error);
1902 }
1903
1904 /*
1905  * Vnode pointer to File handle
1906  */
1907 static int
1908 ffs_vptofh(struct vop_vptofh_args *ap)
1909 /*
1910 vop_vptofh {
1911         IN struct vnode *a_vp;
1912         IN struct fid *a_fhp;
1913 };
1914 */
1915 {
1916         struct inode *ip;
1917         struct ufid *ufhp;
1918
1919         ip = VTOI(ap->a_vp);
1920         ufhp = (struct ufid *)ap->a_fhp;
1921         ufhp->ufid_len = sizeof(struct ufid);
1922         ufhp->ufid_ino = ip->i_number;
1923         ufhp->ufid_gen = ip->i_gen;
1924         return (0);
1925 }
1926
1927 SYSCTL_DECL(_vfs_ffs);
1928 static int use_buf_pager = 1;
1929 SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
1930     "Always use buffer pager instead of bmap");
1931
1932 static daddr_t
1933 ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
1934 {
1935
1936         return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
1937 }
1938
1939 static int
1940 ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn, long *sz)
1941 {
1942
1943         *sz = blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn);
1944         return (0);
1945 }
1946
1947 static int
1948 ffs_getpages(struct vop_getpages_args *ap)
1949 {
1950         struct vnode *vp;
1951         struct ufsmount *um;
1952
1953         vp = ap->a_vp;
1954         um = VFSTOUFS(vp->v_mount);
1955
1956         if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
1957                 return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1958                     ap->a_rbehind, ap->a_rahead, NULL, NULL));
1959         return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
1960             ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
1961 }
1962
1963 static int
1964 ffs_getpages_async(struct vop_getpages_async_args *ap)
1965 {
1966         struct vnode *vp;
1967         struct ufsmount *um;
1968         bool do_iodone;
1969         int error;
1970
1971         vp = ap->a_vp;
1972         um = VFSTOUFS(vp->v_mount);
1973         do_iodone = true;
1974
1975         if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) {
1976                 error = vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1977                     ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg);
1978                 if (error == 0)
1979                         do_iodone = false;
1980         } else {
1981                 error = vfs_bio_getpages(vp, ap->a_m, ap->a_count,
1982                     ap->a_rbehind, ap->a_rahead, ffs_gbp_getblkno,
1983                     ffs_gbp_getblksz);
1984         }
1985         if (do_iodone && ap->a_iodone != NULL)
1986                 ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
1987
1988         return (error);
1989 }
1990
1991 static int
1992 ffs_vput_pair(struct vop_vput_pair_args *ap)
1993 {
1994         struct mount *mp;
1995         struct vnode *dvp, *vp, *vp1, **vpp;
1996         struct inode *dp, *ip;
1997         ino_t ip_ino;
1998         u_int64_t ip_gen;
1999         int error, vp_locked;
2000
2001         dvp = ap->a_dvp;
2002         dp = VTOI(dvp);
2003         vpp = ap->a_vpp;
2004         vp = vpp != NULL ? *vpp : NULL;
2005
2006         if ((dp->i_flag & (IN_NEEDSYNC | IN_ENDOFF)) == 0) {
2007                 vput(dvp);
2008                 if (vp != NULL && ap->a_unlock_vp)
2009                         vput(vp);
2010                 return (0);
2011         }
2012
2013         mp = dvp->v_mount;
2014         if (vp != NULL) {
2015                 if (ap->a_unlock_vp) {
2016                         vput(vp);
2017                 } else {
2018                         MPASS(vp->v_type != VNON);
2019                         vp_locked = VOP_ISLOCKED(vp);
2020                         ip = VTOI(vp);
2021                         ip_ino = ip->i_number;
2022                         ip_gen = ip->i_gen;
2023                         VOP_UNLOCK(vp);
2024                 }
2025         }
2026
2027         /*
2028          * If compaction or fsync was requested do it in ffs_vput_pair()
2029          * now that other locks are no longer held.
2030          */
2031         if ((dp->i_flag & IN_ENDOFF) != 0) {
2032                 VNASSERT(I_ENDOFF(dp) != 0 && I_ENDOFF(dp) < dp->i_size, dvp,
2033                     ("IN_ENDOFF set but I_ENDOFF() is not"));
2034                 dp->i_flag &= ~IN_ENDOFF;
2035                 error = UFS_TRUNCATE(dvp, (off_t)I_ENDOFF(dp), IO_NORMAL |
2036                     (DOINGASYNC(dvp) ? 0 : IO_SYNC), curthread->td_ucred);
2037                 if (error != 0 && error != ERELOOKUP) {
2038                         if (!ffs_fsfail_cleanup(VFSTOUFS(mp), error)) {
2039                                 vn_printf(dvp,
2040                                     "IN_ENDOFF: failed to truncate, "
2041                                     "error %d\n", error);
2042                         }
2043 #ifdef UFS_DIRHASH
2044                         ufsdirhash_free(dp);
2045 #endif
2046                 }
2047                 SET_I_ENDOFF(dp, 0);
2048         }
2049         if ((dp->i_flag & IN_NEEDSYNC) != 0) {
2050                 do {
2051                         error = ffs_syncvnode(dvp, MNT_WAIT, 0);
2052                 } while (error == ERELOOKUP);
2053         }
2054
2055         vput(dvp);
2056
2057         if (vp == NULL || ap->a_unlock_vp)
2058                 return (0);
2059         MPASS(mp != NULL);
2060
2061         /*
2062          * It is possible that vp is reclaimed at this point. Only
2063          * routines that call us with a_unlock_vp == false can find
2064          * that their vp has been reclaimed. There are three areas
2065          * that are affected:
2066          * 1) vn_open_cred() - later VOPs could fail, but
2067          *    dead_open() returns 0 to simulate successful open.
2068          * 2) ffs_snapshot() - creation of snapshot fails with EBADF.
2069          * 3) NFS server (several places) - code is prepared to detect
2070          *    and respond to dead vnodes by returning ESTALE.
2071          */
2072         VOP_LOCK(vp, vp_locked | LK_RETRY);
2073         if (IS_UFS(vp))
2074                 return (0);
2075
2076         /*
2077          * Try harder to recover from reclaimed vp if reclaim was not
2078          * because underlying inode was cleared.  We saved inode
2079          * number and inode generation, so we can try to reinstantiate
2080          * exactly same version of inode.  If this fails, return
2081          * original doomed vnode and let caller to handle
2082          * consequences.
2083          *
2084          * Note that callers must keep write started around
2085          * VOP_VPUT_PAIR() calls, so it is safe to use mp without
2086          * busying it.
2087          */
2088         VOP_UNLOCK(vp);
2089         error = ffs_inotovp(mp, ip_ino, ip_gen, LK_EXCLUSIVE, &vp1,
2090             FFSV_REPLACE_DOOMED);
2091         if (error != 0) {
2092                 VOP_LOCK(vp, vp_locked | LK_RETRY);
2093         } else {
2094                 vrele(vp);
2095                 *vpp = vp1;
2096         }
2097         return (error);
2098 }