sys/ufs/ffs/ffs_vnops.c

   1 /*-
   2  * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
   3  * All rights reserved.
   4  *
   5  * This software was developed for the FreeBSD Project by Marshall
   6  * Kirk McKusick and Network Associates Laboratories, the Security
   7  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
   8  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
   9  * research program
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in the
  18  *    documentation and/or other materials provided with the distribution.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30  * SUCH DAMAGE.
  31  *
  32  * Copyright (c) 1982, 1986, 1989, 1993
  33  *      The Regents of the University of California.  All rights reserved.
  34  *
  35  * Redistribution and use in source and binary forms, with or without
  36  * modification, are permitted provided that the following conditions
  37  * are met:
  38  * 1. Redistributions of source code must retain the above copyright
  39  *    notice, this list of conditions and the following disclaimer.
  40  * 2. Redistributions in binary form must reproduce the above copyright
  41  *    notice, this list of conditions and the following disclaimer in the
  42  *    documentation and/or other materials provided with the distribution.
  43  * 4. Neither the name of the University nor the names of its contributors
  44  *    may be used to endorse or promote products derived from this software
  45  *    without specific prior written permission.
  46  *
  47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  57  * SUCH DAMAGE.
  58  *
  59  *      from: @(#)ufs_readwrite.c       8.11 (Berkeley) 5/8/95
  60  * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
  61  *      @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95
  62  */
  63
  64 #include <sys/cdefs.h>
  65 __FBSDID("$FreeBSD$");
  66
  67 #include <sys/param.h>
  68 #include <sys/bio.h>
  69 #include <sys/systm.h>
  70 #include <sys/buf.h>
  71 #include <sys/conf.h>
  72 #include <sys/extattr.h>
  73 #include <sys/kernel.h>
  74 #include <sys/limits.h>
  75 #include <sys/malloc.h>
  76 #include <sys/mount.h>
  77 #include <sys/priv.h>
  78 #include <sys/proc.h>
  79 #include <sys/resourcevar.h>
  80 #include <sys/signalvar.h>
  81 #include <sys/stat.h>
  82 #include <sys/vmmeter.h>
  83 #include <sys/vnode.h>
  84
  85 #include <vm/vm.h>
  86 #include <vm/vm_extern.h>
  87 #include <vm/vm_object.h>
  88 #include <vm/vm_page.h>
  89 #include <vm/vm_pager.h>
  90 #include <vm/vnode_pager.h>
  91
  92 #include <ufs/ufs/extattr.h>
  93 #include <ufs/ufs/quota.h>
  94 #include <ufs/ufs/inode.h>
  95 #include <ufs/ufs/ufs_extern.h>
  96 #include <ufs/ufs/ufsmount.h>
  97
  98 #include <ufs/ffs/fs.h>
  99 #include <ufs/ffs/ffs_extern.h>
 100 #include "opt_directio.h"
 101 #include "opt_ffs.h"
  102
      /*
       * Forward declarations.
       *
       * ffs_rawread() is external and is only declared when DIRECTIO is
       * defined; ffs_read() calls it for IO_DIRECT requests.
       */
  103 #ifdef DIRECTIO
  104 extern int      ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
  105 #endif
      /*
       * File-local entry points.  Most are installed in the vop_vector
       * tables that follow.  In the part of the file reviewed here,
       * ffs_extread() and ffs_extwrite() are referenced only from code
       * guarded by "#ifdef notyet".
       */
  106 static vop_fsync_t      ffs_fsync;
  107 static vop_lock1_t      ffs_lock;
  108 static vop_getpages_t   ffs_getpages;
  109 static vop_read_t       ffs_read;
  110 static vop_write_t      ffs_write;
  111 static int      ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
  112 static int      ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
  113                     struct ucred *cred);
  114 static vop_strategy_t   ffsext_strategy;
  115 static vop_closeextattr_t       ffs_closeextattr;
  116 static vop_deleteextattr_t      ffs_deleteextattr;
  117 static vop_getextattr_t ffs_getextattr;
  118 static vop_listextattr_t        ffs_listextattr;
  119 static vop_openextattr_t        ffs_openextattr;
  120 static vop_setextattr_t ffs_setextattr;
  121 static vop_vptofh_t     ffs_vptofh;
 122
 123
 124 /* Global vfs data structures for ufs. */
 125 struct vop_vector ffs_vnodeops1 = {
 126         .vop_default =          &ufs_vnodeops,
 127         .vop_fsync =            ffs_fsync,
 128         .vop_getpages =         ffs_getpages,
 129         .vop_lock1 =            ffs_lock,
 130         .vop_read =             ffs_read,
 131         .vop_reallocblks =      ffs_reallocblks,
 132         .vop_write =            ffs_write,
 133         .vop_vptofh =           ffs_vptofh,
 134 };
 135
 136 struct vop_vector ffs_fifoops1 = {
 137         .vop_default =          &ufs_fifoops,
 138         .vop_fsync =            ffs_fsync,
 139         .vop_reallocblks =      ffs_reallocblks, /* XXX: really ??? */
 140         .vop_vptofh =           ffs_vptofh,
 141 };
 142
 143 /* Global vfs data structures for ufs. */
 144 struct vop_vector ffs_vnodeops2 = {
 145         .vop_default =          &ufs_vnodeops,
 146         .vop_fsync =            ffs_fsync,
 147         .vop_getpages =         ffs_getpages,
 148         .vop_lock1 =            ffs_lock,
 149         .vop_read =             ffs_read,
 150         .vop_reallocblks =      ffs_reallocblks,
 151         .vop_write =            ffs_write,
 152         .vop_closeextattr =     ffs_closeextattr,
 153         .vop_deleteextattr =    ffs_deleteextattr,
 154         .vop_getextattr =       ffs_getextattr,
 155         .vop_listextattr =      ffs_listextattr,
 156         .vop_openextattr =      ffs_openextattr,
 157         .vop_setextattr =       ffs_setextattr,
 158         .vop_vptofh =           ffs_vptofh,
 159 };
 160
 161 struct vop_vector ffs_fifoops2 = {
 162         .vop_default =          &ufs_fifoops,
 163         .vop_fsync =            ffs_fsync,
 164         .vop_lock1 =            ffs_lock,
 165         .vop_reallocblks =      ffs_reallocblks,
 166         .vop_strategy =         ffsext_strategy,
 167         .vop_closeextattr =     ffs_closeextattr,
 168         .vop_deleteextattr =    ffs_deleteextattr,
 169         .vop_getextattr =       ffs_getextattr,
 170         .vop_listextattr =      ffs_listextattr,
 171         .vop_openextattr =      ffs_openextattr,
 172         .vop_setextattr =       ffs_setextattr,
 173         .vop_vptofh =           ffs_vptofh,
 174 };
 175
 176 /*
 177  * Synch an open file.
 178  */
 179 /* ARGSUSED */
 180 static int
 181 ffs_fsync(struct vop_fsync_args *ap)
 182 {
 183         int error;
 184
 185         error = ffs_syncvnode(ap->a_vp, ap->a_waitfor);
 186         if (error)
 187                 return (error);
 188         if (ap->a_waitfor == MNT_WAIT &&
 189             (ap->a_vp->v_mount->mnt_flag & MNT_SOFTDEP))
 190                 error = softdep_fsync(ap->a_vp);
 191         return (error);
 192 }
  193
      /*
       * Flush the dirty buffers of a vnode, then update its inode.
       *
       * vp      - vnode whose buffer object (vp->v_bufobj) is scanned.
       * waitfor - MNT_WAIT requests a synchronous flush (wait != 0); any
       *           other value is handled as an asynchronous request.
       *
       * Returns the error from bwrite() or softdep_sync_metadata() if either
       * fails, otherwise whatever ffs_update(vp, wait) returns.
       */
  194 int
  195 ffs_syncvnode(struct vnode *vp, int waitfor)
  196 {
  197         struct inode *ip = VTOI(vp);
  198         struct bufobj *bo;
  199         struct buf *bp;
  200         struct buf *nbp;
  201         int s, error, wait, passes, skipmeta;
  202         ufs_lbn_t lbn;
  203
  204         wait = (waitfor == MNT_WAIT);
              /*
               * Logical block of (i_size + fs_bsize - 1).  Regular-file
               * buffers at or beyond this block are discarded further down
               * as data truncated off the file.
               */
  205         lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
  206         bo = &vp->v_bufobj;
  207
  208         /*
  209          * Flush all dirty buffers associated with a vnode.
              *
              * passes is the retry budget used by the synchronous path
              * below.  skipmeta is set only for synchronous requests: the
              * first scan then skips buffers with a negative b_lblkno and
              * a second scan (skipmeta == 0) picks them up.
  210          */
  211         passes = NIADDR + 1;
  212         skipmeta = 0;
  213         if (wait)
  214                 skipmeta = 1;
  215         s = splbio();
  216         BO_LOCK(bo);
  217 loop:
              /* New scan: clear the per-scan BV_SCANNED mark on every dirty buffer. */
  218         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
  219                 bp->b_vflags &= ~BV_SCANNED;
  220         TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
  221                 /*
  222                  * Reasons to skip this buffer: it has already been considered
  223                  * on this pass, this pass is the first time through on a
  224                  * synchronous flush request and the buffer being considered
  225                  * is metadata, the buffer has dependencies that will cause
  226                  * it to be redirtied and it has not already been deferred,
  227                  * or it is already being written.
  228                  */
  229                 if ((bp->b_vflags & BV_SCANNED) != 0)
  230                         continue;
  231                 bp->b_vflags |= BV_SCANNED;
  232                 if ((skipmeta == 1 && bp->b_lblkno < 0))
  233                         continue;
                      /* LK_NOWAIT: a buffer that cannot be locked right away is skipped. */
  234                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
  235                         continue;
                      /*
                       * The buffer is locked; the bufobj lock is dropped while
                       * it is processed and retaken before the scan continues.
                       */
  236                 BO_UNLOCK(bo);
                      /*
                       * Asynchronous request only: a buffer with dependencies
                       * (buf_countdeps() non-zero) that is not yet B_DEFERRED
                       * is marked B_DEFERRED, unlocked and left for later.
                       */
  237                 if (!wait && !LIST_EMPTY(&bp->b_dep) &&
  238                     (bp->b_flags & B_DEFERRED) == 0 &&
  239                     buf_countdeps(bp, 0)) {
  240                         bp->b_flags |= B_DEFERRED;
  241                         BUF_UNLOCK(bp);
  242                         BO_LOCK(bo);
  243                         continue;
  244                 }
  245                 if ((bp->b_flags & B_DELWRI) == 0)
  246                         panic("ffs_fsync: not dirty");
  247                 /*
  248                  * If this is a synchronous flush request, or it is not a
  249                  * file or device, start the write on this buffer immediately.
  250                  */
  251                 if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {
  252
  253                         /*
  254                          * On our final pass through, do all I/O synchronously
  255                          * so that we can find out if our flush is failing
  256                          * because of write errors.
                              *
                              * While passes > 0 (or when not waiting) the write
                              * is only started and its result is ignored; once
                              * passes has reached 0 on a synchronous request,
                              * bwrite() is used and its error ends the function.
  257                          */
  258                         if (passes > 0 || !wait) {
  259                                 if ((bp->b_flags & B_CLUSTEROK) && !wait) {
  260                                         (void) vfs_bio_awrite(bp);
  261                                 } else {
  262                                         bremfree(bp);
  263                                         splx(s);
  264                                         (void) bawrite(bp);
  265                                         s = splbio();
  266                                 }
  267                         } else {
  268                                 bremfree(bp);
  269                                 splx(s);
                                      /*
                                       * Error return: the bufobj lock is not
                                       * held here and splx(s) has already run.
                                       */
  270                                 if ((error = bwrite(bp)) != 0)
  271                                         return (error);
  272                                 s = splbio();
  273                         }
  274                 } else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
  275                         /*
  276                          * If the buffer is for data that has been truncated
  277                          * off the file, then throw it away.
  278                          */
  279                         bremfree(bp);
  280                         bp->b_flags |= B_INVAL | B_NOCACHE;
  281                         splx(s);
  282                         brelse(bp);
  283                         s = splbio();
  284                 } else
  285                         vfs_bio_awrite(bp);
  286
  287                 /*
  288                  * Since we may have slept during the I/O, we need
  289                  * to start from a known point.
                      *
                      * The scan restarts from the head of the dirty list;
                      * BV_SCANNED keeps buffers already visited in this scan
                      * from being processed a second time.
  290                  */
  291                 BO_LOCK(bo);
  292                 nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
  293         }
  294         /*
  295          * If we were asked to do this synchronously, then go back for
  296          * another pass, this time doing the metadata.
  297          */
  298         if (skipmeta) {
  299                 skipmeta = 0;
  300                 goto loop;
  301         }
  302
  303         if (wait) {
                      /*
                       * NOTE(review): presumably waits for writes still in
                       * progress on this buffer object -- confirm against
                       * bufobj_wwait().
                       */
  304                 bufobj_wwait(bo, 3, 0);
  305                 BO_UNLOCK(bo);
  306
  307                 /*
  308                  * Ensure that any filesystem metadata associated
  309                  * with the vnode has been written.
  310                  */
  311                 splx(s);
  312                 if ((error = softdep_sync_metadata(vp)) != 0)
  313                         return (error);
  314                 s = splbio();
  315
  316                 BO_LOCK(bo);
  317                 if (bo->bo_dirty.bv_cnt > 0) {
  318                         /*
  319                          * Block devices associated with filesystems may
  320                          * have new I/O requests posted for them even if
  321                          * the vnode is locked, so no amount of trying will
  322                          * get them clean. Thus we give block devices a
  323                          * good effort, then just give up. For all other file
  324                          * types, go around and try again until it is clean.
                              *
                              * Each retry consumes one of the NIADDR + 1 passes
                              * set up at the top of the function.
  325                          */
  326                         if (passes > 0) {
  327                                 passes -= 1;
  328                                 goto loop;
  329                         }
  330 #ifdef INVARIANTS
  331                         if (!vn_isdisk(vp, NULL))
  332                                 vprint("ffs_fsync: dirty", vp);
  333 #endif
  334                 }
  335         }
  336         BO_UNLOCK(bo);
  337         splx(s);
  338         return (ffs_update(vp, wait));
  339 }
 340
 341 static int
 342 ffs_lock(ap)
 343         struct vop_lock1_args /* {
 344                 struct vnode *a_vp;
 345                 int a_flags;
 346                 struct thread *a_td;
 347                 char *file;
 348                 int line;
 349         } */ *ap;
 350 {
 351 #ifndef NO_FFS_SNAPSHOT
 352         struct vnode *vp;
 353         int flags;
 354         struct lock *lkp;
 355         int result;
 356
 357         switch (ap->a_flags & LK_TYPE_MASK) {
 358         case LK_SHARED:
 359         case LK_UPGRADE:
 360         case LK_EXCLUSIVE:
 361                 vp = ap->a_vp;
 362                 flags = ap->a_flags;
 363                 for (;;) {
 364                         lkp = vp->v_vnlock;
 365                         result = _lockmgr_args(lkp, flags, VI_MTX(vp),
 366                             LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
 367                             ap->a_file, ap->a_line);
 368                         if (lkp == vp->v_vnlock || result != 0)
 369                                 break;
 370                         /*
 371                          * Apparent success, except that the vnode
 372                          * mutated between snapshot file vnode and
 373                          * regular file vnode while this process
 374                          * slept.  The lock currently held is not the
 375                          * right lock.  Release it, and try to get the
 376                          * new lock.
 377                          */
 378                         (void) _lockmgr_args(lkp, LK_RELEASE, NULL,
 379                             LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
 380                             ap->a_file, ap->a_line);
 381                         if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
 382                             (LK_INTERLOCK | LK_NOWAIT))
 383                                 return (EBUSY);
 384                         if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
 385                                 flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
 386                         flags &= ~LK_INTERLOCK;
 387                 }
 388                 break;
 389         default:
 390                 result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
 391         }
 392         return (result);
 393 #else
 394         return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
 395 #endif
 396 }
 397
  398 /*
  399  * Vnode op for reading.
       *
       * Copies file data into the caller's uio one filesystem block at a
       * time.  The loop stops when uio_resid reaches zero, when the offset
       * reaches ip->i_size, when a buffer yields no usable data, or on the
       * first error.
       *
       * Returns 0 or an error value.  EOVERFLOW is returned up front when
       * the starting offset lies inside the file but at or beyond
       * fs_maxfilesize.  A zero-length request returns 0 immediately.
       *
       * Unless the mount has MNT_NOATIME, IN_ACCESS is set on the inode
       * when the read succeeded or moved at least some data.
  400  */
  401 /* ARGSUSED */
  402 static int
  403 ffs_read(ap)
  404         struct vop_read_args /* {
  405                 struct vnode *a_vp;
  406                 struct uio *a_uio;
  407                 int a_ioflag;
  408                 struct ucred *a_cred;
  409         } */ *ap;
  410 {
  411         struct vnode *vp;
  412         struct inode *ip;
  413         struct uio *uio;
  414         struct fs *fs;
  415         struct buf *bp;
  416         ufs_lbn_t lbn, nextlbn;
  417         off_t bytesinfile;
  418         long size, xfersize, blkoffset;
  419         int error, orig_resid;
  420         int seqcount;
  421         int ioflag;
  422
  423         vp = ap->a_vp;
  424         uio = ap->a_uio;
  425         ioflag = ap->a_ioflag;
              /*
               * IO_EXT requests go to ffs_extread() only when "notyet" is
               * defined; otherwise they panic.
               */
  426         if (ap->a_ioflag & IO_EXT)
  427 #ifdef notyet
  428                 return (ffs_extread(vp, uio, ioflag));
  429 #else
  430                 panic("ffs_read+IO_EXT");
  431 #endif
  432 #ifdef DIRECTIO
              /*
               * IO_DIRECT: try ffs_rawread() first.  Return if it failed or
               * reported work done through workdone; otherwise fall through
               * to the buffered path below.
               */
  433         if ((ioflag & IO_DIRECT) != 0) {
  434                 int workdone;
  435
  436                 error = ffs_rawread(vp, uio, &workdone);
  437                 if (error != 0 || workdone != 0)
  438                         return error;
  439         }
  440 #endif
  441
              /* seqcount is carried in the bits of a_ioflag above IO_SEQSHIFT. */
  442         seqcount = ap->a_ioflag >> IO_SEQSHIFT;
  443         ip = VTOI(vp);
  444
  445 #ifdef INVARIANTS
  446         if (uio->uio_rw != UIO_READ)
  447                 panic("ffs_read: mode");
  448
  449         if (vp->v_type == VLNK) {
  450                 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
  451                         panic("ffs_read: short symlink");
  452         } else if (vp->v_type != VREG && vp->v_type != VDIR)
  453                 panic("ffs_read: type %d",  vp->v_type);
  454 #endif
              /* Remembered so the end of the function can tell whether any data moved. */
  455         orig_resid = uio->uio_resid;
  456         KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
  457         if (orig_resid == 0)
  458                 return (0);
  459         KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
  460         fs = ip->i_fs;
  461         if (uio->uio_offset < ip->i_size &&
  462             uio->uio_offset >= fs->fs_maxfilesize)
  463                 return (EOVERFLOW);
  464
              /*
               * One iteration per buffer.  bp is reset to NULL at the end of
               * every completed iteration, so a non-NULL bp after the loop
               * means the loop was left by "break" with a buffer still held.
               */
  465         for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
  466                 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
  467                         break;
  468                 lbn = lblkno(fs, uio->uio_offset);
  469                 nextlbn = lbn + 1;
  470
  471                 /*
  472                  * size of buffer.  The buffer representing the
  473                  * end of the file is rounded up to the size of
  474                  * the block type ( fragment or full block,
  475                  * depending ).
  476                  */
  477                 size = blksize(fs, ip, lbn);
  478                 blkoffset = blkoff(fs, uio->uio_offset);
  479
  480                 /*
  481                  * The amount we want to transfer in this iteration is
  482                  * one FS block less the amount of the data before
  483                  * our startpoint (duh!)
  484                  */
  485                 xfersize = fs->fs_bsize - blkoffset;
  486
  487                 /*
  488                  * But if we actually want less than the block,
  489                  * or the file doesn't have a whole block more of data,
  490                  * then use the lesser number.
  491                  */
  492                 if (uio->uio_resid < xfersize)
  493                         xfersize = uio->uio_resid;
  494                 if (bytesinfile < xfersize)
  495                         xfersize = bytesinfile;
  496
  497                 if (lblktosize(fs, nextlbn) >= ip->i_size) {
  498                         /*
  499                          * Don't do readahead if this is the end of the file.
  500                          */
  501                         error = bread(vp, lbn, size, NOCRED, &bp);
  502                 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
  503                         /*
  504                          * Otherwise if we are allowed to cluster,
  505                          * grab as much as we can.
  506                          *
  507                          * XXX  This may not be a win if we are not
  508                          * doing sequential access.
  509                          */
  510                         error = cluster_read(vp, ip->i_size, lbn,
  511                                 size, NOCRED, blkoffset + uio->uio_resid, seqcount, &bp);
  512                 } else if (seqcount > 1) {
  513                         /*
  514                          * If we are NOT allowed to cluster, then
  515                          * if we appear to be acting sequentially,
  516                          * fire off a request for a readahead
  517                          * as well as a read. Note that the 4th and 5th
  518                          * arguments point to arrays of the size specified in
  519                          * the 6th argument.
  520                          */
  521                         int nextsize = blksize(fs, ip, nextlbn);
  522                         error = breadn(vp, lbn,
  523                             size, &nextlbn, &nextsize, 1, NOCRED, &bp);
  524                 } else {
  525                         /*
  526                          * Failing all of the above, just read what the
  527                          * user asked for. Interestingly, the same as
  528                          * the first option above.
  529                          */
  530                         error = bread(vp, lbn, size, NOCRED, &bp);
  531                 }
                      /*
                       * On a read error the buffer is released here and bp is
                       * cleared so the cleanup after the loop does not
                       * release it a second time.
                       */
  532                 if (error) {
  533                         brelse(bp);
  534                         bp = NULL;
  535                         break;
  536                 }
  537
  538                 /*
  539                  * If IO_DIRECT then set B_DIRECT for the buffer.  This
  540                  * will cause us to attempt to release the buffer later on
  541                  * and will cause the buffer cache to attempt to free the
  542                  * underlying pages.
  543                  */
  544                 if (ioflag & IO_DIRECT)
  545                         bp->b_flags |= B_DIRECT;
  546
  547                 /*
  548                  * We should only get non-zero b_resid when an I/O error
  549                  * has occurred, which should cause us to break above.
  550                  * However, if the short read did not cause an error,
  551                  * then we want to ensure that we do not uiomove bad
  552                  * or uninitialized data.
  553                  */
  554                 size -= bp->b_resid;
  555                 if (size < xfersize) {
  556                         if (size == 0)
  557                                 break;
  558                         xfersize = size;
  559                 }
  560
  561                 error = uiomove((char *)bp->b_data + blkoffset,
  562                     (int)xfersize, uio);
  563                 if (error)
  564                         break;
  565
  566                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
  567                    (LIST_EMPTY(&bp->b_dep))) {
  568                         /*
  569                          * If there are no dependencies, and it's VMIO,
  570                          * then we don't need the buf, mark it available
  571                          * for freeing. The VM has the data.
  572                          */
  573                         bp->b_flags |= B_RELBUF;
  574                         brelse(bp);
  575                 } else {
  576                         /*
  577                          * Otherwise let whoever
  578                          * made the request take care of
  579                          * freeing it. We just queue
  580                          * it onto another list.
  581                          */
  582                         bqrelse(bp);
  583                 }
  584         }
  585
  586         /*
  587          * This can only happen in the case of an error
  588          * because the loop above resets bp to NULL on each iteration
  589          * and on normal completion has not set a new value into it.
  590          * so it must have come from a 'break' statement
              *
              * In the code above that is the "size == 0" break and the
              * uiomove() failure break; the buffer is released with the
              * same rule the loop body uses.
  591          */
  592         if (bp != NULL) {
  593                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
  594                    (LIST_EMPTY(&bp->b_dep))) {
  595                         bp->b_flags |= B_RELBUF;
  596                         brelse(bp);
  597                 } else {
  598                         bqrelse(bp);
  599                 }
  600         }
  601
              /*
               * Mark the inode accessed if the read succeeded or transferred
               * any data, the mount does not have MNT_NOATIME, and the flag
               * is not already set.  The flag is set under the vnode
               * interlock.
               */
  602         if ((error == 0 || uio->uio_resid != orig_resid) &&
  603             (vp->v_mount->mnt_flag & MNT_NOATIME) == 0 &&
  604             (ip->i_flag & IN_ACCESS) == 0) {
  605                 VI_LOCK(vp);
  606                 ip->i_flag |= IN_ACCESS;
  607                 VI_UNLOCK(vp);
  608         }
  609         return (error);
  610 }
 611
 612 /*
 613  * Vnode op for writing.
 614  */
 615 static int
 616 ffs_write(ap)
 617         struct vop_write_args /* {
 618                 struct vnode *a_vp;
 619                 struct uio *a_uio;
 620                 int a_ioflag;
 621                 struct ucred *a_cred;
 622         } */ *ap;
 623 {
 624         struct vnode *vp;
 625         struct uio *uio;
 626         struct inode *ip;
 627         struct fs *fs;
 628         struct buf *bp;
 629         struct thread *td;
 630         ufs_lbn_t lbn;
 631         off_t osize;
 632         int seqcount;
 633         int blkoffset, error, flags, ioflag, resid, size, xfersize;
 634
 635         vp = ap->a_vp;
 636         uio = ap->a_uio;
 637         ioflag = ap->a_ioflag;
 638         if (ap->a_ioflag & IO_EXT)
 639 #ifdef notyet
 640                 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
 641 #else
 642                 panic("ffs_write+IO_EXT");
 643 #endif
 644
 645         seqcount = ap->a_ioflag >> IO_SEQSHIFT;
 646         ip = VTOI(vp);
 647
 648 #ifdef INVARIANTS
 649         if (uio->uio_rw != UIO_WRITE)
 650                 panic("ffs_write: mode");
 651 #endif
 652
 653         switch (vp->v_type) {
 654         case VREG:
 655                 if (ioflag & IO_APPEND)
 656                         uio->uio_offset = ip->i_size;
 657                 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
 658                         return (EPERM);
 659                 /* FALLTHROUGH */
 660         case VLNK:
 661                 break;
 662         case VDIR:
 663                 panic("ffs_write: dir write");
 664                 break;
 665         default:
 666                 panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
 667                         (int)uio->uio_offset,
 668                         (int)uio->uio_resid
 669                 );
 670         }
 671
 672         KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
 673         KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
 674         fs = ip->i_fs;
 675         if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
 676                 return (EFBIG);
 677         /*
 678          * Maybe this should be above the vnode op call, but so long as
 679          * file servers have no limits, I don't think it matters.
 680          */
 681         td = uio->uio_td;
 682         if (vp->v_type == VREG && td != NULL) {
 683                 PROC_LOCK(td->td_proc);
 684                 if (uio->uio_offset + uio->uio_resid >
 685                     lim_cur(td->td_proc, RLIMIT_FSIZE)) {
 686                         psignal(td->td_proc, SIGXFSZ);
 687                         PROC_UNLOCK(td->td_proc);
 688                         return (EFBIG);
 689                 }
 690                 PROC_UNLOCK(td->td_proc);
 691         }
 692
 693         resid = uio->uio_resid;
 694         osize = ip->i_size;
 695         if (seqcount > BA_SEQMAX)
 696                 flags = BA_SEQMAX << BA_SEQSHIFT;
 697         else
 698                 flags = seqcount << BA_SEQSHIFT;
 699         if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
 700                 flags |= IO_SYNC;
 701
 702         for (error = 0; uio->uio_resid > 0;) {
 703                 lbn = lblkno(fs, uio->uio_offset);
 704                 blkoffset = blkoff(fs, uio->uio_offset);
 705                 xfersize = fs->fs_bsize - blkoffset;
 706                 if (uio->uio_resid < xfersize)
 707                         xfersize = uio->uio_resid;
 708                 if (uio->uio_offset + xfersize > ip->i_size)
 709                         vnode_pager_setsize(vp, uio->uio_offset + xfersize);
 710
 711                 /*
 712                  * We must perform a read-before-write if the transfer size
 713                  * does not cover the entire buffer.
 714                  */
 715                 if (fs->fs_bsize > xfersize)
 716                         flags |= BA_CLRBUF;
 717                 else
 718                         flags &= ~BA_CLRBUF;
 719 /* XXX is uio->uio_offset the right thing here? */
 720                 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
 721                     ap->a_cred, flags, &bp);
 722                 if (error != 0)
 723                         break;
 724                 /*
 725                  * If the buffer is not valid we have to clear out any
 726                  * garbage data from the pages instantiated for the buffer.
 727                  * If we do not, a failed uiomove() during a write can leave
 728                  * the prior contents of the pages exposed to a userland
 729                  * mmap().  XXX deal with uiomove() errors a better way.
 730                  */
 731                 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
 732                         vfs_bio_clrbuf(bp);
 733                 if (ioflag & IO_DIRECT)
 734                         bp->b_flags |= B_DIRECT;
 735                 if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
 736                         bp->b_flags |= B_NOCACHE;
 737
 738                 if (uio->uio_offset + xfersize > ip->i_size) {
 739                         ip->i_size = uio->uio_offset + xfersize;
 740                         DIP_SET(ip, i_size, ip->i_size);
 741                 }
 742
 743                 size = blksize(fs, ip, lbn) - bp->b_resid;
 744                 if (size < xfersize)
 745                         xfersize = size;
 746
 747                 error =
 748                     uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
 749                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
 750                    (LIST_EMPTY(&bp->b_dep))) {
 751                         bp->b_flags |= B_RELBUF;
 752                 }
 753
 754                 /*
 755                  * If IO_SYNC each buffer is written synchronously.  Otherwise
 756                  * if we have a severe page deficiency write the buffer
 757                  * asynchronously.  Otherwise try to cluster, and if that
 758                  * doesn't do it then either do an async write (if O_DIRECT),
 759                  * or a delayed write (if not).
 760                  */
 761                 if (ioflag & IO_SYNC) {
 762                         (void)bwrite(bp);
 763                 } else if (vm_page_count_severe() ||
 764                             buf_dirty_count_severe() ||
 765                             (ioflag & IO_ASYNC)) {
 766                         bp->b_flags |= B_CLUSTEROK;
 767                         bawrite(bp);
 768                 } else if (xfersize + blkoffset == fs->fs_bsize) {
 769                         if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
 770                                 bp->b_flags |= B_CLUSTEROK;
 771                                 cluster_write(vp, bp, ip->i_size, seqcount);
 772                         } else {
 773                                 bawrite(bp);
 774                         }
 775                 } else if (ioflag & IO_DIRECT) {
 776                         bp->b_flags |= B_CLUSTEROK;
 777                         bawrite(bp);
 778                 } else {
 779                         bp->b_flags |= B_CLUSTEROK;
 780                         bdwrite(bp);
 781                 }
 782                 if (error || xfersize == 0)
 783                         break;
 784                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
 785         }
 786         /*
 787          * If we successfully wrote any data, and we are not the superuser
 788          * we clear the setuid and setgid bits as a precaution against
 789          * tampering.
 790          */
 791         if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
 792             ap->a_cred) {
 793                 if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
 794                         ip->i_mode &= ~(ISUID | ISGID);
 795                         DIP_SET(ip, i_mode, ip->i_mode);
 796                 }
 797         }
 798         if (error) {
 799                 if (ioflag & IO_UNIT) {
 800                         (void)ffs_truncate(vp, osize,
 801                             IO_NORMAL | (ioflag & IO_SYNC),
 802                             ap->a_cred, uio->uio_td);
 803                         uio->uio_offset -= resid - uio->uio_resid;
 804                         uio->uio_resid = resid;
 805                 }
 806         } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
 807                 error = ffs_update(vp, 1);
 808         return (error);
 809 }
 810
 811 /*
 812  * get page routine
 813  */
 814 static int
 815 ffs_getpages(ap)
 816         struct vop_getpages_args *ap;
 817 {
 818         int i;
 819         vm_page_t mreq;
 820         int pcount;
 821
 822         pcount = round_page(ap->a_count) / PAGE_SIZE;
 823         mreq = ap->a_m[ap->a_reqpage];
 824
 825         /*
 826          * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
 827          * then the entire page is valid.  Since the page may be mapped,
 828          * user programs might reference data beyond the actual end of file
 829          * occuring within the page.  We have to zero that data.
 830          */
 831         VM_OBJECT_LOCK(mreq->object);
 832         if (mreq->valid) {
 833                 if (mreq->valid != VM_PAGE_BITS_ALL)
 834                         vm_page_zero_invalid(mreq, TRUE);
 835                 vm_page_lock_queues();
 836                 for (i = 0; i < pcount; i++) {
 837                         if (i != ap->a_reqpage) {
 838                                 vm_page_free(ap->a_m[i]);
 839                         }
 840                 }
 841                 vm_page_unlock_queues();
 842                 VM_OBJECT_UNLOCK(mreq->object);
 843                 return VM_PAGER_OK;
 844         }
 845         VM_OBJECT_UNLOCK(mreq->object);
 846
 847         return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
 848                                             ap->a_count,
 849                                             ap->a_reqpage);
 850 }
 851
 852
 853 /*
 854  * Extended attribute area reading.
 855  */
 856 static int
 857 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
 858 {
 859         struct inode *ip;
 860         struct ufs2_dinode *dp;
 861         struct fs *fs;
 862         struct buf *bp;
 863         ufs_lbn_t lbn, nextlbn;
 864         off_t bytesinfile;
 865         long size, xfersize, blkoffset;
 866         int error, orig_resid;
 867
 868         ip = VTOI(vp);
 869         fs = ip->i_fs;
 870         dp = ip->i_din2;
 871
 872 #ifdef INVARIANTS
 873         if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
 874                 panic("ffs_extread: mode");
 875
 876 #endif
 877         orig_resid = uio->uio_resid;
 878         KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
 879         if (orig_resid == 0)
 880                 return (0);
 881         KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));
 882
 883         for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
 884                 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
 885                         break;
 886                 lbn = lblkno(fs, uio->uio_offset);
 887                 nextlbn = lbn + 1;
 888
 889                 /*
 890                  * size of buffer.  The buffer representing the
 891                  * end of the file is rounded up to the size of
 892                  * the block type ( fragment or full block,
 893                  * depending ).
 894                  */
 895                 size = sblksize(fs, dp->di_extsize, lbn);
 896                 blkoffset = blkoff(fs, uio->uio_offset);
 897
 898                 /*
 899                  * The amount we want to transfer in this iteration is
 900                  * one FS block less the amount of the data before
 901                  * our startpoint (duh!)
 902                  */
 903                 xfersize = fs->fs_bsize - blkoffset;
 904
 905                 /*
 906                  * But if we actually want less than the block,
 907                  * or the file doesn't have a whole block more of data,
 908                  * then use the lesser number.
 909                  */
 910                 if (uio->uio_resid < xfersize)
 911                         xfersize = uio->uio_resid;
 912                 if (bytesinfile < xfersize)
 913                         xfersize = bytesinfile;
 914
 915                 if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
 916                         /*
 917                          * Don't do readahead if this is the end of the info.
 918                          */
 919                         error = bread(vp, -1 - lbn, size, NOCRED, &bp);
 920                 } else {
 921                         /*
 922                          * If we have a second block, then
 923                          * fire off a request for a readahead
 924                          * as well as a read. Note that the 4th and 5th
 925                          * arguments point to arrays of the size specified in
 926                          * the 6th argument.
 927                          */
 928                         int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
 929
 930                         nextlbn = -1 - nextlbn;
 931                         error = breadn(vp, -1 - lbn,
 932                             size, &nextlbn, &nextsize, 1, NOCRED, &bp);
 933                 }
 934                 if (error) {
 935                         brelse(bp);
 936                         bp = NULL;
 937                         break;
 938                 }
 939
 940                 /*
 941                  * If IO_DIRECT then set B_DIRECT for the buffer.  This
 942                  * will cause us to attempt to release the buffer later on
 943                  * and will cause the buffer cache to attempt to free the
 944                  * underlying pages.
 945                  */
 946                 if (ioflag & IO_DIRECT)
 947                         bp->b_flags |= B_DIRECT;
 948
 949                 /*
 950                  * We should only get non-zero b_resid when an I/O error
 951                  * has occurred, which should cause us to break above.
 952                  * However, if the short read did not cause an error,
 953                  * then we want to ensure that we do not uiomove bad
 954                  * or uninitialized data.
 955                  */
 956                 size -= bp->b_resid;
 957                 if (size < xfersize) {
 958                         if (size == 0)
 959                                 break;
 960                         xfersize = size;
 961                 }
 962
 963                 error = uiomove((char *)bp->b_data + blkoffset,
 964                                         (int)xfersize, uio);
 965                 if (error)
 966                         break;
 967
 968                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
 969                    (LIST_EMPTY(&bp->b_dep))) {
 970                         /*
 971                          * If there are no dependencies, and it's VMIO,
 972                          * then we don't need the buf, mark it available
 973                          * for freeing. The VM has the data.
 974                          */
 975                         bp->b_flags |= B_RELBUF;
 976                         brelse(bp);
 977                 } else {
 978                         /*
 979                          * Otherwise let whoever
 980                          * made the request take care of
 981                          * freeing it. We just queue
 982                          * it onto another list.
 983                          */
 984                         bqrelse(bp);
 985                 }
 986         }
 987
 988         /*
 989          * This can only happen in the case of an error
 990          * because the loop above resets bp to NULL on each iteration
 991          * and on normal completion has not set a new value into it.
 992          * so it must have come from a 'break' statement
 993          */
 994         if (bp != NULL) {
 995                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
 996                    (LIST_EMPTY(&bp->b_dep))) {
 997                         bp->b_flags |= B_RELBUF;
 998                         brelse(bp);
 999                 } else {
1000                         bqrelse(bp);
1001                 }
1002         }
1003
1004         if ((error == 0 || uio->uio_resid != orig_resid) &&
1005             (vp->v_mount->mnt_flag & MNT_NOATIME) == 0 &&
1006             (ip->i_flag & IN_ACCESS) == 0) {
1007                 VI_LOCK(vp);
1008                 ip->i_flag |= IN_ACCESS;
1009                 VI_UNLOCK(vp);
1010         }
1011         return (error);
1012 }
1013
1014 /*
1015  * Extended attribute area writing.
1016  */
1017 static int
1018 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1019 {
1020         struct inode *ip;
1021         struct ufs2_dinode *dp;
1022         struct fs *fs;
1023         struct buf *bp;
1024         ufs_lbn_t lbn;
1025         off_t osize;
1026         int blkoffset, error, flags, resid, size, xfersize;
1027
1028         ip = VTOI(vp);
1029         fs = ip->i_fs;
1030         dp = ip->i_din2;
1031
1032         KASSERT(!(ip->i_flag & IN_SPACECOUNTED), ("inode %u: inode is dead",
1033             ip->i_number));
1034
1035 #ifdef INVARIANTS
1036         if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1037                 panic("ffs_extwrite: mode");
1038 #endif
1039
1040         if (ioflag & IO_APPEND)
1041                 uio->uio_offset = dp->di_extsize;
1042         KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
1043         KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
1044         if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
1045                 return (EFBIG);
1046
1047         resid = uio->uio_resid;
1048         osize = dp->di_extsize;
1049         flags = IO_EXT;
1050         if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
1051                 flags |= IO_SYNC;
1052
1053         for (error = 0; uio->uio_resid > 0;) {
1054                 lbn = lblkno(fs, uio->uio_offset);
1055                 blkoffset = blkoff(fs, uio->uio_offset);
1056                 xfersize = fs->fs_bsize - blkoffset;
1057                 if (uio->uio_resid < xfersize)
1058                         xfersize = uio->uio_resid;
1059
1060                 /*
1061                  * We must perform a read-before-write if the transfer size
1062                  * does not cover the entire buffer.
1063                  */
1064                 if (fs->fs_bsize > xfersize)
1065                         flags |= BA_CLRBUF;
1066                 else
1067                         flags &= ~BA_CLRBUF;
1068                 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1069                     ucred, flags, &bp);
1070                 if (error != 0)
1071                         break;
1072                 /*
1073                  * If the buffer is not valid we have to clear out any
1074                  * garbage data from the pages instantiated for the buffer.
1075                  * If we do not, a failed uiomove() during a write can leave
1076                  * the prior contents of the pages exposed to a userland
1077                  * mmap().  XXX deal with uiomove() errors a better way.
1078                  */
1079                 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1080                         vfs_bio_clrbuf(bp);
1081                 if (ioflag & IO_DIRECT)
1082                         bp->b_flags |= B_DIRECT;
1083
1084                 if (uio->uio_offset + xfersize > dp->di_extsize)
1085                         dp->di_extsize = uio->uio_offset + xfersize;
1086
1087                 size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1088                 if (size < xfersize)
1089                         xfersize = size;
1090
1091                 error =
1092                     uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1093                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1094                    (LIST_EMPTY(&bp->b_dep))) {
1095                         bp->b_flags |= B_RELBUF;
1096                 }
1097
1098                 /*
1099                  * If IO_SYNC each buffer is written synchronously.  Otherwise
1100                  * if we have a severe page deficiency write the buffer
1101                  * asynchronously.  Otherwise try to cluster, and if that
1102                  * doesn't do it then either do an async write (if O_DIRECT),
1103                  * or a delayed write (if not).
1104                  */
1105                 if (ioflag & IO_SYNC) {
1106                         (void)bwrite(bp);
1107                 } else if (vm_page_count_severe() ||
1108                             buf_dirty_count_severe() ||
1109                             xfersize + blkoffset == fs->fs_bsize ||
1110                             (ioflag & (IO_ASYNC | IO_DIRECT)))
1111                         bawrite(bp);
1112                 else
1113                         bdwrite(bp);
1114                 if (error || xfersize == 0)
1115                         break;
1116                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1117         }
1118         /*
1119          * If we successfully wrote any data, and we are not the superuser
1120          * we clear the setuid and setgid bits as a precaution against
1121          * tampering.
1122          */
1123         if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
1124                 if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
1125                         ip->i_mode &= ~(ISUID | ISGID);
1126                         dp->di_mode = ip->i_mode;
1127                 }
1128         }
1129         if (error) {
1130                 if (ioflag & IO_UNIT) {
1131                         (void)ffs_truncate(vp, osize,
1132                             IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
1133                         uio->uio_offset -= resid - uio->uio_resid;
1134                         uio->uio_resid = resid;
1135                 }
1136         } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1137                 error = ffs_update(vp, 1);
1138         return (error);
1139 }
1140
1141
1142 /*
1143  * Vnode operating to retrieve a named extended attribute.
1144  *
1145  * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1146  * the length of the EA, and possibly the pointer to the entry and to the data.
1147  */
1148 static int
1149 ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name, u_char **eap, u_char **eac)
1150 {
1151         u_char *p, *pe, *pn, *p0;
1152         int eapad1, eapad2, ealength, ealen, nlen;
1153         uint32_t ul;
1154
1155         pe = ptr + length;
1156         nlen = strlen(name);
1157
1158         for (p = ptr; p < pe; p = pn) {
1159                 p0 = p;
1160                 bcopy(p, &ul, sizeof(ul));
1161                 pn = p + ul;
1162                 /* make sure this entry is complete */
1163                 if (pn > pe)
1164                         break;
1165                 p += sizeof(uint32_t);
1166                 if (*p != nspace)
1167                         continue;
1168                 p++;
1169                 eapad2 = *p++;
1170                 if (*p != nlen)
1171                         continue;
1172                 p++;
1173                 if (bcmp(p, name, nlen))
1174                         continue;
1175                 ealength = sizeof(uint32_t) + 3 + nlen;
1176                 eapad1 = 8 - (ealength % 8);
1177                 if (eapad1 == 8)
1178                         eapad1 = 0;
1179                 ealength += eapad1;
1180                 ealen = ul - ealength - eapad2;
1181                 p += nlen + eapad1;
1182                 if (eap != NULL)
1183                         *eap = p0;
1184                 if (eac != NULL)
1185                         *eac = p;
1186                 return (ealen);
1187         }
1188         return(-1);
1189 }
1190
1191 static int
1192 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
1193 {
1194         struct inode *ip;
1195         struct ufs2_dinode *dp;
1196         struct fs *fs;
1197         struct uio luio;
1198         struct iovec liovec;
1199         int easize, error;
1200         u_char *eae;
1201
1202         ip = VTOI(vp);
1203         fs = ip->i_fs;
1204         dp = ip->i_din2;
1205         easize = dp->di_extsize;
1206         if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
1207                 return (EFBIG);
1208
1209         eae = malloc(easize + extra, M_TEMP, M_WAITOK);
1210
1211         liovec.iov_base = eae;
1212         liovec.iov_len = easize;
1213         luio.uio_iov = &liovec;
1214         luio.uio_iovcnt = 1;
1215         luio.uio_offset = 0;
1216         luio.uio_resid = easize;
1217         luio.uio_segflg = UIO_SYSSPACE;
1218         luio.uio_rw = UIO_READ;
1219         luio.uio_td = td;
1220
1221         error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1222         if (error) {
1223                 free(eae, M_TEMP);
1224                 return(error);
1225         }
1226         *p = eae;
1227         return (0);
1228 }
1229
1230 static int
1231 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1232 {
1233         struct inode *ip;
1234         struct ufs2_dinode *dp;
1235         int error;
1236
1237         ip = VTOI(vp);
1238
1239         if (ip->i_ea_area != NULL)
1240                 return (EBUSY);
1241         dp = ip->i_din2;
1242         error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
1243         if (error)
1244                 return (error);
1245         ip->i_ea_len = dp->di_extsize;
1246         ip->i_ea_error = 0;
1247         return (0);
1248 }
1249
1250 /*
1251  * Vnode extattr transaction commit/abort
1252  */
1253 static int
1254 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1255 {
1256         struct inode *ip;
1257         struct uio luio;
1258         struct iovec liovec;
1259         int error;
1260         struct ufs2_dinode *dp;
1261
1262         ip = VTOI(vp);
1263         if (ip->i_ea_area == NULL)
1264                 return (EINVAL);
1265         dp = ip->i_din2;
1266         error = ip->i_ea_error;
1267         if (commit && error == 0) {
1268                 if (cred == NOCRED)
1269                         cred =  vp->v_mount->mnt_cred;
1270                 liovec.iov_base = ip->i_ea_area;
1271                 liovec.iov_len = ip->i_ea_len;
1272                 luio.uio_iov = &liovec;
1273                 luio.uio_iovcnt = 1;
1274                 luio.uio_offset = 0;
1275                 luio.uio_resid = ip->i_ea_len;
1276                 luio.uio_segflg = UIO_SYSSPACE;
1277                 luio.uio_rw = UIO_WRITE;
1278                 luio.uio_td = td;
1279                 /* XXX: I'm not happy about truncating to zero size */
1280                 if (ip->i_ea_len < dp->di_extsize)
1281                         error = ffs_truncate(vp, 0, IO_EXT, cred, td);
1282                 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1283         }
1284         free(ip->i_ea_area, M_TEMP);
1285         ip->i_ea_area = NULL;
1286         ip->i_ea_len = 0;
1287         ip->i_ea_error = 0;
1288         return (error);
1289 }
1290
1291 /*
1292  * Vnode extattr strategy routine for fifos.
1293  *
1294  * We need to check for a read or write of the external attributes.
1295  * Otherwise we just fall through and do the usual thing.
1296  */
1297 static int
1298 ffsext_strategy(struct vop_strategy_args *ap)
1299 /*
1300 struct vop_strategy_args {
1301         struct vnodeop_desc *a_desc;
1302         struct vnode *a_vp;
1303         struct buf *a_bp;
1304 };
1305 */
1306 {
1307         struct vnode *vp;
1308         daddr_t lbn;
1309
1310         vp = ap->a_vp;
1311         lbn = ap->a_bp->b_lblkno;
1312         if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
1313             lbn < 0 && lbn >= -NXADDR)
1314                 return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
1315         if (vp->v_type == VFIFO)
1316                 return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
1317         panic("spec nodes went here");
1318 }
1319
1320 /*
1321  * Vnode extattr transaction commit/abort
1322  */
1323 static int
1324 ffs_openextattr(struct vop_openextattr_args *ap)
1325 /*
1326 struct vop_openextattr_args {
1327         struct vnodeop_desc *a_desc;
1328         struct vnode *a_vp;
1329         IN struct ucred *a_cred;
1330         IN struct thread *a_td;
1331 };
1332 */
1333 {
1334         struct inode *ip;
1335         struct fs *fs;
1336
1337         ip = VTOI(ap->a_vp);
1338         fs = ip->i_fs;
1339
1340         if (ap->a_vp->v_type == VCHR)
1341                 return (EOPNOTSUPP);
1342
1343         return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1344 }
1345
1346
1347 /*
1348  * Vnode extattr transaction commit/abort
1349  */
1350 static int
1351 ffs_closeextattr(struct vop_closeextattr_args *ap)
1352 /*
1353 struct vop_closeextattr_args {
1354         struct vnodeop_desc *a_desc;
1355         struct vnode *a_vp;
1356         int a_commit;
1357         IN struct ucred *a_cred;
1358         IN struct thread *a_td;
1359 };
1360 */
1361 {
1362         struct inode *ip;
1363         struct fs *fs;
1364
1365         ip = VTOI(ap->a_vp);
1366         fs = ip->i_fs;
1367
1368         if (ap->a_vp->v_type == VCHR)
1369                 return (EOPNOTSUPP);
1370
1371         if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
1372                 return (EROFS);
1373
1374         return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
1375 }
1376
1377 /*
1378  * Vnode operation to remove a named attribute.
1379  */
1380 static int
1381 ffs_deleteextattr(struct vop_deleteextattr_args *ap)
1382 /*
1383 vop_deleteextattr {
1384         IN struct vnode *a_vp;
1385         IN int a_attrnamespace;
1386         IN const char *a_name;
1387         IN struct ucred *a_cred;
1388         IN struct thread *a_td;
1389 };
1390 */
1391 {
1392         struct inode *ip;
1393         struct fs *fs;
1394         uint32_t ealength, ul;
1395         int ealen, olen, eapad1, eapad2, error, i, easize;
1396         u_char *eae, *p;
1397         int stand_alone;
1398
1399         ip = VTOI(ap->a_vp);
1400         fs = ip->i_fs;
1401
1402         if (ap->a_vp->v_type == VCHR)
1403                 return (EOPNOTSUPP);
1404
1405         if (strlen(ap->a_name) == 0)
1406                 return (EINVAL);
1407
1408         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1409                 return (EROFS);
1410
1411         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1412             ap->a_cred, ap->a_td, VWRITE);
1413         if (error) {
1414                 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1415                         ip->i_ea_error = error;
1416                 return (error);
1417         }
1418
1419         if (ip->i_ea_area == NULL) {
1420                 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1421                 if (error)
1422                         return (error);
1423                 stand_alone = 1;
1424         } else {
1425                 stand_alone = 0;
1426         }
1427
1428         ealength = eapad1 = ealen = eapad2 = 0;
1429
1430         eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
1431         bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1432         easize = ip->i_ea_len;
1433
1434         olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1435             &p, NULL);
1436         if (olen == -1) {
1437                 /* delete but nonexistent */
1438                 free(eae, M_TEMP);
1439                 if (stand_alone)
1440                         ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1441                 return(ENOATTR);
1442         }
1443         bcopy(p, &ul, sizeof ul);
1444         i = p - eae + ul;
1445         if (ul != ealength) {
1446                 bcopy(p + ul, p + ealength, easize - i);
1447                 easize += (ealength - ul);
1448         }
1449         if (easize > NXADDR * fs->fs_bsize) {
1450                 free(eae, M_TEMP);
1451                 if (stand_alone)
1452                         ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1453                 else if (ip->i_ea_error == 0)
1454                         ip->i_ea_error = ENOSPC;
1455                 return(ENOSPC);
1456         }
1457         p = ip->i_ea_area;
1458         ip->i_ea_area = eae;
1459         ip->i_ea_len = easize;
1460         free(p, M_TEMP);
1461         if (stand_alone)
1462                 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1463         return(error);
1464 }
1465
1466 /*
1467  * Vnode operation to retrieve a named extended attribute.
1468  */
1469 static int
1470 ffs_getextattr(struct vop_getextattr_args *ap)
1471 /*
1472 vop_getextattr {
1473         IN struct vnode *a_vp;
1474         IN int a_attrnamespace;
1475         IN const char *a_name;
1476         INOUT struct uio *a_uio;
1477         OUT size_t *a_size;
1478         IN struct ucred *a_cred;
1479         IN struct thread *a_td;
1480 };
1481 */
1482 {
1483         struct inode *ip;
1484         struct fs *fs;
1485         u_char *eae, *p;
1486         unsigned easize;
1487         int error, ealen, stand_alone;
1488
1489         ip = VTOI(ap->a_vp);
1490         fs = ip->i_fs;
1491
1492         if (ap->a_vp->v_type == VCHR)
1493                 return (EOPNOTSUPP);
1494
1495         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1496             ap->a_cred, ap->a_td, VREAD);
1497         if (error)
1498                 return (error);
1499
1500         if (ip->i_ea_area == NULL) {
1501                 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1502                 if (error)
1503                         return (error);
1504                 stand_alone = 1;
1505         } else {
1506                 stand_alone = 0;
1507         }
1508         eae = ip->i_ea_area;
1509         easize = ip->i_ea_len;
1510
1511         ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1512             NULL, &p);
1513         if (ealen >= 0) {
1514                 error = 0;
1515                 if (ap->a_size != NULL)
1516                         *ap->a_size = ealen;
1517                 else if (ap->a_uio != NULL)
1518                         error = uiomove(p, ealen, ap->a_uio);
1519         } else
1520                 error = ENOATTR;
1521         if (stand_alone)
1522                 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1523         return(error);
1524 }
1525
1526 /*
1527  * Vnode operation to retrieve extended attributes on a vnode.
1528  */
1529 static int
1530 ffs_listextattr(struct vop_listextattr_args *ap)
1531 /*
1532 vop_listextattr {
1533         IN struct vnode *a_vp;
1534         IN int a_attrnamespace;
1535         INOUT struct uio *a_uio;
1536         OUT size_t *a_size;
1537         IN struct ucred *a_cred;
1538         IN struct thread *a_td;
1539 };
1540 */
1541 {
1542         struct inode *ip;
1543         struct fs *fs;
1544         u_char *eae, *p, *pe, *pn;
1545         unsigned easize;
1546         uint32_t ul;
1547         int error, ealen, stand_alone;
1548
1549         ip = VTOI(ap->a_vp);
1550         fs = ip->i_fs;
1551
1552         if (ap->a_vp->v_type == VCHR)
1553                 return (EOPNOTSUPP);
1554
1555         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1556             ap->a_cred, ap->a_td, VREAD);
1557         if (error)
1558                 return (error);
1559
1560         if (ip->i_ea_area == NULL) {
1561                 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1562                 if (error)
1563                         return (error);
1564                 stand_alone = 1;
1565         } else {
1566                 stand_alone = 0;
1567         }
1568         eae = ip->i_ea_area;
1569         easize = ip->i_ea_len;
1570
1571         error = 0;
1572         if (ap->a_size != NULL)
1573                 *ap->a_size = 0;
1574         pe = eae + easize;
1575         for(p = eae; error == 0 && p < pe; p = pn) {
1576                 bcopy(p, &ul, sizeof(ul));
1577                 pn = p + ul;
1578                 if (pn > pe)
1579                         break;
1580                 p += sizeof(ul);
1581                 if (*p++ != ap->a_attrnamespace)
1582                         continue;
1583                 p++;    /* pad2 */
1584                 ealen = *p;
1585                 if (ap->a_size != NULL) {
1586                         *ap->a_size += ealen + 1;
1587                 } else if (ap->a_uio != NULL) {
1588                         error = uiomove(p, ealen + 1, ap->a_uio);
1589                 }
1590         }
1591         if (stand_alone)
1592                 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1593         return(error);
1594 }
1595
1596 /*
1597  * Vnode operation to set a named attribute.
1598  */
1599 static int
1600 ffs_setextattr(struct vop_setextattr_args *ap)
1601 /*
1602 vop_setextattr {
1603         IN struct vnode *a_vp;
1604         IN int a_attrnamespace;
1605         IN const char *a_name;
1606         INOUT struct uio *a_uio;
1607         IN struct ucred *a_cred;
1608         IN struct thread *a_td;
1609 };
1610 */
1611 {
1612         struct inode *ip;
1613         struct fs *fs;
1614         uint32_t ealength, ul;
1615         int ealen, olen, eapad1, eapad2, error, i, easize;
1616         u_char *eae, *p;
1617         int stand_alone;
1618
1619         ip = VTOI(ap->a_vp);
1620         fs = ip->i_fs;
1621
1622         if (ap->a_vp->v_type == VCHR)
1623                 return (EOPNOTSUPP);
1624
1625         if (strlen(ap->a_name) == 0)
1626                 return (EINVAL);
1627
1628         /* XXX Now unsupported API to delete EAs using NULL uio. */
1629         if (ap->a_uio == NULL)
1630                 return (EOPNOTSUPP);
1631
1632         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1633                 return (EROFS);
1634
1635         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1636             ap->a_cred, ap->a_td, VWRITE);
1637         if (error) {
1638                 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1639                         ip->i_ea_error = error;
1640                 return (error);
1641         }
1642
1643         if (ip->i_ea_area == NULL) {
1644                 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1645                 if (error)
1646                         return (error);
1647                 stand_alone = 1;
1648         } else {
1649                 stand_alone = 0;
1650         }
1651
1652         ealen = ap->a_uio->uio_resid;
1653         ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1654         eapad1 = 8 - (ealength % 8);
1655         if (eapad1 == 8)
1656                 eapad1 = 0;
1657         eapad2 = 8 - (ealen % 8);
1658         if (eapad2 == 8)
1659                 eapad2 = 0;
1660         ealength += eapad1 + ealen + eapad2;
1661
1662         eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1663         bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1664         easize = ip->i_ea_len;
1665
1666         olen = ffs_findextattr(eae, easize,
1667             ap->a_attrnamespace, ap->a_name, &p, NULL);
1668         if (olen == -1) {
1669                 /* new, append at end */
1670                 p = eae + easize;
1671                 easize += ealength;
1672         } else {
1673                 bcopy(p, &ul, sizeof ul);
1674                 i = p - eae + ul;
1675                 if (ul != ealength) {
1676                         bcopy(p + ul, p + ealength, easize - i);
1677                         easize += (ealength - ul);
1678                 }
1679         }
1680         if (easize > NXADDR * fs->fs_bsize) {
1681                 free(eae, M_TEMP);
1682                 if (stand_alone)
1683                         ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1684                 else if (ip->i_ea_error == 0)
1685                         ip->i_ea_error = ENOSPC;
1686                 return(ENOSPC);
1687         }
1688         bcopy(&ealength, p, sizeof(ealength));
1689         p += sizeof(ealength);
1690         *p++ = ap->a_attrnamespace;
1691         *p++ = eapad2;
1692         *p++ = strlen(ap->a_name);
1693         strcpy(p, ap->a_name);
1694         p += strlen(ap->a_name);
1695         bzero(p, eapad1);
1696         p += eapad1;
1697         error = uiomove(p, ealen, ap->a_uio);
1698         if (error) {
1699                 free(eae, M_TEMP);
1700                 if (stand_alone)
1701                         ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1702                 else if (ip->i_ea_error == 0)
1703                         ip->i_ea_error = error;
1704                 return(error);
1705         }
1706         p += ealen;
1707         bzero(p, eapad2);
1708
1709         p = ip->i_ea_area;
1710         ip->i_ea_area = eae;
1711         ip->i_ea_len = easize;
1712         free(p, M_TEMP);
1713         if (stand_alone)
1714                 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1715         return(error);
1716 }
1717
1718 /*
1719  * Vnode pointer to File handle
1720  */
1721 static int
1722 ffs_vptofh(struct vop_vptofh_args *ap)
1723 /*
1724 vop_vptofh {
1725         IN struct vnode *a_vp;
1726         IN struct fid *a_fhp;
1727 };
1728 */
1729 {
1730         struct inode *ip;
1731         struct ufid *ufhp;
1732
1733         ip = VTOI(ap->a_vp);
1734         ufhp = (struct ufid *)ap->a_fhp;
1735         ufhp->ufid_len = sizeof(struct ufid);
1736         ufhp->ufid_ino = ip->i_number;
1737         ufhp->ufid_gen = ip->i_gen;
1738         return (0);
1739 }