sys/ufs/ffs/ffs_vnops.c

   1 /*-
   2  * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
   3  * All rights reserved.
   4  *
   5  * This software was developed for the FreeBSD Project by Marshall
   6  * Kirk McKusick and Network Associates Laboratories, the Security
   7  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
   8  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
   9  * research program
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in the
  18  *    documentation and/or other materials provided with the distribution.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30  * SUCH DAMAGE.
  31  *
  32  * Copyright (c) 1982, 1986, 1989, 1993
  33  *      The Regents of the University of California.  All rights reserved.
  34  *
  35  * Redistribution and use in source and binary forms, with or without
  36  * modification, are permitted provided that the following conditions
  37  * are met:
  38  * 1. Redistributions of source code must retain the above copyright
  39  *    notice, this list of conditions and the following disclaimer.
  40  * 2. Redistributions in binary form must reproduce the above copyright
  41  *    notice, this list of conditions and the following disclaimer in the
  42  *    documentation and/or other materials provided with the distribution.
  43  * 4. Neither the name of the University nor the names of its contributors
  44  *    may be used to endorse or promote products derived from this software
  45  *    without specific prior written permission.
  46  *
  47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  57  * SUCH DAMAGE.
  58  *
  59  *      from: @(#)ufs_readwrite.c       8.11 (Berkeley) 5/8/95
  60  * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
  61  *      @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95
  62  */
  63
  64 #include <sys/cdefs.h>
  65 __FBSDID("$FreeBSD$");
  66
  67 #include <sys/param.h>
  68 #include <sys/bio.h>
  69 #include <sys/systm.h>
  70 #include <sys/buf.h>
  71 #include <sys/conf.h>
  72 #include <sys/extattr.h>
  73 #include <sys/kernel.h>
  74 #include <sys/limits.h>
  75 #include <sys/malloc.h>
  76 #include <sys/mount.h>
  77 #include <sys/priv.h>
  78 #include <sys/proc.h>
  79 #include <sys/resourcevar.h>
  80 #include <sys/signalvar.h>
  81 #include <sys/stat.h>
  82 #include <sys/vmmeter.h>
  83 #include <sys/vnode.h>
  84
  85 #include <vm/vm.h>
  86 #include <vm/vm_extern.h>
  87 #include <vm/vm_object.h>
  88 #include <vm/vm_page.h>
  89 #include <vm/vm_pager.h>
  90 #include <vm/vnode_pager.h>
  91
  92 #include <ufs/ufs/extattr.h>
  93 #include <ufs/ufs/quota.h>
  94 #include <ufs/ufs/inode.h>
  95 #include <ufs/ufs/ufs_extern.h>
  96 #include <ufs/ufs/ufsmount.h>
  97
  98 #include <ufs/ffs/fs.h>
  99 #include <ufs/ffs/ffs_extern.h>
 100 #include "opt_directio.h"
 101 #include "opt_ffs.h"
  102
      /*
       * Prototypes.  ffs_rawread() is an external function and is only
       * declared when DIRECTIO is configured; every other declaration in
       * this group is file-local (static).
       */
  103 #ifdef DIRECTIO
  104 extern int      ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
  105 #endif
  106 static vop_fsync_t      ffs_fsync;
  107 static _vop_lock_t      ffs_lock;
  108 static vop_getpages_t   ffs_getpages;
  109 static vop_read_t       ffs_read;
  110 static vop_write_t      ffs_write;
  111 static int      ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
  112 static int      ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
  113                     struct ucred *cred);
  114 static vop_strategy_t   ffsext_strategy;
  115 static vop_closeextattr_t       ffs_closeextattr;
  116 static vop_deleteextattr_t      ffs_deleteextattr;
  117 static vop_getextattr_t ffs_getextattr;
  118 static vop_listextattr_t        ffs_listextattr;
  119 static vop_openextattr_t        ffs_openextattr;
  120 static vop_setextattr_t ffs_setextattr;
  121 static vop_vptofh_t     ffs_vptofh;
 122
 123
  124 /*
       * Global vfs data structures for ufs.
       *
       * ffs_vnodeops1 names ufs_vnodeops as its vop_default and supplies
       * FFS-specific entries for fsync, getpages, lock, read, reallocblks,
       * write and vptofh.  It has no extended-attribute entries.
       * NOTE(review): the "1" suffix presumably refers to the UFS1 on-disk
       * format -- confirm against the code that selects this table.
       */
  125 struct vop_vector ffs_vnodeops1 = {
  126         .vop_default =          &ufs_vnodeops,
  127         .vop_fsync =            ffs_fsync,
  128         .vop_getpages =         ffs_getpages,
  129         ._vop_lock =            ffs_lock,
  130         .vop_read =             ffs_read,
  131         .vop_reallocblks =      ffs_reallocblks,
  132         .vop_write =            ffs_write,
  133         .vop_vptofh =           ffs_vptofh,
  134 };
  135
      /*
       * ffs_fifoops1 names ufs_fifoops as its vop_default and supplies
       * only fsync, reallocblks and vptofh.  Unlike the other three tables
       * in this file it does not set _vop_lock.
       * NOTE(review): whether reallocblks belongs in a fifo table is already
       * questioned by the XXX below; left unchanged.
       */
  136 struct vop_vector ffs_fifoops1 = {
  137         .vop_default =          &ufs_fifoops,
  138         .vop_fsync =            ffs_fsync,
  139         .vop_reallocblks =      ffs_reallocblks, /* XXX: really ??? */
  140         .vop_vptofh =           ffs_vptofh,
  141 };
 142
  143 /*
       * Global vfs data structures for ufs.
       *
       * ffs_vnodeops2 carries the same entries as ffs_vnodeops1 and adds
       * the six extended-attribute operations (close, delete, get, list,
       * open and set).
       * NOTE(review): the "2" suffix presumably refers to the UFS2 on-disk
       * format -- confirm against the code that selects this table.
       */
  144 struct vop_vector ffs_vnodeops2 = {
  145         .vop_default =          &ufs_vnodeops,
  146         .vop_fsync =            ffs_fsync,
  147         .vop_getpages =         ffs_getpages,
  148         ._vop_lock =            ffs_lock,
  149         .vop_read =             ffs_read,
  150         .vop_reallocblks =      ffs_reallocblks,
  151         .vop_write =            ffs_write,
  152         .vop_closeextattr =     ffs_closeextattr,
  153         .vop_deleteextattr =    ffs_deleteextattr,
  154         .vop_getextattr =       ffs_getextattr,
  155         .vop_listextattr =      ffs_listextattr,
  156         .vop_openextattr =      ffs_openextattr,
  157         .vop_setextattr =       ffs_setextattr,
  158         .vop_vptofh =           ffs_vptofh,
  159 };
  160
      /*
       * ffs_fifoops2 names ufs_fifoops as its vop_default.  Compared with
       * ffs_fifoops1 it additionally sets _vop_lock, vop_strategy
       * (ffsext_strategy) and the six extended-attribute operations.
       */
  161 struct vop_vector ffs_fifoops2 = {
  162         .vop_default =          &ufs_fifoops,
  163         .vop_fsync =            ffs_fsync,
  164         ._vop_lock =            ffs_lock,
  165         .vop_reallocblks =      ffs_reallocblks,
  166         .vop_strategy =         ffsext_strategy,
  167         .vop_closeextattr =     ffs_closeextattr,
  168         .vop_deleteextattr =    ffs_deleteextattr,
  169         .vop_getextattr =       ffs_getextattr,
  170         .vop_listextattr =      ffs_listextattr,
  171         .vop_openextattr =      ffs_openextattr,
  172         .vop_setextattr =       ffs_setextattr,
  173         .vop_vptofh =           ffs_vptofh,
  174 };
 175
  176 /*
  177  * Synch an open file.
       *
       * Flushes the vnode through ffs_syncvnode() and returns its error if
       * that fails.  When the caller asked for MNT_WAIT and the mount has
       * MNT_SOFTDEP set, softdep_fsync() is called afterwards and its result
       * is returned; otherwise the return value is 0.
  178  */
  179 /* ARGSUSED */
  180 static int
  181 ffs_fsync(struct vop_fsync_args *ap)
  182 {
  183         int error;
  184
  185         error = ffs_syncvnode(ap->a_vp, ap->a_waitfor);
  186         if (error)
  187                 return (error);
              /* Only a waiting sync on a soft-updates mount needs the extra step. */
  188         if (ap->a_waitfor == MNT_WAIT &&
  189             (ap->a_vp->v_mount->mnt_flag & MNT_SOFTDEP))
  190                 error = softdep_fsync(ap->a_vp);
  191         return (error);
  192 }
  193
      /*
       * Flush the dirty buffers of a vnode, then update its inode.
       *
       * vp       vnode whose dirty buffer list (v_bufobj.bo_dirty) is scanned.
       * waitfor  MNT_WAIT requests a synchronous flush; any other value
       *          requests a best-effort asynchronous one.
       *
       * A synchronous flush first scans with metadata buffers skipped, then
       * scans again including them, waits for writes in progress, calls
       * softdep_sync_metadata(), and repeats for up to NIADDR + 1 further
       * rounds while dirty buffers remain.  Once those rounds are used up
       * the buffers are written with bwrite() so that write errors are seen.
       *
       * Returns the error from that bwrite() or from softdep_sync_metadata()
       * if either fails; otherwise the result of ffs_update(vp, wait).
       */
  194 int
  195 ffs_syncvnode(struct vnode *vp, int waitfor)
  196 {
  197         struct inode *ip = VTOI(vp);
  198         struct buf *bp;
  199         struct buf *nbp;
  200         int s, error, wait, passes, skipmeta;
  201         ufs_lbn_t lbn;
  202
  203         wait = (waitfor == MNT_WAIT);
              /*
               * Block number computed from the file size rounded up by one
               * block; buffers at or beyond it are treated below as lying
               * past the end of the file.
               */
  204         lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
  205
  206         /*
  207          * Flush all dirty buffers associated with a vnode.
  208          */
  209         passes = NIADDR + 1;
  210         skipmeta = 0;
  211         if (wait)
  212                 skipmeta = 1;
  213         s = splbio();
  214         VI_LOCK(vp);
  215 loop:
              /* Clear the per-scan marker on every dirty buffer before scanning. */
  216         TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs)
  217                 bp->b_vflags &= ~BV_SCANNED;
  218         TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
  219                 /*
  220                  * Reasons to skip this buffer: it has already been considered
  221                  * on this pass, this pass is the first time through on a
  222                  * synchronous flush request and the buffer being considered
  223                  * is metadata, the buffer has dependencies that will cause
  224                  * it to be redirtied and it has not already been deferred,
  225                  * or it is already being written.
  226                  */
  227                 if ((bp->b_vflags & BV_SCANNED) != 0)
  228                         continue;
  229                 bp->b_vflags |= BV_SCANNED;
  230                 if ((skipmeta == 1 && bp->b_lblkno < 0))
  231                         continue;
  232                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
  233                         continue;
                      /* The interlock is dropped while the locked buffer is handled. */
  234                 VI_UNLOCK(vp);
  235                 if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
  236                     (bp->b_flags & B_DEFERRED) == 0 &&
  237                     buf_countdeps(bp, 0)) {
  238                         bp->b_flags |= B_DEFERRED;
  239                         BUF_UNLOCK(bp);
  240                         VI_LOCK(vp);
  241                         continue;
  242                 }
  243                 if ((bp->b_flags & B_DELWRI) == 0)
  244                         panic("ffs_fsync: not dirty");
  245                 /*
  246                  * If this is a synchronous flush request, or it is not a
  247                  * file or device, start the write on this buffer immediately.
  248                  */
  249                 if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {
  250
  251                         /*
  252                          * On our final pass through, do all I/O synchronously
  253                          * so that we can find out if our flush is failing
  254                          * because of write errors.
  255                          */
  256                         if (passes > 0 || !wait) {
  257                                 if ((bp->b_flags & B_CLUSTEROK) && !wait) {
  258                                         (void) vfs_bio_awrite(bp);
  259                                 } else {
  260                                         bremfree(bp);
  261                                         splx(s);
  262                                         (void) bawrite(bp);
  263                                         s = splbio();
  264                                 }
  265                         } else {
  266                                 bremfree(bp);
  267                                 splx(s);
                                      /*
                                       * The vnode interlock was dropped above and
                                       * splx() has just run, so returning here
                                       * leaves neither of them held.
                                       */
  268                                 if ((error = bwrite(bp)) != 0)
  269                                         return (error);
  270                                 s = splbio();
  271                         }
  272                 } else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
  273                         /*
  274                          * If the buffer is for data that has been truncated
  275                          * off the file, then throw it away.
  276                          */
  277                         bremfree(bp);
  278                         bp->b_flags |= B_INVAL | B_NOCACHE;
  279                         splx(s);
  280                         brelse(bp);
  281                         s = splbio();
  282                 } else
  283                         vfs_bio_awrite(bp);
  284
  285                 /*
  286                  * Since we may have slept during the I/O, we need
  287                  * to start from a known point.
  288                  */
  289                 VI_LOCK(vp);
  290                 nbp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd);
  291         }
  292         /*
  293          * If we were asked to do this synchronously, then go back for
  294          * another pass, this time doing the metadata.
  295          */
  296         if (skipmeta) {
  297                 skipmeta = 0;
  298                 goto loop;
  299         }
  300
  301         if (wait) {
  302                 bufobj_wwait(&vp->v_bufobj, 3, 0);
  303                 VI_UNLOCK(vp);
  304
  305                 /*
  306                  * Ensure that any filesystem metadata associated
  307                  * with the vnode has been written.
  308                  */
  309                 splx(s);
  310                 if ((error = softdep_sync_metadata(vp)) != 0)
  311                         return (error);
  312                 s = splbio();
  313
  314                 VI_LOCK(vp);
  315                 if (vp->v_bufobj.bo_dirty.bv_cnt > 0) {
  316                         /*
  317                          * Block devices associated with filesystems may
  318                          * have new I/O requests posted for them even if
  319                          * the vnode is locked, so no amount of trying will
  320                          * get them clean. Thus we give block devices a
  321                          * good effort, then just give up. For all other file
  322                          * types, go around and try again until it is clean.
  323                          */
  324                         if (passes > 0) {
  325                                 passes -= 1;
  326                                 goto loop;
  327                         }
  328 #ifdef DIAGNOSTIC
  329                         if (!vn_isdisk(vp, NULL))
  330                                 vprint("ffs_fsync: dirty", vp);
  331 #endif
  332                 }
  333         }
  334         VI_UNLOCK(vp);
  335         splx(s);
  336         return (ffs_update(vp, wait));
  337 }
  338
      /*
       * Vnode op for locking.
       *
       * Unless NO_FFS_SNAPSHOT is defined, LK_SHARED, LK_UPGRADE and
       * LK_EXCLUSIVE requests are handled here: the lock that vp->v_vnlock
       * points to is acquired with the vnode interlock held, and the
       * acquisition is retried if v_vnlock points to a different lock once
       * _lockmgr() returns.  Every other request type, and every request
       * when NO_FFS_SNAPSHOT is defined, is passed to
       * _VOP_LOCK_APV(&ufs_vnodeops, ap).
       *
       * Returns the result of the last _lockmgr() or _VOP_LOCK_APV() call.
       */
  339 static int
  340 ffs_lock(ap)
  341         struct _vop_lock_args /* {
  342                 struct vnode *a_vp;
  343                 int a_flags;
  344                 struct thread *a_td;
  345                 char *a_file;
  346                 int a_line;
  347         } */ *ap;
  348 {
  349 #ifndef NO_FFS_SNAPSHOT
  350         struct vnode *vp;
  351         int flags;
  352         struct lock *lkp;
  353         int result;
  354
  355         switch (ap->a_flags & LK_TYPE_MASK) {
  356         case LK_SHARED:
  357         case LK_UPGRADE:
  358         case LK_EXCLUSIVE:
  359                 vp = ap->a_vp;
  360                 flags = ap->a_flags;
  361                 for (;;) {
  362                         /*
  363                          * vnode interlock must be held to ensure that
  364                          * the possibly external lock isn't freed,
  365                          * e.g. when mutating from snapshot file vnode
  366                          * to regular file vnode.
  367                          */
  368                         if ((flags & LK_INTERLOCK) == 0) {
  369                                 VI_LOCK(vp);
  370                                 flags |= LK_INTERLOCK;
  371                         }
                              /* Remember which lock was requested so a change can be detected. */
  372                         lkp = vp->v_vnlock;
  373                         result = _lockmgr(lkp, flags, VI_MTX(vp), ap->a_td, ap->a_file, ap->a_line);
  374                         if (lkp == vp->v_vnlock || result != 0)
  375                                 break;
  376                         /*
  377                          * Apparent success, except that the vnode
  378                          * mutated between snapshot file vnode and
  379                          * regular file vnode while this process
  380                          * slept.  The lock currently held is not the
  381                          * right lock.  Release it, and try to get the
  382                          * new lock.
  383                          */
  384                         (void) _lockmgr(lkp, LK_RELEASE, VI_MTX(vp), ap->a_td, ap->a_file, ap->a_line);
                              /*
                               * NOTE(review): an upgrade is retried as a plain
                               * exclusive request, presumably because nothing is
                               * held on the new lock yet -- confirm.
                               */
  385                         if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
  386                                 flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
                              /* Make the next iteration take the interlock again. */
  387                         flags &= ~LK_INTERLOCK;
  388                 }
  389                 break;
  390         default:
  391                 result = _VOP_LOCK_APV(&ufs_vnodeops, ap);
  392         }
  393         return (result);
  394 #else
  395         return (_VOP_LOCK_APV(&ufs_vnodeops, ap));
  396 #endif
  397 }
 398
  399 /*
  400  * Vnode op for reading.
       *
       * Transfers data for ap->a_vp into ap->a_uio, at most one filesystem
       * block per loop iteration, until the request is satisfied, the end
       * of the file is reached, or an error occurs.
       *
       * IO_EXT in a_ioflag panics unless "notyet" is defined, in which case
       * the request is handed to ffs_extread().  With DIRECTIO configured,
       * IO_DIRECT requests are first offered to ffs_rawread(); if it fails
       * or reports work done, its error value is returned directly.
       *
       * Returns 0 for a zero-length request, EOVERFLOW when the starting
       * offset lies inside the file but at or beyond fs_maxfilesize, and
       * otherwise the error from the buffer read or uiomove(), or 0.  On
       * the normal exit path IN_ACCESS is set on the inode when the read
       * succeeded or moved any data, unless the mount has MNT_NOATIME.
  401  */
  402 /* ARGSUSED */
  403 static int
  404 ffs_read(ap)
  405         struct vop_read_args /* {
  406                 struct vnode *a_vp;
  407                 struct uio *a_uio;
  408                 int a_ioflag;
  409                 struct ucred *a_cred;
  410         } */ *ap;
  411 {
  412         struct vnode *vp;
  413         struct inode *ip;
  414         struct uio *uio;
  415         struct fs *fs;
  416         struct buf *bp;
  417         ufs_lbn_t lbn, nextlbn;
  418         off_t bytesinfile;
  419         long size, xfersize, blkoffset;
  420         int error, orig_resid;
  421         int seqcount;
  422         int ioflag;
  423
  424         vp = ap->a_vp;
  425         uio = ap->a_uio;
  426         ioflag = ap->a_ioflag;
  427         if (ap->a_ioflag & IO_EXT)
  428 #ifdef notyet
  429                 return (ffs_extread(vp, uio, ioflag));
  430 #else
  431                 panic("ffs_read+IO_EXT");
  432 #endif
  433 #ifdef DIRECTIO
  434         if ((ioflag & IO_DIRECT) != 0) {
  435                 int workdone;
  436
  437                 error = ffs_rawread(vp, uio, &workdone);
                      /* Fall through to the buffered path only if nothing was done. */
  438                 if (error != 0 || workdone != 0)
  439                         return error;
  440         }
  441 #endif
  442
              /* The bits of a_ioflag above IO_SEQSHIFT are passed on as seqcount. */
  443         seqcount = ap->a_ioflag >> IO_SEQSHIFT;
  444         ip = VTOI(vp);
  445
  446 #ifdef DIAGNOSTIC
  447         if (uio->uio_rw != UIO_READ)
  448                 panic("ffs_read: mode");
  449
  450         if (vp->v_type == VLNK) {
  451                 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
  452                         panic("ffs_read: short symlink");
  453         } else if (vp->v_type != VREG && vp->v_type != VDIR)
  454                 panic("ffs_read: type %d",  vp->v_type);
  455 #endif
  456         orig_resid = uio->uio_resid;
  457         KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
  458         if (orig_resid == 0)
  459                 return (0);
  460         KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
  461         fs = ip->i_fs;
  462         if (uio->uio_offset < ip->i_size &&
  463             uio->uio_offset >= fs->fs_maxfilesize)
  464                 return (EOVERFLOW);
  465
  466         for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
                      /* Stop at (or past) the end of the file. */
  467                 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
  468                         break;
  469                 lbn = lblkno(fs, uio->uio_offset);
  470                 nextlbn = lbn + 1;
  471
  472                 /*
  473                  * size of buffer.  The buffer representing the
  474                  * end of the file is rounded up to the size of
  475                  * the block type ( fragment or full block,
  476                  * depending ).
  477                  */
  478                 size = blksize(fs, ip, lbn);
  479                 blkoffset = blkoff(fs, uio->uio_offset);
  480
  481                 /*
  482                  * The amount we want to transfer in this iteration is
  483                  * one FS block less the amount of the data before
  484                  * our startpoint (duh!)
  485                  */
  486                 xfersize = fs->fs_bsize - blkoffset;
  487
  488                 /*
  489                  * But if we actually want less than the block,
  490                  * or the file doesn't have a whole block more of data,
  491                  * then use the lesser number.
  492                  */
  493                 if (uio->uio_resid < xfersize)
  494                         xfersize = uio->uio_resid;
  495                 if (bytesinfile < xfersize)
  496                         xfersize = bytesinfile;
  497
  498                 if (lblktosize(fs, nextlbn) >= ip->i_size) {
  499                         /*
  500                          * Don't do readahead if this is the end of the file.
  501                          */
  502                         error = bread(vp, lbn, size, NOCRED, &bp);
  503                 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
  504                         /*
  505                          * Otherwise if we are allowed to cluster,
  506                          * grab as much as we can.
  507                          *
  508                          * XXX  This may not be a win if we are not
  509                          * doing sequential access.
  510                          */
  511                         error = cluster_read(vp, ip->i_size, lbn,
  512                                 size, NOCRED, blkoffset + uio->uio_resid, seqcount, &bp);
  513                 } else if (seqcount > 1) {
  514                         /*
  515                          * If we are NOT allowed to cluster, then
  516                          * if we appear to be acting sequentially,
  517                          * fire off a request for a readahead
  518                          * as well as a read. Note that the 4th and 5th
  519                          * arguments point to arrays of the size specified in
  520                          * the 6th argument.
  521                          */
  522                         int nextsize = blksize(fs, ip, nextlbn);
  523                         error = breadn(vp, lbn,
  524                             size, &nextlbn, &nextsize, 1, NOCRED, &bp);
  525                 } else {
  526                         /*
  527                          * Failing all of the above, just read what the
  528                          * user asked for. Interestingly, the same as
  529                          * the first option above.
  530                          */
  531                         error = bread(vp, lbn, size, NOCRED, &bp);
  532                 }
  533                 if (error) {
                              /* Release here and clear bp so the code after the loop skips it. */
  534                         brelse(bp);
  535                         bp = NULL;
  536                         break;
  537                 }
  538
  539                 /*
  540                  * If IO_DIRECT then set B_DIRECT for the buffer.  This
  541                  * will cause us to attempt to release the buffer later on
  542                  * and will cause the buffer cache to attempt to free the
  543                  * underlying pages.
  544                  */
  545                 if (ioflag & IO_DIRECT)
  546                         bp->b_flags |= B_DIRECT;
  547
  548                 /*
  549                  * We should only get non-zero b_resid when an I/O error
  550                  * has occurred, which should cause us to break above.
  551                  * However, if the short read did not cause an error,
  552                  * then we want to ensure that we do not uiomove bad
  553                  * or uninitialized data.
  554                  */
  555                 size -= bp->b_resid;
  556                 if (size < xfersize) {
  557                         if (size == 0)
  558                                 break;
  559                         xfersize = size;
  560                 }
  561
  562                 error = uiomove((char *)bp->b_data + blkoffset,
  563                     (int)xfersize, uio);
  564                 if (error)
  565                         break;
  566
  567                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
  568                    (LIST_FIRST(&bp->b_dep) == NULL)) {
  569                         /*
  570                          * If there are no dependencies, and it's VMIO,
  571                          * then we don't need the buf, mark it available
  572                          * for freeing. The VM has the data.
  573                          */
  574                         bp->b_flags |= B_RELBUF;
  575                         brelse(bp);
  576                 } else {
  577                         /*
  578                          * Otherwise let whoever
  579                          * made the request take care of
  580                          * freeing it. We just queue
  581                          * it onto another list.
  582                          */
  583                         bqrelse(bp);
  584                 }
  585         }
  586
  587         /*
  588          * bp is non-NULL here only if the loop was left by a 'break'
  589          * while a buffer was still held: a short read that returned
  590          * no data, or a failed uiomove().  Every iteration starts with
  591          * bp == NULL and the read-error path clears it before breaking.
  592          */
  593         if (bp != NULL) {
  594                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
  595                    (LIST_FIRST(&bp->b_dep) == NULL)) {
  596                         bp->b_flags |= B_RELBUF;
  597                         brelse(bp);
  598                 } else {
  599                         bqrelse(bp);
  600                 }
  601         }
  602
              /*
               * Mark the inode as accessed when the read succeeded or moved
               * any data, unless the mount has MNT_NOATIME set.
               */
  603         if ((error == 0 || uio->uio_resid != orig_resid) &&
  604             (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
  605                 VI_LOCK(vp);
  606                 ip->i_flag |= IN_ACCESS;
  607                 VI_UNLOCK(vp);
  608         }
  609         return (error);
  610 }
 611
 612 /*
 613  * Vnode op for writing.
 614  */
 615 static int
 616 ffs_write(ap)
 617         struct vop_write_args /* {
 618                 struct vnode *a_vp;
 619                 struct uio *a_uio;
 620                 int a_ioflag;
 621                 struct ucred *a_cred;
 622         } */ *ap;
 623 {
 624         struct vnode *vp;
 625         struct uio *uio;
 626         struct inode *ip;
 627         struct fs *fs;
 628         struct buf *bp;
 629         struct thread *td;
 630         ufs_lbn_t lbn;
 631         off_t osize;
 632         int seqcount;
 633         int blkoffset, error, flags, ioflag, resid, size, xfersize;
 634
 635         vp = ap->a_vp;
 636         uio = ap->a_uio;
 637         ioflag = ap->a_ioflag;
 638         if (ap->a_ioflag & IO_EXT)
 639 #ifdef notyet
 640                 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
 641 #else
 642                 panic("ffs_write+IO_EXT");
 643 #endif
 644
 645         seqcount = ap->a_ioflag >> IO_SEQSHIFT;
 646         ip = VTOI(vp);
 647
 648 #ifdef DIAGNOSTIC
 649         if (uio->uio_rw != UIO_WRITE)
 650                 panic("ffs_write: mode");
 651 #endif
 652
 653         switch (vp->v_type) {
 654         case VREG:
 655                 if (ioflag & IO_APPEND)
 656                         uio->uio_offset = ip->i_size;
 657                 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
 658                         return (EPERM);
 659                 /* FALLTHROUGH */
 660         case VLNK:
 661                 break;
 662         case VDIR:
 663                 panic("ffs_write: dir write");
 664                 break;
 665         default:
 666                 panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
 667                         (int)uio->uio_offset,
 668                         (int)uio->uio_resid
 669                 );
 670         }
 671
 672         KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
 673         KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
 674         fs = ip->i_fs;
 675         if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
 676                 return (EFBIG);
 677         /*
 678          * Maybe this should be above the vnode op call, but so long as
 679          * file servers have no limits, I don't think it matters.
 680          */
 681         td = uio->uio_td;
 682         if (vp->v_type == VREG && td != NULL) {
 683                 PROC_LOCK(td->td_proc);
 684                 if (uio->uio_offset + uio->uio_resid >
 685                     lim_cur(td->td_proc, RLIMIT_FSIZE)) {
 686                         psignal(td->td_proc, SIGXFSZ);
 687                         PROC_UNLOCK(td->td_proc);
 688                         return (EFBIG);
 689                 }
 690                 PROC_UNLOCK(td->td_proc);
 691         }
 692
 693         resid = uio->uio_resid;
 694         osize = ip->i_size;
 695         if (seqcount > BA_SEQMAX)
 696                 flags = BA_SEQMAX << BA_SEQSHIFT;
 697         else
 698                 flags = seqcount << BA_SEQSHIFT;
 699         if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
 700                 flags |= IO_SYNC;
 701
 702         for (error = 0; uio->uio_resid > 0;) {
 703                 lbn = lblkno(fs, uio->uio_offset);
 704                 blkoffset = blkoff(fs, uio->uio_offset);
 705                 xfersize = fs->fs_bsize - blkoffset;
 706                 if (uio->uio_resid < xfersize)
 707                         xfersize = uio->uio_resid;
 708                 if (uio->uio_offset + xfersize > ip->i_size)
 709                         vnode_pager_setsize(vp, uio->uio_offset + xfersize);
 710
 711                 /*
 712                  * We must perform a read-before-write if the transfer size
 713                  * does not cover the entire buffer.
 714                  */
 715                 if (fs->fs_bsize > xfersize)
 716                         flags |= BA_CLRBUF;
 717                 else
 718                         flags &= ~BA_CLRBUF;
 719 /* XXX is uio->uio_offset the right thing here? */
 720                 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
 721                     ap->a_cred, flags, &bp);
 722                 if (error != 0)
 723                         break;
 724                 /*
 725                  * If the buffer is not valid we have to clear out any
 726                  * garbage data from the pages instantiated for the buffer.
 727                  * If we do not, a failed uiomove() during a write can leave
 728                  * the prior contents of the pages exposed to a userland
 729                  * mmap().  XXX deal with uiomove() errors a better way.
 730                  */
 731                 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
 732                         vfs_bio_clrbuf(bp);
 733                 if (ioflag & IO_DIRECT)
 734                         bp->b_flags |= B_DIRECT;
 735                 if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
 736                         bp->b_flags |= B_NOCACHE;
 737
 738                 if (uio->uio_offset + xfersize > ip->i_size) {
 739                         ip->i_size = uio->uio_offset + xfersize;
 740                         DIP_SET(ip, i_size, ip->i_size);
 741                 }
 742
 743                 size = blksize(fs, ip, lbn) - bp->b_resid;
 744                 if (size < xfersize)
 745                         xfersize = size;
 746
 747                 error =
 748                     uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
 749                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
 750                    (LIST_FIRST(&bp->b_dep) == NULL)) {
 751                         bp->b_flags |= B_RELBUF;
 752                 }
 753
 754                 /*
 755                  * If IO_SYNC each buffer is written synchronously.  Otherwise
 756                  * if we have a severe page deficiency write the buffer
 757                  * asynchronously.  Otherwise try to cluster, and if that
 758                  * doesn't do it then either do an async write (if O_DIRECT),
 759                  * or a delayed write (if not).
 760                  */
 761                 if (ioflag & IO_SYNC) {
 762                         (void)bwrite(bp);
 763                 } else if (vm_page_count_severe() ||
 764                             buf_dirty_count_severe() ||
 765                             (ioflag & IO_ASYNC)) {
 766                         bp->b_flags |= B_CLUSTEROK;
 767                         bawrite(bp);
 768                 } else if (xfersize + blkoffset == fs->fs_bsize) {
 769                         if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
 770                                 bp->b_flags |= B_CLUSTEROK;
 771                                 cluster_write(vp, bp, ip->i_size, seqcount);
 772                         } else {
 773                                 bawrite(bp);
 774                         }
 775                 } else if (ioflag & IO_DIRECT) {
 776                         bp->b_flags |= B_CLUSTEROK;
 777                         bawrite(bp);
 778                 } else {
 779                         bp->b_flags |= B_CLUSTEROK;
 780                         bdwrite(bp);
 781                 }
 782                 if (error || xfersize == 0)
 783                         break;
 784                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
 785         }
 786         /*
 787          * If we successfully wrote any data, and we are not the superuser
 788          * we clear the setuid and setgid bits as a precaution against
 789          * tampering.
 790          */
 791         if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
 792             ap->a_cred) {
 793                 if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID,
 794                     SUSER_ALLOWJAIL)) {
 795                         ip->i_mode &= ~(ISUID | ISGID);
 796                         DIP_SET(ip, i_mode, ip->i_mode);
 797                 }
 798         }
 799         if (error) {
 800                 if (ioflag & IO_UNIT) {
 801                         (void)ffs_truncate(vp, osize,
 802                             IO_NORMAL | (ioflag & IO_SYNC),
 803                             ap->a_cred, uio->uio_td);
 804                         uio->uio_offset -= resid - uio->uio_resid;
 805                         uio->uio_resid = resid;
 806                 }
 807         } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
 808                 error = ffs_update(vp, 1);
 809         return (error);
 810 }
 811
 812 /*
 813  * get page routine
 814  */
 815 static int
 816 ffs_getpages(ap)
 817         struct vop_getpages_args *ap;
 818 {
 819         int i;
 820         vm_page_t mreq;
 821         int pcount;
 822
 823         pcount = round_page(ap->a_count) / PAGE_SIZE;
 824         mreq = ap->a_m[ap->a_reqpage];
 825
 826         /*
 827          * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
 828          * then the entire page is valid.  Since the page may be mapped,
 829          * user programs might reference data beyond the actual end of file
 830          * occuring within the page.  We have to zero that data.
 831          */
 832         VM_OBJECT_LOCK(mreq->object);
 833         if (mreq->valid) {
 834                 if (mreq->valid != VM_PAGE_BITS_ALL)
 835                         vm_page_zero_invalid(mreq, TRUE);
 836                 vm_page_lock_queues();
 837                 for (i = 0; i < pcount; i++) {
 838                         if (i != ap->a_reqpage) {
 839                                 vm_page_free(ap->a_m[i]);
 840                         }
 841                 }
 842                 vm_page_unlock_queues();
 843                 VM_OBJECT_UNLOCK(mreq->object);
 844                 return VM_PAGER_OK;
 845         }
 846         VM_OBJECT_UNLOCK(mreq->object);
 847
 848         return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
 849                                             ap->a_count,
 850                                             ap->a_reqpage);
 851 }
 852
 853
 854 /*
 855  * Extended attribute area reading.
 856  */
 857 static int
 858 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
 859 {
 860         struct inode *ip;
 861         struct ufs2_dinode *dp;
 862         struct fs *fs;
 863         struct buf *bp;
 864         ufs_lbn_t lbn, nextlbn;
 865         off_t bytesinfile;
 866         long size, xfersize, blkoffset;
 867         int error, orig_resid;
 868
 869         ip = VTOI(vp);
 870         fs = ip->i_fs;
 871         dp = ip->i_din2;
 872
 873 #ifdef DIAGNOSTIC
 874         if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
 875                 panic("ffs_extread: mode");
 876
 877 #endif
 878         orig_resid = uio->uio_resid;
 879         KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
 880         if (orig_resid == 0)
 881                 return (0);
 882         KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));
 883
 884         for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
 885                 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
 886                         break;
 887                 lbn = lblkno(fs, uio->uio_offset);
 888                 nextlbn = lbn + 1;
 889
 890                 /*
 891                  * size of buffer.  The buffer representing the
 892                  * end of the file is rounded up to the size of
 893                  * the block type ( fragment or full block,
 894                  * depending ).
 895                  */
 896                 size = sblksize(fs, dp->di_extsize, lbn);
 897                 blkoffset = blkoff(fs, uio->uio_offset);
 898
 899                 /*
 900                  * The amount we want to transfer in this iteration is
 901                  * one FS block less the amount of the data before
 902                  * our startpoint (duh!)
 903                  */
 904                 xfersize = fs->fs_bsize - blkoffset;
 905
 906                 /*
 907                  * But if we actually want less than the block,
 908                  * or the file doesn't have a whole block more of data,
 909                  * then use the lesser number.
 910                  */
 911                 if (uio->uio_resid < xfersize)
 912                         xfersize = uio->uio_resid;
 913                 if (bytesinfile < xfersize)
 914                         xfersize = bytesinfile;
 915
 916                 if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
 917                         /*
 918                          * Don't do readahead if this is the end of the info.
 919                          */
 920                         error = bread(vp, -1 - lbn, size, NOCRED, &bp);
 921                 } else {
 922                         /*
 923                          * If we have a second block, then
 924                          * fire off a request for a readahead
 925                          * as well as a read. Note that the 4th and 5th
 926                          * arguments point to arrays of the size specified in
 927                          * the 6th argument.
 928                          */
 929                         int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
 930
 931                         nextlbn = -1 - nextlbn;
 932                         error = breadn(vp, -1 - lbn,
 933                             size, &nextlbn, &nextsize, 1, NOCRED, &bp);
 934                 }
 935                 if (error) {
 936                         brelse(bp);
 937                         bp = NULL;
 938                         break;
 939                 }
 940
 941                 /*
 942                  * If IO_DIRECT then set B_DIRECT for the buffer.  This
 943                  * will cause us to attempt to release the buffer later on
 944                  * and will cause the buffer cache to attempt to free the
 945                  * underlying pages.
 946                  */
 947                 if (ioflag & IO_DIRECT)
 948                         bp->b_flags |= B_DIRECT;
 949
 950                 /*
 951                  * We should only get non-zero b_resid when an I/O error
 952                  * has occurred, which should cause us to break above.
 953                  * However, if the short read did not cause an error,
 954                  * then we want to ensure that we do not uiomove bad
 955                  * or uninitialized data.
 956                  */
 957                 size -= bp->b_resid;
 958                 if (size < xfersize) {
 959                         if (size == 0)
 960                                 break;
 961                         xfersize = size;
 962                 }
 963
 964                 error = uiomove((char *)bp->b_data + blkoffset,
 965                                         (int)xfersize, uio);
 966                 if (error)
 967                         break;
 968
 969                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
 970                    (LIST_FIRST(&bp->b_dep) == NULL)) {
 971                         /*
 972                          * If there are no dependencies, and it's VMIO,
 973                          * then we don't need the buf, mark it available
 974                          * for freeing. The VM has the data.
 975                          */
 976                         bp->b_flags |= B_RELBUF;
 977                         brelse(bp);
 978                 } else {
 979                         /*
 980                          * Otherwise let whoever
 981                          * made the request take care of
 982                          * freeing it. We just queue
 983                          * it onto another list.
 984                          */
 985                         bqrelse(bp);
 986                 }
 987         }
 988
 989         /*
 990          * This can only happen in the case of an error
 991          * because the loop above resets bp to NULL on each iteration
 992          * and on normal completion has not set a new value into it.
 993          * so it must have come from a 'break' statement
 994          */
 995         if (bp != NULL) {
 996                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
 997                    (LIST_FIRST(&bp->b_dep) == NULL)) {
 998                         bp->b_flags |= B_RELBUF;
 999                         brelse(bp);
1000                 } else {
1001                         bqrelse(bp);
1002                 }
1003         }
1004
1005         if ((error == 0 || uio->uio_resid != orig_resid) &&
1006             (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
1007                 VI_LOCK(vp);
1008                 ip->i_flag |= IN_ACCESS;
1009                 VI_UNLOCK(vp);
1010         }
1011         return (error);
1012 }
1013
1014 /*
1015  * Extended attribute area writing.
1016  */
1017 static int
1018 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1019 {
1020         struct inode *ip;
1021         struct ufs2_dinode *dp;
1022         struct fs *fs;
1023         struct buf *bp;
1024         ufs_lbn_t lbn;
1025         off_t osize;
1026         int blkoffset, error, flags, resid, size, xfersize;
1027
1028         ip = VTOI(vp);
1029         fs = ip->i_fs;
1030         dp = ip->i_din2;
1031
1032 #ifdef DIAGNOSTIC
1033         if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1034                 panic("ffs_extwrite: mode");
1035 #endif
1036
1037         if (ioflag & IO_APPEND)
1038                 uio->uio_offset = dp->di_extsize;
1039         KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
1040         KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
1041         if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
1042                 return (EFBIG);
1043
1044         resid = uio->uio_resid;
1045         osize = dp->di_extsize;
1046         flags = IO_EXT;
1047         if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
1048                 flags |= IO_SYNC;
1049
1050         for (error = 0; uio->uio_resid > 0;) {
1051                 lbn = lblkno(fs, uio->uio_offset);
1052                 blkoffset = blkoff(fs, uio->uio_offset);
1053                 xfersize = fs->fs_bsize - blkoffset;
1054                 if (uio->uio_resid < xfersize)
1055                         xfersize = uio->uio_resid;
1056
1057                 /*
1058                  * We must perform a read-before-write if the transfer size
1059                  * does not cover the entire buffer.
1060                  */
1061                 if (fs->fs_bsize > xfersize)
1062                         flags |= BA_CLRBUF;
1063                 else
1064                         flags &= ~BA_CLRBUF;
1065                 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1066                     ucred, flags, &bp);
1067                 if (error != 0)
1068                         break;
1069                 /*
1070                  * If the buffer is not valid we have to clear out any
1071                  * garbage data from the pages instantiated for the buffer.
1072                  * If we do not, a failed uiomove() during a write can leave
1073                  * the prior contents of the pages exposed to a userland
1074                  * mmap().  XXX deal with uiomove() errors a better way.
1075                  */
1076                 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1077                         vfs_bio_clrbuf(bp);
1078                 if (ioflag & IO_DIRECT)
1079                         bp->b_flags |= B_DIRECT;
1080
1081                 if (uio->uio_offset + xfersize > dp->di_extsize)
1082                         dp->di_extsize = uio->uio_offset + xfersize;
1083
1084                 size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1085                 if (size < xfersize)
1086                         xfersize = size;
1087
1088                 error =
1089                     uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1090                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1091                    (LIST_FIRST(&bp->b_dep) == NULL)) {
1092                         bp->b_flags |= B_RELBUF;
1093                 }
1094
1095                 /*
1096                  * If IO_SYNC each buffer is written synchronously.  Otherwise
1097                  * if we have a severe page deficiency write the buffer
1098                  * asynchronously.  Otherwise try to cluster, and if that
1099                  * doesn't do it then either do an async write (if O_DIRECT),
1100                  * or a delayed write (if not).
1101                  */
1102                 if (ioflag & IO_SYNC) {
1103                         (void)bwrite(bp);
1104                 } else if (vm_page_count_severe() ||
1105                             buf_dirty_count_severe() ||
1106                             xfersize + blkoffset == fs->fs_bsize ||
1107                             (ioflag & (IO_ASYNC | IO_DIRECT)))
1108                         bawrite(bp);
1109                 else
1110                         bdwrite(bp);
1111                 if (error || xfersize == 0)
1112                         break;
1113                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1114         }
1115         /*
1116          * If we successfully wrote any data, and we are not the superuser
1117          * we clear the setuid and setgid bits as a precaution against
1118          * tampering.
1119          */
1120         if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
1121                 if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID,
1122                     SUSER_ALLOWJAIL)) {
1123                         ip->i_mode &= ~(ISUID | ISGID);
1124                         dp->di_mode = ip->i_mode;
1125                 }
1126         }
1127         if (error) {
1128                 if (ioflag & IO_UNIT) {
1129                         (void)ffs_truncate(vp, osize,
1130                             IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
1131                         uio->uio_offset -= resid - uio->uio_resid;
1132                         uio->uio_resid = resid;
1133                 }
1134         } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1135                 error = ffs_update(vp, 1);
1136         return (error);
1137 }
1138
1139
/*
 * Locate a named extended attribute inside an in-memory EA area.
 *
 * The area (ptr:length) is walked as a sequence of records.  As read by
 * this routine, each record starts with a 32-bit total record length,
 * followed by one byte of namespace, one byte giving the amount of
 * padding after the content, one byte of name length, the name itself,
 * padding up to an 8-byte boundary, and then the content.
 *
 *   ptr, length  the EA area to search
 *   nspace       attribute namespace to match
 *   name         NUL-terminated attribute name to match
 *   eap          if not NULL, receives a pointer to the start of the record
 *   eac          if not NULL, receives a pointer to the record's content
 *
 * Returns the content length of the matching record, or -1 when no
 * record matches.  The walk stops as soon as a record length is found
 * that cannot be valid (shorter than the fixed header, or extending
 * beyond the area), so a damaged area can neither make the loop spin on
 * a zero-length record nor make it read outside the area.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name, u_char **eap, u_char **eac)
{
        u_char *p, *pe, *pn, *p0;
        int eapad1, eapad2, ealength, ealen, nlen;
        uint32_t ul;

        pe = ptr + length;
        nlen = strlen(name);

        for (p = ptr; p < pe; p = pn) {
                p0 = p;
                /* the length word and the three header bytes must be present */
                if ((size_t)(pe - p) < sizeof(ul) + 3)
                        break;
                bcopy(p, &ul, sizeof(ul));
                /*
                 * make sure this entry is complete; a length below the
                 * header size (zero in particular) would never advance
                 */
                if (ul < sizeof(ul) + 3 || ul > (size_t)(pe - p))
                        break;
                pn = p + ul;
                p += sizeof(uint32_t);
                if (*p != nspace)
                        continue;
                p++;
                eapad2 = *p++;
                if (*p != nlen)
                        continue;
                p++;
                /* the name has to lie inside this record */
                if (nlen > pn - p)
                        continue;
                if (bcmp(p, name, nlen))
                        continue;
                /* header plus name, rounded up to a multiple of 8 */
                ealength = sizeof(uint32_t) + 3 + nlen;
                eapad1 = 8 - (ealength % 8);
                if (eapad1 == 8)
                        eapad1 = 0;
                ealength += eapad1;
                ealen = ul - ealength - eapad2;
                p += nlen + eapad1;
                if (eap != NULL)
                        *eap = p0;
                if (eac != NULL)
                        *eac = p;
                return (ealen);
        }
        return(-1);
}
1188
1189 static int
1190 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
1191 {
1192         struct inode *ip;
1193         struct ufs2_dinode *dp;
1194         struct uio luio;
1195         struct iovec liovec;
1196         int easize, error;
1197         u_char *eae;
1198
1199         ip = VTOI(vp);
1200         dp = ip->i_din2;
1201         easize = dp->di_extsize;
1202
1203         eae = malloc(easize + extra, M_TEMP, M_WAITOK);
1204
1205         liovec.iov_base = eae;
1206         liovec.iov_len = easize;
1207         luio.uio_iov = &liovec;
1208         luio.uio_iovcnt = 1;
1209         luio.uio_offset = 0;
1210         luio.uio_resid = easize;
1211         luio.uio_segflg = UIO_SYSSPACE;
1212         luio.uio_rw = UIO_READ;
1213         luio.uio_td = td;
1214
1215         error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1216         if (error) {
1217                 free(eae, M_TEMP);
1218                 return(error);
1219         }
1220         *p = eae;
1221         return (0);
1222 }
1223
1224 static int
1225 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1226 {
1227         struct inode *ip;
1228         struct ufs2_dinode *dp;
1229         int error;
1230
1231         ip = VTOI(vp);
1232
1233         if (ip->i_ea_area != NULL)
1234                 return (EBUSY);
1235         dp = ip->i_din2;
1236         error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
1237         if (error)
1238                 return (error);
1239         ip->i_ea_len = dp->di_extsize;
1240         ip->i_ea_error = 0;
1241         return (0);
1242 }
1243
1244 /*
1245  * Vnode extattr transaction commit/abort
1246  */
1247 static int
1248 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1249 {
1250         struct inode *ip;
1251         struct uio luio;
1252         struct iovec liovec;
1253         int error;
1254         struct ufs2_dinode *dp;
1255
1256         ip = VTOI(vp);
1257         if (ip->i_ea_area == NULL)
1258                 return (EINVAL);
1259         dp = ip->i_din2;
1260         error = ip->i_ea_error;
1261         if (commit && error == 0) {
1262                 if (cred == NOCRED)
1263                         cred =  vp->v_mount->mnt_cred;
1264                 liovec.iov_base = ip->i_ea_area;
1265                 liovec.iov_len = ip->i_ea_len;
1266                 luio.uio_iov = &liovec;
1267                 luio.uio_iovcnt = 1;
1268                 luio.uio_offset = 0;
1269                 luio.uio_resid = ip->i_ea_len;
1270                 luio.uio_segflg = UIO_SYSSPACE;
1271                 luio.uio_rw = UIO_WRITE;
1272                 luio.uio_td = td;
1273                 /* XXX: I'm not happy about truncating to zero size */
1274                 if (ip->i_ea_len < dp->di_extsize)
1275                         error = ffs_truncate(vp, 0, IO_EXT, cred, td);
1276                 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1277         }
1278         free(ip->i_ea_area, M_TEMP);
1279         ip->i_ea_area = NULL;
1280         ip->i_ea_len = 0;
1281         ip->i_ea_error = 0;
1282         return (error);
1283 }
1284
1285 /*
1286  * Vnode extattr strategy routine for fifos.
1287  *
1288  * We need to check for a read or write of the external attributes.
1289  * Otherwise we just fall through and do the usual thing.
1290  */
1291 static int
1292 ffsext_strategy(struct vop_strategy_args *ap)
1293 /*
1294 struct vop_strategy_args {
1295         struct vnodeop_desc *a_desc;
1296         struct vnode *a_vp;
1297         struct buf *a_bp;
1298 };
1299 */
1300 {
1301         struct vnode *vp;
1302         daddr_t lbn;
1303
1304         vp = ap->a_vp;
1305         lbn = ap->a_bp->b_lblkno;
1306         if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
1307             lbn < 0 && lbn >= -NXADDR)
1308                 return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
1309         if (vp->v_type == VFIFO)
1310                 return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
1311         panic("spec nodes went here");
1312 }
1313
1314 /*
1315  * Vnode extattr transaction commit/abort
1316  */
1317 static int
1318 ffs_openextattr(struct vop_openextattr_args *ap)
1319 /*
1320 struct vop_openextattr_args {
1321         struct vnodeop_desc *a_desc;
1322         struct vnode *a_vp;
1323         IN struct ucred *a_cred;
1324         IN struct thread *a_td;
1325 };
1326 */
1327 {
1328         struct inode *ip;
1329         struct fs *fs;
1330
1331         ip = VTOI(ap->a_vp);
1332         fs = ip->i_fs;
1333
1334         if (ap->a_vp->v_type == VCHR)
1335                 return (EOPNOTSUPP);
1336
1337         return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1338 }
1339
1340
1341 /*
1342  * Vnode extattr transaction commit/abort
1343  */
1344 static int
1345 ffs_closeextattr(struct vop_closeextattr_args *ap)
1346 /*
1347 struct vop_closeextattr_args {
1348         struct vnodeop_desc *a_desc;
1349         struct vnode *a_vp;
1350         int a_commit;
1351         IN struct ucred *a_cred;
1352         IN struct thread *a_td;
1353 };
1354 */
1355 {
1356         struct inode *ip;
1357         struct fs *fs;
1358
1359         ip = VTOI(ap->a_vp);
1360         fs = ip->i_fs;
1361
1362         if (ap->a_vp->v_type == VCHR)
1363                 return (EOPNOTSUPP);
1364
1365         if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
1366                 return (EROFS);
1367
1368         return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
1369 }
1370
1371 /*
1372  * Vnode operation to remove a named attribute.
1373  */
1374 static int
1375 ffs_deleteextattr(struct vop_deleteextattr_args *ap)
1376 /*
1377 vop_deleteextattr {
1378         IN struct vnode *a_vp;
1379         IN int a_attrnamespace;
1380         IN const char *a_name;
1381         IN struct ucred *a_cred;
1382         IN struct thread *a_td;
1383 };
1384 */
1385 {
1386         struct inode *ip;
1387         struct fs *fs;
1388         uint32_t ealength, ul;
1389         int ealen, olen, eapad1, eapad2, error, i, easize;
1390         u_char *eae, *p;
1391         int stand_alone;
1392
1393         ip = VTOI(ap->a_vp);
1394         fs = ip->i_fs;
1395
1396         if (ap->a_vp->v_type == VCHR)
1397                 return (EOPNOTSUPP);
1398
1399         if (strlen(ap->a_name) == 0)
1400                 return (EINVAL);
1401
1402         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1403                 return (EROFS);
1404
1405         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1406             ap->a_cred, ap->a_td, IWRITE);
1407         if (error) {
1408                 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1409                         ip->i_ea_error = error;
1410                 return (error);
1411         }
1412
1413         if (ip->i_ea_area == NULL) {
1414                 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1415                 if (error)
1416                         return (error);
1417                 stand_alone = 1;
1418         } else {
1419                 stand_alone = 0;
1420         }
1421
1422         ealength = eapad1 = ealen = eapad2 = 0;
1423
1424         eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
1425         bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1426         easize = ip->i_ea_len;
1427
1428         olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1429             &p, NULL);
1430         if (olen == -1) {
1431                 /* delete but nonexistent */
1432                 free(eae, M_TEMP);
1433                 if (stand_alone)
1434                         ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1435                 return(ENOATTR);
1436         }
1437         bcopy(p, &ul, sizeof ul);
1438         i = p - eae + ul;
1439         if (ul != ealength) {
1440                 bcopy(p + ul, p + ealength, easize - i);
1441                 easize += (ealength - ul);
1442         }
1443         if (easize > NXADDR * fs->fs_bsize) {
1444                 free(eae, M_TEMP);
1445                 if (stand_alone)
1446                         ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1447                 else if (ip->i_ea_error == 0)
1448                         ip->i_ea_error = ENOSPC;
1449                 return(ENOSPC);
1450         }
1451         p = ip->i_ea_area;
1452         ip->i_ea_area = eae;
1453         ip->i_ea_len = easize;
1454         free(p, M_TEMP);
1455         if (stand_alone)
1456                 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1457         return(error);
1458 }
1459
1460 /*
1461  * Vnode operation to retrieve a named extended attribute.
1462  */
1463 static int
1464 ffs_getextattr(struct vop_getextattr_args *ap)
1465 /*
1466 vop_getextattr {
1467         IN struct vnode *a_vp;
1468         IN int a_attrnamespace;
1469         IN const char *a_name;
1470         INOUT struct uio *a_uio;
1471         OUT size_t *a_size;
1472         IN struct ucred *a_cred;
1473         IN struct thread *a_td;
1474 };
1475 */
1476 {
1477         struct inode *ip;
1478         struct fs *fs;
1479         u_char *eae, *p;
1480         unsigned easize;
1481         int error, ealen, stand_alone;
1482
1483         ip = VTOI(ap->a_vp);
1484         fs = ip->i_fs;
1485
1486         if (ap->a_vp->v_type == VCHR)
1487                 return (EOPNOTSUPP);
1488
1489         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1490             ap->a_cred, ap->a_td, IREAD);
1491         if (error)
1492                 return (error);
1493
1494         if (ip->i_ea_area == NULL) {
1495                 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1496                 if (error)
1497                         return (error);
1498                 stand_alone = 1;
1499         } else {
1500                 stand_alone = 0;
1501         }
1502         eae = ip->i_ea_area;
1503         easize = ip->i_ea_len;
1504
1505         ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1506             NULL, &p);
1507         if (ealen >= 0) {
1508                 error = 0;
1509                 if (ap->a_size != NULL)
1510                         *ap->a_size = ealen;
1511                 else if (ap->a_uio != NULL)
1512                         error = uiomove(p, ealen, ap->a_uio);
1513         } else
1514                 error = ENOATTR;
1515         if (stand_alone)
1516                 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1517         return(error);
1518 }
1519
1520 /*
1521  * Vnode operation to retrieve extended attributes on a vnode.
1522  */
1523 static int
1524 ffs_listextattr(struct vop_listextattr_args *ap)
1525 /*
1526 vop_listextattr {
1527         IN struct vnode *a_vp;
1528         IN int a_attrnamespace;
1529         INOUT struct uio *a_uio;
1530         OUT size_t *a_size;
1531         IN struct ucred *a_cred;
1532         IN struct thread *a_td;
1533 };
1534 */
1535 {
1536         struct inode *ip;
1537         struct fs *fs;
1538         u_char *eae, *p, *pe, *pn;
1539         unsigned easize;
1540         uint32_t ul;
1541         int error, ealen, stand_alone;
1542
1543         ip = VTOI(ap->a_vp);
1544         fs = ip->i_fs;
1545
1546         if (ap->a_vp->v_type == VCHR)
1547                 return (EOPNOTSUPP);
1548
1549         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1550             ap->a_cred, ap->a_td, IREAD);
1551         if (error)
1552                 return (error);
1553
1554         if (ip->i_ea_area == NULL) {
1555                 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1556                 if (error)
1557                         return (error);
1558                 stand_alone = 1;
1559         } else {
1560                 stand_alone = 0;
1561         }
1562         eae = ip->i_ea_area;
1563         easize = ip->i_ea_len;
1564
1565         error = 0;
1566         if (ap->a_size != NULL)
1567                 *ap->a_size = 0;
1568         pe = eae + easize;
1569         for(p = eae; error == 0 && p < pe; p = pn) {
1570                 bcopy(p, &ul, sizeof(ul));
1571                 pn = p + ul;
1572                 if (pn > pe)
1573                         break;
1574                 p += sizeof(ul);
1575                 if (*p++ != ap->a_attrnamespace)
1576                         continue;
1577                 p++;    /* pad2 */
1578                 ealen = *p;
1579                 if (ap->a_size != NULL) {
1580                         *ap->a_size += ealen + 1;
1581                 } else if (ap->a_uio != NULL) {
1582                         error = uiomove(p, ealen + 1, ap->a_uio);
1583                 }
1584         }
1585         if (stand_alone)
1586                 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1587         return(error);
1588 }
1589
1590 /*
1591  * Vnode operation to set a named attribute.
1592  */
1593 static int
1594 ffs_setextattr(struct vop_setextattr_args *ap)
1595 /*
1596 vop_setextattr {
1597         IN struct vnode *a_vp;
1598         IN int a_attrnamespace;
1599         IN const char *a_name;
1600         INOUT struct uio *a_uio;
1601         IN struct ucred *a_cred;
1602         IN struct thread *a_td;
1603 };
1604 */
1605 {
1606         struct inode *ip;
1607         struct fs *fs;
1608         uint32_t ealength, ul;
1609         int ealen, olen, eapad1, eapad2, error, i, easize;
1610         u_char *eae, *p;
1611         int stand_alone;
1612
1613         ip = VTOI(ap->a_vp);
1614         fs = ip->i_fs;
1615
1616         if (ap->a_vp->v_type == VCHR)
1617                 return (EOPNOTSUPP);
1618
1619         if (strlen(ap->a_name) == 0)
1620                 return (EINVAL);
1621
1622         /* XXX Now unsupported API to delete EAs using NULL uio. */
1623         if (ap->a_uio == NULL)
1624                 return (EOPNOTSUPP);
1625
1626         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1627                 return (EROFS);
1628
1629         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1630             ap->a_cred, ap->a_td, IWRITE);
1631         if (error) {
1632                 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1633                         ip->i_ea_error = error;
1634                 return (error);
1635         }
1636
1637         if (ip->i_ea_area == NULL) {
1638                 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1639                 if (error)
1640                         return (error);
1641                 stand_alone = 1;
1642         } else {
1643                 stand_alone = 0;
1644         }
1645
1646         ealen = ap->a_uio->uio_resid;
1647         ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1648         eapad1 = 8 - (ealength % 8);
1649         if (eapad1 == 8)
1650                 eapad1 = 0;
1651         eapad2 = 8 - (ealen % 8);
1652         if (eapad2 == 8)
1653                 eapad2 = 0;
1654         ealength += eapad1 + ealen + eapad2;
1655
1656         eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1657         bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1658         easize = ip->i_ea_len;
1659
1660         olen = ffs_findextattr(eae, easize,
1661             ap->a_attrnamespace, ap->a_name, &p, NULL);
1662         if (olen == -1) {
1663                 /* new, append at end */
1664                 p = eae + easize;
1665                 easize += ealength;
1666         } else {
1667                 bcopy(p, &ul, sizeof ul);
1668                 i = p - eae + ul;
1669                 if (ul != ealength) {
1670                         bcopy(p + ul, p + ealength, easize - i);
1671                         easize += (ealength - ul);
1672                 }
1673         }
1674         if (easize > NXADDR * fs->fs_bsize) {
1675                 free(eae, M_TEMP);
1676                 if (stand_alone)
1677                         ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1678                 else if (ip->i_ea_error == 0)
1679                         ip->i_ea_error = ENOSPC;
1680                 return(ENOSPC);
1681         }
1682         bcopy(&ealength, p, sizeof(ealength));
1683         p += sizeof(ealength);
1684         *p++ = ap->a_attrnamespace;
1685         *p++ = eapad2;
1686         *p++ = strlen(ap->a_name);
1687         strcpy(p, ap->a_name);
1688         p += strlen(ap->a_name);
1689         bzero(p, eapad1);
1690         p += eapad1;
1691         error = uiomove(p, ealen, ap->a_uio);
1692         if (error) {
1693                 free(eae, M_TEMP);
1694                 if (stand_alone)
1695                         ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1696                 else if (ip->i_ea_error == 0)
1697                         ip->i_ea_error = error;
1698                 return(error);
1699         }
1700         p += ealen;
1701         bzero(p, eapad2);
1702
1703         p = ip->i_ea_area;
1704         ip->i_ea_area = eae;
1705         ip->i_ea_len = easize;
1706         free(p, M_TEMP);
1707         if (stand_alone)
1708                 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1709         return(error);
1710 }
1711
1712 /*
1713  * Vnode pointer to File handle
1714  */
1715 static int
1716 ffs_vptofh(struct vop_vptofh_args *ap)
1717 /*
1718 vop_vptofh {
1719         IN struct vnode *a_vp;
1720         IN struct fid *a_fhp;
1721 };
1722 */
1723 {
1724         struct inode *ip;
1725         struct ufid *ufhp;
1726
1727         ip = VTOI(ap->a_vp);
1728         ufhp = (struct ufid *)ap->a_fhp;
1729         ufhp->ufid_len = sizeof(struct ufid);
1730         ufhp->ufid_ino = ip->i_number;
1731         ufhp->ufid_gen = ip->i_gen;
1732         return (0);
1733 }