sys/nfsclient/nfs_vnops.c

   1 /*-
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * This code is derived from software contributed to Berkeley by
   6  * Rick Macklem at The University of Guelph.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 4. Neither the name of the University nor the names of its contributors
  17  *    may be used to endorse or promote products derived from this software
  18  *    without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30  * SUCH DAMAGE.
  31  *
  32  *      @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95
  33  */
  34
  35 #include <sys/cdefs.h>
  36 __FBSDID("$FreeBSD$");
  37
  38 /*
  39  * vnode op calls for Sun NFS version 2 and 3
  40  */
  41
  42 #include "opt_inet.h"
  43
  44 #include <sys/param.h>
  45 #include <sys/kernel.h>
  46 #include <sys/systm.h>
  47 #include <sys/resourcevar.h>
  48 #include <sys/proc.h>
  49 #include <sys/mount.h>
  50 #include <sys/bio.h>
  51 #include <sys/buf.h>
  52 #include <sys/malloc.h>
  53 #include <sys/mbuf.h>
  54 #include <sys/namei.h>
  55 #include <sys/socket.h>
  56 #include <sys/vnode.h>
  57 #include <sys/dirent.h>
  58 #include <sys/fcntl.h>
  59 #include <sys/lockf.h>
  60 #include <sys/stat.h>
  61 #include <sys/sysctl.h>
  62 #include <sys/signalvar.h>
  63
  64 #include <vm/vm.h>
  65 #include <vm/vm_object.h>
  66 #include <vm/vm_extern.h>
  67 #include <vm/vm_object.h>
  68
  69 #include <fs/fifofs/fifo.h>
  70
  71 #include <rpc/rpcclnt.h>
  72
  73 #include <nfs/rpcv2.h>
  74 #include <nfs/nfsproto.h>
  75 #include <nfsclient/nfs.h>
  76 #include <nfsclient/nfsnode.h>
  77 #include <nfsclient/nfsmount.h>
  78 #include <nfsclient/nfs_lock.h>
  79 #include <nfs/xdr_subs.h>
  80 #include <nfsclient/nfsm_subs.h>
  81
  82 #include <net/if.h>
  83 #include <netinet/in.h>
  84 #include <netinet/in_var.h>
  85
  86 /* Defs: file-local boolean constants. */
  87 #define TRUE    1
  88 #define FALSE   0
  89
  90 /*
  91  * Ifdef for FreeBSD-current merged buffer cache. It is unfortunate that these
  92  * calls are not in getblk() and brelse() so that they would not be necessary
  93  * here.
      *
      * When B_VMIO is not defined, vfs_busy_pages() expands to nothing, so any
      * call to it in this file compiles away.
  94  */
  95 #ifndef B_VMIO
  96 #define vfs_busy_pages(bp, f)
  97 #endif
  98
     /*
      * Forward declarations for the file-local (static) vnode operations and
      * helpers.  The vop_*_t entries are the handlers installed in the
      * nfs_vnodeops and nfs_fifoops tables below; the remaining prototypes are
      * internal helpers.
      */
  99 static vop_read_t       nfsfifo_read;
 100 static vop_write_t      nfsfifo_write;
 101 static vop_close_t      nfsfifo_close;
 102 static int      nfs_flush(struct vnode *, int, struct thread *,
 103                     int);
 104 static int      nfs_setattrrpc(struct vnode *, struct vattr *, struct ucred *,
 105                     struct thread *);
 106 static vop_lookup_t     nfs_lookup;
 107 static vop_create_t     nfs_create;
 108 static vop_mknod_t      nfs_mknod;
 109 static vop_open_t       nfs_open;
 110 static vop_close_t      nfs_close;
 111 static vop_access_t     nfs_access;
 112 static vop_getattr_t    nfs_getattr;
 113 static vop_setattr_t    nfs_setattr;
 114 static vop_read_t       nfs_read;
 115 static vop_fsync_t      nfs_fsync;
 116 static vop_remove_t     nfs_remove;
 117 static vop_link_t       nfs_link;
 118 static vop_rename_t     nfs_rename;
 119 static vop_mkdir_t      nfs_mkdir;
 120 static vop_rmdir_t      nfs_rmdir;
 121 static vop_symlink_t    nfs_symlink;
 122 static vop_readdir_t    nfs_readdir;
 123 static vop_strategy_t   nfs_strategy;
 124 static  int     nfs_lookitup(struct vnode *, const char *, int,
 125                     struct ucred *, struct thread *, struct nfsnode **);
 126 static  int     nfs_sillyrename(struct vnode *, struct vnode *,
 127                     struct componentname *);
     /* Access check shared by the fifo table and the NFSv2 path of nfs_access(). */
 128 static vop_access_t     nfsspec_access;
 129 static vop_readlink_t   nfs_readlink;
 130 static vop_print_t      nfs_print;
 131 static vop_advlock_t    nfs_advlock;
 132
 133 /*
 134  * Global vfs data structures for nfs
      *
      * nfs_vnodeops is the vnode operation vector for NFS vnodes.  Any
      * operation not listed here is resolved through .vop_default
      * (default_vnodeops).  vop_lease is explicitly set to VOP_NULL.
      *
      * NOTE(review): nfs_getpages, nfs_putpages, nfs_inactive, nfs_reclaim and
      * nfs_write have no forward declaration in this file; presumably they are
      * declared in one of the nfsclient headers -- confirm.
 135  */
 136 struct vop_vector nfs_vnodeops = {
 137         .vop_default =          &default_vnodeops,
 138         .vop_access =           nfs_access,
 139         .vop_advlock =          nfs_advlock,
 140         .vop_close =            nfs_close,
 141         .vop_create =           nfs_create,
 142         .vop_fsync =            nfs_fsync,
 143         .vop_getattr =          nfs_getattr,
 144         .vop_getpages =         nfs_getpages,
 145         .vop_putpages =         nfs_putpages,
 146         .vop_inactive =         nfs_inactive,
 147         .vop_lease =            VOP_NULL,
 148         .vop_link =             nfs_link,
 149         .vop_lookup =           nfs_lookup,
 150         .vop_mkdir =            nfs_mkdir,
 151         .vop_mknod =            nfs_mknod,
 152         .vop_open =             nfs_open,
 153         .vop_print =            nfs_print,
 154         .vop_read =             nfs_read,
 155         .vop_readdir =          nfs_readdir,
 156         .vop_readlink =         nfs_readlink,
 157         .vop_reclaim =          nfs_reclaim,
 158         .vop_remove =           nfs_remove,
 159         .vop_rename =           nfs_rename,
 160         .vop_rmdir =            nfs_rmdir,
 161         .vop_setattr =          nfs_setattr,
 162         .vop_strategy =         nfs_strategy,
 163         .vop_symlink =          nfs_symlink,
 164         .vop_write =            nfs_write,
 165 };
 166
     /*
      * Vnode operation vector for fifos that live on an NFS mount.  Operations
      * not listed fall through to fifo_specops.  Read, write and close use the
      * nfsfifo_* wrappers; access uses nfsspec_access; the attribute, fsync,
      * inactive, reclaim and print handlers are shared with nfs_vnodeops.
      */
 167 struct vop_vector nfs_fifoops = {
 168         .vop_default =          &fifo_specops,
 169         .vop_access =           nfsspec_access,
 170         .vop_close =            nfsfifo_close,
 171         .vop_fsync =            nfs_fsync,
 172         .vop_getattr =          nfs_getattr,
 173         .vop_inactive =         nfs_inactive,
 174         .vop_print =            nfs_print,
 175         .vop_read =             nfsfifo_read,
 176         .vop_reclaim =          nfs_reclaim,
 177         .vop_setattr =          nfs_setattr,
 178         .vop_write =            nfsfifo_write,
 179 };
 180
     /*
      * Prototypes for file-local helpers.  Judging by their names the *rpc
      * routines issue the corresponding over-the-wire request and
      * nfs_renameit() serves the sillyrename path (it takes a
      * struct sillyrename) -- their bodies are outside this view; confirm
      * against the definitions.
      */
 181 static int      nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp,
 182                              struct componentname *cnp, struct vattr *vap);
 183 static int      nfs_removerpc(struct vnode *dvp, const char *name, int namelen,
 184                               struct ucred *cred, struct thread *td);
 185 static int      nfs_renamerpc(struct vnode *fdvp, const char *fnameptr,
 186                               int fnamelen, struct vnode *tdvp,
 187                               const char *tnameptr, int tnamelen,
 188                               struct ucred *cred, struct thread *td);
 189 static int      nfs_renameit(struct vnode *sdvp, struct componentname *scnp,
 190                              struct sillyrename *sp);
 191
 192 /*
 193  * Global variables
      *
      * nfs_iod_mtx is the global lock protecting shared nfsiod state (see the
      * SMP locking note below).  nfs_iodwant[] and nfs_iodmount[] hold one slot
      * per possible async daemon (NFS_MAXASYNCDAEMON entries each);
      * nfs_numasync starts at 0.
      *
      * DIRHDSIZ is the size of a struct dirent without its MAXNAMLEN + 1 byte
      * name buffer, i.e. the fixed-size header of a directory entry.
 194  */
 195 struct mtx      nfs_iod_mtx;
 196 struct proc     *nfs_iodwant[NFS_MAXASYNCDAEMON];
 197 struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON];
 198 int              nfs_numasync = 0;
 199 #define DIRHDSIZ        (sizeof (struct dirent) - (MAXNAMLEN + 1))
 200
     /* Tunables exported under the vfs.nfs sysctl node. */
 201 SYSCTL_DECL(_vfs_nfs);
 202
     /*
      * Lifetime of a cached ACCESS result: nfs_access() treats the cached
      * mode as valid while time_second < n_modestamp + this value.  When it
      * is <= 0, nfs_access() requests only the bits it needs instead of a
      * blanket ACCESS, and nfs_getattr() skips its ACCESS piggyback.
      */
 203 static int      nfsaccess_cache_timeout = NFS_MAXATTRTIMO;
 204 SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW,
 205            &nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout");
 206
     /*
      * When non-zero, nfs_close() passes 1 as the commit argument of
      * nfs_flush() for NFSv3 files; otherwise it passes 0.
      */
 207 static int      nfsv3_commit_on_close = 0;
 208 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfsv3_commit_on_close, CTLFLAG_RW,
 209            &nfsv3_commit_on_close, 0, "write+commit on close, else only write");
 210
     /*
      * When non-zero, nfs_close() cleans the dirty pages of a regular file's
      * VM object before examining NMODIFIED.
      */
 211 static int      nfs_clean_pages_on_close = 1;
 212 SYSCTL_INT(_vfs_nfs, OID_AUTO, clean_pages_on_close, CTLFLAG_RW,
 213            &nfs_clean_pages_on_close, 0, "NFS clean dirty pages on close");
 214
     /*
      * Gates the O_DIRECT bookkeeping (n_directio_opens, NNONCACHE) done in
      * nfs_open() and nfs_close().  Not static, so it may be referenced from
      * other files.
      */
 215 int nfs_directio_enable = 0;
 216 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_enable, CTLFLAG_RW,
 217            &nfs_directio_enable, 0, "Enable NFS directio");
 218
 219 /*
 220  * This sysctl allows other processes to mmap a file that has been opened
 221  * O_DIRECT by a process.  In general, having processes mmap the file while
 222  * Direct IO is in progress can lead to Data Inconsistencies.  But, we allow
 223  * this by default to prevent DoS attacks - to prevent a malicious user from
 224  * opening up files O_DIRECT preventing other users from mmap'ing these
 225  * files.  "Protected" environments where stricter consistency guarantees are
 226  * required can disable this knob.  The process that opened the file O_DIRECT
 227  * cannot mmap() the file, because mmap'ed IO on an O_DIRECT open() is not
 228  * meaningful.
 229  */
 230 int nfs_directio_allow_mmap = 1;
 231 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_allow_mmap, CTLFLAG_RW,
 232            &nfs_directio_allow_mmap, 0, "Enable mmaped IO on file with O_DIRECT opens");
 233
     /*
      * Disabled: read-only sysctls for the ACCESS cache hit/miss counters.
      * The counters themselves are still updated in nfs_access() and
      * nfs_getattr().
      */
 234 #if 0
 235 SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_hits, CTLFLAG_RD,
 236            &nfsstats.accesscache_hits, 0, "NFS ACCESS cache hit count");
 237
 238 SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_misses, CTLFLAG_RD,
 239            &nfsstats.accesscache_misses, 0, "NFS ACCESS cache miss count");
 240 #endif
 241
     /*
      * Union of every NFSv3 ACCESS bit used in this file; nfs_getattr() passes
      * it to nfs3_access_otw() as a blanket ACCESS request.
      */
 242 #define NFSV3ACCESS_ALL (NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY          \
 243                          | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE     \
 244                          | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP)
 245
 246 /*
 247  * SMP Locking Note :
 248  * The list of locks after the description of the lock is the ordering
 249  * of other locks acquired with the lock held.
 250  * np->n_mtx : Protects the fields in the nfsnode.
 251        VM Object Lock
 252        VI_MTX (acquired indirectly)
 253  * nmp->nm_mtx : Protects the fields in the nfsmount.
 254        rep->r_mtx
 255  * nfs_iod_mtx : Global lock, protects shared nfsiod state.
 256  * nfs_reqq_mtx : Global lock, protects the nfs_reqq list.
 257        nmp->nm_mtx
 258        rep->r_mtx
 259  * rep->r_mtx : Protects the fields in an nfsreq.
 260  */
 261
 262 static int
 263 nfs3_access_otw(struct vnode *vp, int wmode, struct thread *td,
 264     struct ucred *cred)
 265 {
 266         const int v3 = 1;
 267         u_int32_t *tl;
 268         int error = 0, attrflag;
 269
 270         struct mbuf *mreq, *mrep, *md, *mb;
 271         caddr_t bpos, dpos;
 272         u_int32_t rmode;
 273         struct nfsnode *np = VTONFS(vp);
 274
 275         nfsstats.rpccnt[NFSPROC_ACCESS]++;
 276         mreq = nfsm_reqhead(vp, NFSPROC_ACCESS, NFSX_FH(v3) + NFSX_UNSIGNED);
 277         mb = mreq;
 278         bpos = mtod(mb, caddr_t);
 279         nfsm_fhtom(vp, v3);
 280         tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
 281         *tl = txdr_unsigned(wmode);
 282         nfsm_request(vp, NFSPROC_ACCESS, td, cred);
 283         nfsm_postop_attr(vp, attrflag);
 284         if (!error) {
 285                 tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
 286                 rmode = fxdr_unsigned(u_int32_t, *tl);
 287                 mtx_lock(&np->n_mtx);
 288                 np->n_mode = rmode;
 289                 np->n_modeuid = cred->cr_uid;
 290                 np->n_modestamp = time_second;
 291                 mtx_unlock(&np->n_mtx);
 292         }
 293         m_freem(mrep);
 294 nfsmout:
 295         return (error);
 296 }
 297
 298 /*
 299  * nfs access vnode op.
 300  * For nfs version 2, just return ok. File accesses may fail later.
 301  * For nfs version 3, use the access rpc to check accessibility. If file modes
 302  * are changed on the server, accesses might still fail later.
 303  */
 304 static int
 305 nfs_access(struct vop_access_args *ap)
 306 {
 307         struct vnode *vp = ap->a_vp;
 308         int error = 0;
 309         u_int32_t mode, wmode;
 310         int v3 = NFS_ISV3(vp);
 311         struct nfsnode *np = VTONFS(vp);
 312
 313         /*
 314          * Disallow write attempts on filesystems mounted read-only;
 315          * unless the file is a socket, fifo, or a block or character
 316          * device resident on the filesystem.
 317          */
 318         if ((ap->a_mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
 319                 switch (vp->v_type) {
 320                 case VREG:
 321                 case VDIR:
 322                 case VLNK:
 323                         return (EROFS);
 324                 default:
 325                         break;
 326                 }
 327         }
 328         /*
 329          * For nfs v3, check to see if we have done this recently, and if
 330          * so return our cached result instead of making an ACCESS call.
 331          * If not, do an access rpc, otherwise you are stuck emulating
 332          * ufs_access() locally using the vattr. This may not be correct,
 333          * since the server may apply other access criteria such as
 334          * client uid-->server uid mapping that we do not know about.
 335          */
 336         if (v3) {
 337                 if (ap->a_mode & VREAD)
 338                         mode = NFSV3ACCESS_READ;
 339                 else
 340                         mode = 0;
 341                 if (vp->v_type != VDIR) {
 342                         if (ap->a_mode & VWRITE)
 343                                 mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND);
 344                         if (ap->a_mode & VEXEC)
 345                                 mode |= NFSV3ACCESS_EXECUTE;
 346                 } else {
 347                         if (ap->a_mode & VWRITE)
 348                                 mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND |
 349                                          NFSV3ACCESS_DELETE);
 350                         if (ap->a_mode & VEXEC)
 351                                 mode |= NFSV3ACCESS_LOOKUP;
 352                 }
 353                 /* XXX safety belt, only make blanket request if caching */
 354                 if (nfsaccess_cache_timeout > 0) {
 355                         wmode = NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY |
 356                                 NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE |
 357                                 NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP;
 358                 } else {
 359                         wmode = mode;
 360                 }
 361
 362                 /*
 363                  * Does our cached result allow us to give a definite yes to
 364                  * this request?
 365                  */
 366                 mtx_lock(&np->n_mtx);
 367                 if ((time_second < (np->n_modestamp + nfsaccess_cache_timeout)) &&
 368                     (ap->a_cred->cr_uid == np->n_modeuid) &&
 369                     ((np->n_mode & mode) == mode)) {
 370                         nfsstats.accesscache_hits++;
 371                 } else {
 372                         /*
 373                          * Either a no, or a don't know.  Go to the wire.
 374                          */
 375                         nfsstats.accesscache_misses++;
 376                         mtx_unlock(&np->n_mtx);
 377                         error = nfs3_access_otw(vp, wmode, ap->a_td,ap->a_cred);
 378                         mtx_lock(&np->n_mtx);
 379                         if (!error) {
 380                                 if ((np->n_mode & mode) != mode) {
 381                                         error = EACCES;
 382                                 }
 383                         }
 384                 }
 385                 mtx_unlock(&np->n_mtx);
 386                 return (error);
 387         } else {
 388                 if ((error = nfsspec_access(ap)) != 0) {
 389                         return (error);
 390                 }
 391                 /*
 392                  * Attempt to prevent a mapped root from accessing a file
 393                  * which it shouldn't.  We try to read a byte from the file
 394                  * if the user is root and the file is not zero length.
 395                  * After calling nfsspec_access, we should have the correct
 396                  * file size cached.
 397                  */
 398                 mtx_lock(&np->n_mtx);
 399                 if (ap->a_cred->cr_uid == 0 && (ap->a_mode & VREAD)
 400                     && VTONFS(vp)->n_size > 0) {
 401                         struct iovec aiov;
 402                         struct uio auio;
 403                         char buf[1];
 404
 405                         mtx_unlock(&np->n_mtx);
 406                         aiov.iov_base = buf;
 407                         aiov.iov_len = 1;
 408                         auio.uio_iov = &aiov;
 409                         auio.uio_iovcnt = 1;
 410                         auio.uio_offset = 0;
 411                         auio.uio_resid = 1;
 412                         auio.uio_segflg = UIO_SYSSPACE;
 413                         auio.uio_rw = UIO_READ;
 414                         auio.uio_td = ap->a_td;
 415
 416                         if (vp->v_type == VREG)
 417                                 error = nfs_readrpc(vp, &auio, ap->a_cred);
 418                         else if (vp->v_type == VDIR) {
 419                                 char* bp;
 420                                 bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
 421                                 aiov.iov_base = bp;
 422                                 aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
 423                                 error = nfs_readdirrpc(vp, &auio, ap->a_cred);
 424                                 free(bp, M_TEMP);
 425                         } else if (vp->v_type == VLNK)
 426                                 error = nfs_readlinkrpc(vp, &auio, ap->a_cred);
 427                         else
 428                                 error = EACCES;
 429                 } else
 430                         mtx_unlock(&np->n_mtx);
 431                 return (error);
 432         }
 433 }
 434
     /*
      * NOTE(review): not referenced anywhere in the visible part of this file;
      * presumably a counter or knob used by other NFS client code -- confirm
      * before removing or making it static.
      */
 435 int nfs_otw_getattr_avoid = 0;
 436
 437 /*
 438  * nfs open vnode op
 439  * Check to see if the type is ok
 440  * and that deletion is not in progress.
 441  * For paged in text files, you will need to flush the page cache
 442  * if consistency is lost.
 443  */
 444 /* ARGSUSED */
 445 static int
 446 nfs_open(struct vop_open_args *ap)
 447 {
 448         struct vnode *vp = ap->a_vp;
 449         struct nfsnode *np = VTONFS(vp);
 450         struct vattr vattr;
 451         int error;
 452         int fmode = ap->a_mode;
 453
 454         if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK)
 455                 return (EOPNOTSUPP);
 456
 457         /*
 458          * Get a valid lease. If cached data is stale, flush it.
 459          */
 460         mtx_lock(&np->n_mtx);
 461         if (np->n_flag & NMODIFIED) {
 462                 mtx_unlock(&np->n_mtx);
 463                 error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 464                 if (error == EINTR || error == EIO)
 465                         return (error);
 466                 np->n_attrstamp = 0;
 467                 if (vp->v_type == VDIR)
 468                         np->n_direofoffset = 0;
 469                 error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);
 470                 if (error)
 471                         return (error);
 472                 mtx_lock(&np->n_mtx);
 473                 np->n_mtime = vattr.va_mtime;
 474                 mtx_unlock(&np->n_mtx);
 475         } else {
 476                 struct thread *td = curthread;
 477
 478                 if (np->n_ac_ts_syscalls != td->td_syscalls ||
 479                     np->n_ac_ts_tid != td->td_tid ||
 480                     td->td_proc == NULL ||
 481                     np->n_ac_ts_pid != td->td_proc->p_pid) {
 482                         np->n_attrstamp = 0;
 483                 }
 484                 mtx_unlock(&np->n_mtx);
 485                 error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);
 486                 if (error)
 487                         return (error);
 488                 mtx_lock(&np->n_mtx);
 489                 if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
 490                         if (vp->v_type == VDIR)
 491                                 np->n_direofoffset = 0;
 492                         mtx_unlock(&np->n_mtx);
 493                         error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 494                         if (error == EINTR || error == EIO) {
 495                                 return (error);
 496                         }
 497                         mtx_lock(&np->n_mtx);
 498                         np->n_mtime = vattr.va_mtime;
 499                 }
 500                 mtx_unlock(&np->n_mtx);
 501         }
 502         /*
 503          * If the object has >= 1 O_DIRECT active opens, we disable caching.
 504          */
 505         if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
 506                 if (np->n_directio_opens == 0) {
 507                         error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 508                         if (error)
 509                                 return (error);
 510                         mtx_lock(&np->n_mtx);
 511                         np->n_flag |= NNONCACHE;
 512                         mtx_unlock(&np->n_mtx);
 513                 }
 514                 np->n_directio_opens++;
 515         }
 516         vnode_create_vobject(vp, vattr.va_size, ap->a_td);
 517         return (0);
 518 }
 519
 520 /*
 521  * nfs close vnode op
 522  * What an NFS client should do upon close after writing is a debatable issue.
 523  * Most NFS clients push delayed writes to the server upon close, basically for
 524  * two reasons:
 525  * 1 - So that any write errors may be reported back to the client process
 526  *     doing the close system call. By far the two most likely errors are
 527  *     NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure.
 528  * 2 - To put a worst case upper bound on cache inconsistency between
 529  *     multiple clients for the file.
 530  * There is also a consistency problem for Version 2 of the protocol w.r.t.
 531  * not being able to tell if other clients are writing a file concurrently,
 532  * since there is no way of knowing if the changed modify time in the reply
 533  * is only due to the write for this client.
 534  * (NFS Version 3 provides weak cache consistency data in the reply that
 535  *  should be sufficient to detect and handle this case.)
 536  *
 537  * The current code does the following:
 538  * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers
 539  * for NFS Version 3 - flush dirty buffers to the server but don't invalidate
 540  *                     or commit them (this satisfies 1 and 2 except for the
 541  *                     case where the server crashes after this close but
 542  *                     before the commit RPC, which is felt to be "good
 543  *                     enough". Changing the last argument to nfs_flush() to
 544  *                     a 1 would force a commit operation, if it is felt a
 545  *                     commit is necessary now.
 546  */
 547 /* ARGSUSED */
 548 static int
 549 nfs_close(struct vop_close_args *ap)
 550 {
 551         struct vnode *vp = ap->a_vp;
 552         struct nfsnode *np = VTONFS(vp);
 553         int error = 0;
 554         int fmode = ap->a_fflag;
 555
 556         if (vp->v_type == VREG) {
 557             /*
 558              * Examine and clean dirty pages, regardless of NMODIFIED.
 559              * This closes a major hole in close-to-open consistency.
 560              * We want to push out all dirty pages (and buffers) on
 561              * close, regardless of whether they were dirtied by
 562              * mmap'ed writes or via write().
 563              */
 564             if (nfs_clean_pages_on_close && vp->v_object) {
 565                 VM_OBJECT_LOCK(vp->v_object);
 566                 vm_object_page_clean(vp->v_object, 0, 0, 0);
 567                 VM_OBJECT_UNLOCK(vp->v_object);
 568             }
 569             mtx_lock(&np->n_mtx);
 570             if (np->n_flag & NMODIFIED) {
 571                 mtx_unlock(&np->n_mtx);
 572                 if (NFS_ISV3(vp)) {
 573                     /*
 574                      * Under NFSv3 we have dirty buffers to dispose of.  We
 575                      * must flush them to the NFS server.  We have the option
 576                      * of waiting all the way through the commit rpc or just
 577                      * waiting for the initial write.  The default is to only
 578                      * wait through the initial write so the data is in the
 579                      * server's cache, which is roughly similar to the state
 580                      * a standard disk subsystem leaves the file in on close().
 581                      *
 582                      * We cannot clear the NMODIFIED bit in np->n_flag due to
 583                      * potential races with other processes, and certainly
 584                      * cannot clear it if we don't commit.
 585                      */
 586                     int cm = nfsv3_commit_on_close ? 1 : 0;
 587                     error = nfs_flush(vp, MNT_WAIT, ap->a_td, cm);
 588                     /* np->n_flag &= ~NMODIFIED; */
 589                 } else
 590                     error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 591                 mtx_lock(&np->n_mtx);
 592             }
 593             /*
 594              * Invalidate the attribute cache in all cases.
 595              * An open is going to fetch fresh attrs any way, other procs
 596              * on this node that have file open will be forced to do an
 597              * otw attr fetch, but this is safe.
 598              */
 599             np->n_attrstamp = 0;
 600             if (np->n_flag & NWRITEERR) {
 601                 np->n_flag &= ~NWRITEERR;
 602                 error = np->n_error;
 603             }
 604             mtx_unlock(&np->n_mtx);
 605         }
 606         if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
 607                 mtx_lock(&np->n_mtx);
 608                 KASSERT((np->n_directio_opens > 0),
 609                         ("nfs_close: unexpectedly value (0) of n_directio_opens\n"));
 610                 np->n_directio_opens--;
 611                 if (np->n_directio_opens == 0)
 612                         np->n_flag &= ~NNONCACHE;
 613                 mtx_unlock(&np->n_mtx);
 614         }
 615         return (error);
 616 }
 617
 618 /*
 619  * nfs getattr call from vfs.
 620  */
 621 static int
 622 nfs_getattr(struct vop_getattr_args *ap)
 623 {
 624         struct vnode *vp = ap->a_vp;
 625         struct nfsnode *np = VTONFS(vp);
 626         caddr_t bpos, dpos;
 627         int error = 0;
 628         struct mbuf *mreq, *mrep, *md, *mb;
 629         int v3 = NFS_ISV3(vp);
 630
 631         /*
 632          * Update local times for special files.
 633          */
 634         mtx_lock(&np->n_mtx);
 635         if (np->n_flag & (NACC | NUPD))
 636                 np->n_flag |= NCHG;
 637         mtx_unlock(&np->n_mtx);
 638         /*
 639          * First look in the cache.
 640          */
 641         if (nfs_getattrcache(vp, ap->a_vap) == 0)
 642                 goto nfsmout;
 643         if (v3 && nfsaccess_cache_timeout > 0) {
 644                 nfsstats.accesscache_misses++;
 645                 nfs3_access_otw(vp, NFSV3ACCESS_ALL, ap->a_td, ap->a_cred);
 646                 if (nfs_getattrcache(vp, ap->a_vap) == 0)
 647                         goto nfsmout;
 648         }
 649         nfsstats.rpccnt[NFSPROC_GETATTR]++;
 650         mreq = nfsm_reqhead(vp, NFSPROC_GETATTR, NFSX_FH(v3));
 651         mb = mreq;
 652         bpos = mtod(mb, caddr_t);
 653         nfsm_fhtom(vp, v3);
 654         nfsm_request(vp, NFSPROC_GETATTR, ap->a_td, ap->a_cred);
 655         if (!error) {
 656                 nfsm_loadattr(vp, ap->a_vap);
 657         }
 658         m_freem(mrep);
 659 nfsmout:
 660         return (error);
 661 }
 662
 663 /*
 664  * nfs setattr call.
 665  */
 666 static int
 667 nfs_setattr(struct vop_setattr_args *ap)
 668 {
 669         struct vnode *vp = ap->a_vp;
 670         struct nfsnode *np = VTONFS(vp);
 671         struct vattr *vap = ap->a_vap;
 672         int error = 0;
 673         u_quad_t tsize;
 674
 675 #ifndef nolint
 676         tsize = (u_quad_t)0;
 677 #endif
 678
 679         /*
 680          * Setting of flags and marking of atimes are not supported.
 681          */
 682         if (vap->va_flags != VNOVAL || (vap->va_vaflags & VA_MARK_ATIME))
 683                 return (EOPNOTSUPP);
 684
 685         /*
 686          * Disallow write attempts if the filesystem is mounted read-only.
 687          */
 688         if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
 689             vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
 690             vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
 691             (vp->v_mount->mnt_flag & MNT_RDONLY)) {
 692                 error = EROFS;
 693                 goto out;
 694         }
 695         if (vap->va_size != VNOVAL) {
 696                 switch (vp->v_type) {
 697                 case VDIR:
 698                         return (EISDIR);
 699                 case VCHR:
 700                 case VBLK:
 701                 case VSOCK:
 702                 case VFIFO:
 703                         if (vap->va_mtime.tv_sec == VNOVAL &&
 704                             vap->va_atime.tv_sec == VNOVAL &&
 705                             vap->va_mode == (mode_t)VNOVAL &&
 706                             vap->va_uid == (uid_t)VNOVAL &&
 707                             vap->va_gid == (gid_t)VNOVAL)
 708                                 return (0);
 709                         vap->va_size = VNOVAL;
 710                         break;
 711                 default:
 712                         /*
 713                          * Disallow write attempts if the filesystem is
 714                          * mounted read-only.
 715                          */
 716                         if (vp->v_mount->mnt_flag & MNT_RDONLY)
 717                                 return (EROFS);
 718                         /*
 719                          *  We run vnode_pager_setsize() early (why?),
 720                          * we must set np->n_size now to avoid vinvalbuf
 721                          * V_SAVE races that might setsize a lower
 722                          * value.
 723                          */
 724                         mtx_lock(&np->n_mtx);
 725                         tsize = np->n_size;
 726                         mtx_unlock(&np->n_mtx);
 727                         error = nfs_meta_setsize(vp, ap->a_cred,
 728                                                  ap->a_td, vap->va_size);
 729                         mtx_lock(&np->n_mtx);
 730                         if (np->n_flag & NMODIFIED) {
 731                             tsize = np->n_size;
 732                             mtx_unlock(&np->n_mtx);
 733                             if (vap->va_size == 0)
 734                                 error = nfs_vinvalbuf(vp, 0, ap->a_td, 1);
 735                             else
 736                                 error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 737                             if (error) {
 738                                 vnode_pager_setsize(vp, tsize);
 739                                 goto out;
 740                             }
 741                         } else
 742                             mtx_unlock(&np->n_mtx);
 743                         /*
 744                          * np->n_size has already been set to vap->va_size
 745                          * in nfs_meta_setsize(). We must set it again since
 746                          * nfs_loadattrcache() could be called through
 747                          * nfs_meta_setsize() and could modify np->n_size.
 748                          */
 749                         mtx_lock(&np->n_mtx);
 750                         np->n_vattr.va_size = np->n_size = vap->va_size;
 751                         mtx_unlock(&np->n_mtx);
 752                 };
 753         } else {
 754                 mtx_lock(&np->n_mtx);
 755                 if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) &&
 756                     (np->n_flag & NMODIFIED) && vp->v_type == VREG) {
 757                         mtx_unlock(&np->n_mtx);
 758                         if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1)) != 0 &&
 759                             (error == EINTR || error == EIO))
 760                                 return error;
 761                 } else
 762                         mtx_unlock(&np->n_mtx);
 763         }
 764         error = nfs_setattrrpc(vp, vap, ap->a_cred, ap->a_td);
 765         if (error && vap->va_size != VNOVAL) {
 766                 mtx_lock(&np->n_mtx);
 767                 np->n_size = np->n_vattr.va_size = tsize;
 768                 vnode_pager_setsize(vp, tsize);
 769                 mtx_unlock(&np->n_mtx);
 770         }
 771 out:
 772         return (error);
 773 }
 774
 775 /*
 776  * Do an nfs setattr rpc.
 777  */
 778 static int
 779 nfs_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred,
 780     struct thread *td)
 781 {
 782         struct nfsv2_sattr *sp;
 783         struct nfsnode *np = VTONFS(vp);
 784         caddr_t bpos, dpos;
 785         u_int32_t *tl;
 786         int error = 0, wccflag = NFSV3_WCCRATTR;
 787         struct mbuf *mreq, *mrep, *md, *mb;
 788         int v3 = NFS_ISV3(vp);
 789
 790         nfsstats.rpccnt[NFSPROC_SETATTR]++;
 791         mreq = nfsm_reqhead(vp, NFSPROC_SETATTR, NFSX_FH(v3) + NFSX_SATTR(v3));
 792         mb = mreq;
 793         bpos = mtod(mb, caddr_t);
 794         nfsm_fhtom(vp, v3);
 795         if (v3) {
 796                 nfsm_v3attrbuild(vap, TRUE);
 797                 tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
 798                 *tl = nfs_false;
 799         } else {
 800                 sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
 801                 if (vap->va_mode == (mode_t)VNOVAL)
 802                         sp->sa_mode = nfs_xdrneg1;
 803                 else
 804                         sp->sa_mode = vtonfsv2_mode(vp->v_type, vap->va_mode);
 805                 if (vap->va_uid == (uid_t)VNOVAL)
 806                         sp->sa_uid = nfs_xdrneg1;
 807                 else
 808                         sp->sa_uid = txdr_unsigned(vap->va_uid);
 809                 if (vap->va_gid == (gid_t)VNOVAL)
 810                         sp->sa_gid = nfs_xdrneg1;
 811                 else
 812                         sp->sa_gid = txdr_unsigned(vap->va_gid);
 813                 sp->sa_size = txdr_unsigned(vap->va_size);
 814                 txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
 815                 txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
 816         }
 817         nfsm_request(vp, NFSPROC_SETATTR, td, cred);
 818         if (v3) {
 819                 np->n_modestamp = 0;
 820                 nfsm_wcc_data(vp, wccflag);
 821         } else
 822                 nfsm_loadattr(vp, NULL);
 823         m_freem(mrep);
 824 nfsmout:
 825         return (error);
 826 }
 827
 828 /*
 829  * nfs lookup call, one step at a time...
 830  * First look in cache
 831  * If not found, unlock the directory nfsnode and do the rpc
 832  */
 833 static int
 834 nfs_lookup(struct vop_lookup_args *ap)
 835 {
 836         struct componentname *cnp = ap->a_cnp;
 837         struct vnode *dvp = ap->a_dvp;
 838         struct vnode **vpp = ap->a_vpp;
 839         int flags = cnp->cn_flags;
 840         struct vnode *newvp;
 841         struct nfsmount *nmp;
 842         caddr_t bpos, dpos;
 843         struct mbuf *mreq, *mrep, *md, *mb;
 844         long len;
 845         nfsfh_t *fhp;
 846         struct nfsnode *np;
 847         int error = 0, attrflag, fhsize;
 848         int v3 = NFS_ISV3(dvp);
 849         struct thread *td = cnp->cn_thread;
 850
 851         *vpp = NULLVP;
 852         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 853             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 854                 return (EROFS);
 855         if (dvp->v_type != VDIR)
 856                 return (ENOTDIR);
 857         nmp = VFSTONFS(dvp->v_mount);
 858         np = VTONFS(dvp);
 859         if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0) {
 860                 *vpp = NULLVP;
 861                 return (error);
 862         }
 863         if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) {
 864                 struct vattr vattr;
 865
 866                 newvp = *vpp;
 867                 if (!VOP_GETATTR(newvp, &vattr, cnp->cn_cred, td)
 868                  && vattr.va_ctime.tv_sec == VTONFS(newvp)->n_ctime) {
 869                      nfsstats.lookupcache_hits++;
 870                      if (cnp->cn_nameiop != LOOKUP &&
 871                          (flags & ISLASTCN))
 872                              cnp->cn_flags |= SAVENAME;
 873                      return (0);
 874                 }
 875                 cache_purge(newvp);
 876                 if (dvp != newvp)
 877                         vput(newvp);
 878                 else
 879                         vrele(newvp);
 880                 *vpp = NULLVP;
 881         }
 882         error = 0;
 883         newvp = NULLVP;
 884         nfsstats.lookupcache_misses++;
 885         nfsstats.rpccnt[NFSPROC_LOOKUP]++;
 886         len = cnp->cn_namelen;
 887         mreq = nfsm_reqhead(dvp, NFSPROC_LOOKUP,
 888                 NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len));
 889         mb = mreq;
 890         bpos = mtod(mb, caddr_t);
 891         nfsm_fhtom(dvp, v3);
 892         nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN);
 893         nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_thread, cnp->cn_cred);
 894         if (error) {
 895                 if (v3) {
 896                         nfsm_postop_attr(dvp, attrflag);
 897                         m_freem(mrep);
 898                 }
 899                 goto nfsmout;
 900         }
 901         nfsm_getfh(fhp, fhsize, v3);
 902
 903         /*
 904          * Handle RENAME case...
 905          */
 906         if (cnp->cn_nameiop == RENAME && (flags & ISLASTCN)) {
 907                 if (NFS_CMPFH(np, fhp, fhsize)) {
 908                         m_freem(mrep);
 909                         return (EISDIR);
 910                 }
 911                 error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE);
 912                 if (error) {
 913                         m_freem(mrep);
 914                         return (error);
 915                 }
 916                 newvp = NFSTOV(np);
 917                 if (v3) {
 918                         nfsm_postop_attr(newvp, attrflag);
 919                         nfsm_postop_attr(dvp, attrflag);
 920                 } else
 921                         nfsm_loadattr(newvp, NULL);
 922                 *vpp = newvp;
 923                 m_freem(mrep);
 924                 cnp->cn_flags |= SAVENAME;
 925                 return (0);
 926         }
 927
 928         if (flags & ISDOTDOT) {
 929                 VOP_UNLOCK(dvp, 0, td);
 930                 error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, cnp->cn_lkflags);
 931                 vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
 932                 if (error)
 933                         return (error);
 934                 newvp = NFSTOV(np);
 935         } else if (NFS_CMPFH(np, fhp, fhsize)) {
 936                 VREF(dvp);
 937                 newvp = dvp;
 938         } else {
 939                 error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, cnp->cn_lkflags);
 940                 if (error) {
 941                         m_freem(mrep);
 942                         return (error);
 943                 }
 944                 newvp = NFSTOV(np);
 945         }
 946         if (v3) {
 947                 nfsm_postop_attr(newvp, attrflag);
 948                 nfsm_postop_attr(dvp, attrflag);
 949         } else
 950                 nfsm_loadattr(newvp, NULL);
 951         if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN))
 952                 cnp->cn_flags |= SAVENAME;
 953         if ((cnp->cn_flags & MAKEENTRY) &&
 954             (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN))) {
 955                 np->n_ctime = np->n_vattr.va_ctime.tv_sec;
 956                 cache_enter(dvp, newvp, cnp);
 957         }
 958         *vpp = newvp;
 959         m_freem(mrep);
 960 nfsmout:
 961         if (error) {
 962                 if (newvp != NULLVP) {
 963                         vput(newvp);
 964                         *vpp = NULLVP;
 965                 }
 966                 if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) &&
 967                     (flags & ISLASTCN) && error == ENOENT) {
 968                         if (dvp->v_mount->mnt_flag & MNT_RDONLY)
 969                                 error = EROFS;
 970                         else
 971                                 error = EJUSTRETURN;
 972                 }
 973                 if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN))
 974                         cnp->cn_flags |= SAVENAME;
 975         }
 976         return (error);
 977 }
 978
 979 /*
 980  * nfs read call.
 981  * Just call nfs_bioread() to do the work.
 982  */
 983 static int
 984 nfs_read(struct vop_read_args *ap)
 985 {
 986         struct vnode *vp = ap->a_vp;
 987
 988         switch (vp->v_type) {
 989         case VREG:
 990                 return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
 991         case VDIR:
 992                 return (EISDIR);
 993         default:
 994                 return (EOPNOTSUPP);
 995         }
 996 }
 997
 998 /*
 999  * nfs readlink call
1000  */
1001 static int
1002 nfs_readlink(struct vop_readlink_args *ap)
1003 {
1004         struct vnode *vp = ap->a_vp;
1005
1006         if (vp->v_type != VLNK)
1007                 return (EINVAL);
1008         return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred));
1009 }
1010
1011 /*
1012  * Do a readlink rpc.
1013  * Called by nfs_doio() from below the buffer cache.
1014  */
1015 int
1016 nfs_readlinkrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
1017 {
1018         caddr_t bpos, dpos;
1019         int error = 0, len, attrflag;
1020         struct mbuf *mreq, *mrep, *md, *mb;
1021         int v3 = NFS_ISV3(vp);
1022
1023         nfsstats.rpccnt[NFSPROC_READLINK]++;
1024         mreq = nfsm_reqhead(vp, NFSPROC_READLINK, NFSX_FH(v3));
1025         mb = mreq;
1026         bpos = mtod(mb, caddr_t);
1027         nfsm_fhtom(vp, v3);
1028         nfsm_request(vp, NFSPROC_READLINK, uiop->uio_td, cred);
1029         if (v3)
1030                 nfsm_postop_attr(vp, attrflag);
1031         if (!error) {
1032                 nfsm_strsiz(len, NFS_MAXPATHLEN);
1033                 if (len == NFS_MAXPATHLEN) {
1034                         struct nfsnode *np = VTONFS(vp);
1035                         mtx_lock(&np->n_mtx);
1036                         if (np->n_size && np->n_size < NFS_MAXPATHLEN)
1037                                 len = np->n_size;
1038                         mtx_unlock(&np->n_mtx);
1039                 }
1040                 nfsm_mtouio(uiop, len);
1041         }
1042         m_freem(mrep);
1043 nfsmout:
1044         return (error);
1045 }
1046
1047 /*
1048  * nfs read rpc call
1049  * Ditto above
1050  */
1051 int
1052 nfs_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
1053 {
1054         u_int32_t *tl;
1055         caddr_t bpos, dpos;
1056         struct mbuf *mreq, *mrep, *md, *mb;
1057         struct nfsmount *nmp;
1058         int error = 0, len, retlen, tsiz, eof, attrflag;
1059         int v3 = NFS_ISV3(vp);
1060         int rsize;
1061
1062 #ifndef nolint
1063         eof = 0;
1064 #endif
1065         nmp = VFSTONFS(vp->v_mount);
1066         tsiz = uiop->uio_resid;
1067         mtx_lock(&nmp->nm_mtx);
1068         if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) {
1069                 mtx_unlock(&nmp->nm_mtx);
1070                 return (EFBIG);
1071         }
1072         rsize = nmp->nm_rsize;
1073         mtx_unlock(&nmp->nm_mtx);
1074         while (tsiz > 0) {
1075                 nfsstats.rpccnt[NFSPROC_READ]++;
1076                 len = (tsiz > rsize) ? rsize : tsiz;
1077                 mreq = nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH(v3) + NFSX_UNSIGNED * 3);
1078                 mb = mreq;
1079                 bpos = mtod(mb, caddr_t);
1080                 nfsm_fhtom(vp, v3);
1081                 tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED * 3);
1082                 if (v3) {
1083                         txdr_hyper(uiop->uio_offset, tl);
1084                         *(tl + 2) = txdr_unsigned(len);
1085                 } else {
1086                         *tl++ = txdr_unsigned(uiop->uio_offset);
1087                         *tl++ = txdr_unsigned(len);
1088                         *tl = 0;
1089                 }
1090                 nfsm_request(vp, NFSPROC_READ, uiop->uio_td, cred);
1091                 if (v3) {
1092                         nfsm_postop_attr(vp, attrflag);
1093                         if (error) {
1094                                 m_freem(mrep);
1095                                 goto nfsmout;
1096                         }
1097                         tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED);
1098                         eof = fxdr_unsigned(int, *(tl + 1));
1099                 } else {
1100                         nfsm_loadattr(vp, NULL);
1101                 }
1102                 nfsm_strsiz(retlen, rsize);
1103                 nfsm_mtouio(uiop, retlen);
1104                 m_freem(mrep);
1105                 tsiz -= retlen;
1106                 if (v3) {
1107                         if (eof || retlen == 0) {
1108                                 tsiz = 0;
1109                         }
1110                 } else if (retlen < len) {
1111                         tsiz = 0;
1112                 }
1113         }
1114 nfsmout:
1115         return (error);
1116 }
1117
1118 /*
1119  * nfs write call
1120  */
1121 int
1122 nfs_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
1123              int *iomode, int *must_commit)
1124 {
1125         u_int32_t *tl;
1126         int32_t backup;
1127         caddr_t bpos, dpos;
1128         struct mbuf *mreq, *mrep, *md, *mb;
1129         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1130         int error = 0, len, tsiz, wccflag = NFSV3_WCCRATTR, rlen, commit;
1131         int v3 = NFS_ISV3(vp), committed = NFSV3WRITE_FILESYNC;
1132         int wsize;
1133
1134 #ifndef DIAGNOSTIC
1135         if (uiop->uio_iovcnt != 1)
1136                 panic("nfs: writerpc iovcnt > 1");
1137 #endif
1138         *must_commit = 0;
1139         tsiz = uiop->uio_resid;
1140         mtx_lock(&nmp->nm_mtx);
1141         if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) {
1142                 mtx_unlock(&nmp->nm_mtx);
1143                 return (EFBIG);
1144         }
1145         wsize = nmp->nm_wsize;
1146         mtx_unlock(&nmp->nm_mtx);
1147         while (tsiz > 0) {
1148                 nfsstats.rpccnt[NFSPROC_WRITE]++;
1149                 len = (tsiz > wsize) ? wsize : tsiz;
1150                 mreq = nfsm_reqhead(vp, NFSPROC_WRITE,
1151                         NFSX_FH(v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len));
1152                 mb = mreq;
1153                 bpos = mtod(mb, caddr_t);
1154                 nfsm_fhtom(vp, v3);
1155                 if (v3) {
1156                         tl = nfsm_build(u_int32_t *, 5 * NFSX_UNSIGNED);
1157                         txdr_hyper(uiop->uio_offset, tl);
1158                         tl += 2;
1159                         *tl++ = txdr_unsigned(len);
1160                         *tl++ = txdr_unsigned(*iomode);
1161                         *tl = txdr_unsigned(len);
1162                 } else {
1163                         u_int32_t x;
1164
1165                         tl = nfsm_build(u_int32_t *, 4 * NFSX_UNSIGNED);
1166                         /* Set both "begin" and "current" to non-garbage. */
1167                         x = txdr_unsigned((u_int32_t)uiop->uio_offset);
1168                         *tl++ = x;      /* "begin offset" */
1169                         *tl++ = x;      /* "current offset" */
1170                         x = txdr_unsigned(len);
1171                         *tl++ = x;      /* total to this offset */
1172                         *tl = x;        /* size of this write */
1173                 }
1174                 nfsm_uiotom(uiop, len);
1175                 nfsm_request(vp, NFSPROC_WRITE, uiop->uio_td, cred);
1176                 if (v3) {
1177                         wccflag = NFSV3_WCCCHK;
1178                         nfsm_wcc_data(vp, wccflag);
1179                         if (!error) {
1180                                 tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED
1181                                         + NFSX_V3WRITEVERF);
1182                                 rlen = fxdr_unsigned(int, *tl++);
1183                                 if (rlen == 0) {
1184                                         error = NFSERR_IO;
1185                                         m_freem(mrep);
1186                                         break;
1187                                 } else if (rlen < len) {
1188                                         backup = len - rlen;
1189                                         uiop->uio_iov->iov_base =
1190                                             (char *)uiop->uio_iov->iov_base -
1191                                             backup;
1192                                         uiop->uio_iov->iov_len += backup;
1193                                         uiop->uio_offset -= backup;
1194                                         uiop->uio_resid += backup;
1195                                         len = rlen;
1196                                 }
1197                                 commit = fxdr_unsigned(int, *tl++);
1198
1199                                 /*
1200                                  * Return the lowest committment level
1201                                  * obtained by any of the RPCs.
1202                                  */
1203                                 if (committed == NFSV3WRITE_FILESYNC)
1204                                         committed = commit;
1205                                 else if (committed == NFSV3WRITE_DATASYNC &&
1206                                         commit == NFSV3WRITE_UNSTABLE)
1207                                         committed = commit;
1208                                 mtx_lock(&nmp->nm_mtx);
1209                                 if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){
1210                                     bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
1211                                         NFSX_V3WRITEVERF);
1212                                     nmp->nm_state |= NFSSTA_HASWRITEVERF;
1213                                 } else if (bcmp((caddr_t)tl,
1214                                     (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF)) {
1215                                     *must_commit = 1;
1216                                     bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
1217                                         NFSX_V3WRITEVERF);
1218                                 }
1219                                 mtx_unlock(&nmp->nm_mtx);
1220                         }
1221                 } else {
1222                         nfsm_loadattr(vp, NULL);
1223                 }
1224                 if (wccflag) {
1225                         mtx_lock(&(VTONFS(vp))->n_mtx);
1226                         VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime;
1227                         mtx_unlock(&(VTONFS(vp))->n_mtx);
1228                 }
1229                 m_freem(mrep);
1230                 if (error)
1231                         break;
1232                 tsiz -= len;
1233         }
1234 nfsmout:
1235         if (vp->v_mount->mnt_kern_flag & MNTK_ASYNC)
1236                 committed = NFSV3WRITE_FILESYNC;
1237         *iomode = committed;
1238         if (error)
1239                 uiop->uio_resid = tsiz;
1240         return (error);
1241 }
1242
1243 /*
1244  * nfs mknod rpc
1245  * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the
1246  * mode set to specify the file type and the size field for rdev.
1247  */
1248 static int
1249 nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1250     struct vattr *vap)
1251 {
1252         struct nfsv2_sattr *sp;
1253         u_int32_t *tl;
1254         struct vnode *newvp = NULL;
1255         struct nfsnode *np = NULL;
1256         struct vattr vattr;
1257         caddr_t bpos, dpos;
1258         int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0;
1259         struct mbuf *mreq, *mrep, *md, *mb;
1260         u_int32_t rdev;
1261         int v3 = NFS_ISV3(dvp);
1262
1263         if (vap->va_type == VCHR || vap->va_type == VBLK)
1264                 rdev = txdr_unsigned(vap->va_rdev);
1265         else if (vap->va_type == VFIFO || vap->va_type == VSOCK)
1266                 rdev = nfs_xdrneg1;
1267         else {
1268                 return (EOPNOTSUPP);
1269         }
1270         if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread)) != 0) {
1271                 return (error);
1272         }
1273         nfsstats.rpccnt[NFSPROC_MKNOD]++;
1274         mreq = nfsm_reqhead(dvp, NFSPROC_MKNOD, NFSX_FH(v3) + 4 * NFSX_UNSIGNED +
1275                 + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3));
1276         mb = mreq;
1277         bpos = mtod(mb, caddr_t);
1278         nfsm_fhtom(dvp, v3);
1279         nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
1280         if (v3) {
1281                 tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
1282                 *tl++ = vtonfsv3_type(vap->va_type);
1283                 nfsm_v3attrbuild(vap, FALSE);
1284                 if (vap->va_type == VCHR || vap->va_type == VBLK) {
1285                         tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
1286                         *tl++ = txdr_unsigned(umajor(vap->va_rdev));
1287                         *tl = txdr_unsigned(uminor(vap->va_rdev));
1288                 }
1289         } else {
1290                 sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
1291                 sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode);
1292                 sp->sa_uid = nfs_xdrneg1;
1293                 sp->sa_gid = nfs_xdrneg1;
1294                 sp->sa_size = rdev;
1295                 txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
1296                 txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
1297         }
1298         nfsm_request(dvp, NFSPROC_MKNOD, cnp->cn_thread, cnp->cn_cred);
1299         if (!error) {
1300                 nfsm_mtofh(dvp, newvp, v3, gotvp);
1301                 if (!gotvp) {
1302                         if (newvp) {
1303                                 vput(newvp);
1304                                 newvp = NULL;
1305                         }
1306                         error = nfs_lookitup(dvp, cnp->cn_nameptr,
1307                             cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np);
1308                         if (!error)
1309                                 newvp = NFSTOV(np);
1310                 }
1311         }
1312         if (v3)
1313                 nfsm_wcc_data(dvp, wccflag);
1314         m_freem(mrep);
1315 nfsmout:
1316         if (error) {
1317                 if (newvp)
1318                         vput(newvp);
1319         } else {
1320                 if (cnp->cn_flags & MAKEENTRY)
1321                         cache_enter(dvp, newvp, cnp);
1322                 *vpp = newvp;
1323         }
1324         mtx_lock(&(VTONFS(dvp))->n_mtx);
1325         VTONFS(dvp)->n_flag |= NMODIFIED;
1326         if (!wccflag)
1327                 VTONFS(dvp)->n_attrstamp = 0;
1328         mtx_unlock(&(VTONFS(dvp))->n_mtx);
1329         return (error);
1330 }
1331
1332 /*
1333  * nfs mknod vop
1334  * just call nfs_mknodrpc() to do the work.
1335  */
1336 /* ARGSUSED */
1337 static int
1338 nfs_mknod(struct vop_mknod_args *ap)
1339 {
1340         return (nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap));
1341 }
1342
1343 static u_long create_verf;
1344 /*
1345  * nfs file create call
1346  */
1347 static int
1348 nfs_create(struct vop_create_args *ap)
1349 {
1350         struct vnode *dvp = ap->a_dvp;
1351         struct vattr *vap = ap->a_vap;
1352         struct componentname *cnp = ap->a_cnp;
1353         struct nfsv2_sattr *sp;
1354         u_int32_t *tl;
1355         struct nfsnode *np = NULL;
1356         struct vnode *newvp = NULL;
1357         caddr_t bpos, dpos;
1358         int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0, fmode = 0;
1359         struct mbuf *mreq, *mrep, *md, *mb;
1360         struct vattr vattr;
1361         int v3 = NFS_ISV3(dvp);
1362
1363         /*
1364          * Oops, not for me..
1365          */
1366         if (vap->va_type == VSOCK)
1367                 return (nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap));
1368
1369         if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread)) != 0) {
1370                 return (error);
1371         }
1372         if (vap->va_vaflags & VA_EXCLUSIVE)
1373                 fmode |= O_EXCL;
1374 again:
1375         nfsstats.rpccnt[NFSPROC_CREATE]++;
1376         mreq = nfsm_reqhead(dvp, NFSPROC_CREATE, NFSX_FH(v3) + 2 * NFSX_UNSIGNED +
1377                 nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3));
1378         mb = mreq;
1379         bpos = mtod(mb, caddr_t);
1380         nfsm_fhtom(dvp, v3);
1381         nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
1382         if (v3) {
1383                 tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
1384                 if (fmode & O_EXCL) {
1385                         *tl = txdr_unsigned(NFSV3CREATE_EXCLUSIVE);
1386                         tl = nfsm_build(u_int32_t *, NFSX_V3CREATEVERF);
1387 #ifdef INET
1388                         if (!TAILQ_EMPTY(&in_ifaddrhead))
1389                                 *tl++ = IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr.s_addr;
1390                         else
1391 #endif
1392                                 *tl++ = create_verf;
1393                         *tl = ++create_verf;
1394                 } else {
1395                         *tl = txdr_unsigned(NFSV3CREATE_UNCHECKED);
1396                         nfsm_v3attrbuild(vap, FALSE);
1397                 }
1398         } else {
1399                 sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
1400                 sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode);
1401                 sp->sa_uid = nfs_xdrneg1;
1402                 sp->sa_gid = nfs_xdrneg1;
1403                 sp->sa_size = 0;
1404                 txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
1405                 txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
1406         }
1407         nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_thread, cnp->cn_cred);
1408         if (!error) {
1409                 nfsm_mtofh(dvp, newvp, v3, gotvp);
1410                 if (!gotvp) {
1411                         if (newvp) {
1412                                 vput(newvp);
1413                                 newvp = NULL;
1414                         }
1415                         error = nfs_lookitup(dvp, cnp->cn_nameptr,
1416                             cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np);
1417                         if (!error)
1418                                 newvp = NFSTOV(np);
1419                 }
1420         }
1421         if (v3)
1422                 nfsm_wcc_data(dvp, wccflag);
1423         m_freem(mrep);
1424 nfsmout:
1425         if (error) {
1426                 if (v3 && (fmode & O_EXCL) && error == NFSERR_NOTSUPP) {
1427                         fmode &= ~O_EXCL;
1428                         goto again;
1429                 }
1430                 if (newvp)
1431                         vput(newvp);
1432         } else if (v3 && (fmode & O_EXCL)) {
1433                 /*
1434                  * We are normally called with only a partially initialized
1435                  * VAP.  Since the NFSv3 spec says that server may use the
1436                  * file attributes to store the verifier, the spec requires
1437                  * us to do a SETATTR RPC. FreeBSD servers store the verifier
1438                  * in atime, but we can't really assume that all servers will
1439                  * so we ensure that our SETATTR sets both atime and mtime.
1440                  */
1441                 if (vap->va_mtime.tv_sec == VNOVAL)
1442                         vfs_timestamp(&vap->va_mtime);
1443                 if (vap->va_atime.tv_sec == VNOVAL)
1444                         vap->va_atime = vap->va_mtime;
1445                 error = nfs_setattrrpc(newvp, vap, cnp->cn_cred, cnp->cn_thread);
1446                 if (error)
1447                         vput(newvp);
1448         }
1449         if (!error) {
1450                 if (cnp->cn_flags & MAKEENTRY)
1451                         cache_enter(dvp, newvp, cnp);
1452                 *ap->a_vpp = newvp;
1453         }
1454         mtx_lock(&(VTONFS(dvp))->n_mtx);
1455         VTONFS(dvp)->n_flag |= NMODIFIED;
1456         if (!wccflag)
1457                 VTONFS(dvp)->n_attrstamp = 0;
1458         mtx_unlock(&(VTONFS(dvp))->n_mtx);
1459         return (error);
1460 }
1461
1462 /*
1463  * nfs file remove call
1464  * To try and make nfs semantics closer to ufs semantics, a file that has
1465  * other processes using the vnode is renamed instead of removed and then
1466  * removed later on the last close.
1467  * - If v_usecount > 1
1468  *        If a rename is not already in the works
1469  *           call nfs_sillyrename() to set it up
1470  *     else
1471  *        do the remove rpc
1472  */
1473 static int
1474 nfs_remove(struct vop_remove_args *ap)
1475 {
1476         struct vnode *vp = ap->a_vp;
1477         struct vnode *dvp = ap->a_dvp;
1478         struct componentname *cnp = ap->a_cnp;
1479         struct nfsnode *np = VTONFS(vp);
1480         int error = 0;
1481         struct vattr vattr;
1482
1483 #ifndef DIAGNOSTIC
1484         if ((cnp->cn_flags & HASBUF) == 0)
1485                 panic("nfs_remove: no name");
1486         if (vrefcnt(vp) < 1)
1487                 panic("nfs_remove: bad v_usecount");
1488 #endif
1489         if (vp->v_type == VDIR)
1490                 error = EPERM;
1491         else if (vrefcnt(vp) == 1 || (np->n_sillyrename &&
1492             VOP_GETATTR(vp, &vattr, cnp->cn_cred, cnp->cn_thread) == 0 &&
1493             vattr.va_nlink > 1)) {
1494                 /*
1495                  * Purge the name cache so that the chance of a lookup for
1496                  * the name succeeding while the remove is in progress is
1497                  * minimized. Without node locking it can still happen, such
1498                  * that an I/O op returns ESTALE, but since you get this if
1499                  * another host removes the file..
1500                  */
1501                 cache_purge(vp);
1502                 /*
1503                  * throw away biocache buffers, mainly to avoid
1504                  * unnecessary delayed writes later.
1505                  */
1506                 error = nfs_vinvalbuf(vp, 0, cnp->cn_thread, 1);
1507                 /* Do the rpc */
1508                 if (error != EINTR && error != EIO)
1509                         error = nfs_removerpc(dvp, cnp->cn_nameptr,
1510                                 cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread);
1511                 /*
1512                  * Kludge City: If the first reply to the remove rpc is lost..
1513                  *   the reply to the retransmitted request will be ENOENT
1514                  *   since the file was in fact removed
1515                  *   Therefore, we cheat and return success.
1516                  */
1517                 if (error == ENOENT)
1518                         error = 0;
1519         } else if (!np->n_sillyrename)
1520                 error = nfs_sillyrename(dvp, vp, cnp);
1521         np->n_attrstamp = 0;
1522         return (error);
1523 }
1524
1525 /*
1526  * nfs file remove rpc called from nfs_inactive
1527  */
1528 int
1529 nfs_removeit(struct sillyrename *sp)
1530 {
1531         /*
1532          * Make sure that the directory vnode is still valid.
1533          * XXX we should lock sp->s_dvp here.
1534          */
1535         if (sp->s_dvp->v_type == VBAD)
1536                 return (0);
1537         return (nfs_removerpc(sp->s_dvp, sp->s_name, sp->s_namlen, sp->s_cred,
1538                 NULL));
1539 }
1540
1541 /*
1542  * Nfs remove rpc, called from nfs_remove() and nfs_removeit().
1543  */
1544 static int
1545 nfs_removerpc(struct vnode *dvp, const char *name, int namelen,
1546     struct ucred *cred, struct thread *td)
1547 {
1548         caddr_t bpos, dpos;
1549         int error = 0, wccflag = NFSV3_WCCRATTR;
1550         struct mbuf *mreq, *mrep, *md, *mb;
1551         int v3 = NFS_ISV3(dvp);
1552
1553         nfsstats.rpccnt[NFSPROC_REMOVE]++;
1554         mreq = nfsm_reqhead(dvp, NFSPROC_REMOVE,
1555                 NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(namelen));
1556         mb = mreq;
1557         bpos = mtod(mb, caddr_t);
1558         nfsm_fhtom(dvp, v3);
1559         nfsm_strtom(name, namelen, NFS_MAXNAMLEN);
1560         nfsm_request(dvp, NFSPROC_REMOVE, td, cred);
1561         if (v3)
1562                 nfsm_wcc_data(dvp, wccflag);
1563         m_freem(mrep);
1564 nfsmout:
1565         mtx_lock(&(VTONFS(dvp))->n_mtx);
1566         VTONFS(dvp)->n_flag |= NMODIFIED;
1567         if (!wccflag)
1568                 VTONFS(dvp)->n_attrstamp = 0;
1569         mtx_unlock(&(VTONFS(dvp))->n_mtx);
1570         return (error);
1571 }
1572
1573 /*
1574  * nfs file rename call
1575  */
1576 static int
1577 nfs_rename(struct vop_rename_args *ap)
1578 {
1579         struct vnode *fvp = ap->a_fvp;
1580         struct vnode *tvp = ap->a_tvp;
1581         struct vnode *fdvp = ap->a_fdvp;
1582         struct vnode *tdvp = ap->a_tdvp;
1583         struct componentname *tcnp = ap->a_tcnp;
1584         struct componentname *fcnp = ap->a_fcnp;
1585         int error;
1586
1587 #ifndef DIAGNOSTIC
1588         if ((tcnp->cn_flags & HASBUF) == 0 ||
1589             (fcnp->cn_flags & HASBUF) == 0)
1590                 panic("nfs_rename: no name");
1591 #endif
1592         /* Check for cross-device rename */
1593         if ((fvp->v_mount != tdvp->v_mount) ||
1594             (tvp && (fvp->v_mount != tvp->v_mount))) {
1595                 error = EXDEV;
1596                 goto out;
1597         }
1598
1599         if (fvp == tvp) {
1600                 nfs_printf("nfs_rename: fvp == tvp (can't happen)\n");
1601                 error = 0;
1602                 goto out;
1603         }
1604         if ((error = vn_lock(fvp, LK_EXCLUSIVE, fcnp->cn_thread)) != 0)
1605                 goto out;
1606
1607         /*
1608          * We have to flush B_DELWRI data prior to renaming
1609          * the file.  If we don't, the delayed-write buffers
1610          * can be flushed out later after the file has gone stale
1611          * under NFSV3.  NFSV2 does not have this problem because
1612          * ( as far as I can tell ) it flushes dirty buffers more
1613          * often.
1614          *
1615          * Skip the rename operation if the fsync fails, this can happen
1616          * due to the server's volume being full, when we pushed out data
1617          * that was written back to our cache earlier. Not checking for
1618          * this condition can result in potential (silent) data loss.
1619          */
1620         error = VOP_FSYNC(fvp, MNT_WAIT, fcnp->cn_thread);
1621         VOP_UNLOCK(fvp, 0, fcnp->cn_thread);
1622         if (!error && tvp)
1623                 error = VOP_FSYNC(tvp, MNT_WAIT, tcnp->cn_thread);
1624         if (error)
1625                 goto out;
1626
1627         /*
1628          * If the tvp exists and is in use, sillyrename it before doing the
1629          * rename of the new file over it.
1630          * XXX Can't sillyrename a directory.
1631          */
1632         if (tvp && vrefcnt(tvp) > 1 && !VTONFS(tvp)->n_sillyrename &&
1633                 tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) {
1634                 vput(tvp);
1635                 tvp = NULL;
1636         }
1637
1638         error = nfs_renamerpc(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen,
1639                 tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred,
1640                 tcnp->cn_thread);
1641
1642         if (fvp->v_type == VDIR) {
1643                 if (tvp != NULL && tvp->v_type == VDIR)
1644                         cache_purge(tdvp);
1645                 cache_purge(fdvp);
1646         }
1647
1648 out:
1649         if (tdvp == tvp)
1650                 vrele(tdvp);
1651         else
1652                 vput(tdvp);
1653         if (tvp)
1654                 vput(tvp);
1655         vrele(fdvp);
1656         vrele(fvp);
1657         /*
1658          * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
1659          */
1660         if (error == ENOENT)
1661                 error = 0;
1662         return (error);
1663 }
1664
1665 /*
1666  * nfs file rename rpc called from nfs_remove() above
1667  */
1668 static int
1669 nfs_renameit(struct vnode *sdvp, struct componentname *scnp,
1670     struct sillyrename *sp)
1671 {
1672
1673         return (nfs_renamerpc(sdvp, scnp->cn_nameptr, scnp->cn_namelen, sdvp,
1674             sp->s_name, sp->s_namlen, scnp->cn_cred, scnp->cn_thread));
1675 }
1676
1677 /*
1678  * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit().
1679  */
1680 static int
1681 nfs_renamerpc(struct vnode *fdvp, const char *fnameptr, int fnamelen,
1682     struct vnode *tdvp, const char *tnameptr, int tnamelen, struct ucred *cred,
1683     struct thread *td)
1684 {
1685         caddr_t bpos, dpos;
1686         int error = 0, fwccflag = NFSV3_WCCRATTR, twccflag = NFSV3_WCCRATTR;
1687         struct mbuf *mreq, *mrep, *md, *mb;
1688         int v3 = NFS_ISV3(fdvp);
1689
1690         nfsstats.rpccnt[NFSPROC_RENAME]++;
1691         mreq = nfsm_reqhead(fdvp, NFSPROC_RENAME,
1692                 (NFSX_FH(v3) + NFSX_UNSIGNED)*2 + nfsm_rndup(fnamelen) +
1693                 nfsm_rndup(tnamelen));
1694         mb = mreq;
1695         bpos = mtod(mb, caddr_t);
1696         nfsm_fhtom(fdvp, v3);
1697         nfsm_strtom(fnameptr, fnamelen, NFS_MAXNAMLEN);
1698         nfsm_fhtom(tdvp, v3);
1699         nfsm_strtom(tnameptr, tnamelen, NFS_MAXNAMLEN);
1700         nfsm_request(fdvp, NFSPROC_RENAME, td, cred);
1701         if (v3) {
1702                 nfsm_wcc_data(fdvp, fwccflag);
1703                 nfsm_wcc_data(tdvp, twccflag);
1704         }
1705         m_freem(mrep);
1706 nfsmout:
1707         mtx_lock(&(VTONFS(fdvp))->n_mtx);
1708         VTONFS(fdvp)->n_flag |= NMODIFIED;
1709         mtx_unlock(&(VTONFS(fdvp))->n_mtx);
1710         mtx_lock(&(VTONFS(tdvp))->n_mtx);
1711         VTONFS(tdvp)->n_flag |= NMODIFIED;
1712         mtx_unlock(&(VTONFS(tdvp))->n_mtx);
1713         if (!fwccflag)
1714                 VTONFS(fdvp)->n_attrstamp = 0;
1715         if (!twccflag)
1716                 VTONFS(tdvp)->n_attrstamp = 0;
1717         return (error);
1718 }
1719
1720 /*
1721  * nfs hard link create call
1722  */
1723 static int
1724 nfs_link(struct vop_link_args *ap)
1725 {
1726         struct vnode *vp = ap->a_vp;
1727         struct vnode *tdvp = ap->a_tdvp;
1728         struct componentname *cnp = ap->a_cnp;
1729         caddr_t bpos, dpos;
1730         int error = 0, wccflag = NFSV3_WCCRATTR, attrflag = 0;
1731         struct mbuf *mreq, *mrep, *md, *mb;
1732         int v3;
1733
1734         if (vp->v_mount != tdvp->v_mount) {
1735                 return (EXDEV);
1736         }
1737
1738         /*
1739          * Push all writes to the server, so that the attribute cache
1740          * doesn't get "out of sync" with the server.
1741          * XXX There should be a better way!
1742          */
1743         VOP_FSYNC(vp, MNT_WAIT, cnp->cn_thread);
1744
1745         v3 = NFS_ISV3(vp);
1746         nfsstats.rpccnt[NFSPROC_LINK]++;
1747         mreq = nfsm_reqhead(vp, NFSPROC_LINK,
1748                 NFSX_FH(v3)*2 + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen));
1749         mb = mreq;
1750         bpos = mtod(mb, caddr_t);
1751         nfsm_fhtom(vp, v3);
1752         nfsm_fhtom(tdvp, v3);
1753         nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
1754         nfsm_request(vp, NFSPROC_LINK, cnp->cn_thread, cnp->cn_cred);
1755         if (v3) {
1756                 nfsm_postop_attr(vp, attrflag);
1757                 nfsm_wcc_data(tdvp, wccflag);
1758         }
1759         m_freem(mrep);
1760 nfsmout:
1761         mtx_lock(&(VTONFS(tdvp))->n_mtx);
1762         VTONFS(tdvp)->n_flag |= NMODIFIED;
1763         mtx_unlock(&(VTONFS(tdvp))->n_mtx);
1764         if (!attrflag)
1765                 VTONFS(vp)->n_attrstamp = 0;
1766         if (!wccflag)
1767                 VTONFS(tdvp)->n_attrstamp = 0;
1768         /*
1769          * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry.
1770          */
1771         if (error == EEXIST)
1772                 error = 0;
1773         return (error);
1774 }
1775
1776 /*
1777  * nfs symbolic link create call
1778  */
1779 static int
1780 nfs_symlink(struct vop_symlink_args *ap)
1781 {
1782         struct vnode *dvp = ap->a_dvp;
1783         struct vattr *vap = ap->a_vap;
1784         struct componentname *cnp = ap->a_cnp;
1785         struct nfsv2_sattr *sp;
1786         caddr_t bpos, dpos;
1787         int slen, error = 0, wccflag = NFSV3_WCCRATTR, gotvp;
1788         struct mbuf *mreq, *mrep, *md, *mb;
1789         struct vnode *newvp = NULL;
1790         int v3 = NFS_ISV3(dvp);
1791
1792         nfsstats.rpccnt[NFSPROC_SYMLINK]++;
1793         slen = strlen(ap->a_target);
1794         mreq = nfsm_reqhead(dvp, NFSPROC_SYMLINK, NFSX_FH(v3) + 2*NFSX_UNSIGNED +
1795             nfsm_rndup(cnp->cn_namelen) + nfsm_rndup(slen) + NFSX_SATTR(v3));
1796         mb = mreq;
1797         bpos = mtod(mb, caddr_t);
1798         nfsm_fhtom(dvp, v3);
1799         nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
1800         if (v3) {
1801                 nfsm_v3attrbuild(vap, FALSE);
1802         }
1803         nfsm_strtom(ap->a_target, slen, NFS_MAXPATHLEN);
1804         if (!v3) {
1805                 sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
1806                 sp->sa_mode = vtonfsv2_mode(VLNK, vap->va_mode);
1807                 sp->sa_uid = nfs_xdrneg1;
1808                 sp->sa_gid = nfs_xdrneg1;
1809                 sp->sa_size = nfs_xdrneg1;
1810                 txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
1811                 txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
1812         }
1813
1814         /*
1815          * Issue the NFS request and get the rpc response.
1816          *
1817          * Only NFSv3 responses returning an error of 0 actually return
1818          * a file handle that can be converted into newvp without having
1819          * to do an extra lookup rpc.
1820          */
1821         nfsm_request(dvp, NFSPROC_SYMLINK, cnp->cn_thread, cnp->cn_cred);
1822         if (v3) {
1823                 if (error == 0)
1824                         nfsm_mtofh(dvp, newvp, v3, gotvp);
1825                 nfsm_wcc_data(dvp, wccflag);
1826         }
1827
1828         /*
1829          * out code jumps -> here, mrep is also freed.
1830          */
1831
1832         m_freem(mrep);
1833 nfsmout:
1834
1835         /*
1836          * If we get an EEXIST error, silently convert it to no-error
1837          * in case of an NFS retry.
1838          */
1839         if (error == EEXIST)
1840                 error = 0;
1841
1842         /*
1843          * If we do not have (or no longer have) an error, and we could
1844          * not extract the newvp from the response due to the request being
1845          * NFSv2 or the error being EEXIST.  We have to do a lookup in order
1846          * to obtain a newvp to return.
1847          */
1848         if (error == 0 && newvp == NULL) {
1849                 struct nfsnode *np = NULL;
1850
1851                 error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
1852                     cnp->cn_cred, cnp->cn_thread, &np);
1853                 if (!error)
1854                         newvp = NFSTOV(np);
1855         }
1856         if (error) {
1857                 if (newvp)
1858                         vput(newvp);
1859         } else {
1860                 *ap->a_vpp = newvp;
1861         }
1862         mtx_lock(&(VTONFS(dvp))->n_mtx);
1863         VTONFS(dvp)->n_flag |= NMODIFIED;
1864         mtx_unlock(&(VTONFS(dvp))->n_mtx);
1865         if (!wccflag)
1866                 VTONFS(dvp)->n_attrstamp = 0;
1867         return (error);
1868 }
1869
1870 /*
1871  * nfs make dir call
1872  */
1873 static int
1874 nfs_mkdir(struct vop_mkdir_args *ap)
1875 {
1876         struct vnode *dvp = ap->a_dvp;
1877         struct vattr *vap = ap->a_vap;
1878         struct componentname *cnp = ap->a_cnp;
1879         struct nfsv2_sattr *sp;
1880         int len;
1881         struct nfsnode *np = NULL;
1882         struct vnode *newvp = NULL;
1883         caddr_t bpos, dpos;
1884         int error = 0, wccflag = NFSV3_WCCRATTR;
1885         int gotvp = 0;
1886         struct mbuf *mreq, *mrep, *md, *mb;
1887         struct vattr vattr;
1888         int v3 = NFS_ISV3(dvp);
1889
1890         if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_thread)) != 0) {
1891                 return (error);
1892         }
1893         len = cnp->cn_namelen;
1894         nfsstats.rpccnt[NFSPROC_MKDIR]++;
1895         mreq = nfsm_reqhead(dvp, NFSPROC_MKDIR,
1896           NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len) + NFSX_SATTR(v3));
1897         mb = mreq;
1898         bpos = mtod(mb, caddr_t);
1899         nfsm_fhtom(dvp, v3);
1900         nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN);
1901         if (v3) {
1902                 nfsm_v3attrbuild(vap, FALSE);
1903         } else {
1904                 sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
1905                 sp->sa_mode = vtonfsv2_mode(VDIR, vap->va_mode);
1906                 sp->sa_uid = nfs_xdrneg1;
1907                 sp->sa_gid = nfs_xdrneg1;
1908                 sp->sa_size = nfs_xdrneg1;
1909                 txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
1910                 txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
1911         }
1912         nfsm_request(dvp, NFSPROC_MKDIR, cnp->cn_thread, cnp->cn_cred);
1913         if (!error)
1914                 nfsm_mtofh(dvp, newvp, v3, gotvp);
1915         if (v3)
1916                 nfsm_wcc_data(dvp, wccflag);
1917         m_freem(mrep);
1918 nfsmout:
1919         mtx_lock(&(VTONFS(dvp))->n_mtx);
1920         VTONFS(dvp)->n_flag |= NMODIFIED;
1921         mtx_unlock(&(VTONFS(dvp))->n_mtx);
1922         if (!wccflag)
1923                 VTONFS(dvp)->n_attrstamp = 0;
1924         /*
1925          * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry
1926          * if we can succeed in looking up the directory.
1927          */
1928         if (error == EEXIST || (!error && !gotvp)) {
1929                 if (newvp) {
1930                         vput(newvp);
1931                         newvp = NULL;
1932                 }
1933                 error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cnp->cn_cred,
1934                         cnp->cn_thread, &np);
1935                 if (!error) {
1936                         newvp = NFSTOV(np);
1937                         if (newvp->v_type != VDIR)
1938                                 error = EEXIST;
1939                 }
1940         }
1941         if (error) {
1942                 if (newvp)
1943                         vput(newvp);
1944         } else
1945                 *ap->a_vpp = newvp;
1946         return (error);
1947 }
1948
1949 /*
1950  * nfs remove directory call
1951  */
1952 static int
1953 nfs_rmdir(struct vop_rmdir_args *ap)
1954 {
1955         struct vnode *vp = ap->a_vp;
1956         struct vnode *dvp = ap->a_dvp;
1957         struct componentname *cnp = ap->a_cnp;
1958         caddr_t bpos, dpos;
1959         int error = 0, wccflag = NFSV3_WCCRATTR;
1960         struct mbuf *mreq, *mrep, *md, *mb;
1961         int v3 = NFS_ISV3(dvp);
1962
1963         if (dvp == vp)
1964                 return (EINVAL);
1965         nfsstats.rpccnt[NFSPROC_RMDIR]++;
1966         mreq = nfsm_reqhead(dvp, NFSPROC_RMDIR,
1967                 NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen));
1968         mb = mreq;
1969         bpos = mtod(mb, caddr_t);
1970         nfsm_fhtom(dvp, v3);
1971         nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
1972         nfsm_request(dvp, NFSPROC_RMDIR, cnp->cn_thread, cnp->cn_cred);
1973         if (v3)
1974                 nfsm_wcc_data(dvp, wccflag);
1975         m_freem(mrep);
1976 nfsmout:
1977         mtx_lock(&(VTONFS(dvp))->n_mtx);
1978         VTONFS(dvp)->n_flag |= NMODIFIED;
1979         mtx_unlock(&(VTONFS(dvp))->n_mtx);
1980         if (!wccflag)
1981                 VTONFS(dvp)->n_attrstamp = 0;
1982         cache_purge(dvp);
1983         cache_purge(vp);
1984         /*
1985          * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry.
1986          */
1987         if (error == ENOENT)
1988                 error = 0;
1989         return (error);
1990 }
1991
1992 /*
1993  * nfs readdir call
1994  */
1995 static int
1996 nfs_readdir(struct vop_readdir_args *ap)
1997 {
1998         struct vnode *vp = ap->a_vp;
1999         struct nfsnode *np = VTONFS(vp);
2000         struct uio *uio = ap->a_uio;
2001         int tresid, error = 0;
2002         struct vattr vattr;
2003
2004         if (vp->v_type != VDIR)
2005                 return(EPERM);
2006
2007         /*
2008          * First, check for hit on the EOF offset cache
2009          */
2010         if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset &&
2011             (np->n_flag & NMODIFIED) == 0) {
2012                 if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_td) == 0) {
2013                         mtx_lock(&np->n_mtx);
2014                         if (!NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
2015                                 mtx_unlock(&np->n_mtx);
2016                                 nfsstats.direofcache_hits++;
2017                                 goto out;
2018                         } else
2019                                 mtx_unlock(&np->n_mtx);
2020                 }
2021         }
2022
2023         /*
2024          * Call nfs_bioread() to do the real work.
2025          */
2026         tresid = uio->uio_resid;
2027         error = nfs_bioread(vp, uio, 0, ap->a_cred);
2028
2029         if (!error && uio->uio_resid == tresid) {
2030                 nfsstats.direofcache_misses++;
2031         }
2032 out:
2033         return (error);
2034 }
2035
2036 /*
2037  * Readdir rpc call.
2038  * Called from below the buffer cache by nfs_doio().
2039  */
2040 int
2041 nfs_readdirrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
2042 {
2043         int len, left;
2044         struct dirent *dp = NULL;
2045         u_int32_t *tl;
2046         caddr_t cp;
2047         nfsuint64 *cookiep;
2048         caddr_t bpos, dpos;
2049         struct mbuf *mreq, *mrep, *md, *mb;
2050         nfsuint64 cookie;
2051         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2052         struct nfsnode *dnp = VTONFS(vp);
2053         u_quad_t fileno;
2054         int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1;
2055         int attrflag;
2056         int v3 = NFS_ISV3(vp);
2057
2058 #ifndef DIAGNOSTIC
2059         if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
2060                 (uiop->uio_resid & (DIRBLKSIZ - 1)))
2061                 panic("nfs readdirrpc bad uio");
2062 #endif
2063
2064         /*
2065          * If there is no cookie, assume directory was stale.
2066          */
2067         nfs_dircookie_lock(dnp);
2068         cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
2069         if (cookiep) {
2070                 cookie = *cookiep;
2071                 nfs_dircookie_unlock(dnp);
2072         } else {
2073                 nfs_dircookie_unlock(dnp);
2074                 return (NFSERR_BAD_COOKIE);
2075         }
2076
2077         /*
2078          * Loop around doing readdir rpc's of size nm_readdirsize
2079          * truncated to a multiple of DIRBLKSIZ.
2080          * The stopping criteria is EOF or buffer full.
2081          */
2082         while (more_dirs && bigenough) {
2083                 nfsstats.rpccnt[NFSPROC_READDIR]++;
2084                 mreq = nfsm_reqhead(vp, NFSPROC_READDIR, NFSX_FH(v3) +
2085                         NFSX_READDIR(v3));
2086                 mb = mreq;
2087                 bpos = mtod(mb, caddr_t);
2088                 nfsm_fhtom(vp, v3);
2089                 if (v3) {
2090                         tl = nfsm_build(u_int32_t *, 5 * NFSX_UNSIGNED);
2091                         *tl++ = cookie.nfsuquad[0];
2092                         *tl++ = cookie.nfsuquad[1];
2093                         mtx_lock(&dnp->n_mtx);
2094                         *tl++ = dnp->n_cookieverf.nfsuquad[0];
2095                         *tl++ = dnp->n_cookieverf.nfsuquad[1];
2096                         mtx_unlock(&dnp->n_mtx);
2097                 } else {
2098                         tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
2099                         *tl++ = cookie.nfsuquad[0];
2100                 }
2101                 *tl = txdr_unsigned(nmp->nm_readdirsize);
2102                 nfsm_request(vp, NFSPROC_READDIR, uiop->uio_td, cred);
2103                 if (v3) {
2104                         nfsm_postop_attr(vp, attrflag);
2105                         if (!error) {
2106                                 tl = nfsm_dissect(u_int32_t *,
2107                                     2 * NFSX_UNSIGNED);
2108                                 mtx_lock(&dnp->n_mtx);
2109                                 dnp->n_cookieverf.nfsuquad[0] = *tl++;
2110                                 dnp->n_cookieverf.nfsuquad[1] = *tl;
2111                                 mtx_unlock(&dnp->n_mtx);
2112                         } else {
2113                                 m_freem(mrep);
2114                                 goto nfsmout;
2115                         }
2116                 }
2117                 tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2118                 more_dirs = fxdr_unsigned(int, *tl);
2119
2120                 /* loop thru the dir entries, doctoring them to 4bsd form */
2121                 while (more_dirs && bigenough) {
2122                         if (v3) {
2123                                 tl = nfsm_dissect(u_int32_t *,
2124                                     3 * NFSX_UNSIGNED);
2125                                 fileno = fxdr_hyper(tl);
2126                                 len = fxdr_unsigned(int, *(tl + 2));
2127                         } else {
2128                                 tl = nfsm_dissect(u_int32_t *,
2129                                     2 * NFSX_UNSIGNED);
2130                                 fileno = fxdr_unsigned(u_quad_t, *tl++);
2131                                 len = fxdr_unsigned(int, *tl);
2132                         }
2133                         if (len <= 0 || len > NFS_MAXNAMLEN) {
2134                                 error = EBADRPC;
2135                                 m_freem(mrep);
2136                                 goto nfsmout;
2137                         }
2138                         tlen = nfsm_rndup(len);
2139                         if (tlen == len)
2140                                 tlen += 4;      /* To ensure null termination */
2141                         left = DIRBLKSIZ - blksiz;
2142                         if ((tlen + DIRHDSIZ) > left) {
2143                                 dp->d_reclen += left;
2144                                 uiop->uio_iov->iov_base =
2145                                     (char *)uiop->uio_iov->iov_base + left;
2146                                 uiop->uio_iov->iov_len -= left;
2147                                 uiop->uio_offset += left;
2148                                 uiop->uio_resid -= left;
2149                                 blksiz = 0;
2150                         }
2151                         if ((tlen + DIRHDSIZ) > uiop->uio_resid)
2152                                 bigenough = 0;
2153                         if (bigenough) {
2154                                 dp = (struct dirent *)uiop->uio_iov->iov_base;
2155                                 dp->d_fileno = (int)fileno;
2156                                 dp->d_namlen = len;
2157                                 dp->d_reclen = tlen + DIRHDSIZ;
2158                                 dp->d_type = DT_UNKNOWN;
2159                                 blksiz += dp->d_reclen;
2160                                 if (blksiz == DIRBLKSIZ)
2161                                         blksiz = 0;
2162                                 uiop->uio_offset += DIRHDSIZ;
2163                                 uiop->uio_resid -= DIRHDSIZ;
2164                                 uiop->uio_iov->iov_base =
2165                                     (char *)uiop->uio_iov->iov_base + DIRHDSIZ;
2166                                 uiop->uio_iov->iov_len -= DIRHDSIZ;
2167                                 nfsm_mtouio(uiop, len);
2168                                 cp = uiop->uio_iov->iov_base;
2169                                 tlen -= len;
2170                                 *cp = '\0';     /* null terminate */
2171                                 uiop->uio_iov->iov_base =
2172                                     (char *)uiop->uio_iov->iov_base + tlen;
2173                                 uiop->uio_iov->iov_len -= tlen;
2174                                 uiop->uio_offset += tlen;
2175                                 uiop->uio_resid -= tlen;
2176                         } else
2177                                 nfsm_adv(nfsm_rndup(len));
2178                         if (v3) {
2179                                 tl = nfsm_dissect(u_int32_t *,
2180                                     3 * NFSX_UNSIGNED);
2181                         } else {
2182                                 tl = nfsm_dissect(u_int32_t *,
2183                                     2 * NFSX_UNSIGNED);
2184                         }
2185                         if (bigenough) {
2186                                 cookie.nfsuquad[0] = *tl++;
2187                                 if (v3)
2188                                         cookie.nfsuquad[1] = *tl++;
2189                         } else if (v3)
2190                                 tl += 2;
2191                         else
2192                                 tl++;
2193                         more_dirs = fxdr_unsigned(int, *tl);
2194                 }
2195                 /*
2196                  * If at end of rpc data, get the eof boolean
2197                  */
2198                 if (!more_dirs) {
2199                         tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2200                         more_dirs = (fxdr_unsigned(int, *tl) == 0);
2201                 }
2202                 m_freem(mrep);
2203         }
2204         /*
2205          * Fill last record, iff any, out to a multiple of DIRBLKSIZ
2206          * by increasing d_reclen for the last record.
2207          */
2208         if (blksiz > 0) {
2209                 left = DIRBLKSIZ - blksiz;
2210                 dp->d_reclen += left;
2211                 uiop->uio_iov->iov_base =
2212                     (char *)uiop->uio_iov->iov_base + left;
2213                 uiop->uio_iov->iov_len -= left;
2214                 uiop->uio_offset += left;
2215                 uiop->uio_resid -= left;
2216         }
2217
2218         /*
2219          * We are now either at the end of the directory or have filled the
2220          * block.
2221          */
2222         if (bigenough)
2223                 dnp->n_direofoffset = uiop->uio_offset;
2224         else {
2225                 if (uiop->uio_resid > 0)
2226                         nfs_printf("EEK! readdirrpc resid > 0\n");
2227                 nfs_dircookie_lock(dnp);
2228                 cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
2229                 *cookiep = cookie;
2230                 nfs_dircookie_unlock(dnp);
2231         }
2232 nfsmout:
2233         return (error);
2234 }
2235
2236 /*
2237  * NFS V3 readdir plus RPC. Used in place of nfs_readdirrpc().
      *
      * Looks up the directory cookie saved for uiop->uio_offset and returns
      * NFSERR_BAD_COOKIE when there is none.  Otherwise it issues
      * NFSPROC_READDIRPLUS requests until the server reports end of directory
      * or the next entry no longer fits in uiop, converting each reply entry
      * into a struct dirent written through uiop->uio_iov.  Records are padded
      * so that none crosses a DIRBLKSIZ boundary.  When an entry carries
      * attributes and the flag read after them is set, the file handle is
      * fetched, the matching nfsnode's attributes are loaded and the name is
      * entered in the name cache; ".." is excluded (see the comment in the
      * body).
      *
      * vp    directory vnode being read
      * uiop  destination; must describe a single iovec
      * cred  credentials passed to nfsm_request()
      *
      * Returns the value left in error: 0, EBADRPC for an entry whose name
      * length is outside 1..NFS_MAXNAMLEN, or whatever the nfsm_* macros
      * stored there.
      *
      * NOTE(review): the nfsm_* macros are defined outside this function; they
      * appear to use the locals mreq/mrep/md/mb/bpos/dpos/error by name and
      * presumably branch to nfsmout on failure -- confirm before renaming or
      * reordering anything here.
2238  */
2239 int
2240 nfs_readdirplusrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
2241 {
2242         int len, left;
2243         struct dirent *dp;
2244         u_int32_t *tl;
2245         caddr_t cp;
2246         struct vnode *newvp;
2247         nfsuint64 *cookiep;
2248         caddr_t bpos, dpos, dpossav1, dpossav2;
2249         struct mbuf *mreq, *mrep, *md, *mb, *mdsav1, *mdsav2;
2250         struct nameidata nami, *ndp = &nami;
2251         struct componentname *cnp = &ndp->ni_cnd;
2252         nfsuint64 cookie;
2253         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2254         struct nfsnode *dnp = VTONFS(vp), *np;
2255         nfsfh_t *fhp;
2256         u_quad_t fileno;
2257         int error = 0, tlen, more_dirs = 1, blksiz = 0, doit, bigenough = 1, i;
2258         int attrflag, fhsize;
2259
2260 #ifndef nolint
2261         dp = NULL;
2262 #endif
     /*
      * NOTE(review): this sanity check is compiled only when DIAGNOSTIC is
      * NOT defined (#ifndef) -- confirm that is intended.
      */
2263 #ifndef DIAGNOSTIC
2264         if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
2265                 (uiop->uio_resid & (DIRBLKSIZ - 1)))
2266                 panic("nfs readdirplusrpc bad uio");
2267 #endif
2268         ndp->ni_dvp = vp;
2269         newvp = NULLVP;
2270
2271         /*
2272          * If there is no cookie, assume directory was stale.
2273          */
2274         nfs_dircookie_lock(dnp);
2275         cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
2276         if (cookiep) {
2277                 cookie = *cookiep;
2278                 nfs_dircookie_unlock(dnp);
2279         } else {
2280                 nfs_dircookie_unlock(dnp);
2281                 return (NFSERR_BAD_COOKIE);
2282         }
2283         /*
2284          * Loop around doing readdir rpc's of size nm_readdirsize
2285          * truncated to a multiple of DIRBLKSIZ.
2286          * The stopping criteria is EOF or buffer full.
2287          */
2288         while (more_dirs && bigenough) {
2289                 nfsstats.rpccnt[NFSPROC_READDIRPLUS]++;
2290                 mreq = nfsm_reqhead(vp, NFSPROC_READDIRPLUS,
2291                         NFSX_FH(1) + 6 * NFSX_UNSIGNED);
2292                 mb = mreq;
2293                 bpos = mtod(mb, caddr_t);
2294                 nfsm_fhtom(vp, 1);
                     /*
                      * Request body after the file handle: cookie (2 words),
                      * cookie verifier (2 words, copied under n_mtx), then
                      * nm_readdirsize and nm_rsize.
                      */
2295                 tl = nfsm_build(u_int32_t *, 6 * NFSX_UNSIGNED);
2296                 *tl++ = cookie.nfsuquad[0];
2297                 *tl++ = cookie.nfsuquad[1];
2298                 mtx_lock(&dnp->n_mtx);
2299                 *tl++ = dnp->n_cookieverf.nfsuquad[0];
2300                 *tl++ = dnp->n_cookieverf.nfsuquad[1];
2301                 mtx_unlock(&dnp->n_mtx);
2302                 *tl++ = txdr_unsigned(nmp->nm_readdirsize);
2303                 *tl = txdr_unsigned(nmp->nm_rsize);
2304                 nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_td, cred);
2305                 nfsm_postop_attr(vp, attrflag);
2306                 if (error) {
2307                         m_freem(mrep);
2308                         goto nfsmout;
2309                 }
                     /*
                      * Reply header: the new cookie verifier (2 words) and a
                      * third word that is used as the "an entry follows" flag.
                      */
2310                 tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
2311                 mtx_lock(&dnp->n_mtx);
2312                 dnp->n_cookieverf.nfsuquad[0] = *tl++;
2313                 dnp->n_cookieverf.nfsuquad[1] = *tl++;
2314                 mtx_unlock(&dnp->n_mtx);
2315                 more_dirs = fxdr_unsigned(int, *tl);
2316
2317                 /* loop thru the dir entries, doctoring them to 4bsd form */
2318                 while (more_dirs && bigenough) {
                             /* Per entry: 64-bit fileno, then the name length. */
2319                         tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
2320                         fileno = fxdr_hyper(tl);
2321                         len = fxdr_unsigned(int, *(tl + 2));
2322                         if (len <= 0 || len > NFS_MAXNAMLEN) {
2323                                 error = EBADRPC;
2324                                 m_freem(mrep);
2325                                 goto nfsmout;
2326                         }
2327                         tlen = nfsm_rndup(len);
2328                         if (tlen == len)
2329                                 tlen += 4;      /* To ensure null termination*/
2330                         left = DIRBLKSIZ - blksiz;
                             /*
                              * The record does not fit in what is left of the
                              * current DIRBLKSIZ block: grow the previously
                              * stored record to the end of the block and start
                              * a new block.
                              */
2331                         if ((tlen + DIRHDSIZ) > left) {
2332                                 dp->d_reclen += left;
2333                                 uiop->uio_iov->iov_base =
2334                                     (char *)uiop->uio_iov->iov_base + left;
2335                                 uiop->uio_iov->iov_len -= left;
2336                                 uiop->uio_offset += left;
2337                                 uiop->uio_resid -= left;
2338                                 blksiz = 0;
2339                         }
                             /*
                              * No room left in the caller's buffer: from here on
                              * the entry is skipped over instead of being copied.
                              */
2340                         if ((tlen + DIRHDSIZ) > uiop->uio_resid)
2341                                 bigenough = 0;
2342                         if (bigenough) {
2343                                 dp = (struct dirent *)uiop->uio_iov->iov_base;
2344                                 dp->d_fileno = (int)fileno;
2345                                 dp->d_namlen = len;
2346                                 dp->d_reclen = tlen + DIRHDSIZ;
2347                                 dp->d_type = DT_UNKNOWN;
2348                                 blksiz += dp->d_reclen;
2349                                 if (blksiz == DIRBLKSIZ)
2350                                         blksiz = 0;
2351                                 uiop->uio_offset += DIRHDSIZ;
2352                                 uiop->uio_resid -= DIRHDSIZ;
2353                                 uiop->uio_iov->iov_base =
2354                                     (char *)uiop->uio_iov->iov_base + DIRHDSIZ;
2355                                 uiop->uio_iov->iov_len -= DIRHDSIZ;
                                     /* The name lands in the uio buffer; cnp points at it for cache_enter(). */
2356                                 cnp->cn_nameptr = uiop->uio_iov->iov_base;
2357                                 cnp->cn_namelen = len;
2358                                 nfsm_mtouio(uiop, len);
2359                                 cp = uiop->uio_iov->iov_base;
2360                                 tlen -= len;
2361                                 *cp = '\0';
2362                                 uiop->uio_iov->iov_base =
2363                                     (char *)uiop->uio_iov->iov_base + tlen;
2364                                 uiop->uio_iov->iov_len -= tlen;
2365                                 uiop->uio_offset += tlen;
2366                                 uiop->uio_resid -= tlen;
2367                         } else
2368                                 nfsm_adv(nfsm_rndup(len));
                             /*
                              * Entry cookie (2 words) followed by the attribute
                              * flag.  The cookie is remembered only for entries
                              * that were actually stored.
                              */
2369                         tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
2370                         if (bigenough) {
2371                                 cookie.nfsuquad[0] = *tl++;
2372                                 cookie.nfsuquad[1] = *tl++;
2373                         } else
2374                                 tl += 2;
2375
2376                         /*
2377                          * Since the attributes are before the file handle
2378                          * (sigh), we must skip over the attributes and then
2379                          * come back and get them.
2380                          */
2381                         attrflag = fxdr_unsigned(int, *tl);
2382                         if (attrflag) {
2383                             dpossav1 = dpos;
2384                             mdsav1 = md;
2385                             nfsm_adv(NFSX_V3FATTR);
2386                             tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2387                             doit = fxdr_unsigned(int, *tl);
2388                             /*
2389                              * Skip loading the attrs for "..". There's a
2390                              * race between loading the attrs here and
2391                              * lookups that look for the directory currently
2392                              * being read (in the parent). We try to acquire
2393                              * the exclusive lock on ".." here, owning the
2394                              * lock on the directory being read. Lookup will
2395                              * hold the lock on ".." and try to acquire the
2396                              * lock on the directory being read.
2397                              *
2398                              * There are other ways of fixing this, one would
2399                              * be to do a trylock on the ".." vnode and skip
2400                              * loading the attrs on ".." if it happens to be
2401                              * locked by another process. But skipping the
2402                              * attrload on ".." seems the easiest option.
2403                              */
                                 /*
                                  * NOTE(review): dp is assigned only when bigenough
                                  * is true, so for an entry that did not fit this
                                  * tests the previously stored record's name (dp is
                                  * still NULL if nothing was stored) -- confirm.
                                  */
2404                             if (strcmp(dp->d_name, "..") == 0) {
2405                                     doit = 0;
2406                                     /*
2407                                      * We've already skipped over the attrs,
2408                                      * skip over the filehandle. And store d_type
2409                                      * as VDIR.
2410                                      */
2411                                     tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2412                                     i = fxdr_unsigned(int, *tl);
2413                                     nfsm_adv(nfsm_rndup(i));
2414                                     dp->d_type = IFTODT(VTTOIF(VDIR));
2415                             }
2416                             if (doit) {
2417                                 nfsm_getfh(fhp, fhsize, 1);
                                     /*
                                      * Handle equal to the directory's own: reuse vp
                                      * with an extra reference.  Otherwise get the
                                      * node through nfs_nget(); on failure the entry
                                      * is kept but its attributes are not loaded.
                                      */
2418                                 if (NFS_CMPFH(dnp, fhp, fhsize)) {
2419                                     VREF(vp);
2420                                     newvp = vp;
2421                                     np = dnp;
2422                                 } else {
2423                                     error = nfs_nget(vp->v_mount, fhp,
2424                                         fhsize, &np, LK_EXCLUSIVE);
2425                                     if (error)
2426                                         doit = 0;
2427                                     else
2428                                         newvp = NFSTOV(np);
2429                                 }
2430                             }
2431                             if (doit && bigenough) {
                                     /*
                                      * Rewind the parse position to the saved start
                                      * of the attributes, load them into newvp, then
                                      * restore the position reached after the file
                                      * handle.
                                      */
2432                                 dpossav2 = dpos;
2433                                 dpos = dpossav1;
2434                                 mdsav2 = md;
2435                                 md = mdsav1;
2436                                 nfsm_loadattr(newvp, NULL);
2437                                 dpos = dpossav2;
2438                                 md = mdsav2;
2439                                 dp->d_type =
2440                                     IFTODT(VTTOIF(np->n_vattr.va_type));
2441                                 ndp->ni_vp = newvp;
2442                                 /* Update n_ctime, so subsequent lookup doesn't purge entry */
2443                                 np->n_ctime = np->n_vattr.va_ctime.tv_sec;
2444                                 cache_enter(ndp->ni_dvp, ndp->ni_vp, cnp);
2445                             }
2446                         } else {
2447                             /* Just skip over the file handle */
2448                             tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2449                             i = fxdr_unsigned(int, *tl);
2450                             if (i) {
2451                                     tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2452                                     fhsize = fxdr_unsigned(int, *tl);
2453                                     nfsm_adv(nfsm_rndup(fhsize));
2454                             }
2455                         }
                             /*
                              * Drop what was taken for this entry: vrele() when
                              * newvp is vp itself (VREF'd above), vput() when it
                              * came from nfs_nget(..., LK_EXCLUSIVE).
                              */
2456                         if (newvp != NULLVP) {
2457                             if (newvp == vp)
2458                                 vrele(newvp);
2459                             else
2460                                 vput(newvp);
2461                             newvp = NULLVP;
2462                         }
                             /* Flag telling whether another entry follows. */
2463                         tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2464                         more_dirs = fxdr_unsigned(int, *tl);
2465                 }
2466                 /*
2467                  * If at end of rpc data, get the eof boolean
                      * (more_dirs becomes true again when that boolean is 0,
                      * which makes the outer loop issue another request).
2468                  */
2469                 if (!more_dirs) {
2470                         tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2471                         more_dirs = (fxdr_unsigned(int, *tl) == 0);
2472                 }
2473                 m_freem(mrep);
2474         }
2475         /*
2476          * Fill last record, iff any, out to a multiple of DIRBLKSIZ
2477          * by increasing d_reclen for the last record.
2478          */
2479         if (blksiz > 0) {
2480                 left = DIRBLKSIZ - blksiz;
2481                 dp->d_reclen += left;
2482                 uiop->uio_iov->iov_base =
2483                     (char *)uiop->uio_iov->iov_base + left;
2484                 uiop->uio_iov->iov_len -= left;
2485                 uiop->uio_offset += left;
2486                 uiop->uio_resid -= left;
2487         }
2488
2489         /*
2490          * We are now either at the end of the directory or have filled the
2491          * block.
              * At end of directory the final offset is recorded in
              * n_direofoffset; otherwise the last stored entry's cookie is saved
              * for the new uio_offset.
2492          */
2493         if (bigenough)
2494                 dnp->n_direofoffset = uiop->uio_offset;
2495         else {
2496                 if (uiop->uio_resid > 0)
2497                         nfs_printf("EEK! readdirplusrpc resid > 0\n");
2498                 nfs_dircookie_lock(dnp);
2499                 cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
2500                 *cookiep = cookie;
2501                 nfs_dircookie_unlock(dnp);
2502         }
     /*
      * Reached by falling through and by the explicit "goto nfsmout"
      * statements above; releases a vnode still held in newvp.
      */
2503 nfsmout:
2504         if (newvp != NULLVP) {
2505                 if (newvp == vp)
2506                         vrele(newvp);
2507                 else
2508                         vput(newvp);
2509                 newvp = NULLVP;
2510         }
2511         return (error);
2512 }
2513
2514 /*
2515  * Silly rename. To make the NFS filesystem that is stateless look a little
2516  * more like the "ufs" a remove of an active vnode is translated to a rename
2517  * to a funny looking filename that is removed by nfs_inactive on the
2518  * nfsnode. There is the potential for another process on a different client
2519  * to create the same funny name between the nfs_lookitup() fails and the
2520  * nfs_rename() completes, but...
2521  */
2522 static int
2523 nfs_sillyrename(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
2524 {
2525         struct sillyrename *sp;
2526         struct nfsnode *np;
2527         int error;
2528         short pid;
2529         unsigned int lticks;
2530
2531         cache_purge(dvp);
2532         np = VTONFS(vp);
2533 #ifndef DIAGNOSTIC
2534         if (vp->v_type == VDIR)
2535                 panic("nfs: sillyrename dir");
2536 #endif
2537         MALLOC(sp, struct sillyrename *, sizeof (struct sillyrename),
2538                 M_NFSREQ, M_WAITOK);
2539         sp->s_cred = crhold(cnp->cn_cred);
2540         sp->s_dvp = dvp;
2541         sp->s_removeit = nfs_removeit;
2542         VREF(dvp);
2543
2544         /*
2545          * Fudge together a funny name.
2546          * Changing the format of the funny name to accomodate more
2547          * sillynames per directory.
2548          * The name is now changed to .nfs.<ticks>.<pid>.4, where ticks is
2549          * CPU ticks since boot.
2550          */
2551         pid = cnp->cn_thread->td_proc->p_pid;
2552         lticks = (unsigned int)ticks;
2553         for ( ; ; ) {
2554                 sp->s_namlen = sprintf(sp->s_name,
2555                                        ".nfs.%08x.%04x4.4", lticks,
2556                                        pid);
2557                 if (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
2558                                  cnp->cn_thread, NULL))
2559                         break;
2560                 lticks++;
2561         }
2562         error = nfs_renameit(dvp, cnp, sp);
2563         if (error)
2564                 goto bad;
2565         error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
2566                 cnp->cn_thread, &np);
2567         np->n_sillyrename = sp;
2568         return (0);
2569 bad:
2570         vrele(sp->s_dvp);
2571         crfree(sp->s_cred);
2572         free((caddr_t)sp, M_NFSREQ);
2573         return (error);
2574 }
2575
2576 /*
2577  * Look up a file name and optionally either update the file handle or
2578  * allocate an nfsnode, depending on the value of npp.
2579  * npp == NULL  --> just do the lookup
2580  * *npp == NULL --> allocate a new nfsnode and make sure attributes are
2581  *                      handled too
2582  * *npp != NULL --> update the file handle in the vnode
      *
      * dvp       directory vnode the name is looked up in
      * name/len  component to look up
      * cred, td  passed to nfsm_request()
      * npp       see above; on success with *npp == NULL it receives the node
      *
      * Returns the value left in error.  For NFSv3, ENOENT is returned when
      * the reply carries no attributes and a new node was requested
      * (*npp == NULL).
      *
      * NOTE(review): the nfsm_* macros are defined elsewhere; they appear to
      * use the locals mreq/mrep/md/mb/bpos/dpos/error by name and presumably
      * branch to nfsmout on failure -- confirm before restructuring.
2583  */
2584 static int
2585 nfs_lookitup(struct vnode *dvp, const char *name, int len, struct ucred *cred,
2586     struct thread *td, struct nfsnode **npp)
2587 {
2588         struct vnode *newvp = NULL;
2589         struct nfsnode *np, *dnp = VTONFS(dvp);
2590         caddr_t bpos, dpos;
2591         int error = 0, fhlen, attrflag;
2592         struct mbuf *mreq, *mrep, *md, *mb;
2593         nfsfh_t *nfhp;
2594         int v3 = NFS_ISV3(dvp);
2595
2596         nfsstats.rpccnt[NFSPROC_LOOKUP]++;
2597         mreq = nfsm_reqhead(dvp, NFSPROC_LOOKUP,
2598                 NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len));
2599         mb = mreq;
2600         bpos = mtod(mb, caddr_t);
2601         nfsm_fhtom(dvp, v3);
2602         nfsm_strtom(name, len, NFS_MAXNAMLEN);
2603         nfsm_request(dvp, NFSPROC_LOOKUP, td, cred);
             /* The reply body is parsed only when the caller passed npp. */
2604         if (npp && !error) {
2605                 nfsm_getfh(nfhp, fhlen, v3);
2606                 if (*npp) {
                         /*
                          * Caller supplied a node: replace its file handle,
                          * switching between the inline n_fh and a separately
                          * allocated buffer when the size crosses NFS_SMALLFH.
                          * NOTE(review): when both the old and the new handle
                          * exceed NFS_SMALLFH the existing buffer is reused as
                          * is -- confirm it is always large enough for fhlen.
                          */
2607                     np = *npp;
2608                     if (np->n_fhsize > NFS_SMALLFH && fhlen <= NFS_SMALLFH) {
2609                         free((caddr_t)np->n_fhp, M_NFSBIGFH);
2610                         np->n_fhp = &np->n_fh;
2611                     } else if (np->n_fhsize <= NFS_SMALLFH && fhlen>NFS_SMALLFH)
2612                         np->n_fhp =(nfsfh_t *)malloc(fhlen, M_NFSBIGFH, M_WAITOK);
2613                     bcopy((caddr_t)nfhp, (caddr_t)np->n_fhp, fhlen);
2614                     np->n_fhsize = fhlen;
2615                     newvp = NFSTOV(np);
2616                 } else if (NFS_CMPFH(dnp, nfhp, fhlen)) {
                         /*
                          * The handle is the directory's own: take an extra
                          * reference on dvp.
                          * NOTE(review): np is not assigned on this path, yet
                          * "*npp = np" below runs when there is no error --
                          * confirm this cannot store an uninitialized pointer.
                          */
2617                     VREF(dvp);
2618                     newvp = dvp;
2619                 } else {
2620                     error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np, LK_EXCLUSIVE);
2621                     if (error) {
2622                         m_freem(mrep);
2623                         return (error);
2624                     }
2625                     newvp = NFSTOV(np);
2626                 }
2627                 if (v3) {
2628                         nfsm_postop_attr(newvp, attrflag);
                             /*
                              * A newly obtained node without attributes is
                              * released and reported as ENOENT.
                              */
2629                         if (!attrflag && *npp == NULL) {
2630                                 m_freem(mrep);
2631                                 if (newvp == dvp)
2632                                         vrele(newvp);
2633                                 else
2634                                         vput(newvp);
2635                                 return (ENOENT);
2636                         }
2637                 } else
2638                         nfsm_loadattr(newvp, NULL);
2639         }
2640         m_freem(mrep);
2641 nfsmout:
             /*
              * Only a node obtained here (*npp was NULL) is either released on
              * error or handed back to the caller on success.
              */
2642         if (npp && *npp == NULL) {
2643                 if (error) {
2644                         if (newvp) {
2645                                 if (newvp == dvp)
2646                                         vrele(newvp);
2647                                 else
2648                                         vput(newvp);
2649                         }
2650                 } else
2651                         *npp = np;
2652         }
2653         return (error);
2654 }
2655
2656 /*
2657  * Nfs Version 3 commit rpc
      *
      * vp      vnode whose data is to be committed
      * offset  64-bit starting offset placed in the request
      * cnt     32-bit byte count placed in the request
      * cred    credentials passed to nfsm_request()
      * td      thread passed to nfsm_request()
      *
      * Returns 0 without sending anything when NFSSTA_HASWRITEVERF is not set
      * in the mount's nm_state.  Otherwise returns the value left in error;
      * NFSERR_STALEWRITEVERF is returned when the verifier in the reply
      * differs from the one cached in nmp->nm_verf (the cached copy is
      * replaced in that case).
      *
      * NOTE(review): the nfsm_* macros are defined elsewhere; they appear to
      * use the locals mreq/mrep/md/mb/bpos/dpos/error by name and presumably
      * branch to nfsmout on failure -- confirm before restructuring.
2658  */
2659 int
2660 nfs_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred,
2661            struct thread *td)
2662 {
2663         u_int32_t *tl;
2664         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2665         caddr_t bpos, dpos;
2666         int error = 0, wccflag = NFSV3_WCCRATTR;
2667         struct mbuf *mreq, *mrep, *md, *mb;
2668
             /* nm_state is examined under nm_mtx; the mutex is dropped before the RPC. */
2669         mtx_lock(&nmp->nm_mtx);
2670         if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) {
2671                 mtx_unlock(&nmp->nm_mtx);
2672                 return (0);
2673         }
2674         mtx_unlock(&nmp->nm_mtx);
2675         nfsstats.rpccnt[NFSPROC_COMMIT]++;
2676         mreq = nfsm_reqhead(vp, NFSPROC_COMMIT, NFSX_FH(1));
2677         mb = mreq;
2678         bpos = mtod(mb, caddr_t);
2679         nfsm_fhtom(vp, 1);
             /* Arguments after the file handle: 64-bit offset, then the count. */
2680         tl = nfsm_build(u_int32_t *, 3 * NFSX_UNSIGNED);
2681         txdr_hyper(offset, tl);
2682         tl += 2;
2683         *tl = txdr_unsigned(cnt);
2684         nfsm_request(vp, NFSPROC_COMMIT, td, cred);
2685         nfsm_wcc_data(vp, wccflag);
2686         if (!error) {
                     /*
                      * Compare the verifier in the reply with the cached one;
                      * on mismatch remember the new one and report it as stale.
                      * NOTE(review): nm_verf is read and written here without
                      * nm_mtx held -- confirm that is acceptable.
                      */
2687                 tl = nfsm_dissect(u_int32_t *, NFSX_V3WRITEVERF);
2688                 if (bcmp((caddr_t)nmp->nm_verf, (caddr_t)tl,
2689                         NFSX_V3WRITEVERF)) {
2690                         bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
2691                                 NFSX_V3WRITEVERF);
2692                         error = NFSERR_STALEWRITEVERF;
2693                 }
2694         }
2695         m_freem(mrep);
2696 nfsmout:
2697         return (error);
2698 }
2699
2700 /*
2701  * Strategy routine.
2702  * For async requests when nfsiod(s) are running, queue the request by
2703  * calling nfs_asyncio(), otherwise just all nfs_doio() to do the
2704  * request.
2705  */
2706 static int
2707 nfs_strategy(struct vop_strategy_args *ap)
2708 {
2709         struct buf *bp = ap->a_bp;
2710         struct ucred *cr;
2711
2712         KASSERT(!(bp->b_flags & B_DONE), ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp));
2713         KASSERT(BUF_REFCNT(bp) > 0, ("nfs_strategy: buffer %p not locked", bp));
2714
2715         if (bp->b_iocmd == BIO_READ)
2716                 cr = bp->b_rcred;
2717         else
2718                 cr = bp->b_wcred;
2719
2720         /*
2721          * If the op is asynchronous and an i/o daemon is waiting
2722          * queue the request, wake it up and wait for completion
2723          * otherwise just do it ourselves.
2724          */
2725         if ((bp->b_flags & B_ASYNC) == 0 ||
2726             nfs_asyncio(VFSTONFS(ap->a_vp->v_mount), bp, NOCRED, curthread))
2727                 (void)nfs_doio(ap->a_vp, bp, cr, curthread);
2728         return (0);
2729 }
2730
2731 /*
2732  * fsync vnode op. Just call nfs_flush() with commit == 1.
2733  */
2734 /* ARGSUSED */
2735 static int
2736 nfs_fsync(struct vop_fsync_args *ap)
2737 {
2738         return (nfs_flush(ap->a_vp, ap->a_waitfor, ap->a_td, 1));
2739 }
2740
2741 /*
2742  * Flush all the blocks associated with a vnode.
2743  *      Walk through the buffer pool and push any dirty pages
2744  *      associated with the vnode.
2745  */
2746 static int
2747 nfs_flush(struct vnode *vp, int waitfor, struct thread *td,
2748     int commit)
2749 {
2750         struct nfsnode *np = VTONFS(vp);
2751         struct buf *bp;
2752         int i;
2753         struct buf *nbp;
2754         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2755         int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos;
2756         int passone = 1;
2757         u_quad_t off, endoff, toff;
2758         struct ucred* wcred = NULL;
2759         struct buf **bvec = NULL;
2760 #ifndef NFS_COMMITBVECSIZ
2761 #define NFS_COMMITBVECSIZ       20
2762 #endif
2763         struct buf *bvec_on_stack[NFS_COMMITBVECSIZ];
2764         int bvecsize = 0, bveccount;
2765
2766         if (nmp->nm_flag & NFSMNT_INT)
2767                 slpflag = PCATCH;
2768         if (!commit)
2769                 passone = 0;
2770         /*
2771          * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the
2772          * server, but has not been committed to stable storage on the server
2773          * yet. On the first pass, the byte range is worked out and the commit
2774          * rpc is done. On the second pass, nfs_writebp() is called to do the
2775          * job.
2776          */
2777 again:
2778         off = (u_quad_t)-1;
2779         endoff = 0;
2780         bvecpos = 0;
2781         if (NFS_ISV3(vp) && commit) {
2782                 s = splbio();
2783                 if (bvec != NULL && bvec != bvec_on_stack)
2784                         free(bvec, M_TEMP);
2785                 /*
2786                  * Count up how many buffers waiting for a commit.
2787                  */
2788                 bveccount = 0;
2789                 VI_LOCK(vp);
2790                 TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
2791                         if (BUF_REFCNT(bp) == 0 &&
2792                             (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
2793                                 == (B_DELWRI | B_NEEDCOMMIT))
2794                                 bveccount++;
2795                 }
2796                 /*
2797                  * Allocate space to remember the list of bufs to commit.  It is
2798                  * important to use M_NOWAIT here to avoid a race with nfs_write.
2799                  * If we can't get memory (for whatever reason), we will end up
2800                  * committing the buffers one-by-one in the loop below.
2801                  */
2802                 if (bveccount > NFS_COMMITBVECSIZ) {
2803                         /*
2804                          * Release the vnode interlock to avoid a lock
2805                          * order reversal.
2806                          */
2807                         VI_UNLOCK(vp);
2808                         bvec = (struct buf **)
2809                                 malloc(bveccount * sizeof(struct buf *),
2810                                        M_TEMP, M_NOWAIT);
2811                         VI_LOCK(vp);
2812                         if (bvec == NULL) {
2813                                 bvec = bvec_on_stack;
2814                                 bvecsize = NFS_COMMITBVECSIZ;
2815                         } else
2816                                 bvecsize = bveccount;
2817                 } else {
2818                         bvec = bvec_on_stack;
2819                         bvecsize = NFS_COMMITBVECSIZ;
2820                 }
2821                 TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
2822                         if (bvecpos >= bvecsize)
2823                                 break;
2824                         if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
2825                                 nbp = TAILQ_NEXT(bp, b_bobufs);
2826                                 continue;
2827                         }
2828                         if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
2829                             (B_DELWRI | B_NEEDCOMMIT)) {
2830                                 BUF_UNLOCK(bp);
2831                                 nbp = TAILQ_NEXT(bp, b_bobufs);
2832                                 continue;
2833                         }
2834                         VI_UNLOCK(vp);
2835                         bremfree(bp);
2836                         /*
2837                          * Work out if all buffers are using the same cred
2838                          * so we can deal with them all with one commit.
2839                          *
2840                          * NOTE: we are not clearing B_DONE here, so we have
2841                          * to do it later on in this routine if we intend to
2842                          * initiate I/O on the bp.
2843                          *
2844                          * Note: to avoid loopback deadlocks, we do not
2845                          * assign b_runningbufspace.
2846                          */
2847                         if (wcred == NULL)
2848                                 wcred = bp->b_wcred;
2849                         else if (wcred != bp->b_wcred)
2850                                 wcred = NOCRED;
2851                         vfs_busy_pages(bp, 1);
2852
2853                         VI_LOCK(vp);
2854                         /*
2855                          * bp is protected by being locked, but nbp is not
2856                          * and vfs_busy_pages() may sleep.  We have to
2857                          * recalculate nbp.
2858                          */
2859                         nbp = TAILQ_NEXT(bp, b_bobufs);
2860
2861                         /*
2862                          * A list of these buffers is kept so that the
2863                          * second loop knows which buffers have actually
2864                          * been committed. This is necessary, since there
2865                          * may be a race between the commit rpc and new
2866                          * uncommitted writes on the file.
2867                          */
2868                         bvec[bvecpos++] = bp;
2869                         toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
2870                                 bp->b_dirtyoff;
2871                         if (toff < off)
2872                                 off = toff;
2873                         toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
2874                         if (toff > endoff)
2875                                 endoff = toff;
2876                 }
2877                 splx(s);
2878                 VI_UNLOCK(vp);
2879         }
2880         if (bvecpos > 0) {
2881                 /*
2882                  * Commit data on the server, as required.
2883                  * If all bufs are using the same wcred, then use that with
2884                  * one call for all of them, otherwise commit each one
2885                  * separately.
2886                  */
2887                 if (wcred != NOCRED)
2888                         retv = nfs_commit(vp, off, (int)(endoff - off),
2889                                           wcred, td);
2890                 else {
2891                         retv = 0;
2892                         for (i = 0; i < bvecpos; i++) {
2893                                 off_t off, size;
2894                                 bp = bvec[i];
2895                                 off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
2896                                         bp->b_dirtyoff;
2897                                 size = (u_quad_t)(bp->b_dirtyend
2898                                                   - bp->b_dirtyoff);
2899                                 retv = nfs_commit(vp, off, (int)size,
2900                                                   bp->b_wcred, td);
2901                                 if (retv) break;
2902                         }
2903                 }
2904
2905                 if (retv == NFSERR_STALEWRITEVERF)
2906                         nfs_clearcommit(vp->v_mount);
2907
2908                 /*
2909                  * Now, either mark the blocks I/O done or mark the
2910                  * blocks dirty, depending on whether the commit
2911                  * succeeded.
2912                  */
2913                 for (i = 0; i < bvecpos; i++) {
2914                         bp = bvec[i];
2915                         bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
2916                         if (retv) {
2917                                 /*
2918                                  * Error, leave B_DELWRI intact
2919                                  */
2920                                 vfs_unbusy_pages(bp);
2921                                 brelse(bp);
2922                         } else {
2923                                 /*
2924                                  * Success, remove B_DELWRI ( bundirty() ).
2925                                  *
2926                                  * b_dirtyoff/b_dirtyend seem to be NFS
2927                                  * specific.  We should probably move that
2928                                  * into bundirty(). XXX
2929                                  */
2930                                 s = splbio();
2931                                 bufobj_wref(&vp->v_bufobj);
2932                                 bp->b_flags |= B_ASYNC;
2933                                 bundirty(bp);
2934                                 bp->b_flags &= ~B_DONE;
2935                                 bp->b_ioflags &= ~BIO_ERROR;
2936                                 bp->b_dirtyoff = bp->b_dirtyend = 0;
2937                                 splx(s);
2938                                 bufdone(bp);
2939                         }
2940                 }
2941         }
2942
2943         /*
2944          * Start/do any write(s) that are required.
2945          */
2946 loop:
2947         s = splbio();
2948         VI_LOCK(vp);
2949         TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
2950                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
2951                         if (waitfor != MNT_WAIT || passone)
2952                                 continue;
2953
2954                         error = BUF_TIMELOCK(bp,
2955                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
2956                             VI_MTX(vp), "nfsfsync", slpflag, slptimeo);
2957                         splx(s);
2958                         if (error == 0) {
2959                                 BUF_UNLOCK(bp);
2960                                 goto loop;
2961                         }
2962                         if (error == ENOLCK)
2963                                 goto loop;
2964                         if (nfs_sigintr(nmp, NULL, td)) {
2965                                 error = EINTR;
2966                                 goto done;
2967                         }
2968                         if (slpflag == PCATCH) {
2969                                 slpflag = 0;
2970                                 slptimeo = 2 * hz;
2971                         }
2972                         goto loop;
2973                 }
2974                 if ((bp->b_flags & B_DELWRI) == 0)
2975                         panic("nfs_fsync: not dirty");
2976                 if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) {
2977                         BUF_UNLOCK(bp);
2978                         continue;
2979                 }
2980                 VI_UNLOCK(vp);
2981                 bremfree(bp);
2982                 if (passone || !commit)
2983                     bp->b_flags |= B_ASYNC;
2984                 else
2985                     bp->b_flags |= B_ASYNC;
2986                 splx(s);
2987                 bwrite(bp);
2988                 if (nfs_sigintr(nmp, NULL, td)) {
2989                         error = EINTR;
2990                         goto done;
2991                 }
2992                 goto loop;
2993         }
2994         splx(s);
2995         if (passone) {
2996                 passone = 0;
2997                 VI_UNLOCK(vp);
2998                 goto again;
2999         }
3000         if (waitfor == MNT_WAIT) {
3001                 while (vp->v_bufobj.bo_numoutput) {
3002                         error = bufobj_wwait(&vp->v_bufobj, slpflag, slptimeo);
3003                         if (error) {
3004                             VI_UNLOCK(vp);
3005                             error = nfs_sigintr(nmp, NULL, td);
3006                             if (error)
3007                                 goto done;
3008                             if (slpflag == PCATCH) {
3009                                 slpflag = 0;
3010                                 slptimeo = 2 * hz;
3011                             }
3012                             VI_LOCK(vp);
3013                         }
3014                 }
3015                 if (vp->v_bufobj.bo_dirty.bv_cnt != 0 && commit) {
3016                         VI_UNLOCK(vp);
3017                         goto loop;
3018                 }
3019                 /*
3020                  * Wait for all the async IO requests to drain
3021                  */
3022                 VI_UNLOCK(vp);
3023                 mtx_lock(&np->n_mtx);
3024                 while (np->n_directio_asyncwr > 0) {
3025                         np->n_flag |= NFSYNCWAIT;
3026                         error = nfs_msleep(td, (caddr_t)&np->n_directio_asyncwr,
3027                                            &np->n_mtx, slpflag | (PRIBIO + 1),
3028                                            "nfsfsync", 0);
3029                         if (error) {
3030                                 if (nfs_sigintr(nmp, (struct nfsreq *)0, td)) {
3031                                         mtx_unlock(&np->n_mtx);
3032                                         error = EINTR;
3033                                         goto done;
3034                                 }
3035                         }
3036                 }
3037                 mtx_unlock(&np->n_mtx);
3038         } else
3039                 VI_UNLOCK(vp);
3040         mtx_lock(&np->n_mtx);
3041         if (np->n_flag & NWRITEERR) {
3042                 error = np->n_error;
3043                 np->n_flag &= ~NWRITEERR;
3044         }
3045         if (commit && vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
3046             vp->v_bufobj.bo_numoutput == 0 && np->n_directio_asyncwr == 0)
3047                 np->n_flag &= ~NMODIFIED;
3048         mtx_unlock(&np->n_mtx);
3049 done:
3050         if (bvec != NULL && bvec != bvec_on_stack)
3051                 free(bvec, M_TEMP);
3052         return (error);
3053 }
3054
3055 /*
3056  * NFS advisory byte-level locks.
3057  */
3058 static int
3059 nfs_advlock(struct vop_advlock_args *ap)
3060 {
3061         int error;
3062
3063         mtx_lock(&Giant);
3064         if ((VFSTONFS(ap->a_vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
3065                 struct nfsnode *np = VTONFS(ap->a_vp);
3066
3067                 error = lf_advlock(ap, &(np->n_lockf), np->n_size);
3068                 goto out;
3069         }
3070         error = nfs_dolock(ap);
3071 out:
3072         mtx_unlock(&Giant);
3073         return (error);
3074 }
3075
3076 /*
3077  * Print out the contents of an nfsnode.
3078  */
3079 static int
3080 nfs_print(struct vop_print_args *ap)
3081 {
3082         struct vnode *vp = ap->a_vp;
3083         struct nfsnode *np = VTONFS(vp);
3084
3085         nfs_printf("\tfileid %ld fsid 0x%x",
3086            np->n_vattr.va_fileid, np->n_vattr.va_fsid);
3087         if (vp->v_type == VFIFO)
3088                 fifo_printinfo(vp);
3089         printf("\n");
3090         return (0);
3091 }
3092
3093 /*
3094  * This is the "real" nfs::bwrite(struct buf*).
3095  * We set B_CACHE if this is a VMIO buffer.
3096  */
3097 int
3098 nfs_writebp(struct buf *bp, int force __unused, struct thread *td)
3099 {
3100         int s;
3101         int oldflags = bp->b_flags;
3102 #if 0
3103         int retv = 1;
3104         off_t off;
3105 #endif
3106
3107         if (BUF_REFCNT(bp) == 0)
3108                 panic("bwrite: buffer is not locked???");
3109
3110         if (bp->b_flags & B_INVAL) {
3111                 brelse(bp);
3112                 return(0);
3113         }
3114
3115         bp->b_flags |= B_CACHE;
3116
3117         /*
3118          * Undirty the bp.  We will redirty it later if the I/O fails.
3119          */
3120
3121         s = splbio();
3122         bundirty(bp);
3123         bp->b_flags &= ~B_DONE;
3124         bp->b_ioflags &= ~BIO_ERROR;
3125         bp->b_iocmd = BIO_WRITE;
3126
3127         bufobj_wref(bp->b_bufobj);
3128         curthread->td_proc->p_stats->p_ru.ru_oublock++;
3129         splx(s);
3130
3131         /*
3132          * Note: to avoid loopback deadlocks, we do not
3133          * assign b_runningbufspace.
3134          */
3135         vfs_busy_pages(bp, 1);
3136
3137         BUF_KERNPROC(bp);
3138         bp->b_iooffset = dbtob(bp->b_blkno);
3139         bstrategy(bp);
3140
3141         if( (oldflags & B_ASYNC) == 0) {
3142                 int rtval = bufwait(bp);
3143
3144                 if (oldflags & B_DELWRI) {
3145                         s = splbio();
3146                         reassignbuf(bp);
3147                         splx(s);
3148                 }
3149                 brelse(bp);
3150                 return (rtval);
3151         }
3152
3153         return (0);
3154 }
3155
3156 /*
3157  * nfs special file access vnode op.
3158  * Essentially just get vattr and then imitate iaccess() since the device is
3159  * local to the client.
3160  */
3161 static int
3162 nfsspec_access(struct vop_access_args *ap)
3163 {
3164         struct vattr *vap;
3165         struct ucred *cred = ap->a_cred;
3166         struct vnode *vp = ap->a_vp;
3167         mode_t mode = ap->a_mode;
3168         struct vattr vattr;
3169         int error;
3170
3171         /*
3172          * Disallow write attempts on filesystems mounted read-only;
3173          * unless the file is a socket, fifo, or a block or character
3174          * device resident on the filesystem.
3175          */
3176         if ((mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
3177                 switch (vp->v_type) {
3178                 case VREG:
3179                 case VDIR:
3180                 case VLNK:
3181                         return (EROFS);
3182                 default:
3183                         break;
3184                 }
3185         }
3186         vap = &vattr;
3187         error = VOP_GETATTR(vp, vap, cred, ap->a_td);
3188         if (error)
3189                 goto out;
3190         error  = vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid,
3191                          mode, cred, NULL);
3192 out:
3193         return error;
3194 }
3195
3196 /*
3197  * Read wrapper for fifos.
3198  */
3199 static int
3200 nfsfifo_read(struct vop_read_args *ap)
3201 {
3202         struct nfsnode *np = VTONFS(ap->a_vp);
3203         int error;
3204
3205         /*
3206          * Set access flag.
3207          */
3208         mtx_lock(&np->n_mtx);
3209         np->n_flag |= NACC;
3210         getnanotime(&np->n_atim);
3211         mtx_unlock(&np->n_mtx);
3212         error = fifo_specops.vop_read(ap);
3213         return error;
3214 }
3215
3216 /*
3217  * Write wrapper for fifos.
3218  */
3219 static int
3220 nfsfifo_write(struct vop_write_args *ap)
3221 {
3222         struct nfsnode *np = VTONFS(ap->a_vp);
3223
3224         /*
3225          * Set update flag.
3226          */
3227         mtx_lock(&np->n_mtx);
3228         np->n_flag |= NUPD;
3229         getnanotime(&np->n_mtim);
3230         mtx_unlock(&np->n_mtx);
3231         return(fifo_specops.vop_write(ap));
3232 }
3233
3234 /*
3235  * Close wrapper for fifos.
3236  *
3237  * Update the times on the nfsnode then do fifo close.
3238  */
3239 static int
3240 nfsfifo_close(struct vop_close_args *ap)
3241 {
3242         struct vnode *vp = ap->a_vp;
3243         struct nfsnode *np = VTONFS(vp);
3244         struct vattr vattr;
3245         struct timespec ts;
3246
3247         mtx_lock(&np->n_mtx);
3248         if (np->n_flag & (NACC | NUPD)) {
3249                 getnanotime(&ts);
3250                 if (np->n_flag & NACC)
3251                         np->n_atim = ts;
3252                 if (np->n_flag & NUPD)
3253                         np->n_mtim = ts;
3254                 np->n_flag |= NCHG;
3255                 if (vrefcnt(vp) == 1 &&
3256                     (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
3257                         VATTR_NULL(&vattr);
3258                         if (np->n_flag & NACC)
3259                                 vattr.va_atime = np->n_atim;
3260                         if (np->n_flag & NUPD)
3261                                 vattr.va_mtime = np->n_mtim;
3262                         mtx_unlock(&np->n_mtx);
3263                         (void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_td);
3264                         goto out;
3265                 }
3266         }
3267         mtx_unlock(&np->n_mtx);
3268 out:
3269         return (fifo_specops.vop_close(ap));
3270 }
3271
3272 /*
3273  * Just call nfs_writebp() with the force argument set to 1.
3274  *
3275  * NOTE: B_DONE may or may not be set in a_bp on call.
3276  */
3277 static int
3278 nfs_bwrite(struct buf *bp)
3279 {
3280
3281         return (nfs_writebp(bp, 1, curthread));
3282 }
3283
3284 struct buf_ops buf_ops_nfs = {
3285         .bop_name       =       "buf_ops_nfs",
3286         .bop_write      =       nfs_bwrite,
3287         .bop_strategy   =       bufstrategy,
3288         .bop_sync       =       bufsync,
3289         .bop_bdflush    =       bufbdflush,
3290 };