sys/nfsclient/nfs_socket.c

   1 /*-
   2  * Copyright (c) 1989, 1991, 1993, 1995
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * This code is derived from software contributed to Berkeley by
   6  * Rick Macklem at The University of Guelph.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 4. Neither the name of the University nor the names of its contributors
  17  *    may be used to endorse or promote products derived from this software
  18  *    without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30  * SUCH DAMAGE.
  31  *
  32  *      @(#)nfs_socket.c        8.5 (Berkeley) 3/30/95
  33  */
  34
  35 #include <sys/cdefs.h>
  36 __FBSDID("$FreeBSD$");
  37
  38 /*
  39  * Socket operations for use by nfs
  40  */
  41
  42 #include "opt_inet6.h"
  43
  44 #include <sys/param.h>
  45 #include <sys/systm.h>
  46 #include <sys/kernel.h>
  47 #include <sys/lock.h>
  48 #include <sys/malloc.h>
  49 #include <sys/mbuf.h>
  50 #include <sys/mount.h>
  51 #include <sys/mutex.h>
  52 #include <sys/proc.h>
  53 #include <sys/protosw.h>
  54 #include <sys/signalvar.h>
  55 #include <sys/syscallsubr.h>
  56 #include <sys/socket.h>
  57 #include <sys/socketvar.h>
  58 #include <sys/sysctl.h>
  59 #include <sys/syslog.h>
  60 #include <sys/vnode.h>
  61
  62 #include <netinet/in.h>
  63 #include <netinet/tcp.h>
  64
  65 #include <rpc/rpcclnt.h>
  66
  67 #include <nfs/rpcv2.h>
  68 #include <nfs/nfsproto.h>
  69 #include <nfsclient/nfs.h>
  70 #include <nfs/xdr_subs.h>
  71 #include <nfsclient/nfsm_subs.h>
  72 #include <nfsclient/nfsmount.h>
  73 #include <nfsclient/nfsnode.h>
  74
  75 #include <nfs4client/nfs4.h>
  76
  77 #define TRUE    1
  78 #define FALSE   0
  79
  80 extern u_int32_t nfs_xid;
  81
  82 static int      nfs_realign_test;
  83 static int      nfs_realign_count;
  84 static int      nfs_bufpackets = 4;
  85 static int      nfs_reconnects;
  86 static int     nfs3_jukebox_delay = 10;
  87 static int     nfs_skip_wcc_data_onerr = 1;
  88
  89 SYSCTL_DECL(_vfs_nfs);
  90
  91 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
  92 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
  93 SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, "");
  94 SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
  95     "number of times the nfs client has had to reconnect");
  96 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW, &nfs3_jukebox_delay, 0,
  97            "number of seconds to delay a retry after receiving EJUKEBOX");
  98 SYSCTL_INT(_vfs_nfs, OID_AUTO, skip_wcc_data_onerr, CTLFLAG_RW, &nfs_skip_wcc_data_onerr, 0, "");
  99
 100 /*
 101  * There is a congestion window for outstanding rpcs maintained per mount
 102  * point. The cwnd size is adjusted in roughly the way that:
 103  * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
 104  * SIGCOMM '88". ACM, August 1988.
 105  * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
 106  * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
 107  * of rpcs is in progress.
 108  * (The sent count and cwnd are scaled for integer arith.)
 109  * Variants of "slow start" were tried and were found to be too much of a
 110  * performance hit (ave. rtt 3 times larger),
 111  * I suspect due to the large rtt that nfs rpcs have.
 112  */
 113 #define NFS_CWNDSCALE   256
 114 #define NFS_MAXCWND     (NFS_CWNDSCALE * 32)
 115 #define NFS_NBACKOFF    8
 116 static int nfs_backoff[NFS_NBACKOFF] = { 2, 4, 8, 16, 32, 64, 128, 256, };
 117 struct callout  nfs_callout;
 118
 119 static int      nfs_msg(struct thread *, const char *, const char *, int);
 120 static int      nfs_realign(struct mbuf **pm, int hsiz);
 121 static int      nfs_reply(struct nfsreq *);
 122 static void     nfs_softterm(struct nfsreq *rep);
 123 static int      nfs_reconnect(struct nfsreq *rep);
 124 static void nfs_clnt_tcp_soupcall(struct socket *so, void *arg, int waitflag);
 125 static void nfs_clnt_udp_soupcall(struct socket *so, void *arg, int waitflag);
 126
 127 extern struct mtx nfs_reqq_mtx;
 128
 129 /*
 130  * RTT estimator
 131  */
 132
 133 static enum nfs_rto_timer_t nfs_proct[NFS_NPROCS] = {
 134         NFS_DEFAULT_TIMER,      /* NULL */
 135         NFS_GETATTR_TIMER,      /* GETATTR */
 136         NFS_DEFAULT_TIMER,      /* SETATTR */
 137         NFS_LOOKUP_TIMER,       /* LOOKUP */
 138         NFS_GETATTR_TIMER,      /* ACCESS */
 139         NFS_READ_TIMER,         /* READLINK */
 140         NFS_READ_TIMER,         /* READ */
 141         NFS_WRITE_TIMER,        /* WRITE */
 142         NFS_DEFAULT_TIMER,      /* CREATE */
 143         NFS_DEFAULT_TIMER,      /* MKDIR */
 144         NFS_DEFAULT_TIMER,      /* SYMLINK */
 145         NFS_DEFAULT_TIMER,      /* MKNOD */
 146         NFS_DEFAULT_TIMER,      /* REMOVE */
 147         NFS_DEFAULT_TIMER,      /* RMDIR */
 148         NFS_DEFAULT_TIMER,      /* RENAME */
 149         NFS_DEFAULT_TIMER,      /* LINK */
 150         NFS_READ_TIMER,         /* READDIR */
 151         NFS_READ_TIMER,         /* READDIRPLUS */
 152         NFS_DEFAULT_TIMER,      /* FSSTAT */
 153         NFS_DEFAULT_TIMER,      /* FSINFO */
 154         NFS_DEFAULT_TIMER,      /* PATHCONF */
 155         NFS_DEFAULT_TIMER,      /* COMMIT */
 156         NFS_DEFAULT_TIMER,      /* NOOP */
 157 };
 158
 159 /*
 160  * Choose the correct RTT timer for this NFS procedure.
 161  */
 162 static inline enum nfs_rto_timer_t
 163 nfs_rto_timer(u_int32_t procnum)
 164 {
 165         return nfs_proct[procnum];
 166 }
 167
 168 /*
 169  * Initialize the RTT estimator state for a new mount point.
 170  */
 171 static void
 172 nfs_init_rtt(struct nfsmount *nmp)
 173 {
 174         int i;
 175
 176         for (i = 0; i < NFS_MAX_TIMER; i++)
 177                 nmp->nm_srtt[i] = NFS_INITRTT;
 178         for (i = 0; i < NFS_MAX_TIMER; i++)
 179                 nmp->nm_sdrtt[i] = 0;
 180 }
 181
 182 /*
 183  * Update a mount point's RTT estimator state using data from the
 184  * passed-in request.
 185  *
 186  * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
 187  *
 188  * NB: Since the timer resolution of NFS_HZ is so course, it can often
 189  * result in r_rtt == 0. Since r_rtt == N means that the actual RTT is
 190  * between N + dt and N + 2 - dt ticks, add 1 before calculating the
 191  * update values.
 192  */
 193 static void
 194 nfs_update_rtt(struct nfsreq *rep)
 195 {
 196         int t1 = rep->r_rtt + 1;
 197         int index = nfs_rto_timer(rep->r_procnum) - 1;
 198         int *srtt = &rep->r_nmp->nm_srtt[index];
 199         int *sdrtt = &rep->r_nmp->nm_sdrtt[index];
 200
 201         t1 -= *srtt >> 3;
 202         *srtt += t1;
 203         if (t1 < 0)
 204                 t1 = -t1;
 205         t1 -= *sdrtt >> 2;
 206         *sdrtt += t1;
 207 }
 208
 209 /*
 210  * Estimate RTO for an NFS RPC sent via an unreliable datagram.
 211  *
 212  * Use the mean and mean deviation of RTT for the appropriate type
 213  * of RPC for the frequent RPCs and a default for the others.
 214  * The justification for doing "other" this way is that these RPCs
 215  * happen so infrequently that timer est. would probably be stale.
 216  * Also, since many of these RPCs are non-idempotent, a conservative
 217  * timeout is desired.
 218  *
 219  * getattr, lookup - A+2D
 220  * read, write     - A+4D
 221  * other           - nm_timeo
 222  */
 223 static int
 224 nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
 225 {
 226         enum nfs_rto_timer_t timer = nfs_rto_timer(procnum);
 227         int index = timer - 1;
 228         int rto;
 229
 230         switch (timer) {
 231         case NFS_GETATTR_TIMER:
 232         case NFS_LOOKUP_TIMER:
 233                 rto = ((nmp->nm_srtt[index] + 3) >> 2) +
 234                                 ((nmp->nm_sdrtt[index] + 1) >> 1);
 235                 break;
 236         case NFS_READ_TIMER:
 237         case NFS_WRITE_TIMER:
 238                 rto = ((nmp->nm_srtt[index] + 7) >> 3) +
 239                                 (nmp->nm_sdrtt[index] + 1);
 240                 break;
 241         default:
 242                 rto = nmp->nm_timeo;
 243                 return (rto);
 244         }
 245
 246         if (rto < NFS_MINRTO)
 247                 rto = NFS_MINRTO;
 248         else if (rto > NFS_MAXRTO)
 249                 rto = NFS_MAXRTO;
 250
 251         return (rto);
 252 }
 253
 254
 255 /*
 256  * Initialize sockets and congestion for a new NFS connection.
 257  * We do not free the sockaddr if error.
 258  */
 259 int
 260 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
 261 {
 262         struct socket *so;
 263         int error, rcvreserve, sndreserve;
 264         int pktscale;
 265         struct sockaddr *saddr;
 266         struct thread *td = &thread0; /* only used for socreate and sobind */
 267
 268         if (nmp->nm_sotype == SOCK_STREAM) {
 269                 mtx_lock(&nmp->nm_mtx);
 270                 nmp->nm_nfstcpstate.flags |= NFS_TCP_EXPECT_RPCMARKER;
 271                 nmp->nm_nfstcpstate.rpcresid = 0;
 272                 mtx_unlock(&nmp->nm_mtx);
 273         }
 274         nmp->nm_so = NULL;
 275         saddr = nmp->nm_nam;
 276         error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
 277                 nmp->nm_soproto, nmp->nm_mountp->mnt_cred, td);
 278         if (error)
 279                 goto bad;
 280         so = nmp->nm_so;
 281         nmp->nm_soflags = so->so_proto->pr_flags;
 282
 283         /*
 284          * Some servers require that the client port be a reserved port number.
 285          */
 286         if (nmp->nm_flag & NFSMNT_RESVPORT) {
 287                 struct sockopt sopt;
 288                 int ip, ip2, len;
 289                 struct sockaddr_in6 ssin;
 290                 struct sockaddr *sa;
 291
 292                 bzero(&sopt, sizeof sopt);
 293                 switch(saddr->sa_family) {
 294                 case AF_INET:
 295                         sopt.sopt_level = IPPROTO_IP;
 296                         sopt.sopt_name = IP_PORTRANGE;
 297                         ip = IP_PORTRANGE_LOW;
 298                         ip2 = IP_PORTRANGE_DEFAULT;
 299                         len = sizeof (struct sockaddr_in);
 300                         break;
 301 #ifdef INET6
 302                 case AF_INET6:
 303                         sopt.sopt_level = IPPROTO_IPV6;
 304                         sopt.sopt_name = IPV6_PORTRANGE;
 305                         ip = IPV6_PORTRANGE_LOW;
 306                         ip2 = IPV6_PORTRANGE_DEFAULT;
 307                         len = sizeof (struct sockaddr_in6);
 308                         break;
 309 #endif
 310                 default:
 311                         goto noresvport;
 312                 }
 313                 sa = (struct sockaddr *)&ssin;
 314                 bzero(sa, len);
 315                 sa->sa_len = len;
 316                 sa->sa_family = saddr->sa_family;
 317                 sopt.sopt_dir = SOPT_SET;
 318                 sopt.sopt_val = (void *)&ip;
 319                 sopt.sopt_valsize = sizeof(ip);
 320                 error = sosetopt(so, &sopt);
 321                 if (error)
 322                         goto bad;
 323                 error = sobind(so, sa, td);
 324                 if (error)
 325                         goto bad;
 326                 ip = ip2;
 327                 error = sosetopt(so, &sopt);
 328                 if (error)
 329                         goto bad;
 330         noresvport: ;
 331         }
 332
 333         /*
 334          * Protocols that do not require connections may be optionally left
 335          * unconnected for servers that reply from a port other than NFS_PORT.
 336          */
 337         mtx_lock(&nmp->nm_mtx);
 338         if (nmp->nm_flag & NFSMNT_NOCONN) {
 339                 if (nmp->nm_soflags & PR_CONNREQUIRED) {
 340                         error = ENOTCONN;
 341                         mtx_unlock(&nmp->nm_mtx);
 342                         goto bad;
 343                 } else
 344                         mtx_unlock(&nmp->nm_mtx);
 345         } else {
 346                 mtx_unlock(&nmp->nm_mtx);
 347                 error = soconnect(so, nmp->nm_nam, td);
 348                 if (error)
 349                         goto bad;
 350
 351                 /*
 352                  * Wait for the connection to complete. Cribbed from the
 353                  * connect system call but with the wait timing out so
 354                  * that interruptible mounts don't hang here for a long time.
 355                  */
 356                 SOCK_LOCK(so);
 357                 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
 358                         (void) msleep(&so->so_timeo, SOCK_MTX(so),
 359                             PSOCK, "nfscon", 2 * hz);
 360                         if ((so->so_state & SS_ISCONNECTING) &&
 361                             so->so_error == 0 && rep &&
 362                             (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0) {
 363                                 so->so_state &= ~SS_ISCONNECTING;
 364                                 SOCK_UNLOCK(so);
 365                                 goto bad;
 366                         }
 367                 }
 368                 if (so->so_error) {
 369                         error = so->so_error;
 370                         so->so_error = 0;
 371                         SOCK_UNLOCK(so);
 372                         goto bad;
 373                 }
 374                 SOCK_UNLOCK(so);
 375         }
 376         so->so_rcv.sb_timeo = 12 * hz;
 377         so->so_snd.sb_timeo = 5 * hz;
 378
 379         /*
 380          * Get buffer reservation size from sysctl, but impose reasonable
 381          * limits.
 382          */
 383         pktscale = nfs_bufpackets;
 384         if (pktscale < 2)
 385                 pktscale = 2;
 386         if (pktscale > 64)
 387                 pktscale = 64;
 388         mtx_lock(&nmp->nm_mtx);
 389         if (nmp->nm_sotype == SOCK_DGRAM) {
 390                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
 391                 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
 392                     NFS_MAXPKTHDR) * pktscale;
 393         } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
 394                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
 395                 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
 396                     NFS_MAXPKTHDR) * pktscale;
 397         } else {
 398                 if (nmp->nm_sotype != SOCK_STREAM)
 399                         panic("nfscon sotype");
 400                 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
 401                         struct sockopt sopt;
 402                         int val;
 403
 404                         bzero(&sopt, sizeof sopt);
 405                         sopt.sopt_dir = SOPT_SET;
 406                         sopt.sopt_level = SOL_SOCKET;
 407                         sopt.sopt_name = SO_KEEPALIVE;
 408                         sopt.sopt_val = &val;
 409                         sopt.sopt_valsize = sizeof val;
 410                         val = 1;
 411                         mtx_unlock(&nmp->nm_mtx);
 412                         sosetopt(so, &sopt);
 413                         mtx_lock(&nmp->nm_mtx);
 414                 }
 415                 if (so->so_proto->pr_protocol == IPPROTO_TCP) {
 416                         struct sockopt sopt;
 417                         int val;
 418
 419                         bzero(&sopt, sizeof sopt);
 420                         sopt.sopt_dir = SOPT_SET;
 421                         sopt.sopt_level = IPPROTO_TCP;
 422                         sopt.sopt_name = TCP_NODELAY;
 423                         sopt.sopt_val = &val;
 424                         sopt.sopt_valsize = sizeof val;
 425                         val = 1;
 426                         mtx_unlock(&nmp->nm_mtx);
 427                         sosetopt(so, &sopt);
 428                         mtx_lock(&nmp->nm_mtx);
 429                 }
 430                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
 431                     sizeof (u_int32_t)) * pktscale;
 432                 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
 433                     sizeof (u_int32_t)) * pktscale;
 434         }
 435         mtx_unlock(&nmp->nm_mtx);
 436         error = soreserve(so, sndreserve, rcvreserve);
 437         if (error)
 438                 goto bad;
 439         SOCKBUF_LOCK(&so->so_rcv);
 440         so->so_rcv.sb_flags |= SB_NOINTR;
 441         so->so_upcallarg = (caddr_t)nmp;
 442         if (so->so_type == SOCK_STREAM)
 443                 so->so_upcall = nfs_clnt_tcp_soupcall;
 444         else
 445                 so->so_upcall = nfs_clnt_udp_soupcall;
 446         so->so_rcv.sb_flags |= SB_UPCALL;
 447         SOCKBUF_UNLOCK(&so->so_rcv);
 448         SOCKBUF_LOCK(&so->so_snd);
 449         so->so_snd.sb_flags |= SB_NOINTR;
 450         SOCKBUF_UNLOCK(&so->so_snd);
 451
 452         mtx_lock(&nmp->nm_mtx);
 453         /* Initialize other non-zero congestion variables */
 454         nfs_init_rtt(nmp);
 455         nmp->nm_cwnd = NFS_MAXCWND / 2;     /* Initial send window */
 456         nmp->nm_sent = 0;
 457         nmp->nm_timeouts = 0;
 458         mtx_unlock(&nmp->nm_mtx);
 459         return (0);
 460
 461 bad:
 462         nfs_disconnect(nmp);
 463         return (error);
 464 }
 465
 466 /*
 467  * Reconnect routine:
 468  * Called when a connection is broken on a reliable protocol.
 469  * - clean up the old socket
 470  * - nfs_connect() again
 471  * - set R_MUSTRESEND for all outstanding requests on mount point
 472  * If this fails the mount point is DEAD!
 473  * nb: Must be called with the nfs_sndlock() set on the mount point.
 474  */
 475 static int
 476 nfs_reconnect(struct nfsreq *rep)
 477 {
 478         struct nfsreq *rp;
 479         struct nfsmount *nmp = rep->r_nmp;
 480         int error;
 481
 482         nfs_reconnects++;
 483         nfs_disconnect(nmp);
 484         while ((error = nfs_connect(nmp, rep)) != 0) {
 485                 if (error == ERESTART)
 486                         error = EINTR;
 487                 if (error == EIO || error == EINTR)
 488                         return (error);
 489                 (void) tsleep(&lbolt, PSOCK, "nfscon", 0);
 490         }
 491
 492         /*
 493          * Clear the FORCE_RECONNECT flag only after the connect
 494          * succeeds. To prevent races between multiple processes
 495          * waiting on the mountpoint where the connection is being
 496          * torn down. The first one to acquire the sndlock will
 497          * retry the connection. The others block on the sndlock
 498          * until the connection is established successfully, and
 499          * then re-transmit the request.
 500          */
 501         mtx_lock(&nmp->nm_mtx);
 502         nmp->nm_nfstcpstate.flags &= ~NFS_TCP_FORCE_RECONNECT;
 503         nmp->nm_nfstcpstate.rpcresid = 0;
 504         mtx_unlock(&nmp->nm_mtx);
 505
 506         /*
 507          * Loop through outstanding request list and fix up all requests
 508          * on old socket.
 509          */
 510         mtx_lock(&nfs_reqq_mtx);
 511         TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
 512                 if (rp->r_nmp == nmp) {
 513                         mtx_lock(&rp->r_mtx);
 514                         rp->r_flags |= R_MUSTRESEND;
 515                         mtx_unlock(&rp->r_mtx);
 516                 }
 517         }
 518         mtx_unlock(&nfs_reqq_mtx);
 519         return (0);
 520 }
 521
 522 /*
 523  * NFS disconnect. Clean up and unlink.
 524  */
 525 void
 526 nfs_disconnect(struct nfsmount *nmp)
 527 {
 528         struct socket *so;
 529
 530         mtx_lock(&nmp->nm_mtx);
 531         if (nmp->nm_so) {
 532                 so = nmp->nm_so;
 533                 nmp->nm_so = NULL;
 534                 mtx_unlock(&nmp->nm_mtx);
 535                 SOCKBUF_LOCK(&so->so_rcv);
 536                 so->so_upcallarg = NULL;
 537                 so->so_upcall = NULL;
 538                 so->so_rcv.sb_flags &= ~SB_UPCALL;
 539                 SOCKBUF_UNLOCK(&so->so_rcv);
 540                 soshutdown(so, SHUT_WR);
 541                 soclose(so);
 542         } else
 543                 mtx_unlock(&nmp->nm_mtx);
 544 }
 545
 546 void
 547 nfs_safedisconnect(struct nfsmount *nmp)
 548 {
 549         struct nfsreq dummyreq;
 550
 551         bzero(&dummyreq, sizeof(dummyreq));
 552         dummyreq.r_nmp = nmp;
 553         nfs_disconnect(nmp);
 554 }
 555
 556 /*
 557  * This is the nfs send routine. For connection based socket types, it
 558  * must be called with an nfs_sndlock() on the socket.
 559  * - return EINTR if the RPC is terminated, 0 otherwise
 560  * - set R_MUSTRESEND if the send fails for any reason
 561  * - do any cleanup required by recoverable socket errors (?)
 562  */
 563 int
 564 nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
 565     struct nfsreq *rep)
 566 {
 567         struct sockaddr *sendnam;
 568         int error, error2, soflags, flags;
 569
 570         KASSERT(rep, ("nfs_send: called with rep == NULL"));
 571
 572         error = nfs_sigintr(rep->r_nmp, rep, rep->r_td);
 573         if (error) {
 574                 m_freem(top);
 575                 return (error);
 576         }
 577         mtx_lock(&rep->r_nmp->nm_mtx);
 578         mtx_lock(&rep->r_mtx);
 579         if ((so = rep->r_nmp->nm_so) == NULL) {
 580                 rep->r_flags |= R_MUSTRESEND;
 581                 mtx_unlock(&rep->r_mtx);
 582                 mtx_unlock(&rep->r_nmp->nm_mtx);
 583                 m_freem(top);
 584                 return (0);
 585         }
 586         rep->r_flags &= ~R_MUSTRESEND;
 587         soflags = rep->r_nmp->nm_soflags;
 588         mtx_unlock(&rep->r_mtx);
 589         mtx_unlock(&rep->r_nmp->nm_mtx);
 590
 591         if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
 592                 sendnam = NULL;
 593         else
 594                 sendnam = nam;
 595         if (so->so_type == SOCK_SEQPACKET)
 596                 flags = MSG_EOR;
 597         else
 598                 flags = 0;
 599
 600         error = sosend(so, sendnam, 0, top, 0, flags, curthread /*XXX*/);
 601         if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
 602                 error = 0;
 603                 mtx_lock(&rep->r_mtx);
 604                 rep->r_flags |= R_MUSTRESEND;
 605                 mtx_unlock(&rep->r_mtx);
 606         }
 607
 608         if (error) {
 609                 /*
 610                  * Don't report EPIPE errors on nfs sockets.
 611                  * These can be due to idle tcp mounts which will be closed by
 612                  * netapp, solaris, etc. if left idle too long.
 613                  */
 614                 if (error != EPIPE) {
 615                         log(LOG_INFO, "nfs send error %d for server %s\n",
 616                             error,
 617                             rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
 618                 }
 619                 /*
 620                  * Deal with errors for the client side.
 621                  */
 622                 error2 = NFS_SIGREP(rep);
 623                 if (error2)
 624                         error = error2;
 625                 else {
 626                         mtx_lock(&rep->r_mtx);
 627                         rep->r_flags |= R_MUSTRESEND;
 628                         mtx_unlock(&rep->r_mtx);
 629                 }
 630
 631                 /*
 632                  * Handle any recoverable (soft) socket errors here. (?)
 633                  * Make EWOULDBLOCK a recoverable error, we'll rexmit from nfs_timer().
 634                  */
 635                 if (error != EINTR && error != ERESTART && error != EIO && error != EPIPE)
 636                         error = 0;
 637         }
 638         return (error);
 639 }
 640
 641 int
 642 nfs_reply(struct nfsreq *rep)
 643 {
 644         register struct socket *so;
 645         register struct mbuf *m;
 646         int error = 0, sotype, slpflag;
 647
 648         sotype = rep->r_nmp->nm_sotype;
 649         /*
 650          * For reliable protocols, lock against other senders/receivers
 651          * in case a reconnect is necessary.
 652          */
 653         if (sotype != SOCK_DGRAM) {
 654                 error = nfs_sndlock(rep);
 655                 if (error)
 656                         return (error);
 657 tryagain:
 658                 mtx_lock(&rep->r_nmp->nm_mtx);
 659                 mtx_lock(&rep->r_mtx);
 660                 if (rep->r_mrep) {
 661                         mtx_unlock(&rep->r_mtx);
 662                         mtx_unlock(&rep->r_nmp->nm_mtx);
 663                         nfs_sndunlock(rep);
 664                         return (0);
 665                 }
 666                 if (rep->r_flags & R_SOFTTERM) {
 667                         mtx_unlock(&rep->r_mtx);
 668                         mtx_unlock(&rep->r_nmp->nm_mtx);
 669                         nfs_sndunlock(rep);
 670                         return (EINTR);
 671                 }
 672                 so = rep->r_nmp->nm_so;
 673                 if (!so ||
 674                     (rep->r_nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT)) {
 675                         mtx_unlock(&rep->r_mtx);
 676                         mtx_unlock(&rep->r_nmp->nm_mtx);
 677                         error = nfs_reconnect(rep);
 678                         if (error) {
 679                                 nfs_sndunlock(rep);
 680                                 return (error);
 681                         }
 682                         goto tryagain;
 683                 }
 684                 while (rep->r_flags & R_MUSTRESEND) {
 685                         mtx_unlock(&rep->r_mtx);
 686                         mtx_unlock(&rep->r_nmp->nm_mtx);
 687                         m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
 688                         nfsstats.rpcretries++;
 689                         error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
 690                         if (error) {
 691                                 if (error == EINTR || error == ERESTART ||
 692                                     (error = nfs_reconnect(rep)) != 0) {
 693                                         nfs_sndunlock(rep);
 694                                         return (error);
 695                                 }
 696                                 goto tryagain;
 697                         }
 698                         mtx_lock(&rep->r_nmp->nm_mtx);
 699                         mtx_lock(&rep->r_mtx);
 700                 }
 701                 mtx_unlock(&rep->r_nmp->nm_mtx);
 702                 mtx_unlock(&rep->r_mtx);
 703                 nfs_sndunlock(rep);
 704         }
 705         slpflag = 0;
 706         mtx_lock(&rep->r_nmp->nm_mtx);
 707         if (rep->r_nmp->nm_flag & NFSMNT_INT)
 708                 slpflag = PCATCH;
 709         mtx_unlock(&rep->r_nmp->nm_mtx);
 710         mtx_lock(&rep->r_mtx);
 711         while ((rep->r_mrep == NULL) && (error == 0) &&
 712                ((rep->r_flags & R_SOFTTERM) == 0) &&
 713                ((sotype == SOCK_DGRAM) || ((rep->r_flags & R_MUSTRESEND) == 0)))
 714                 error = msleep((caddr_t)rep, &rep->r_mtx,
 715                                slpflag | (PZERO - 1), "nfsreq", 0);
 716         if (error == EINTR || error == ERESTART) {
 717                 /* NFS operations aren't restartable. Map ERESTART to EINTR */
 718                 mtx_unlock(&rep->r_mtx);
 719                 return (EINTR);
 720         }
 721         if (rep->r_flags & R_SOFTTERM) {
 722                 /* Request was terminated because we exceeded the retries (soft mount) */
 723                 mtx_unlock(&rep->r_mtx);
 724                 return (ETIMEDOUT);
 725         }
 726         mtx_unlock(&rep->r_mtx);
 727         if (sotype == SOCK_STREAM) {
 728                 mtx_lock(&rep->r_nmp->nm_mtx);
 729                 mtx_lock(&rep->r_mtx);
 730                 if (((rep->r_nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) ||
 731                      (rep->r_flags & R_MUSTRESEND))) {
 732                         mtx_unlock(&rep->r_mtx);
 733                         mtx_unlock(&rep->r_nmp->nm_mtx);
 734                         error = nfs_sndlock(rep);
 735                         if (error)
 736                                 return (error);
 737                         goto tryagain;
 738                 } else {
 739                         mtx_unlock(&rep->r_mtx);
 740                         mtx_unlock(&rep->r_nmp->nm_mtx);
 741                 }
 742         }
 743         return (error);
 744 }
 745
 746 /*
 747  * XXX TO DO
 748  * Make nfs_realign() non-blocking. Also make nfsm_dissect() nonblocking.
 749  */
 750 static void
 751 nfs_clnt_match_xid(struct socket *so,
 752                    struct nfsmount *nmp,
 753                    struct mbuf *mrep)
 754 {
 755         struct mbuf *md;
 756         caddr_t dpos;
 757         u_int32_t rxid, *tl;
 758         struct nfsreq *rep;
 759         int error;
 760
 761         /*
 762          * Search for any mbufs that are not a multiple of 4 bytes long
 763          * or with m_data not longword aligned.
 764          * These could cause pointer alignment problems, so copy them to
 765          * well aligned mbufs.
 766          */
 767         if (nfs_realign(&mrep, 5 * NFSX_UNSIGNED) == ENOMEM) {
 768                 m_freem(mrep);
 769                 nfsstats.rpcinvalid++;
 770                 return;
 771         }
 772
 773         /*
 774          * Get the xid and check that it is an rpc reply
 775          */
 776         md = mrep;
 777         dpos = mtod(md, caddr_t);
 778         tl = nfsm_dissect_nonblock(u_int32_t *, 2*NFSX_UNSIGNED);
 779         rxid = *tl++;
 780         if (*tl != rpc_reply) {
 781                 m_freem(mrep);
 782 nfsmout:
 783                 nfsstats.rpcinvalid++;
 784                 return;
 785         }
 786
 787         mtx_lock(&nfs_reqq_mtx);
 788         /*
 789          * Loop through the request list to match up the reply
 790          * Iff no match, just drop the datagram
 791          */
 792         TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
 793                 mtx_lock(&nmp->nm_mtx);
 794                 mtx_lock(&rep->r_mtx);
 795                 if (rep->r_mrep == NULL && rxid == rep->r_xid) {
 796                         /* Found it.. */
 797                         rep->r_mrep = mrep;
 798                         rep->r_md = md;
 799                         rep->r_dpos = dpos;
 800                         /*
 801                          * Update congestion window.
 802                          * Do the additive increase of
 803                          * one rpc/rtt.
 804                          */
 805                         if (nmp->nm_cwnd <= nmp->nm_sent) {
 806                                 nmp->nm_cwnd +=
 807                                         (NFS_CWNDSCALE * NFS_CWNDSCALE +
 808                                          (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
 809                                 if (nmp->nm_cwnd > NFS_MAXCWND)
 810                                         nmp->nm_cwnd = NFS_MAXCWND;
 811                         }
 812                         if (rep->r_flags & R_SENT) {
 813                                 rep->r_flags &= ~R_SENT;
 814                                 nmp->nm_sent -= NFS_CWNDSCALE;
 815                         }
 816                         if (rep->r_flags & R_TIMING)
 817                                 nfs_update_rtt(rep);
 818                         nmp->nm_timeouts = 0;
 819                         wakeup((caddr_t)rep);
 820                         mtx_unlock(&rep->r_mtx);
 821                         mtx_unlock(&nmp->nm_mtx);
 822                         break;
 823                 }
 824                 mtx_unlock(&rep->r_mtx);
 825                 mtx_unlock(&nmp->nm_mtx);
 826         }
 827         /*
 828          * If not matched to a request, drop it.
 829          * If it's mine, wake up requestor.
 830          */
 831         if (rep == 0) {
 832                 nfsstats.rpcunexpected++;
 833                 m_freem(mrep);
 834         }
 835         mtx_unlock(&nfs_reqq_mtx);
 836 }
 837
 838 static void
 839 nfs_mark_for_reconnect(struct nfsmount *nmp)
 840 {
 841         struct nfsreq *rp;
 842
 843         mtx_lock(&nmp->nm_mtx);
 844         nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
 845         mtx_unlock(&nmp->nm_mtx);
 846         /*
 847          * Wakeup all processes that are waiting for replies
 848          * on this mount point. One of them does the reconnect.
 849          */
 850         mtx_lock(&nfs_reqq_mtx);
 851         TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
 852                 if (rp->r_nmp == nmp) {
 853                         mtx_lock(&rp->r_mtx);
 854                         rp->r_flags |= R_MUSTRESEND;
 855                         wakeup((caddr_t)rp);
 856                         mtx_unlock(&rp->r_mtx);
 857                 }
 858         }
 859         mtx_unlock(&nfs_reqq_mtx);
 860 }
 861
 862 static int
 863 nfstcp_readable(struct socket *so, int bytes)
 864 {
 865         int retval;
 866
 867         SOCKBUF_LOCK(&so->so_rcv);
 868         retval = (so->so_rcv.sb_cc >= (bytes) ||
 869                   (so->so_rcv.sb_state & SBS_CANTRCVMORE) ||
 870                   so->so_error);
 871         SOCKBUF_UNLOCK(&so->so_rcv);
 872         return (retval);
 873 }
 874
 875 #define nfstcp_marker_readable(so)      nfstcp_readable(so, sizeof(u_int32_t))
 876
 877 static int
 878 nfs_copy_len(struct mbuf *mp, char *buf, int len)
 879 {
 880         while (len > 0 && mp != NULL) {
 881                 int copylen = min(len, mp->m_len);
 882
 883                 bcopy(mp->m_data, buf, copylen);
 884                 buf += copylen;
 885                 len -= copylen;
 886                 mp = mp->m_next;
 887         }
 888         return (len);
 889 }
 890
 891 static void
 892 nfs_clnt_tcp_soupcall(struct socket *so, void *arg, int waitflag)
 893 {
 894         struct nfsmount *nmp = (struct nfsmount *)arg;
 895         struct mbuf *mp = NULL;
 896         struct uio auio;
 897         int error;
 898         u_int32_t len;
 899         int rcvflg;
 900
 901         /*
 902          * Don't pick any more data from the socket if we've marked the
 903          * mountpoint for reconnect.
 904          */
 905         mtx_lock(&nmp->nm_mtx);
 906         if (nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) {
 907                 mtx_unlock(&nmp->nm_mtx);
 908                 return;
 909         } else
 910                 mtx_unlock(&nmp->nm_mtx);
 911         auio.uio_td = curthread;
 912         auio.uio_segflg = UIO_SYSSPACE;
 913         auio.uio_rw = UIO_READ;
 914         for ( ; ; ) {
 915                 mtx_lock(&nmp->nm_mtx);
 916                 if (nmp->nm_nfstcpstate.flags & NFS_TCP_EXPECT_RPCMARKER) {
 917                         int resid;
 918
 919                         mtx_unlock(&nmp->nm_mtx);
 920                         if (!nfstcp_marker_readable(so)) {
 921                                 /* Marker is not readable */
 922                                 return;
 923                         }
 924                         auio.uio_resid = sizeof(u_int32_t);
 925                         auio.uio_iov = NULL;
 926                         auio.uio_iovcnt = 0;
 927                         mp = NULL;
 928                         rcvflg = (MSG_DONTWAIT | MSG_SOCALLBCK);
 929                         error =  soreceive(so, (struct sockaddr **)0, &auio,
 930                             &mp, (struct mbuf **)0, &rcvflg);
 931                         /*
 932                          * We've already tested that the socket is readable. 2 cases
 933                          * here, we either read 0 bytes (client closed connection),
 934                          * or got some other error. In both cases, we tear down the
 935                          * connection.
 936                          */
 937                         if (error || auio.uio_resid > 0) {
 938                                 if (error && error != ECONNRESET) {
 939                                         log(LOG_ERR,
 940                                             "nfs/tcp clnt: Error %d reading socket, tearing down TCP connection\n",
 941                                             error);
 942                                 }
 943                                 goto mark_reconnect;
 944                         }
 945                         if (mp == NULL)
 946                                 panic("nfs_clnt_tcp_soupcall: Got empty mbuf chain from sorecv\n");
 947                         /*
 948                          * Sigh. We can't do the obvious thing here (which would
 949                          * be to have soreceive copy the length from mbufs for us).
 950                          * Calling uiomove() from the context of a socket callback
 951                          * (even for kernel-kernel copies) leads to LORs (since
 952                          * we hold network locks at this point).
 953                          */
 954                         if ((resid = nfs_copy_len(mp, (char *)&len,
 955                                                   sizeof(u_int32_t)))) {
 956                                 log(LOG_ERR, "%s (%d) from nfs server %s\n",
 957                                     "Bad RPC HDR length",
 958                                     (int)(sizeof(u_int32_t) - resid),
 959                                     nmp->nm_mountp->mnt_stat.f_mntfromname);
 960                                 goto mark_reconnect;
 961                         }
 962                         len = ntohl(len) & ~0x80000000;
 963                         m_freem(mp);
 964                         /*
 965                          * This is SERIOUS! We are out of sync with the sender
 966                          * and forcing a disconnect/reconnect is all I can do.
 967                          */
 968                         if (len > NFS_MAXPACKET || len == 0) {
 969                                 log(LOG_ERR, "%s (%d) from nfs server %s\n",
 970                                     "impossible packet length",
 971                                     len,
 972                                     nmp->nm_mountp->mnt_stat.f_mntfromname);
 973                                 goto mark_reconnect;
 974                         }
 975                         mtx_lock(&nmp->nm_mtx);
 976                         nmp->nm_nfstcpstate.rpcresid = len;
 977                         nmp->nm_nfstcpstate.flags &= ~(NFS_TCP_EXPECT_RPCMARKER);
 978                         mtx_unlock(&nmp->nm_mtx);
 979                 } else
 980                         mtx_unlock(&nmp->nm_mtx);
 981
 982                 /*
 983                  * Processed RPC marker or no RPC marker to process.
 984                  * Pull in and process data.
 985                  */
 986                 mtx_lock(&nmp->nm_mtx);
 987                 if (nmp->nm_nfstcpstate.rpcresid > 0) {
 988                         mtx_unlock(&nmp->nm_mtx);
 989                         if (!nfstcp_readable(so, nmp->nm_nfstcpstate.rpcresid)) {
 990                                 /* All data not readable */
 991                                 return;
 992                         }
 993                         auio.uio_resid = nmp->nm_nfstcpstate.rpcresid;
 994                         auio.uio_iov = NULL;
 995                         auio.uio_iovcnt = 0;
 996                         mp = NULL;
 997                         rcvflg = (MSG_DONTWAIT | MSG_SOCALLBCK);
 998                         error =  soreceive(so, (struct sockaddr **)0, &auio,
 999                             &mp, (struct mbuf **)0, &rcvflg);
1000                         if (error || auio.uio_resid > 0) {
1001                                 if (error && error != ECONNRESET) {
1002                                         log(LOG_ERR,
1003                                             "nfs/tcp clnt: Error %d reading socket, tearing down TCP connection\n",
1004                                             error);
1005                                 }
1006                                 goto mark_reconnect;
1007                         }
1008                         if (mp == NULL)
1009                                 panic("nfs_clnt_tcp_soupcall: Got empty mbuf chain from sorecv\n");
1010                         mtx_lock(&nmp->nm_mtx);
1011                         nmp->nm_nfstcpstate.rpcresid = 0;
1012                         nmp->nm_nfstcpstate.flags |= NFS_TCP_EXPECT_RPCMARKER;
1013                         mtx_unlock(&nmp->nm_mtx);
1014                         /* We got the entire RPC reply. Match XIDs and wake up requestor */
1015                         nfs_clnt_match_xid(so, nmp, mp);
1016                 } else
1017                         mtx_unlock(&nmp->nm_mtx);
1018         }
1019
1020 mark_reconnect:
1021         nfs_mark_for_reconnect(nmp);
1022 }
1023
1024 static void
1025 nfs_clnt_udp_soupcall(struct socket *so, void *arg, int waitflag)
1026 {
1027         struct nfsmount *nmp = (struct nfsmount *)arg;
1028         struct uio auio;
1029         struct mbuf *mp = NULL;
1030         struct mbuf *control = NULL;
1031         int error, rcvflag;
1032
1033         auio.uio_resid = 1000000;
1034         auio.uio_td = curthread;
1035         rcvflag = MSG_DONTWAIT;
1036         auio.uio_resid = 1000000000;
1037         do {
1038                 mp = control = NULL;
1039                 error = soreceive(so, NULL, &auio, &mp, &control, &rcvflag);
1040                 if (control)
1041                         m_freem(control);
1042                 if (mp)
1043                         nfs_clnt_match_xid(so, nmp, mp);
1044         } while (mp && !error);
1045 }
1046
1047 /*
1048  * nfs_request - goes something like this
1049  *      - fill in request struct
1050  *      - links it into list
1051  *      - calls nfs_send() for first transmit
1052  *      - calls nfs_receive() to get reply
1053  *      - break down rpc header and return with nfs reply pointed to
1054  *        by mrep or error
1055  * nb: always frees up mreq mbuf list
1056  */
1057 int
1058 nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum,
1059     struct thread *td, struct ucred *cred, struct mbuf **mrp,
1060     struct mbuf **mdp, caddr_t *dposp)
1061 {
1062         struct mbuf *mrep, *m2;
1063         struct nfsreq *rep;
1064         u_int32_t *tl;
1065         int i;
1066         struct nfsmount *nmp;
1067         struct mbuf *m, *md, *mheadend;
1068         time_t waituntil;
1069         caddr_t dpos;
1070         int error = 0, mrest_len, auth_len, auth_type;
1071         struct timeval now;
1072         u_int32_t *xidp;
1073
1074         /* Reject requests while attempting a forced unmount. */
1075         if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
1076                 m_freem(mrest);
1077                 return (ESTALE);
1078         }
1079         nmp = VFSTONFS(vp->v_mount);
1080         if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
1081                 return nfs4_request(vp, mrest, procnum, td, cred, mrp, mdp, dposp);
1082         MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
1083         bzero(rep, sizeof(struct nfsreq));
1084         rep->r_nmp = nmp;
1085         rep->r_vp = vp;
1086         rep->r_td = td;
1087         rep->r_procnum = procnum;
1088         mtx_init(&rep->r_mtx, "NFSrep lock", NULL, MTX_DEF);
1089
1090         getmicrouptime(&now);
1091         rep->r_lastmsg = now.tv_sec -
1092             ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
1093         mrest_len = m_length(mrest, NULL);
1094
1095         /*
1096          * Get the RPC header with authorization.
1097          */
1098         auth_type = RPCAUTH_UNIX;
1099         if (cred->cr_ngroups < 1)
1100                 panic("nfsreq nogrps");
1101         auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
1102                 nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
1103                 5 * NFSX_UNSIGNED;
1104         m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
1105              mrest, mrest_len, &mheadend, &xidp);
1106
1107         /*
1108          * For stream protocols, insert a Sun RPC Record Mark.
1109          */
1110         if (nmp->nm_sotype == SOCK_STREAM) {
1111                 M_PREPEND(m, NFSX_UNSIGNED, M_TRYWAIT);
1112                 *mtod(m, u_int32_t *) = htonl(0x80000000 |
1113                          (m->m_pkthdr.len - NFSX_UNSIGNED));
1114         }
1115         rep->r_mreq = m;
1116         rep->r_xid = *xidp;
1117 tryagain:
1118         if (nmp->nm_flag & NFSMNT_SOFT)
1119                 rep->r_retry = nmp->nm_retry;
1120         else
1121                 rep->r_retry = NFS_MAXREXMIT + 1;       /* past clip limit */
1122         rep->r_rtt = rep->r_rexmit = 0;
1123         if (nfs_rto_timer(procnum) != NFS_DEFAULT_TIMER)
1124                 rep->r_flags = R_TIMING;
1125         else
1126                 rep->r_flags = 0;
1127         rep->r_mrep = NULL;
1128
1129         /*
1130          * Do the client side RPC.
1131          */
1132         nfsstats.rpcrequests++;
1133         /*
1134          * Chain request into list of outstanding requests. Be sure
1135          * to put it LAST so timer finds oldest requests first.
1136          */
1137         mtx_lock(&nfs_reqq_mtx);
1138         if (TAILQ_EMPTY(&nfs_reqq))
1139                 callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL);
1140         TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
1141         mtx_unlock(&nfs_reqq_mtx);
1142
1143         /*
1144          * If backing off another request or avoiding congestion, don't
1145          * send this one now but let timer do it. If not timing a request,
1146          * do it now.
1147          */
1148         mtx_lock(&nmp->nm_mtx);
1149         if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
1150                 (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1151                 nmp->nm_sent < nmp->nm_cwnd)) {
1152                 mtx_unlock(&nmp->nm_mtx);
1153                 error = nfs_sndlock(rep);
1154                 if (!error) {
1155                         m2 = m_copym(m, 0, M_COPYALL, M_TRYWAIT);
1156                         error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
1157                         nfs_sndunlock(rep);
1158                 }
1159                 mtx_lock(&nfs_reqq_mtx);
1160                 /*
1161                  * nfs_timer() could've re-transmitted the request if we ended up
1162                  * blocking on nfs_send() too long, so check for R_SENT here.
1163                  */
1164                 if (!error && (rep->r_flags & (R_SENT | R_MUSTRESEND)) == 0) {
1165                         mtx_lock(&nmp->nm_mtx);
1166                         nmp->nm_sent += NFS_CWNDSCALE;
1167                         mtx_unlock(&nmp->nm_mtx);
1168                         rep->r_flags |= R_SENT;
1169                 }
1170                 mtx_unlock(&nfs_reqq_mtx);
1171         } else {
1172                 mtx_unlock(&nmp->nm_mtx);
1173                 rep->r_rtt = -1;
1174         }
1175
1176         /*
1177          * Wait for the reply from our send or the timer's.
1178          */
1179         if (!error || error == EPIPE)
1180                 error = nfs_reply(rep);
1181
1182         /*
1183          * RPC done, unlink the request.
1184          */
1185         mtx_lock(&nfs_reqq_mtx);
1186         /*
1187          * nfs_timer() may be in the process of re-transmitting this request.
1188          * nfs_timer() drops the nfs_reqq_mtx before the pru_send() (to avoid LORs).
1189          * Wait till nfs_timer() completes the re-transmission. When the reply
1190          * comes back, it will be discarded (since the req struct for it no longer
1191          * exists).
1192          */
1193         while (rep->r_flags & R_PIN_REQ) {
1194                 msleep((caddr_t)&rep->r_flags, &nfs_reqq_mtx,
1195                        (PZERO - 1), "nfsrxmt", 0);
1196         }
1197         TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1198         if (TAILQ_EMPTY(&nfs_reqq))
1199                 callout_stop(&nfs_callout);
1200         /*
1201          * Decrement the outstanding request count.
1202          */
1203         if (rep->r_flags & R_SENT) {
1204                 rep->r_flags &= ~R_SENT;        /* paranoia */
1205                 mtx_lock(&nmp->nm_mtx);
1206                 nmp->nm_sent -= NFS_CWNDSCALE;
1207                 mtx_unlock(&nmp->nm_mtx);
1208         }
1209         mtx_unlock(&nfs_reqq_mtx);
1210
1211         /*
1212          * If there was a successful reply and a tprintf msg.
1213          * tprintf a response.
1214          */
1215         if (!error) {
1216                 nfs_up(rep, nmp, rep->r_td, "is alive again", NFSSTA_TIMEO);
1217         }
1218         mrep = rep->r_mrep;
1219         md = rep->r_md;
1220         dpos = rep->r_dpos;
1221         if (error) {
1222                 /*
1223                  * If we got interrupted by a signal in nfs_reply(), there's
1224                  * a very small window where the reply could've come in before
1225                  * this process got scheduled in. To handle that case, we need
1226                  * to free the reply if it was delivered.
1227                  */
1228                 if (rep->r_mrep != NULL)
1229                         m_freem(rep->r_mrep);
1230                 m_freem(rep->r_mreq);
1231                 mtx_destroy(&rep->r_mtx);
1232                 free((caddr_t)rep, M_NFSREQ);
1233                 return (error);
1234         }
1235
1236         if (rep->r_mrep == NULL)
1237                 panic("nfs_request: rep->r_mrep shouldn't be NULL if no error\n");
1238
1239         /*
1240          * break down the rpc header and check if ok
1241          */
1242         tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
1243         if (*tl++ == rpc_msgdenied) {
1244                 if (*tl == rpc_mismatch)
1245                         error = EOPNOTSUPP;
1246                 else
1247                         error = EACCES;
1248                 m_freem(mrep);
1249                 m_freem(rep->r_mreq);
1250                 mtx_destroy(&rep->r_mtx);
1251                 free((caddr_t)rep, M_NFSREQ);
1252                 return (error);
1253         }
1254
1255         /*
1256          * Just throw away any verifyer (ie: kerberos etc).
1257          */
1258         i = fxdr_unsigned(int, *tl++);          /* verf type */
1259         i = fxdr_unsigned(int32_t, *tl);        /* len */
1260         if (i > 0)
1261                 nfsm_adv(nfsm_rndup(i));
1262         tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
1263         /* 0 == ok */
1264         if (*tl == 0) {
1265                 tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
1266                 if (*tl != 0) {
1267                         error = fxdr_unsigned(int, *tl);
1268                         if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1269                                 error == NFSERR_TRYLATER) {
1270                                 m_freem(mrep);
1271                                 error = 0;
1272                                 waituntil = time_second + nfs3_jukebox_delay;
1273                                 while (time_second < waituntil) {
1274                                         (void) tsleep(&lbolt, PSOCK, "nqnfstry", 0);
1275                                 }
1276                                 mtx_lock(&nfs_reqq_mtx);
1277                                 if (++nfs_xid == 0)
1278                                         nfs_xid++;
1279                                 rep->r_xid = *xidp = txdr_unsigned(nfs_xid);
1280                                 mtx_unlock(&nfs_reqq_mtx);
1281                                 goto tryagain;
1282                         }
1283
1284                         /*
1285                          * If the File Handle was stale, invalidate the
1286                          * lookup cache, just in case.
1287                          */
1288                         if (error == ESTALE)
1289                                 cache_purge(vp);
1290                         /*
1291                          * Skip wcc data on NFS errors for now. NetApp filers return corrupt
1292                          * postop attrs in the wcc data for NFS err EROFS. Not sure if they
1293                          * could return corrupt postop attrs for others errors.
1294                          */
1295                         if ((nmp->nm_flag & NFSMNT_NFSV3) && !nfs_skip_wcc_data_onerr) {
1296                                 *mrp = mrep;
1297                                 *mdp = md;
1298                                 *dposp = dpos;
1299                                 error |= NFSERR_RETERR;
1300                         } else
1301                                 m_freem(mrep);
1302                         m_freem(rep->r_mreq);
1303                         mtx_destroy(&rep->r_mtx);
1304                         free((caddr_t)rep, M_NFSREQ);
1305                         return (error);
1306                 }
1307
1308                 *mrp = mrep;
1309                 *mdp = md;
1310                 *dposp = dpos;
1311                 m_freem(rep->r_mreq);
1312                 mtx_destroy(&rep->r_mtx);
1313                 FREE((caddr_t)rep, M_NFSREQ);
1314                 return (0);
1315         }
1316         m_freem(mrep);
1317         error = EPROTONOSUPPORT;
1318 nfsmout:
1319         m_freem(rep->r_mreq);
1320         mtx_destroy(&rep->r_mtx);
1321         free((caddr_t)rep, M_NFSREQ);
1322         return (error);
1323 }
1324
1325 /*
1326  * Nfs timer routine
1327  * Scan the nfsreq list and retranmit any requests that have timed out
1328  * To avoid retransmission attempts on STREAM sockets (in the future) make
1329  * sure to set the r_retry field to 0 (implies nm_retry == 0).
1330  *
1331  * The nfs reqq lock cannot be held while we do the pru_send() because of a
1332  * lock ordering violation. The NFS client socket callback acquires
1333  * inp_lock->nfsreq mutex and pru_send acquires inp_lock. So we drop the
1334  * reqq mutex (and reacquire it after the pru_send()). The req structure
1335  * (for the rexmit) is prevented from being removed by the R_PIN_REQ flag.
1336  */
1337 void
1338 nfs_timer(void *arg)
1339 {
1340         struct nfsreq *rep;
1341         struct mbuf *m;
1342         struct socket *so;
1343         struct nfsmount *nmp;
1344         int timeo;
1345         int error;
1346         struct timeval now;
1347
1348         getmicrouptime(&now);
1349         mtx_lock(&nfs_reqq_mtx);
1350         TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
1351                 nmp = rep->r_nmp;
1352                 mtx_lock(&rep->r_mtx);
1353                 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
1354                         mtx_unlock(&rep->r_mtx);
1355                         continue;
1356                 } else {
1357                         /*
1358                          * Terminate request if force-unmount in progress.
1359                          * Note that NFS could have vfs_busy'ed the mount,
1360                          * causing the unmount to wait for the mnt_lock, making
1361                          * this bit of logic necessary.
1362                          */
1363                         if (rep->r_nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF) {
1364                                 nfs_softterm(rep);
1365                                 mtx_unlock(&rep->r_mtx);
1366                                 continue;
1367                         }
1368                         mtx_unlock(&rep->r_mtx);
1369                 }
1370                 if (nfs_sigintr(nmp, rep, rep->r_td))
1371                         continue;
1372                 mtx_lock(&nmp->nm_mtx);
1373                 mtx_lock(&rep->r_mtx);
1374                 if (nmp->nm_tprintf_initial_delay != 0 &&
1375                     (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
1376                     rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
1377                         rep->r_lastmsg = now.tv_sec;
1378                         /*
1379                          * Pin down the request and drop locks for the acquisition
1380                          * of Giant from tprintf() in nfs_down().
1381                          */
1382                         rep->r_flags |= R_PIN_REQ;
1383                         mtx_unlock(&rep->r_mtx);
1384                         mtx_unlock(&nmp->nm_mtx);
1385                         mtx_unlock(&nfs_reqq_mtx);
1386                         nfs_down(rep, nmp, rep->r_td, "not responding",
1387                                  0, NFSSTA_TIMEO);
1388                         mtx_lock(&nfs_reqq_mtx);
1389                         mtx_lock(&nmp->nm_mtx);
1390                         mtx_lock(&rep->r_mtx);
1391                         rep->r_flags &= ~R_PIN_REQ;
1392                         wakeup((caddr_t)&rep->r_flags);
1393                 }
1394                 if (rep->r_rtt >= 0) {
1395                         rep->r_rtt++;
1396                         if (nmp->nm_flag & NFSMNT_DUMBTIMR)
1397                                 timeo = nmp->nm_timeo;
1398                         else
1399                                 timeo = nfs_estimate_rto(nmp, rep->r_procnum);
1400                         if (nmp->nm_timeouts > 0)
1401                                 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1402                         if (rep->r_rtt <= timeo) {
1403                                 mtx_unlock(&rep->r_mtx);
1404                                 mtx_unlock(&nmp->nm_mtx);
1405                                 continue;
1406                         }
1407                         if (nmp->nm_timeouts < NFS_NBACKOFF)
1408                                 nmp->nm_timeouts++;
1409                 }
1410                 if (rep->r_rexmit >= rep->r_retry) {    /* too many */
1411                         nfsstats.rpctimeouts++;
1412                         nfs_softterm(rep);
1413                         mtx_unlock(&rep->r_mtx);
1414                         mtx_unlock(&nmp->nm_mtx);
1415                         continue;
1416                 }
1417                 if (nmp->nm_sotype != SOCK_DGRAM) {
1418                         if (++rep->r_rexmit > NFS_MAXREXMIT)
1419                                 rep->r_rexmit = NFS_MAXREXMIT;
1420                         /*
1421                          * For NFS/TCP, setting R_MUSTRESEND and waking up
1422                          * the requester will cause the request to be
1423                          * retransmitted (in nfs_reply()), re-connecting
1424                          * if necessary.
1425                          */
1426                         rep->r_flags |= R_MUSTRESEND;
1427                         wakeup((caddr_t)rep);
1428                         rep->r_rtt = 0;
1429                         mtx_unlock(&rep->r_mtx);
1430                         mtx_unlock(&nmp->nm_mtx);
1431                         continue;
1432                 }
1433                 if ((so = nmp->nm_so) == NULL) {
1434                         mtx_unlock(&rep->r_mtx);
1435                         mtx_unlock(&nmp->nm_mtx);
1436                         continue;
1437                 }
1438                 /*
1439                  * If there is enough space and the window allows..
1440                  *      Resend it
1441                  * Set r_rtt to -1 in case we fail to send it now.
1442                  */
1443                 rep->r_rtt = -1;
1444                 if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
1445                     ((nmp->nm_flag & NFSMNT_DUMBTIMR) || (rep->r_flags & R_SENT) ||
1446                      nmp->nm_sent < nmp->nm_cwnd)) {
1447                         mtx_unlock(&rep->r_mtx);
1448                         mtx_unlock(&nmp->nm_mtx);
1449                         if ((m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))) {
1450                                 /*
1451                                  * Mark the request to indicate that a XMIT is in
1452                                  * progress to prevent the req structure being
1453                                  * removed in nfs_request().
1454                                  */
1455                                 mtx_lock(&rep->r_mtx);
1456                                 rep->r_flags |= R_PIN_REQ;
1457                                 mtx_unlock(&rep->r_mtx);
1458                                 mtx_unlock(&nfs_reqq_mtx);
1459                                 if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
1460                                         error = (*so->so_proto->pr_usrreqs->pru_send)
1461                                                 (so, 0, m, NULL, NULL, curthread);
1462                                 else
1463                                         error = (*so->so_proto->pr_usrreqs->pru_send)
1464                                                 (so, 0, m, nmp->nm_nam, NULL,
1465                                                  curthread);
1466                                 mtx_lock(&nfs_reqq_mtx);
1467                                 mtx_lock(&nmp->nm_mtx);
1468                                 mtx_lock(&rep->r_mtx);
1469                                 rep->r_flags &= ~R_PIN_REQ;
1470                                 wakeup((caddr_t)&rep->r_flags);
1471                                 if (error) {
1472                                         if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
1473                                                 so->so_error = 0;
1474                                         rep->r_flags |= R_RESENDERR;
1475                                 } else {
1476                                         /*
1477                                          * Iff first send, start timing
1478                                          * else turn timing off, backoff timer
1479                                          * and divide congestion window by 2.
1480                                          */
1481                                         rep->r_flags &= ~R_RESENDERR;
1482                                         if (rep->r_flags & R_SENT) {
1483                                                 rep->r_flags &= ~R_TIMING;
1484                                                 if (++rep->r_rexmit > NFS_MAXREXMIT)
1485                                                         rep->r_rexmit = NFS_MAXREXMIT;
1486                                                 nmp->nm_cwnd >>= 1;
1487                                                 if (nmp->nm_cwnd < NFS_CWNDSCALE)
1488                                                         nmp->nm_cwnd = NFS_CWNDSCALE;
1489                                                 nfsstats.rpcretries++;
1490                                         } else {
1491                                                 rep->r_flags |= R_SENT;
1492                                                 nmp->nm_sent += NFS_CWNDSCALE;
1493                                         }
1494                                         rep->r_rtt = 0;
1495                                 }
1496                                 mtx_unlock(&rep->r_mtx);
1497                                 mtx_unlock(&nmp->nm_mtx);
1498                         }
1499                 } else {
1500                         mtx_unlock(&rep->r_mtx);
1501                         mtx_unlock(&nmp->nm_mtx);
1502                 }
1503         }
1504         mtx_unlock(&nfs_reqq_mtx);
1505         callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL);
1506 }
1507
1508 /*
1509  * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
1510  * wait for all requests to complete. This is used by forced unmounts
1511  * to terminate any outstanding RPCs.
1512  */
1513 int
1514 nfs_nmcancelreqs(nmp)
1515         struct nfsmount *nmp;
1516 {
1517         struct nfsreq *req;
1518         int i;
1519
1520         mtx_lock(&nfs_reqq_mtx);
1521         TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
1522                 mtx_lock(&req->r_mtx);
1523                 if (nmp != req->r_nmp || req->r_mrep != NULL ||
1524                     (req->r_flags & R_SOFTTERM)) {
1525                         mtx_unlock(&req->r_mtx);
1526                         continue;
1527                 }
1528                 nfs_softterm(req);
1529                 mtx_unlock(&req->r_mtx);
1530         }
1531         mtx_unlock(&nfs_reqq_mtx);
1532
1533         for (i = 0; i < 30; i++) {
1534                 mtx_lock(&nfs_reqq_mtx);
1535                 TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
1536                         if (nmp == req->r_nmp)
1537                                 break;
1538                 }
1539                 mtx_unlock(&nfs_reqq_mtx);
1540                 if (req == NULL)
1541                         return (0);
1542                 tsleep(&lbolt, PSOCK, "nfscancel", 0);
1543         }
1544         return (EBUSY);
1545 }
1546
1547 /*
1548  * Flag a request as being about to terminate (due to NFSMNT_INT/NFSMNT_SOFT).
1549  * The nm_send count is decremented now to avoid deadlocks when the process in
1550  * soreceive() hasn't yet managed to send its own request.
1551  */
1552
1553 static void
1554 nfs_softterm(struct nfsreq *rep)
1555 {
1556         KASSERT(mtx_owned(&rep->r_mtx), ("NFS req lock not owned !"));
1557         rep->r_flags |= R_SOFTTERM;
1558         if (rep->r_flags & R_SENT) {
1559                 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1560                 rep->r_flags &= ~R_SENT;
1561         }
1562         /*
1563          * Request terminated, wakeup the blocked process, so that we
1564          * can return EINTR back.
1565          */
1566         wakeup((caddr_t)rep);
1567 }
1568
/*
 * Signals that are allowed to interrupt an NFS operation on an intr mount.
 * Any further signal that should be able to do so must be added here.
 * SIGSTOP and SIGKILL cannot be masked.
 */
int nfs_sig_set[] = {
        SIGINT, SIGTERM, SIGHUP,
        SIGKILL, SIGSTOP,
        SIGQUIT,
};
1581
1582 /*
1583  * Check to see if one of the signals in our subset is pending on
1584  * the process (in an intr mount).
1585  */
1586 static int
1587 nfs_sig_pending(sigset_t set)
1588 {
1589         int i;
1590
1591         for (i = 0 ; i < sizeof(nfs_sig_set)/sizeof(int) ; i++)
1592                 if (SIGISMEMBER(set, nfs_sig_set[i]))
1593                         return (1);
1594         return (0);
1595 }
1596
1597 /*
1598  * The set/restore sigmask functions are used to (temporarily) overwrite
1599  * the process p_sigmask during an RPC call (for example). These are also
1600  * used in other places in the NFS client that might tsleep().
1601  */
1602 void
1603 nfs_set_sigmask(struct thread *td, sigset_t *oldset)
1604 {
1605         sigset_t newset;
1606         int i;
1607         struct proc *p;
1608
1609         SIGFILLSET(newset);
1610         if (td == NULL)
1611                 td = curthread; /* XXX */
1612         p = td->td_proc;
1613         /* Remove the NFS set of signals from newset */
1614         PROC_LOCK(p);
1615         mtx_lock(&p->p_sigacts->ps_mtx);
1616         for (i = 0 ; i < sizeof(nfs_sig_set)/sizeof(int) ; i++) {
1617                 /*
1618                  * But make sure we leave the ones already masked
1619                  * by the process, ie. remove the signal from the
1620                  * temporary signalmask only if it wasn't already
1621                  * in p_sigmask.
1622                  */
1623                 if (!SIGISMEMBER(td->td_sigmask, nfs_sig_set[i]) &&
1624                     !SIGISMEMBER(p->p_sigacts->ps_sigignore, nfs_sig_set[i]))
1625                         SIGDELSET(newset, nfs_sig_set[i]);
1626         }
1627         mtx_unlock(&p->p_sigacts->ps_mtx);
1628         PROC_UNLOCK(p);
1629         kern_sigprocmask(td, SIG_SETMASK, &newset, oldset, 0);
1630 }
1631
1632 void
1633 nfs_restore_sigmask(struct thread *td, sigset_t *set)
1634 {
1635         if (td == NULL)
1636                 td = curthread; /* XXX */
1637         kern_sigprocmask(td, SIG_SETMASK, set, NULL, 0);
1638 }
1639
1640 /*
1641  * NFS wrapper to msleep(), that shoves a new p_sigmask and restores the
1642  * old one after msleep() returns.
1643  */
1644 int
1645 nfs_msleep(struct thread *td, void *ident, struct mtx *mtx, int priority, char *wmesg, int timo)
1646 {
1647         sigset_t oldset;
1648         int error;
1649         struct proc *p;
1650
1651         if ((priority & PCATCH) == 0)
1652                 return msleep(ident, mtx, priority, wmesg, timo);
1653         if (td == NULL)
1654                 td = curthread; /* XXX */
1655         nfs_set_sigmask(td, &oldset);
1656         error = msleep(ident, mtx, priority, wmesg, timo);
1657         nfs_restore_sigmask(td, &oldset);
1658         p = td->td_proc;
1659         return (error);
1660 }
1661
1662 /*
1663  * Test for a termination condition pending on the process.
1664  * This is used for NFSMNT_INT mounts.
1665  */
1666 int
1667 nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
1668 {
1669         struct proc *p;
1670         sigset_t tmpset;
1671
1672         if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
1673                 return nfs4_sigintr(nmp, rep, td);
1674         if (rep) {
1675                 mtx_lock(&rep->r_mtx);
1676                 if (rep->r_flags & R_SOFTTERM) {
1677                         mtx_unlock(&rep->r_mtx);
1678                         return (EIO);
1679                 } else
1680                         mtx_unlock(&rep->r_mtx);
1681         }
1682         /* Terminate all requests while attempting a forced unmount. */
1683         if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
1684                 return (EIO);
1685         if (!(nmp->nm_flag & NFSMNT_INT))
1686                 return (0);
1687         if (td == NULL)
1688                 return (0);
1689         p = td->td_proc;
1690         PROC_LOCK(p);
1691         tmpset = p->p_siglist;
1692         SIGSETOR(tmpset, td->td_siglist);
1693         SIGSETNAND(tmpset, td->td_sigmask);
1694         mtx_lock(&p->p_sigacts->ps_mtx);
1695         SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore);
1696         mtx_unlock(&p->p_sigacts->ps_mtx);
1697         if ((SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist))
1698             && nfs_sig_pending(tmpset)) {
1699                 PROC_UNLOCK(p);
1700                 return (EINTR);
1701         }
1702         PROC_UNLOCK(p);
1703         return (0);
1704 }
1705
1706 /*
1707  * Lock a socket against others.
1708  * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1709  * and also to avoid race conditions between the processes with nfs requests
1710  * in progress when a reconnect is necessary.
1711  */
1712 int
1713 nfs_sndlock(struct nfsreq *rep)
1714 {
1715         int *statep = &rep->r_nmp->nm_state;
1716         struct thread *td;
1717         int error, slpflag = 0, slptimeo = 0;
1718
1719         td = rep->r_td;
1720         mtx_lock(&rep->r_nmp->nm_mtx);
1721         if (rep->r_nmp->nm_flag & NFSMNT_INT)
1722                 slpflag = PCATCH;
1723         while (*statep & NFSSTA_SNDLOCK) {
1724                 error = nfs_sigintr(rep->r_nmp, rep, td);
1725                 if (error) {
1726                         mtx_unlock(&rep->r_nmp->nm_mtx);
1727                         return (error);
1728                 }
1729                 *statep |= NFSSTA_WANTSND;
1730                 (void) msleep(statep, &rep->r_nmp->nm_mtx,
1731                               slpflag | (PZERO - 1), "nfsndlck", slptimeo);
1732                 if (slpflag == PCATCH) {
1733                         slpflag = 0;
1734                         slptimeo = 2 * hz;
1735                 }
1736         }
1737         *statep |= NFSSTA_SNDLOCK;
1738         mtx_unlock(&rep->r_nmp->nm_mtx);
1739         return (0);
1740 }
1741
1742 /*
1743  * Unlock the stream socket for others.
1744  */
1745 void
1746 nfs_sndunlock(struct nfsreq *rep)
1747 {
1748         int *statep = &rep->r_nmp->nm_state;
1749
1750         mtx_lock(&rep->r_nmp->nm_mtx);
1751         if ((*statep & NFSSTA_SNDLOCK) == 0)
1752                 panic("nfs sndunlock");
1753         *statep &= ~NFSSTA_SNDLOCK;
1754         if (*statep & NFSSTA_WANTSND) {
1755                 *statep &= ~NFSSTA_WANTSND;
1756                 wakeup(statep);
1757         }
1758         mtx_unlock(&rep->r_nmp->nm_mtx);
1759 }
1760
1761 /*
1762  *      nfs_realign:
1763  *
1764  *      Check for badly aligned mbuf data and realign by copying the unaligned
1765  *      portion of the data into a new mbuf chain and freeing the portions
1766  *      of the old chain that were replaced.
1767  *
1768  *      We cannot simply realign the data within the existing mbuf chain
1769  *      because the underlying buffers may contain other rpc commands and
1770  *      we cannot afford to overwrite them.
1771  *
1772  *      We would prefer to avoid this situation entirely.  The situation does
1773  *      not occur with NFS/UDP and is supposed to only occassionally occur
1774  *      with TCP.  Use vfs.nfs.realign_count and realign_test to check this.
1775  *
1776  */
1777 static int
1778 nfs_realign(struct mbuf **pm, int hsiz)
1779 {
1780         struct mbuf *m;
1781         struct mbuf *n = NULL;
1782         int off = 0;
1783
1784         ++nfs_realign_test;
1785         while ((m = *pm) != NULL) {
1786                 if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
1787                         MGET(n, M_DONTWAIT, MT_DATA);
1788                         if (n == NULL)
1789                                 return (ENOMEM);
1790                         if (m->m_len >= MINCLSIZE) {
1791                                 MCLGET(n, M_DONTWAIT);
1792                                 if (n->m_ext.ext_buf == NULL) {
1793                                         m_freem(n);
1794                                         return (ENOMEM);
1795                                 }
1796                         }
1797                         n->m_len = 0;
1798                         break;
1799                 }
1800                 pm = &m->m_next;
1801         }
1802         /*
1803          * If n is non-NULL, loop on m copying data, then replace the
1804          * portion of the chain that had to be realigned.
1805          */
1806         if (n != NULL) {
1807                 ++nfs_realign_count;
1808                 while (m) {
1809                         m_copyback(n, off, m->m_len, mtod(m, caddr_t));
1810                         off += m->m_len;
1811                         m = m->m_next;
1812                 }
1813                 m_freem(*pm);
1814                 *pm = n;
1815         }
1816         return (0);
1817 }
1818
1819
1820 static int
1821 nfs_msg(struct thread *td, const char *server, const char *msg, int error)
1822 {
1823         struct proc *p;
1824
1825         p = td ? td->td_proc : NULL;
1826         if (error) {
1827                 tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n", server,
1828                     msg, error);
1829         } else {
1830                 tprintf(p, LOG_INFO, "nfs server %s: %s\n", server, msg);
1831         }
1832         return (0);
1833 }
1834
1835 void
1836 nfs_down(rep, nmp, td, msg, error, flags)
1837         struct nfsreq *rep;
1838         struct nfsmount *nmp;
1839         struct thread *td;
1840         const char *msg;
1841         int error, flags;
1842 {
1843         if (nmp == NULL)
1844                 return;
1845         mtx_lock(&nmp->nm_mtx);
1846         if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
1847                 nmp->nm_state |= NFSSTA_TIMEO;
1848                 mtx_unlock(&nmp->nm_mtx);
1849                 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1850                     VQ_NOTRESP, 0);
1851         } else
1852                 mtx_unlock(&nmp->nm_mtx);
1853 #ifdef NFSSTA_LOCKTIMEO
1854         mtx_lock(&nmp->nm_mtx);
1855         if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
1856                 nmp->nm_state |= NFSSTA_LOCKTIMEO;
1857                 mtx_unlock(&nmp->nm_mtx);
1858                 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1859                     VQ_NOTRESPLOCK, 0);
1860         } else
1861                 mtx_unlock(&nmp->nm_mtx);
1862 #endif
1863         if (rep != NULL) {
1864                 mtx_lock(&rep->r_mtx);
1865                 rep->r_flags |= R_TPRINTFMSG;
1866                 mtx_unlock(&rep->r_mtx);
1867         }
1868         nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
1869 }
1870
1871 void
1872 nfs_up(rep, nmp, td, msg, flags)
1873         struct nfsreq *rep;
1874         struct nfsmount *nmp;
1875         struct thread *td;
1876         const char *msg;
1877         int flags;
1878 {
1879         if (nmp == NULL || rep == NULL)
1880                 return;
1881         mtx_lock(&rep->r_mtx);
1882         if ((rep->r_flags & R_TPRINTFMSG) != 0) {
1883                 mtx_unlock(&rep->r_mtx);
1884                 nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
1885         } else
1886                 mtx_unlock(&rep->r_mtx);
1887
1888         mtx_lock(&nmp->nm_mtx);
1889         if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
1890                 nmp->nm_state &= ~NFSSTA_TIMEO;
1891                 mtx_unlock(&nmp->nm_mtx);
1892                 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1893                     VQ_NOTRESP, 1);
1894         } else
1895                 mtx_unlock(&nmp->nm_mtx);
1896
1897 #ifdef NFSSTA_LOCKTIMEO
1898         mtx_lock(&nmp->nm_mtx);
1899         if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
1900                 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
1901                 mtx_unlock(&nmp->nm_mtx);
1902                 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1903                     VQ_NOTRESPLOCK, 1);
1904         } else
1905                 mtx_unlock(&nmp->nm_mtx);
1906 #endif
1907 }