/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *   entries with the same key)
 *	When a request arrives:
 *		For all that match key
 *		- if RPC# != OR request_size !=
 *			- not a match with this one
 *		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 *			- not a match with this one
 *			(V2,3 clients might retry on same TCP socket)
 *		- calculate checksum on first N bytes of NFS XDR
 *		- if checksum !=
 *			- not a match for this one
 *		If any of the remaining ones that match has a
 *			seqid_refcnt > 0
 *			- not a match (go do RPC, using new cache entry)
 *		If one match left
 *			- a hit (reply from cache)
 *		else
 *			- miss (go do RPC, using new cache entry)
 *
 *	During processing of NFSv4 request:
 *		- set a flag when a non-idempotent Op is processed
 *		- when an Op that uses a seqid# (Open,...) is processed
 *			- if same seqid# as referenced entry in cache
 *				- free new cache entry
 *				- reply from referenced cache entry
 *			  else if next seqid# in order
 *				- free referenced cache entry
 *				- increment seqid_refcnt on new cache entry
 *				- set pointer from Openowner/Lockowner to
 *				  new cache entry (aka reference it)
 *			  else if first seqid# in sequence
 *				- increment seqid_refcnt on new cache entry
 *				- set pointer from Openowner/Lockowner to
 *				  new cache entry (aka reference it)
 *
 *	At end of RPC processing:
 *		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 *			cache entry
 *			- save reply in cache entry
 *			- calculate checksum on first N bytes of NFS XDR
 *				request
 *			- note op and length of XDR request (in bytes)
 *			- mark entry as DONE
 *		  else
 *			- free new cache entry
 *		- Send reply (noting info for socket activity check, below)
 *
 *	For cache entries saved above:
 *		- if saved since seqid_refcnt was > 0
 *			- free when seqid_refcnt decrements to 0
 *			  (when next one in sequence is processed above, or
 *			   when Openowner/Lockowner is discarded)
 *		  else { non-idempotent Op(s) }
 *			- free when
 *				- some further activity observed on same
 *					connection
 *				  (I'm not yet sure how I'm going to do
 *				   this. Maybe look at the TCP connection
 *				   to see if the send_tcp_sequence# is well
 *				   past sent reply OR K additional RPCs
 *				   replied on same socket OR?)
 *			  OR
 *				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *	(at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *	- if RPC marked In_progress
 *		- discard request (don't send reply)
 *	  else
 *		- reply from cache
 *		- timestamp cache entry
 *   else
 *	- add entry to cache, marked In_progress
 *	- do RPC
 *	- when done
 *		- if RPC# non-idempotent
 *			- mark entry Done (not In_progress)
 *			- save reply
 *			- timestamp cache entry
 *		  else
 *			- free cache entry
 *		- send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *	of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *	pages 53-63. San Diego, February 1989. (This reference covers the
 *	UDP case.)
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
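/*
 * Illustrative sketch (added; not part of the original source): the TCP
 * match test described above, reduced to a single predicate over the
 * struct nfsrvcache fields used below. The helper name drc_tcp_match is
 * hypothetical and the block is kept under #if 0; the real test in
 * nfsrc_gettcp() additionally handles in-progress entries and
 * same-connection retries.
 */
#if 0
static int
drc_tcp_match(struct nfsrvcache *newrp, struct nfsrvcache *rp)
{

	return (newrp->rc_xid == rp->rc_xid &&			/* same xid */
	    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) != 0 &&	/* version */
	    newrp->rc_proc == rp->rc_proc &&			/* same RPC# */
	    newrp->rc_reqlen == rp->rc_reqlen &&		/* same size */
	    newrp->rc_cksum == rp->rc_cksum);			/* checksum */
}
#endif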
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstats newnfsstats;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */
SYSCTL_DECL(_vfs_nfsd);

static u_int nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");
static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");
static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * and the reverse mapping from generic to Version 2 procedure numbers
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_MKNOD,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
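/*
 * Worked example (added for clarity): for xid 0x87654321, nfsrc_hash()
 * folds the high-order byte into the low bits before taking the modulus:
 *	(0x87654321 + (0x87654321 >> 24)) % NFSRVCACHE_HASHSIZE
 *	= (0x87654321 + 0x87) % NFSRVCACHE_HASHSIZE
 * so every retransmission of a request (same xid) lands in the same
 * bucket of both the UDP and TCP tables.
 */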
#define	NFSRVCACHE_CHECKLEN	100
/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,	/* NULL */
	FALSE,	/* GETATTR */
	FALSE,	/* SETATTR */
	FALSE,	/* LOOKUP */
	FALSE,	/* ACCESS */
	FALSE,	/* READLINK */
	FALSE,	/* READ */
	FALSE,	/* WRITE */
	FALSE,	/* CREATE */
	FALSE,	/* MKDIR */
	TRUE,	/* SYMLINK */
	FALSE,	/* MKNOD */
	TRUE,	/* REMOVE */
	TRUE,	/* RMDIR */
	TRUE,	/* RENAME */
	TRUE,	/* LINK */
	FALSE,	/* READDIR */
	FALSE,	/* READDIRPLUS */
	FALSE,	/* FSSTAT */
	FALSE,	/* FSINFO */
	FALSE,	/* PATHCONF */
	FALSE,	/* COMMIT */
};
/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
	(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)
/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static void nfsrc_trimcache(u_int64_t, struct socket *);
static int nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t,
    struct socket *);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);
/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}
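/*
 * Design note (added for clarity): all UDP entries share the single
 * nfsrc_udpmtx, while each TCP entry uses the mutex of its hash bucket,
 * so lookups in different TCP buckets can proceed concurrently.
 */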
/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	newnfsstats.srvcache_tcppeak = 0;
	newnfsstats.srvcache_size = 0;
}
/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 * Call nfsrc_trimcache() to clean up the cache before returning.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	nfsrc_trimcache(nd->nd_sockref, so);
	NFSEXITCODE2(0, nd);
	return (ret);
}
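/*
 * Note (added for clarity): the value returned above is one of the RC_*
 * dispositions set by nfsrc_getudp()/nfsrc_gettcp(): RC_DROPIT (duplicate
 * of a request still in progress, so drop it), RC_REPLY (a reply was
 * regenerated from the cached status or mbuf list) or RC_DOIT (a miss,
 * so execute the RPC using the new entry).
 */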
/*
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				newnfsstats.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
				    M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
	    }
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	newrp->rc_flag |= RC_LOCKED;
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}
/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		newnfsstats.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
				atomic_add_int(&nfsrc_tcpsavedreplies, 1);
				if (nfsrc_tcpsavedreplies >
				    newnfsstats.srvcache_tcppeak)
					newnfsstats.srvcache_tcppeak =
					    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	nfsrc_trimcache(nd->nd_sockref, so);
	NFSEXITCODE2(0, nd);
	return (retrp);
}
/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}
/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry for nfsrc_activesocket() and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, struct socket *so, int err)
{
	tcp_seq tmp_seq;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_LOCKED))
		panic("nfsrvd_sentcache not locked");
	if (!err) {
		if ((so->so_proto->pr_domain->dom_family != AF_INET &&
		     so->so_proto->pr_domain->dom_family != AF_INET6) ||
		    so->so_proto->pr_protocol != IPPROTO_TCP)
			panic("nfs sent cache");
		if (nfsrv_getsockseqnum(so, &tmp_seq)) {
			mtx_lock(mutex);
			rp->rc_tcpseq = tmp_seq;
			rp->rc_flag |= RC_TCPSEQ;
			mtx_unlock(mutex);
		}
	}
	nfsrc_unlock(rp);
}
/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			newnfsstats.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
			    M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}
/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}
/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{

	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}
/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&newnfsstats.srvcache_size, -1);
}
/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	newnfsstats.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}
/*
 * The basic rule is to get rid of entries that are expired.
 */
static void
nfsrc_trimcache(u_int64_t sockref, struct socket *so)
{
	struct nfsrvcache *rp, *nextrp;
	int i, j, k, time_histo[10];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0;

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		for (i = 0; i < 10; i++)
			time_histo[i] = 0;
		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			if (i == 0)
				tcp_lasttrim = NFSD_MONOSEC;
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= nfsrc_tcptimeout)
						j = nfsrc_tcptimeout - 1;
					if (j < 0)
						j = 0;
					j = (j * 10 / nfsrc_tcptimeout) % 10;
					time_histo[j]++;
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    nfsrc_activesocket(rp, sockref, so))
						nfsrc_freecache(rp);
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		j = nfsrc_tcphighwater / 5;	/* 20% of it */
		if (j > 0 && (nfsrc_tcpsavedreplies + j) > nfsrc_tcphighwater) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 1;
			for (i = 0; i < 8; i++) {
				k += time_histo[i];
				if (k > j)
					break;
			}
			k = nfsrc_tcptimeout * (i + 1) / 10;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
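			/*
			 * Worked example (illustrative numbers, added for
			 * clarity): with a 300s nfsrc_tcptimeout, if the
			 * first two histogram buckets already hold more
			 * than 20% of the saved replies, the loop above
			 * breaks with i == 1, so k = 300 * 2 / 10 = 60 and
			 * the pass below also frees entries that would
			 * expire within the next 60 seconds.
			 */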
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 nfsrc_activesocket(rp, sockref,
						    so)))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}
/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}
/*
 * Check to see if the socket is active.
 * Return 1 if the reply has been received/acknowledged by the client,
 * 0 otherwise.
 * XXX - Uses tcp internals.
 */
static int
nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t cur_sockref,
    struct socket *cur_so)
{
	int ret = 0;

	if (!(rp->rc_flag & RC_TCPSEQ))
		return (0);
	/*
	 * If the sockref is the same, it is the same TCP connection.
	 */
	if (cur_sockref == rp->rc_sockref)
		ret = nfsrv_checksockseqnum(cur_so, rp->rc_tcpseq);
	return (ret);
}
/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}
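/*
 * Note (added for clarity): a short request, e.g. 80 bytes, is checksummed
 * in full, while only the first NFSRVCACHE_CHECKLEN (100) bytes of a large
 * write are checksummed, keeping the per-request cost bounded while still
 * distinguishing requests that happen to reuse an xid.
 */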
/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}