/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *   entries with the same key)
 *   When a request arrives:
 *     For all that match key
 *     - if RPC# != OR request_size !=
 *       - not a match with this one
 *     - if NFSv4 and received on same TCP socket OR
 *       received on a TCP connection created before the
 *       entry was cached
 *       - not a match with this one
 *       (V2,3 clients might retry on same TCP socket)
 *     - calculate checksum on first N bytes of NFS XDR
 *     - if checksum !=
 *       - not a match for this one
 *     If any of the remaining ones that match has a
 *       seqid_refcnt > 0
 *       - not a match (go do RPC, using new cache entry)
 *     If one match left
 *       - a hit (reply from cache)
 *     else
 *       - miss (go do RPC, using new cache entry)
 *
 *   During processing of NFSv4 request:
 *   - set a flag when a non-idempotent Op is processed
 *   - when an Op that uses a seqid# (Open,...) is processed
 *     - if same seqid# as referenced entry in cache
 *       - free new cache entry
 *       - reply from referenced cache entry
 *     else if next seqid# in order
 *       - free referenced cache entry
 *       - increment seqid_refcnt on new cache entry
 *       - set pointer from Openowner/Lockowner to
 *         new cache entry (aka reference it)
 *     else if first seqid# in sequence
 *       - increment seqid_refcnt on new cache entry
 *       - set pointer from Openowner/Lockowner to
 *         new cache entry (aka reference it)
 *
 *   At end of RPC processing:
 *   - if seqid_refcnt > 0 OR flagged non-idempotent on new
 *     cache entry
 *     - save reply in cache entry
 *     - calculate checksum on first N bytes of NFS XDR
 *       request
 *     - note op and length of XDR request (in bytes)
 *   else
 *     - free new cache entry
 *   - Send reply (noting info for socket activity check, below)
 *
 *   For cache entries saved above:
 *   - if saved since seqid_refcnt was > 0
 *     - free when seqid_refcnt decrements to 0
 *       (when next one in sequence is processed above, or
 *        when Openowner/Lockowner is discarded)
 *   else { non-idempotent Op(s) }
 *     - free when
 *       - some further activity observed on same
 *         TCP socket
 *         (I'm not yet sure how I'm going to do
 *          this. Maybe look at the TCP connection
 *          to see if the send_tcp_sequence# is well
 *          past sent reply OR K additional RPCs
 *          replied on same socket OR?)
 *       OR
 *       - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *   - if RPC marked In_progress
 *     - discard request (don't send reply)
 *   else
 *     - reply from cache
 *     - timestamp cache entry
 * else
 *   - add entry to cache, marked In_progress
 *   - do RPC
 *   - when RPC done
 *     - if RPC# non-idempotent
 *       - mark entry Done (not In_progress)
 *       - save reply
 *       - timestamp cache entry
 *     else
 *       - free cache entry
 *     - send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *	of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *	pages 53-63, San Diego, February 1989, for the UDP case.
 *
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
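/*
 * As a rough illustration of the TCP matching rules above (a sketch, not
 * the actual matching code, which lives in nfsrc_gettcp() below), a
 * candidate entry "rp" can only match a new request "newrp" when all of
 * the cheap-to-check fields agree before the XDR checksum is consulted:
 *
 *	match = newrp->rc_xid == rp->rc_xid &&
 *	    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) != 0 &&
 *	    newrp->rc_proc == rp->rc_proc &&
 *	    newrp->rc_reqlen == rp->rc_reqlen &&
 *	    newrp->rc_cksum == rp->rc_cksum;
 *
 * The NFSv4 same-socket and connection-age tests are then applied on top
 * of this so that questionable candidates are rejected rather than
 * accepted, per the false-hit bias stated above.
 */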
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstats newnfsstats;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */
SYSCTL_DECL(_vfs_nfsd);

static u_int nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
    int error, newhighwater;

    newhighwater = nfsrc_tcphighwater;
    error = sysctl_handle_int(oidp, &newhighwater, 0, req);
    if (error != 0 || req->newptr == NULL)
        return (error);
    if (newhighwater < 0)
        return (EINVAL);
    if (newhighwater >= nfsrc_floodlevel)
        nfsrc_floodlevel = newhighwater + newhighwater / 5;
    nfsrc_tcphighwater = newhighwater;
    return (0);
}
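/*
 * The handler above keeps nfsrc_floodlevel at least 20% above the
 * configured high water mark. For example, via the vfs.nfsd.tcphighwater
 * sysctl declared below:
 *
 *	# sysctl vfs.nfsd.tcphighwater=100000
 *
 * would raise nfsrc_floodlevel to 120000 (100000 + 100000 / 5) if it
 * were not already above 100000.
 */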
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");

static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];
/*
 * and the reverse mapping from generic to Version 2 procedure numbers
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
    NFSV2PROC_NULL,
    NFSV2PROC_GETATTR,
    NFSV2PROC_SETATTR,
    NFSV2PROC_LOOKUP,
    NFSV2PROC_NOOP,	/* ACCESS has no V2 equivalent */
    NFSV2PROC_READLINK,
    NFSV2PROC_READ,
    NFSV2PROC_WRITE,
    NFSV2PROC_CREATE,
    NFSV2PROC_MKDIR,
    NFSV2PROC_SYMLINK,
    NFSV2PROC_CREATE,	/* MKNOD maps to V2 CREATE */
    NFSV2PROC_REMOVE,
    NFSV2PROC_RMDIR,
    NFSV2PROC_RENAME,
    NFSV2PROC_LINK,
    NFSV2PROC_READDIR,
    NFSV2PROC_NOOP,	/* READDIRPLUS has no V2 equivalent */
    NFSV2PROC_STATFS,
    NFSV2PROC_NOOP,	/* FSINFO has no V2 equivalent */
    NFSV2PROC_NOOP,	/* PATHCONF has no V2 equivalent */
    NFSV2PROC_NOOP,	/* COMMIT has no V2 equivalent */
};
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid)	(&nfsrcahash_table[nfsrc_hash(xid)])
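/*
 * Note: nfsrc_hash() folds the high-order byte of the xid into the
 * low-order bits before taking the modulus, so (for example) xids
 * 0x01000123 and 0x02000123, which differ only in their top byte,
 * normally land in different buckets. The motivation stated here is an
 * assumption of ours, not spelled out in the source: some clients vary
 * only part of the xid between requests.
 */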
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
/* (Indexed by NFSv2 procedure number; the V2 procedures whose reply is */
/*  a bare status word are REMOVE, RENAME, LINK, SYMLINK and RMDIR.) */
static int nfsv2_repstat[NFS_V3NPROCS] = {
    0,		/* NULL */
    0,		/* GETATTR */
    0,		/* SETATTR */
    0,		/* ROOT */
    0,		/* LOOKUP */
    0,		/* READLINK */
    0,		/* READ */
    0,		/* WRITECACHE */
    0,		/* WRITE */
    0,		/* CREATE */
    1,		/* REMOVE */
    1,		/* RENAME */
    1,		/* LINK */
    1,		/* SYMLINK */
    0,		/* MKDIR */
    1,		/* RMDIR */
    0,		/* READDIR */
    0,		/* STATFS */
    0,
    0,
    0,
    0,
};
/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
	(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)
/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);
/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

    if ((rp->rc_flag & RC_UDP) != 0)
        return (&nfsrc_udpmtx);
    return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}
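/*
 * All rc_flag and hash/LRU list manipulation below is done while holding
 * the mutex returned above: the single nfsrc_udpmtx for UDP entries, and
 * a per-bucket mutex for TCP entries, so different TCP hash chains can
 * be searched and trimmed concurrently.
 */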
/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
    int i;
    static int inited = 0;

    if (inited)
        return;
    inited = 1;
    for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
        LIST_INIT(&nfsrvudphashtbl[i]);
        LIST_INIT(&nfsrchash_table[i].tbl);
        LIST_INIT(&nfsrcahash_table[i].tbl);
    }
    TAILQ_INIT(&nfsrvudplru);
    nfsrc_tcpsavedreplies = 0;
    nfsrc_udpcachesize = 0;
    newnfsstats.srvcache_tcppeak = 0;
    newnfsstats.srvcache_size = 0;
}
/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
    struct nfsrvcache *newrp;
    int ret;

    if (nd->nd_procnum == NFSPROC_NULL)
        panic("nfsd cache null");
    MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
        M_NFSRVCACHE, M_WAITOK);
    NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
    if (nd->nd_flag & ND_NFSV4)
        newrp->rc_flag = RC_NFSV4;
    else if (nd->nd_flag & ND_NFSV3)
        newrp->rc_flag = RC_NFSV3;
    else
        newrp->rc_flag = RC_NFSV2;
    newrp->rc_xid = nd->nd_retxid;
    newrp->rc_proc = nd->nd_procnum;
    newrp->rc_sockref = nd->nd_sockref;
    newrp->rc_cachetime = nd->nd_tcpconntime;
    if (nd->nd_flag & ND_SAMETCPCONN)
        newrp->rc_flag |= RC_SAMETCPCONN;
    if (nd->nd_nam2 != NULL) {
        newrp->rc_flag |= RC_UDP;
        ret = nfsrc_getudp(nd, newrp);
    } else {
        ret = nfsrc_gettcp(nd, newrp);
    }
    NFSEXITCODE2(0, nd);
    return (ret);
}
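/*
 * A sketch of how the nfsd request loop is expected to act on the result
 * (illustrative only, not the actual caller): RC_DOIT, RC_REPLY and
 * RC_DROPIT are the values returned by the lookup routines below.
 *
 *	switch (nfsrvd_getcache(nd)) {
 *	case RC_DOIT:   service the RPC, then call nfsrvd_updatecache()
 *	case RC_REPLY:  send nd->nd_mreq, rebuilt from the cached reply
 *	case RC_DROPIT: silently discard the request (retry in progress)
 *	}
 */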
/*
 * Get a cache entry for UDP.
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
    struct nfsrvcache *rp;
    struct sockaddr_in *saddr;
    struct sockaddr_in6 *saddr6;
    struct nfsrvhashhead *hp;
    int ret = 0;
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(newrp);
    hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
    mtx_lock(mutex);
    LIST_FOREACH(rp, hp, rc_hash) {
        if (newrp->rc_xid == rp->rc_xid &&
            newrp->rc_proc == rp->rc_proc &&
            (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
            nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
            if ((rp->rc_flag & RC_LOCKED) != 0) {
                rp->rc_flag |= RC_WANTED;
                (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
                    "nfsrc", 10 * hz);
                goto loop;
            }
            if (rp->rc_flag == 0)
                panic("nfs udp cache0");
            rp->rc_flag |= RC_LOCKED;
            TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
            TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
            if (rp->rc_flag & RC_INPROG) {
                newnfsstats.srvcache_inproghits++;
                mtx_unlock(mutex);
                ret = RC_DROPIT;
            } else if (rp->rc_flag & RC_REPSTATUS) {
                /*
                 * V2 only.
                 */
                newnfsstats.srvcache_nonidemdonehits++;
                mtx_unlock(mutex);
                nfsrvd_rephead(nd);
                *(nd->nd_errp) = rp->rc_status;
                ret = RC_REPLY;
                rp->rc_timestamp = NFSD_MONOSEC +
                    NFSRVCACHE_UDPTIMEOUT;
            } else if (rp->rc_flag & RC_REPMBUF) {
                newnfsstats.srvcache_nonidemdonehits++;
                mtx_unlock(mutex);
                nd->nd_mreq = m_copym(rp->rc_reply, 0,
                    M_COPYALL, M_WAITOK);
                ret = RC_REPLY;
                rp->rc_timestamp = NFSD_MONOSEC +
                    NFSRVCACHE_UDPTIMEOUT;
            } else {
                panic("nfs udp cache1");
            }
            nfsrc_unlock(rp);
            free((caddr_t)newrp, M_NFSRVCACHE);
            goto out;
        }
    }
    newnfsstats.srvcache_misses++;
    atomic_add_int(&newnfsstats.srvcache_size, 1);
    nfsrc_udpcachesize++;

    newrp->rc_flag |= RC_INPROG;
    saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
    if (saddr->sin_family == AF_INET)
        newrp->rc_inet = saddr->sin_addr.s_addr;
    else if (saddr->sin_family == AF_INET6) {
        saddr6 = (struct sockaddr_in6 *)saddr;
        NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
            sizeof (struct in6_addr));
        newrp->rc_flag |= RC_INETIPV6;
    }
    LIST_INSERT_HEAD(hp, newrp, rc_hash);
    TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
    mtx_unlock(mutex);
    nd->nd_rp = newrp;
    ret = RC_DOIT;

out:
    NFSEXITCODE2(0, nd);
    return (ret);
}
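/*
 * Note that strict LRU ordering is kept for UDP only: both a cache hit
 * and a new entry move/insert the entry at the tail of nfsrvudplru, so
 * nfsrc_trimcache() below can scan from the head and reach the stalest
 * entries first. TCP entries are reclaimed by timestamp instead, per the
 * "LRU is not appropriate" note at the top of this file.
 */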
/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
    struct nfsrvcache *rp;
    struct nfsrvcache *retrp = NULL;
    mbuf_t m;
    struct mtx *mutex;

    rp = nd->nd_rp;
    if (!rp)
        panic("nfsrvd_updatecache null rp");
    nd->nd_rp = NULL;
    mutex = nfsrc_cachemutex(rp);
    mtx_lock(mutex);
    nfsrc_wanted(rp);
    if (!(rp->rc_flag & RC_INPROG))
        panic("nfsrvd_updatecache not inprog");
    rp->rc_flag &= ~RC_INPROG;
    if (rp->rc_flag & RC_UDP) {
        TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
        TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
    }

    /*
     * Reply from cache is a special case returned by nfsrv_checkseqid().
     */
    if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
        newnfsstats.srvcache_nonidemdonehits++;
        mtx_unlock(mutex);
        nd->nd_repstat = 0;
        if (nd->nd_mreq)
            mbuf_freem(nd->nd_mreq);
        if (!(rp->rc_flag & RC_REPMBUF))
            panic("reply from cache");
        nd->nd_mreq = m_copym(rp->rc_reply, 0,
            M_COPYALL, M_WAITOK);
        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
        nfsrc_unlock(rp);
        goto out;
    }

    /*
     * If rc_refcnt > 0, save it
     * For UDP, save it if ND_SAVEREPLY is set
     * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
     */
    if (nd->nd_repstat != NFSERR_DONTREPLY &&
        (rp->rc_refcnt > 0 ||
         ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
         ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
          nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
          nfsrc_tcpnonidempotent))) {
        if (rp->rc_refcnt > 0) {
            if (!(rp->rc_flag & RC_NFSV4))
                panic("update_cache refcnt");
            rp->rc_flag |= RC_REFCNT;
        }
        if ((nd->nd_flag & ND_NFSV2) &&
            nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
            rp->rc_status = nd->nd_repstat;
            rp->rc_flag |= RC_REPSTATUS;
            mtx_unlock(mutex);
        } else {
            if (!(rp->rc_flag & RC_UDP)) {
                atomic_add_int(&nfsrc_tcpsavedreplies, 1);
                if (nfsrc_tcpsavedreplies >
                    newnfsstats.srvcache_tcppeak)
                    newnfsstats.srvcache_tcppeak =
                        nfsrc_tcpsavedreplies;
            }
            /*
             * Copy the reply with the mutex dropped, since
             * m_copym(..., M_WAITOK) may sleep.
             */
            mtx_unlock(mutex);
            m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
            mtx_lock(mutex);
            rp->rc_reply = m;
            rp->rc_flag |= RC_REPMBUF;
            mtx_unlock(mutex);
        }
        if (rp->rc_flag & RC_UDP) {
            rp->rc_timestamp = NFSD_MONOSEC +
                NFSRVCACHE_UDPTIMEOUT;
            nfsrc_unlock(rp);
        } else {
            rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
            if (rp->rc_refcnt > 0)
                nfsrc_unlock(rp);
            else
                retrp = rp;
        }
    } else {
        nfsrc_freecache(rp);
        mtx_unlock(mutex);
    }

out:
    NFSEXITCODE2(0, nd);
    return (retrp);
}
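/*
 * The save-or-free decision above is where nfsrc_floodlevel bites: a TCP
 * reply is only saved while nfsrc_tcpsavedreplies is at or below the
 * flood level, which is why sysctl_tcphighwater() keeps the flood level
 * 20% above the TCP high water mark.
 */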
/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(rp);
    if (!(rp->rc_flag & RC_INPROG))
        panic("nfsrvd_delcache not in prog");
    mtx_lock(mutex);
    rp->rc_flag &= ~RC_INPROG;
    if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
        nfsrc_freecache(rp);
    mtx_unlock(mutex);
}
/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
    struct nfsrchash_bucket *hbp;

    KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
    if (have_seq) {
        hbp = NFSRCAHASH(rp->rc_sockref);
        mtx_lock(&hbp->mtx);
        rp->rc_tcpseq = seq;
        if (rp->rc_acked != RC_NO_ACK)
            LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
        rp->rc_acked = RC_NO_ACK;
        mtx_unlock(&hbp->mtx);
    }
    nfsrc_unlock(rp);
}
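/*
 * Putting the pieces together, the per-RPC life cycle looks roughly like
 * this (an illustrative sketch, not the actual nfsd code):
 *
 *	if (nfsrvd_getcache(nd) == RC_DOIT) {
 *		... service the RPC ...
 *		rp = nfsrvd_updatecache(nd);
 *		... send the reply, noting its TCP sequence# ...
 *		if (rp != NULL)
 *			nfsrvd_sentcache(rp, have_seq, seq);
 *	}
 */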
/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
    struct nfsrvcache *rp, *nextrp;
    int i;
    struct nfsrvcache *hitrp;
    struct nfsrvhashhead *hp, nfsrc_templist;
    int hit, ret = 0;
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(newrp);
    hp = NFSRCHASH(newrp->rc_xid);
    newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
    mtx_lock(mutex);
    hit = 1;
    LIST_INIT(&nfsrc_templist);
    /*
     * Get all the matches and put them on the temp list.
     */
    rp = LIST_FIRST(hp);
    while (rp != LIST_END(hp)) {
        nextrp = LIST_NEXT(rp, rc_hash);
        if (newrp->rc_xid == rp->rc_xid &&
            (!(rp->rc_flag & RC_INPROG) ||
             ((newrp->rc_flag & RC_SAMETCPCONN) &&
              newrp->rc_sockref == rp->rc_sockref)) &&
            (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
            newrp->rc_proc == rp->rc_proc &&
            ((newrp->rc_flag & RC_NFSV4) &&
             newrp->rc_sockref != rp->rc_sockref &&
             newrp->rc_cachetime >= rp->rc_cachetime)
            == 0 && newrp->rc_reqlen == rp->rc_reqlen &&
            newrp->rc_cksum == rp->rc_cksum) {
            LIST_REMOVE(rp, rc_hash);
            LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
        }
        rp = nextrp;
    }

    /*
     * Now, use nfsrc_templist to decide if there is a match.
     */
    i = 0;
    LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
        i++;
        if (rp->rc_refcnt > 0) {
            hit = 0;
            break;
        }
    }
    /*
     * Can be a hit only if one entry left.
     * Note possible hit entry and put nfsrc_templist back on hash
     * list.
     */
    if (i != 1)
        hit = 0;
    hitrp = rp = LIST_FIRST(&nfsrc_templist);
    while (rp != LIST_END(&nfsrc_templist)) {
        nextrp = LIST_NEXT(rp, rc_hash);
        LIST_REMOVE(rp, rc_hash);
        LIST_INSERT_HEAD(hp, rp, rc_hash);
        rp = nextrp;
    }
    if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
        panic("nfs gettcp cache templist");

    if (hit) {
        rp = hitrp;
        if ((rp->rc_flag & RC_LOCKED) != 0) {
            rp->rc_flag |= RC_WANTED;
            (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
                "nfsrc", 10 * hz);
            goto tryagain;
        }
        if (rp->rc_flag == 0)
            panic("nfs tcp cache0");
        rp->rc_flag |= RC_LOCKED;
        if (rp->rc_flag & RC_INPROG) {
            newnfsstats.srvcache_inproghits++;
            mtx_unlock(mutex);
            if (newrp->rc_sockref == rp->rc_sockref)
                nfsrc_marksametcpconn(rp->rc_sockref);
            ret = RC_DROPIT;
        } else if (rp->rc_flag & RC_REPSTATUS) {
            /*
             * V2 only.
             */
            newnfsstats.srvcache_nonidemdonehits++;
            mtx_unlock(mutex);
            if (newrp->rc_sockref == rp->rc_sockref)
                nfsrc_marksametcpconn(rp->rc_sockref);
            ret = RC_REPLY;
            nfsrvd_rephead(nd);
            *(nd->nd_errp) = rp->rc_status;
            rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
        } else if (rp->rc_flag & RC_REPMBUF) {
            newnfsstats.srvcache_nonidemdonehits++;
            mtx_unlock(mutex);
            if (newrp->rc_sockref == rp->rc_sockref)
                nfsrc_marksametcpconn(rp->rc_sockref);
            ret = RC_REPLY;
            nd->nd_mreq = m_copym(rp->rc_reply, 0,
                M_COPYALL, M_WAITOK);
            rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
        } else {
            panic("nfs tcp cache1");
        }
        nfsrc_unlock(rp);
        free((caddr_t)newrp, M_NFSRVCACHE);
        goto out;
    }
    newnfsstats.srvcache_misses++;
    atomic_add_int(&newnfsstats.srvcache_size, 1);

    /*
     * For TCP, multiple entries for a key are allowed, so don't
     * chain it into the hash table until done.
     */
    newrp->rc_cachetime = NFSD_MONOSEC;
    newrp->rc_flag |= RC_INPROG;
    LIST_INSERT_HEAD(hp, newrp, rc_hash);
    mtx_unlock(mutex);
    nd->nd_rp = newrp;
    ret = RC_DOIT;

out:
    NFSEXITCODE2(0, nd);
    return (ret);
}
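/*
 * The temp-list pass above is what enforces the "can be a hit only if
 * exactly one candidate is left" rule from the comment at the top of
 * this file. For example, if two cached entries share an xid because one
 * belongs to a seqid#-ordered Op, both are moved to nfsrc_templist, i
 * ends up as 2, and the request is treated as a miss rather than risking
 * a false hit.
 */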
/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(rp);
    mtx_assert(mutex, MA_OWNED);
    while ((rp->rc_flag & RC_LOCKED) != 0) {
        rp->rc_flag |= RC_WANTED;
        (void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
    }
    rp->rc_flag |= RC_LOCKED;
}
/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(rp);
    mtx_lock(mutex);
    rp->rc_flag &= ~RC_LOCKED;
    nfsrc_wanted(rp);
    mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{

    if (rp->rc_flag & RC_WANTED) {
        rp->rc_flag &= ~RC_WANTED;
        wakeup((caddr_t)rp);
    }
}
/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
    struct nfsrchash_bucket *hbp;

    LIST_REMOVE(rp, rc_hash);
    if (rp->rc_flag & RC_UDP) {
        TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
        nfsrc_udpcachesize--;
    } else if (rp->rc_acked != RC_NO_SEQ) {
        hbp = NFSRCAHASH(rp->rc_sockref);
        mtx_lock(&hbp->mtx);
        if (rp->rc_acked == RC_NO_ACK)
            LIST_REMOVE(rp, rc_ahash);
        mtx_unlock(&hbp->mtx);
    }
    nfsrc_wanted(rp);
    if (rp->rc_flag & RC_REPMBUF) {
        mbuf_freem(rp->rc_reply);
        if (!(rp->rc_flag & RC_UDP))
            atomic_add_int(&nfsrc_tcpsavedreplies, -1);
    }
    FREE((caddr_t)rp, M_NFSRVCACHE);
    atomic_add_int(&newnfsstats.srvcache_size, -1);
}
/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
    struct nfsrvcache *rp, *nextrp;
    int i;

    for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
        mtx_lock(&nfsrchash_table[i].mtx);
        LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
            nfsrc_freecache(rp);
        mtx_unlock(&nfsrchash_table[i].mtx);
    }
    mtx_lock(&nfsrc_udpmtx);
    for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
        LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
            nfsrc_freecache(rp);
        }
    }
    newnfsstats.srvcache_size = 0;
    mtx_unlock(&nfsrc_udpmtx);
    nfsrc_tcpsavedreplies = 0;
}
#define	HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
    struct nfsrchash_bucket *hbp;
    struct nfsrvcache *rp, *nextrp;
    int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
    time_t thisstamp;
    static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
    static int onethread = 0, oneslot = 0;

    if (sockref != 0) {
        hbp = NFSRCAHASH(sockref);
        mtx_lock(&hbp->mtx);
        LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
            if (sockref == rp->rc_sockref) {
                if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
                    rp->rc_acked = RC_ACK;
                    LIST_REMOVE(rp, rc_ahash);
                } else if (final) {
                    rp->rc_acked = RC_NACK;
                    LIST_REMOVE(rp, rc_ahash);
                }
            }
        }
        mtx_unlock(&hbp->mtx);
    }

    if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
        return;
    if (NFSD_MONOSEC != udp_lasttrim ||
        nfsrc_udpcachesize >= (nfsrc_udphighwater +
        nfsrc_udphighwater / 2)) {
        mtx_lock(&nfsrc_udpmtx);
        udp_lasttrim = NFSD_MONOSEC;
        TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
            if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
                 && rp->rc_refcnt == 0
                 && ((rp->rc_flag & RC_REFCNT) ||
                     udp_lasttrim > rp->rc_timestamp ||
                     nfsrc_udpcachesize > nfsrc_udphighwater))
                nfsrc_freecache(rp);
        }
        mtx_unlock(&nfsrc_udpmtx);
    }
    if (NFSD_MONOSEC != tcp_lasttrim ||
        nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
        force = nfsrc_tcphighwater / 4;
        if (force > 0 &&
            nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
            for (i = 0; i < HISTSIZE; i++)
                time_histo[i] = 0;
            i = 0;
            lastslot = NFSRVCACHE_HASHSIZE - 1;
        } else {
            force = 0;
            if (NFSD_MONOSEC != tcp_lasttrim) {
                i = 0;
                lastslot = NFSRVCACHE_HASHSIZE - 1;
            } else {
                lastslot = i = oneslot;
                if (++oneslot >= NFSRVCACHE_HASHSIZE)
                    oneslot = 0;
            }
        }
        tto = nfsrc_tcptimeout;
        tcp_lasttrim = NFSD_MONOSEC;
        for (; i <= lastslot; i++) {
            mtx_lock(&nfsrchash_table[i].mtx);
            LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
                nextrp) {
                if (!(rp->rc_flag &
                     (RC_INPROG|RC_LOCKED|RC_WANTED))
                     && rp->rc_refcnt == 0) {
                    if ((rp->rc_flag & RC_REFCNT) ||
                        tcp_lasttrim > rp->rc_timestamp ||
                        rp->rc_acked == RC_ACK) {
                        nfsrc_freecache(rp);
                        continue;
                    }
                    if (force == 0)
                        continue;
                    /*
                     * The timestamps range from roughly the
                     * present (tcp_lasttrim) to the present
                     * + nfsrc_tcptimeout. Generate a simple
                     * histogram of where the timeouts fall.
                     */
                    j = rp->rc_timestamp - tcp_lasttrim;
                    if (j >= tto)
                        j = HISTSIZE - 1;
                    else if (j < 0)
                        j = 0;
                    else
                        j = j * HISTSIZE / tto;
                    time_histo[j]++;
                }
            }
            mtx_unlock(&nfsrchash_table[i].mtx);
        }
        if (force) {
            /*
             * Trim some more with a smaller timeout of as little
             * as 20% of nfsrc_tcptimeout to try and get below
             * 80% of the nfsrc_tcphighwater.
             */
            k = 0;
            for (i = 0; i < (HISTSIZE - 2); i++) {
                k += time_histo[i];
                if (k > force)
                    break;
            }
            k = tto * (i + 1) / HISTSIZE;
            if (k < 1)
                k = 1;
            thisstamp = tcp_lasttrim + k;
            for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                mtx_lock(&nfsrchash_table[i].mtx);
                LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
                    rc_hash, nextrp) {
                    if (!(rp->rc_flag &
                         (RC_INPROG|RC_LOCKED|RC_WANTED))
                         && rp->rc_refcnt == 0
                         && ((rp->rc_flag & RC_REFCNT) ||
                             thisstamp > rp->rc_timestamp ||
                             rp->rc_acked == RC_ACK))
                        nfsrc_freecache(rp);
                }
                mtx_unlock(&nfsrchash_table[i].mtx);
            }
        }
    }
    atomic_store_rel_int(&onethread, 0);
}
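/*
 * Worked example of the histogram trim above, with illustrative numbers:
 * if nfsrc_tcphighwater is 10000, force is 2500. If the first three of
 * the 16 buckets hold 1200, 900 and 700 entries, the loop stops at i = 2
 * (1200 + 900 + 700 = 2800 > 2500), so k becomes tto * 3 / 16 (about 19%
 * of nfsrc_tcptimeout) and the second pass frees every unreferenced
 * entry stamped before tcp_lasttrim + k.
 */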
/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(rp);
    mtx_lock(mutex);
    if (rp->rc_refcnt < 0)
        panic("nfs cache refcnt");
    rp->rc_refcnt++;
    mtx_unlock(mutex);
}
/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(rp);
    mtx_lock(mutex);
    if (rp->rc_refcnt <= 0)
        panic("nfs cache derefcnt");
    rp->rc_refcnt--;
    if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
        nfsrc_freecache(rp);
    mtx_unlock(mutex);
}
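/*
 * A sketch of the intended pairing (see the seqid# discussion at the top
 * of this file): nfsrv_checkseqid() calls nfsrvd_refcache() when an
 * Openowner/Lockowner is pointed at a cache entry, and
 * nfsrvd_derefcache() when that owner advances to the next seqid# or is
 * discarded; the final dereference frees the entry unless it is locked
 * or still in progress.
 */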
/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
    int len = 0, cklen;
    mbuf_t m;

    m = m1;
    while (m) {
        len += mbuf_len(m);
        m = mbuf_next(m);
    }
    cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
    *cksum = in_cksum(m1, cklen);
    return (len);
}
/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}
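/*
 * nfsrc_marksametcpconn() is an empty hook: the lookup paths call it
 * when a retry arrives on the same TCP connection, which the comment
 * above says should never happen for NFSv4.
 */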