/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *				entries with the same key)
 *   When a request arrives:
 *   For all that match key
 *   - if RPC# != OR request_size !=
 *     - not a match with this one
 *   - if NFSv4 and received on same TCP socket OR
 *	   received on a TCP connection created before the
 *	   entry was cached
 *     - not a match with this one
 *     (V2,3 clients might retry on same TCP socket)
 *   - calculate checksum on first N bytes of NFS XDR
 *   - if checksum !=
 *     - not a match for this one
 *   If any of the remaining ones that match has a
 *	 seqid_refcnt > 0
 *     - not a match (go do RPC, using new cache entry)
 *   If one match left
 *     - a hit (reply from cache)
 *   else
 *     - miss (go do RPC, using new cache entry)
 *
 *   During processing of NFSv4 request:
 *   - set a flag when a non-idempotent Op is processed
 *   - when an Op that uses a seqid# (Open,...) is processed
 *     - if same seqid# as referenced entry in cache
 *	 - free new cache entry
 *	 - reply from referenced cache entry
 *     else if next seqid# in order
 *	 - free referenced cache entry
 *	 - increment seqid_refcnt on new cache entry
 *	 - set pointer from Openowner/Lockowner to
 *	   new cache entry (aka reference it)
 *     else if first seqid# in sequence
 *	 - increment seqid_refcnt on new cache entry
 *	 - set pointer from Openowner/Lockowner to
 *	   new cache entry (aka reference it)
 *
 *   At end of RPC processing:
 *   - if seqid_refcnt > 0 OR flagged non-idempotent on new
 *	   cache entry
 *     - save reply in cache entry
 *     - calculate checksum on first N bytes of NFS XDR
 *	 request
 *     - note op and length of XDR request (in bytes)
 *   else
 *     - free new cache entry
 *   - Send reply (noting info for socket activity check, below)
 *
 *   For cache entries saved above:
 *   - if saved since seqid_refcnt was > 0
 *     - free when seqid_refcnt decrements to 0
 *	 (when next one in sequence is processed above, or
 *	  when Openowner/Lockowner is discarded)
 *   else { non-idempotent Op(s) }
 *     - free when
 *	 - some further activity observed on same
 *	   socket
 *	   (I'm not yet sure how I'm going to do
 *	    this. Maybe look at the TCP connection
 *	    to see if the send_tcp_sequence# is well
 *	    past sent reply OR K additional RPCs
 *	    replied on same socket OR?)
 *	 OR
 *	 - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *   - if RPC marked In_progress
 *     - discard request (don't send reply)
 *   else
 *     - reply from cache
 *     - timestamp cache entry
 * else
 *   - add entry to cache, marked In_progress
 *   - do RPC
 *   - when RPC done
 *     - if RPC# non-idempotent
 *	 - mark entry Done (not In_progress)
 *	 - save reply
 *	 - timestamp cache entry
 *     else
 *	 - free cache entry
 *     - send reply
 *
 * Later, entries with saved replies are free'd a short time (a few minutes)
 * after the reply is sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *	of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *	pages 53-63. San Diego, February 1989, for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
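/*
 * Illustrative sketch only (an editorial summary, not driver code): per the
 * design notes above, the TCP lookup reduces to roughly this per-candidate
 * predicate; the authoritative test is in nfsrc_gettcp() below.
 *
 *	match := new->rc_xid == old->rc_xid &&
 *	    (new->rc_flag & old->rc_flag & RC_NFSVERS) != 0 &&
 *	    new->rc_proc == old->rc_proc &&
 *	    new->rc_reqlen == old->rc_reqlen &&
 *	    new->rc_cksum == old->rc_cksum &&
 *	    !(NFSv4 && (same TCP socket ||
 *	      connection created before the entry was cached))
 */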
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstatsv1 nfsstatsv1;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */
SYSCTL_DECL(_vfs_nfsd);

static u_int	nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		/* Keep the flood level 20% above the high water mark. */
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
    sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");
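/*
 * Example (hypothetical administrative usage): raising the high water mark
 * at runtime with sysctl(8),
 *
 *	sysctl vfs.nfsd.tcphighwater=100000
 *
 * also raises nfsrc_floodlevel to 120% of the new value via the handler
 * above, so the flood level never caps saved replies below the high water
 * mark.
 */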
static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");
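/*
 * Example (hypothetical administrative usage): setting
 *
 *	sysctl vfs.nfsd.cachetcp=0
 *
 * disables saving of replies for NFS over TCP, trading protection against
 * retried non-idempotent RPCs for lower memory use; see the ND_SAVEREPLY
 * test in nfsrvd_updatecache() below.
 */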
static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * The reverse mapping from generic to Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	/* ... remaining NFSv2 procedure numbers elided ... */
};
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&nfsrcahash_table[nfsrc_hash(xid)])
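/*
 * Worked example (hypothetical xid): for xid 0x80000001, nfsrc_hash()
 * computes (0x80000001 + (0x80000001 >> 24)) % NFSRVCACHE_HASHSIZE, i.e.
 * 0x80000081 % NFSRVCACHE_HASHSIZE, folding the high-order byte into the
 * low bits so xids differing only in their upper bits still spread across
 * buckets. A retransmission carries the same xid and therefore always
 * hashes to the same bucket.
 */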
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	0,			/* NULL */
	/* ... remaining per-procedure status-only flags elided ... */
};
/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
	(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)
/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);
/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}
/*
 * Initialize the server request cache list.
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
		LIST_INIT(&nfsrcahash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	nfsstatsv1.srvcache_tcppeak = 0;
	nfsstatsv1.srvcache_size = 0;
}
/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	newrp = malloc(sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}
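/*
 * Caller contract (editorial sketch, grounded in the return values used by
 * nfsrc_getudp()/nfsrc_gettcp() below): RC_DOIT means execute the RPC with
 * nd_rp pointing at the new in-progress entry, RC_REPLY means the cached
 * reply has already been copied into the descriptor, and RC_DROPIT means
 * the request is a retry of one still in progress and is discarded without
 * a reply.
 */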
/*
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				nfsstatsv1.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				ret = RC_REPLY;
				*(nd->nd_errp) = rp->rc_status;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
				    M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free(newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done.
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	struct mbuf *m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (rp == NULL)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_wanted(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		nfsstatsv1.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			m_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
				atomic_add_int(&nfsrc_tcpsavedreplies, 1);
				if (nfsrc_tcpsavedreplies >
				    nfsstatsv1.srvcache_tcppeak)
					nfsstatsv1.srvcache_tcppeak =
					    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}
/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}
/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");
	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			nfsstatsv1.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
			    M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free(newrp, M_NFSRVCACHE);
		goto out;
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}
/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}
/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{

	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}
/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		m_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	free(rp, M_NFSRVCACHE);
	atomic_add_int(&nfsstatsv1.srvcache_size, -1);
}
/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	nfsstatsv1.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}
#define	HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
APPLESTATIC void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}
	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}
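/*
 * Worked example for the histogram above (hypothetical numbers): with
 * nfsrc_tcptimeout at 12s and HISTSIZE at 16, an entry due to expire 3s
 * from now lands in slot 3 * 16 / 12 = 4. If the counts in slots 0..4 are
 * the first to exceed "force", the second pass uses
 * thisstamp = tcp_lasttrim + (12 * 5 / 16) = tcp_lasttrim + 3 (integer
 * division), freeing only entries within about 3s of expiry instead of
 * waiting out the full timeout.
 */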
/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}
/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}
/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	struct mbuf *m;

	m = m1;
	while (m) {
		len += m->m_len;
		m = m->m_next;
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}
/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}