/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all costs
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 *      - key on <xid, NFS version> (as noted above, there can be several
 *                                   entries with the same key)
 *      When a request arrives:
 *              For all that match key
 *              - if RPC# != OR request_size !=
 *                      - not a match with this one
 *              - if NFSv4 and received on same TCP socket OR
 *                      received on a TCP connection created before the
 *                      entry was cached
 *                      - not a match with this one
 *                      (V2,3 clients might retry on same TCP socket)
 *              - calculate checksum on first N bytes of NFS XDR
 *              - if checksum !=
 *                      - not a match for this one
 *              If any of the remaining ones that match has a
 *                      seqid_refcnt > 0
 *                      - not a match (go do RPC, using new cache entry)
 *              If one match left
 *                      - a hit (reply from cache)
 *              else
 *                      - miss (go do RPC, using new cache entry)
 *
 *      During processing of NFSv4 request:
 *              - set a flag when a non-idempotent Op is processed
 *              - when an Op that uses a seqid# (Open,...) is processed
 *                      - if same seqid# as referenced entry in cache
 *                              - free new cache entry
 *                              - reply from referenced cache entry
 *                        else if next seqid# in order
 *                              - free referenced cache entry
 *                              - increment seqid_refcnt on new cache entry
 *                              - set pointer from Openowner/Lockowner to
 *                                      new cache entry (aka reference it)
 *                        else if first seqid# in sequence
 *                              - increment seqid_refcnt on new cache entry
 *                              - set pointer from Openowner/Lockowner to
 *                                      new cache entry (aka reference it)
 *
 *      At end of RPC processing:
 *              - if seqid_refcnt > 0 OR flagged non-idempotent on new
 *                      cache entry
 *                      - save reply in cache entry
 *                      - calculate checksum on first N bytes of NFS XDR
 *                              request
 *                      - note op and length of XDR request (in bytes)
 *                      - timestamp it
 *                else
 *                      - free new cache entry
 *              - Send reply (noting info for socket activity check, below)
 *
 *      For cache entries saved above:
 *              - if saved since seqid_refcnt was > 0
 *                      - free when seqid_refcnt decrements to 0
 *                        (when next one in sequence is processed above, or
 *                         when Openowner/Lockowner is discarded)
 *                else { non-idempotent Op(s) }
 *                      - free when
 *                              - some further activity observed on same
 *                                      socket
 *                                (I'm not yet sure how I'm going to do
 *                                 this. Maybe look at the TCP connection
 *                                 to see if the send_tcp_sequence# is well
 *                                 past sent reply OR K additional RPCs
 *                                 replied on same socket OR?)
 *                        OR
 *                              - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *      - if RPC marked In_progress
 *              - discard request (don't send reply)
 *        else
 *              - reply from cache
 *              - timestamp cache entry
 *   else
 *      - add entry to cache, marked In_progress
 *      - do RPC
 *      - when RPC done
 *              - if RPC# non-idempotent
 *                      - mark entry Done (not In_progress)
 *                      - save reply
 *                      - timestamp cache entry
 *                else
 *                      - free cache entry
 *              - send reply
 *
 * Later, entries with saved replies are freed a short time (a few minutes)
 * after the reply is sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *              of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *              pages 53-63. San Diego, February 1989.
 *       for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *      for TCP. For V3, a reply won't be saved when the flood level is
 *      hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *      that case. This level should be set high enough that this almost
 *      never happens.
 */
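
/*
 * Illustrative sketch only (not part of the server): the TCP hit test
 * described above, collapsed into one predicate.  The structure and field
 * names here (drc_key, is_v4, conntime, ...) are hypothetical stand-ins
 * for the nfsrvcache fields compared in nfsrc_gettcp() below.
 *
 *	static int
 *	drc_tcp_match(const struct drc_key *req, const struct drc_key *ent)
 *	{
 *		if (req->xid != ent->xid || req->vers != ent->vers ||
 *		    req->rpcnum != ent->rpcnum || req->reqlen != ent->reqlen)
 *			return (0);
 *		if (req->is_v4 && (req->sockref == ent->sockref ||
 *		    req->conntime < ent->cachetime))
 *			return (0);	(v4 clients retry on a new conn)
 *		return (req->cksum == ent->cksum);
 *	}
 *
 * Even when this predicate is true for exactly one entry, the code below
 * still refuses the hit if that entry has a seqid reference outstanding.
 */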
#include <fs/nfs/nfsport.h>

extern struct mtx nfsrc_udpmtx;

NFSD_VNET_DECLARE(struct nfsrvhashhead *, nfsrvudphashtbl);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrchash_table);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrcahash_table);
NFSD_VNET_DECLARE(struct nfsstatsv1 *, nfsstatsv1_p);

NFSD_VNET_DEFINE(int, nfsrc_floodlevel) = NFSRVCACHE_FLOODLEVEL;
NFSD_VNET_DEFINE(int, nfsrc_tcpsavedreplies) = 0;

SYSCTL_DECL(_vfs_nfsd);

static u_int    nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
        int error, newhighwater;

        newhighwater = nfsrc_tcphighwater;
        error = sysctl_handle_int(oidp, &newhighwater, 0, req);
        if (error != 0 || req->newptr == NULL)
                return (error);
        if (newhighwater < 0)
                return (EINVAL);
        if (newhighwater >= NFSD_VNET(nfsrc_floodlevel))
                NFSD_VNET(nfsrc_floodlevel) = newhighwater + newhighwater / 5;
        nfsrc_tcphighwater = newhighwater;
        return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
    sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");
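
/*
 * Usage example (hypothetical values): raising the high water mark from
 * userland also raises the flood level to 120% of the new mark whenever
 * the new mark would meet or exceed the current flood level:
 *
 *	# sysctl vfs.nfsd.tcphighwater=100000
 *
 * leaves nfsrc_floodlevel at 120000 (100000 + 100000 / 5), so the DRC
 * keeps some headroom before non-idempotent NFSv4 Ops start failing with
 * NFSERR_RESOURCE.
 */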

static u_int    nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int    nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

NFSD_VNET_DEFINE_STATIC(int, nfsrc_udpcachesize) = 0;
NFSD_VNET_DEFINE_STATIC(TAILQ_HEAD(, nfsrvcache), nfsrvudplru);

/*
 * The reverse mapping from generic to NFS Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
        NFSV2PROC_NULL,
        NFSV2PROC_GETATTR,
        NFSV2PROC_SETATTR,
        NFSV2PROC_LOOKUP,
        NFSV2PROC_NOOP,
        NFSV2PROC_READLINK,
        NFSV2PROC_READ,
        NFSV2PROC_WRITE,
        NFSV2PROC_CREATE,
        NFSV2PROC_MKDIR,
        NFSV2PROC_SYMLINK,
        NFSV2PROC_CREATE,
        NFSV2PROC_REMOVE,
        NFSV2PROC_RMDIR,
        NFSV2PROC_RENAME,
        NFSV2PROC_LINK,
        NFSV2PROC_READDIR,
        NFSV2PROC_NOOP,
        NFSV2PROC_STATFS,
        NFSV2PROC_NOOP,
        NFSV2PROC_NOOP,
        NFSV2PROC_NOOP,
};

#define nfsrc_hash(xid) (((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define NFSRCUDPHASH(xid) \
        (&NFSD_VNET(nfsrvudphashtbl)[nfsrc_hash(xid)])
#define NFSRCHASH(xid) \
        (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(xid)].tbl)
#define NFSRCAHASH(xid) (&NFSD_VNET(nfsrcahash_table)[nfsrc_hash(xid)])
#define TRUE    1
#define FALSE   0
#define NFSRVCACHE_CHECKLEN     100
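
/*
 * Worked example of nfsrc_hash() (xid value hypothetical): for
 * xid = 0x12345678, (xid >> 24) is 0x12, so the bucket index is
 * (0x12345678 + 0x12) % NFSRVCACHE_HASHSIZE.  Folding the high-order
 * byte into the sum keeps xids that differ only in their high bits
 * from all landing in the same bucket.
 */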

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        TRUE,
        TRUE,
        TRUE,
        TRUE,
        FALSE,
        TRUE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define NETFAMILY(rp) \
                (((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

        if ((rp->rc_flag & RC_UDP) != 0)
                return (&nfsrc_udpmtx);
        return (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
void
nfsrvd_initcache(void)
{
        int i;

        NFSD_VNET(nfsrvudphashtbl) = malloc(sizeof(struct nfsrvhashhead) *
            NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
        NFSD_VNET(nfsrchash_table) = malloc(sizeof(struct nfsrchash_bucket) *
            NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
        NFSD_VNET(nfsrcahash_table) = malloc(sizeof(struct nfsrchash_bucket) *
            NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                mtx_init(&NFSD_VNET(nfsrchash_table)[i].mtx, "nfsrtc", NULL,
                    MTX_DEF);
                mtx_init(&NFSD_VNET(nfsrcahash_table)[i].mtx, "nfsrtca", NULL,
                    MTX_DEF);
        }
        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                LIST_INIT(&NFSD_VNET(nfsrvudphashtbl)[i]);
                LIST_INIT(&NFSD_VNET(nfsrchash_table)[i].tbl);
                LIST_INIT(&NFSD_VNET(nfsrcahash_table)[i].tbl);
        }
        TAILQ_INIT(&NFSD_VNET(nfsrvudplru));
        NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
        NFSD_VNET(nfsrc_udpcachesize) = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
        struct nfsrvcache *newrp;
        int ret;

        if (nd->nd_procnum == NFSPROC_NULL)
                panic("nfsd cache null");
        newrp = malloc(sizeof (struct nfsrvcache),
            M_NFSRVCACHE, M_WAITOK);
        NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
        if (nd->nd_flag & ND_NFSV4)
                newrp->rc_flag = RC_NFSV4;
        else if (nd->nd_flag & ND_NFSV3)
                newrp->rc_flag = RC_NFSV3;
        else
                newrp->rc_flag = RC_NFSV2;
        newrp->rc_xid = nd->nd_retxid;
        newrp->rc_proc = nd->nd_procnum;
        newrp->rc_sockref = nd->nd_sockref;
        newrp->rc_cachetime = nd->nd_tcpconntime;
        if (nd->nd_flag & ND_SAMETCPCONN)
                newrp->rc_flag |= RC_SAMETCPCONN;
        if (nd->nd_nam2 != NULL) {
                newrp->rc_flag |= RC_UDP;
                ret = nfsrc_getudp(nd, newrp);
        } else {
                ret = nfsrc_gettcp(nd, newrp);
        }
        NFSEXITCODE2(0, nd);
        return (ret);
}
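
/*
 * Sketch of how a caller consumes the return value (hypothetical wrapper;
 * the real consumer is the nfsd request-processing loop):
 *
 *	switch (nfsrvd_getcache(nd)) {
 *	case RC_DOIT:	new entry created; execute the RPC, then call
 *			nfsrvd_updatecache() with the reply
 *	case RC_REPLY:	nd->nd_mreq now holds a copy of the cached
 *			reply; just send it
 *	case RC_DROPIT:	duplicate of a request still in progress;
 *			send nothing
 *	}
 */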

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
        struct nfsrvcache *rp;
        struct sockaddr_in *saddr;
        struct sockaddr_in6 *saddr6;
        struct nfsrvhashhead *hp;
        int ret = 0;
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(newrp);
        hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
        mtx_lock(mutex);
        LIST_FOREACH(rp, hp, rc_hash) {
            if (newrp->rc_xid == rp->rc_xid &&
                newrp->rc_proc == rp->rc_proc &&
                (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
                nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
                        if ((rp->rc_flag & RC_LOCKED) != 0) {
                                rp->rc_flag |= RC_WANTED;
                                (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
                                    "nfsrc", 10 * hz);
                                goto loop;
                        }
                        if (rp->rc_flag == 0)
                                panic("nfs udp cache0");
                        rp->rc_flag |= RC_LOCKED;
                        TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
                        TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
                        if (rp->rc_flag & RC_INPROG) {
                                NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
                                mtx_unlock(mutex);
                                ret = RC_DROPIT;
                        } else if (rp->rc_flag & RC_REPSTATUS) {
                                /*
                                 * V2 only.
                                 */
                                NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                                mtx_unlock(mutex);
                                nfsrvd_rephead(nd);
                                *(nd->nd_errp) = rp->rc_status;
                                ret = RC_REPLY;
                                rp->rc_timestamp = NFSD_MONOSEC +
                                        NFSRVCACHE_UDPTIMEOUT;
                        } else if (rp->rc_flag & RC_REPMBUF) {
                                NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                                mtx_unlock(mutex);
                                nd->nd_mreq = m_copym(rp->rc_reply, 0,
                                        M_COPYALL, M_WAITOK);
                                ret = RC_REPLY;
                                rp->rc_timestamp = NFSD_MONOSEC +
                                        NFSRVCACHE_UDPTIMEOUT;
                        } else {
                                panic("nfs udp cache1");
                        }
                        nfsrc_unlock(rp);
                        free(newrp, M_NFSRVCACHE);
                        goto out;
                }
        }
        NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
        atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);
        NFSD_VNET(nfsrc_udpcachesize)++;

        newrp->rc_flag |= RC_INPROG;
        saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
        if (saddr->sin_family == AF_INET)
                newrp->rc_inet = saddr->sin_addr.s_addr;
        else if (saddr->sin_family == AF_INET6) {
                saddr6 = (struct sockaddr_in6 *)saddr;
                NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
                    sizeof (struct in6_addr));
                newrp->rc_flag |= RC_INETIPV6;
        }
        LIST_INSERT_HEAD(hp, newrp, rc_hash);
        TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), newrp, rc_lru);
        mtx_unlock(mutex);
        nd->nd_rp = newrp;
        ret = RC_DOIT;

out:
        NFSEXITCODE2(0, nd);
        return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
        struct nfsrvcache *rp;
        struct nfsrvcache *retrp = NULL;
        struct mbuf *m;
        struct mtx *mutex;

        rp = nd->nd_rp;
        if (!rp)
                panic("nfsrvd_updatecache null rp");
        nd->nd_rp = NULL;
        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        nfsrc_lock(rp);
        if (!(rp->rc_flag & RC_INPROG))
                panic("nfsrvd_updatecache not inprog");
        rp->rc_flag &= ~RC_INPROG;
        if (rp->rc_flag & RC_UDP) {
                TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
                TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
        }

        /*
         * Reply from cache is a special case returned by nfsrv_checkseqid().
         */
        if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
                NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                mtx_unlock(mutex);
                nd->nd_repstat = 0;
                if (nd->nd_mreq)
                        m_freem(nd->nd_mreq);
                if (!(rp->rc_flag & RC_REPMBUF))
                        panic("reply from cache");
                nd->nd_mreq = m_copym(rp->rc_reply, 0,
                    M_COPYALL, M_WAITOK);
                rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                nfsrc_unlock(rp);
                goto out;
        }

        /*
         * If rc_refcnt > 0, save it
         * For UDP, save it if ND_SAVEREPLY is set
         * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
         */
        if (nd->nd_repstat != NFSERR_DONTREPLY &&
            (rp->rc_refcnt > 0 ||
             ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
             ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
              NFSD_VNET(nfsrc_tcpsavedreplies) <= NFSD_VNET(nfsrc_floodlevel) &&
              nfsrc_tcpnonidempotent))) {
                if (rp->rc_refcnt > 0) {
                        if (!(rp->rc_flag & RC_NFSV4))
                                panic("update_cache refcnt");
                        rp->rc_flag |= RC_REFCNT;
                }
                if ((nd->nd_flag & ND_NFSV2) &&
                    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
                        rp->rc_status = nd->nd_repstat;
                        rp->rc_flag |= RC_REPSTATUS;
                        mtx_unlock(mutex);
                } else {
                        if (!(rp->rc_flag & RC_UDP)) {
                            atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies),
                                1);
                            if (NFSD_VNET(nfsrc_tcpsavedreplies) >
                                NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak)
                                NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak =
                                    NFSD_VNET(nfsrc_tcpsavedreplies);
                        }
                        mtx_unlock(mutex);
                        m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
                        mtx_lock(mutex);
                        rp->rc_reply = m;
                        rp->rc_flag |= RC_REPMBUF;
                        mtx_unlock(mutex);
                }
                if (rp->rc_flag & RC_UDP) {
                        rp->rc_timestamp = NFSD_MONOSEC +
                            NFSRVCACHE_UDPTIMEOUT;
                        nfsrc_unlock(rp);
                } else {
                        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                        if (rp->rc_refcnt > 0)
                                nfsrc_unlock(rp);
                        else
                                retrp = rp;
                }
        } else {
                nfsrc_freecache(rp);
                mtx_unlock(mutex);
        }

out:
        NFSEXITCODE2(0, nd);
        return (retrp);
}
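
/*
 * Note on the return value of nfsrvd_updatecache(): a non-NULL pointer
 * means the entry is still locked, and the caller is expected to pass it
 * to nfsrvd_sentcache() once the reply has been transmitted, so that the
 * TCP sequence number of the reply can be recorded for the ack-based
 * trimming done in nfsrc_trimcache().
 */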

/*
 * Invalidate and, if possible, free an in-progress cache entry.
 * Must not sleep.
 */
void
nfsrvd_delcache(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        if (!(rp->rc_flag & RC_INPROG))
                panic("nfsrvd_delcache not in prog");
        mtx_lock(mutex);
        rp->rc_flag &= ~RC_INPROG;
        if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
                nfsrc_freecache(rp);
        mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
        struct nfsrchash_bucket *hbp;

        KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
        if (have_seq) {
                hbp = NFSRCAHASH(rp->rc_sockref);
                mtx_lock(&hbp->mtx);
                rp->rc_tcpseq = seq;
                if (rp->rc_acked != RC_NO_ACK)
                        LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
                rp->rc_acked = RC_NO_ACK;
                mtx_unlock(&hbp->mtx);
        }
        nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
        struct nfsrvcache *rp, *nextrp;
        int i;
        struct nfsrvcache *hitrp;
        struct nfsrvhashhead *hp, nfsrc_templist;
        int hit, ret = 0;
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(newrp);
        hp = NFSRCHASH(newrp->rc_xid);
        newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
        mtx_lock(mutex);
        hit = 1;
        LIST_INIT(&nfsrc_templist);
        /*
         * Get all the matches and put them on the temp list.
         */
        rp = LIST_FIRST(hp);
        while (rp != LIST_END(hp)) {
                nextrp = LIST_NEXT(rp, rc_hash);
                if (newrp->rc_xid == rp->rc_xid &&
                    (!(rp->rc_flag & RC_INPROG) ||
                     ((newrp->rc_flag & RC_SAMETCPCONN) &&
                      newrp->rc_sockref == rp->rc_sockref)) &&
                    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
                    newrp->rc_proc == rp->rc_proc &&
                    (!(newrp->rc_flag & RC_NFSV4) ||
                     (newrp->rc_sockref != rp->rc_sockref &&
                      newrp->rc_cachetime >= rp->rc_cachetime)) &&
                    newrp->rc_reqlen == rp->rc_reqlen &&
                    newrp->rc_cksum == rp->rc_cksum) {
                        LIST_REMOVE(rp, rc_hash);
                        LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
                }
                rp = nextrp;
        }

        /*
         * Now, use nfsrc_templist to decide if there is a match.
         */
        i = 0;
        LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
                i++;
                if (rp->rc_refcnt > 0) {
                        hit = 0;
                        break;
                }
        }
        /*
         * Can be a hit only if one entry left.
         * Note possible hit entry and put nfsrc_templist back on hash
         * list.
         */
        if (i != 1)
                hit = 0;
        hitrp = rp = LIST_FIRST(&nfsrc_templist);
        while (rp != LIST_END(&nfsrc_templist)) {
                nextrp = LIST_NEXT(rp, rc_hash);
                LIST_REMOVE(rp, rc_hash);
                LIST_INSERT_HEAD(hp, rp, rc_hash);
                rp = nextrp;
        }
        if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
                panic("nfs gettcp cache templist");

        if (hit) {
                rp = hitrp;
                if ((rp->rc_flag & RC_LOCKED) != 0) {
                        rp->rc_flag |= RC_WANTED;
                        (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
                            "nfsrc", 10 * hz);
                        goto tryagain;
                }
                if (rp->rc_flag == 0)
                        panic("nfs tcp cache0");
                rp->rc_flag |= RC_LOCKED;
                if (rp->rc_flag & RC_INPROG) {
                        NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
                        mtx_unlock(mutex);
                        if (newrp->rc_sockref == rp->rc_sockref)
                                nfsrc_marksametcpconn(rp->rc_sockref);
                        ret = RC_DROPIT;
                } else if (rp->rc_flag & RC_REPSTATUS) {
                        /*
                         * V2 only.
                         */
                        NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                        mtx_unlock(mutex);
                        if (newrp->rc_sockref == rp->rc_sockref)
                                nfsrc_marksametcpconn(rp->rc_sockref);
                        ret = RC_REPLY;
                        nfsrvd_rephead(nd);
                        *(nd->nd_errp) = rp->rc_status;
                        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                } else if (rp->rc_flag & RC_REPMBUF) {
                        NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                        mtx_unlock(mutex);
                        if (newrp->rc_sockref == rp->rc_sockref)
                                nfsrc_marksametcpconn(rp->rc_sockref);
                        ret = RC_REPLY;
                        nd->nd_mreq = m_copym(rp->rc_reply, 0,
                                M_COPYALL, M_WAITOK);
                        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                } else {
                        panic("nfs tcp cache1");
                }
                nfsrc_unlock(rp);
                free(newrp, M_NFSRVCACHE);
                goto out;
        }
        NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
        atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);

        /*
         * For TCP, multiple entries for a key are allowed, so don't
         * chain it into the hash table until done.
         */
        newrp->rc_cachetime = NFSD_MONOSEC;
        newrp->rc_flag |= RC_INPROG;
        LIST_INSERT_HEAD(hp, newrp, rc_hash);
        mtx_unlock(mutex);
        nd->nd_rp = newrp;
        ret = RC_DOIT;

out:
        NFSEXITCODE2(0, nd);
        return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        mtx_assert(mutex, MA_OWNED);
        while ((rp->rc_flag & RC_LOCKED) != 0) {
                rp->rc_flag |= RC_WANTED;
                (void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
        }
        rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        rp->rc_flag &= ~RC_LOCKED;
        nfsrc_wanted(rp);
        mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
        if (rp->rc_flag & RC_WANTED) {
                rp->rc_flag &= ~RC_WANTED;
                wakeup((caddr_t)rp);
        }
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
        struct nfsrchash_bucket *hbp;

        LIST_REMOVE(rp, rc_hash);
        if (rp->rc_flag & RC_UDP) {
                TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
                NFSD_VNET(nfsrc_udpcachesize)--;
        } else if (rp->rc_acked != RC_NO_SEQ) {
                hbp = NFSRCAHASH(rp->rc_sockref);
                mtx_lock(&hbp->mtx);
                if (rp->rc_acked == RC_NO_ACK)
                        LIST_REMOVE(rp, rc_ahash);
                mtx_unlock(&hbp->mtx);
        }
        nfsrc_wanted(rp);
        if (rp->rc_flag & RC_REPMBUF) {
                m_freem(rp->rc_reply);
                if (!(rp->rc_flag & RC_UDP))
                        atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies), -1);
        }
        free(rp, M_NFSRVCACHE);
        atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, -1);
}

/*
 * Clean out the cache. Called when the nfsserver module is unloaded.
 */
void
nfsrvd_cleancache(void)
{
        struct nfsrvcache *rp, *nextrp;
        int i;

        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrchash_table)[i].tbl,
                    rc_hash, nextrp)
                        nfsrc_freecache(rp);
        }
        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudphashtbl)[i], rc_hash,
                    nextrp) {
                        nfsrc_freecache(rp);
                }
        }
        NFSD_VNET(nfsstatsv1_p)->srvcache_size = 0;
        NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
}

#define HISTSIZE        16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
        struct nfsrchash_bucket *hbp;
        struct nfsrvcache *rp, *nextrp;
        int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
        time_t thisstamp;
        static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
        static int onethread = 0, oneslot = 0;

        if (sockref != 0) {
                hbp = NFSRCAHASH(sockref);
                mtx_lock(&hbp->mtx);
                LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
                        if (sockref == rp->rc_sockref) {
                                if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
                                        rp->rc_acked = RC_ACK;
                                        LIST_REMOVE(rp, rc_ahash);
                                } else if (final) {
                                        rp->rc_acked = RC_NACK;
                                        LIST_REMOVE(rp, rc_ahash);
                                }
                        }
                }
                mtx_unlock(&hbp->mtx);
        }

        if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
                return;
        if (NFSD_MONOSEC != udp_lasttrim ||
            NFSD_VNET(nfsrc_udpcachesize) >= (nfsrc_udphighwater +
            nfsrc_udphighwater / 2)) {
                mtx_lock(&nfsrc_udpmtx);
                udp_lasttrim = NFSD_MONOSEC;
                TAILQ_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudplru), rc_lru,
                    nextrp) {
                        if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
                             && rp->rc_refcnt == 0
                             && ((rp->rc_flag & RC_REFCNT) ||
                                 udp_lasttrim > rp->rc_timestamp ||
                                 NFSD_VNET(nfsrc_udpcachesize) >
                                 nfsrc_udphighwater))
                                nfsrc_freecache(rp);
                }
                mtx_unlock(&nfsrc_udpmtx);
        }
        if (NFSD_MONOSEC != tcp_lasttrim ||
            NFSD_VNET(nfsrc_tcpsavedreplies) >= nfsrc_tcphighwater) {
                force = nfsrc_tcphighwater / 4;
                if (force > 0 &&
                    NFSD_VNET(nfsrc_tcpsavedreplies) + force >=
                    nfsrc_tcphighwater) {
                        for (i = 0; i < HISTSIZE; i++)
                                time_histo[i] = 0;
                        i = 0;
                        lastslot = NFSRVCACHE_HASHSIZE - 1;
                } else {
                        force = 0;
                        if (NFSD_MONOSEC != tcp_lasttrim) {
                                i = 0;
                                lastslot = NFSRVCACHE_HASHSIZE - 1;
                        } else {
                                lastslot = i = oneslot;
                                if (++oneslot >= NFSRVCACHE_HASHSIZE)
                                        oneslot = 0;
                        }
                }
                tto = nfsrc_tcptimeout;
                tcp_lasttrim = NFSD_MONOSEC;
                for (; i <= lastslot; i++) {
                        mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                        LIST_FOREACH_SAFE(rp,
                            &NFSD_VNET(nfsrchash_table)[i].tbl, rc_hash,
                            nextrp) {
                                if (!(rp->rc_flag &
                                     (RC_INPROG|RC_LOCKED|RC_WANTED))
                                     && rp->rc_refcnt == 0) {
                                        if ((rp->rc_flag & RC_REFCNT) ||
                                            tcp_lasttrim > rp->rc_timestamp ||
                                            rp->rc_acked == RC_ACK) {
                                                nfsrc_freecache(rp);
                                                continue;
                                        }

                                        if (force == 0)
                                                continue;
                                        /*
                                         * The timestamps range from roughly the
                                         * present (tcp_lasttrim) to the present
                                         * + nfsrc_tcptimeout. Generate a simple
                                         * histogram of where the timeouts fall.
                                         */
                                        j = rp->rc_timestamp - tcp_lasttrim;
                                        if (j >= tto)
                                                j = HISTSIZE - 1;
                                        else if (j < 0)
                                                j = 0;
                                        else
                                                j = j * HISTSIZE / tto;
                                        time_histo[j]++;
                                }
                        }
                        mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                }
                if (force) {
                        /*
                         * Trim some more with a smaller timeout of as little
                         * as 20% of nfsrc_tcptimeout to try and get below
                         * 80% of the nfsrc_tcphighwater.
                         */
                        k = 0;
                        for (i = 0; i < (HISTSIZE - 2); i++) {
                                k += time_histo[i];
                                if (k > force)
                                        break;
                        }
                        k = tto * (i + 1) / HISTSIZE;
                        if (k < 1)
                                k = 1;
                        thisstamp = tcp_lasttrim + k;
                        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                                mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                                LIST_FOREACH_SAFE(rp,
                                    &NFSD_VNET(nfsrchash_table)[i].tbl,
                                    rc_hash, nextrp) {
                                        if (!(rp->rc_flag &
                                             (RC_INPROG|RC_LOCKED|RC_WANTED))
                                             && rp->rc_refcnt == 0
                                             && ((rp->rc_flag & RC_REFCNT) ||
                                                 thisstamp > rp->rc_timestamp ||
                                                 rp->rc_acked == RC_ACK))
                                                nfsrc_freecache(rp);
                                }
                                mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                        }
                }
        }
        atomic_store_rel_int(&onethread, 0);
}
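
/*
 * Worked example of the histogram cutoff above (numbers hypothetical):
 * with HISTSIZE 16, tto = 300 seconds and force = 1000, each bin covers
 * 300 / 16 seconds of remaining lifetime.  If the running sum of
 * time_histo[] first exceeds 1000 at i = 2, the reduced timeout becomes
 * k = 300 * 3 / 16 = 56 seconds, so entries whose timestamps fall within
 * the next 56 seconds are freed early to get back under the high water
 * mark.
 */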

/*
 * Add a seqid# reference to the cache entry.
 */
void
nfsrvd_refcache(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        if (rp == NULL)
                /* For NFSv4.1, there is no cache entry. */
                return;
        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        if (rp->rc_refcnt < 0)
                panic("nfs cache refcnt");
        rp->rc_refcnt++;
        mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        if (rp->rc_refcnt <= 0)
                panic("nfs cache derefcnt");
        rp->rc_refcnt--;
        if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
                nfsrc_freecache(rp);
        mtx_unlock(mutex);
}
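
/*
 * Together these implement the seqid_refcnt scheme described at the top
 * of this file: nfsrvd_refcache() is called when an Openowner/Lockowner
 * is set to point at an entry, and nfsrvd_derefcache() when that
 * reference is dropped; the final dereference frees the entry unless it
 * is currently locked or in progress.
 */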

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum)
{
        int len = 0, cklen;
        struct mbuf *m;

        m = m1;
        while (m) {
                len += m->m_len;
                m = m->m_next;
        }
        cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
        *cksum = in_cksum(m1, cklen);
        return (len);
}
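
/*
 * Example (hypothetical chain): for two mbufs with m_len 60 and 80, the
 * function returns 140 and *cksum is the Internet checksum of only the
 * first NFSRVCACHE_CHECKLEN (100) bytes, so two requests that differ
 * beyond the first 100 bytes can still produce equal checksums and must
 * be disambiguated by the other key fields.
 */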

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}