/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 *	- key on <xid, NFS version> (as noted above, there can be several
 *				     entries with the same key)
 *	When a request arrives:
 *		For all that match key
 *		- if RPC# != OR request_size !=
 *			- not a match with this one
 *		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 *			- not a match with this one
 *			(V2,3 clients might retry on same TCP socket)
 *		- calculate checksum on first N bytes of NFS XDR
 *		- if checksum !=
 *			- not a match for this one
 *		If any of the remaining ones that match has a
 *			seqid_refcnt > 0
 *			- not a match (go do RPC, using new cache entry)
 *		If one match left
 *			- a hit (reply from cache)
 *		else
 *			- miss (go do RPC, using new cache entry)
 *
 *	During processing of NFSv4 request:
 *		- set a flag when a non-idempotent Op is processed
 *		- when an Op that uses a seqid# (Open,...) is processed
 *			- if same seqid# as referenced entry in cache
 *				- free new cache entry
 *				- reply from referenced cache entry
 *			  else if next seqid# in order
 *				- free referenced cache entry
 *				- increment seqid_refcnt on new cache entry
 *				- set pointer from Openowner/Lockowner to
 *					new cache entry (aka reference it)
 *			  else if first seqid# in sequence
 *				- increment seqid_refcnt on new cache entry
 *				- set pointer from Openowner/Lockowner to
 *					new cache entry (aka reference it)
 *
 *	At end of RPC processing:
 *		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 *			cache entry
 *			- save reply in cache entry
 *			- calculate checksum on first N bytes of NFS XDR
 *				request
 *			- note op and length of XDR request (in bytes)
 *			- mark entry as DONE
 *		  else
 *			- free new cache entry
 *		- Send reply (noting info for socket activity check, below)
 *
 *	For cache entries saved above:
 *		- if saved since seqid_refcnt was > 0
 *			- free when seqid_refcnt decrements to 0
 *			  (when next one in sequence is processed above, or
 *			   when Openowner/Lockowner is discarded)
 *		  else { non-idempotent Op(s) }
 *			- free when
 *				- some further activity observed on same
 *					socket
 *				  (I'm not yet sure how I'm going to do
 *				   this. Maybe look at the TCP connection
 *				   to see if the send_tcp_sequence# is well
 *				   past sent reply OR K additional RPCs
 *				   replied on same socket OR?)
 *			  OR
 *				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *	- if RPC marked In_progress
 *		- discard request (don't send reply)
 *	  else
 *		- reply from cache
 *		- timestamp cache entry
 *   else
 *	- add entry to cache, marked In_progress
 *	- do RPC
 *	- when RPC done
 *		- if RPC# non-idempotent
 *			- mark entry Done (not In_progress)
 *			- save reply
 *			- timestamp cache entry
 *		  else
 *			- free cache entry
 *		- send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *		of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *		pages 53-63. San Diego, February 1989.
 *
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 * for TCP. For V3, a reply won't be saved when the flood level is
 * hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 * that case. This level should be set high enough that this almost
 * never happens.
 */
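/*
 * To make the TCP matching rules above concrete, the predicate applied by
 * nfsrc_gettcp() below boils down to roughly the following sketch.
 * drc_tcp_match() is a hypothetical helper used only for illustration;
 * the real code also admits in-progress entries retried on the same
 * connection (RC_SAMETCPCONN) and defers the seqid_refcnt test until all
 * candidates have been gathered on a temporary list:
 *
 *	static int
 *	drc_tcp_match(struct nfsrvcache *new, struct nfsrvcache *old)
 *	{
 *		return (new->rc_xid == old->rc_xid &&
 *		    (new->rc_flag & old->rc_flag & RC_NFSVERS) != 0 &&
 *		    new->rc_proc == old->rc_proc &&
 *		    (new->rc_flag & RC_NFSV4) != 0 &&
 *		    new->rc_sockref != old->rc_sockref &&
 *		    new->rc_cachetime >= old->rc_cachetime &&
 *		    new->rc_reqlen == old->rc_reqlen &&
 *		    new->rc_cksum == old->rc_cksum);
 *	}
 */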
#include <fs/nfs/nfsport.h>

extern struct mtx nfsrc_udpmtx;

NFSD_VNET_DECLARE(struct nfsrvhashhead *, nfsrvudphashtbl);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrchash_table);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrcahash_table);
NFSD_VNET_DECLARE(struct nfsstatsv1 *, nfsstatsv1_p);

NFSD_VNET_DEFINE(int, nfsrc_floodlevel) = NFSRVCACHE_FLOODLEVEL;
NFSD_VNET_DEFINE(int, nfsrc_tcpsavedreplies) = 0;

SYSCTL_DECL(_vfs_nfsd);

static u_int nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= NFSD_VNET(nfsrc_floodlevel))
		NFSD_VNET(nfsrc_floodlevel) = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}

SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
    sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");
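/*
 * Illustrative usage (hypothetical value): after
 *
 *	sysctl vfs.nfsd.tcphighwater=10000
 *
 * sysctl_tcphighwater() stores the new mark and, if 10000 meets or exceeds
 * the current nfsrc_floodlevel, raises the flood level to
 * 10000 + 10000 / 5 = 12000, keeping it 20% above the high water mark.
 */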

static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

NFSD_VNET_DEFINE_STATIC(int, nfsrc_udpcachesize) = 0;
NFSD_VNET_DEFINE_STATIC(TAILQ_HEAD(, nfsrvcache), nfsrvudplru);

/*
 * and the reverse mapping from generic to Version 2 procedure numbers
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	/* ... (the per-procedure NFSv2 mapping entries are elided) ... */
};

#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&NFSD_VNET(nfsrvudphashtbl)[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&NFSD_VNET(nfsrchash_table)[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&NFSD_VNET(nfsrcahash_table)[nfsrc_hash(xid)])
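/*
 * For example, an xid of 0x12345678 hashes as
 * (0x12345678 + 0x12) % NFSRVCACHE_HASHSIZE: the xid's high-order byte
 * perturbs the bucket index as well as its low-order bits.
 */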
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	/* ... (the per-procedure status-only flags are elided) ... */
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
	(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
void
nfsrvd_initcache(void)
{
	int i;

	NFSD_VNET(nfsrvudphashtbl) = malloc(sizeof(struct nfsrvhashhead) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	NFSD_VNET(nfsrchash_table) = malloc(sizeof(struct nfsrchash_bucket) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	NFSD_VNET(nfsrcahash_table) = malloc(sizeof(struct nfsrchash_bucket) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_init(&NFSD_VNET(nfsrchash_table)[i].mtx, "nfsrtc", NULL,
		    MTX_DEF);
		mtx_init(&NFSD_VNET(nfsrcahash_table)[i].mtx, "nfsrtca", NULL,
		    MTX_DEF);
	}
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&NFSD_VNET(nfsrvudphashtbl)[i]);
		LIST_INIT(&NFSD_VNET(nfsrchash_table)[i].tbl);
		LIST_INIT(&NFSD_VNET(nfsrcahash_table)[i].tbl);
	}
	TAILQ_INIT(&NFSD_VNET(nfsrvudplru));
	NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
	NFSD_VNET(nfsrc_udpcachesize) = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	newrp = malloc(sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else
		ret = nfsrc_gettcp(nd, newrp);
	NFSEXITCODE2(0, nd);
	return (ret);
}
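/*
 * A caller dispatches on the returned code along these lines (an
 * illustrative sketch only; send_reply() and do_rpc() are hypothetical
 * stand-ins for the real nfsd code paths):
 *
 *	switch (nfsrvd_getcache(nd)) {
 *	case RC_DROPIT:
 *		break;			(retry of an in-progress RPC)
 *	case RC_REPLY:
 *		send_reply(nd);		(cached reply now in nd->nd_mreq)
 *		break;
 *	case RC_DOIT:
 *		do_rpc(nd);		(miss; cache the result via
 *		break;			 nfsrvd_updatecache() afterwards)
 *	}
 */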

/*
 * Get a cache entry for UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
			TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
				    M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free(newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);
	NFSD_VNET(nfsrc_udpcachesize)++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	struct mbuf *m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
		TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			m_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      NFSD_VNET(nfsrc_tcpsavedreplies) <= NFSD_VNET(nfsrc_floodlevel) &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
				atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies),
				    1);
				if (NFSD_VNET(nfsrc_tcpsavedreplies) >
				    NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak)
					NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak =
					    NFSD_VNET(nfsrc_tcpsavedreplies);
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}
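/*
 * The sequence number recorded above pairs with the ack check in
 * nfsrc_trimcache(): once TCP's snd_una reaches rc_tcpseq, the client has
 * acknowledged receipt of the reply, so the entry may be trimmed early.
 * For example, a reply whose last byte was sent at sequence number 5000
 * becomes trimmable as soon as nfsrc_trimcache() is called with
 * snd_una >= 5000 for the same rc_sockref.
 */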

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
			    M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free(newrp, M_NFSRVCACHE);
		goto out;
	}
	NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
		NFSD_VNET(nfsrc_udpcachesize)--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		m_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies), -1);
	}
	free(rp, M_NFSRVCACHE);
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrchash_table)[i].tbl,
		    rc_hash, nextrp)
			nfsrc_freecache(rp);
	}
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudphashtbl)[i], rc_hash,
		    nextrp) {
			nfsrc_freecache(rp);
		}
	}
	NFSD_VNET(nfsstatsv1_p)->srvcache_size = 0;
	NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
}

#define	HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    NFSD_VNET(nfsrc_udpcachesize) >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudplru), rc_lru,
		    nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 NFSD_VNET(nfsrc_udpcachesize) >
				 nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    NFSD_VNET(nfsrc_tcpsavedreplies) >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    NFSD_VNET(nfsrc_tcpsavedreplies) + force >=
		    nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
			LIST_FOREACH_SAFE(rp,
			    &NFSD_VNET(nfsrchash_table)[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
				LIST_FOREACH_SAFE(rp,
				    &NFSD_VNET(nfsrchash_table)[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}
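/*
 * Worked example with illustrative numbers: with HISTSIZE 16 and
 * tto = 480 seconds, each histogram slot covers 30 seconds of remaining
 * lifetime. If force is 1000 and the first three slots hold 400, 500 and
 * 300 entries, the loop above stops at i == 2 (400 + 500 + 300 > 1000),
 * giving k = 480 * 3 / 16 = 90, so the second pass frees every entry due
 * to expire within the next 90 seconds.
 */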

/*
 * Add a seqid# reference to the cache entry.
 */
void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	struct mbuf *m;

	m = m1;
	while (m) {
		len += m->m_len;
		m = m->m_next;
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}
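/*
 * Note that only the first NFSRVCACHE_CHECKLEN (100) bytes feed the
 * checksum: two requests agreeing on xid, version, procedure, total
 * length and their first 100 XDR bytes would still collide. The design
 * notes at the top of this file accept that small risk rather than
 * checksumming the entire request.
 */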

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}