/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 *	- key on <xid, NFS version> (as noted above, there can be several
 *				     entries with the same key)
 *	When a request arrives:
 *		For all that match key
 *		- if RPC# != OR request_size !=
 *			- not a match with this one
 *		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 *			- not a match with this one
 *			(V2,3 clients might retry on same TCP socket)
 *		- calculate checksum on first N bytes of NFS XDR
 *		- if checksum !=
 *			- not a match for this one
 *		If any of the remaining ones that match has a
 *			seqid_refcnt > 0
 *			- not a match (go do RPC, using new cache entry)
 *		If one match left
 *			- a hit (reply from cache)
 *		else
 *			- miss (go do RPC, using new cache entry)
 *
 *	During processing of NFSv4 request:
 *		- set a flag when a non-idempotent Op is processed
 *		- when an Op that uses a seqid# (Open,...) is processed
 *			- if same seqid# as referenced entry in cache
 *				- free new cache entry
 *				- reply from referenced cache entry
 *			  else if next seqid# in order
 *				- free referenced cache entry
 *				- increment seqid_refcnt on new cache entry
 *				- set pointer from Openowner/Lockowner to
 *					new cache entry (aka reference it)
 *			  else if first seqid# in sequence
 *				- increment seqid_refcnt on new cache entry
 *				- set pointer from Openowner/Lockowner to
 *					new cache entry (aka reference it)
 *
 *	At end of RPC processing:
 *		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 *			cache entry
 *			- save reply in cache entry
 *			- calculate checksum on first N bytes of NFS XDR
 *				request
 *			- note op and length of XDR request (in bytes)
 *			- mark it done
 *		  else
 *			- free new cache entry
 *		- Send reply (noting info for socket activity check, below)
 *
 *	For cache entries saved above:
 *		- if saved since seqid_refcnt was > 0
 *			- free when seqid_refcnt decrements to 0
 *			  (when next one in sequence is processed above, or
 *			   when Openowner/Lockowner is discarded)
 *		  else { non-idempotent Op(s) }
 *			- free when
 *				- some further activity observed on same
 *					socket
 *				  (I'm not yet sure how I'm going to do
 *				   this. Maybe look at the TCP connection
 *				   to see if the send_tcp_sequence# is well
 *				   past sent reply OR K additional RPCs
 *				   replied on same socket OR?)
 *				OR
 *				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *	- if RPC marked In_progress
 *		- discard request (don't send reply)
 *	  else
 *		- reply from cache
 *		- timestamp cache entry
 *   else
 *	- add entry to cache, marked In_progress
 *	- do RPC
 *	- when RPC done
 *		- if RPC# non-idempotent
 *			- mark entry Done (not In_progress)
 *			- save reply
 *			- timestamp cache entry
 *		  else
 *			- free cache entry
 *		- send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *	of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *	pages 53-63. San Diego, February 1989.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
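/*
 * In the code below, the TCP side of this design is implemented by
 * nfsrc_gettcp() (the match/miss decision), nfsrvd_updatecache() (saving
 * the reply), nfsrvd_sentcache() and nfsrc_trimcache() (the socket
 * activity check via TCP sequence numbers) and nfsrvd_refcache()/
 * nfsrvd_derefcache() (the seqid_refcnt handling).  The UDP side is
 * nfsrc_getudp() plus the LRU list trimmed in nfsrc_trimcache().
 */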
#include <fs/nfs/nfsport.h>

extern struct mtx nfsrc_udpmtx;

NFSD_VNET_DECLARE(struct nfsrvhashhead *, nfsrvudphashtbl);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrchash_table);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrcahash_table);
NFSD_VNET_DECLARE(struct nfsstatsv1 *, nfsstatsv1_p);

NFSD_VNET_DEFINE(int, nfsrc_floodlevel) = NFSRVCACHE_FLOODLEVEL;
NFSD_VNET_DEFINE(int, nfsrc_tcpsavedreplies) = 0;

SYSCTL_DECL(_vfs_nfsd);

static u_int nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	/* Keep the flood level at least 20% above the high water mark. */
	if (newhighwater >= NFSD_VNET(nfsrc_floodlevel))
		NFSD_VNET(nfsrc_floodlevel) = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
    sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");
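/*
 * For example, a (hypothetical) tuning command from userland such as
 *	sysctl vfs.nfsd.tcphighwater=5000
 * raises the trim threshold and, via the handler above, floats
 * nfsrc_floodlevel to at least 20% above the new high water mark.
 */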
static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

NFSD_VNET_DEFINE_STATIC(int, nfsrc_udpcachesize) = 0;
NFSD_VNET_DEFINE_STATIC(TAILQ_HEAD(, nfsrvcache), nfsrvudplru);
/*
 * and the reverse mapping from generic to Version 2 procedure numbers
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};
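/*
 * The xid hash below folds the high-order byte into the low bits before
 * the modulo, so xids that differ only near the top still spread across
 * buckets.  E.g., for a client whose xids step by 0x01000000, the term
 * (xid >> 24) changes on every request and the buckets rotate instead
 * of every request landing in slot (xid % NFSRVCACHE_HASHSIZE).
 */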
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&NFSD_VNET(nfsrvudphashtbl)[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&NFSD_VNET(nfsrchash_table)[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&NFSD_VNET(nfsrcahash_table)[nfsrc_hash(xid)])
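/*
 * Three tables share this hash: nfsrvudphashtbl holds the at-most-one-
 * per-key UDP entries, nfsrchash_table holds the possibly-duplicated
 * TCP entries, and nfsrcahash_table chains TCP entries by rc_sockref so
 * that nfsrc_trimcache() can match socket acknowledgements to saved
 * replies.
 */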
#define	NFSRVCACHE_CHECKLEN	100
/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,	/* NULL */
	FALSE,	/* GETATTR */
	FALSE,	/* SETATTR */
	FALSE,	/* ROOT */
	FALSE,	/* LOOKUP */
	FALSE,	/* READLINK */
	FALSE,	/* READ */
	FALSE,	/* WRITECACHE */
	FALSE,	/* WRITE */
	FALSE,	/* CREATE */
	TRUE,	/* REMOVE */
	TRUE,	/* RENAME */
	TRUE,	/* LINK */
	TRUE,	/* SYMLINK */
	FALSE,	/* MKDIR */
	TRUE,	/* RMDIR */
	FALSE,	/* READDIR */
	FALSE,	/* STATFS */
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};
/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)
/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);
/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(rp->rc_xid)].mtx);
}
/*
 * Initialize the server request cache list
 */
void
nfsrvd_initcache(void)
{
	int i;

	NFSD_VNET(nfsrvudphashtbl) = malloc(sizeof(struct nfsrvhashhead) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	NFSD_VNET(nfsrchash_table) = malloc(sizeof(struct nfsrchash_bucket) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	NFSD_VNET(nfsrcahash_table) = malloc(sizeof(struct nfsrchash_bucket) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_init(&NFSD_VNET(nfsrchash_table)[i].mtx, "nfsrtc", NULL,
		    MTX_DEF);
		mtx_init(&NFSD_VNET(nfsrcahash_table)[i].mtx, "nfsrtca", NULL,
		    MTX_DEF);
	}
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&NFSD_VNET(nfsrvudphashtbl)[i]);
		LIST_INIT(&NFSD_VNET(nfsrchash_table)[i].tbl);
		LIST_INIT(&NFSD_VNET(nfsrcahash_table)[i].tbl);
	}
	TAILQ_INIT(&NFSD_VNET(nfsrvudplru));
	NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
	NFSD_VNET(nfsrc_udpcachesize) = 0;
}
/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	newrp = malloc(sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}
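/*
 * Both nfsrc_getudp() and nfsrc_gettcp() hand back one of three verdicts:
 * RC_DROPIT (a retry of a request still in progress, so discard it),
 * RC_REPLY (nd_mreq has been filled in from the cached reply), or
 * RC_DOIT (a new entry was created and marked RC_INPROG; do the RPC).
 */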
/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
		if (newrp->rc_xid == rp->rc_xid &&
		    newrp->rc_proc == rp->rc_proc &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
			TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
				    M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free(newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);
	NFSD_VNET(nfsrc_udpcachesize)++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), newrp, rc_lru);
	nd->nd_rp = newrp;
	mtx_unlock(mutex);
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}
/*
 * Update a request cache entry after the rpc has been done
 */
struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	struct mbuf *m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
		TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			m_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      NFSD_VNET(nfsrc_tcpsavedreplies) <= NFSD_VNET(nfsrc_floodlevel) &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
				atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies),
				    1);
				if (NFSD_VNET(nfsrc_tcpsavedreplies) >
				    NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak)
					NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak =
					    NFSD_VNET(nfsrc_tcpsavedreplies);
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}
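/*
 * A non-NULL return value from nfsrvd_updatecache() must be passed to
 * nfsrvd_sentcache() once the reply has actually been sent, so that the
 * entry can record the TCP sequence number of the reply and then be
 * unlocked.
 */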
/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}
/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}
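/*
 * The rc_tcpseq saved above is compared against the connection's
 * snd_una in nfsrc_trimcache(); once the client has acknowledged that
 * sequence number, it is known to have received the reply and the saved
 * copy can be released early instead of waiting out the full DRC
 * timeout.
 */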
/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
			    M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free(newrp, M_NFSRVCACHE);
		goto out;
	}
	NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	nd->nd_rp = newrp;
	mtx_unlock(mutex);
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}
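/*
 * Pulling all candidates onto nfsrc_templist first lets the single
 * "one match left and no seqid# reference" test above be made over the
 * complete candidate set while the bucket mutex is held, which is what
 * biases the algorithm towards false misses rather than false hits.
 */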
/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}
/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}
/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{

	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}
/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
		NFSD_VNET(nfsrc_udpcachesize)--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		m_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies), -1);
	}
	free(rp, M_NFSRVCACHE);
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, -1);
}
/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrchash_table)[i].tbl,
		    rc_hash, nextrp)
			nfsrc_freecache(rp);
	}
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudphashtbl)[i], rc_hash,
		    nextrp) {
			nfsrc_freecache(rp);
		}
	}
	NFSD_VNET(nfsstatsv1_p)->srvcache_size = 0;
	NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
}
#define	HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    NFSD_VNET(nfsrc_udpcachesize) >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudplru), rc_lru,
		    nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 NFSD_VNET(nfsrc_udpcachesize) >
				 nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    NFSD_VNET(nfsrc_tcpsavedreplies) >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    NFSD_VNET(nfsrc_tcpsavedreplies) + force >=
		    nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
			LIST_FOREACH_SAFE(rp,
			    &NFSD_VNET(nfsrchash_table)[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}
					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
				LIST_FOREACH_SAFE(rp,
				    &NFSD_VNET(nfsrchash_table)[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}
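/*
 * Worked example of the forced trim above: with HISTSIZE 16, each
 * histogram slot covers tto/16 seconds of remaining lifetime.  If, say,
 * the first three slots together hold more than `force' entries, the
 * loop breaks at i == 2, k becomes tto * 3 / 16, and every unreferenced
 * entry whose timestamp falls before tcp_lasttrim + k is reaped rather
 * than waiting out the full nfsrc_tcptimeout.
 */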
/*
 * Add a seqid# reference to the cache entry.
 */
void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}
/*
 * Dereference a seqid# cache entry.
 */
void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}
/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	struct mbuf *m;

	m = m1;
	while (m) {
		len += m->m_len;
		m = m->m_next;
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}
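/*
 * Only the first NFSRVCACHE_CHECKLEN (100) bytes of the request XDR are
 * checksummed; combined with the xid, procedure and total-length checks
 * in nfsrc_gettcp(), that is enough to make a false hit very unlikely
 * while keeping the per-request cost small.
 */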
/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}