/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *				entries with the same key)
 *   When a request arrives:
 *   For all that match key
 *   - if RPC# != OR request_size !=
 *     - not a match with this one
 *   - if NFSv4 and received on same TCP socket OR
 *	   received on a TCP connection created before the
 *	   entry was cached
 *     - not a match with this one
 *     (V2,3 clients might retry on same TCP socket)
 *   - calculate checksum on first N bytes of NFS XDR
 *   - if checksum !=
 *     - not a match for this one
 *   If any of the remaining ones that match has a
 *	 seqid_refcnt > 0
 *     - not a match (go do RPC, using new cache entry)
 *   If one match left
 *     - a hit (reply from cache)
 *   else
 *     - miss (go do RPC, using new cache entry)
 *
 *   During processing of NFSv4 request:
 *   - set a flag when a non-idempotent Op is processed
 *   - when an Op that uses a seqid# (Open,...) is processed
 *     - if same seqid# as referenced entry in cache
 *	 - free new cache entry
 *	 - reply from referenced cache entry
 *     else if next seqid# in order
 *	 - free referenced cache entry
 *	 - increment seqid_refcnt on new cache entry
 *	 - set pointer from Openowner/Lockowner to
 *	   new cache entry (aka reference it)
 *     else if first seqid# in sequence
 *	 - increment seqid_refcnt on new cache entry
 *	 - set pointer from Openowner/Lockowner to
 *	   new cache entry (aka reference it)
 *
 *   At end of RPC processing:
 *   - if seqid_refcnt > 0 OR flagged non-idempotent on new
 *	   cache entry
 *     - save reply in cache entry
 *     - calculate checksum on first N bytes of NFS XDR
 *	 request
 *     - note op and length of XDR request (in bytes)
 *   else
 *     - free new cache entry
 *   - Send reply (noting info for socket activity check, below)
 *
 *   For cache entries saved above:
 *   - if saved since seqid_refcnt was > 0
 *     - free when seqid_refcnt decrements to 0
 *	 (when next one in sequence is processed above, or
 *	  when Openowner/Lockowner is discarded)
 *   else { non-idempotent Op(s) }
 *     - free when
 *	 - some further activity observed on same
 *	   socket
 *	   (I'm not yet sure how I'm going to do
 *	    this. Maybe look at the TCP connection
 *	    to see if the send_tcp_sequence# is well
 *	    past sent reply OR K additional RPCs
 *	    replied on same socket OR?)
 *	 OR
 *	 - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *   - if RPC marked In_progress
 *     - discard request (don't send reply)
 *   else
 *     - reply from cache
 *     - timestamp cache entry
 * else
 *   - add entry to cache, marked In_progress
 *   - do RPC
 *   - when RPC done
 *     - if RPC# non-idempotent
 *	 - mark entry Done (not In_progress)
 *	 - save reply
 *	 - timestamp cache entry
 *     else
 *	 - free cache entry
 *     - send reply
 *
 * Later, entries with saved replies are free'd a short time (a few minutes)
 * after the reply is sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *	of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *	pages 53-63. San Diego, February 1989, for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
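/*
 * Illustrative sketch only (an editorial summary, not driver code): per the
 * design notes above, the TCP lookup reduces to roughly this per-candidate
 * predicate; the authoritative test is in nfsrc_gettcp() below.
 *
 *	match := new->rc_xid == old->rc_xid &&
 *	    (new->rc_flag & old->rc_flag & RC_NFSVERS) != 0 &&
 *	    new->rc_proc == old->rc_proc &&
 *	    new->rc_reqlen == old->rc_reqlen &&
 *	    new->rc_cksum == old->rc_cksum &&
 *	    !(NFSv4 && (same TCP socket ||
 *	      connection created before the entry was cached))
 */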
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstatsv1 nfsstatsv1;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */
SYSCTL_DECL(_vfs_nfsd);

static u_int	nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		/* Keep the flood level 20% above the high water mark. */
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
    sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");
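/*
 * Example (hypothetical administrative usage): raising the high water mark
 * at runtime with sysctl(8),
 *
 *	sysctl vfs.nfsd.tcphighwater=100000
 *
 * also raises nfsrc_floodlevel to 120% of the new value via the handler
 * above, so the flood level never caps saved replies below the high water
 * mark.
 */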
static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");
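/*
 * Example (hypothetical administrative usage): setting
 *
 *	sysctl vfs.nfsd.cachetcp=0
 *
 * disables saving of replies for NFS over TCP, trading protection against
 * retried non-idempotent RPCs for lower memory use; see the ND_SAVEREPLY
 * test in nfsrvd_updatecache() below.
 */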
static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * The reverse mapping from generic to Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	/* ... remaining NFSv2 procedure numbers elided ... */
};
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&nfsrcahash_table[nfsrc_hash(xid)])
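/*
 * Worked example (hypothetical xid): for xid 0x80000001, nfsrc_hash()
 * computes (0x80000001 + (0x80000001 >> 24)) % NFSRVCACHE_HASHSIZE, i.e.
 * 0x80000081 % NFSRVCACHE_HASHSIZE, folding the high-order byte into the
 * low bits so xids differing only in their upper bits still spread across
 * buckets. A retransmission carries the same xid and therefore always
 * hashes to the same bucket.
 */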
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	0,			/* NULL */
	/* ... remaining per-procedure status-only flags elided ... */
};
/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
	(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)
/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);
/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}
/*
 * Initialize the server request cache list.
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
		LIST_INIT(&nfsrcahash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	nfsstatsv1.srvcache_tcppeak = 0;
	nfsstatsv1.srvcache_size = 0;
}
/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	newrp = malloc(sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}
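/*
 * Caller contract (editorial sketch, grounded in the return values used by
 * nfsrc_getudp()/nfsrc_gettcp() below): RC_DOIT means execute the RPC with
 * nd_rp pointing at the new in-progress entry, RC_REPLY means the cached
 * reply has already been copied into the descriptor, and RC_DROPIT means
 * the request is a retry of one still in progress and is discarded without
 * a reply.
 */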
/*
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				nfsstatsv1.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				ret = RC_REPLY;
				*(nd->nd_errp) = rp->rc_status;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
				    M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free(newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done.
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	struct mbuf *m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (rp == NULL)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_wanted(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		nfsstatsv1.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			m_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
				atomic_add_int(&nfsrc_tcpsavedreplies, 1);
				if (nfsrc_tcpsavedreplies >
				    nfsstatsv1.srvcache_tcppeak)
					nfsstatsv1.srvcache_tcppeak =
					    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}
/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}
/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");
	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			nfsstatsv1.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
			    M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free(newrp, M_NFSRVCACHE);
		goto out;
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}
/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}
/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{

	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}
/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		m_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	free(rp, M_NFSRVCACHE);
	atomic_add_int(&nfsstatsv1.srvcache_size, -1);
}
/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	nfsstatsv1.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}
#define	HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
APPLESTATIC void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}
	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}
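/*
 * Worked example for the histogram above (hypothetical numbers): with
 * nfsrc_tcptimeout at 12s and HISTSIZE at 16, an entry due to expire 3s
 * from now lands in slot 3 * 16 / 12 = 4. If the counts in slots 0..4 are
 * the first to exceed "force", the second pass uses
 * thisstamp = tcp_lasttrim + (12 * 5 / 16) = tcp_lasttrim + 3 (integer
 * division), freeing only entries within about 3s of expiry instead of
 * waiting out the full timeout.
 */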
/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}
/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}
/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	struct mbuf *m;

	m = m1;
	while (m) {
		len += m->m_len;
		m = m->m_next;
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}
/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}