/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *   entries with the same key)
 *	When a request arrives:
 *		For all that match key
 *		- if RPC# != OR request_size !=
 *			- not a match with this one
 *		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 *			- not a match with this one
 *			(V2,3 clients might retry on same TCP socket)
 *		- calculate checksum on first N bytes of NFS XDR
 *		- if checksum !=
 *			- not a match for this one
 *		If any of the remaining ones that match has a
 *			seqid_refcnt > 0
 *			- not a match (go do RPC, using new cache entry)
 *		If one match left
 *			- a hit (reply from cache)
 *		else
 *			- miss (go do RPC, using new cache entry)
 *
 *	During processing of NFSv4 request:
 *		- set a flag when a non-idempotent Op is processed
 *		- when an Op that uses a seqid# (Open,...) is processed
 *			- if same seqid# as referenced entry in cache
 *				- free new cache entry
 *				- reply from referenced cache entry
 *			  else if next seqid# in order
 *				- free referenced cache entry
 *				- increment seqid_refcnt on new cache entry
 *				- set pointer from Openowner/Lockowner to
 *				  new cache entry (aka reference it)
 *			  else if first seqid# in sequence
 *				- increment seqid_refcnt on new cache entry
 *				- set pointer from Openowner/Lockowner to
 *				  new cache entry (aka reference it)
 *
 *	At end of RPC processing:
 *		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 *			cache entry
 *			- save reply in cache entry
 *			- calculate checksum on first N bytes of NFS XDR
 *				request
 *			- note op and length of XDR request (in bytes)
 *			- mark entry as DONE
 *		  else
 *			- free new cache entry
 *		- Send reply (noting info for socket activity check, below)
 *
 *	For cache entries saved above:
 *		- if saved since seqid_refcnt was > 0
 *			- free when seqid_refcnt decrements to 0
 *			  (when next one in sequence is processed above, or
 *			   when Openowner/Lockowner is discarded)
 *		  else { non-idempotent Op(s) }
 *			- free when
 *				- some further activity observed on same
 *					connection
 *				  (I'm not yet sure how I'm going to do
 *				   this. Maybe look at the TCP connection
 *				   to see if the send_tcp_sequence# is well
 *				   past sent reply OR K additional RPCs
 *				   replied on same socket OR?)
 *			  OR
 *				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *	(at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *	- if RPC marked In_progress
 *		- discard request (don't send reply)
 *	  else
 *		- reply from cache
 *		- timestamp cache entry
 *   else
 *	- add entry to cache, marked In_progress
 *	- do RPC
 *	- when done
 *		- if RPC# non-idempotent
 *			- mark entry Done (not In_progress)
 *			- save reply
 *			- timestamp cache entry
 *		  else
 *			- free cache entry
 *		- send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *	of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *	pages 53-63. San Diego, February 1989. (This reference covers the
 *	UDP case.)
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
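/*
 * Illustrative sketch (added; not part of the original source): the TCP
 * match test described above, reduced to a single predicate over the
 * struct nfsrvcache fields used below. The helper name drc_tcp_match is
 * hypothetical and the block is kept under #if 0; the real test in
 * nfsrc_gettcp() additionally handles in-progress entries and
 * same-connection retries.
 */
#if 0
static int
drc_tcp_match(struct nfsrvcache *newrp, struct nfsrvcache *rp)
{

	return (newrp->rc_xid == rp->rc_xid &&			/* same xid */
	    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) != 0 &&	/* version */
	    newrp->rc_proc == rp->rc_proc &&			/* same RPC# */
	    newrp->rc_reqlen == rp->rc_reqlen &&		/* same size */
	    newrp->rc_cksum == rp->rc_cksum);			/* checksum */
}
#endif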
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstats newnfsstats;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */
SYSCTL_DECL(_vfs_nfsd);

static u_int nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");
static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");
static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * and the reverse mapping from generic to Version 2 procedure numbers
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_MKNOD,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
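/*
 * Worked example (added for clarity): for xid 0x87654321, nfsrc_hash()
 * folds the high-order byte into the low bits before taking the modulus:
 *	(0x87654321 + (0x87654321 >> 24)) % NFSRVCACHE_HASHSIZE
 *	= (0x87654321 + 0x87) % NFSRVCACHE_HASHSIZE
 * so every retransmission of a request (same xid) lands in the same
 * bucket of both the UDP and TCP tables.
 */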
#define	NFSRVCACHE_CHECKLEN	100
/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,	/* NULL */
	FALSE,	/* GETATTR */
	FALSE,	/* SETATTR */
	FALSE,	/* LOOKUP */
	FALSE,	/* ACCESS */
	FALSE,	/* READLINK */
	FALSE,	/* READ */
	FALSE,	/* WRITE */
	FALSE,	/* CREATE */
	FALSE,	/* MKDIR */
	TRUE,	/* SYMLINK */
	FALSE,	/* MKNOD */
	TRUE,	/* REMOVE */
	TRUE,	/* RMDIR */
	TRUE,	/* RENAME */
	TRUE,	/* LINK */
	FALSE,	/* READDIR */
	FALSE,	/* READDIRPLUS */
	FALSE,	/* FSSTAT */
	FALSE,	/* FSINFO */
	FALSE,	/* PATHCONF */
	FALSE,	/* COMMIT */
};
/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
	(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)
/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static void nfsrc_trimcache(u_int64_t, struct socket *);
static int nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t,
    struct socket *);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);
/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}
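/*
 * Design note (added for clarity): all UDP entries share the single
 * nfsrc_udpmtx, while each TCP entry uses the mutex of its hash bucket,
 * so lookups in different TCP buckets can proceed concurrently.
 */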
/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	newnfsstats.srvcache_tcppeak = 0;
	newnfsstats.srvcache_size = 0;
}
/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 * Call nfsrc_trimcache() to clean up the cache before returning.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	nfsrc_trimcache(nd->nd_sockref, so);
	NFSEXITCODE2(0, nd);
	return (ret);
}
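/*
 * Note (added for clarity): the value returned above is one of the RC_*
 * dispositions set by nfsrc_getudp()/nfsrc_gettcp(): RC_DROPIT (duplicate
 * of a request still in progress, so drop it), RC_REPLY (a reply was
 * regenerated from the cached status or mbuf list) or RC_DOIT (a miss,
 * so execute the RPC using the new entry).
 */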
/*
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				newnfsstats.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
				    M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
	    }
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	newrp->rc_flag |= RC_LOCKED;
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}
/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		newnfsstats.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
				atomic_add_int(&nfsrc_tcpsavedreplies, 1);
				if (nfsrc_tcpsavedreplies >
				    newnfsstats.srvcache_tcppeak)
					newnfsstats.srvcache_tcppeak =
					    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	nfsrc_trimcache(nd->nd_sockref, so);
	NFSEXITCODE2(0, nd);
	return (retrp);
}
/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}
/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry for nfsrc_activesocket() and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, struct socket *so, int err)
{
	tcp_seq tmp_seq;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_LOCKED))
		panic("nfsrvd_sentcache not locked");
	if (!err) {
		if ((so->so_proto->pr_domain->dom_family != AF_INET &&
		     so->so_proto->pr_domain->dom_family != AF_INET6) ||
		    so->so_proto->pr_protocol != IPPROTO_TCP)
			panic("nfs sent cache");
		if (nfsrv_getsockseqnum(so, &tmp_seq)) {
			mtx_lock(mutex);
			rp->rc_tcpseq = tmp_seq;
			rp->rc_flag |= RC_TCPSEQ;
			mtx_unlock(mutex);
		}
	}
	nfsrc_unlock(rp);
}
/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			newnfsstats.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
			    M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}
/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}
/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{

	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}
/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&newnfsstats.srvcache_size, -1);
}
/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	newnfsstats.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}
/*
 * The basic rule is to get rid of entries that are expired.
 */
static void
nfsrc_trimcache(u_int64_t sockref, struct socket *so)
{
	struct nfsrvcache *rp, *nextrp;
	int i, j, k, time_histo[10];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0;

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		for (i = 0; i < 10; i++)
			time_histo[i] = 0;
		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			if (i == 0)
				tcp_lasttrim = NFSD_MONOSEC;
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= nfsrc_tcptimeout)
						j = nfsrc_tcptimeout - 1;
					if (j < 0)
						j = 0;
					j = (j * 10 / nfsrc_tcptimeout) % 10;
					time_histo[j]++;
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    nfsrc_activesocket(rp, sockref, so))
						nfsrc_freecache(rp);
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		j = nfsrc_tcphighwater / 5;	/* 20% of it */
		if (j > 0 && (nfsrc_tcpsavedreplies + j) > nfsrc_tcphighwater) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 1;
			for (i = 0; i < 8; i++) {
				k += time_histo[i];
				if (k > j)
					break;
			}
			k = nfsrc_tcptimeout * (i + 1) / 10;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
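			/*
			 * Worked example (illustrative numbers, added for
			 * clarity): with a 300s nfsrc_tcptimeout, if the
			 * first two histogram buckets already hold more
			 * than 20% of the saved replies, the loop above
			 * breaks with i == 1, so k = 300 * 2 / 10 = 60 and
			 * the pass below also frees entries that would
			 * expire within the next 60 seconds.
			 */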
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 nfsrc_activesocket(rp, sockref,
						    so)))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}
/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}
/*
 * Check to see if the socket is active.
 * Return 1 if the reply has been received/acknowledged by the client,
 * 0 otherwise.
 * XXX - Uses tcp internals.
 */
static int
nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t cur_sockref,
    struct socket *cur_so)
{
	int ret = 0;

	if (!(rp->rc_flag & RC_TCPSEQ))
		return (0);
	/*
	 * If the sockref is the same, it is the same TCP connection.
	 */
	if (cur_sockref == rp->rc_sockref)
		ret = nfsrv_checksockseqnum(cur_so, rp->rc_tcpseq);
	return (ret);
}
/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}
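/*
 * Note (added for clarity): a short request, e.g. 80 bytes, is checksummed
 * in full, while only the first NFSRVCACHE_CHECKLEN (100) bytes of a large
 * write are checksummed, keeping the per-request cost bounded while still
 * distinguishing requests that happen to reuse an xid.
 */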
/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}