/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *   entries with the same key)
 *   When a request arrives:
 *     For all that match key
 *     - if RPC# != OR request_size !=
 *       - not a match with this one
 *     - if NFSv4 and received on same TCP socket OR
 *       received on a TCP connection created before the
 *       entry was cached
 *       - not a match with this one
 *       (V2,3 clients might retry on same TCP socket)
 *     - calculate checksum on first N bytes of NFS XDR
 *     - if checksum !=
 *       - not a match for this one
 *     If any of the remaining ones that match has a
 *       seqid_refcnt > 0
 *       - not a match (go do RPC, using new cache entry)
 *     If one match left
 *       - a hit (reply from cache)
 *     else
 *       - miss (go do RPC, using new cache entry)
 *
 *   During processing of NFSv4 request:
 *   - set a flag when a non-idempotent Op is processed
 *   - when an Op that uses a seqid# (Open,...) is processed
 *     - if same seqid# as referenced entry in cache
 *       - free new cache entry
 *       - reply from referenced cache entry
 *     else if next seqid# in order
 *       - free referenced cache entry
 *       - increment seqid_refcnt on new cache entry
 *       - set pointer from Openowner/Lockowner to
 *         new cache entry (aka reference it)
 *     else if first seqid# in sequence
 *       - increment seqid_refcnt on new cache entry
 *       - set pointer from Openowner/Lockowner to
 *         new cache entry (aka reference it)
 *
 *   At end of RPC processing:
 *   - if seqid_refcnt > 0 OR flagged non-idempotent on new
 *     cache entry
 *     - save reply in cache entry
 *     - calculate checksum on first N bytes of NFS XDR
 *       request
 *     - note op and length of XDR request (in bytes)
 *   else
 *     - free new cache entry
 *   - Send reply (noting info for socket activity check, below)
 *
 *   For cache entries saved above:
 *   - if saved since seqid_refcnt was > 0
 *     - free when seqid_refcnt decrements to 0
 *       (when next one in sequence is processed above, or
 *        when Openowner/Lockowner is discarded)
 *   else { non-idempotent Op(s) }
 *     - free when
 *       - some further activity observed on same
 *         TCP socket
 *         (I'm not yet sure how I'm going to do
 *          this. Maybe look at the TCP connection
 *          to see if the send_tcp_sequence# is well
 *          past sent reply OR K additional RPCs
 *          replied on same socket OR?)
 *       OR
 *       - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *   - if RPC marked In_progress
 *     - discard request (don't send reply)
 *   else
 *     - reply from cache
 *     - timestamp cache entry
 * else
 *   - add entry to cache, marked In_progress
 *   - do RPC
 *   - when RPC done
 *     - if RPC# non-idempotent
 *       - mark entry Done (not In_progress)
 *       - save reply
 *       - timestamp cache entry
 *     else
 *       - free cache entry
 *     - send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *	of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *	pages 53-63, San Diego, February 1989, for the UDP case.
 *
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
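/*
 * As a rough illustration of the TCP matching rules above (a sketch, not
 * the actual matching code, which lives in nfsrc_gettcp() below), a
 * candidate entry "rp" can only match a new request "newrp" when all of
 * the cheap-to-check fields agree before the XDR checksum is consulted:
 *
 *	match = newrp->rc_xid == rp->rc_xid &&
 *	    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) != 0 &&
 *	    newrp->rc_proc == rp->rc_proc &&
 *	    newrp->rc_reqlen == rp->rc_reqlen &&
 *	    newrp->rc_cksum == rp->rc_cksum;
 *
 * The NFSv4 same-socket and connection-age tests are then applied on top
 * of this so that questionable candidates are rejected rather than
 * accepted, per the false-hit bias stated above.
 */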
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstats newnfsstats;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */
SYSCTL_DECL(_vfs_nfsd);

static u_int nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
    int error, newhighwater;

    newhighwater = nfsrc_tcphighwater;
    error = sysctl_handle_int(oidp, &newhighwater, 0, req);
    if (error != 0 || req->newptr == NULL)
        return (error);
    if (newhighwater < 0)
        return (EINVAL);
    if (newhighwater >= nfsrc_floodlevel)
        nfsrc_floodlevel = newhighwater + newhighwater / 5;
    nfsrc_tcphighwater = newhighwater;
    return (0);
}
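/*
 * The handler above keeps nfsrc_floodlevel at least 20% above the
 * configured high water mark. For example, via the vfs.nfsd.tcphighwater
 * sysctl declared below:
 *
 *	# sysctl vfs.nfsd.tcphighwater=100000
 *
 * would raise nfsrc_floodlevel to 120000 (100000 + 100000 / 5) if it
 * were not already above 100000.
 */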
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");

static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];
/*
 * and the reverse mapping from generic to Version 2 procedure numbers
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
    NFSV2PROC_NULL,
    NFSV2PROC_GETATTR,
    NFSV2PROC_SETATTR,
    NFSV2PROC_LOOKUP,
    NFSV2PROC_NOOP,	/* ACCESS has no V2 equivalent */
    NFSV2PROC_READLINK,
    NFSV2PROC_READ,
    NFSV2PROC_WRITE,
    NFSV2PROC_CREATE,
    NFSV2PROC_MKDIR,
    NFSV2PROC_SYMLINK,
    NFSV2PROC_CREATE,	/* MKNOD maps to V2 CREATE */
    NFSV2PROC_REMOVE,
    NFSV2PROC_RMDIR,
    NFSV2PROC_RENAME,
    NFSV2PROC_LINK,
    NFSV2PROC_READDIR,
    NFSV2PROC_NOOP,	/* READDIRPLUS has no V2 equivalent */
    NFSV2PROC_STATFS,
    NFSV2PROC_NOOP,	/* FSINFO has no V2 equivalent */
    NFSV2PROC_NOOP,	/* PATHCONF has no V2 equivalent */
    NFSV2PROC_NOOP,	/* COMMIT has no V2 equivalent */
};
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid)	(&nfsrcahash_table[nfsrc_hash(xid)])
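/*
 * Note: nfsrc_hash() folds the high-order byte of the xid into the
 * low-order bits before taking the modulus, so (for example) xids
 * 0x01000123 and 0x02000123, which differ only in their top byte,
 * normally land in different buckets. The motivation stated here is an
 * assumption of ours, not spelled out in the source: some clients vary
 * only part of the xid between requests.
 */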
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
/* (Indexed by NFSv2 procedure number; the V2 procedures whose reply is */
/*  a bare status word are REMOVE, RENAME, LINK, SYMLINK and RMDIR.) */
static int nfsv2_repstat[NFS_V3NPROCS] = {
    0,		/* NULL */
    0,		/* GETATTR */
    0,		/* SETATTR */
    0,		/* ROOT */
    0,		/* LOOKUP */
    0,		/* READLINK */
    0,		/* READ */
    0,		/* WRITECACHE */
    0,		/* WRITE */
    0,		/* CREATE */
    1,		/* REMOVE */
    1,		/* RENAME */
    1,		/* LINK */
    1,		/* SYMLINK */
    0,		/* MKDIR */
    1,		/* RMDIR */
    0,		/* READDIR */
    0,		/* STATFS */
    0,
    0,
    0,
    0,
};
/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
	(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)
/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);
/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

    if ((rp->rc_flag & RC_UDP) != 0)
        return (&nfsrc_udpmtx);
    return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}
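/*
 * All rc_flag and hash/LRU list manipulation below is done while holding
 * the mutex returned above: the single nfsrc_udpmtx for UDP entries, and
 * a per-bucket mutex for TCP entries, so different TCP hash chains can
 * be searched and trimmed concurrently.
 */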
/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
    int i;
    static int inited = 0;

    if (inited)
        return;
    inited = 1;
    for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
        LIST_INIT(&nfsrvudphashtbl[i]);
        LIST_INIT(&nfsrchash_table[i].tbl);
        LIST_INIT(&nfsrcahash_table[i].tbl);
    }
    TAILQ_INIT(&nfsrvudplru);
    nfsrc_tcpsavedreplies = 0;
    nfsrc_udpcachesize = 0;
    newnfsstats.srvcache_tcppeak = 0;
    newnfsstats.srvcache_size = 0;
}
/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
    struct nfsrvcache *newrp;
    int ret;

    if (nd->nd_procnum == NFSPROC_NULL)
        panic("nfsd cache null");
    MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
        M_NFSRVCACHE, M_WAITOK);
    NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
    if (nd->nd_flag & ND_NFSV4)
        newrp->rc_flag = RC_NFSV4;
    else if (nd->nd_flag & ND_NFSV3)
        newrp->rc_flag = RC_NFSV3;
    else
        newrp->rc_flag = RC_NFSV2;
    newrp->rc_xid = nd->nd_retxid;
    newrp->rc_proc = nd->nd_procnum;
    newrp->rc_sockref = nd->nd_sockref;
    newrp->rc_cachetime = nd->nd_tcpconntime;
    if (nd->nd_flag & ND_SAMETCPCONN)
        newrp->rc_flag |= RC_SAMETCPCONN;
    if (nd->nd_nam2 != NULL) {
        newrp->rc_flag |= RC_UDP;
        ret = nfsrc_getudp(nd, newrp);
    } else {
        ret = nfsrc_gettcp(nd, newrp);
    }
    NFSEXITCODE2(0, nd);
    return (ret);
}
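/*
 * A sketch of how the nfsd request loop is expected to act on the result
 * (illustrative only, not the actual caller): RC_DOIT, RC_REPLY and
 * RC_DROPIT are the values returned by the lookup routines below.
 *
 *	switch (nfsrvd_getcache(nd)) {
 *	case RC_DOIT:   service the RPC, then call nfsrvd_updatecache()
 *	case RC_REPLY:  send nd->nd_mreq, rebuilt from the cached reply
 *	case RC_DROPIT: silently discard the request (retry in progress)
 *	}
 */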
/*
 * Get a cache entry for UDP.
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
    struct nfsrvcache *rp;
    struct sockaddr_in *saddr;
    struct sockaddr_in6 *saddr6;
    struct nfsrvhashhead *hp;
    int ret = 0;
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(newrp);
    hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
    mtx_lock(mutex);
    LIST_FOREACH(rp, hp, rc_hash) {
        if (newrp->rc_xid == rp->rc_xid &&
            newrp->rc_proc == rp->rc_proc &&
            (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
            nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
            if ((rp->rc_flag & RC_LOCKED) != 0) {
                rp->rc_flag |= RC_WANTED;
                (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
                    "nfsrc", 10 * hz);
                goto loop;
            }
            if (rp->rc_flag == 0)
                panic("nfs udp cache0");
            rp->rc_flag |= RC_LOCKED;
            TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
            TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
            if (rp->rc_flag & RC_INPROG) {
                newnfsstats.srvcache_inproghits++;
                mtx_unlock(mutex);
                ret = RC_DROPIT;
            } else if (rp->rc_flag & RC_REPSTATUS) {
                /*
                 * V2 only.
                 */
                newnfsstats.srvcache_nonidemdonehits++;
                mtx_unlock(mutex);
                nfsrvd_rephead(nd);
                *(nd->nd_errp) = rp->rc_status;
                ret = RC_REPLY;
                rp->rc_timestamp = NFSD_MONOSEC +
                    NFSRVCACHE_UDPTIMEOUT;
            } else if (rp->rc_flag & RC_REPMBUF) {
                newnfsstats.srvcache_nonidemdonehits++;
                mtx_unlock(mutex);
                nd->nd_mreq = m_copym(rp->rc_reply, 0,
                    M_COPYALL, M_WAITOK);
                ret = RC_REPLY;
                rp->rc_timestamp = NFSD_MONOSEC +
                    NFSRVCACHE_UDPTIMEOUT;
            } else {
                panic("nfs udp cache1");
            }
            nfsrc_unlock(rp);
            free((caddr_t)newrp, M_NFSRVCACHE);
            goto out;
        }
    }
    newnfsstats.srvcache_misses++;
    atomic_add_int(&newnfsstats.srvcache_size, 1);
    nfsrc_udpcachesize++;

    newrp->rc_flag |= RC_INPROG;
    saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
    if (saddr->sin_family == AF_INET)
        newrp->rc_inet = saddr->sin_addr.s_addr;
    else if (saddr->sin_family == AF_INET6) {
        saddr6 = (struct sockaddr_in6 *)saddr;
        NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
            sizeof (struct in6_addr));
        newrp->rc_flag |= RC_INETIPV6;
    }
    LIST_INSERT_HEAD(hp, newrp, rc_hash);
    TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
    mtx_unlock(mutex);
    nd->nd_rp = newrp;
    ret = RC_DOIT;

out:
    NFSEXITCODE2(0, nd);
    return (ret);
}
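/*
 * Note that strict LRU ordering is kept for UDP only: both a cache hit
 * and a new entry move/insert the entry at the tail of nfsrvudplru, so
 * nfsrc_trimcache() below can scan from the head and reach the stalest
 * entries first. TCP entries are reclaimed by timestamp instead, per the
 * "LRU is not appropriate" note at the top of this file.
 */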
/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
    struct nfsrvcache *rp;
    struct nfsrvcache *retrp = NULL;
    mbuf_t m;
    struct mtx *mutex;

    rp = nd->nd_rp;
    if (!rp)
        panic("nfsrvd_updatecache null rp");
    nd->nd_rp = NULL;
    mutex = nfsrc_cachemutex(rp);
    mtx_lock(mutex);
    nfsrc_wanted(rp);
    if (!(rp->rc_flag & RC_INPROG))
        panic("nfsrvd_updatecache not inprog");
    rp->rc_flag &= ~RC_INPROG;
    if (rp->rc_flag & RC_UDP) {
        TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
        TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
    }

    /*
     * Reply from cache is a special case returned by nfsrv_checkseqid().
     */
    if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
        newnfsstats.srvcache_nonidemdonehits++;
        mtx_unlock(mutex);
        nd->nd_repstat = 0;
        if (nd->nd_mreq)
            mbuf_freem(nd->nd_mreq);
        if (!(rp->rc_flag & RC_REPMBUF))
            panic("reply from cache");
        nd->nd_mreq = m_copym(rp->rc_reply, 0,
            M_COPYALL, M_WAITOK);
        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
        nfsrc_unlock(rp);
        goto out;
    }

    /*
     * If rc_refcnt > 0, save it
     * For UDP, save it if ND_SAVEREPLY is set
     * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
     */
    if (nd->nd_repstat != NFSERR_DONTREPLY &&
        (rp->rc_refcnt > 0 ||
         ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
         ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
          nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
          nfsrc_tcpnonidempotent))) {
        if (rp->rc_refcnt > 0) {
            if (!(rp->rc_flag & RC_NFSV4))
                panic("update_cache refcnt");
            rp->rc_flag |= RC_REFCNT;
        }
        if ((nd->nd_flag & ND_NFSV2) &&
            nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
            rp->rc_status = nd->nd_repstat;
            rp->rc_flag |= RC_REPSTATUS;
            mtx_unlock(mutex);
        } else {
            if (!(rp->rc_flag & RC_UDP)) {
                atomic_add_int(&nfsrc_tcpsavedreplies, 1);
                if (nfsrc_tcpsavedreplies >
                    newnfsstats.srvcache_tcppeak)
                    newnfsstats.srvcache_tcppeak =
                        nfsrc_tcpsavedreplies;
            }
            /*
             * Copy the reply with the mutex dropped, since
             * m_copym(..., M_WAITOK) may sleep.
             */
            mtx_unlock(mutex);
            m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
            mtx_lock(mutex);
            rp->rc_reply = m;
            rp->rc_flag |= RC_REPMBUF;
            mtx_unlock(mutex);
        }
        if (rp->rc_flag & RC_UDP) {
            rp->rc_timestamp = NFSD_MONOSEC +
                NFSRVCACHE_UDPTIMEOUT;
            nfsrc_unlock(rp);
        } else {
            rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
            if (rp->rc_refcnt > 0)
                nfsrc_unlock(rp);
            else
                retrp = rp;
        }
    } else {
        nfsrc_freecache(rp);
        mtx_unlock(mutex);
    }

out:
    NFSEXITCODE2(0, nd);
    return (retrp);
}
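/*
 * The save-or-free decision above is where nfsrc_floodlevel bites: a TCP
 * reply is only saved while nfsrc_tcpsavedreplies is at or below the
 * flood level, which is why sysctl_tcphighwater() keeps the flood level
 * 20% above the TCP high water mark.
 */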
/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(rp);
    if (!(rp->rc_flag & RC_INPROG))
        panic("nfsrvd_delcache not in prog");
    mtx_lock(mutex);
    rp->rc_flag &= ~RC_INPROG;
    if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
        nfsrc_freecache(rp);
    mtx_unlock(mutex);
}
/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
    struct nfsrchash_bucket *hbp;

    KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
    if (have_seq) {
        hbp = NFSRCAHASH(rp->rc_sockref);
        mtx_lock(&hbp->mtx);
        rp->rc_tcpseq = seq;
        if (rp->rc_acked != RC_NO_ACK)
            LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
        rp->rc_acked = RC_NO_ACK;
        mtx_unlock(&hbp->mtx);
    }
    nfsrc_unlock(rp);
}
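/*
 * Putting the pieces together, the per-RPC life cycle looks roughly like
 * this (an illustrative sketch, not the actual nfsd code):
 *
 *	if (nfsrvd_getcache(nd) == RC_DOIT) {
 *		... service the RPC ...
 *		rp = nfsrvd_updatecache(nd);
 *		... send the reply, noting its TCP sequence# ...
 *		if (rp != NULL)
 *			nfsrvd_sentcache(rp, have_seq, seq);
 *	}
 */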
/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
    struct nfsrvcache *rp, *nextrp;
    int i;
    struct nfsrvcache *hitrp;
    struct nfsrvhashhead *hp, nfsrc_templist;
    int hit, ret = 0;
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(newrp);
    hp = NFSRCHASH(newrp->rc_xid);
    newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
    mtx_lock(mutex);
    hit = 1;
    LIST_INIT(&nfsrc_templist);
    /*
     * Get all the matches and put them on the temp list.
     */
    rp = LIST_FIRST(hp);
    while (rp != LIST_END(hp)) {
        nextrp = LIST_NEXT(rp, rc_hash);
        if (newrp->rc_xid == rp->rc_xid &&
            (!(rp->rc_flag & RC_INPROG) ||
             ((newrp->rc_flag & RC_SAMETCPCONN) &&
              newrp->rc_sockref == rp->rc_sockref)) &&
            (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
            newrp->rc_proc == rp->rc_proc &&
            ((newrp->rc_flag & RC_NFSV4) &&
             newrp->rc_sockref != rp->rc_sockref &&
             newrp->rc_cachetime >= rp->rc_cachetime)
            == 0 && newrp->rc_reqlen == rp->rc_reqlen &&
            newrp->rc_cksum == rp->rc_cksum) {
            LIST_REMOVE(rp, rc_hash);
            LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
        }
        rp = nextrp;
    }

    /*
     * Now, use nfsrc_templist to decide if there is a match.
     */
    i = 0;
    LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
        i++;
        if (rp->rc_refcnt > 0) {
            hit = 0;
            break;
        }
    }
    /*
     * Can be a hit only if one entry left.
     * Note possible hit entry and put nfsrc_templist back on hash
     * list.
     */
    if (i != 1)
        hit = 0;
    hitrp = rp = LIST_FIRST(&nfsrc_templist);
    while (rp != LIST_END(&nfsrc_templist)) {
        nextrp = LIST_NEXT(rp, rc_hash);
        LIST_REMOVE(rp, rc_hash);
        LIST_INSERT_HEAD(hp, rp, rc_hash);
        rp = nextrp;
    }
    if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
        panic("nfs gettcp cache templist");

    if (hit) {
        rp = hitrp;
        if ((rp->rc_flag & RC_LOCKED) != 0) {
            rp->rc_flag |= RC_WANTED;
            (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
                "nfsrc", 10 * hz);
            goto tryagain;
        }
        if (rp->rc_flag == 0)
            panic("nfs tcp cache0");
        rp->rc_flag |= RC_LOCKED;
        if (rp->rc_flag & RC_INPROG) {
            newnfsstats.srvcache_inproghits++;
            mtx_unlock(mutex);
            if (newrp->rc_sockref == rp->rc_sockref)
                nfsrc_marksametcpconn(rp->rc_sockref);
            ret = RC_DROPIT;
        } else if (rp->rc_flag & RC_REPSTATUS) {
            /*
             * V2 only.
             */
            newnfsstats.srvcache_nonidemdonehits++;
            mtx_unlock(mutex);
            if (newrp->rc_sockref == rp->rc_sockref)
                nfsrc_marksametcpconn(rp->rc_sockref);
            ret = RC_REPLY;
            nfsrvd_rephead(nd);
            *(nd->nd_errp) = rp->rc_status;
            rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
        } else if (rp->rc_flag & RC_REPMBUF) {
            newnfsstats.srvcache_nonidemdonehits++;
            mtx_unlock(mutex);
            if (newrp->rc_sockref == rp->rc_sockref)
                nfsrc_marksametcpconn(rp->rc_sockref);
            ret = RC_REPLY;
            nd->nd_mreq = m_copym(rp->rc_reply, 0,
                M_COPYALL, M_WAITOK);
            rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
        } else {
            panic("nfs tcp cache1");
        }
        nfsrc_unlock(rp);
        free((caddr_t)newrp, M_NFSRVCACHE);
        goto out;
    }
    newnfsstats.srvcache_misses++;
    atomic_add_int(&newnfsstats.srvcache_size, 1);

    /*
     * For TCP, multiple entries for a key are allowed, so don't
     * chain it into the hash table until done.
     */
    newrp->rc_cachetime = NFSD_MONOSEC;
    newrp->rc_flag |= RC_INPROG;
    LIST_INSERT_HEAD(hp, newrp, rc_hash);
    mtx_unlock(mutex);
    nd->nd_rp = newrp;
    ret = RC_DOIT;

out:
    NFSEXITCODE2(0, nd);
    return (ret);
}
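/*
 * The temp-list pass above is what enforces the "can be a hit only if
 * exactly one candidate is left" rule from the comment at the top of
 * this file. For example, if two cached entries share an xid because one
 * belongs to a seqid#-ordered Op, both are moved to nfsrc_templist, i
 * ends up as 2, and the request is treated as a miss rather than risking
 * a false hit.
 */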
/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(rp);
    mtx_assert(mutex, MA_OWNED);
    while ((rp->rc_flag & RC_LOCKED) != 0) {
        rp->rc_flag |= RC_WANTED;
        (void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
    }
    rp->rc_flag |= RC_LOCKED;
}
/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(rp);
    mtx_lock(mutex);
    rp->rc_flag &= ~RC_LOCKED;
    nfsrc_wanted(rp);
    mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{

    if (rp->rc_flag & RC_WANTED) {
        rp->rc_flag &= ~RC_WANTED;
        wakeup((caddr_t)rp);
    }
}
/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
    struct nfsrchash_bucket *hbp;

    LIST_REMOVE(rp, rc_hash);
    if (rp->rc_flag & RC_UDP) {
        TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
        nfsrc_udpcachesize--;
    } else if (rp->rc_acked != RC_NO_SEQ) {
        hbp = NFSRCAHASH(rp->rc_sockref);
        mtx_lock(&hbp->mtx);
        if (rp->rc_acked == RC_NO_ACK)
            LIST_REMOVE(rp, rc_ahash);
        mtx_unlock(&hbp->mtx);
    }
    nfsrc_wanted(rp);
    if (rp->rc_flag & RC_REPMBUF) {
        mbuf_freem(rp->rc_reply);
        if (!(rp->rc_flag & RC_UDP))
            atomic_add_int(&nfsrc_tcpsavedreplies, -1);
    }
    FREE((caddr_t)rp, M_NFSRVCACHE);
    atomic_add_int(&newnfsstats.srvcache_size, -1);
}
/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
    struct nfsrvcache *rp, *nextrp;
    int i;

    for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
        mtx_lock(&nfsrchash_table[i].mtx);
        LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
            nfsrc_freecache(rp);
        mtx_unlock(&nfsrchash_table[i].mtx);
    }
    mtx_lock(&nfsrc_udpmtx);
    for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
        LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
            nfsrc_freecache(rp);
        }
    }
    newnfsstats.srvcache_size = 0;
    mtx_unlock(&nfsrc_udpmtx);
    nfsrc_tcpsavedreplies = 0;
}
#define	HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
    struct nfsrchash_bucket *hbp;
    struct nfsrvcache *rp, *nextrp;
    int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
    time_t thisstamp;
    static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
    static int onethread = 0, oneslot = 0;

    if (sockref != 0) {
        hbp = NFSRCAHASH(sockref);
        mtx_lock(&hbp->mtx);
        LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
            if (sockref == rp->rc_sockref) {
                if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
                    rp->rc_acked = RC_ACK;
                    LIST_REMOVE(rp, rc_ahash);
                } else if (final) {
                    rp->rc_acked = RC_NACK;
                    LIST_REMOVE(rp, rc_ahash);
                }
            }
        }
        mtx_unlock(&hbp->mtx);
    }

    if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
        return;
    if (NFSD_MONOSEC != udp_lasttrim ||
        nfsrc_udpcachesize >= (nfsrc_udphighwater +
        nfsrc_udphighwater / 2)) {
        mtx_lock(&nfsrc_udpmtx);
        udp_lasttrim = NFSD_MONOSEC;
        TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
            if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
                 && rp->rc_refcnt == 0
                 && ((rp->rc_flag & RC_REFCNT) ||
                     udp_lasttrim > rp->rc_timestamp ||
                     nfsrc_udpcachesize > nfsrc_udphighwater))
                nfsrc_freecache(rp);
        }
        mtx_unlock(&nfsrc_udpmtx);
    }
    if (NFSD_MONOSEC != tcp_lasttrim ||
        nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
        force = nfsrc_tcphighwater / 4;
        if (force > 0 &&
            nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
            for (i = 0; i < HISTSIZE; i++)
                time_histo[i] = 0;
            i = 0;
            lastslot = NFSRVCACHE_HASHSIZE - 1;
        } else {
            force = 0;
            if (NFSD_MONOSEC != tcp_lasttrim) {
                i = 0;
                lastslot = NFSRVCACHE_HASHSIZE - 1;
            } else {
                lastslot = i = oneslot;
                if (++oneslot >= NFSRVCACHE_HASHSIZE)
                    oneslot = 0;
            }
        }
        tto = nfsrc_tcptimeout;
        tcp_lasttrim = NFSD_MONOSEC;
        for (; i <= lastslot; i++) {
            mtx_lock(&nfsrchash_table[i].mtx);
            LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
                nextrp) {
                if (!(rp->rc_flag &
                     (RC_INPROG|RC_LOCKED|RC_WANTED))
                     && rp->rc_refcnt == 0) {
                    if ((rp->rc_flag & RC_REFCNT) ||
                        tcp_lasttrim > rp->rc_timestamp ||
                        rp->rc_acked == RC_ACK) {
                        nfsrc_freecache(rp);
                        continue;
                    }
                    if (force == 0)
                        continue;
                    /*
                     * The timestamps range from roughly the
                     * present (tcp_lasttrim) to the present
                     * + nfsrc_tcptimeout. Generate a simple
                     * histogram of where the timeouts fall.
                     */
                    j = rp->rc_timestamp - tcp_lasttrim;
                    if (j >= tto)
                        j = HISTSIZE - 1;
                    else if (j < 0)
                        j = 0;
                    else
                        j = j * HISTSIZE / tto;
                    time_histo[j]++;
                }
            }
            mtx_unlock(&nfsrchash_table[i].mtx);
        }
        if (force) {
            /*
             * Trim some more with a smaller timeout of as little
             * as 20% of nfsrc_tcptimeout to try and get below
             * 80% of the nfsrc_tcphighwater.
             */
            k = 0;
            for (i = 0; i < (HISTSIZE - 2); i++) {
                k += time_histo[i];
                if (k > force)
                    break;
            }
            k = tto * (i + 1) / HISTSIZE;
            if (k < 1)
                k = 1;
            thisstamp = tcp_lasttrim + k;
            for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                mtx_lock(&nfsrchash_table[i].mtx);
                LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
                    rc_hash, nextrp) {
                    if (!(rp->rc_flag &
                         (RC_INPROG|RC_LOCKED|RC_WANTED))
                         && rp->rc_refcnt == 0
                         && ((rp->rc_flag & RC_REFCNT) ||
                             thisstamp > rp->rc_timestamp ||
                             rp->rc_acked == RC_ACK))
                        nfsrc_freecache(rp);
                }
                mtx_unlock(&nfsrchash_table[i].mtx);
            }
        }
    }
    atomic_store_rel_int(&onethread, 0);
}
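/*
 * Worked example of the histogram trim above, with illustrative numbers:
 * if nfsrc_tcphighwater is 10000, force is 2500. If the first three of
 * the 16 buckets hold 1200, 900 and 700 entries, the loop stops at i = 2
 * (1200 + 900 + 700 = 2800 > 2500), so k becomes tto * 3 / 16 (about 19%
 * of nfsrc_tcptimeout) and the second pass frees every unreferenced
 * entry stamped before tcp_lasttrim + k.
 */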
/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(rp);
    mtx_lock(mutex);
    if (rp->rc_refcnt < 0)
        panic("nfs cache refcnt");
    rp->rc_refcnt++;
    mtx_unlock(mutex);
}
/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(rp);
    mtx_lock(mutex);
    if (rp->rc_refcnt <= 0)
        panic("nfs cache derefcnt");
    rp->rc_refcnt--;
    if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
        nfsrc_freecache(rp);
    mtx_unlock(mutex);
}
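/*
 * A sketch of the intended pairing (see the seqid# discussion at the top
 * of this file): nfsrv_checkseqid() calls nfsrvd_refcache() when an
 * Openowner/Lockowner is pointed at a cache entry, and
 * nfsrvd_derefcache() when that owner advances to the next seqid# or is
 * discarded; the final dereference frees the entry unless it is locked
 * or still in progress.
 */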
/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
    int len = 0, cklen;
    mbuf_t m;

    m = m1;
    while (m) {
        len += mbuf_len(m);
        m = mbuf_next(m);
    }
    cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
    *cksum = in_cksum(m1, cklen);
    return (len);
}
/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}
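/*
 * nfsrc_marksametcpconn() is an empty hook: the lookup paths call it
 * when a retry arrives on the same TCP connection, which the comment
 * above says should never happen for NFSv4.
 */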