/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 *      - key on <xid, NFS version> (as noted above, there can be several
 *                                   entries with the same key)
 *      When a request arrives:
 *              For all that match key
 *              - if RPC# != OR request_size !=
 *                      - not a match with this one
 *              - if NFSv4 and received on same TCP socket OR
 *                      received on a TCP connection created before the
 *                      entry was cached
 *                      - not a match with this one
 *                      (V2,3 clients might retry on same TCP socket)
 *              - calculate checksum on first N bytes of NFS XDR
 *              - if checksum !=
 *                      - not a match for this one
 *              If any of the remaining ones that match has a
 *                      seqid_refcnt > 0
 *                      - not a match (go do RPC, using new cache entry)
 *              If one match left
 *                      - a hit (reply from cache)
 *              else
 *                      - miss (go do RPC, using new cache entry)
 *
 *      During processing of NFSv4 request:
 *              - set a flag when a non-idempotent Op is processed
 *              - when an Op that uses a seqid# (Open,...) is processed
 *                      - if same seqid# as referenced entry in cache
 *                              - free new cache entry
 *                              - reply from referenced cache entry
 *                        else if next seqid# in order
 *                              - free referenced cache entry
 *                              - increment seqid_refcnt on new cache entry
 *                              - set pointer from Openowner/Lockowner to
 *                                      new cache entry (aka reference it)
 *                        else if first seqid# in sequence
 *                              - increment seqid_refcnt on new cache entry
 *                              - set pointer from Openowner/Lockowner to
 *                                      new cache entry (aka reference it)
 *
 *      At end of RPC processing:
 *              - if seqid_refcnt > 0 OR flagged non-idempotent on new
 *                      cache entry
 *                      - save reply in cache entry
 *                      - calculate checksum on first N bytes of NFS XDR
 *                              request
 *                      - note op and length of XDR request (in bytes)
 *                      - timestamp it
 *                else
 *                      - free new cache entry
 *              - Send reply (noting info for socket activity check, below)
 *
 *      For cache entries saved above:
 *              - if saved since seqid_refcnt was > 0
 *                      - free when seqid_refcnt decrements to 0
 *                        (when next one in sequence is processed above, or
 *                         when Openowner/Lockowner is discarded)
 *                else { non-idempotent Op(s) }
 *                      - free when
 *                              - some further activity observed on same
 *                                      socket
 *                                (I'm not yet sure how I'm going to do
 *                                 this. Maybe look at the TCP connection
 *                                 to see if the send_tcp_sequence# is well
 *                                 past sent reply OR K additional RPCs
 *                                 replied on same socket OR?)
 *                        OR
 *                              - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *      - if RPC marked In_progress
 *              - discard request (don't send reply)
 *        else
 *              - reply from cache
 *              - timestamp cache entry
 *   else
 *      - add entry to cache, marked In_progress
 *      - do RPC
 *      - when RPC done
 *              - if RPC# non-idempotent
 *                      - mark entry Done (not In_progress)
 *                      - save reply
 *                      - timestamp cache entry
 *                else
 *                      - free cache entry
 *              - send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *              of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *              pages 53-63. San Diego, February 1989.
 *       for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *      for TCP. For V3, a reply won't be saved when the flood level is
 *      hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *      that case. This level should be set high enough that this almost
 *      never happens.
 */
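
/*
 * A condensed, userspace sketch of the TCP match rule described above.
 * This is illustrative only: the structure and its field names are
 * hypothetical, not the kernel's, and the real test lives in
 * nfsrc_gettcp() below.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

struct drc_req {
	uint32_t xid;		/* RPC transaction id */
	int	 vers;		/* NFS version */
	int	 proc;		/* RPC procedure number */
	int	 reqlen;	/* length of the NFS XDR request */
	uint16_t cksum;		/* checksum of first N XDR bytes */
	uint64_t sockref;	/* TCP connection identifier */
	time_t	 conntime;	/* when the connection was created */
};

/* Could "req" be a retry of the request cached as "ent"? */
static bool
drc_tcp_match(const struct drc_req *req, const struct drc_req *ent,
    bool nfsv4, time_t ent_cachetime)
{

	if (req->xid != ent->xid || req->vers != ent->vers ||
	    req->proc != ent->proc || req->reqlen != ent->reqlen ||
	    req->cksum != ent->cksum)
		return (false);
	/*
	 * For NFSv4, a request on the same connection, or on a connection
	 * created before the entry was cached, is never treated as a
	 * retry.  (V2,3 clients might retry on the same TCP socket, so
	 * no such test is made for them.)
	 */
	if (nfsv4 && (req->sockref == ent->sockref ||
	    req->conntime < ent_cachetime))
		return (false);
	return (true);
}
#endif
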
#include <fs/nfs/nfsport.h>

extern struct mtx nfsrc_udpmtx;

NFSD_VNET_DECLARE(struct nfsrvhashhead *, nfsrvudphashtbl);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrchash_table);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrcahash_table);
NFSD_VNET_DECLARE(struct nfsstatsv1 *, nfsstatsv1_p);

NFSD_VNET_DEFINE(int, nfsrc_floodlevel) = NFSRVCACHE_FLOODLEVEL;
NFSD_VNET_DEFINE(int, nfsrc_tcpsavedreplies) = 0;

SYSCTL_DECL(_vfs_nfsd);

static u_int    nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
        int error, newhighwater;

        newhighwater = nfsrc_tcphighwater;
        error = sysctl_handle_int(oidp, &newhighwater, 0, req);
        if (error != 0 || req->newptr == NULL)
                return (error);
        if (newhighwater < 0)
                return (EINVAL);
        if (newhighwater >= NFSD_VNET(nfsrc_floodlevel))
                NFSD_VNET(nfsrc_floodlevel) = newhighwater + newhighwater / 5;
        nfsrc_tcphighwater = newhighwater;
        return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
    sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");
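
/*
 * Worked example (not in the original code): raising vfs.nfsd.tcphighwater
 * to 10000 when that meets or exceeds the current flood level sets
 * nfsrc_floodlevel to 10000 + 10000 / 5 = 12000, keeping roughly 20%
 * headroom between the trim threshold and the hard limit on saved replies.
 */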

static u_int    nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int    nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

NFSD_VNET_DEFINE_STATIC(int, nfsrc_udpcachesize) = 0;
NFSD_VNET_DEFINE_STATIC(TAILQ_HEAD(, nfsrvcache), nfsrvudplru);

/*
 * The reverse mapping from generic to Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
        NFSV2PROC_NULL,
        NFSV2PROC_GETATTR,
        NFSV2PROC_SETATTR,
        NFSV2PROC_LOOKUP,
        NFSV2PROC_NOOP,
        NFSV2PROC_READLINK,
        NFSV2PROC_READ,
        NFSV2PROC_WRITE,
        NFSV2PROC_CREATE,
        NFSV2PROC_MKDIR,
        NFSV2PROC_SYMLINK,
        NFSV2PROC_CREATE,
        NFSV2PROC_REMOVE,
        NFSV2PROC_RMDIR,
        NFSV2PROC_RENAME,
        NFSV2PROC_LINK,
        NFSV2PROC_READDIR,
        NFSV2PROC_NOOP,
        NFSV2PROC_STATFS,
        NFSV2PROC_NOOP,
        NFSV2PROC_NOOP,
        NFSV2PROC_NOOP,
};

#define nfsrc_hash(xid) (((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define NFSRCUDPHASH(xid) \
        (&NFSD_VNET(nfsrvudphashtbl)[nfsrc_hash(xid)])
#define NFSRCHASH(xid) \
        (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(xid)].tbl)
#define NFSRCAHASH(xid) (&NFSD_VNET(nfsrcahash_table)[nfsrc_hash(xid)])
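
/*
 * Worked example (illustrative): for xid 0x89abcdef, nfsrc_hash() folds
 * the high byte back into the sum, (0x89abcdef + 0x89) % NFSRVCACHE_HASHSIZE,
 * so xids that differ only in their upper byte still spread across buckets.
 */
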
#define TRUE    1
#define FALSE   0
#define NFSRVCACHE_CHECKLEN     100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        TRUE,
        TRUE,
        TRUE,
        TRUE,
        FALSE,
        TRUE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define NETFAMILY(rp) \
                (((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

        if ((rp->rc_flag & RC_UDP) != 0)
                return (&nfsrc_udpmtx);
        return (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(rp->rc_xid)].mtx);
}
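
/*
 * Typical locking pattern for callers (a usage sketch of what this file
 * already does, not a new API):
 *
 *      mutex = nfsrc_cachemutex(rp);
 *      mtx_lock(mutex);
 *      ... examine or modify rp ...
 *      mtx_unlock(mutex);
 *
 * UDP entries all share the global nfsrc_udpmtx, while each TCP entry uses
 * the mutex of its xid's hash bucket, so operations on different buckets
 * do not contend.
 */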

/*
 * Initialize the server request cache list
 */
void
nfsrvd_initcache(void)
{
        int i;

        NFSD_VNET(nfsrvudphashtbl) = malloc(sizeof(struct nfsrvhashhead) *
            NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
        NFSD_VNET(nfsrchash_table) = malloc(sizeof(struct nfsrchash_bucket) *
            NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
        NFSD_VNET(nfsrcahash_table) = malloc(sizeof(struct nfsrchash_bucket) *
            NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                mtx_init(&NFSD_VNET(nfsrchash_table)[i].mtx, "nfsrtc", NULL,
                    MTX_DEF);
                mtx_init(&NFSD_VNET(nfsrcahash_table)[i].mtx, "nfsrtca", NULL,
                    MTX_DEF);
        }
        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                LIST_INIT(&NFSD_VNET(nfsrvudphashtbl)[i]);
                LIST_INIT(&NFSD_VNET(nfsrchash_table)[i].tbl);
                LIST_INIT(&NFSD_VNET(nfsrcahash_table)[i].tbl);
        }
        TAILQ_INIT(&NFSD_VNET(nfsrvudplru));
        NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
        NFSD_VNET(nfsrc_udpcachesize) = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
        struct nfsrvcache *newrp;
        int ret;

        if (nd->nd_procnum == NFSPROC_NULL)
                panic("nfsd cache null");
        newrp = malloc(sizeof (struct nfsrvcache),
            M_NFSRVCACHE, M_WAITOK);
        NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
        if (nd->nd_flag & ND_NFSV4)
                newrp->rc_flag = RC_NFSV4;
        else if (nd->nd_flag & ND_NFSV3)
                newrp->rc_flag = RC_NFSV3;
        else
                newrp->rc_flag = RC_NFSV2;
        newrp->rc_xid = nd->nd_retxid;
        newrp->rc_proc = nd->nd_procnum;
        newrp->rc_sockref = nd->nd_sockref;
        newrp->rc_cachetime = nd->nd_tcpconntime;
        if (nd->nd_flag & ND_SAMETCPCONN)
                newrp->rc_flag |= RC_SAMETCPCONN;
        if (nd->nd_nam2 != NULL) {
                newrp->rc_flag |= RC_UDP;
                ret = nfsrc_getudp(nd, newrp);
        } else {
                ret = nfsrc_gettcp(nd, newrp);
        }
        NFSEXITCODE2(0, nd);
        return (ret);
}

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
        struct nfsrvcache *rp;
        struct sockaddr_in *saddr;
        struct sockaddr_in6 *saddr6;
        struct nfsrvhashhead *hp;
        int ret = 0;
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(newrp);
        hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
        mtx_lock(mutex);
        LIST_FOREACH(rp, hp, rc_hash) {
            if (newrp->rc_xid == rp->rc_xid &&
                newrp->rc_proc == rp->rc_proc &&
                (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
                nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
                        if ((rp->rc_flag & RC_LOCKED) != 0) {
                                rp->rc_flag |= RC_WANTED;
                                (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
                                    "nfsrc", 10 * hz);
                                goto loop;
                        }
                        if (rp->rc_flag == 0)
                                panic("nfs udp cache0");
                        rp->rc_flag |= RC_LOCKED;
                        TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
                        TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
                        if (rp->rc_flag & RC_INPROG) {
                                NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
                                mtx_unlock(mutex);
                                ret = RC_DROPIT;
                        } else if (rp->rc_flag & RC_REPSTATUS) {
                                /*
                                 * V2 only.
                                 */
                                NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                                mtx_unlock(mutex);
                                nfsrvd_rephead(nd);
                                *(nd->nd_errp) = rp->rc_status;
                                ret = RC_REPLY;
                                rp->rc_timestamp = NFSD_MONOSEC +
                                        NFSRVCACHE_UDPTIMEOUT;
                        } else if (rp->rc_flag & RC_REPMBUF) {
                                NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                                mtx_unlock(mutex);
                                nd->nd_mreq = m_copym(rp->rc_reply, 0,
                                        M_COPYALL, M_WAITOK);
                                ret = RC_REPLY;
                                rp->rc_timestamp = NFSD_MONOSEC +
                                        NFSRVCACHE_UDPTIMEOUT;
                        } else {
                                panic("nfs udp cache1");
                        }
                        nfsrc_unlock(rp);
                        free(newrp, M_NFSRVCACHE);
                        goto out;
                }
        }
        NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
        atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);
        NFSD_VNET(nfsrc_udpcachesize)++;

        newrp->rc_flag |= RC_INPROG;
        saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
        if (saddr->sin_family == AF_INET)
                newrp->rc_inet = saddr->sin_addr.s_addr;
        else if (saddr->sin_family == AF_INET6) {
                saddr6 = (struct sockaddr_in6 *)saddr;
                NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
                    sizeof (struct in6_addr));
                newrp->rc_flag |= RC_INETIPV6;
        }
        LIST_INSERT_HEAD(hp, newrp, rc_hash);
        TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), newrp, rc_lru);
        mtx_unlock(mutex);
        nd->nd_rp = newrp;
        ret = RC_DOIT;

out:
        NFSEXITCODE2(0, nd);
        return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
        struct nfsrvcache *rp;
        struct nfsrvcache *retrp = NULL;
        struct mbuf *m;
        struct mtx *mutex;

        rp = nd->nd_rp;
        if (!rp)
                panic("nfsrvd_updatecache null rp");
        nd->nd_rp = NULL;
        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        nfsrc_lock(rp);
        if (!(rp->rc_flag & RC_INPROG))
                panic("nfsrvd_updatecache not inprog");
        rp->rc_flag &= ~RC_INPROG;
        if (rp->rc_flag & RC_UDP) {
                TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
                TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
        }

        /*
         * Reply from cache is a special case returned by nfsrv_checkseqid().
         */
        if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
                NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                mtx_unlock(mutex);
                nd->nd_repstat = 0;
                if (nd->nd_mreq)
                        m_freem(nd->nd_mreq);
                if (!(rp->rc_flag & RC_REPMBUF))
                        panic("reply from cache");
                nd->nd_mreq = m_copym(rp->rc_reply, 0,
                    M_COPYALL, M_WAITOK);
                rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                nfsrc_unlock(rp);
                goto out;
        }

        /*
         * If rc_refcnt > 0, save it
         * For UDP, save it if ND_SAVEREPLY is set
         * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
         */
        if (nd->nd_repstat != NFSERR_DONTREPLY &&
            (rp->rc_refcnt > 0 ||
             ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
             ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
              NFSD_VNET(nfsrc_tcpsavedreplies) <= NFSD_VNET(nfsrc_floodlevel) &&
              nfsrc_tcpnonidempotent))) {
                if (rp->rc_refcnt > 0) {
                        if (!(rp->rc_flag & RC_NFSV4))
                                panic("update_cache refcnt");
                        rp->rc_flag |= RC_REFCNT;
                }
                if ((nd->nd_flag & ND_NFSV2) &&
                    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
                        rp->rc_status = nd->nd_repstat;
                        rp->rc_flag |= RC_REPSTATUS;
                        mtx_unlock(mutex);
                } else {
                        if (!(rp->rc_flag & RC_UDP)) {
                            atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies),
                                1);
                            if (NFSD_VNET(nfsrc_tcpsavedreplies) >
                                NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak)
                                NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak =
                                    NFSD_VNET(nfsrc_tcpsavedreplies);
                        }
                        mtx_unlock(mutex);
                        m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
                        mtx_lock(mutex);
                        rp->rc_reply = m;
                        rp->rc_flag |= RC_REPMBUF;
                        mtx_unlock(mutex);
                }
                if (rp->rc_flag & RC_UDP) {
                        rp->rc_timestamp = NFSD_MONOSEC +
                            NFSRVCACHE_UDPTIMEOUT;
                        nfsrc_unlock(rp);
                } else {
                        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                        if (rp->rc_refcnt > 0)
                                nfsrc_unlock(rp);
                        else
                                retrp = rp;
                }
        } else {
                nfsrc_freecache(rp);
                mtx_unlock(mutex);
        }

out:
        NFSEXITCODE2(0, nd);
        return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
void
nfsrvd_delcache(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        if (!(rp->rc_flag & RC_INPROG))
                panic("nfsrvd_delcache not in prog");
        mtx_lock(mutex);
        rp->rc_flag &= ~RC_INPROG;
        if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
                nfsrc_freecache(rp);
        mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
        struct nfsrchash_bucket *hbp;

        KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
        if (have_seq) {
                hbp = NFSRCAHASH(rp->rc_sockref);
                mtx_lock(&hbp->mtx);
                rp->rc_tcpseq = seq;
                if (rp->rc_acked != RC_NO_ACK)
                        LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
                rp->rc_acked = RC_NO_ACK;
                mtx_unlock(&hbp->mtx);
        }
        nfsrc_unlock(rp);
}
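
/*
 * Illustrative note (not in the original): once the entry is on the
 * rc_ahash list, nfsrc_trimcache() below compares the connection's
 * snd_una against rc_tcpseq; when SEQ_GEQ(snd_una, rc_tcpseq) the client
 * has acknowledged receipt of the reply, so the entry is marked RC_ACK
 * and can be freed early instead of waiting out the full DRC timeout.
 */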

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
        struct nfsrvcache *rp, *nextrp;
        int i;
        struct nfsrvcache *hitrp;
        struct nfsrvhashhead *hp, nfsrc_templist;
        int hit, ret = 0;
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(newrp);
        hp = NFSRCHASH(newrp->rc_xid);
        newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
        mtx_lock(mutex);
        hit = 1;
        LIST_INIT(&nfsrc_templist);
        /*
         * Get all the matches and put them on the temp list.
         */
        rp = LIST_FIRST(hp);
        while (rp != LIST_END(hp)) {
                nextrp = LIST_NEXT(rp, rc_hash);
                if (newrp->rc_xid == rp->rc_xid &&
                    (!(rp->rc_flag & RC_INPROG) ||
                     ((newrp->rc_flag & RC_SAMETCPCONN) &&
                      newrp->rc_sockref == rp->rc_sockref)) &&
                    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
                    newrp->rc_proc == rp->rc_proc &&
                    ((newrp->rc_flag & RC_NFSV4) &&
                     newrp->rc_sockref != rp->rc_sockref &&
                     newrp->rc_cachetime >= rp->rc_cachetime)
                    && newrp->rc_reqlen == rp->rc_reqlen &&
                    newrp->rc_cksum == rp->rc_cksum) {
                        LIST_REMOVE(rp, rc_hash);
                        LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
                }
                rp = nextrp;
        }

        /*
         * Now, use nfsrc_templist to decide if there is a match.
         */
        i = 0;
        LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
                i++;
                if (rp->rc_refcnt > 0) {
                        hit = 0;
                        break;
                }
        }
        /*
         * Can be a hit only if one entry left.
         * Note possible hit entry and put nfsrc_templist back on hash
         * list.
         */
        if (i != 1)
                hit = 0;
        hitrp = rp = LIST_FIRST(&nfsrc_templist);
        while (rp != LIST_END(&nfsrc_templist)) {
                nextrp = LIST_NEXT(rp, rc_hash);
                LIST_REMOVE(rp, rc_hash);
                LIST_INSERT_HEAD(hp, rp, rc_hash);
                rp = nextrp;
        }
        if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
                panic("nfs gettcp cache templist");

        if (hit) {
                rp = hitrp;
                if ((rp->rc_flag & RC_LOCKED) != 0) {
                        rp->rc_flag |= RC_WANTED;
                        (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
                            "nfsrc", 10 * hz);
                        goto tryagain;
                }
                if (rp->rc_flag == 0)
                        panic("nfs tcp cache0");
                rp->rc_flag |= RC_LOCKED;
                if (rp->rc_flag & RC_INPROG) {
                        NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
                        mtx_unlock(mutex);
                        if (newrp->rc_sockref == rp->rc_sockref)
                                nfsrc_marksametcpconn(rp->rc_sockref);
                        ret = RC_DROPIT;
                } else if (rp->rc_flag & RC_REPSTATUS) {
                        /*
                         * V2 only.
                         */
                        NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                        mtx_unlock(mutex);
                        if (newrp->rc_sockref == rp->rc_sockref)
                                nfsrc_marksametcpconn(rp->rc_sockref);
                        ret = RC_REPLY;
                        nfsrvd_rephead(nd);
                        *(nd->nd_errp) = rp->rc_status;
                        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                } else if (rp->rc_flag & RC_REPMBUF) {
                        NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                        mtx_unlock(mutex);
                        if (newrp->rc_sockref == rp->rc_sockref)
                                nfsrc_marksametcpconn(rp->rc_sockref);
                        ret = RC_REPLY;
                        nd->nd_mreq = m_copym(rp->rc_reply, 0,
                                M_COPYALL, M_WAITOK);
                        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                } else {
                        panic("nfs tcp cache1");
                }
                nfsrc_unlock(rp);
                free(newrp, M_NFSRVCACHE);
                goto out;
        }
        NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
        atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);

        /*
         * For TCP, multiple entries for a key are allowed, so don't
         * chain it into the hash table until done.
         */
        newrp->rc_cachetime = NFSD_MONOSEC;
        newrp->rc_flag |= RC_INPROG;
        LIST_INSERT_HEAD(hp, newrp, rc_hash);
        mtx_unlock(mutex);
        nd->nd_rp = newrp;
        ret = RC_DOIT;

out:
        NFSEXITCODE2(0, nd);
        return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        mtx_assert(mutex, MA_OWNED);
        while ((rp->rc_flag & RC_LOCKED) != 0) {
                rp->rc_flag |= RC_WANTED;
                (void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
        }
        rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        rp->rc_flag &= ~RC_LOCKED;
        nfsrc_wanted(rp);
        mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
        if (rp->rc_flag & RC_WANTED) {
                rp->rc_flag &= ~RC_WANTED;
                wakeup((caddr_t)rp);
        }
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
        struct nfsrchash_bucket *hbp;

        LIST_REMOVE(rp, rc_hash);
        if (rp->rc_flag & RC_UDP) {
                TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
                NFSD_VNET(nfsrc_udpcachesize)--;
        } else if (rp->rc_acked != RC_NO_SEQ) {
                hbp = NFSRCAHASH(rp->rc_sockref);
                mtx_lock(&hbp->mtx);
                if (rp->rc_acked == RC_NO_ACK)
                        LIST_REMOVE(rp, rc_ahash);
                mtx_unlock(&hbp->mtx);
        }
        nfsrc_wanted(rp);
        if (rp->rc_flag & RC_REPMBUF) {
                m_freem(rp->rc_reply);
                if (!(rp->rc_flag & RC_UDP))
                        atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies), -1);
        }
        free(rp, M_NFSRVCACHE);
        atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
void
nfsrvd_cleancache(void)
{
        struct nfsrvcache *rp, *nextrp;
        int i;

        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrchash_table)[i].tbl,
                    rc_hash, nextrp)
                        nfsrc_freecache(rp);
        }
        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudphashtbl)[i], rc_hash,
                    nextrp) {
                        nfsrc_freecache(rp);
                }
        }
        NFSD_VNET(nfsstatsv1_p)->srvcache_size = 0;
        NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
}

#define HISTSIZE        16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
        struct nfsrchash_bucket *hbp;
        struct nfsrvcache *rp, *nextrp;
        int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
        time_t thisstamp;
        static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
        static int onethread = 0, oneslot = 0;

        if (sockref != 0) {
                hbp = NFSRCAHASH(sockref);
                mtx_lock(&hbp->mtx);
                LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
                        if (sockref == rp->rc_sockref) {
                                if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
                                        rp->rc_acked = RC_ACK;
                                        LIST_REMOVE(rp, rc_ahash);
                                } else if (final) {
                                        rp->rc_acked = RC_NACK;
                                        LIST_REMOVE(rp, rc_ahash);
                                }
                        }
                }
                mtx_unlock(&hbp->mtx);
        }

        if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
                return;
        if (NFSD_MONOSEC != udp_lasttrim ||
            NFSD_VNET(nfsrc_udpcachesize) >= (nfsrc_udphighwater +
            nfsrc_udphighwater / 2)) {
                mtx_lock(&nfsrc_udpmtx);
                udp_lasttrim = NFSD_MONOSEC;
                TAILQ_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudplru), rc_lru,
                    nextrp) {
                        if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
                             && rp->rc_refcnt == 0
                             && ((rp->rc_flag & RC_REFCNT) ||
                                 udp_lasttrim > rp->rc_timestamp ||
                                 NFSD_VNET(nfsrc_udpcachesize) >
                                 nfsrc_udphighwater))
                                nfsrc_freecache(rp);
                }
                mtx_unlock(&nfsrc_udpmtx);
        }
        if (NFSD_MONOSEC != tcp_lasttrim ||
            NFSD_VNET(nfsrc_tcpsavedreplies) >= nfsrc_tcphighwater) {
                force = nfsrc_tcphighwater / 4;
                if (force > 0 &&
                    NFSD_VNET(nfsrc_tcpsavedreplies) + force >=
                    nfsrc_tcphighwater) {
                        for (i = 0; i < HISTSIZE; i++)
                                time_histo[i] = 0;
                        i = 0;
                        lastslot = NFSRVCACHE_HASHSIZE - 1;
                } else {
                        force = 0;
                        if (NFSD_MONOSEC != tcp_lasttrim) {
                                i = 0;
                                lastslot = NFSRVCACHE_HASHSIZE - 1;
                        } else {
                                lastslot = i = oneslot;
                                if (++oneslot >= NFSRVCACHE_HASHSIZE)
                                        oneslot = 0;
                        }
                }
                tto = nfsrc_tcptimeout;
                tcp_lasttrim = NFSD_MONOSEC;
                for (; i <= lastslot; i++) {
                        mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                        LIST_FOREACH_SAFE(rp,
                            &NFSD_VNET(nfsrchash_table)[i].tbl, rc_hash,
                            nextrp) {
                                if (!(rp->rc_flag &
                                     (RC_INPROG|RC_LOCKED|RC_WANTED))
                                     && rp->rc_refcnt == 0) {
                                        if ((rp->rc_flag & RC_REFCNT) ||
                                            tcp_lasttrim > rp->rc_timestamp ||
                                            rp->rc_acked == RC_ACK) {
                                                nfsrc_freecache(rp);
                                                continue;
                                        }

                                        if (force == 0)
                                                continue;
                                        /*
                                         * The timestamps range from roughly the
                                         * present (tcp_lasttrim) to the present
                                         * + nfsrc_tcptimeout. Generate a simple
                                         * histogram of where the timeouts fall.
                                         */
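                                        /*
                                         * Worked example (illustrative
                                         * numbers only): with tto = 12 and
                                         * HISTSIZE = 16, an entry due to
                                         * expire 3 seconds from now lands
                                         * in bucket 3 * 16 / 12 = 4.
                                         */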
                                        j = rp->rc_timestamp - tcp_lasttrim;
                                        if (j >= tto)
                                                j = HISTSIZE - 1;
                                        else if (j < 0)
                                                j = 0;
                                        else
                                                j = j * HISTSIZE / tto;
                                        time_histo[j]++;
                                }
                        }
                        mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                }
                if (force) {
                        /*
                         * Trim some more with a smaller timeout of as little
                         * as 20% of nfsrc_tcptimeout to try and get below
                         * 80% of the nfsrc_tcphighwater.
                         */
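                        /*
                         * E.g. (illustrative): if the histogram walk below
                         * accumulates more than "force" entries by bucket
                         * i = 2, the cut-off becomes tcp_lasttrim +
                         * tto * 3 / HISTSIZE, well under the full
                         * nfsrc_tcptimeout.
                         */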
                        k = 0;
                        for (i = 0; i < (HISTSIZE - 2); i++) {
                                k += time_histo[i];
                                if (k > force)
                                        break;
                        }
                        k = tto * (i + 1) / HISTSIZE;
                        if (k < 1)
                                k = 1;
                        thisstamp = tcp_lasttrim + k;
                        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                                mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                                LIST_FOREACH_SAFE(rp,
                                    &NFSD_VNET(nfsrchash_table)[i].tbl,
                                    rc_hash, nextrp) {
                                        if (!(rp->rc_flag &
                                             (RC_INPROG|RC_LOCKED|RC_WANTED))
                                             && rp->rc_refcnt == 0
                                             && ((rp->rc_flag & RC_REFCNT) ||
                                                 thisstamp > rp->rc_timestamp ||
                                                 rp->rc_acked == RC_ACK))
                                                nfsrc_freecache(rp);
                                }
                                mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                        }
                }
        }
        atomic_store_rel_int(&onethread, 0);
}

/*
 * Add a seqid# reference to the cache entry.
 */
void
nfsrvd_refcache(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        if (rp == NULL)
                /* For NFSv4.1, there is no cache entry. */
                return;
        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        if (rp->rc_refcnt < 0)
                panic("nfs cache refcnt");
        rp->rc_refcnt++;
        mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        if (rp->rc_refcnt <= 0)
                panic("nfs cache derefcnt");
        rp->rc_refcnt--;
        if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
                nfsrc_freecache(rp);
        mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum)
{
        int len = 0, cklen;
        struct mbuf *m;

        m = m1;
        while (m) {
                len += m->m_len;
                m = m->m_next;
        }
        cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
        *cksum = in_cksum(m1, cklen);
        return (len);
}
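
/*
 * Usage sketch (mirrors the call in nfsrc_gettcp() above):
 *
 *      u_int16_t cksum;
 *      int reqlen;
 *
 *      reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &cksum);
 *
 * Only the first NFSRVCACHE_CHECKLEN (100) bytes are checksummed, so the
 * checksum alone cannot distinguish requests; it is the combination of
 * <xid, version, RPC#, length, checksum> that makes a false hit unlikely.
 */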

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}