/*-
 * Copyright (c) 1989, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 *      - key on <xid, NFS version> (as noted above, there can be several
 *                                   entries with the same key)
 *      When a request arrives:
 *              For all that match key
 *              - if RPC# != OR request_size !=
 *                      - not a match with this one
 *              - if NFSv4 and received on same TCP socket OR
 *                      received on a TCP connection created before the
 *                      entry was cached
 *                      - not a match with this one
 *                      (V2,3 clients might retry on same TCP socket)
 *              - calculate checksum on first N bytes of NFS XDR
 *              - if checksum !=
 *                      - not a match for this one
 *              If any of the remaining ones that match has a
 *                      seqid_refcnt > 0
 *                      - not a match (go do RPC, using new cache entry)
 *              If one match left
 *                      - a hit (reply from cache)
 *              else
 *                      - miss (go do RPC, using new cache entry)
 *
 *      During processing of NFSv4 request:
 *              - set a flag when a non-idempotent Op is processed
 *              - when an Op that uses a seqid# (Open,...) is processed
 *                      - if same seqid# as referenced entry in cache
 *                              - free new cache entry
 *                              - reply from referenced cache entry
 *                        else if next seqid# in order
 *                              - free referenced cache entry
 *                              - increment seqid_refcnt on new cache entry
 *                              - set pointer from Openowner/Lockowner to
 *                                      new cache entry (aka reference it)
 *                        else if first seqid# in sequence
 *                              - increment seqid_refcnt on new cache entry
 *                              - set pointer from Openowner/Lockowner to
 *                                      new cache entry (aka reference it)
 *
 *      At end of RPC processing:
 *              - if seqid_refcnt > 0 OR flagged non-idempotent on new
 *                      cache entry
 *                      - save reply in cache entry
 *                      - calculate checksum on first N bytes of NFS XDR
 *                              request
 *                      - note op and length of XDR request (in bytes)
 *                      - timestamp it
 *                else
 *                      - free new cache entry
 *              - Send reply (noting info for socket activity check, below)
 *
 *      For cache entries saved above:
 *              - if saved since seqid_refcnt was > 0
 *                      - free when seqid_refcnt decrements to 0
 *                        (when next one in sequence is processed above, or
 *                         when Openowner/Lockowner is discarded)
 *                else { non-idempotent Op(s) }
 *                      - free when
 *                              - some further activity observed on same
 *                                      socket
 *                                (I'm not yet sure how I'm going to do
 *                                 this. Maybe look at the TCP connection
 *                                 to see if the send_tcp_sequence# is well
 *                                 past sent reply OR K additional RPCs
 *                                 replied on same socket OR?)
 *                        OR
 *                              - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *      - if RPC marked In_progress
 *              - discard request (don't send reply)
 *        else
 *              - reply from cache
 *              - timestamp cache entry
 *   else
 *      - add entry to cache, marked In_progress
 *      - do RPC
 *      - when RPC done
 *              - if RPC# non-idempotent
 *                      - mark entry Done (not In_progress)
 *                      - save reply
 *                      - timestamp cache entry
 *                else
 *                      - free cache entry
 *              - send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *              of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *              pages 53-63. San Diego, February 1989.
 *       for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *      for TCP. For V3, a reply won't be saved when the flood level is
 *      hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *      that case. This level should be set high enough that this almost
 *      never happens.
 */
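/*
 * An illustrative walk-through of the TCP criteria above: an NFSv4
 * client sends an OPEN with xid X on connection A, the reply is lost
 * and the client retries xid X on a new connection B.  The retry hashes
 * to the same <xid, NFS version> key, carries the same RPC#, request
 * size and checksum, arrives on a different socket than A and on a
 * connection created after the entry was cached, so by the rules above
 * it may score as the single remaining match and the saved reply is
 * resent.  A same-xid request arriving on connection A itself would be
 * treated as not a match, since an NFSv4 client never retries on the
 * same connection.
 */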
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstats newnfsstats;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif  /* !APPLEKEXT */

SYSCTL_DECL(_vfs_nfsd);

static u_int    nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
        int error, newhighwater;

        newhighwater = nfsrc_tcphighwater;
        error = sysctl_handle_int(oidp, &newhighwater, 0, req);
        if (error != 0 || req->newptr == NULL)
                return (error);
        if (newhighwater < 0)
                return (EINVAL);
        if (newhighwater >= nfsrc_floodlevel)
                nfsrc_floodlevel = newhighwater + newhighwater / 5;
        nfsrc_tcphighwater = newhighwater;
        return (0);
}
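
/*
 * For example (assuming the defaults above), setting the mark with
 * "sysctl vfs.nfsd.tcphighwater=10000" also raises nfsrc_floodlevel to
 * 10000 + 10000 / 5 = 12000 whenever the new mark would otherwise meet
 * or exceed the current flood level, preserving ~20% of headroom
 * between the trimming threshold and the hard limit on saved TCP
 * replies.
 */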
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");

static u_int    nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int    nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * The reverse mapping from generic (NFSv3) to Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
        NFSV2PROC_NULL,
        NFSV2PROC_GETATTR,
        NFSV2PROC_SETATTR,
        NFSV2PROC_LOOKUP,
        NFSV2PROC_NOOP,
        NFSV2PROC_READLINK,
        NFSV2PROC_READ,
        NFSV2PROC_WRITE,
        NFSV2PROC_CREATE,
        NFSV2PROC_MKDIR,
        NFSV2PROC_SYMLINK,
        NFSV2PROC_CREATE,
        NFSV2PROC_REMOVE,
        NFSV2PROC_RMDIR,
        NFSV2PROC_RENAME,
        NFSV2PROC_LINK,
        NFSV2PROC_READDIR,
        NFSV2PROC_NOOP,
        NFSV2PROC_STATFS,
        NFSV2PROC_NOOP,
        NFSV2PROC_NOOP,
        NFSV2PROC_NOOP,
};
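/*
 * Note that the NFSv3-only procedures (ACCESS, READDIRPLUS, FSINFO,
 * PATHCONF and COMMIT) map to NFSV2PROC_NOOP above, and that V3 MKNOD
 * maps to NFSV2PROC_CREATE, since Version 2 has no MKNOD procedure.
 */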

#define nfsrc_hash(xid) (((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define NFSRCUDPHASH(xid) \
        (&nfsrvudphashtbl[nfsrc_hash(xid)])
#define NFSRCHASH(xid) \
        (&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define TRUE    1
#define FALSE   0
#define NFSRVCACHE_CHECKLEN     100
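
/*
 * A quick worked example of nfsrc_hash(): for xid 0xdeadbeef, the high
 * byte folded back in is 0xdeadbeef >> 24 = 0xde, so the bucket index
 * is (0xdeadbeef + 0xde) % NFSRVCACHE_HASHSIZE.  Folding the high byte
 * into the low bits helps spread xids that differ only in their upper
 * byte across the buckets.
 */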

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        TRUE,
        TRUE,
        TRUE,
        TRUE,
        FALSE,
        TRUE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
};
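
/*
 * Note that nfsv2_repstat[] is indexed by the Version 2 procedure
 * number produced by newnfsv2_procid[] above; the TRUE entries are the
 * V2 REMOVE, RENAME, LINK, SYMLINK and RMDIR procedures, whose replies
 * carry nothing but a status word, so only rc_status needs saving.
 */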

/*
 * Will NFS want to work over IPv6 someday?
 */
#define NETFAMILY(rp) \
                (((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static void nfsrc_trimcache(u_int64_t, struct socket *);
static int nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t,
    struct socket *);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

        if ((rp->rc_flag & RC_UDP) != 0)
                return (&nfsrc_udpmtx);
        return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}

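/*
 * Locking note: all UDP entries share the single nfsrc_udpmtx, while
 * each TCP hash bucket in nfsrchash_table has a mutex of its own,
 * selected by hashing the entry's xid.  A minimal sketch of the usage
 * pattern for an entry rp already in the cache:
 *
 *      struct mtx *mutex = nfsrc_cachemutex(rp);
 *
 *      mtx_lock(mutex);
 *      ... examine or modify rp->rc_flag, hash chains, etc ...
 *      mtx_unlock(mutex);
 *
 * Per-bucket mutexes let concurrent nfsd threads work on entries in
 * different buckets without contending on one global lock.
 */
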
/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
        int i;
        static int inited = 0;

        if (inited)
                return;
        inited = 1;
        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                LIST_INIT(&nfsrvudphashtbl[i]);
                LIST_INIT(&nfsrchash_table[i].tbl);
        }
        TAILQ_INIT(&nfsrvudplru);
        nfsrc_tcpsavedreplies = 0;
        nfsrc_udpcachesize = 0;
        newnfsstats.srvcache_tcppeak = 0;
        newnfsstats.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 * Call nfsrc_trimcache() to clean up the cache before returning.
 * Returns RC_DROPIT, RC_REPLY or RC_DOIT, as set by those functions.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd, struct socket *so)
{
        struct nfsrvcache *newrp;
        int ret;

        if (nd->nd_procnum == NFSPROC_NULL)
                panic("nfsd cache null");
        MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
            M_NFSRVCACHE, M_WAITOK);
        NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
        if (nd->nd_flag & ND_NFSV4)
                newrp->rc_flag = RC_NFSV4;
        else if (nd->nd_flag & ND_NFSV3)
                newrp->rc_flag = RC_NFSV3;
        else
                newrp->rc_flag = RC_NFSV2;
        newrp->rc_xid = nd->nd_retxid;
        newrp->rc_proc = nd->nd_procnum;
        newrp->rc_sockref = nd->nd_sockref;
        newrp->rc_cachetime = nd->nd_tcpconntime;
        if (nd->nd_flag & ND_SAMETCPCONN)
                newrp->rc_flag |= RC_SAMETCPCONN;
        if (nd->nd_nam2 != NULL) {
                newrp->rc_flag |= RC_UDP;
                ret = nfsrc_getudp(nd, newrp);
        } else {
                ret = nfsrc_gettcp(nd, newrp);
        }
        nfsrc_trimcache(nd->nd_sockref, so);
        NFSEXITCODE2(0, nd);
        return (ret);
}

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
        struct nfsrvcache *rp;
        struct sockaddr_in *saddr;
        struct sockaddr_in6 *saddr6;
        struct nfsrvhashhead *hp;
        int ret = 0;
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(newrp);
        hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
        mtx_lock(mutex);
        LIST_FOREACH(rp, hp, rc_hash) {
            if (newrp->rc_xid == rp->rc_xid &&
                newrp->rc_proc == rp->rc_proc &&
                (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
                nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
                        if ((rp->rc_flag & RC_LOCKED) != 0) {
                                rp->rc_flag |= RC_WANTED;
                                (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
                                    "nfsrc", 10 * hz);
                                goto loop;
                        }
                        if (rp->rc_flag == 0)
                                panic("nfs udp cache0");
                        rp->rc_flag |= RC_LOCKED;
                        TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
                        TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
                        if (rp->rc_flag & RC_INPROG) {
                                newnfsstats.srvcache_inproghits++;
                                mtx_unlock(mutex);
                                ret = RC_DROPIT;
                        } else if (rp->rc_flag & RC_REPSTATUS) {
                                /*
                                 * V2 only.
                                 */
                                newnfsstats.srvcache_nonidemdonehits++;
                                mtx_unlock(mutex);
                                nfsrvd_rephead(nd);
                                *(nd->nd_errp) = rp->rc_status;
                                ret = RC_REPLY;
                                rp->rc_timestamp = NFSD_MONOSEC +
                                        NFSRVCACHE_UDPTIMEOUT;
                        } else if (rp->rc_flag & RC_REPMBUF) {
                                newnfsstats.srvcache_nonidemdonehits++;
                                mtx_unlock(mutex);
                                nd->nd_mreq = m_copym(rp->rc_reply, 0,
                                        M_COPYALL, M_WAITOK);
                                ret = RC_REPLY;
                                rp->rc_timestamp = NFSD_MONOSEC +
                                        NFSRVCACHE_UDPTIMEOUT;
                        } else {
                                panic("nfs udp cache1");
                        }
                        nfsrc_unlock(rp);
                        free((caddr_t)newrp, M_NFSRVCACHE);
                        goto out;
                }
        }
        newnfsstats.srvcache_misses++;
        atomic_add_int(&newnfsstats.srvcache_size, 1);
        nfsrc_udpcachesize++;

        newrp->rc_flag |= RC_INPROG;
        saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
        if (saddr->sin_family == AF_INET)
                newrp->rc_inet = saddr->sin_addr.s_addr;
        else if (saddr->sin_family == AF_INET6) {
                saddr6 = (struct sockaddr_in6 *)saddr;
                NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
                    sizeof (struct in6_addr));
                newrp->rc_flag |= RC_INETIPV6;
        }
        LIST_INSERT_HEAD(hp, newrp, rc_hash);
        TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
        mtx_unlock(mutex);
        nd->nd_rp = newrp;
        ret = RC_DOIT;

out:
        NFSEXITCODE2(0, nd);
        return (ret);
}

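/*
 * To summarize the cases above with an example: if a client retransmits
 * a UDP request while the original is still being serviced, the
 * RC_INPROG hit returns RC_DROPIT and no reply is sent; if the original
 * completed and its reply was saved (RC_REPSTATUS or RC_REPMBUF), the
 * saved reply is resent and the entry's timeout is refreshed; and if
 * the entry has already been freed (an idempotent request, or one that
 * timed out), the retransmission simply misses and the RPC is redone.
 */
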
/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd, struct socket *so)
{
        struct nfsrvcache *rp;
        struct nfsrvcache *retrp = NULL;
        mbuf_t m;
        struct mtx *mutex;

        rp = nd->nd_rp;
        if (!rp)
                panic("nfsrvd_updatecache null rp");
        nd->nd_rp = NULL;
        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        nfsrc_lock(rp);
        if (!(rp->rc_flag & RC_INPROG))
                panic("nfsrvd_updatecache not inprog");
        rp->rc_flag &= ~RC_INPROG;
        if (rp->rc_flag & RC_UDP) {
                TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
                TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
        }

        /*
         * Reply from cache is a special case returned by nfsrv_checkseqid().
         */
        if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
                newnfsstats.srvcache_nonidemdonehits++;
                mtx_unlock(mutex);
                nd->nd_repstat = 0;
                if (nd->nd_mreq)
                        mbuf_freem(nd->nd_mreq);
                if (!(rp->rc_flag & RC_REPMBUF))
                        panic("reply from cache");
                nd->nd_mreq = m_copym(rp->rc_reply, 0,
                    M_COPYALL, M_WAITOK);
                rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                nfsrc_unlock(rp);
                goto out;
        }

        /*
         * If rc_refcnt > 0, save it
         * For UDP, save it if ND_SAVEREPLY is set
         * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
         */
        if (nd->nd_repstat != NFSERR_DONTREPLY &&
            (rp->rc_refcnt > 0 ||
             ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
             ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
              nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
              nfsrc_tcpnonidempotent))) {
                if (rp->rc_refcnt > 0) {
                        if (!(rp->rc_flag & RC_NFSV4))
                                panic("update_cache refcnt");
                        rp->rc_flag |= RC_REFCNT;
                }
                if ((nd->nd_flag & ND_NFSV2) &&
                    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
                        rp->rc_status = nd->nd_repstat;
                        rp->rc_flag |= RC_REPSTATUS;
                        mtx_unlock(mutex);
                } else {
                        if (!(rp->rc_flag & RC_UDP)) {
                            atomic_add_int(&nfsrc_tcpsavedreplies, 1);
                            if (nfsrc_tcpsavedreplies >
                                newnfsstats.srvcache_tcppeak)
                                newnfsstats.srvcache_tcppeak =
                                    nfsrc_tcpsavedreplies;
                        }
                        mtx_unlock(mutex);
                        m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
                        mtx_lock(mutex);
                        rp->rc_reply = m;
                        rp->rc_flag |= RC_REPMBUF;
                        mtx_unlock(mutex);
                }
                if (rp->rc_flag & RC_UDP) {
                        rp->rc_timestamp = NFSD_MONOSEC +
                            NFSRVCACHE_UDPTIMEOUT;
                        nfsrc_unlock(rp);
                } else {
                        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                        if (rp->rc_refcnt > 0)
                                nfsrc_unlock(rp);
                        else
                                retrp = rp;
                }
        } else {
                nfsrc_freecache(rp);
                mtx_unlock(mutex);
        }

out:
        nfsrc_trimcache(nd->nd_sockref, so);
        NFSEXITCODE2(0, nd);
        return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        if (!(rp->rc_flag & RC_INPROG))
                panic("nfsrvd_delcache not in prog");
        mtx_lock(mutex);
        rp->rc_flag &= ~RC_INPROG;
        if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
                nfsrc_freecache(rp);
        mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry for nfsrc_activesocket() and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, struct socket *so, int err)
{
        tcp_seq tmp_seq;
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        if (!(rp->rc_flag & RC_LOCKED))
                panic("nfsrvd_sentcache not locked");
        if (!err) {
                if ((so->so_proto->pr_domain->dom_family != AF_INET &&
                     so->so_proto->pr_domain->dom_family != AF_INET6) ||
                     so->so_proto->pr_protocol != IPPROTO_TCP)
                        panic("nfs sent cache");
                if (nfsrv_getsockseqnum(so, &tmp_seq)) {
                        mtx_lock(mutex);
                        rp->rc_tcpseq = tmp_seq;
                        rp->rc_flag |= RC_TCPSEQ;
                        mtx_unlock(mutex);
                }
        }
        nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
        struct nfsrvcache *rp, *nextrp;
        int i;
        struct nfsrvcache *hitrp;
        struct nfsrvhashhead *hp, nfsrc_templist;
        int hit, ret = 0;
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(newrp);
        hp = NFSRCHASH(newrp->rc_xid);
        newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
        mtx_lock(mutex);
        hit = 1;
        LIST_INIT(&nfsrc_templist);
        /*
         * Get all the matches and put them on the temp list.
         * Note that the parenthesized NFSv4 clause below is negated
         * ("== 0"), so its sub-conditions rule candidates out rather
         * than in.
         */
        rp = LIST_FIRST(hp);
        while (rp != LIST_END(hp)) {
                nextrp = LIST_NEXT(rp, rc_hash);
                if (newrp->rc_xid == rp->rc_xid &&
                    (!(rp->rc_flag & RC_INPROG) ||
                     ((newrp->rc_flag & RC_SAMETCPCONN) &&
                      newrp->rc_sockref == rp->rc_sockref)) &&
                    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
                    newrp->rc_proc == rp->rc_proc &&
                    ((newrp->rc_flag & RC_NFSV4) &&
                     newrp->rc_sockref != rp->rc_sockref &&
                     newrp->rc_cachetime >= rp->rc_cachetime)
                    == 0 && newrp->rc_reqlen == rp->rc_reqlen &&
                    newrp->rc_cksum == rp->rc_cksum) {
                        LIST_REMOVE(rp, rc_hash);
                        LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
                }
                rp = nextrp;
        }

        /*
         * Now, use nfsrc_templist to decide if there is a match.
         */
        i = 0;
        LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
                i++;
                if (rp->rc_refcnt > 0) {
                        hit = 0;
                        break;
                }
        }
        /*
         * Can be a hit only if one entry left.
         * Note possible hit entry and put nfsrc_templist back on hash
         * list.
         */
        if (i != 1)
                hit = 0;
        hitrp = rp = LIST_FIRST(&nfsrc_templist);
        while (rp != LIST_END(&nfsrc_templist)) {
                nextrp = LIST_NEXT(rp, rc_hash);
                LIST_REMOVE(rp, rc_hash);
                LIST_INSERT_HEAD(hp, rp, rc_hash);
                rp = nextrp;
        }
        if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
                panic("nfs gettcp cache templist");

        if (hit) {
                rp = hitrp;
                if ((rp->rc_flag & RC_LOCKED) != 0) {
                        rp->rc_flag |= RC_WANTED;
                        (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
                            "nfsrc", 10 * hz);
                        goto tryagain;
                }
                if (rp->rc_flag == 0)
                        panic("nfs tcp cache0");
                rp->rc_flag |= RC_LOCKED;
                if (rp->rc_flag & RC_INPROG) {
                        newnfsstats.srvcache_inproghits++;
                        mtx_unlock(mutex);
                        if (newrp->rc_sockref == rp->rc_sockref)
                                nfsrc_marksametcpconn(rp->rc_sockref);
                        ret = RC_DROPIT;
                } else if (rp->rc_flag & RC_REPSTATUS) {
                        /*
                         * V2 only.
                         */
                        newnfsstats.srvcache_nonidemdonehits++;
                        mtx_unlock(mutex);
                        if (newrp->rc_sockref == rp->rc_sockref)
                                nfsrc_marksametcpconn(rp->rc_sockref);
                        ret = RC_REPLY;
                        nfsrvd_rephead(nd);
                        *(nd->nd_errp) = rp->rc_status;
                        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                } else if (rp->rc_flag & RC_REPMBUF) {
                        newnfsstats.srvcache_nonidemdonehits++;
                        mtx_unlock(mutex);
                        if (newrp->rc_sockref == rp->rc_sockref)
                                nfsrc_marksametcpconn(rp->rc_sockref);
                        ret = RC_REPLY;
                        nd->nd_mreq = m_copym(rp->rc_reply, 0,
                                M_COPYALL, M_WAITOK);
                        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                } else {
                        panic("nfs tcp cache1");
                }
                nfsrc_unlock(rp);
                free((caddr_t)newrp, M_NFSRVCACHE);
                goto out;
        }
        newnfsstats.srvcache_misses++;
        atomic_add_int(&newnfsstats.srvcache_size, 1);

        /*
         * This is a miss. For TCP, multiple entries for a key are
         * allowed, so the new entry can be chained in right away,
         * marked RC_INPROG until the RPC is done.
         */
        newrp->rc_cachetime = NFSD_MONOSEC;
        newrp->rc_flag |= RC_INPROG;
        LIST_INSERT_HEAD(hp, newrp, rc_hash);
        mtx_unlock(mutex);
        nd->nd_rp = newrp;
        ret = RC_DOIT;

out:
        NFSEXITCODE2(0, nd);
        return (ret);
}

/*
 * Lock a cache entry. The caller must already hold the entry's hash
 * chain mutex (asserted below).
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        mtx_assert(mutex, MA_OWNED);
        while ((rp->rc_flag & RC_LOCKED) != 0) {
                rp->rc_flag |= RC_WANTED;
                (void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
        }
        rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry. Unlike nfsrc_lock(), this acquires and drops
 * the hash chain mutex itself.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        rp->rc_flag &= ~RC_LOCKED;
        nfsrc_wanted(rp);
        mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
        if (rp->rc_flag & RC_WANTED) {
                rp->rc_flag &= ~RC_WANTED;
                wakeup((caddr_t)rp);
        }
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{

        LIST_REMOVE(rp, rc_hash);
        if (rp->rc_flag & RC_UDP) {
                TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
                nfsrc_udpcachesize--;
        }
        nfsrc_wanted(rp);
        if (rp->rc_flag & RC_REPMBUF) {
                mbuf_freem(rp->rc_reply);
                if (!(rp->rc_flag & RC_UDP))
                        atomic_add_int(&nfsrc_tcpsavedreplies, -1);
        }
        FREE((caddr_t)rp, M_NFSRVCACHE);
        atomic_add_int(&newnfsstats.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
        struct nfsrvcache *rp, *nextrp;
        int i;

        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                mtx_lock(&nfsrchash_table[i].mtx);
                LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
                        nfsrc_freecache(rp);
                mtx_unlock(&nfsrchash_table[i].mtx);
        }
        mtx_lock(&nfsrc_udpmtx);
        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
                        nfsrc_freecache(rp);
                }
        }
        newnfsstats.srvcache_size = 0;
        mtx_unlock(&nfsrc_udpmtx);
        nfsrc_tcpsavedreplies = 0;
}

/*
 * The basic rule is to get rid of entries that are expired.
 */
static void
nfsrc_trimcache(u_int64_t sockref, struct socket *so)
{
        struct nfsrvcache *rp, *nextrp;
        int i, j, k, time_histo[10];
        time_t thisstamp;
        static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
        static int onethread = 0;

        if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
                return;
        if (NFSD_MONOSEC != udp_lasttrim ||
            nfsrc_udpcachesize >= (nfsrc_udphighwater +
            nfsrc_udphighwater / 2)) {
                mtx_lock(&nfsrc_udpmtx);
                udp_lasttrim = NFSD_MONOSEC;
                TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
                        if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
                             && rp->rc_refcnt == 0
                             && ((rp->rc_flag & RC_REFCNT) ||
                                 udp_lasttrim > rp->rc_timestamp ||
                                 nfsrc_udpcachesize > nfsrc_udphighwater))
                                nfsrc_freecache(rp);
                }
                mtx_unlock(&nfsrc_udpmtx);
        }
        if (NFSD_MONOSEC != tcp_lasttrim ||
            nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
                for (i = 0; i < 10; i++)
                        time_histo[i] = 0;
                for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                        mtx_lock(&nfsrchash_table[i].mtx);
                        if (i == 0)
                                tcp_lasttrim = NFSD_MONOSEC;
                        LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
                            nextrp) {
                                if (!(rp->rc_flag &
                                     (RC_INPROG|RC_LOCKED|RC_WANTED))
                                     && rp->rc_refcnt == 0) {
                                        /*
                                         * The timestamps range from roughly the
                                         * present (tcp_lasttrim) to the present
                                         * + nfsrc_tcptimeout. Generate a simple
                                         * histogram of where the timeouts fall.
                                         */
                                        j = rp->rc_timestamp - tcp_lasttrim;
                                        if (j >= nfsrc_tcptimeout)
                                                j = nfsrc_tcptimeout - 1;
                                        if (j < 0)
                                                j = 0;
                                        j = (j * 10 / nfsrc_tcptimeout) % 10;
                                        time_histo[j]++;
                                        if ((rp->rc_flag & RC_REFCNT) ||
                                            tcp_lasttrim > rp->rc_timestamp ||
                                            nfsrc_activesocket(rp, sockref, so))
                                                nfsrc_freecache(rp);
                                }
                        }
                        mtx_unlock(&nfsrchash_table[i].mtx);
                }
                j = nfsrc_tcphighwater / 5;     /* 20% of it */
                if (j > 0 && (nfsrc_tcpsavedreplies + j) > nfsrc_tcphighwater) {
                        /*
                         * Trim some more with a smaller timeout of as little
                         * as 20% of nfsrc_tcptimeout to try and get below
                         * 80% of the nfsrc_tcphighwater.
                         */
                        k = 0;
                        for (i = 0; i < 8; i++) {
                                k += time_histo[i];
                                if (k > j)
                                        break;
                        }
                        k = nfsrc_tcptimeout * (i + 1) / 10;
                        if (k < 1)
                                k = 1;
                        thisstamp = tcp_lasttrim + k;
                        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                                mtx_lock(&nfsrchash_table[i].mtx);
                                LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
                                    rc_hash, nextrp) {
                                        if (!(rp->rc_flag &
                                             (RC_INPROG|RC_LOCKED|RC_WANTED))
                                             && rp->rc_refcnt == 0
                                             && ((rp->rc_flag & RC_REFCNT) ||
                                                 thisstamp > rp->rc_timestamp ||
                                                 nfsrc_activesocket(rp, sockref,
                                                    so)))
                                                nfsrc_freecache(rp);
                                }
                                mtx_unlock(&nfsrchash_table[i].mtx);
                        }
                }
        }
        atomic_store_rel_int(&onethread, 0);
}

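/*
 * A worked example of the second trimming pass, with assumed numbers:
 * suppose nfsrc_tcphighwater is 10000 and nfsrc_tcptimeout is 600
 * seconds.  Then j = 2000, and if more than 8000 saved replies remain,
 * the histogram buckets (each covering 60 seconds of remaining
 * lifetime) are summed from bucket 0 up until the running total k
 * exceeds 2000.  If that happens at bucket i = 2, the shortened timeout
 * becomes 600 * 3 / 10 = 180 seconds, so entries due to expire within
 * the next 180 seconds are freed as well, aiming to bring the cache
 * back under 80% of the high water mark.
 */
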
/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        if (rp->rc_refcnt < 0)
                panic("nfs cache refcnt");
        rp->rc_refcnt++;
        mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        if (rp->rc_refcnt <= 0)
                panic("nfs cache derefcnt");
        rp->rc_refcnt--;
        if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
                nfsrc_freecache(rp);
        mtx_unlock(mutex);
}

/*
 * Check to see if the socket is active.
 * Return 1 if the reply has been received/acknowledged by the client,
 * 0 otherwise.
 * XXX - Uses tcp internals.
 */
static int
nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t cur_sockref,
    struct socket *cur_so)
{
        int ret = 0;

        if (!(rp->rc_flag & RC_TCPSEQ))
                return (ret);
        /*
         * If the sockref is the same, it is the same TCP connection.
         */
        if (cur_sockref == rp->rc_sockref)
                ret = nfsrv_checksockseqnum(cur_so, rp->rc_tcpseq);
        return (ret);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
        int len = 0, cklen;
        mbuf_t m;

        m = m1;
        while (m) {
                len += mbuf_len(m);
                m = mbuf_next(m);
        }
        cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
        *cksum = in_cksum(m1, cklen);
        return (len);
}

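/*
 * For example, for a request held in a chain of two mbufs carrying 60
 * and 80 bytes of data, nfsrc_getlenandcksum() returns a length of 140
 * and computes the Internet checksum over only the first
 * NFSRVCACHE_CHECKLEN (100) bytes.  Checksumming a bounded prefix keeps
 * the per-request cost constant while still making false hits between
 * differing requests unlikely.
 */
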
/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{

        /* Intentionally a no-op for now. */
}