]> CyberLeo.Net >> Repos - FreeBSD/releng/9.0.git/blob - sys/fs/nfsserver/nfs_nfsdcache.c
Copy stable/9 to releng/9.0 as part of the FreeBSD 9.0-RELEASE release
[FreeBSD/releng/9.0.git] / sys / fs / nfsserver / nfs_nfsdcache.c
1 /*-
2  * Copyright (c) 1989, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Rick Macklem at The University of Guelph.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 /*
38  * Here is the basic algorithm:
39  * First, some design criteria I used:
40  * - I think a false hit is more serious than a false miss
41  * - A false hit for an RPC that has Op(s) that order via seqid# must be
42  *   avoided at all cost
43  * - A valid hit will probably happen a long time after the original reply
44  *   and the TCP socket that the original request was received on will no
45  *   longer be active
46  *   (The long time delay implies to me that LRU is not appropriate.)
47  * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
48  *   in them as well as minimizing the risk of redoing retried non-idempotent
49  *   Ops.
50  * Because it is biased towards avoiding false hits, multiple entries with
51  * the same xid are to be expected, especially for the case of the entry
52  * in the cache being related to a seqid# sequenced Op.
53  * 
54  * The basic algorithm I'm about to code up:
55  * - Null RPCs bypass the cache and are just done
56  * For TCP
57  *      - key on <xid, NFS version> (as noted above, there can be several
58  *                                   entries with the same key)
59  *      When a request arrives:
60  *              For all that match key
61  *              - if RPC# != OR request_size !=
62  *                      - not a match with this one
63  *              - if NFSv4 and received on same TCP socket OR
64  *                      received on a TCP connection created before the
65  *                      entry was cached
66  *                      - not a match with this one
67  *                      (V2,3 clients might retry on same TCP socket)
68  *              - calculate checksum on first N bytes of NFS XDR
69  *              - if checksum !=
70  *                      - not a match for this one
71  *              If any of the remaining ones that match has a
72  *                      seqid_refcnt > 0
73  *                      - not a match (go do RPC, using new cache entry)
74  *              If one match left
75  *                      - a hit (reply from cache)
76  *              else
77  *                      - miss (go do RPC, using new cache entry)
78  * 
79  *      During processing of NFSv4 request:
80  *              - set a flag when a non-idempotent Op is processed
81  *              - when an Op that uses a seqid# (Open,...) is processed
82  *                      - if same seqid# as referenced entry in cache
83  *                              - free new cache entry
84  *                              - reply from referenced cache entry
85  *                        else if next seqid# in order
86  *                              - free referenced cache entry
87  *                              - increment seqid_refcnt on new cache entry
88  *                              - set pointer from Openowner/Lockowner to
89  *                                      new cache entry (aka reference it)
90  *                        else if first seqid# in sequence
91  *                              - increment seqid_refcnt on new cache entry
92  *                              - set pointer from Openowner/Lockowner to
93  *                                      new cache entry (aka reference it)
94  * 
95  *      At end of RPC processing:
96  *              - if seqid_refcnt > 0 OR flagged non-idempotent on new
97  *                      cache entry
98  *                      - save reply in cache entry
99  *                      - calculate checksum on first N bytes of NFS XDR
100  *                              request
101  *                      - note op and length of XDR request (in bytes)
102  *                      - timestamp it
103  *                else
104  *                      - free new cache entry
105  *              - Send reply (noting info for socket activity check, below)
106  * 
107  *      For cache entries saved above:
108  *              - if saved since seqid_refcnt was > 0
109  *                      - free when seqid_refcnt decrements to 0
110  *                        (when next one in sequence is processed above, or
111  *                         when Openowner/Lockowner is discarded)
112  *                else { non-idempotent Op(s) }
113  *                      - free when
114  *                              - some further activity observed on same
115  *                                      socket
116  *                                (I'm not yet sure how I'm going to do
117  *                                 this. Maybe look at the TCP connection
118  *                                 to see if the send_tcp_sequence# is well
119  *                                 past sent reply OR K additional RPCs
120  *                                 replied on same socket OR?)
121  *                        OR
122  *                              - when very old (hours, days, weeks?)
123  * 
124  * For UDP (v2, 3 only), pretty much the old way:
125  * - key on <xid, NFS version, RPC#, Client host ip#>
126  *   (at most one entry for each key)
127  * 
128  * When a Request arrives:
129  * - if a match with entry via key
130  *      - if RPC marked In_progress
131  *              - discard request (don't send reply)
132  *        else
133  *              - reply from cache
134  *              - timestamp cache entry
135  *   else
136  *      - add entry to cache, marked In_progress
137  *      - do RPC
138  *      - when RPC done
139  *              - if RPC# non-idempotent
140  *                      - mark entry Done (not In_progress)
141  *                      - save reply
142  *                      - timestamp cache entry
143  *                else
144  *                      - free cache entry
145  *              - send reply
146  * 
147  * Later, entries with saved replies are free'd a short time (few minutes)
148  * after reply sent (timestamp).
149  * Reference: Chet Juszczak, "Improving the Performance and Correctness
150  *              of an NFS Server", in Proc. Winter 1989 USENIX Conference,
151  *              pages 53-63. San Diego, February 1989.
152  *       for the UDP case.
153  * nfsrc_floodlevel is set to the allowable upper limit for saved replies
154  *      for TCP. For V3, a reply won't be saved when the flood level is
155  *      hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
156  *      that case. This level should be set high enough that this almost
157  *      never happens.
158  */
159 #ifndef APPLEKEXT
160 #include <fs/nfs/nfsport.h>
161
162 extern struct nfsstats newnfsstats;
163 NFSCACHEMUTEX;
164 int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
165 #endif  /* !APPLEKEXT */
166
167 static int nfsrc_tcpnonidempotent = 1;
168 static int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER, nfsrc_udpcachesize = 0;
169 static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
170 static struct nfsrvhashhead nfsrvhashtbl[NFSRVCACHE_HASHSIZE],
171     nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];
172 /*
173  * and the reverse mapping from generic to Version 2 procedure numbers
174  */
175 static int newnfsv2_procid[NFS_V3NPROCS] = {
176         NFSV2PROC_NULL,
177         NFSV2PROC_GETATTR,
178         NFSV2PROC_SETATTR,
179         NFSV2PROC_LOOKUP,
180         NFSV2PROC_NOOP,
181         NFSV2PROC_READLINK,
182         NFSV2PROC_READ,
183         NFSV2PROC_WRITE,
184         NFSV2PROC_CREATE,
185         NFSV2PROC_MKDIR,
186         NFSV2PROC_SYMLINK,
187         NFSV2PROC_CREATE,
188         NFSV2PROC_REMOVE,
189         NFSV2PROC_RMDIR,
190         NFSV2PROC_RENAME,
191         NFSV2PROC_LINK,
192         NFSV2PROC_READDIR,
193         NFSV2PROC_NOOP,
194         NFSV2PROC_STATFS,
195         NFSV2PROC_NOOP,
196         NFSV2PROC_NOOP,
197         NFSV2PROC_NOOP,
198 };
199
/*
 * Hash bucket selection: fold the top byte of the xid into the low bits
 * and reduce modulo the table size. The UDP and TCP caches use the same
 * index computation on separate tables.
 */
#define	NFSRC_XIDBUCKET(xid) \
	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid)	(&nfsrvudphashtbl[NFSRC_XIDBUCKET(xid)])
#define	NFSRCHASH(xid)		(&nfsrvhashtbl[NFSRC_XIDBUCKET(xid)])

#define	TRUE	1
#define	FALSE	0

/* Maximum number of request bytes covered by the cache checksum. */
#define	NFSRVCACHE_CHECKLEN	100
207
208 /* True iff the rpc reply is an nfs status ONLY! */
209 static int nfsv2_repstat[NFS_V3NPROCS] = {
210         FALSE,
211         FALSE,
212         FALSE,
213         FALSE,
214         FALSE,
215         FALSE,
216         FALSE,
217         FALSE,
218         FALSE,
219         FALSE,
220         TRUE,
221         TRUE,
222         TRUE,
223         TRUE,
224         FALSE,
225         TRUE,
226         FALSE,
227         FALSE,
228         FALSE,
229         FALSE,
230         FALSE,
231         FALSE,
232 };
233
/*
 * Address family of the client address stored in a cache entry:
 * AF_INET6 when RC_INETIPV6 is set in rc_flag, AF_INET otherwise.
 */
#define	NETFAMILY(rp) \
	((((rp)->rc_flag & RC_INETIPV6) != 0) ? AF_INET6 : AF_INET)
239
240 /* local functions */
241 static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
242 static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
243 static void nfsrc_lock(struct nfsrvcache *rp);
244 static void nfsrc_unlock(struct nfsrvcache *rp);
245 static void nfsrc_wanted(struct nfsrvcache *rp);
246 static void nfsrc_freecache(struct nfsrvcache *rp);
247 static void nfsrc_trimcache(u_int64_t, struct socket *);
248 static int nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t,
249     struct socket *);
250 static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
251 static void nfsrc_marksametcpconn(u_int64_t);
252
253 /*
254  * Initialize the server request cache list
255  */
256 APPLESTATIC void
257 nfsrvd_initcache(void)
258 {
259         int i;
260         static int inited = 0;
261
262         if (inited)
263                 return;
264         inited = 1;
265         for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
266                 LIST_INIT(&nfsrvudphashtbl[i]);
267                 LIST_INIT(&nfsrvhashtbl[i]);
268         }
269         TAILQ_INIT(&nfsrvudplru);
270         nfsrc_tcpsavedreplies = 0;
271         nfsrc_udpcachesize = 0;
272         newnfsstats.srvcache_tcppeak = 0;
273         newnfsstats.srvcache_size = 0;
274 }
275
276 /*
277  * Get a cache entry for this request. Basically just malloc a new one
278  * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
279  * Call nfsrc_trimcache() to clean up the cache before returning.
280  */
281 APPLESTATIC int
282 nfsrvd_getcache(struct nfsrv_descript *nd, struct socket *so)
283 {
284         struct nfsrvcache *newrp;
285         int ret;
286
287         if (nd->nd_procnum == NFSPROC_NULL)
288                 panic("nfsd cache null");
289         MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
290             M_NFSRVCACHE, M_WAITOK);
291         NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
292         if (nd->nd_flag & ND_NFSV4)
293                 newrp->rc_flag = RC_NFSV4;
294         else if (nd->nd_flag & ND_NFSV3)
295                 newrp->rc_flag = RC_NFSV3;
296         else
297                 newrp->rc_flag = RC_NFSV2;
298         newrp->rc_xid = nd->nd_retxid;
299         newrp->rc_proc = nd->nd_procnum;
300         newrp->rc_sockref = nd->nd_sockref;
301         newrp->rc_cachetime = nd->nd_tcpconntime;
302         if (nd->nd_flag & ND_SAMETCPCONN)
303                 newrp->rc_flag |= RC_SAMETCPCONN;
304         if (nd->nd_nam2 != NULL) {
305                 newrp->rc_flag |= RC_UDP;
306                 ret = nfsrc_getudp(nd, newrp);
307         } else {
308                 ret = nfsrc_gettcp(nd, newrp);
309         }
310         nfsrc_trimcache(nd->nd_sockref, so);
311         NFSEXITCODE2(0, nd);
312         return (ret);
313 }
314
315 /*
316  * For UDP (v2, v3):
317  * - key on <xid, NFS version, RPC#, Client host ip#>
318  *   (at most one entry for each key)
319  */
320 static int
321 nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
322 {
323         struct nfsrvcache *rp;
324         struct sockaddr_in *saddr;
325         struct sockaddr_in6 *saddr6;
326         struct nfsrvhashhead *hp;
327         int ret = 0;
328
329         hp = NFSRCUDPHASH(newrp->rc_xid);
330 loop:
331         NFSLOCKCACHE();
332         LIST_FOREACH(rp, hp, rc_hash) {
333             if (newrp->rc_xid == rp->rc_xid &&
334                 newrp->rc_proc == rp->rc_proc &&
335                 (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
336                 nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
337                         if ((rp->rc_flag & RC_LOCKED) != 0) {
338                                 rp->rc_flag |= RC_WANTED;
339                                 NFSUNLOCKCACHE();
340                                 (void) tsleep((caddr_t)rp, PZERO - 1,
341                                     "nfsrc", 10 * hz);
342                                 goto loop;
343                         }
344                         if (rp->rc_flag == 0)
345                                 panic("nfs udp cache0");
346                         rp->rc_flag |= RC_LOCKED;
347                         TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
348                         TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
349                         if (rp->rc_flag & RC_INPROG) {
350                                 newnfsstats.srvcache_inproghits++;
351                                 NFSUNLOCKCACHE();
352                                 ret = RC_DROPIT;
353                         } else if (rp->rc_flag & RC_REPSTATUS) {
354                                 /*
355                                  * V2 only.
356                                  */
357                                 newnfsstats.srvcache_nonidemdonehits++;
358                                 NFSUNLOCKCACHE();
359                                 nfsrvd_rephead(nd);
360                                 *(nd->nd_errp) = rp->rc_status;
361                                 ret = RC_REPLY;
362                                 rp->rc_timestamp = NFSD_MONOSEC +
363                                         NFSRVCACHE_UDPTIMEOUT;
364                         } else if (rp->rc_flag & RC_REPMBUF) {
365                                 newnfsstats.srvcache_nonidemdonehits++;
366                                 NFSUNLOCKCACHE();
367                                 nd->nd_mreq = m_copym(rp->rc_reply, 0,
368                                         M_COPYALL, M_WAIT);
369                                 ret = RC_REPLY;
370                                 rp->rc_timestamp = NFSD_MONOSEC +
371                                         NFSRVCACHE_UDPTIMEOUT;
372                         } else {
373                                 panic("nfs udp cache1");
374                         }
375                         nfsrc_unlock(rp);
376                         free((caddr_t)newrp, M_NFSRVCACHE);
377                         goto out;
378                 }
379         }
380         newnfsstats.srvcache_misses++;
381         newnfsstats.srvcache_size++;
382         nfsrc_udpcachesize++;
383
384         newrp->rc_flag |= RC_INPROG;
385         saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
386         if (saddr->sin_family == AF_INET)
387                 newrp->rc_inet = saddr->sin_addr.s_addr;
388         else if (saddr->sin_family == AF_INET6) {
389                 saddr6 = (struct sockaddr_in6 *)saddr;
390                 NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
391                     sizeof (struct in6_addr));
392                 newrp->rc_flag |= RC_INETIPV6;
393         }
394         LIST_INSERT_HEAD(hp, newrp, rc_hash);
395         TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
396         NFSUNLOCKCACHE();
397         nd->nd_rp = newrp;
398         ret = RC_DOIT;
399
400 out:
401         NFSEXITCODE2(0, nd);
402         return (ret);
403 }
404
405 /*
406  * Update a request cache entry after the rpc has been done
407  */
408 APPLESTATIC struct nfsrvcache *
409 nfsrvd_updatecache(struct nfsrv_descript *nd, struct socket *so)
410 {
411         struct nfsrvcache *rp;
412         struct nfsrvcache *retrp = NULL;
413         mbuf_t m;
414
415         rp = nd->nd_rp;
416         if (!rp)
417                 panic("nfsrvd_updatecache null rp");
418         nd->nd_rp = NULL;
419         NFSLOCKCACHE();
420         nfsrc_lock(rp);
421         if (!(rp->rc_flag & RC_INPROG))
422                 panic("nfsrvd_updatecache not inprog");
423         rp->rc_flag &= ~RC_INPROG;
424         if (rp->rc_flag & RC_UDP) {
425                 TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
426                 TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
427         }
428
429         /*
430          * Reply from cache is a special case returned by nfsrv_checkseqid().
431          */
432         if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
433                 newnfsstats.srvcache_nonidemdonehits++;
434                 NFSUNLOCKCACHE();
435                 nd->nd_repstat = 0;
436                 if (nd->nd_mreq)
437                         mbuf_freem(nd->nd_mreq);
438                 if (!(rp->rc_flag & RC_REPMBUF))
439                         panic("reply from cache");
440                 nd->nd_mreq = m_copym(rp->rc_reply, 0,
441                     M_COPYALL, M_WAIT);
442                 rp->rc_timestamp = NFSD_MONOSEC + NFSRVCACHE_TCPTIMEOUT;
443                 nfsrc_unlock(rp);
444                 goto out;
445         }
446
447         /*
448          * If rc_refcnt > 0, save it
449          * For UDP, save it if ND_SAVEREPLY is set
450          * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
451          */
452         if (nd->nd_repstat != NFSERR_DONTREPLY &&
453             (rp->rc_refcnt > 0 ||
454              ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
455              ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
456               nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
457               nfsrc_tcpnonidempotent))) {
458                 if (rp->rc_refcnt > 0) {
459                         if (!(rp->rc_flag & RC_NFSV4))
460                                 panic("update_cache refcnt");
461                         rp->rc_flag |= RC_REFCNT;
462                 }
463                 if ((nd->nd_flag & ND_NFSV2) &&
464                     nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
465                         rp->rc_status = nd->nd_repstat;
466                         rp->rc_flag |= RC_REPSTATUS;
467                         NFSUNLOCKCACHE();
468                 } else {
469                         if (!(rp->rc_flag & RC_UDP)) {
470                             nfsrc_tcpsavedreplies++;
471                             if (nfsrc_tcpsavedreplies >
472                                 newnfsstats.srvcache_tcppeak)
473                                 newnfsstats.srvcache_tcppeak =
474                                     nfsrc_tcpsavedreplies;
475                         }
476                         NFSUNLOCKCACHE();
477                         m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAIT);
478                         NFSLOCKCACHE();
479                         rp->rc_reply = m;
480                         rp->rc_flag |= RC_REPMBUF;
481                         NFSUNLOCKCACHE();
482                 }
483                 if (rp->rc_flag & RC_UDP) {
484                         rp->rc_timestamp = NFSD_MONOSEC +
485                             NFSRVCACHE_UDPTIMEOUT;
486                         nfsrc_unlock(rp);
487                 } else {
488                         rp->rc_timestamp = NFSD_MONOSEC +
489                             NFSRVCACHE_TCPTIMEOUT;
490                         if (rp->rc_refcnt > 0)
491                                 nfsrc_unlock(rp);
492                         else
493                                 retrp = rp;
494                 }
495         } else {
496                 nfsrc_freecache(rp);
497                 NFSUNLOCKCACHE();
498         }
499
500 out:
501         nfsrc_trimcache(nd->nd_sockref, so);
502         NFSEXITCODE2(0, nd);
503         return (retrp);
504 }
505
506 /*
507  * Invalidate and, if possible, free an in prog cache entry.
508  * Must not sleep.
509  */
510 APPLESTATIC void
511 nfsrvd_delcache(struct nfsrvcache *rp)
512 {
513
514         if (!(rp->rc_flag & RC_INPROG))
515                 panic("nfsrvd_delcache not in prog");
516         NFSLOCKCACHE();
517         rp->rc_flag &= ~RC_INPROG;
518         if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
519                 nfsrc_freecache(rp);
520         NFSUNLOCKCACHE();
521 }
522
523 /*
524  * Called after nfsrvd_updatecache() once the reply is sent, to update
525  * the entry for nfsrc_activesocket() and unlock it. The argument is
526  * the pointer returned by nfsrvd_updatecache().
527  */
528 APPLESTATIC void
529 nfsrvd_sentcache(struct nfsrvcache *rp, struct socket *so, int err)
530 {
531         tcp_seq tmp_seq;
532
533         if (!(rp->rc_flag & RC_LOCKED))
534                 panic("nfsrvd_sentcache not locked");
535         if (!err) {
536                 if ((so->so_proto->pr_domain->dom_family != AF_INET &&
537                      so->so_proto->pr_domain->dom_family != AF_INET6) ||
538                      so->so_proto->pr_protocol != IPPROTO_TCP)
539                         panic("nfs sent cache");
540                 if (nfsrv_getsockseqnum(so, &tmp_seq)) {
541                         NFSLOCKCACHE();
542                         rp->rc_tcpseq = tmp_seq;
543                         rp->rc_flag |= RC_TCPSEQ;
544                         NFSUNLOCKCACHE();
545                 }
546         }
547         nfsrc_unlock(rp);
548 }
549
550 /*
551  * Get a cache entry for TCP
552  * - key on <xid, nfs version>
553  *   (allow multiple entries for a given key)
554  */
555 static int
556 nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
557 {
558         struct nfsrvcache *rp, *nextrp;
559         int i;
560         struct nfsrvcache *hitrp;
561         struct nfsrvhashhead *hp, nfsrc_templist;
562         int hit, ret = 0;
563
564         hp = NFSRCHASH(newrp->rc_xid);
565         newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
566 tryagain:
567         NFSLOCKCACHE();
568         hit = 1;
569         LIST_INIT(&nfsrc_templist);
570         /*
571          * Get all the matches and put them on the temp list.
572          */
573         rp = LIST_FIRST(hp);
574         while (rp != LIST_END(hp)) {
575                 nextrp = LIST_NEXT(rp, rc_hash);
576                 if (newrp->rc_xid == rp->rc_xid &&
577                     (!(rp->rc_flag & RC_INPROG) ||
578                      ((newrp->rc_flag & RC_SAMETCPCONN) &&
579                       newrp->rc_sockref == rp->rc_sockref)) &&
580                     (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
581                     newrp->rc_proc == rp->rc_proc &&
582                     ((newrp->rc_flag & RC_NFSV4) &&
583                      newrp->rc_sockref != rp->rc_sockref &&
584                      newrp->rc_cachetime >= rp->rc_cachetime)
585                     && newrp->rc_reqlen == rp->rc_reqlen &&
586                     newrp->rc_cksum == rp->rc_cksum) {
587                         LIST_REMOVE(rp, rc_hash);
588                         LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
589                 }
590                 rp = nextrp;
591         }
592
593         /*
594          * Now, use nfsrc_templist to decide if there is a match.
595          */
596         i = 0;
597         LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
598                 i++;
599                 if (rp->rc_refcnt > 0) {
600                         hit = 0;
601                         break;
602                 }
603         }
604         /*
605          * Can be a hit only if one entry left.
606          * Note possible hit entry and put nfsrc_templist back on hash
607          * list.
608          */
609         if (i != 1)
610                 hit = 0;
611         hitrp = rp = LIST_FIRST(&nfsrc_templist);
612         while (rp != LIST_END(&nfsrc_templist)) {
613                 nextrp = LIST_NEXT(rp, rc_hash);
614                 LIST_REMOVE(rp, rc_hash);
615                 LIST_INSERT_HEAD(hp, rp, rc_hash);
616                 rp = nextrp;
617         }
618         if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
619                 panic("nfs gettcp cache templist");
620
621         if (hit) {
622                 rp = hitrp;
623                 if ((rp->rc_flag & RC_LOCKED) != 0) {
624                         rp->rc_flag |= RC_WANTED;
625                         NFSUNLOCKCACHE();
626                         (void) tsleep((caddr_t)rp, PZERO-1, "nfsrc", 10 * hz);
627                         goto tryagain;
628                 }
629                 if (rp->rc_flag == 0)
630                         panic("nfs tcp cache0");
631                 rp->rc_flag |= RC_LOCKED;
632                 if (rp->rc_flag & RC_INPROG) {
633                         newnfsstats.srvcache_inproghits++;
634                         NFSUNLOCKCACHE();
635                         if (newrp->rc_sockref == rp->rc_sockref)
636                                 nfsrc_marksametcpconn(rp->rc_sockref);
637                         ret = RC_DROPIT;
638                 } else if (rp->rc_flag & RC_REPSTATUS) {
639                         /*
640                          * V2 only.
641                          */
642                         newnfsstats.srvcache_nonidemdonehits++;
643                         NFSUNLOCKCACHE();
644                         if (newrp->rc_sockref == rp->rc_sockref)
645                                 nfsrc_marksametcpconn(rp->rc_sockref);
646                         ret = RC_REPLY;
647                         nfsrvd_rephead(nd);
648                         *(nd->nd_errp) = rp->rc_status;
649                         rp->rc_timestamp = NFSD_MONOSEC +
650                                 NFSRVCACHE_TCPTIMEOUT;
651                 } else if (rp->rc_flag & RC_REPMBUF) {
652                         newnfsstats.srvcache_nonidemdonehits++;
653                         NFSUNLOCKCACHE();
654                         if (newrp->rc_sockref == rp->rc_sockref)
655                                 nfsrc_marksametcpconn(rp->rc_sockref);
656                         ret = RC_REPLY;
657                         nd->nd_mreq = m_copym(rp->rc_reply, 0,
658                                 M_COPYALL, M_WAIT);
659                         rp->rc_timestamp = NFSD_MONOSEC +
660                                 NFSRVCACHE_TCPTIMEOUT;
661                 } else {
662                         panic("nfs tcp cache1");
663                 }
664                 nfsrc_unlock(rp);
665                 free((caddr_t)newrp, M_NFSRVCACHE);
666                 goto out;
667         }
668         newnfsstats.srvcache_misses++;
669         newnfsstats.srvcache_size++;
670
671         /*
672          * For TCP, multiple entries for a key are allowed, so don't
673          * chain it into the hash table until done.
674          */
675         newrp->rc_cachetime = NFSD_MONOSEC;
676         newrp->rc_flag |= RC_INPROG;
677         LIST_INSERT_HEAD(hp, newrp, rc_hash);
678         NFSUNLOCKCACHE();
679         nd->nd_rp = newrp;
680         ret = RC_DOIT;
681
682 out:
683         NFSEXITCODE2(0, nd);
684         return (ret);
685 }
686
687 /*
688  * Lock a cache entry.
689  * Also puts a mutex lock on the cache list.
690  */
691 static void
692 nfsrc_lock(struct nfsrvcache *rp)
693 {
694         NFSCACHELOCKREQUIRED();
695         while ((rp->rc_flag & RC_LOCKED) != 0) {
696                 rp->rc_flag |= RC_WANTED;
697                 (void) nfsmsleep((caddr_t)rp, NFSCACHEMUTEXPTR, PZERO - 1,
698                     "nfsrc", 0);
699         }
700         rp->rc_flag |= RC_LOCKED;
701 }
702
703 /*
704  * Unlock a cache entry.
705  */
706 static void
707 nfsrc_unlock(struct nfsrvcache *rp)
708 {
709
710         NFSLOCKCACHE();
711         rp->rc_flag &= ~RC_LOCKED;
712         nfsrc_wanted(rp);
713         NFSUNLOCKCACHE();
714 }
715
716 /*
717  * Wakeup anyone wanting entry.
718  */
719 static void
720 nfsrc_wanted(struct nfsrvcache *rp)
721 {
722         if (rp->rc_flag & RC_WANTED) {
723                 rp->rc_flag &= ~RC_WANTED;
724                 wakeup((caddr_t)rp);
725         }
726 }
727
728 /*
729  * Free up the entry.
730  * Must not sleep.
731  */
732 static void
733 nfsrc_freecache(struct nfsrvcache *rp)
734 {
735
736         NFSCACHELOCKREQUIRED();
737         LIST_REMOVE(rp, rc_hash);
738         if (rp->rc_flag & RC_UDP) {
739                 TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
740                 nfsrc_udpcachesize--;
741         }
742         nfsrc_wanted(rp);
743         if (rp->rc_flag & RC_REPMBUF) {
744                 mbuf_freem(rp->rc_reply);
745                 if (!(rp->rc_flag & RC_UDP))
746                         nfsrc_tcpsavedreplies--;
747         }
748         FREE((caddr_t)rp, M_NFSRVCACHE);
749         newnfsstats.srvcache_size--;
750 }
751
752 /*
753  * Clean out the cache. Called when nfsserver module is unloaded.
754  */
755 APPLESTATIC void
756 nfsrvd_cleancache(void)
757 {
758         struct nfsrvcache *rp, *nextrp;
759         int i;
760
761         NFSLOCKCACHE();
762         for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
763                 LIST_FOREACH_SAFE(rp, &nfsrvhashtbl[i], rc_hash, nextrp) {
764                         nfsrc_freecache(rp);
765                 }
766         }
767         for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
768                 LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
769                         nfsrc_freecache(rp);
770                 }
771         }
772         newnfsstats.srvcache_size = 0;
773         nfsrc_tcpsavedreplies = 0;
774         NFSUNLOCKCACHE();
775 }
776
777 /*
778  * The basic rule is to get rid of entries that are expired.
779  */
780 static void
781 nfsrc_trimcache(u_int64_t sockref, struct socket *so)
782 {
783         struct nfsrvcache *rp, *nextrp;
784         int i;
785
786         NFSLOCKCACHE();
787         TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
788                 if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
789                      && rp->rc_refcnt == 0
790                      && ((rp->rc_flag & RC_REFCNT) ||
791                          NFSD_MONOSEC > rp->rc_timestamp ||
792                          nfsrc_udpcachesize > nfsrc_udphighwater))
793                         nfsrc_freecache(rp);
794         }
795         for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
796                 LIST_FOREACH_SAFE(rp, &nfsrvhashtbl[i], rc_hash, nextrp) {
797                         if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
798                              && rp->rc_refcnt == 0
799                              && ((rp->rc_flag & RC_REFCNT) ||
800                                  NFSD_MONOSEC > rp->rc_timestamp ||
801                                  nfsrc_activesocket(rp, sockref, so)))
802                                 nfsrc_freecache(rp);
803                 }
804         }
805         NFSUNLOCKCACHE();
806 }
807
808 /*
809  * Add a seqid# reference to the cache entry.
810  */
811 APPLESTATIC void
812 nfsrvd_refcache(struct nfsrvcache *rp)
813 {
814
815         NFSLOCKCACHE();
816         if (rp->rc_refcnt < 0)
817                 panic("nfs cache refcnt");
818         rp->rc_refcnt++;
819         NFSUNLOCKCACHE();
820 }
821
822 /*
823  * Dereference a seqid# cache entry.
824  */
825 APPLESTATIC void
826 nfsrvd_derefcache(struct nfsrvcache *rp)
827 {
828
829         NFSLOCKCACHE();
830         if (rp->rc_refcnt <= 0)
831                 panic("nfs cache derefcnt");
832         rp->rc_refcnt--;
833         if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
834                 nfsrc_freecache(rp);
835         NFSUNLOCKCACHE();
836 }
837
838 /*
839  * Check to see if the socket is active.
840  * Return 1 if the reply has been received/acknowledged by the client,
841  * 0 otherwise.
842  * XXX - Uses tcp internals.
843  */
844 static int
845 nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t cur_sockref,
846     struct socket *cur_so)
847 {
848         int ret = 0;
849
850         if (!(rp->rc_flag & RC_TCPSEQ))
851                 return (ret);
852         /*
853          * If the sockref is the same, it is the same TCP connection.
854          */
855         if (cur_sockref == rp->rc_sockref)
856                 ret = nfsrv_checksockseqnum(cur_so, rp->rc_tcpseq);
857         return (ret);
858 }
859
860 /*
861  * Calculate the length of the mbuf list and a checksum on the first up to
862  * NFSRVCACHE_CHECKLEN bytes.
863  */
864 static int
865 nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
866 {
867         int len = 0, cklen;
868         mbuf_t m;
869
870         m = m1;
871         while (m) {
872                 len += mbuf_len(m);
873                 m = mbuf_next(m);
874         }
875         cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
876         *cksum = in_cksum(m1, cklen);
877         return (len);
878 }
879
880 /*
881  * Mark a TCP connection that is seeing retries. Should never happen for
882  * NFSv4.
883  */
884 static void
885 nfsrc_marksametcpconn(u_int64_t sockref)
886 {
887 }
888