]> CyberLeo.Net >> Repos - FreeBSD/releng/8.1.git/blob - sys/fs/nfsserver/nfs_nfsdcache.c
Copy stable/8 to releng/8.1 in preparation for 8.1-RC1.
[FreeBSD/releng/8.1.git] / sys / fs / nfsserver / nfs_nfsdcache.c
1 /*-
2  * Copyright (c) 1989, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Rick Macklem at The University of Guelph.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 /*
38  * Here is the basic algorithm:
39  * First, some design criteria I used:
40  * - I think a false hit is more serious than a false miss
41  * - A false hit for an RPC that has Op(s) that order via seqid# must be
42  *   avoided at all cost
43  * - A valid hit will probably happen a long time after the original reply
44  *   and the TCP socket that the original request was received on will no
45  *   longer be active
46  *   (The long time delay implies to me that LRU is not appropriate.)
47  * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
48  *   in them as well as minimizing the risk of redoing retried non-idempotent
49  *   Ops.
50  * Because it is biased towards avoiding false hits, multiple entries with
51  * the same xid are to be expected, especially for the case of the entry
52  * in the cache being related to a seqid# sequenced Op.
53  * 
54  * The basic algorithm I'm about to code up:
55  * - Null RPCs bypass the cache and are just done
56  * For TCP
57  *      - key on <xid, NFS version> (as noted above, there can be several
58  *                                   entries with the same key)
59  *      When a request arrives:
60  *              For all that match key
61  *              - if RPC# != OR request_size !=
62  *                      - not a match with this one
63  *              - if NFSv4 and received on same TCP socket OR
64  *                      received on a TCP connection created before the
65  *                      entry was cached
66  *                      - not a match with this one
67  *                      (V2,3 clients might retry on same TCP socket)
68  *              - calculate checksum on first N bytes of NFS XDR
69  *              - if checksum !=
70  *                      - not a match for this one
71  *              If any of the remaining ones that match has a
72  *                      seqid_refcnt > 0
73  *                      - not a match (go do RPC, using new cache entry)
74  *              If one match left
75  *                      - a hit (reply from cache)
76  *              else
77  *                      - miss (go do RPC, using new cache entry)
78  * 
79  *      During processing of NFSv4 request:
80  *              - set a flag when a non-idempotent Op is processed
81  *              - when an Op that uses a seqid# (Open,...) is processed
82  *                      - if same seqid# as referenced entry in cache
83  *                              - free new cache entry
84  *                              - reply from referenced cache entry
85  *                        else if next seqid# in order
86  *                              - free referenced cache entry
87  *                              - increment seqid_refcnt on new cache entry
88  *                              - set pointer from Openowner/Lockowner to
89  *                                      new cache entry (aka reference it)
90  *                        else if first seqid# in sequence
91  *                              - increment seqid_refcnt on new cache entry
92  *                              - set pointer from Openowner/Lockowner to
93  *                                      new cache entry (aka reference it)
94  * 
95  *      At end of RPC processing:
96  *              - if seqid_refcnt > 0 OR flagged non-idempotent on new
97  *                      cache entry
98  *                      - save reply in cache entry
99  *                      - calculate checksum on first N bytes of NFS XDR
100  *                              request
101  *                      - note op and length of XDR request (in bytes)
102  *                      - timestamp it
103  *                else
104  *                      - free new cache entry
105  *              - Send reply (noting info for socket activity check, below)
106  * 
107  *      For cache entries saved above:
108  *              - if saved since seqid_refcnt was > 0
109  *                      - free when seqid_refcnt decrements to 0
110  *                        (when next one in sequence is processed above, or
111  *                         when Openowner/Lockowner is discarded)
112  *                else { non-idempotent Op(s) }
113  *                      - free when
114  *                              - some further activity observed on same
115  *                                      socket
116  *                                (I'm not yet sure how I'm going to do
117  *                                 this. Maybe look at the TCP connection
118  *                                 to see if the send_tcp_sequence# is well
119  *                                 past sent reply OR K additional RPCs
120  *                                 replied on same socket OR?)
121  *                        OR
122  *                              - when very old (hours, days, weeks?)
123  * 
124  * For UDP (v2, 3 only), pretty much the old way:
125  * - key on <xid, NFS version, RPC#, Client host ip#>
126  *   (at most one entry for each key)
127  * 
128  * When a Request arrives:
129  * - if a match with entry via key
130  *      - if RPC marked In_progress
131  *              - discard request (don't send reply)
132  *        else
133  *              - reply from cache
134  *              - timestamp cache entry
135  *   else
136  *      - add entry to cache, marked In_progress
137  *      - do RPC
138  *      - when RPC done
139  *              - if RPC# non-idempotent
140  *                      - mark entry Done (not In_progress)
141  *                      - save reply
142  *                      - timestamp cache entry
143  *                else
144  *                      - free cache entry
145  *              - send reply
146  * 
147  * Later, entries with saved replies are free'd a short time (few minutes)
148  * after reply sent (timestamp).
149  * Reference: Chet Juszczak, "Improving the Performance and Correctness
150  *              of an NFS Server", in Proc. Winter 1989 USENIX Conference,
151  *              pages 53-63. San Diego, February 1989.
152  *       for the UDP case.
153  * nfsrc_floodlevel is set to the allowable upper limit for saved replies
154  *      for TCP. For V3, a reply won't be saved when the flood level is
155  *      hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
156  *      that case. This level should be set high enough that this almost
157  *      never happens.
158  */
159 #ifndef APPLEKEXT
160 #include <fs/nfs/nfsport.h>
161
162 extern struct nfsstats newnfsstats;
163 NFSCACHEMUTEX;
164 int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
165 #endif  /* !APPLEKEXT */
166
167 static int nfsrc_tcpnonidempotent = 1;
168 static int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER, nfsrc_udpcachesize = 0;
169 static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
170 static struct nfsrvhashhead nfsrvhashtbl[NFSRVCACHE_HASHSIZE],
171     nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];
172 /*
173  * and the reverse mapping from generic to Version 2 procedure numbers
174  */
175 static int newnfsv2_procid[NFS_V3NPROCS] = {
176         NFSV2PROC_NULL,
177         NFSV2PROC_GETATTR,
178         NFSV2PROC_SETATTR,
179         NFSV2PROC_LOOKUP,
180         NFSV2PROC_NOOP,
181         NFSV2PROC_READLINK,
182         NFSV2PROC_READ,
183         NFSV2PROC_WRITE,
184         NFSV2PROC_CREATE,
185         NFSV2PROC_MKDIR,
186         NFSV2PROC_SYMLINK,
187         NFSV2PROC_CREATE,
188         NFSV2PROC_REMOVE,
189         NFSV2PROC_RMDIR,
190         NFSV2PROC_RENAME,
191         NFSV2PROC_LINK,
192         NFSV2PROC_READDIR,
193         NFSV2PROC_NOOP,
194         NFSV2PROC_STATFS,
195         NFSV2PROC_NOOP,
196         NFSV2PROC_NOOP,
197         NFSV2PROC_NOOP,
198 };
199
/*
 * Bucket index for an xid: the xid plus the xid shifted right by 24 bits,
 * reduced modulo the table size.  The UDP and TCP tables share the same
 * index computation and differ only in the table used.
 */
#define	NFSRC_HASHIDX(xid) \
	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid)	(&nfsrvudphashtbl[NFSRC_HASHIDX(xid)])
#define	NFSRCHASH(xid)		(&nfsrvhashtbl[NFSRC_HASHIDX(xid)])

#define	TRUE	1
#define	FALSE	0

/* Maximum number of leading request bytes covered by the checksum. */
#define	NFSRVCACHE_CHECKLEN	100
207
208 /* True iff the rpc reply is an nfs status ONLY! */
209 static int nfsv2_repstat[NFS_V3NPROCS] = {
210         FALSE,
211         FALSE,
212         FALSE,
213         FALSE,
214         FALSE,
215         FALSE,
216         FALSE,
217         FALSE,
218         FALSE,
219         FALSE,
220         TRUE,
221         TRUE,
222         TRUE,
223         TRUE,
224         FALSE,
225         TRUE,
226         FALSE,
227         FALSE,
228         FALSE,
229         FALSE,
230         FALSE,
231         FALSE,
232 };
233
/*
 * Address family of the client address stored in a cache entry:
 * AF_INET6 when RC_INETIPV6 is set in rc_flag, AF_INET otherwise.
 */
#define	NETFAMILY(rp) \
	((((rp)->rc_flag & RC_INETIPV6) != 0) ? AF_INET6 : AF_INET)
239
240 /* local functions */
241 static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
242 static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
243 static void nfsrc_lock(struct nfsrvcache *rp);
244 static void nfsrc_unlock(struct nfsrvcache *rp);
245 static void nfsrc_wanted(struct nfsrvcache *rp);
246 static void nfsrc_freecache(struct nfsrvcache *rp);
247 static void nfsrc_trimcache(u_int64_t, struct socket *);
248 static int nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t,
249     struct socket *);
250 static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
251 static void nfsrc_marksametcpconn(u_int64_t);
252
253 /*
254  * Initialize the server request cache list
255  */
256 APPLESTATIC void
257 nfsrvd_initcache(void)
258 {
259         int i;
260         static int inited = 0;
261
262         if (inited)
263                 return;
264         inited = 1;
265         for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
266                 LIST_INIT(&nfsrvudphashtbl[i]);
267                 LIST_INIT(&nfsrvhashtbl[i]);
268         }
269         TAILQ_INIT(&nfsrvudplru);
270         nfsrc_tcpsavedreplies = 0;
271         nfsrc_udpcachesize = 0;
272         newnfsstats.srvcache_tcppeak = 0;
273         newnfsstats.srvcache_size = 0;
274 }
275
276 /*
277  * Get a cache entry for this request. Basically just malloc a new one
278  * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
279  * Call nfsrc_trimcache() to clean up the cache before returning.
280  */
281 APPLESTATIC int
282 nfsrvd_getcache(struct nfsrv_descript *nd, struct socket *so)
283 {
284         struct nfsrvcache *newrp;
285         int ret;
286
287         if (nd->nd_procnum == NFSPROC_NULL)
288                 panic("nfsd cache null");
289         MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
290             M_NFSRVCACHE, M_WAITOK);
291         NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
292         if (nd->nd_flag & ND_NFSV4)
293                 newrp->rc_flag = RC_NFSV4;
294         else if (nd->nd_flag & ND_NFSV3)
295                 newrp->rc_flag = RC_NFSV3;
296         else
297                 newrp->rc_flag = RC_NFSV2;
298         newrp->rc_xid = nd->nd_retxid;
299         newrp->rc_proc = nd->nd_procnum;
300         newrp->rc_sockref = nd->nd_sockref;
301         newrp->rc_cachetime = nd->nd_tcpconntime;
302         if (nd->nd_flag & ND_SAMETCPCONN)
303                 newrp->rc_flag |= RC_SAMETCPCONN;
304         if (nd->nd_nam2 != NULL) {
305                 newrp->rc_flag |= RC_UDP;
306                 ret = nfsrc_getudp(nd, newrp);
307         } else {
308                 ret = nfsrc_gettcp(nd, newrp);
309         }
310         nfsrc_trimcache(nd->nd_sockref, so);
311         return (ret);
312 }
313
314 /*
315  * For UDP (v2, v3):
316  * - key on <xid, NFS version, RPC#, Client host ip#>
317  *   (at most one entry for each key)
318  */
319 static int
320 nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
321 {
322         struct nfsrvcache *rp;
323         struct sockaddr_in *saddr;
324         struct sockaddr_in6 *saddr6;
325         struct nfsrvhashhead *hp;
326         int ret = 0;
327
328         hp = NFSRCUDPHASH(newrp->rc_xid);
329 loop:
330         NFSLOCKCACHE();
331         LIST_FOREACH(rp, hp, rc_hash) {
332             if (newrp->rc_xid == rp->rc_xid &&
333                 newrp->rc_proc == rp->rc_proc &&
334                 (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
335                 nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
336                         if ((rp->rc_flag & RC_LOCKED) != 0) {
337                                 rp->rc_flag |= RC_WANTED;
338                                 NFSUNLOCKCACHE();
339                                 (void) tsleep((caddr_t)rp, PZERO - 1,
340                                     "nfsrc", 10 * hz);
341                                 goto loop;
342                         }
343                         if (rp->rc_flag == 0)
344                                 panic("nfs udp cache0");
345                         rp->rc_flag |= RC_LOCKED;
346                         TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
347                         TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
348                         if (rp->rc_flag & RC_INPROG) {
349                                 newnfsstats.srvcache_inproghits++;
350                                 NFSUNLOCKCACHE();
351                                 ret = RC_DROPIT;
352                         } else if (rp->rc_flag & RC_REPSTATUS) {
353                                 /*
354                                  * V2 only.
355                                  */
356                                 newnfsstats.srvcache_nonidemdonehits++;
357                                 NFSUNLOCKCACHE();
358                                 nfsrvd_rephead(nd);
359                                 *(nd->nd_errp) = rp->rc_status;
360                                 ret = RC_REPLY;
361                                 rp->rc_timestamp = NFSD_MONOSEC +
362                                         NFSRVCACHE_UDPTIMEOUT;
363                         } else if (rp->rc_flag & RC_REPMBUF) {
364                                 newnfsstats.srvcache_nonidemdonehits++;
365                                 NFSUNLOCKCACHE();
366                                 nd->nd_mreq = m_copym(rp->rc_reply, 0,
367                                         M_COPYALL, M_WAIT);
368                                 ret = RC_REPLY;
369                                 rp->rc_timestamp = NFSD_MONOSEC +
370                                         NFSRVCACHE_UDPTIMEOUT;
371                         } else {
372                                 panic("nfs udp cache1");
373                         }
374                         nfsrc_unlock(rp);
375                         free((caddr_t)newrp, M_NFSRVCACHE);
376                         return (ret);
377                 }
378         }
379         newnfsstats.srvcache_misses++;
380         newnfsstats.srvcache_size++;
381         nfsrc_udpcachesize++;
382
383         newrp->rc_flag |= RC_INPROG;
384         saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
385         if (saddr->sin_family == AF_INET)
386                 newrp->rc_inet = saddr->sin_addr.s_addr;
387         else if (saddr->sin_family == AF_INET6) {
388                 saddr6 = (struct sockaddr_in6 *)saddr;
389                 NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
390                     sizeof (struct in6_addr));
391                 newrp->rc_flag |= RC_INETIPV6;
392         }
393         LIST_INSERT_HEAD(hp, newrp, rc_hash);
394         TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
395         NFSUNLOCKCACHE();
396         nd->nd_rp = newrp;
397         return (RC_DOIT);
398 }
399
400 /*
401  * Update a request cache entry after the rpc has been done
402  */
403 APPLESTATIC struct nfsrvcache *
404 nfsrvd_updatecache(struct nfsrv_descript *nd, struct socket *so)
405 {
406         struct nfsrvcache *rp;
407         struct nfsrvcache *retrp = NULL;
408
409         rp = nd->nd_rp;
410         if (!rp)
411                 panic("nfsrvd_updatecache null rp");
412         nd->nd_rp = NULL;
413         NFSLOCKCACHE();
414         nfsrc_lock(rp);
415         if (!(rp->rc_flag & RC_INPROG))
416                 panic("nfsrvd_updatecache not inprog");
417         rp->rc_flag &= ~RC_INPROG;
418         if (rp->rc_flag & RC_UDP) {
419                 TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
420                 TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
421         }
422
423         /*
424          * Reply from cache is a special case returned by nfsrv_checkseqid().
425          */
426         if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
427                 newnfsstats.srvcache_nonidemdonehits++;
428                 NFSUNLOCKCACHE();
429                 nd->nd_repstat = 0;
430                 if (nd->nd_mreq)
431                         mbuf_freem(nd->nd_mreq);
432                 if (!(rp->rc_flag & RC_REPMBUF))
433                         panic("reply from cache");
434                 nd->nd_mreq = m_copym(rp->rc_reply, 0,
435                     M_COPYALL, M_WAIT);
436                 rp->rc_timestamp = NFSD_MONOSEC + NFSRVCACHE_TCPTIMEOUT;
437                 nfsrc_unlock(rp);
438                 nfsrc_trimcache(nd->nd_sockref, so);
439                 return (retrp);
440         }
441
442         /*
443          * If rc_refcnt > 0, save it
444          * For UDP, save it if ND_SAVEREPLY is set
445          * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
446          */
447         if (nd->nd_repstat != NFSERR_DONTREPLY &&
448             (rp->rc_refcnt > 0 ||
449              ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
450              ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
451               nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
452               nfsrc_tcpnonidempotent))) {
453                 if (rp->rc_refcnt > 0) {
454                         if (!(rp->rc_flag & RC_NFSV4))
455                                 panic("update_cache refcnt");
456                         rp->rc_flag |= RC_REFCNT;
457                 }
458                 if ((nd->nd_flag & ND_NFSV2) &&
459                     nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
460                         NFSUNLOCKCACHE();
461                         rp->rc_status = nd->nd_repstat;
462                         rp->rc_flag |= RC_REPSTATUS;
463                 } else {
464                         if (!(rp->rc_flag & RC_UDP)) {
465                             nfsrc_tcpsavedreplies++;
466                             if (nfsrc_tcpsavedreplies >
467                                 newnfsstats.srvcache_tcppeak)
468                                 newnfsstats.srvcache_tcppeak =
469                                     nfsrc_tcpsavedreplies;
470                         }
471                         NFSUNLOCKCACHE();
472                         rp->rc_reply = m_copym(nd->nd_mreq, 0, M_COPYALL,
473                             M_WAIT);
474                         rp->rc_flag |= RC_REPMBUF;
475                 }
476                 if (rp->rc_flag & RC_UDP) {
477                         rp->rc_timestamp = NFSD_MONOSEC +
478                             NFSRVCACHE_UDPTIMEOUT;
479                         nfsrc_unlock(rp);
480                 } else {
481                         rp->rc_timestamp = NFSD_MONOSEC +
482                             NFSRVCACHE_TCPTIMEOUT;
483                         if (rp->rc_refcnt > 0)
484                                 nfsrc_unlock(rp);
485                         else
486                                 retrp = rp;
487                 }
488         } else {
489                 nfsrc_freecache(rp);
490                 NFSUNLOCKCACHE();
491         }
492         nfsrc_trimcache(nd->nd_sockref, so);
493         return (retrp);
494 }
495
496 /*
497  * Invalidate and, if possible, free an in prog cache entry.
498  * Must not sleep.
499  */
500 APPLESTATIC void
501 nfsrvd_delcache(struct nfsrvcache *rp)
502 {
503
504         if (!(rp->rc_flag & RC_INPROG))
505                 panic("nfsrvd_delcache not in prog");
506         NFSLOCKCACHE();
507         rp->rc_flag &= ~RC_INPROG;
508         if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
509                 nfsrc_freecache(rp);
510         NFSUNLOCKCACHE();
511 }
512
513 /*
514  * Called after nfsrvd_updatecache() once the reply is sent, to update
515  * the entry for nfsrc_activesocket() and unlock it. The argument is
516  * the pointer returned by nfsrvd_updatecache().
517  */
518 APPLESTATIC void
519 nfsrvd_sentcache(struct nfsrvcache *rp, struct socket *so, int err)
520 {
521
522         if (!(rp->rc_flag & RC_LOCKED))
523                 panic("nfsrvd_sentcache not locked");
524         if (!err) {
525                 if ((so->so_proto->pr_domain->dom_family != AF_INET &&
526                      so->so_proto->pr_domain->dom_family != AF_INET6) ||
527                      so->so_proto->pr_protocol != IPPROTO_TCP)
528                         panic("nfs sent cache");
529                 if (nfsrv_getsockseqnum(so, &rp->rc_tcpseq))
530                         rp->rc_flag |= RC_TCPSEQ;
531         }
532         nfsrc_unlock(rp);
533 }
534
535 /*
536  * Get a cache entry for TCP
537  * - key on <xid, nfs version>
538  *   (allow multiple entries for a given key)
539  */
540 static int
541 nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
542 {
543         struct nfsrvcache *rp, *nextrp;
544         int i;
545         struct nfsrvcache *hitrp;
546         struct nfsrvhashhead *hp, nfsrc_templist;
547         int hit, ret = 0;
548
549         hp = NFSRCHASH(newrp->rc_xid);
550         newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
551 tryagain:
552         NFSLOCKCACHE();
553         hit = 1;
554         LIST_INIT(&nfsrc_templist);
555         /*
556          * Get all the matches and put them on the temp list.
557          */
558         rp = LIST_FIRST(hp);
559         while (rp != LIST_END(hp)) {
560                 nextrp = LIST_NEXT(rp, rc_hash);
561                 if (newrp->rc_xid == rp->rc_xid &&
562                     (!(rp->rc_flag & RC_INPROG) ||
563                      ((newrp->rc_flag & RC_SAMETCPCONN) &&
564                       newrp->rc_sockref == rp->rc_sockref)) &&
565                     (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
566                     newrp->rc_proc == rp->rc_proc &&
567                     ((newrp->rc_flag & RC_NFSV4) &&
568                      newrp->rc_sockref != rp->rc_sockref &&
569                      newrp->rc_cachetime >= rp->rc_cachetime)
570                     && newrp->rc_reqlen == rp->rc_reqlen &&
571                     newrp->rc_cksum == rp->rc_cksum) {
572                         LIST_REMOVE(rp, rc_hash);
573                         LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
574                 }
575                 rp = nextrp;
576         }
577
578         /*
579          * Now, use nfsrc_templist to decide if there is a match.
580          */
581         i = 0;
582         LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
583                 i++;
584                 if (rp->rc_refcnt > 0) {
585                         hit = 0;
586                         break;
587                 }
588         }
589         /*
590          * Can be a hit only if one entry left.
591          * Note possible hit entry and put nfsrc_templist back on hash
592          * list.
593          */
594         if (i != 1)
595                 hit = 0;
596         hitrp = rp = LIST_FIRST(&nfsrc_templist);
597         while (rp != LIST_END(&nfsrc_templist)) {
598                 nextrp = LIST_NEXT(rp, rc_hash);
599                 LIST_REMOVE(rp, rc_hash);
600                 LIST_INSERT_HEAD(hp, rp, rc_hash);
601                 rp = nextrp;
602         }
603         if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
604                 panic("nfs gettcp cache templist");
605
606         if (hit) {
607                 rp = hitrp;
608                 if ((rp->rc_flag & RC_LOCKED) != 0) {
609                         rp->rc_flag |= RC_WANTED;
610                         NFSUNLOCKCACHE();
611                         (void) tsleep((caddr_t)rp, PZERO-1, "nfsrc", 10 * hz);
612                         goto tryagain;
613                 }
614                 if (rp->rc_flag == 0)
615                         panic("nfs tcp cache0");
616                 rp->rc_flag |= RC_LOCKED;
617                 if (rp->rc_flag & RC_INPROG) {
618                         newnfsstats.srvcache_inproghits++;
619                         NFSUNLOCKCACHE();
620                         if (newrp->rc_sockref == rp->rc_sockref)
621                                 nfsrc_marksametcpconn(rp->rc_sockref);
622                         ret = RC_DROPIT;
623                 } else if (rp->rc_flag & RC_REPSTATUS) {
624                         /*
625                          * V2 only.
626                          */
627                         newnfsstats.srvcache_nonidemdonehits++;
628                         NFSUNLOCKCACHE();
629                         if (newrp->rc_sockref == rp->rc_sockref)
630                                 nfsrc_marksametcpconn(rp->rc_sockref);
631                         ret = RC_REPLY;
632                         nfsrvd_rephead(nd);
633                         *(nd->nd_errp) = rp->rc_status;
634                         rp->rc_timestamp = NFSD_MONOSEC +
635                                 NFSRVCACHE_TCPTIMEOUT;
636                 } else if (rp->rc_flag & RC_REPMBUF) {
637                         newnfsstats.srvcache_nonidemdonehits++;
638                         NFSUNLOCKCACHE();
639                         if (newrp->rc_sockref == rp->rc_sockref)
640                                 nfsrc_marksametcpconn(rp->rc_sockref);
641                         ret = RC_REPLY;
642                         nd->nd_mreq = m_copym(rp->rc_reply, 0,
643                                 M_COPYALL, M_WAIT);
644                         rp->rc_timestamp = NFSD_MONOSEC +
645                                 NFSRVCACHE_TCPTIMEOUT;
646                 } else {
647                         panic("nfs tcp cache1");
648                 }
649                 nfsrc_unlock(rp);
650                 free((caddr_t)newrp, M_NFSRVCACHE);
651                 return (ret);
652         }
653         newnfsstats.srvcache_misses++;
654         newnfsstats.srvcache_size++;
655
656         /*
657          * For TCP, multiple entries for a key are allowed, so don't
658          * chain it into the hash table until done.
659          */
660         newrp->rc_cachetime = NFSD_MONOSEC;
661         newrp->rc_flag |= RC_INPROG;
662         LIST_INSERT_HEAD(hp, newrp, rc_hash);
663         NFSUNLOCKCACHE();
664         nd->nd_rp = newrp;
665         return (RC_DOIT);
666 }
667
668 /*
669  * Lock a cache entry.
670  * Also puts a mutex lock on the cache list.
671  */
672 static void
673 nfsrc_lock(struct nfsrvcache *rp)
674 {
675         NFSCACHELOCKREQUIRED();
676         while ((rp->rc_flag & RC_LOCKED) != 0) {
677                 rp->rc_flag |= RC_WANTED;
678                 (void) nfsmsleep((caddr_t)rp, NFSCACHEMUTEXPTR, PZERO - 1,
679                     "nfsrc", 0);
680         }
681         rp->rc_flag |= RC_LOCKED;
682 }
683
684 /*
685  * Unlock a cache entry.
686  */
687 static void
688 nfsrc_unlock(struct nfsrvcache *rp)
689 {
690         rp->rc_flag &= ~RC_LOCKED;
691         nfsrc_wanted(rp);
692 }
693
694 /*
695  * Wakeup anyone wanting entry.
696  */
697 static void
698 nfsrc_wanted(struct nfsrvcache *rp)
699 {
700         if (rp->rc_flag & RC_WANTED) {
701                 rp->rc_flag &= ~RC_WANTED;
702                 wakeup((caddr_t)rp);
703         }
704 }
705
706 /*
707  * Free up the entry.
708  * Must not sleep.
709  */
710 static void
711 nfsrc_freecache(struct nfsrvcache *rp)
712 {
713
714         NFSCACHELOCKREQUIRED();
715         LIST_REMOVE(rp, rc_hash);
716         if (rp->rc_flag & RC_UDP) {
717                 TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
718                 nfsrc_udpcachesize--;
719         }
720         nfsrc_wanted(rp);
721         if (rp->rc_flag & RC_REPMBUF) {
722                 mbuf_freem(rp->rc_reply);
723                 if (!(rp->rc_flag & RC_UDP))
724                         nfsrc_tcpsavedreplies--;
725         }
726         FREE((caddr_t)rp, M_NFSRVCACHE);
727         newnfsstats.srvcache_size--;
728 }
729
#ifdef notdef
/*
 * Empty the cache: free every entry on the TCP hash chains, then every
 * entry on the UDP hash chains, and reset the size counters.
 * Called when the last nfsd terminates.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *entry, *next;
	int bucket;

	NFSLOCKCACHE();
	for (bucket = 0; bucket < NFSRVCACHE_HASHSIZE; bucket++) {
		LIST_FOREACH_SAFE(entry, &nfsrvhashtbl[bucket], rc_hash, next)
			nfsrc_freecache(entry);
	}
	for (bucket = 0; bucket < NFSRVCACHE_HASHSIZE; bucket++) {
		LIST_FOREACH_SAFE(entry, &nfsrvudphashtbl[bucket], rc_hash,
		    next)
			nfsrc_freecache(entry);
	}
	newnfsstats.srvcache_size = 0;
	nfsrc_tcpsavedreplies = 0;
	NFSUNLOCKCACHE();
}
#endif	/* notdef */
756
757 /*
758  * The basic rule is to get rid of entries that are expired.
759  */
760 static void
761 nfsrc_trimcache(u_int64_t sockref, struct socket *so)
762 {
763         struct nfsrvcache *rp, *nextrp;
764         int i;
765
766         NFSLOCKCACHE();
767         TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
768                 if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
769                      && rp->rc_refcnt == 0
770                      && ((rp->rc_flag & RC_REFCNT) ||
771                          NFSD_MONOSEC > rp->rc_timestamp ||
772                          nfsrc_udpcachesize > nfsrc_udphighwater))
773                         nfsrc_freecache(rp);
774         }
775         for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
776                 LIST_FOREACH_SAFE(rp, &nfsrvhashtbl[i], rc_hash, nextrp) {
777                         if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
778                              && rp->rc_refcnt == 0
779                              && ((rp->rc_flag & RC_REFCNT) ||
780                                  NFSD_MONOSEC > rp->rc_timestamp ||
781                                  nfsrc_activesocket(rp, sockref, so)))
782                                 nfsrc_freecache(rp);
783                 }
784         }
785         NFSUNLOCKCACHE();
786 }
787
788 /*
789  * Add a seqid# reference to the cache entry.
790  */
791 APPLESTATIC void
792 nfsrvd_refcache(struct nfsrvcache *rp)
793 {
794
795         NFSLOCKCACHE();
796         if (rp->rc_refcnt < 0)
797                 panic("nfs cache refcnt");
798         rp->rc_refcnt++;
799         NFSUNLOCKCACHE();
800 }
801
802 /*
803  * Dereference a seqid# cache entry.
804  */
805 APPLESTATIC void
806 nfsrvd_derefcache(struct nfsrvcache *rp)
807 {
808
809         NFSLOCKCACHE();
810         if (rp->rc_refcnt <= 0)
811                 panic("nfs cache derefcnt");
812         rp->rc_refcnt--;
813         if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
814                 nfsrc_freecache(rp);
815         NFSUNLOCKCACHE();
816 }
817
818 /*
819  * Check to see if the socket is active.
820  * Return 1 if the reply has been received/acknowledged by the client,
821  * 0 otherwise.
822  * XXX - Uses tcp internals.
823  */
824 static int
825 nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t cur_sockref,
826     struct socket *cur_so)
827 {
828         int ret = 0;
829
830         if (!(rp->rc_flag & RC_TCPSEQ))
831                 return (ret);
832         /*
833          * If the sockref is the same, it is the same TCP connection.
834          */
835         if (cur_sockref == rp->rc_sockref)
836                 ret = nfsrv_checksockseqnum(cur_so, rp->rc_tcpseq);
837         return (ret);
838 }
839
840 /*
841  * Calculate the length of the mbuf list and a checksum on the first up to
842  * NFSRVCACHE_CHECKLEN bytes.
843  */
844 static int
845 nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
846 {
847         int len = 0, cklen;
848         mbuf_t m;
849
850         m = m1;
851         while (m) {
852                 len += mbuf_len(m);
853                 m = mbuf_next(m);
854         }
855         cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
856         *cksum = in_cksum(m1, cklen);
857         return (len);
858 }
859
/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 * This is currently an empty stub: sockref is accepted and ignored.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{

	(void)sockref;
}
868