CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/tcp_timer.c
Cleanup the handling of control chunks. While there fix some minor
[FreeBSD/FreeBSD.git] / sys / netinet / tcp_timer.c
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *      @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
30  */
31
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34
35 #include "opt_inet.h"
36 #include "opt_inet6.h"
37 #include "opt_tcpdebug.h"
38 #include "opt_rss.h"
39
40 #include <sys/param.h>
41 #include <sys/kernel.h>
42 #include <sys/lock.h>
43 #include <sys/mbuf.h>
44 #include <sys/mutex.h>
45 #include <sys/protosw.h>
46 #include <sys/smp.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/sysctl.h>
50 #include <sys/systm.h>
51
52 #include <net/if.h>
53 #include <net/route.h>
54 #include <net/rss_config.h>
55 #include <net/vnet.h>
56 #include <net/netisr.h>
57
58 #include <netinet/in.h>
59 #include <netinet/in_kdtrace.h>
60 #include <netinet/in_pcb.h>
61 #include <netinet/in_rss.h>
62 #include <netinet/in_systm.h>
63 #ifdef INET6
64 #include <netinet6/in6_pcb.h>
65 #endif
66 #include <netinet/ip_var.h>
67 #include <netinet/tcp.h>
68 #include <netinet/tcp_fsm.h>
69 #include <netinet/tcp_timer.h>
70 #include <netinet/tcp_var.h>
71 #include <netinet/cc/cc.h>
72 #ifdef INET6
73 #include <netinet6/tcp6_var.h>
74 #endif
75 #include <netinet/tcpip.h>
76 #ifdef TCPDEBUG
77 #include <netinet/tcp_debug.h>
78 #endif
79
80 int    tcp_persmin;
81 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
82     &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");
83
84 int    tcp_persmax;
85 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
86     &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");
87
88 int     tcp_keepinit;
89 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
90     &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");
91
92 int     tcp_keepidle;
93 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
94     &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");
95
96 int     tcp_keepintvl;
97 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
98     &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");
99
100 int     tcp_delacktime;
101 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
102     &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
103     "Time before a delayed ACK is sent");
104
105 int     tcp_msl;
106 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
107     &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
108
109 int     tcp_rexmit_min;
110 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
111     &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
112     "Minimum Retransmission Timeout");
113
114 int     tcp_rexmit_slop;
115 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
116     &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
117     "Retransmission Timer Slop");
118
119 static int      always_keepalive = 1;
120 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
121     &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
122
123 int    tcp_fast_finwait2_recycle = 0;
124 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 
125     &tcp_fast_finwait2_recycle, 0,
126     "Recycle closed FIN_WAIT_2 connections faster");
127
128 int    tcp_finwait2_timeout;
129 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
130     &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");
131
132 int     tcp_keepcnt = TCPTV_KEEPCNT;
133 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
134     "Number of keepalive probes to send");
135
136         /* max idle probes */
137 int     tcp_maxpersistidle;
138
139 static int      tcp_rexmit_drop_options = 0;
140 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
141     &tcp_rexmit_drop_options, 0,
142     "Drop TCP options from 3rd and later retransmitted SYN");
143
144 VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
145 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
146     CTLFLAG_RW|CTLFLAG_VNET,
147     &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
148     "Path MTU Discovery Black Hole Detection Enabled");
149
150 #ifdef INET
151 VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
152 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
153     CTLFLAG_RW|CTLFLAG_VNET,
154     &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
155     "Path MTU Discovery Black Hole Detection lowered MSS");
156 #endif
157
158 #ifdef INET6
159 VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
160 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
161     CTLFLAG_RW|CTLFLAG_VNET,
162     &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
163     "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
164 #endif
165
166 #ifdef  RSS
167 static int      per_cpu_timers = 1;
168 #else
169 static int      per_cpu_timers = 0;
170 #endif
171 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
172     &per_cpu_timers , 0, "run tcp timers on all cpus");
173
174 #if 0
175 #define INP_CPU(inp)    (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
176                 ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
177 #endif
178
179 /*
180  * Map the given inp to a CPU id.
181  *
182  * This queries RSS if it's compiled in, else it defaults to the current
183  * CPU ID.
184  */
185 static inline int
186 inp_to_cpuid(struct inpcb *inp)
187 {
188         u_int cpuid;
189
190 #ifdef  RSS
191         if (per_cpu_timers) {
192                 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
193                 if (cpuid == NETISR_CPUID_NONE)
194                         return (curcpu);        /* XXX */
195                 else
196                         return (cpuid);
197         }
198 #else
199         /* Legacy, pre-RSS behaviour */
200         if (per_cpu_timers) {
201                 /*
202                  * We don't have a flowid -> cpuid mapping, so cheat and
203                  * just map unknown cpuids to curcpu.  Not the best, but
204                  * apparently better than defaulting to swi 0.
205                  */
206                 cpuid = inp->inp_flowid % (mp_maxid + 1);
207                 if (! CPU_ABSENT(cpuid))
208                         return (cpuid);
209                 return (curcpu);
210         }
211 #endif
212         /* Default for RSS and non-RSS - cpuid 0 */
213         else {
214                 return (0);
215         }
216 }
217
218 /*
219  * Tcp protocol timeout routine called every 500 ms.
220  * Updates timestamps used for TCP
221  * causes finite state machine actions if timers expire.
222  */
223 void
224 tcp_slowtimo(void)
225 {
226         VNET_ITERATOR_DECL(vnet_iter);
227
228         VNET_LIST_RLOCK_NOSLEEP();
229         VNET_FOREACH(vnet_iter) {
230                 CURVNET_SET(vnet_iter);
231                 (void) tcp_tw_2msl_scan(0);
232                 CURVNET_RESTORE();
233         }
234         VNET_LIST_RUNLOCK_NOSLEEP();
235 }
236
237 int     tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
238     { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
239
240 int     tcp_backoff[TCP_MAXRXTSHIFT + 1] =
241     { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
242
243 static int tcp_totbackoff = 2559;       /* sum of tcp_backoff[] */
244
245 /*
246  * TCP timer processing.
247  */
248
249 void
250 tcp_timer_delack(void *xtp)
251 {
252         struct tcpcb *tp = xtp;
253         struct inpcb *inp;
254         CURVNET_SET(tp->t_vnet);
255
256         inp = tp->t_inpcb;
257         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
258         INP_WLOCK(inp);
259         if (callout_pending(&tp->t_timers->tt_delack) ||
260             !callout_active(&tp->t_timers->tt_delack)) {
261                 INP_WUNLOCK(inp);
262                 CURVNET_RESTORE();
263                 return;
264         }
265         callout_deactivate(&tp->t_timers->tt_delack);
266         if ((inp->inp_flags & INP_DROPPED) != 0) {
267                 INP_WUNLOCK(inp);
268                 CURVNET_RESTORE();
269                 return;
270         }
271         tp->t_flags |= TF_ACKNOW;
272         TCPSTAT_INC(tcps_delack);
273         (void) tp->t_fb->tfb_tcp_output(tp);
274         INP_WUNLOCK(inp);
275         CURVNET_RESTORE();
276 }
277
278 /*
279  * When a timer wants to remove a TCB it must
280  * hold the INP_INFO_RLOCK(). The timer function
281  * should only have grabbed the INP_WLOCK() when
282  * it entered. To safely switch to holding both the
283  * INP_INFO_RLOCK() and the INP_WLOCK() we must first
284  * grab a reference on the inp, which will hold the inp
285  * so that it can't be removed. We then unlock the INP_WLOCK(), 
286  * and grab the INP_INFO_RLOCK() lock. Once we have the INP_INFO_RLOCK()
287  * we proceed again to get the INP_WLOCK() (this preserves proper
288  * lock order). After acquiring the INP_WLOCK we must check if someone 
289  * else deleted the pcb i.e. the inp_flags check.
290  * If so we return 1 otherwise we return 0.
291  *
292  * No matter what the tcp_inpinfo_lock_add() function
293  * returns the caller must afterwards call tcp_inpinfo_lock_del()
294  * to drop the locks and reference properly.
295  */
296
297 int
298 tcp_inpinfo_lock_add(struct inpcb *inp)
299 {
300         in_pcbref(inp);
301         INP_WUNLOCK(inp);
302         INP_INFO_RLOCK(&V_tcbinfo);
303         INP_WLOCK(inp);
304         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
305                 return(1);
306         }
307         return(0);
308
309 }
310
311 void
312 tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp)
313 {
314         INP_INFO_RUNLOCK(&V_tcbinfo);
315         if (inp && (tp == NULL)) {
316                 /*
317                  * If tcp_close/drop() gets called and tp
318                  * returns NULL, then the function dropped
319                  * the inp lock, we hold a reference keeping
320                  * this around, so we must re-aquire the 
321                  * INP_WLOCK() in order to proceed with
322                  * our dropping the inp reference.
323                  */
324                 INP_WLOCK(inp);
325         }
326         if (inp && in_pcbrele_wlocked(inp) == 0)
327                 INP_WUNLOCK(inp);
328 }
329
330 void
331 tcp_timer_2msl(void *xtp)
332 {
333         struct tcpcb *tp = xtp;
334         struct inpcb *inp;
335         CURVNET_SET(tp->t_vnet);
336 #ifdef TCPDEBUG
337         int ostate;
338
339         ostate = tp->t_state;
340 #endif
341         inp = tp->t_inpcb;
342         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
343         INP_WLOCK(inp);
344         tcp_free_sackholes(tp);
345         if (callout_pending(&tp->t_timers->tt_2msl) ||
346             !callout_active(&tp->t_timers->tt_2msl)) {
347                 INP_WUNLOCK(tp->t_inpcb);
348                 CURVNET_RESTORE();
349                 return;
350         }
351         callout_deactivate(&tp->t_timers->tt_2msl);
352         if ((inp->inp_flags & INP_DROPPED) != 0) {
353                 INP_WUNLOCK(inp);
354                 CURVNET_RESTORE();
355                 return;
356         }
357         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
358                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
359         /*
360          * 2 MSL timeout in shutdown went off.  If we're closed but
361          * still waiting for peer to close and connection has been idle
362          * too long delete connection control block.  Otherwise, check
363          * again in a bit.
364          *
365          * If in TIME_WAIT state just ignore as this timeout is handled in
366          * tcp_tw_2msl_scan().
367          *
368          * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 
369          * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 
370          * Ignore fact that there were recent incoming segments.
371          */
372         if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
373                 INP_WUNLOCK(inp);
374                 CURVNET_RESTORE();
375                 return;
376         }
377         if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
378             tp->t_inpcb && tp->t_inpcb->inp_socket && 
379             (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
380                 TCPSTAT_INC(tcps_finwait2_drops);
381                 if (tcp_inpinfo_lock_add(inp)) {
382                         tcp_inpinfo_lock_del(inp, tp);
383                         goto out;
384                 }
385                 tp = tcp_close(tp);             
386                 tcp_inpinfo_lock_del(inp, tp);
387                 goto out;
388         } else {
389                 if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
390                         callout_reset(&tp->t_timers->tt_2msl,
391                                       TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
392                 } else {
393                         if (tcp_inpinfo_lock_add(inp)) {
394                                 tcp_inpinfo_lock_del(inp, tp);
395                                 goto out;
396                         }
397                         tp = tcp_close(tp);
398                         tcp_inpinfo_lock_del(inp, tp);
399                         goto out;
400                 }
401        }
402
403 #ifdef TCPDEBUG
404         if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
405                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
406                           PRU_SLOWTIMO);
407 #endif
408         TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
409
410         if (tp != NULL)
411                 INP_WUNLOCK(inp);
412 out:
413         CURVNET_RESTORE();
414 }
415
416 void
417 tcp_timer_keep(void *xtp)
418 {
419         struct tcpcb *tp = xtp;
420         struct tcptemp *t_template;
421         struct inpcb *inp;
422         CURVNET_SET(tp->t_vnet);
423 #ifdef TCPDEBUG
424         int ostate;
425
426         ostate = tp->t_state;
427 #endif
428         inp = tp->t_inpcb;
429         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
430         INP_WLOCK(inp);
431         if (callout_pending(&tp->t_timers->tt_keep) ||
432             !callout_active(&tp->t_timers->tt_keep)) {
433                 INP_WUNLOCK(inp);
434                 CURVNET_RESTORE();
435                 return;
436         }
437         callout_deactivate(&tp->t_timers->tt_keep);
438         if ((inp->inp_flags & INP_DROPPED) != 0) {
439                 INP_WUNLOCK(inp);
440                 CURVNET_RESTORE();
441                 return;
442         }
443         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
444                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
445
446         /*
447          * Because we don't regularly reset the keepalive callout in
448          * the ESTABLISHED state, it may be that we don't actually need
449          * to send a keepalive yet. If that occurs, schedule another
450          * call for the next time the keepalive timer might expire.
451          */
452         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
453                 u_int idletime;
454
455                 idletime = ticks - tp->t_rcvtime;
456                 if (idletime < TP_KEEPIDLE(tp)) {
457                         callout_reset(&tp->t_timers->tt_keep,
458                             TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp);
459                         INP_WUNLOCK(inp);
460                         CURVNET_RESTORE();
461                         return;
462                 }
463         }
464
465         /*
466          * Keep-alive timer went off; send something
467          * or drop connection if idle for too long.
468          */
469         TCPSTAT_INC(tcps_keeptimeo);
470         if (tp->t_state < TCPS_ESTABLISHED)
471                 goto dropit;
472         if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
473             tp->t_state <= TCPS_CLOSING) {
474                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
475                         goto dropit;
476                 /*
477                  * Send a packet designed to force a response
478                  * if the peer is up and reachable:
479                  * either an ACK if the connection is still alive,
480                  * or an RST if the peer has closed the connection
481                  * due to timeout or reboot.
482                  * Using sequence number tp->snd_una-1
483                  * causes the transmitted zero-length segment
484                  * to lie outside the receive window;
485                  * by the protocol spec, this requires the
486                  * correspondent TCP to respond.
487                  */
488                 TCPSTAT_INC(tcps_keepprobe);
489                 t_template = tcpip_maketemplate(inp);
490                 if (t_template) {
491                         tcp_respond(tp, t_template->tt_ipgen,
492                                     &t_template->tt_t, (struct mbuf *)NULL,
493                                     tp->rcv_nxt, tp->snd_una - 1, 0);
494                         free(t_template, M_TEMP);
495                 }
496                 callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
497                               tcp_timer_keep, tp);
498         } else
499                 callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
500                               tcp_timer_keep, tp);
501
502 #ifdef TCPDEBUG
503         if (inp->inp_socket->so_options & SO_DEBUG)
504                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
505                           PRU_SLOWTIMO);
506 #endif
507         TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
508         INP_WUNLOCK(inp);
509         CURVNET_RESTORE();
510         return;
511
512 dropit:
513         TCPSTAT_INC(tcps_keepdrops);
514
515         if (tcp_inpinfo_lock_add(inp)) {
516                 tcp_inpinfo_lock_del(inp, tp);
517                 goto out;
518         }
519         tp = tcp_drop(tp, ETIMEDOUT);
520
521 #ifdef TCPDEBUG
522         if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
523                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
524                           PRU_SLOWTIMO);
525 #endif
526         TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
527         tcp_inpinfo_lock_del(inp, tp);
528 out:
529         CURVNET_RESTORE();
530 }
531
532 void
533 tcp_timer_persist(void *xtp)
534 {
535         struct tcpcb *tp = xtp;
536         struct inpcb *inp;
537         CURVNET_SET(tp->t_vnet);
538 #ifdef TCPDEBUG
539         int ostate;
540
541         ostate = tp->t_state;
542 #endif
543         inp = tp->t_inpcb;
544         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
545         INP_WLOCK(inp);
546         if (callout_pending(&tp->t_timers->tt_persist) ||
547             !callout_active(&tp->t_timers->tt_persist)) {
548                 INP_WUNLOCK(inp);
549                 CURVNET_RESTORE();
550                 return;
551         }
552         callout_deactivate(&tp->t_timers->tt_persist);
553         if ((inp->inp_flags & INP_DROPPED) != 0) {
554                 INP_WUNLOCK(inp);
555                 CURVNET_RESTORE();
556                 return;
557         }
558         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
559                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
560         /*
561          * Persistence timer into zero window.
562          * Force a byte to be output, if possible.
563          */
564         TCPSTAT_INC(tcps_persisttimeo);
565         /*
566          * Hack: if the peer is dead/unreachable, we do not
567          * time out if the window is closed.  After a full
568          * backoff, drop the connection if the idle time
569          * (no responses to probes) reaches the maximum
570          * backoff that we would use if retransmitting.
571          */
572         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
573             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
574              ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
575                 TCPSTAT_INC(tcps_persistdrop);
576                 if (tcp_inpinfo_lock_add(inp)) {
577                         tcp_inpinfo_lock_del(inp, tp);
578                         goto out;
579                 }
580                 tp = tcp_drop(tp, ETIMEDOUT);
581                 tcp_inpinfo_lock_del(inp, tp);
582                 goto out;
583         }
584         /*
585          * If the user has closed the socket then drop a persisting
586          * connection after a much reduced timeout.
587          */
588         if (tp->t_state > TCPS_CLOSE_WAIT &&
589             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
590                 TCPSTAT_INC(tcps_persistdrop);
591                 if (tcp_inpinfo_lock_add(inp)) {
592                         tcp_inpinfo_lock_del(inp, tp);
593                         goto out;
594                 }
595                 tp = tcp_drop(tp, ETIMEDOUT);
596                 tcp_inpinfo_lock_del(inp, tp);
597                 goto out;
598         }
599         tcp_setpersist(tp);
600         tp->t_flags |= TF_FORCEDATA;
601         (void) tp->t_fb->tfb_tcp_output(tp);
602         tp->t_flags &= ~TF_FORCEDATA;
603
604 #ifdef TCPDEBUG
605         if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
606                 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
607 #endif
608         TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
609         INP_WUNLOCK(inp);
610 out:
611         CURVNET_RESTORE();
612 }
613
614 void
615 tcp_timer_rexmt(void * xtp)
616 {
617         struct tcpcb *tp = xtp;
618         CURVNET_SET(tp->t_vnet);
619         int rexmt;
620         struct inpcb *inp;
621 #ifdef TCPDEBUG
622         int ostate;
623
624         ostate = tp->t_state;
625 #endif
626         inp = tp->t_inpcb;
627         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
628         INP_WLOCK(inp);
629         if (callout_pending(&tp->t_timers->tt_rexmt) ||
630             !callout_active(&tp->t_timers->tt_rexmt)) {
631                 INP_WUNLOCK(inp);
632                 CURVNET_RESTORE();
633                 return;
634         }
635         callout_deactivate(&tp->t_timers->tt_rexmt);
636         if ((inp->inp_flags & INP_DROPPED) != 0) {
637                 INP_WUNLOCK(inp);
638                 CURVNET_RESTORE();
639                 return;
640         }
641         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
642                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
643         tcp_free_sackholes(tp);
644         if (tp->t_fb->tfb_tcp_rexmit_tmr) {
645                 /* The stack has a timer action too. */
646                 (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
647         }
648         /*
649          * Retransmission timer went off.  Message has not
650          * been acked within retransmit interval.  Back off
651          * to a longer retransmit interval and retransmit one segment.
652          */
653         if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
654                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
655                 TCPSTAT_INC(tcps_timeoutdrop);
656                 if (tcp_inpinfo_lock_add(inp)) {
657                         tcp_inpinfo_lock_del(inp, tp);
658                         goto out;
659                 }
660                 tp = tcp_drop(tp, tp->t_softerror ?
661                               tp->t_softerror : ETIMEDOUT);
662                 tcp_inpinfo_lock_del(inp, tp);
663                 goto out;
664         }
665         if (tp->t_state == TCPS_SYN_SENT) {
666                 /*
667                  * If the SYN was retransmitted, indicate CWND to be
668                  * limited to 1 segment in cc_conn_init().
669                  */
670                 tp->snd_cwnd = 1;
671         } else if (tp->t_rxtshift == 1) {
672                 /*
673                  * first retransmit; record ssthresh and cwnd so they can
674                  * be recovered if this turns out to be a "bad" retransmit.
675                  * A retransmit is considered "bad" if an ACK for this
676                  * segment is received within RTT/2 interval; the assumption
677                  * here is that the ACK was already in flight.  See
678                  * "On Estimating End-to-End Network Path Properties" by
679                  * Allman and Paxson for more details.
680                  */
681                 tp->snd_cwnd_prev = tp->snd_cwnd;
682                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
683                 tp->snd_recover_prev = tp->snd_recover;
684                 if (IN_FASTRECOVERY(tp->t_flags))
685                         tp->t_flags |= TF_WASFRECOVERY;
686                 else
687                         tp->t_flags &= ~TF_WASFRECOVERY;
688                 if (IN_CONGRECOVERY(tp->t_flags))
689                         tp->t_flags |= TF_WASCRECOVERY;
690                 else
691                         tp->t_flags &= ~TF_WASCRECOVERY;
692                 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
693                 tp->t_flags |= TF_PREVVALID;
694         } else
695                 tp->t_flags &= ~TF_PREVVALID;
696         TCPSTAT_INC(tcps_rexmttimeo);
697         if ((tp->t_state == TCPS_SYN_SENT) ||
698             (tp->t_state == TCPS_SYN_RECEIVED))
699                 rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
700         else
701                 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
702         TCPT_RANGESET(tp->t_rxtcur, rexmt,
703                       tp->t_rttmin, TCPTV_REXMTMAX);
704
705         /*
706          * We enter the path for PLMTUD if connection is established or, if
707          * connection is FIN_WAIT_1 status, reason for the last is that if
708          * amount of data we send is very small, we could send it in couple of
709          * packets and process straight to FIN. In that case we won't catch
710          * ESTABLISHED state.
711          */
712         if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
713             || (tp->t_state == TCPS_FIN_WAIT_1))) {
714 #ifdef INET6
715                 int isipv6;
716 #endif
717
718                 /*
719                  * Idea here is that at each stage of mtu probe (usually, 1448
720                  * -> 1188 -> 524) should be given 2 chances to recover before
721                  *  further clamping down. 'tp->t_rxtshift % 2 == 0' should
722                  *  take care of that.
723                  */
724                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
725                     (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
726                     (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
727                     tp->t_rxtshift % 2 == 0)) {
728                         /*
729                          * Enter Path MTU Black-hole Detection mechanism:
730                          * - Disable Path MTU Discovery (IP "DF" bit).
731                          * - Reduce MTU to lower value than what we
732                          *   negotiated with peer.
733                          */
734                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
735                                 /* Record that we may have found a black hole. */
736                                 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
737                                 /* Keep track of previous MSS. */
738                                 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
739                         }
740
741                         /* 
742                          * Reduce the MSS to blackhole value or to the default
743                          * in an attempt to retransmit.
744                          */
745 #ifdef INET6
746                         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
747                         if (isipv6 &&
748                             tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
749                                 /* Use the sysctl tuneable blackhole MSS. */
750                                 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
751                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated);
752                         } else if (isipv6) {
753                                 /* Use the default MSS. */
754                                 tp->t_maxseg = V_tcp_v6mssdflt;
755                                 /*
756                                  * Disable Path MTU Discovery when we switch to
757                                  * minmss.
758                                  */
759                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
760                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
761                         }
762 #endif
763 #if defined(INET6) && defined(INET)
764                         else
765 #endif
766 #ifdef INET
767                         if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
768                                 /* Use the sysctl tuneable blackhole MSS. */
769                                 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
770                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated);
771                         } else {
772                                 /* Use the default MSS. */
773                                 tp->t_maxseg = V_tcp_mssdflt;
774                                 /*
775                                  * Disable Path MTU Discovery when we switch to
776                                  * minmss.
777                                  */
778                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
779                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
780                         }
781 #endif
782                         /*
783                          * Reset the slow-start flight size
784                          * as it may depend on the new MSS.
785                          */
786                         if (CC_ALGO(tp)->conn_init != NULL)
787                                 CC_ALGO(tp)->conn_init(tp->ccv);
788                 } else {
789                         /*
790                          * If further retransmissions are still unsuccessful
791                          * with a lowered MTU, maybe this isn't a blackhole and
792                          * we restore the previous MSS and blackhole detection
793                          * flags.
794                          * The limit '6' is determined by giving each probe
795                          * stage (1448, 1188, 524) 2 chances to recover.
796                          */
797                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
798                             (tp->t_rxtshift >= 6)) {
799                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
800                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
801                                 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
802                                 TCPSTAT_INC(tcps_pmtud_blackhole_failed);
803                                 /*
804                                  * Reset the slow-start flight size as it
805                                  * may depend on the new MSS.
806                                  */
807                                 if (CC_ALGO(tp)->conn_init != NULL)
808                                         CC_ALGO(tp)->conn_init(tp->ccv);
809                         }
810                 }
811         }
812
813         /*
814          * Disable RFC1323 and SACK if we haven't got any response to
815          * our third SYN to work-around some broken terminal servers
816          * (most of which have hopefully been retired) that have bad VJ
817          * header compression code which trashes TCP segments containing
818          * unknown-to-them TCP options.
819          */
820         if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
821             (tp->t_rxtshift == 3))
822                 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
823         /*
824          * If we backed off this far, notify the L3 protocol that we're having
825          * connection problems.
826          */
827         if (tp->t_rxtshift > TCP_RTT_INVALIDATE) {
828 #ifdef INET6
829                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
830                         in6_losing(tp->t_inpcb);
831                 else
832 #endif
833                         in_losing(tp->t_inpcb);
834         }
835         tp->snd_nxt = tp->snd_una;
836         tp->snd_recover = tp->snd_max;
837         /*
838          * Force a segment to be sent.
839          */
840         tp->t_flags |= TF_ACKNOW;
841         /*
842          * If timing a segment in this window, stop the timer.
843          */
844         tp->t_rtttime = 0;
845
846         cc_cong_signal(tp, NULL, CC_RTO);
847
848         (void) tp->t_fb->tfb_tcp_output(tp);
849
850 #ifdef TCPDEBUG
851         if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
852                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
853                           PRU_SLOWTIMO);
854 #endif
855         TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
856         INP_WUNLOCK(inp);
857 out:
858         CURVNET_RESTORE();
859 }
860
861 void
862 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
863 {
864         struct callout *t_callout;
865         timeout_t *f_callout;
866         struct inpcb *inp = tp->t_inpcb;
867         int cpu = inp_to_cpuid(inp);
868
869 #ifdef TCP_OFFLOAD
870         if (tp->t_flags & TF_TOE)
871                 return;
872 #endif
873
874         if (tp->t_timers->tt_flags & TT_STOPPED)
875                 return;
876
877         switch (timer_type) {
878                 case TT_DELACK:
879                         t_callout = &tp->t_timers->tt_delack;
880                         f_callout = tcp_timer_delack;
881                         break;
882                 case TT_REXMT:
883                         t_callout = &tp->t_timers->tt_rexmt;
884                         f_callout = tcp_timer_rexmt;
885                         break;
886                 case TT_PERSIST:
887                         t_callout = &tp->t_timers->tt_persist;
888                         f_callout = tcp_timer_persist;
889                         break;
890                 case TT_KEEP:
891                         t_callout = &tp->t_timers->tt_keep;
892                         f_callout = tcp_timer_keep;
893                         break;
894                 case TT_2MSL:
895                         t_callout = &tp->t_timers->tt_2msl;
896                         f_callout = tcp_timer_2msl;
897                         break;
898                 default:
899                         if (tp->t_fb->tfb_tcp_timer_activate) {
900                                 tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
901                                 return;
902                         }
903                         panic("tp %p bad timer_type %#x", tp, timer_type);
904                 }
905         if (delta == 0) {
906                 callout_stop(t_callout);
907         } else {
908                 callout_reset_on(t_callout, delta, f_callout, tp, cpu);
909         }
910 }
911
912 int
913 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
914 {
915         struct callout *t_callout;
916
917         switch (timer_type) {
918                 case TT_DELACK:
919                         t_callout = &tp->t_timers->tt_delack;
920                         break;
921                 case TT_REXMT:
922                         t_callout = &tp->t_timers->tt_rexmt;
923                         break;
924                 case TT_PERSIST:
925                         t_callout = &tp->t_timers->tt_persist;
926                         break;
927                 case TT_KEEP:
928                         t_callout = &tp->t_timers->tt_keep;
929                         break;
930                 case TT_2MSL:
931                         t_callout = &tp->t_timers->tt_2msl;
932                         break;
933                 default:
934                         if (tp->t_fb->tfb_tcp_timer_active) {
935                                 return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
936                         }
937                         panic("tp %p bad timer_type %#x", tp, timer_type);
938                 }
939         return callout_active(t_callout);
940 }
941
942 void
943 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
944 {
945         struct callout *t_callout;
946
947         tp->t_timers->tt_flags |= TT_STOPPED;
948         switch (timer_type) {
949                 case TT_DELACK:
950                         t_callout = &tp->t_timers->tt_delack;
951                         break;
952                 case TT_REXMT:
953                         t_callout = &tp->t_timers->tt_rexmt;
954                         break;
955                 case TT_PERSIST:
956                         t_callout = &tp->t_timers->tt_persist;
957                         break;
958                 case TT_KEEP:
959                         t_callout = &tp->t_timers->tt_keep;
960                         break;
961                 case TT_2MSL:
962                         t_callout = &tp->t_timers->tt_2msl;
963                         break;
964                 default:
965                         if (tp->t_fb->tfb_tcp_timer_stop) {
966                                 /* 
967                                  * XXXrrs we need to look at this with the
968                                  * stop case below (flags).
969                                  */
970                                 tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
971                                 return;
972                         }
973                         panic("tp %p bad timer_type %#x", tp, timer_type);
974                 }
975
976         if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
977                 /*
978                  * Can't stop the callout, defer tcpcb actual deletion
979                  * to the last one. We do this using the async drain
980                  * function and incrementing the count in 
981                  */
982                 tp->t_timers->tt_draincnt++;
983         }
984 }