]> CyberLeo.Net >> Repos - FreeBSD/stable/10.git/blob - sys/netinet/tcp_timer.c
MFC r368207,368607:
[FreeBSD/stable/10.git] / sys / netinet / tcp_timer.c
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *      @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
30  */
31
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34
35 #include "opt_inet.h"
36 #include "opt_inet6.h"
37 #include "opt_tcpdebug.h"
38
39 #include <sys/param.h>
40 #include <sys/kernel.h>
41 #include <sys/lock.h>
42 #include <sys/mbuf.h>
43 #include <sys/mutex.h>
44 #include <sys/protosw.h>
45 #include <sys/smp.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sysctl.h>
49 #include <sys/systm.h>
50
51 #include <net/if.h>
52 #include <net/route.h>
53 #include <net/vnet.h>
54
55 #include <netinet/cc.h>
56 #include <netinet/in.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/in_systm.h>
59 #ifdef INET6
60 #include <netinet6/in6_pcb.h>
61 #endif
62 #include <netinet/ip_var.h>
63 #include <netinet/tcp_fsm.h>
64 #include <netinet/tcp_timer.h>
65 #include <netinet/tcp_var.h>
66 #ifdef INET6
67 #include <netinet6/tcp6_var.h>
68 #endif
69 #include <netinet/tcpip.h>
70 #ifdef TCPDEBUG
71 #include <netinet/tcp_debug.h>
72 #endif
73
74 int    tcp_persmin;
75 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
76     &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");
77
78 int    tcp_persmax;
79 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
80     &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");
81
82 int     tcp_keepinit;
83 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
84     &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");
85
86 int     tcp_keepidle;
87 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
88     &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");
89
90 int     tcp_keepintvl;
91 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
92     &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");
93
94 int     tcp_delacktime;
95 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
96     &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
97     "Time before a delayed ACK is sent");
98
99 int     tcp_msl;
100 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
101     &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
102
103 int     tcp_rexmit_min;
104 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
105     &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
106     "Minimum Retransmission Timeout");
107
108 int     tcp_rexmit_slop;
109 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
110     &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
111     "Retransmission Timer Slop");
112
113 int     tcp_always_keepalive = 1;
114 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
115     &tcp_always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
116 __strong_reference(tcp_always_keepalive, always_keepalive);
117
118 int    tcp_fast_finwait2_recycle = 0;
119 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 
120     &tcp_fast_finwait2_recycle, 0,
121     "Recycle closed FIN_WAIT_2 connections faster");
122
123 int    tcp_finwait2_timeout;
124 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
125     &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");
126
127 int     tcp_keepcnt = TCPTV_KEEPCNT;
128 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
129     "Number of keepalive probes to send");
130
131         /* max idle probes */
132 int     tcp_maxpersistidle;
133
134 static int      tcp_rexmit_drop_options = 0;
135 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
136     &tcp_rexmit_drop_options, 0,
137     "Drop TCP options from 3rd and later retransmitted SYN");
138
139 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
140 #define V_tcp_pmtud_blackhole_detect    VNET(tcp_pmtud_blackhole_detect)
141 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
142     CTLFLAG_RW,
143     &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
144     "Path MTU Discovery Black Hole Detection Enabled");
145
146 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
147 #define V_tcp_pmtud_blackhole_activated \
148     VNET(tcp_pmtud_blackhole_activated)
149 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
150     CTLFLAG_RD,
151     &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
152     "Path MTU Discovery Black Hole Detection, Activation Count");
153
154 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
155 #define V_tcp_pmtud_blackhole_activated_min_mss \
156     VNET(tcp_pmtud_blackhole_activated_min_mss)
157 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
158     CTLFLAG_RD,
159     &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
160     "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");
161
162 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
163 #define V_tcp_pmtud_blackhole_failed    VNET(tcp_pmtud_blackhole_failed)
164 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
165     CTLFLAG_RD,
166     &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
167     "Path MTU Discovery Black Hole Detection, Failure Count");
168
169 #ifdef INET
170 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
171 #define V_tcp_pmtud_blackhole_mss       VNET(tcp_pmtud_blackhole_mss)
172 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
173     CTLFLAG_RW,
174     &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
175     "Path MTU Discovery Black Hole Detection lowered MSS");
176 #endif
177
178 #ifdef INET6
179 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
180 #define V_tcp_v6pmtud_blackhole_mss     VNET(tcp_v6pmtud_blackhole_mss)
181 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
182     CTLFLAG_RW,
183     &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
184     "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
185 #endif
186
187 static int      per_cpu_timers = 0;
188 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
189     &per_cpu_timers , 0, "run tcp timers on all cpus");
190
191 #define INP_CPU(inp)    (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
192                 ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
193
194 /*
195  * Tcp protocol timeout routine called every 500 ms.
196  * Updates timestamps used for TCP
197  * causes finite state machine actions if timers expire.
198  */
199 void
200 tcp_slowtimo(void)
201 {
202         VNET_ITERATOR_DECL(vnet_iter);
203
204         VNET_LIST_RLOCK_NOSLEEP();
205         VNET_FOREACH(vnet_iter) {
206                 CURVNET_SET(vnet_iter);
207                 (void) tcp_tw_2msl_scan(0);
208                 CURVNET_RESTORE();
209         }
210         VNET_LIST_RUNLOCK_NOSLEEP();
211 }
212
213 int     tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
214     { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
215
216 int     tcp_backoff[TCP_MAXRXTSHIFT + 1] =
217     { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
218
219 static int tcp_totbackoff = 2559;       /* sum of tcp_backoff[] */
220
221 /*
222  * TCP timer processing.
223  */
224
225 void
226 tcp_timer_delack(void *xtp)
227 {
228         struct tcpcb *tp = xtp;
229         struct inpcb *inp;
230         CURVNET_SET(tp->t_vnet);
231
232         inp = tp->t_inpcb;
233         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
234         INP_WLOCK(inp);
235         if (callout_pending(&tp->t_timers->tt_delack) ||
236             !callout_active(&tp->t_timers->tt_delack)) {
237                 INP_WUNLOCK(inp);
238                 CURVNET_RESTORE();
239                 return;
240         }
241         callout_deactivate(&tp->t_timers->tt_delack);
242         if ((inp->inp_flags & INP_DROPPED) != 0) {
243                 INP_WUNLOCK(inp);
244                 CURVNET_RESTORE();
245                 return;
246         }
247         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
248                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
249         KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0,
250                 ("%s: tp %p delack callout should be running", __func__, tp));
251
252         tp->t_flags |= TF_ACKNOW;
253         TCPSTAT_INC(tcps_delack);
254         (void) tcp_output(tp);
255         INP_WUNLOCK(inp);
256         CURVNET_RESTORE();
257 }
258
259 void
260 tcp_timer_2msl(void *xtp)
261 {
262         struct tcpcb *tp = xtp;
263         struct inpcb *inp;
264         CURVNET_SET(tp->t_vnet);
265 #ifdef TCPDEBUG
266         int ostate;
267
268         ostate = tp->t_state;
269 #endif
270         INP_INFO_RLOCK(&V_tcbinfo);
271         inp = tp->t_inpcb;
272         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
273         INP_WLOCK(inp);
274         tcp_free_sackholes(tp);
275         if (callout_pending(&tp->t_timers->tt_2msl) ||
276             !callout_active(&tp->t_timers->tt_2msl)) {
277                 INP_WUNLOCK(tp->t_inpcb);
278                 INP_INFO_RUNLOCK(&V_tcbinfo);
279                 CURVNET_RESTORE();
280                 return;
281         }
282         callout_deactivate(&tp->t_timers->tt_2msl);
283         if ((inp->inp_flags & INP_DROPPED) != 0) {
284                 INP_WUNLOCK(inp);
285                 INP_INFO_RUNLOCK(&V_tcbinfo);
286                 CURVNET_RESTORE();
287                 return;
288         }
289         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
290                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
291         KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0,
292                 ("%s: tp %p 2msl callout should be running", __func__, tp));
293         /*
294          * 2 MSL timeout in shutdown went off.  If we're closed but
295          * still waiting for peer to close and connection has been idle
296          * too long delete connection control block.  Otherwise, check
297          * again in a bit.
298          *
299          * If in TIME_WAIT state just ignore as this timeout is handled in
300          * tcp_tw_2msl_scan().
301          *
302          * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 
303          * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 
304          * Ignore fact that there were recent incoming segments.
305          */
306         if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
307                 INP_WUNLOCK(inp);
308                 INP_INFO_RUNLOCK(&V_tcbinfo);
309                 CURVNET_RESTORE();
310                 return;
311         }
312         if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
313             tp->t_inpcb && tp->t_inpcb->inp_socket && 
314             (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
315                 TCPSTAT_INC(tcps_finwait2_drops);
316                 tp = tcp_close(tp);             
317         } else {
318                 if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
319                         if (!callout_reset(&tp->t_timers->tt_2msl,
320                            TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) {
321                                 tp->t_timers->tt_flags &= ~TT_2MSL_RST;
322                         }
323                 } else
324                        tp = tcp_close(tp);
325        }
326
327 #ifdef TCPDEBUG
328         if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
329                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
330                           PRU_SLOWTIMO);
331 #endif
332         if (tp != NULL)
333                 INP_WUNLOCK(inp);
334         INP_INFO_RUNLOCK(&V_tcbinfo);
335         CURVNET_RESTORE();
336 }
337
338 void
339 tcp_timer_keep(void *xtp)
340 {
341         struct tcpcb *tp = xtp;
342         struct tcptemp *t_template;
343         struct inpcb *inp;
344         CURVNET_SET(tp->t_vnet);
345 #ifdef TCPDEBUG
346         int ostate;
347
348         ostate = tp->t_state;
349 #endif
350         INP_INFO_RLOCK(&V_tcbinfo);
351         inp = tp->t_inpcb;
352         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
353         INP_WLOCK(inp);
354         if (callout_pending(&tp->t_timers->tt_keep) ||
355             !callout_active(&tp->t_timers->tt_keep)) {
356                 INP_WUNLOCK(inp);
357                 INP_INFO_RUNLOCK(&V_tcbinfo);
358                 CURVNET_RESTORE();
359                 return;
360         }
361         callout_deactivate(&tp->t_timers->tt_keep);
362         if ((inp->inp_flags & INP_DROPPED) != 0) {
363                 INP_WUNLOCK(inp);
364                 INP_INFO_RUNLOCK(&V_tcbinfo);
365                 CURVNET_RESTORE();
366                 return;
367         }
368         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
369                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
370         KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0,
371                 ("%s: tp %p keep callout should be running", __func__, tp));
372         /*
373          * Keep-alive timer went off; send something
374          * or drop connection if idle for too long.
375          */
376         TCPSTAT_INC(tcps_keeptimeo);
377         if (tp->t_state < TCPS_ESTABLISHED)
378                 goto dropit;
379         if ((tcp_always_keepalive ||
380             inp->inp_socket->so_options & SO_KEEPALIVE) &&
381             tp->t_state <= TCPS_CLOSING) {
382                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
383                         goto dropit;
384                 /*
385                  * Send a packet designed to force a response
386                  * if the peer is up and reachable:
387                  * either an ACK if the connection is still alive,
388                  * or an RST if the peer has closed the connection
389                  * due to timeout or reboot.
390                  * Using sequence number tp->snd_una-1
391                  * causes the transmitted zero-length segment
392                  * to lie outside the receive window;
393                  * by the protocol spec, this requires the
394                  * correspondent TCP to respond.
395                  */
396                 TCPSTAT_INC(tcps_keepprobe);
397                 t_template = tcpip_maketemplate(inp);
398                 if (t_template) {
399                         tcp_respond(tp, t_template->tt_ipgen,
400                                     &t_template->tt_t, (struct mbuf *)NULL,
401                                     tp->rcv_nxt, tp->snd_una - 1, 0);
402                         free(t_template, M_TEMP);
403                 }
404                 if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
405                     tcp_timer_keep, tp)) {
406                         tp->t_timers->tt_flags &= ~TT_KEEP_RST;
407                 }
408         } else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
409                     tcp_timer_keep, tp)) {
410                         tp->t_timers->tt_flags &= ~TT_KEEP_RST;
411                 }
412
413 #ifdef TCPDEBUG
414         if (inp->inp_socket->so_options & SO_DEBUG)
415                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
416                           PRU_SLOWTIMO);
417 #endif
418         INP_WUNLOCK(inp);
419         INP_INFO_RUNLOCK(&V_tcbinfo);
420         CURVNET_RESTORE();
421         return;
422
423 dropit:
424         TCPSTAT_INC(tcps_keepdrops);
425         tp = tcp_drop(tp, ETIMEDOUT);
426
427 #ifdef TCPDEBUG
428         if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
429                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
430                           PRU_SLOWTIMO);
431 #endif
432         if (tp != NULL)
433                 INP_WUNLOCK(tp->t_inpcb);
434         INP_INFO_RUNLOCK(&V_tcbinfo);
435         CURVNET_RESTORE();
436 }
437
438 void
439 tcp_timer_persist(void *xtp)
440 {
441         struct tcpcb *tp = xtp;
442         struct inpcb *inp;
443         CURVNET_SET(tp->t_vnet);
444 #ifdef TCPDEBUG
445         int ostate;
446
447         ostate = tp->t_state;
448 #endif
449         INP_INFO_RLOCK(&V_tcbinfo);
450         inp = tp->t_inpcb;
451         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
452         INP_WLOCK(inp);
453         if (callout_pending(&tp->t_timers->tt_persist) ||
454             !callout_active(&tp->t_timers->tt_persist)) {
455                 INP_WUNLOCK(inp);
456                 INP_INFO_RUNLOCK(&V_tcbinfo);
457                 CURVNET_RESTORE();
458                 return;
459         }
460         callout_deactivate(&tp->t_timers->tt_persist);
461         if ((inp->inp_flags & INP_DROPPED) != 0) {
462                 INP_WUNLOCK(inp);
463                 INP_INFO_RUNLOCK(&V_tcbinfo);
464                 CURVNET_RESTORE();
465                 return;
466         }
467         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
468                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
469         KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0,
470                 ("%s: tp %p persist callout should be running", __func__, tp));
471         /*
472          * Persistance timer into zero window.
473          * Force a byte to be output, if possible.
474          */
475         TCPSTAT_INC(tcps_persisttimeo);
476         /*
477          * Hack: if the peer is dead/unreachable, we do not
478          * time out if the window is closed.  After a full
479          * backoff, drop the connection if the idle time
480          * (no responses to probes) reaches the maximum
481          * backoff that we would use if retransmitting.
482          */
483         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
484             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
485              ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
486                 TCPSTAT_INC(tcps_persistdrop);
487                 tp = tcp_drop(tp, ETIMEDOUT);
488                 goto out;
489         }
490         /*
491          * If the user has closed the socket then drop a persisting
492          * connection after a much reduced timeout.
493          */
494         if (tp->t_state > TCPS_CLOSE_WAIT &&
495             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
496                 TCPSTAT_INC(tcps_persistdrop);
497                 tp = tcp_drop(tp, ETIMEDOUT);
498                 goto out;
499         }
500         tcp_setpersist(tp);
501         tp->t_flags |= TF_FORCEDATA;
502         (void) tcp_output(tp);
503         tp->t_flags &= ~TF_FORCEDATA;
504
505 out:
506 #ifdef TCPDEBUG
507         if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
508                 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
509 #endif
510         if (tp != NULL)
511                 INP_WUNLOCK(inp);
512         INP_INFO_RUNLOCK(&V_tcbinfo);
513         CURVNET_RESTORE();
514 }
515
516 void
517 tcp_timer_rexmt(void * xtp)
518 {
519         struct tcpcb *tp = xtp;
520         CURVNET_SET(tp->t_vnet);
521         int rexmt;
522         int headlocked;
523         struct inpcb *inp;
524 #ifdef TCPDEBUG
525         int ostate;
526
527         ostate = tp->t_state;
528 #endif
529
530         INP_INFO_RLOCK(&V_tcbinfo);
531         inp = tp->t_inpcb;
532         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
533         INP_WLOCK(inp);
534         if (callout_pending(&tp->t_timers->tt_rexmt) ||
535             !callout_active(&tp->t_timers->tt_rexmt)) {
536                 INP_WUNLOCK(inp);
537                 INP_INFO_RUNLOCK(&V_tcbinfo);
538                 CURVNET_RESTORE();
539                 return;
540         }
541         callout_deactivate(&tp->t_timers->tt_rexmt);
542         if ((inp->inp_flags & INP_DROPPED) != 0) {
543                 INP_WUNLOCK(inp);
544                 INP_INFO_RUNLOCK(&V_tcbinfo);
545                 CURVNET_RESTORE();
546                 return;
547         }
548         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
549                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
550         KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0,
551                 ("%s: tp %p rexmt callout should be running", __func__, tp));
552         tcp_free_sackholes(tp);
553         /*
554          * Retransmission timer went off.  Message has not
555          * been acked within retransmit interval.  Back off
556          * to a longer retransmit interval and retransmit one segment.
557          */
558         if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
559                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
560                 TCPSTAT_INC(tcps_timeoutdrop);
561
562                 tp = tcp_drop(tp, tp->t_softerror ?
563                               tp->t_softerror : ETIMEDOUT);
564                 headlocked = 1;
565                 goto out;
566         }
567         INP_INFO_RUNLOCK(&V_tcbinfo);
568         headlocked = 0;
569         if (tp->t_state == TCPS_SYN_SENT) {
570                 /*
571                  * If the SYN was retransmitted, indicate CWND to be
572                  * limited to 1 segment in cc_conn_init().
573                  */
574                 tp->snd_cwnd = 1;
575         } else if (tp->t_rxtshift == 1) {
576                 /*
577                  * first retransmit; record ssthresh and cwnd so they can
578                  * be recovered if this turns out to be a "bad" retransmit.
579                  * A retransmit is considered "bad" if an ACK for this
580                  * segment is received within RTT/2 interval; the assumption
581                  * here is that the ACK was already in flight.  See
582                  * "On Estimating End-to-End Network Path Properties" by
583                  * Allman and Paxson for more details.
584                  */
585                 tp->snd_cwnd_prev = tp->snd_cwnd;
586                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
587                 tp->snd_recover_prev = tp->snd_recover;
588                 if (IN_FASTRECOVERY(tp->t_flags))
589                         tp->t_flags |= TF_WASFRECOVERY;
590                 else
591                         tp->t_flags &= ~TF_WASFRECOVERY;
592                 if (IN_CONGRECOVERY(tp->t_flags))
593                         tp->t_flags |= TF_WASCRECOVERY;
594                 else
595                         tp->t_flags &= ~TF_WASCRECOVERY;
596                 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
597                 tp->t_flags |= TF_PREVVALID;
598         } else
599                 tp->t_flags &= ~TF_PREVVALID;
600         TCPSTAT_INC(tcps_rexmttimeo);
601         if ((tp->t_state == TCPS_SYN_SENT) ||
602             (tp->t_state == TCPS_SYN_RECEIVED))
603                 rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
604         else
605                 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
606         TCPT_RANGESET(tp->t_rxtcur, rexmt,
607                       tp->t_rttmin, TCPTV_REXMTMAX);
608
609         /*
610          * We enter the path for PLMTUD if connection is established or, if
611          * connection is FIN_WAIT_1 status, reason for the last is that if
612          * amount of data we send is very small, we could send it in couple of
613          * packets and process straight to FIN. In that case we won't catch
614          * ESTABLISHED state.
615          */
616         if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
617             || (tp->t_state == TCPS_FIN_WAIT_1))) {
618                 int optlen;
619 #ifdef INET6
620                 int isipv6;
621 #endif
622
623                 /*
624                  * Idea here is that at each stage of mtu probe (usually, 1448
625                  * -> 1188 -> 524) should be given 2 chances to recover before
626                  *  further clamping down. 'tp->t_rxtshift % 2 == 0' should
627                  *  take care of that.
628                  */
629                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
630                     (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
631                     (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) {
632                         /*
633                          * Enter Path MTU Black-hole Detection mechanism:
634                          * - Disable Path MTU Discovery (IP "DF" bit).
635                          * - Reduce MTU to lower value than what we
636                          *   negotiated with peer.
637                          */
638                         /* Record that we may have found a black hole. */
639                         tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
640
641                         /* Keep track of previous MSS. */
642                         optlen = tp->t_maxopd - tp->t_maxseg;
643                         tp->t_pmtud_saved_maxopd = tp->t_maxopd;
644
645                         /* 
646                          * Reduce the MSS to blackhole value or to the default
647                          * in an attempt to retransmit.
648                          */
649 #ifdef INET6
650                         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
651                         if (isipv6 &&
652                             tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) {
653                                 /* Use the sysctl tuneable blackhole MSS. */
654                                 tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss;
655                                 V_tcp_pmtud_blackhole_activated++;
656                         } else if (isipv6) {
657                                 /* Use the default MSS. */
658                                 tp->t_maxopd = V_tcp_v6mssdflt;
659                                 /*
660                                  * Disable Path MTU Discovery when we switch to
661                                  * minmss.
662                                  */
663                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
664                                 V_tcp_pmtud_blackhole_activated_min_mss++;
665                         }
666 #endif
667 #if defined(INET6) && defined(INET)
668                         else
669 #endif
670 #ifdef INET
671                         if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) {
672                                 /* Use the sysctl tuneable blackhole MSS. */
673                                 tp->t_maxopd = V_tcp_pmtud_blackhole_mss;
674                                 V_tcp_pmtud_blackhole_activated++;
675                         } else {
676                                 /* Use the default MSS. */
677                                 tp->t_maxopd = V_tcp_mssdflt;
678                                 /*
679                                  * Disable Path MTU Discovery when we switch to
680                                  * minmss.
681                                  */
682                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
683                                 V_tcp_pmtud_blackhole_activated_min_mss++;
684                         }
685 #endif
686                         tp->t_maxseg = tp->t_maxopd - optlen;
687                         /*
688                          * Reset the slow-start flight size
689                          * as it may depend on the new MSS.
690                          */
691                         if (CC_ALGO(tp)->conn_init != NULL)
692                                 CC_ALGO(tp)->conn_init(tp->ccv);
693                 } else {
694                         /*
695                          * If further retransmissions are still unsuccessful
696                          * with a lowered MTU, maybe this isn't a blackhole and
697                          * we restore the previous MSS and blackhole detection
698                          * flags.
699                          * The limit '6' is determined by giving each probe
700                          * stage (1448, 1188, 524) 2 chances to recover.
701                          */
702                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
703                             (tp->t_rxtshift > 6)) {
704                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
705                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
706                                 optlen = tp->t_maxopd - tp->t_maxseg;
707                                 tp->t_maxopd = tp->t_pmtud_saved_maxopd;
708                                 tp->t_maxseg = tp->t_maxopd - optlen;
709                                 V_tcp_pmtud_blackhole_failed++;
710                                 /*
711                                  * Reset the slow-start flight size as it
712                                  * may depend on the new MSS.
713                                  */
714                                 if (CC_ALGO(tp)->conn_init != NULL)
715                                         CC_ALGO(tp)->conn_init(tp->ccv);
716                         }
717                 }
718         }
719
720         /*
721          * Disable RFC1323 and SACK if we haven't got any response to
722          * our third SYN to work-around some broken terminal servers
723          * (most of which have hopefully been retired) that have bad VJ
724          * header compression code which trashes TCP segments containing
725          * unknown-to-them TCP options.
726          */
727         if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
728             (tp->t_rxtshift == 3))
729                 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
730         /*
731          * If we backed off this far, our srtt estimate is probably bogus.
732          * Clobber it so we'll take the next rtt measurement as our srtt;
733          * move the current srtt into rttvar to keep the current
734          * retransmit times until then.
735          */
736         if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
737 #ifdef INET6
738                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
739                         in6_losing(tp->t_inpcb);
740 #endif
741                 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
742                 tp->t_srtt = 0;
743         }
744         tp->snd_nxt = tp->snd_una;
745         tp->snd_recover = tp->snd_max;
746         /*
747          * Force a segment to be sent.
748          */
749         tp->t_flags |= TF_ACKNOW;
750         /*
751          * If timing a segment in this window, stop the timer.
752          */
753         tp->t_rtttime = 0;
754
755         cc_cong_signal(tp, NULL, CC_RTO);
756
757         (void) tcp_output(tp);
758
759 out:
760 #ifdef TCPDEBUG
761         if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
762                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
763                           PRU_SLOWTIMO);
764 #endif
765         if (tp != NULL)
766                 INP_WUNLOCK(inp);
767         if (headlocked)
768                 INP_INFO_RUNLOCK(&V_tcbinfo);
769         CURVNET_RESTORE();
770 }
771
772 void
773 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
774 {
775         struct callout *t_callout;
776         timeout_t *f_callout;
777         struct inpcb *inp = tp->t_inpcb;
778         int cpu = INP_CPU(inp);
779         uint32_t f_reset;
780
781 #ifdef TCP_OFFLOAD
782         if (tp->t_flags & TF_TOE)
783                 return;
784 #endif
785
786         if (tp->t_timers->tt_flags & TT_STOPPED)
787                 return;
788
789         switch (timer_type) {
790                 case TT_DELACK:
791                         t_callout = &tp->t_timers->tt_delack;
792                         f_callout = tcp_timer_delack;
793                         f_reset = TT_DELACK_RST;
794                         break;
795                 case TT_REXMT:
796                         t_callout = &tp->t_timers->tt_rexmt;
797                         f_callout = tcp_timer_rexmt;
798                         f_reset = TT_REXMT_RST;
799                         break;
800                 case TT_PERSIST:
801                         t_callout = &tp->t_timers->tt_persist;
802                         f_callout = tcp_timer_persist;
803                         f_reset = TT_PERSIST_RST;
804                         break;
805                 case TT_KEEP:
806                         t_callout = &tp->t_timers->tt_keep;
807                         f_callout = tcp_timer_keep;
808                         f_reset = TT_KEEP_RST;
809                         break;
810                 case TT_2MSL:
811                         t_callout = &tp->t_timers->tt_2msl;
812                         f_callout = tcp_timer_2msl;
813                         f_reset = TT_2MSL_RST;
814                         break;
815                 default:
816                         panic("tp %p bad timer_type %#x", tp, timer_type);
817                 }
818         if (delta == 0) {
819                 if ((tp->t_timers->tt_flags & timer_type) &&
820                     callout_stop(t_callout) &&
821                     (tp->t_timers->tt_flags & f_reset)) {
822                         tp->t_timers->tt_flags &= ~(timer_type | f_reset);
823                 }
824         } else {
825                 if ((tp->t_timers->tt_flags & timer_type) == 0) {
826                         tp->t_timers->tt_flags |= (timer_type | f_reset);
827                         callout_reset_on(t_callout, delta, f_callout, tp, cpu);
828                 } else {
829                         /* Reset already running callout on the same CPU. */
830                         if (!callout_reset(t_callout, delta, f_callout, tp)) {
831                                 /*
832                                  * Callout not cancelled, consider it as not
833                                  * properly restarted. */
834                                 tp->t_timers->tt_flags &= ~f_reset;
835                         }
836                 }
837         }
838 }
839
840 int
841 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
842 {
843         struct callout *t_callout;
844
845         switch (timer_type) {
846                 case TT_DELACK:
847                         t_callout = &tp->t_timers->tt_delack;
848                         break;
849                 case TT_REXMT:
850                         t_callout = &tp->t_timers->tt_rexmt;
851                         break;
852                 case TT_PERSIST:
853                         t_callout = &tp->t_timers->tt_persist;
854                         break;
855                 case TT_KEEP:
856                         t_callout = &tp->t_timers->tt_keep;
857                         break;
858                 case TT_2MSL:
859                         t_callout = &tp->t_timers->tt_2msl;
860                         break;
861                 default:
862                         panic("tp %p bad timer_type %#x", tp, timer_type);
863                 }
864         return callout_active(t_callout);
865 }
866
867 void
868 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
869 {
870         struct callout *t_callout;
871         timeout_t *f_callout;
872         uint32_t f_reset;
873
874         tp->t_timers->tt_flags |= TT_STOPPED;
875
876         switch (timer_type) {
877                 case TT_DELACK:
878                         t_callout = &tp->t_timers->tt_delack;
879                         f_callout = tcp_timer_delack_discard;
880                         f_reset = TT_DELACK_RST;
881                         break;
882                 case TT_REXMT:
883                         t_callout = &tp->t_timers->tt_rexmt;
884                         f_callout = tcp_timer_rexmt_discard;
885                         f_reset = TT_REXMT_RST;
886                         break;
887                 case TT_PERSIST:
888                         t_callout = &tp->t_timers->tt_persist;
889                         f_callout = tcp_timer_persist_discard;
890                         f_reset = TT_PERSIST_RST;
891                         break;
892                 case TT_KEEP:
893                         t_callout = &tp->t_timers->tt_keep;
894                         f_callout = tcp_timer_keep_discard;
895                         f_reset = TT_KEEP_RST;
896                         break;
897                 case TT_2MSL:
898                         t_callout = &tp->t_timers->tt_2msl;
899                         f_callout = tcp_timer_2msl_discard;
900                         f_reset = TT_2MSL_RST;
901                         break;
902                 default:
903                         panic("tp %p bad timer_type %#x", tp, timer_type);
904                 }
905
906         if (tp->t_timers->tt_flags & timer_type) {
907                 if (callout_stop(t_callout) &&
908                     (tp->t_timers->tt_flags & f_reset)) {
909                         tp->t_timers->tt_flags &= ~(timer_type | f_reset);
910                 } else {
911                         /*
912                          * Can't stop the callout, defer tcpcb actual deletion
913                          * to the last tcp timer discard callout.
914                          * The TT_STOPPED flag will ensure that no tcp timer
915                          * callouts can be restarted on our behalf, and
916                          * past this point currently running callouts waiting
917                          * on inp lock will return right away after the
918                          * classical check for callout reset/stop events:
919                          * callout_pending() || !callout_active()
920                          */
921                         callout_reset(t_callout, 1, f_callout, tp);
922                 }
923         }
924 }
925
/*
 * Scale a tick count by 1000 / hz.  The multiplication is carried out
 * before the division, in the argument's own arithmetic type.
 * NOTE(review): large tick counts could overflow in the multiply,
 * depending on the type of the argument -- confirm against callers.
 */
#define ticks_to_msecs(t)       ((1000 * (t)) / hz)
927
928 void
929 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
930     struct xtcp_timer *xtimer)
931 {
932         sbintime_t now;
933
934         bzero(xtimer, sizeof(*xtimer));
935         if (timer == NULL)
936                 return;
937         now = getsbinuptime();
938         if (callout_active(&timer->tt_delack))
939                 xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
940         if (callout_active(&timer->tt_rexmt))
941                 xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
942         if (callout_active(&timer->tt_persist))
943                 xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
944         if (callout_active(&timer->tt_keep))
945                 xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
946         if (callout_active(&timer->tt_2msl))
947                 xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
948         xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
949 }