]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/tcp_timer.c
Update compiler-rt to trunk r224034. This brings a number of new
[FreeBSD/FreeBSD.git] / sys / netinet / tcp_timer.c
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *      @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
30  */
31
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34
35 #include "opt_inet.h"
36 #include "opt_inet6.h"
37 #include "opt_tcpdebug.h"
38 #include "opt_rss.h"
39
40 #include <sys/param.h>
41 #include <sys/kernel.h>
42 #include <sys/lock.h>
43 #include <sys/mbuf.h>
44 #include <sys/mutex.h>
45 #include <sys/protosw.h>
46 #include <sys/smp.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/sysctl.h>
50 #include <sys/systm.h>
51
52 #include <net/if.h>
53 #include <net/route.h>
54 #include <net/vnet.h>
55 #include <net/netisr.h>
56
57 #include <netinet/cc.h>
58 #include <netinet/in.h>
59 #include <netinet/in_pcb.h>
60 #include <netinet/in_rss.h>
61 #include <netinet/in_systm.h>
62 #ifdef INET6
63 #include <netinet6/in6_pcb.h>
64 #endif
65 #include <netinet/ip_var.h>
66 #include <netinet/tcp_fsm.h>
67 #include <netinet/tcp_timer.h>
68 #include <netinet/tcp_var.h>
69 #ifdef INET6
70 #include <netinet6/tcp6_var.h>
71 #endif
72 #include <netinet/tcpip.h>
73 #ifdef TCPDEBUG
74 #include <netinet/tcp_debug.h>
75 #endif
76
77 int     tcp_keepinit;
78 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
79     &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");
80
81 int     tcp_keepidle;
82 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
83     &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");
84
85 int     tcp_keepintvl;
86 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
87     &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");
88
89 int     tcp_delacktime;
90 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
91     &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
92     "Time before a delayed ACK is sent");
93
94 int     tcp_msl;
95 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
96     &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
97
98 int     tcp_rexmit_min;
99 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
100     &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
101     "Minimum Retransmission Timeout");
102
103 int     tcp_rexmit_slop;
104 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
105     &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
106     "Retransmission Timer Slop");
107
108 static int      always_keepalive = 1;
109 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
110     &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
111
112 int    tcp_fast_finwait2_recycle = 0;
113 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 
114     &tcp_fast_finwait2_recycle, 0,
115     "Recycle closed FIN_WAIT_2 connections faster");
116
117 int    tcp_finwait2_timeout;
118 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
119     &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");
120
121 int     tcp_keepcnt = TCPTV_KEEPCNT;
122 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
123     "Number of keepalive probes to send");
124
125         /* max idle probes */
126 int     tcp_maxpersistidle;
127
128 static int      tcp_rexmit_drop_options = 0;
129 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
130     &tcp_rexmit_drop_options, 0,
131     "Drop TCP options from 3rd and later retransmitted SYN");
132
133 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
134 #define V_tcp_pmtud_blackhole_detect    VNET(tcp_pmtud_blackhole_detect)
135 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
136     CTLFLAG_RW|CTLFLAG_VNET,
137     &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
138     "Path MTU Discovery Black Hole Detection Enabled");
139
140 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
141 #define V_tcp_pmtud_blackhole_activated \
142     VNET(tcp_pmtud_blackhole_activated)
143 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
144     CTLFLAG_RD|CTLFLAG_VNET,
145     &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
146     "Path MTU Discovery Black Hole Detection, Activation Count");
147
148 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
149 #define V_tcp_pmtud_blackhole_activated_min_mss \
150     VNET(tcp_pmtud_blackhole_activated_min_mss)
151 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
152     CTLFLAG_RD|CTLFLAG_VNET,
153     &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
154     "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");
155
156 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
157 #define V_tcp_pmtud_blackhole_failed    VNET(tcp_pmtud_blackhole_failed)
158 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
159     CTLFLAG_RD|CTLFLAG_VNET,
160     &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
161     "Path MTU Discovery Black Hole Detection, Failure Count");
162
163 #ifdef INET
164 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
165 #define V_tcp_pmtud_blackhole_mss       VNET(tcp_pmtud_blackhole_mss)
166 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
167     CTLFLAG_RW|CTLFLAG_VNET,
168     &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
169     "Path MTU Discovery Black Hole Detection lowered MSS");
170 #endif
171
172 #ifdef INET6
173 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
174 #define V_tcp_v6pmtud_blackhole_mss     VNET(tcp_v6pmtud_blackhole_mss)
175 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
176     CTLFLAG_RW|CTLFLAG_VNET,
177     &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
178     "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
179 #endif
180
181 #ifdef  RSS
182 static int      per_cpu_timers = 1;
183 #else
184 static int      per_cpu_timers = 0;
185 #endif
186 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
187     &per_cpu_timers , 0, "run tcp timers on all cpus");
188
189 #if 0
190 #define INP_CPU(inp)    (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
191                 ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
192 #endif
193
194 /*
195  * Map the given inp to a CPU id.
196  *
197  * This queries RSS if it's compiled in, else it defaults to the current
198  * CPU ID.
199  */
200 static inline int
201 inp_to_cpuid(struct inpcb *inp)
202 {
203         u_int cpuid;
204
205 #ifdef  RSS
206         if (per_cpu_timers) {
207                 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
208                 if (cpuid == NETISR_CPUID_NONE)
209                         return (curcpu);        /* XXX */
210                 else
211                         return (cpuid);
212         }
213 #else
214         /* Legacy, pre-RSS behaviour */
215         if (per_cpu_timers) {
216                 /*
217                  * We don't have a flowid -> cpuid mapping, so cheat and
218                  * just map unknown cpuids to curcpu.  Not the best, but
219                  * apparently better than defaulting to swi 0.
220                  */
221                 cpuid = inp->inp_flowid % (mp_maxid + 1);
222                 if (! CPU_ABSENT(cpuid))
223                         return (cpuid);
224                 return (curcpu);
225         }
226 #endif
227         /* Default for RSS and non-RSS - cpuid 0 */
228         else {
229                 return (0);
230         }
231 }
232
/*
 * Tcp protocol timeout routine called every 500 ms.
 * Updates timestamps used for TCP
 * causes finite state machine actions if timers expire.
 */
void
tcp_slowtimo(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	/* Walk every vnet and scan its TIME_WAIT (2MSL) list. */
	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		/* Return value (entries reused) is not needed here. */
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}
251
/* SYN retransmit backoff multipliers, indexed by t_rxtshift. */
int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

/* General retransmit backoff multipliers, indexed by t_rxtshift. */
int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

static int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */

/* Counts timer callouts that observed tp->t_inpcb == NULL (see XXXRW notes). */
static int tcp_timer_race;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_race, CTLFLAG_RD, &tcp_timer_race,
    0, "Count of t_inpcb races on tcp_discardcb");
263
264 /*
265  * TCP timer processing.
266  */
267
/*
 * Delayed-ACK timer: an ACK has been held back long enough, so force
 * one out now by setting TF_ACKNOW and calling tcp_output().
 */
void
tcp_timer_delack(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);

	inp = tp->t_inpcb;
	/*
	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
	 * tear-down mean we need it as a work-around for races between
	 * timers and tcp_discardcb().
	 *
	 * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL"));
	 */
	if (inp == NULL) {
		tcp_timer_race++;
		CURVNET_RESTORE();
		return;
	}
	INP_WLOCK(inp);
	/*
	 * Stale callout: it was rescheduled or stopped between firing and
	 * our acquisition of the inp lock; do nothing.
	 */
	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	/* Connection has been dropped; nothing left to ACK. */
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}

	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	(void) tcp_output(tp);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
}
308
309 void
310 tcp_timer_2msl(void *xtp)
311 {
312         struct tcpcb *tp = xtp;
313         struct inpcb *inp;
314         CURVNET_SET(tp->t_vnet);
315 #ifdef TCPDEBUG
316         int ostate;
317
318         ostate = tp->t_state;
319 #endif
320         /*
321          * XXXRW: Does this actually happen?
322          */
323         INP_INFO_WLOCK(&V_tcbinfo);
324         inp = tp->t_inpcb;
325         /*
326          * XXXRW: While this assert is in fact correct, bugs in the tcpcb
327          * tear-down mean we need it as a work-around for races between
328          * timers and tcp_discardcb().
329          *
330          * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL"));
331          */
332         if (inp == NULL) {
333                 tcp_timer_race++;
334                 INP_INFO_WUNLOCK(&V_tcbinfo);
335                 CURVNET_RESTORE();
336                 return;
337         }
338         INP_WLOCK(inp);
339         tcp_free_sackholes(tp);
340         if (callout_pending(&tp->t_timers->tt_2msl) ||
341             !callout_active(&tp->t_timers->tt_2msl)) {
342                 INP_WUNLOCK(tp->t_inpcb);
343                 INP_INFO_WUNLOCK(&V_tcbinfo);
344                 CURVNET_RESTORE();
345                 return;
346         }
347         callout_deactivate(&tp->t_timers->tt_2msl);
348         if ((inp->inp_flags & INP_DROPPED) != 0) {
349                 INP_WUNLOCK(inp);
350                 INP_INFO_WUNLOCK(&V_tcbinfo);
351                 CURVNET_RESTORE();
352                 return;
353         }
354         /*
355          * 2 MSL timeout in shutdown went off.  If we're closed but
356          * still waiting for peer to close and connection has been idle
357          * too long, or if 2MSL time is up from TIME_WAIT, delete connection
358          * control block.  Otherwise, check again in a bit.
359          *
360          * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 
361          * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 
362          * Ignore fact that there were recent incoming segments.
363          */
364         if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
365             tp->t_inpcb && tp->t_inpcb->inp_socket && 
366             (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
367                 TCPSTAT_INC(tcps_finwait2_drops);
368                 tp = tcp_close(tp);             
369         } else {
370                 if (tp->t_state != TCPS_TIME_WAIT &&
371                    ticks - tp->t_rcvtime <= TP_MAXIDLE(tp))
372                        callout_reset_on(&tp->t_timers->tt_2msl,
373                            TP_KEEPINTVL(tp), tcp_timer_2msl, tp,
374                            inp_to_cpuid(inp));
375                else
376                        tp = tcp_close(tp);
377        }
378
379 #ifdef TCPDEBUG
380         if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
381                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
382                           PRU_SLOWTIMO);
383 #endif
384         if (tp != NULL)
385                 INP_WUNLOCK(inp);
386         INP_INFO_WUNLOCK(&V_tcbinfo);
387         CURVNET_RESTORE();
388 }
389
/*
 * Keepalive timer: either sends a probe on an idle established
 * connection, or drops a connection that has gone unanswered (or never
 * became established) for too long.
 */
void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_WLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	/*
	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
	 * tear-down mean we need it as a work-around for races between
	 * timers and tcp_discardcb().
	 *
	 * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL"));
	 */
	if (inp == NULL) {
		tcp_timer_race++;
		INP_INFO_WUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	INP_WLOCK(inp);
	/* Stale callout (rescheduled or stopped while we waited): bail. */
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	/* Connection already dropped; nothing to do. */
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	/* Still in handshake: the connection never established, drop it. */
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		/* All probes exhausted with no response: drop. */
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			tcp_respond(tp, t_template->tt_ipgen,
				    &t_template->tt_t, (struct mbuf *)NULL,
				    tp->rcv_nxt, tp->snd_una - 1, 0);
			free(t_template, M_TEMP);
		}
		/* Probe sent: re-arm for the next probe interval. */
		callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp, inp_to_cpuid(inp));
	} else
		/* Keepalives not requested: just re-arm the idle timer. */
		callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp, inp_to_cpuid(inp));

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	return;

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	/* tcp_drop() returning NULL means it already released the lock. */
	if (tp != NULL)
		INP_WUNLOCK(tp->t_inpcb);
	INP_INFO_WUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
493
/*
 * Persist timer: the send window has been zero for too long; force out
 * a window-probe byte, or drop the connection after prolonged silence.
 */
void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_WLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	/*
	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
	 * tear-down mean we need it as a work-around for races between
	 * timers and tcp_discardcb().
	 *
	 * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL"));
	 */
	if (inp == NULL) {
		tcp_timer_race++;
		INP_INFO_WUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	INP_WLOCK(inp);
	/* Stale callout (rescheduled or stopped while we waited): bail. */
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	/* Connection already dropped; nothing to do. */
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	/* Re-arm the persist timer, then force a probe segment out. */
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	(void) tcp_output(tp);
	tp->t_flags &= ~TF_FORCEDATA;

out:
#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	/* tcp_drop() returning NULL means it already released the lock. */
	if (tp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
579
580 void
581 tcp_timer_rexmt(void * xtp)
582 {
583         struct tcpcb *tp = xtp;
584         CURVNET_SET(tp->t_vnet);
585         int rexmt;
586         int headlocked;
587         struct inpcb *inp;
588 #ifdef TCPDEBUG
589         int ostate;
590
591         ostate = tp->t_state;
592 #endif
593
594         INP_INFO_RLOCK(&V_tcbinfo);
595         inp = tp->t_inpcb;
596         /*
597          * XXXRW: While this assert is in fact correct, bugs in the tcpcb
598          * tear-down mean we need it as a work-around for races between
599          * timers and tcp_discardcb().
600          *
601          * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL"));
602          */
603         if (inp == NULL) {
604                 tcp_timer_race++;
605                 INP_INFO_RUNLOCK(&V_tcbinfo);
606                 CURVNET_RESTORE();
607                 return;
608         }
609         INP_WLOCK(inp);
610         if (callout_pending(&tp->t_timers->tt_rexmt) ||
611             !callout_active(&tp->t_timers->tt_rexmt)) {
612                 INP_WUNLOCK(inp);
613                 INP_INFO_RUNLOCK(&V_tcbinfo);
614                 CURVNET_RESTORE();
615                 return;
616         }
617         callout_deactivate(&tp->t_timers->tt_rexmt);
618         if ((inp->inp_flags & INP_DROPPED) != 0) {
619                 INP_WUNLOCK(inp);
620                 INP_INFO_RUNLOCK(&V_tcbinfo);
621                 CURVNET_RESTORE();
622                 return;
623         }
624         tcp_free_sackholes(tp);
625         /*
626          * Retransmission timer went off.  Message has not
627          * been acked within retransmit interval.  Back off
628          * to a longer retransmit interval and retransmit one segment.
629          */
630         if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
631                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
632                 TCPSTAT_INC(tcps_timeoutdrop);
633                 in_pcbref(inp);
634                 INP_INFO_RUNLOCK(&V_tcbinfo);
635                 INP_WUNLOCK(inp);
636                 INP_INFO_WLOCK(&V_tcbinfo);
637                 INP_WLOCK(inp);
638                 if (in_pcbrele_wlocked(inp)) {
639                         INP_INFO_WUNLOCK(&V_tcbinfo);
640                         CURVNET_RESTORE();
641                         return;
642                 }
643                 if (inp->inp_flags & INP_DROPPED) {
644                         INP_WUNLOCK(inp);
645                         INP_INFO_WUNLOCK(&V_tcbinfo);
646                         CURVNET_RESTORE();
647                         return;
648                 }
649
650                 tp = tcp_drop(tp, tp->t_softerror ?
651                               tp->t_softerror : ETIMEDOUT);
652                 headlocked = 1;
653                 goto out;
654         }
655         INP_INFO_RUNLOCK(&V_tcbinfo);
656         headlocked = 0;
657         if (tp->t_state == TCPS_SYN_SENT) {
658                 /*
659                  * If the SYN was retransmitted, indicate CWND to be
660                  * limited to 1 segment in cc_conn_init().
661                  */
662                 tp->snd_cwnd = 1;
663         } else if (tp->t_rxtshift == 1) {
664                 /*
665                  * first retransmit; record ssthresh and cwnd so they can
666                  * be recovered if this turns out to be a "bad" retransmit.
667                  * A retransmit is considered "bad" if an ACK for this
668                  * segment is received within RTT/2 interval; the assumption
669                  * here is that the ACK was already in flight.  See
670                  * "On Estimating End-to-End Network Path Properties" by
671                  * Allman and Paxson for more details.
672                  */
673                 tp->snd_cwnd_prev = tp->snd_cwnd;
674                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
675                 tp->snd_recover_prev = tp->snd_recover;
676                 if (IN_FASTRECOVERY(tp->t_flags))
677                         tp->t_flags |= TF_WASFRECOVERY;
678                 else
679                         tp->t_flags &= ~TF_WASFRECOVERY;
680                 if (IN_CONGRECOVERY(tp->t_flags))
681                         tp->t_flags |= TF_WASCRECOVERY;
682                 else
683                         tp->t_flags &= ~TF_WASCRECOVERY;
684                 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
685                 tp->t_flags |= TF_PREVVALID;
686         } else
687                 tp->t_flags &= ~TF_PREVVALID;
688         TCPSTAT_INC(tcps_rexmttimeo);
689         if (tp->t_state == TCPS_SYN_SENT)
690                 rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
691         else
692                 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
693         TCPT_RANGESET(tp->t_rxtcur, rexmt,
694                       tp->t_rttmin, TCPTV_REXMTMAX);
695
696         /*
697          * We enter the path for PLMTUD if connection is established or, if
698          * connection is FIN_WAIT_1 status, reason for the last is that if
699          * amount of data we send is very small, we could send it in couple of
700          * packets and process straight to FIN. In that case we won't catch
701          * ESTABLISHED state.
702          */
703         if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
704             || (tp->t_state == TCPS_FIN_WAIT_1))) {
705                 int optlen;
706 #ifdef INET6
707                 int isipv6;
708 #endif
709
710                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
711                     (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
712                     (tp->t_rxtshift <= 2)) {
713                         /*
714                          * Enter Path MTU Black-hole Detection mechanism:
715                          * - Disable Path MTU Discovery (IP "DF" bit).
716                          * - Reduce MTU to lower value than what we
717                          *   negotiated with peer.
718                          */
719                         /* Record that we may have found a black hole. */
720                         tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
721
722                         /* Keep track of previous MSS. */
723                         optlen = tp->t_maxopd - tp->t_maxseg;
724                         tp->t_pmtud_saved_maxopd = tp->t_maxopd;
725
726                         /* 
727                          * Reduce the MSS to blackhole value or to the default
728                          * in an attempt to retransmit.
729                          */
730 #ifdef INET6
731                         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
732                         if (isipv6 &&
733                             tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) {
734                                 /* Use the sysctl tuneable blackhole MSS. */
735                                 tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss;
736                                 V_tcp_pmtud_blackhole_activated++;
737                         } else if (isipv6) {
738                                 /* Use the default MSS. */
739                                 tp->t_maxopd = V_tcp_v6mssdflt;
740                                 /*
741                                  * Disable Path MTU Discovery when we switch to
742                                  * minmss.
743                                  */
744                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
745                                 V_tcp_pmtud_blackhole_activated_min_mss++;
746                         }
747 #endif
748 #if defined(INET6) && defined(INET)
749                         else
750 #endif
751 #ifdef INET
752                         if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) {
753                                 /* Use the sysctl tuneable blackhole MSS. */
754                                 tp->t_maxopd = V_tcp_pmtud_blackhole_mss;
755                                 V_tcp_pmtud_blackhole_activated++;
756                         } else {
757                                 /* Use the default MSS. */
758                                 tp->t_maxopd = V_tcp_mssdflt;
759                                 /*
760                                  * Disable Path MTU Discovery when we switch to
761                                  * minmss.
762                                  */
763                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
764                                 V_tcp_pmtud_blackhole_activated_min_mss++;
765                         }
766 #endif
767                         tp->t_maxseg = tp->t_maxopd - optlen;
768                         /*
769                          * Reset the slow-start flight size
770                          * as it may depend on the new MSS.
771                          */
772                         if (CC_ALGO(tp)->conn_init != NULL)
773                                 CC_ALGO(tp)->conn_init(tp->ccv);
774                 } else {
775                         /*
776                          * If further retransmissions are still unsuccessful
777                          * with a lowered MTU, maybe this isn't a blackhole and
778                          * we restore the previous MSS and blackhole detection
779                          * flags.
780                          */
781                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
782                             (tp->t_rxtshift > 4)) {
783                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
784                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
785                                 optlen = tp->t_maxopd - tp->t_maxseg;
786                                 tp->t_maxopd = tp->t_pmtud_saved_maxopd;
787                                 tp->t_maxseg = tp->t_maxopd - optlen;
788                                 V_tcp_pmtud_blackhole_failed++;
789                                 /*
790                                  * Reset the slow-start flight size as it
791                                  * may depend on the new MSS.
792                                  */
793                                 if (CC_ALGO(tp)->conn_init != NULL)
794                                         CC_ALGO(tp)->conn_init(tp->ccv);
795                         }
796                 }
797         }
798
799         /*
800          * Disable RFC1323 and SACK if we haven't got any response to
801          * our third SYN to work-around some broken terminal servers
802          * (most of which have hopefully been retired) that have bad VJ
803          * header compression code which trashes TCP segments containing
804          * unknown-to-them TCP options.
805          */
806         if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
807             (tp->t_rxtshift == 3))
808                 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
809         /*
810          * If we backed off this far, our srtt estimate is probably bogus.
811          * Clobber it so we'll take the next rtt measurement as our srtt;
812          * move the current srtt into rttvar to keep the current
813          * retransmit times until then.
814          */
815         if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
816 #ifdef INET6
817                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
818                         in6_losing(tp->t_inpcb);
819 #endif
820                 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
821                 tp->t_srtt = 0;
822         }
823         tp->snd_nxt = tp->snd_una;
824         tp->snd_recover = tp->snd_max;
825         /*
826          * Force a segment to be sent.
827          */
828         tp->t_flags |= TF_ACKNOW;
829         /*
830          * If timing a segment in this window, stop the timer.
831          */
832         tp->t_rtttime = 0;
833
834         cc_cong_signal(tp, NULL, CC_RTO);
835
836         (void) tcp_output(tp);
837
838 out:
839 #ifdef TCPDEBUG
840         if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
841                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
842                           PRU_SLOWTIMO);
843 #endif
844         if (tp != NULL)
845                 INP_WUNLOCK(inp);
846         if (headlocked)
847                 INP_INFO_WUNLOCK(&V_tcbinfo);
848         CURVNET_RESTORE();
849 }
850
851 void
852 tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
853 {
854         struct callout *t_callout;
855         void *f_callout;
856         struct inpcb *inp = tp->t_inpcb;
857         int cpu = inp_to_cpuid(inp);
858
859 #ifdef TCP_OFFLOAD
860         if (tp->t_flags & TF_TOE)
861                 return;
862 #endif
863
864         switch (timer_type) {
865                 case TT_DELACK:
866                         t_callout = &tp->t_timers->tt_delack;
867                         f_callout = tcp_timer_delack;
868                         break;
869                 case TT_REXMT:
870                         t_callout = &tp->t_timers->tt_rexmt;
871                         f_callout = tcp_timer_rexmt;
872                         break;
873                 case TT_PERSIST:
874                         t_callout = &tp->t_timers->tt_persist;
875                         f_callout = tcp_timer_persist;
876                         break;
877                 case TT_KEEP:
878                         t_callout = &tp->t_timers->tt_keep;
879                         f_callout = tcp_timer_keep;
880                         break;
881                 case TT_2MSL:
882                         t_callout = &tp->t_timers->tt_2msl;
883                         f_callout = tcp_timer_2msl;
884                         break;
885                 default:
886                         panic("bad timer_type");
887                 }
888         if (delta == 0) {
889                 callout_stop(t_callout);
890         } else {
891                 callout_reset_on(t_callout, delta, f_callout, tp, cpu);
892         }
893 }
894
895 int
896 tcp_timer_active(struct tcpcb *tp, int timer_type)
897 {
898         struct callout *t_callout;
899
900         switch (timer_type) {
901                 case TT_DELACK:
902                         t_callout = &tp->t_timers->tt_delack;
903                         break;
904                 case TT_REXMT:
905                         t_callout = &tp->t_timers->tt_rexmt;
906                         break;
907                 case TT_PERSIST:
908                         t_callout = &tp->t_timers->tt_persist;
909                         break;
910                 case TT_KEEP:
911                         t_callout = &tp->t_timers->tt_keep;
912                         break;
913                 case TT_2MSL:
914                         t_callout = &tp->t_timers->tt_2msl;
915                         break;
916                 default:
917                         panic("bad timer_type");
918                 }
919         return callout_active(t_callout);
920 }
921
922 #define ticks_to_msecs(t)       (1000*(t) / hz)
923
924 void
925 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
926     struct xtcp_timer *xtimer)
927 {
928         sbintime_t now;
929
930         bzero(xtimer, sizeof(*xtimer));
931         if (timer == NULL)
932                 return;
933         now = getsbinuptime();
934         if (callout_active(&timer->tt_delack))
935                 xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
936         if (callout_active(&timer->tt_rexmt))
937                 xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
938         if (callout_active(&timer->tt_persist))
939                 xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
940         if (callout_active(&timer->tt_keep))
941                 xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
942         if (callout_active(&timer->tt_2msl))
943                 xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
944         xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
945 }