/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *      The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>

int    tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I",
    "minimum persistence interval");

int    tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I",
    "maximum persistence interval");

int     tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I",
    "time to establish connection");

int     tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I",
    "time before keepalive probes begin");

int     tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I",
    "time between keepalive probes");

int     tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

VNET_DEFINE(int, tcp_msl);
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(tcp_msl), 0, sysctl_msec_to_ticks, "I",
    "Maximum segment lifetime");

int     tcp_rexmit_initial;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I",
    "Initial Retransmission Timeout");

int     tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

int     tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

VNET_DEFINE(int, tcp_always_keepalive) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_VNET|CTLFLAG_RW,
    &VNET_NAME(tcp_always_keepalive), 0,
    "Assume SO_KEEPALIVE on all TCP connections");

int    tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

int    tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I",
    "FIN-WAIT2 timeout");

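/*
 * Together with keepintvl, keepcnt bounds the keepalive probing window:
 * TP_MAXIDLE(tp) (see tcp_timer.h) is the product of the probe count and
 * the probe interval, after which tcp_timer_keep() drops an unresponsive
 * connection.
 */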
int     tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

        /* max idle probes */
int     tcp_maxpersistidle;

int     tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");

int     tcp_maxunacktime = TCPTV_MAXUNACKTIME;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_maxunacktime, 0, sysctl_msec_to_ticks, "I",
    "Maximum time (in ms) that a session can linger without making progress");

VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

#ifdef INET
VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif

#ifdef  RSS
static int      per_cpu_timers = 1;
#else
static int      per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers, 0, "run tcp timers on all cpus");

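/*
 * Validate updates to net.inet.tcp.retries: values outside the range
 * [1, TCP_MAXRXTSHIFT] are rejected with EINVAL before the new value
 * is committed to the vnet.
 */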
static int
sysctl_net_inet_tcp_retries(SYSCTL_HANDLER_ARGS)
{
        int error, new;

        new = V_tcp_retries;
        error = sysctl_handle_int(oidp, &new, 0, req);
        if (error == 0 && req->newptr) {
                if ((new < 1) || (new > TCP_MAXRXTSHIFT))
                        error = EINVAL;
                else
                        V_tcp_retries = new;
        }
        return (error);
}

VNET_DEFINE(int, tcp_retries) = TCP_MAXRXTSHIFT;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, retries,
    CTLTYPE_INT | CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_retries), 0, sysctl_net_inet_tcp_retries, "I",
    "maximum number of consecutive timer based retransmissions");

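/*
 * Example (hypothetical value): 'sysctl net.inet.tcp.retries=8' lowers the
 * limit on consecutive timer-based retransmissions at runtime; the default
 * is TCP_MAXRXTSHIFT.
 */
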
/*
 * Map the given inp to a CPU id.
 *
 * This queries RSS if it's compiled in; otherwise it hashes the inp's
 * flowid across the available CPUs, falling back to curcpu.  With
 * per_cpu_timers disabled (the default without RSS), everything maps
 * to CPU 0.
 */
inline int
inp_to_cpuid(struct inpcb *inp)
{
        u_int cpuid;

        if (per_cpu_timers) {
#ifdef  RSS
                cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
                if (cpuid == NETISR_CPUID_NONE)
                        return (curcpu);        /* XXX */
                else
                        return (cpuid);
#endif
                /*
                 * We don't have a flowid -> cpuid mapping, so cheat and
                 * just map unknown cpuids to curcpu.  Not the best, but
                 * apparently better than defaulting to swi 0.
                 */
                cpuid = inp->inp_flowid % (mp_maxid + 1);
                if (!CPU_ABSENT(cpuid))
                        return (cpuid);
                return (curcpu);
        } else {
                return (0);
        }
}

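/*
 * RTO backoff multipliers, indexed by t_rxtshift: exponential doubling,
 * capped at 512x.  tcp_totbackoff is their sum; tcp_timer_persist() drops
 * a connection once its idle time reaches TCP_REXMTVAL(tp) * tcp_totbackoff.
 */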
int     tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

int tcp_totbackoff = 2559;      /* sum of tcp_backoff[] */

/*
 * TCP timer processing.
 *
 * Each connection has 5 timers associated with it, which can be scheduled
 * simultaneously.  They are all serviced by a single callout,
 * tcp_timer_enter().  This function executes the next due timer via the
 * tcp_timersw[] vector.  Each timer is supposed to return 'true' unless
 * the connection was destroyed.  If it returns true, tcp_timer_enter()
 * will schedule the callout for the next timer.
 */

typedef bool tcp_timer_t(struct tcpcb *);
static tcp_timer_t tcp_timer_delack;
static tcp_timer_t tcp_timer_2msl;
static tcp_timer_t tcp_timer_keep;
static tcp_timer_t tcp_timer_persist;
static tcp_timer_t tcp_timer_rexmt;

static tcp_timer_t * const tcp_timersw[TT_N] = {
        [TT_DELACK] = tcp_timer_delack,
        [TT_REXMT] = tcp_timer_rexmt,
        [TT_PERSIST] = tcp_timer_persist,
        [TT_KEEP] = tcp_timer_keep,
        [TT_2MSL] = tcp_timer_2msl,
};

/*
 * tcp_output_locked() is a timer-specific variation of a call to
 * tcp_output(); see tcp_var.h for the rest.  It handles a drop request
 * from advanced stacks, but keeps the tcpcb locked unless tcp_drop()
 * destroyed it.  Returns true if the tcpcb is valid and locked.
 */
static inline bool
tcp_output_locked(struct tcpcb *tp)
{
        int rv;

        INP_WLOCK_ASSERT(tptoinpcb(tp));

        if ((rv = tp->t_fb->tfb_tcp_output(tp)) < 0) {
                KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
                    ("TCP stack %s requested tcp_drop(%p)",
                    tp->t_fb->tfb_tcp_block_name, tp));
                tp = tcp_drop(tp, rv);
        }

        return (tp != NULL);
}

static bool
tcp_timer_delack(struct tcpcb *tp)
{
        struct epoch_tracker et;
#if defined(INVARIANTS) || defined(VIMAGE)
        struct inpcb *inp = tptoinpcb(tp);
#endif
        bool rv;

        INP_WLOCK_ASSERT(inp);

        CURVNET_SET(inp->inp_vnet);
        tp->t_flags |= TF_ACKNOW;
        TCPSTAT_INC(tcps_delack);
        NET_EPOCH_ENTER(et);
        rv = tcp_output_locked(tp);
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

        return (rv);
}

static bool
tcp_timer_2msl(struct tcpcb *tp)
{
        struct inpcb *inp = tptoinpcb(tp);
        bool close = false;

        INP_WLOCK_ASSERT(inp);

        TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
        CURVNET_SET(inp->inp_vnet);
        tcp_log_end_status(tp, TCP_EI_STATUS_2MSL);
        tcp_free_sackholes(tp);
        /*
         * The 2 MSL shutdown timeout went off.  If we're closed but still
         * waiting for the peer to close, and the connection has been idle
         * too long, delete the connection control block.  Otherwise, check
         * again in a bit.
         *
         * If FIN_WAIT_2 fast recycling is enabled, we are in FIN_WAIT_2,
         * and the receiver has closed, there's no point in hanging onto the
         * FIN_WAIT_2 socket: just close it, ignoring the fact that there
         * were recent incoming segments.
         *
         * XXXGL: check if inp_socket shall always be !NULL here?
         */
        if (tp->t_state == TCPS_TIME_WAIT) {
                close = true;
        } else if (tp->t_state == TCPS_FIN_WAIT_2 &&
            tcp_fast_finwait2_recycle && inp->inp_socket &&
            (inp->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
                TCPSTAT_INC(tcps_finwait2_drops);
                close = true;
        } else {
                if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp))
                        tcp_timer_activate(tp, TT_2MSL, TP_KEEPINTVL(tp));
                else
                        close = true;
        }
        if (close) {
                struct epoch_tracker et;

                NET_EPOCH_ENTER(et);
                tp = tcp_close(tp);
                NET_EPOCH_EXIT(et);
        }
        CURVNET_RESTORE();

        return (tp != NULL);
}

static bool
tcp_timer_keep(struct tcpcb *tp)
{
        struct epoch_tracker et;
        struct inpcb *inp = tptoinpcb(tp);
        struct tcptemp *t_template;

        INP_WLOCK_ASSERT(inp);

        TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
        CURVNET_SET(inp->inp_vnet);
        /*
         * Because we don't regularly reset the keepalive callout in
         * the ESTABLISHED state, it may be that we don't actually need
         * to send a keepalive yet.  If that occurs, schedule another
         * call for the next time the keepalive timer might expire.
         */
        if (TCPS_HAVEESTABLISHED(tp->t_state)) {
                u_int idletime;

                idletime = ticks - tp->t_rcvtime;
                if (idletime < TP_KEEPIDLE(tp)) {
                        tcp_timer_activate(tp, TT_KEEP,
                            TP_KEEPIDLE(tp) - idletime);
                        CURVNET_RESTORE();
                        return (true);
                }
        }

        /*
         * Keep-alive timer went off; send something
         * or drop connection if idle for too long.
         */
        TCPSTAT_INC(tcps_keeptimeo);
        if (tp->t_state < TCPS_ESTABLISHED)
                goto dropit;
        if ((V_tcp_always_keepalive ||
            inp->inp_socket->so_options & SO_KEEPALIVE) &&
            tp->t_state <= TCPS_CLOSING) {
                if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
                        goto dropit;
                /*
                 * Send a packet designed to force a response
                 * if the peer is up and reachable:
                 * either an ACK if the connection is still alive,
                 * or an RST if the peer has closed the connection
                 * due to timeout or reboot.
                 * Using sequence number tp->snd_una-1
                 * causes the transmitted zero-length segment
                 * to lie outside the receive window;
                 * by the protocol spec, this requires the
                 * correspondent TCP to respond.
                 */
                TCPSTAT_INC(tcps_keepprobe);
                t_template = tcpip_maketemplate(inp);
                if (t_template) {
                        NET_EPOCH_ENTER(et);
                        tcp_respond(tp, t_template->tt_ipgen,
                                    &t_template->tt_t, (struct mbuf *)NULL,
                                    tp->rcv_nxt, tp->snd_una - 1, 0);
                        NET_EPOCH_EXIT(et);
                        free(t_template, M_TEMP);
                }
                tcp_timer_activate(tp, TT_KEEP, TP_KEEPINTVL(tp));
        } else
                tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));

        CURVNET_RESTORE();
        return (true);

dropit:
        TCPSTAT_INC(tcps_keepdrops);
        NET_EPOCH_ENTER(et);
        tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
        tp = tcp_drop(tp, ETIMEDOUT);
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

        return (tp != NULL);
}

459
460 /*
461  * Has this session exceeded the maximum time without seeing a substantive
462  * acknowledgement? If so, return true; otherwise false.
463  */
464 static bool
465 tcp_maxunacktime_check(struct tcpcb *tp)
466 {
467
468         /* Are we tracking this timer for this session? */
469         if (TP_MAXUNACKTIME(tp) == 0)
470                 return false;
471
472         /* Do we have a current measurement. */
473         if (tp->t_acktime == 0)
474                 return false;
475
476         /* Are we within the acceptable range? */
477         if (TSTMP_GT(TP_MAXUNACKTIME(tp) + tp->t_acktime, (u_int)ticks))
478                 return false;
479
480         /* We exceeded the timer. */
481         TCPSTAT_INC(tcps_progdrops);
482         return true;
483 }
484
static bool
tcp_timer_persist(struct tcpcb *tp)
{
        struct epoch_tracker et;
#if defined(INVARIANTS) || defined(VIMAGE)
        struct inpcb *inp = tptoinpcb(tp);
#endif
        bool progdrop, rv;

        INP_WLOCK_ASSERT(inp);

        TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
        CURVNET_SET(inp->inp_vnet);
        /*
         * Persistence timer into zero window.
         * Force a byte to be output, if possible.
         */
        TCPSTAT_INC(tcps_persisttimeo);
        /*
         * Hack: if the peer is dead/unreachable, we do not
         * time out if the window is closed.  After a full
         * backoff, drop the connection if the idle time
         * (no responses to probes) reaches the maximum
         * backoff that we would use if retransmitting.
         * Also, drop the connection if we haven't been making
         * progress.
         */
        progdrop = tcp_maxunacktime_check(tp);
        if (progdrop || (tp->t_rxtshift >= V_tcp_retries &&
            (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
             ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff))) {
                if (!progdrop)
                        TCPSTAT_INC(tcps_persistdrop);
                tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
                goto dropit;
        }
        /*
         * If the user has closed the socket then drop a persisting
         * connection after a much reduced timeout.
         */
        if (tp->t_state > TCPS_CLOSE_WAIT &&
            (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
                TCPSTAT_INC(tcps_persistdrop);
                tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
                goto dropit;
        }
        tcp_setpersist(tp);
        tp->t_flags |= TF_FORCEDATA;
        NET_EPOCH_ENTER(et);
        if ((rv = tcp_output_locked(tp)))
                tp->t_flags &= ~TF_FORCEDATA;
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

        return (rv);

dropit:
        NET_EPOCH_ENTER(et);
        tp = tcp_drop(tp, ETIMEDOUT);
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

        return (tp != NULL);
}

549
550 static bool
551 tcp_timer_rexmt(struct tcpcb *tp)
552 {
553         struct epoch_tracker et;
554         struct inpcb *inp = tptoinpcb(tp);
555         int rexmt;
556         bool isipv6, rv;
557
558         INP_WLOCK_ASSERT(inp);
559
560         TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
561         CURVNET_SET(inp->inp_vnet);
562         tcp_free_sackholes(tp);
563         if (tp->t_fb->tfb_tcp_rexmit_tmr) {
564                 /* The stack has a timer action too. */
565                 (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
566         }
567         /*
568          * Retransmission timer went off.  Message has not
569          * been acked within retransmit interval.  Back off
570          * to a longer retransmit interval and retransmit one segment.
571          *
572          * If we've either exceeded the maximum number of retransmissions,
573          * or we've gone long enough without making progress, then drop
574          * the session.
575          */
576         if (++tp->t_rxtshift > V_tcp_retries || tcp_maxunacktime_check(tp)) {
577                 if (tp->t_rxtshift > V_tcp_retries)
578                         TCPSTAT_INC(tcps_timeoutdrop);
579                 tp->t_rxtshift = V_tcp_retries;
580                 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
581                 NET_EPOCH_ENTER(et);
582                 tp = tcp_drop(tp, ETIMEDOUT);
583                 NET_EPOCH_EXIT(et);
584                 CURVNET_RESTORE();
585
586                 return (tp != NULL);
587         }
588         if (tp->t_state == TCPS_SYN_SENT) {
589                 /*
590                  * If the SYN was retransmitted, indicate CWND to be
591                  * limited to 1 segment in cc_conn_init().
592                  */
593                 tp->snd_cwnd = 1;
594         } else if (tp->t_rxtshift == 1) {
595                 /*
596                  * first retransmit; record ssthresh and cwnd so they can
597                  * be recovered if this turns out to be a "bad" retransmit.
598                  * A retransmit is considered "bad" if an ACK for this
599                  * segment is received within RTT/2 interval; the assumption
600                  * here is that the ACK was already in flight.  See
601                  * "On Estimating End-to-End Network Path Properties" by
602                  * Allman and Paxson for more details.
603                  */
604                 tp->snd_cwnd_prev = tp->snd_cwnd;
605                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
606                 tp->snd_recover_prev = tp->snd_recover;
607                 if (IN_FASTRECOVERY(tp->t_flags))
608                         tp->t_flags |= TF_WASFRECOVERY;
609                 else
610                         tp->t_flags &= ~TF_WASFRECOVERY;
611                 if (IN_CONGRECOVERY(tp->t_flags))
612                         tp->t_flags |= TF_WASCRECOVERY;
613                 else
614                         tp->t_flags &= ~TF_WASCRECOVERY;
615                 if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
616                         tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
                /*
                 * If we've negotiated timestamps, badrxtwin will be set
                 * to the value that tcp_output() sets the retransmitted
                 * packet's to_tsval to.
                 */
                tp->t_flags |= TF_PREVVALID;
        } else
                tp->t_flags &= ~TF_PREVVALID;
        TCPSTAT_INC(tcps_rexmttimeo);
        if ((tp->t_state == TCPS_SYN_SENT) ||
            (tp->t_state == TCPS_SYN_RECEIVED))
                rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
        else
                rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
        TCPT_RANGESET(tp->t_rxtcur, rexmt,
                      tp->t_rttmin, TCPTV_REXMTMAX);

        /*
         * We enter the PLPMTUD path if the connection is in the
         * ESTABLISHED or FIN_WAIT_1 state.  The latter matters because,
         * if the amount of data we send is very small, we could send it
         * in a couple of packets and proceed straight to FIN without
         * ever passing through the ESTABLISHED state.
         */
#ifdef INET6
        isipv6 = (inp->inp_vflag & INP_IPV6) ? true : false;
#else
        isipv6 = false;
#endif
        if (((V_tcp_pmtud_blackhole_detect == 1) ||
            (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
            (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
            ((tp->t_state == TCPS_ESTABLISHED) ||
            (tp->t_state == TCPS_FIN_WAIT_1))) {
                if (tp->t_rxtshift == 1) {
                        /*
                         * We enter blackhole detection after the first
                         * unsuccessful timer based retransmission.  We then
                         * reduce the MSS up to two times, giving each
                         * candidate MSS two retransmission tries, but a
                         * candidate only gets its two tries if it actually
                         * reduces the MSS.
                         */
                        tp->t_blackhole_enter = 2;
                        tp->t_blackhole_exit = tp->t_blackhole_enter;
                        if (isipv6) {
#ifdef INET6
                                if (tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss)
                                        tp->t_blackhole_exit += 2;
                                if (tp->t_maxseg > V_tcp_v6mssdflt &&
                                    V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt)
                                        tp->t_blackhole_exit += 2;
#endif
                        } else {
#ifdef INET
                                if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss)
                                        tp->t_blackhole_exit += 2;
                                if (tp->t_maxseg > V_tcp_mssdflt &&
                                    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt)
                                        tp->t_blackhole_exit += 2;
#endif
                        }
                }
                if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
                    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
                    (tp->t_rxtshift >= tp->t_blackhole_enter &&
                    tp->t_rxtshift < tp->t_blackhole_exit &&
                    (tp->t_rxtshift - tp->t_blackhole_enter) % 2 == 0)) {
                        /*
                         * Enter Path MTU Black-hole Detection mechanism:
                         * - Disable Path MTU Discovery (IP "DF" bit).
                         * - Reduce MTU to lower value than what we
                         *   negotiated with peer.
                         */
                        if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
                                /* Record that we may have found a black hole. */
                                tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
                                /* Keep track of previous MSS. */
                                tp->t_pmtud_saved_maxseg = tp->t_maxseg;
                        }

                        /*
                         * Reduce the MSS to blackhole value or to the default
                         * in an attempt to retransmit.
                         */
#ifdef INET6
                        if (isipv6 &&
                            tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss &&
                            V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt) {
                                /* Use the sysctl tuneable blackhole MSS. */
                                tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
                                TCPSTAT_INC(tcps_pmtud_blackhole_activated);
                        } else if (isipv6) {
                                /* Use the default MSS. */
                                tp->t_maxseg = V_tcp_v6mssdflt;
                                /*
                                 * Disable Path MTU Discovery when we switch to
                                 * minmss.
                                 */
                                tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
                                TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
                        }
#endif
#if defined(INET6) && defined(INET)
                        else
#endif
#ifdef INET
                        if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss &&
                            V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt) {
                                /* Use the sysctl tuneable blackhole MSS. */
                                tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
                                TCPSTAT_INC(tcps_pmtud_blackhole_activated);
                        } else {
                                /* Use the default MSS. */
                                tp->t_maxseg = V_tcp_mssdflt;
                                /*
                                 * Disable Path MTU Discovery when we switch to
                                 * minmss.
                                 */
                                tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
                                TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
                        }
#endif
                        /*
                         * Reset the slow-start flight size
                         * as it may depend on the new MSS.
                         */
                        if (CC_ALGO(tp)->conn_init != NULL)
                                CC_ALGO(tp)->conn_init(&tp->t_ccv);
                } else {
                        /*
                         * If further retransmissions are still unsuccessful
                         * with a lowered MTU, maybe this isn't a blackhole,
                         * so restore the previous MSS and the blackhole
                         * detection flags.
                         */
                        if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
                            (tp->t_rxtshift >= tp->t_blackhole_exit)) {
                                tp->t_flags2 |= TF2_PLPMTU_PMTUD;
                                tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
                                tp->t_maxseg = tp->t_pmtud_saved_maxseg;
                                TCPSTAT_INC(tcps_pmtud_blackhole_failed);
                                /*
                                 * Reset the slow-start flight size as it
                                 * may depend on the new MSS.
                                 */
                                if (CC_ALGO(tp)->conn_init != NULL)
                                        CC_ALGO(tp)->conn_init(&tp->t_ccv);
                        }
                }
        }

        /*
         * Disable RFC1323 and SACK if we haven't got any response to
         * our third SYN to work-around some broken terminal servers
         * (most of which have hopefully been retired) that have bad VJ
         * header compression code which trashes TCP segments containing
         * unknown-to-them TCP options.
         */
        if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
            (tp->t_rxtshift == 3))
                tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
        /*
         * If we backed off this far, notify the L3 protocol that we're having
         * connection problems.
         */
        if (tp->t_rxtshift > TCP_RTT_INVALIDATE) {
#ifdef INET6
                if ((inp->inp_vflag & INP_IPV6) != 0)
                        in6_losing(inp);
                else
#endif
                        in_losing(inp);
        }
        tp->snd_nxt = tp->snd_una;
        tp->snd_recover = tp->snd_max;
        /*
         * Force a segment to be sent.
         */
        tp->t_flags |= TF_ACKNOW;
        /*
         * If timing a segment in this window, stop the timer.
         */
        tp->t_rtttime = 0;

        cc_cong_signal(tp, NULL, CC_RTO);
        NET_EPOCH_ENTER(et);
        rv = tcp_output_locked(tp);
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

        return (rv);
}

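/*
 * Record a timer event (starting, stopping, processing) in the TCP black
 * box log, if logging is enabled for this connection.  For TT_STARTING
 * events the timeout is converted from ticks to milliseconds and stored
 * in tlb_flex2.
 */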
static void
tcp_bblog_timer(struct tcpcb *tp, tt_which which, tt_what what, uint32_t ticks)
{
        struct tcp_log_buffer *lgb;
        uint64_t ms;

        INP_WLOCK_ASSERT(tptoinpcb(tp));
        if (tcp_bblogging_on(tp))
                lgb = tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0,
                    NULL, false, NULL, NULL, 0, NULL);
        else
                lgb = NULL;
        if (lgb != NULL) {
                lgb->tlb_flex1 = (what << 8) | which;
                if (what == TT_STARTING) {
                        /* Convert ticks to ms and store it in tlb_flex2. */
                        if (hz == 1000)
                                lgb->tlb_flex2 = ticks;
                        else {
                                ms = (((uint64_t)ticks * 1000) + (hz - 1)) / hz;
                                if (ms > UINT32_MAX)
                                        lgb->tlb_flex2 = UINT32_MAX;
                                else
                                        lgb->tlb_flex2 = (uint32_t)ms;
                        }
                }
        }
}

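/*
 * Return the index of the timer with the earliest deadline, or TT_N if
 * no timer is scheduled.  If 'precision' is non-NULL, store how far past
 * that deadline the callout may fire while still respecting every
 * pending timer's precision bound.
 */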
static inline tt_which
tcp_timer_next(struct tcpcb *tp, sbintime_t *precision)
{
        tt_which i, rv;
        sbintime_t after, before;

        for (i = 0, rv = TT_N, after = before = SBT_MAX; i < TT_N; i++) {
                if (tp->t_timers[i] < after) {
                        after = tp->t_timers[i];
                        rv = i;
                }
                before = MIN(before, tp->t_timers[i] + tp->t_precisions[i]);
        }
        if (precision != NULL)
                *precision = before - after;

        return (rv);
}

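/*
 * Callout body servicing all TCP timers of a connection: run the timer
 * whose deadline is earliest, and if the tcpcb survives, reschedule the
 * callout for the next pending timer.  A timer handler that destroys the
 * tcpcb returns false and has already released the inpcb lock.
 */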
static void
tcp_timer_enter(void *xtp)
{
        struct tcpcb *tp = xtp;
        struct inpcb *inp = tptoinpcb(tp);
        sbintime_t precision;
        tt_which which;
        bool tp_valid;

        INP_WLOCK_ASSERT(inp);
        MPASS((curthread->td_pflags & TDP_INTCPCALLOUT) == 0);

        curthread->td_pflags |= TDP_INTCPCALLOUT;

        which = tcp_timer_next(tp, NULL);
        MPASS(which < TT_N);
        tp->t_timers[which] = SBT_MAX;
        tp->t_precisions[which] = 0;

        tcp_bblog_timer(tp, which, TT_PROCESSING, 0);
        tp_valid = tcp_timersw[which](tp);
        if (tp_valid) {
                tcp_bblog_timer(tp, which, TT_PROCESSED, 0);
                if ((which = tcp_timer_next(tp, &precision)) != TT_N) {
                        callout_reset_sbt_on(&tp->t_callout,
                            tp->t_timers[which], precision, tcp_timer_enter,
                            tp, inp_to_cpuid(inp), C_ABSOLUTE);
                }
                INP_WUNLOCK(inp);
        }

        curthread->td_pflags &= ~TDP_INTCPCALLOUT;
}

/*
 * Activate or stop (delta == 0) a TCP timer.
 */
void
tcp_timer_activate(struct tcpcb *tp, tt_which which, u_int delta)
{
        struct inpcb *inp = tptoinpcb(tp);
        sbintime_t precision;
        tt_what what;

#ifdef TCP_OFFLOAD
        if (tp->t_flags & TF_TOE)
                return;
#endif

        INP_WLOCK_ASSERT(inp);

        if (delta > 0) {
                what = TT_STARTING;
                callout_when(tick_sbt * delta, 0, C_HARDCLOCK,
                    &tp->t_timers[which], &tp->t_precisions[which]);
        } else {
                what = TT_STOPPING;
                tp->t_timers[which] = SBT_MAX;
        }
        tcp_bblog_timer(tp, which, what, delta);

        if ((which = tcp_timer_next(tp, &precision)) != TT_N)
                callout_reset_sbt_on(&tp->t_callout, tp->t_timers[which],
                    precision, tcp_timer_enter, tp, inp_to_cpuid(inp),
                    C_ABSOLUTE);
        else
                callout_stop(&tp->t_callout);
}

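/*
 * Report whether the given timer is currently pending.
 */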
bool
tcp_timer_active(struct tcpcb *tp, tt_which which)
{

        INP_WLOCK_ASSERT(tptoinpcb(tp));

        return (tp->t_timers[which] != SBT_MAX);
}

/*
 * Stop all timers associated with tcpcb.
 *
 * Called only on tcpcb destruction.  The tcpcb shall already have been
 * dropped from the pcb lookup database, and the socket shall not be losing
 * its last reference.
 *
 * XXXGL: unfortunately our callout(9) is not able to fully stop a locked
 * callout even when only two threads are involved: the callout itself and
 * the thread that does callout_stop().  See where softclock_call_cc() swaps
 * the callwheel lock for the callout lock and then checks cc_exec_cancel().
 * This is the race window.  If it happens, tcp_timer_enter() won't be
 * executed, but the pcb lock will be taken and released, hence we can't
 * free the memory.  Until callout(9) is improved, just keep retrying.  In
 * my profiling I've seen such an event happen less than once per hour with
 * 20-30 Gbit/s of traffic.
 */
void
tcp_timer_stop(struct tcpcb *tp)
{
        struct inpcb *inp = tptoinpcb(tp);

        INP_WLOCK_ASSERT(inp);

        if (curthread->td_pflags & TDP_INTCPCALLOUT) {
                int stopped __diagused;

                stopped = callout_stop(&tp->t_callout);
                MPASS(stopped == 0);
        } else while (__predict_false(callout_stop(&tp->t_callout) == 0)) {
                INP_WUNLOCK(inp);
                kern_yield(PRI_UNCHANGED);
                INP_WLOCK(inp);
        }
}