sys/netinet/tcp_timer.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. Neither the name of the University nor the names of its contributors
  14  *    may be used to endorse or promote products derived from this software
  15  *    without specific prior written permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  *
  29  *      @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
  30  */
  31
  32 #include <sys/cdefs.h>
  33 __FBSDID("$FreeBSD$");
  34
  35 #include "opt_inet.h"
  36 #include "opt_inet6.h"
  37 #include "opt_tcpdebug.h"
  38 #include "opt_rss.h"
  39
  40 #include <sys/param.h>
  41 #include <sys/kernel.h>
  42 #include <sys/lock.h>
  43 #include <sys/mbuf.h>
  44 #include <sys/mutex.h>
  45 #include <sys/protosw.h>
  46 #include <sys/smp.h>
  47 #include <sys/socket.h>
  48 #include <sys/socketvar.h>
  49 #include <sys/sysctl.h>
  50 #include <sys/systm.h>
  51
  52 #include <net/if.h>
  53 #include <net/route.h>
  54 #include <net/rss_config.h>
  55 #include <net/vnet.h>
  56 #include <net/netisr.h>
  57
  58 #include <netinet/in.h>
  59 #include <netinet/in_kdtrace.h>
  60 #include <netinet/in_pcb.h>
  61 #include <netinet/in_rss.h>
  62 #include <netinet/in_systm.h>
  63 #ifdef INET6
  64 #include <netinet6/in6_pcb.h>
  65 #endif
  66 #include <netinet/ip_var.h>
  67 #include <netinet/tcp.h>
  68 #include <netinet/tcp_fsm.h>
  69 #include <netinet/tcp_timer.h>
  70 #include <netinet/tcp_var.h>
  71 #include <netinet/cc/cc.h>
  72 #ifdef INET6
  73 #include <netinet6/tcp6_var.h>
  74 #endif
  75 #include <netinet/tcpip.h>
  76 #ifdef TCPDEBUG
  77 #include <netinet/tcp_debug.h>
  78 #endif
  79
     /*
      * Timer interval tunables.
      *
      * Every SYSCTL_PROC below publishes one int under net.inet.tcp and routes
      * access through sysctl_msec_to_ticks().  Judging by that handler's name
      * the sysctl value is expressed in milliseconds while the variable holds
      * ticks -- the handler is defined elsewhere, confirm there.  None of the
      * interval variables is given an initializer at this point; presumably
      * they are assigned during TCP initialisation -- confirm.
      */
  80 int    tcp_persmin;
  81 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
  82     &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");
  83
  84 int    tcp_persmax;
  85 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
  86     &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");
  87
  88 int     tcp_keepinit;
  89 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
  90     &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");
  91
  92 int     tcp_keepidle;
  93 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
  94     &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");
  95
  96 int     tcp_keepintvl;
  97 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
  98     &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");
  99
 100 int     tcp_delacktime;
 101 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
 102     &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
 103     "Time before a delayed ACK is sent");
 104
 105 int     tcp_msl;
 106 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
 107     &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
 108
 109 int     tcp_rexmit_min;
 110 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
 111     &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
 112     "Minimum Retransmission Timeout");
 113
 114 int     tcp_rexmit_slop;
 115 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
 116     &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
 117     "Retransmission Timer Slop");
 118
     /*
      * Plain integer switch (no unit conversion).  tcp_timer_keep() consults
      * it: when non-zero, keepalive probing is done even for sockets that do
      * not have SO_KEEPALIVE set.
      */
 119 static int      always_keepalive = 1;
 120 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
 121     &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
 122
     /*
      * Plain integer switch, off by default.  tcp_timer_2msl() checks it to
      * decide whether a FIN_WAIT_2 connection whose socket can no longer
      * receive is closed straight away.
      */
 123 int    tcp_fast_finwait2_recycle = 0;
 124 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
 125     &tcp_fast_finwait2_recycle, 0,
 126     "Recycle closed FIN_WAIT_2 connections faster");
 127
     /* Interval tunable, handled like the sysctl_msec_to_ticks group above. */
 128 int    tcp_finwait2_timeout;
 129 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
 130     &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");
 131
     /* A count, not an interval: exported as a plain int, default TCPTV_KEEPCNT. */
 132 int     tcp_keepcnt = TCPTV_KEEPCNT;
 133 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
 134     "Number of keepalive probes to send");
 135
 136         /*
              * Idle-time limit for a persisting connection: tcp_timer_persist()
              * compares (ticks - t_rcvtime) against it once the backoff shift is
              * at its maximum.  No sysctl is attached here and no initial value
              * is given at this point.
              */
 137 int     tcp_maxpersistidle;
 138
     /*
      * Plain integer switch, off by default; see the sysctl description for
      * its meaning.  Its consumer is not in the part of the file reviewed.
      */
 139 static int      tcp_rexmit_drop_options = 0;
 140 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
 141     &tcp_rexmit_drop_options, 0,
 142     "Drop TCP options from 3rd and later retransmitted SYN");
 143
     /*
      * Path MTU black-hole detection state.
      *
      * These are declared with VNET_DEFINE and exported with CTLFLAG_VNET, and
      * each gets a V_ accessor macro built on VNET().  The enable flag and the
      * two fallback MSS values are read-write; the three counters are
      * read-only (CTLFLAG_RD) from the sysctl side.
      */

     /* Enable switch; tested at the start of the PLPMTUD path in tcp_timer_rexmt(). */
 144 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
 145 #define V_tcp_pmtud_blackhole_detect    VNET(tcp_pmtud_blackhole_detect)
 146 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
 147     CTLFLAG_RW|CTLFLAG_VNET,
 148     &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
 149     "Path MTU Discovery Black Hole Detection Enabled");
 150
     /* Incremented in tcp_timer_rexmt() when the MSS is lowered to the blackhole MSS. */
 151 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
 152 #define V_tcp_pmtud_blackhole_activated \
 153     VNET(tcp_pmtud_blackhole_activated)
 154 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
 155     CTLFLAG_RD|CTLFLAG_VNET,
 156     &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
 157     "Path MTU Discovery Black Hole Detection, Activation Count");
 158
     /* Incremented in tcp_timer_rexmt() when the MSS falls back to the default MSS. */
 159 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
 160 #define V_tcp_pmtud_blackhole_activated_min_mss \
 161     VNET(tcp_pmtud_blackhole_activated_min_mss)
 162 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
 163     CTLFLAG_RD|CTLFLAG_VNET,
 164     &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
 165     "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");
 166
     /* Failure counter; the code that updates it is not in the part of the file reviewed. */
 167 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
 168 #define V_tcp_pmtud_blackhole_failed    VNET(tcp_pmtud_blackhole_failed)
 169 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
 170     CTLFLAG_RD|CTLFLAG_VNET,
 171     &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
 172     "Path MTU Discovery Black Hole Detection, Failure Count");
 173
     /* Fallback MSS tried for IPv4 connections; only built with INET, default 1200. */
 174 #ifdef INET
 175 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
 176 #define V_tcp_pmtud_blackhole_mss       VNET(tcp_pmtud_blackhole_mss)
 177 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
 178     CTLFLAG_RW|CTLFLAG_VNET,
 179     &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
 180     "Path MTU Discovery Black Hole Detection lowered MSS");
 181 #endif
 182
     /* Fallback MSS tried for IPv6 connections; only built with INET6, default 1220. */
 183 #ifdef INET6
 184 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
 185 #define V_tcp_v6pmtud_blackhole_mss     VNET(tcp_v6pmtud_blackhole_mss)
 186 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
 187     CTLFLAG_RW|CTLFLAG_VNET,
 188     &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
 189     "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
 190 #endif
 191
     /*
      * Switch read by inp_to_cpuid(): non-zero spreads TCP timers across CPUs,
      * zero pins them to cpuid 0.  The compiled-in default is 1 when the
      * kernel is built with RSS and 0 otherwise; it can be changed at run time
      * through the net.inet.tcp.per_cpu_timers sysctl.
      */
 192 #ifdef  RSS
 193 static int      per_cpu_timers = 1;
 194 #else
 195 static int      per_cpu_timers = 0;
 196 #endif
 197 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
 198     &per_cpu_timers , 0, "run tcp timers on all cpus");
 199
     /*
      * Disabled (never compiled) macro form of the flowid -> CPU mapping; the
      * non-RSS branch of inp_to_cpuid() computes the same expression.
      */
 200 #if 0
 201 #define INP_CPU(inp)    (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
 202                 ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
 203 #endif
 204
 205 /*
 206  * Map the given inp to a CPU id.
 207  *
 208  * This queries RSS if it's compiled in, else it defaults to the current
 209  * CPU ID.
 210  */
 211 static inline int
 212 inp_to_cpuid(struct inpcb *inp)
 213 {
 214         u_int cpuid;
 215
 216 #ifdef  RSS
 217         if (per_cpu_timers) {
 218                 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
 219                 if (cpuid == NETISR_CPUID_NONE)
 220                         return (curcpu);        /* XXX */
 221                 else
 222                         return (cpuid);
 223         }
 224 #else
 225         /* Legacy, pre-RSS behaviour */
 226         if (per_cpu_timers) {
 227                 /*
 228                  * We don't have a flowid -> cpuid mapping, so cheat and
 229                  * just map unknown cpuids to curcpu.  Not the best, but
 230                  * apparently better than defaulting to swi 0.
 231                  */
 232                 cpuid = inp->inp_flowid % (mp_maxid + 1);
 233                 if (! CPU_ABSENT(cpuid))
 234                         return (cpuid);
 235                 return (curcpu);
 236         }
 237 #endif
 238         /* Default for RSS and non-RSS - cpuid 0 */
 239         else {
 240                 return (0);
 241         }
 242 }
 243
 244 /*
 245  * TCP slow-timeout hook; per the historical note here it runs every 500 ms.
 246  * For each vnet on the list (walked under the no-sleep read lock) it
 247  * switches to that vnet and calls tcp_tw_2msl_scan(0), ignoring the result.
 248  */
 249 void
 250 tcp_slowtimo(void)
 251 {
 252         VNET_ITERATOR_DECL(vnet_iter);
 253
 254         VNET_LIST_RLOCK_NOSLEEP();
 255         VNET_FOREACH(vnet_iter) {
                     /* Make vnet_iter the current vnet for the scan, then undo it. */
 256                 CURVNET_SET(vnet_iter);
 257                 (void) tcp_tw_2msl_scan(0);
 258                 CURVNET_RESTORE();
 259         }
 260         VNET_LIST_RUNLOCK_NOSLEEP();
 261 }
 262
     /*
      * Retransmit backoff multipliers, indexed by tp->t_rxtshift
      * (TCP_MAXRXTSHIFT + 1 entries each).  tcp_timer_rexmt() multiplies
      * TCPTV_RTOBASE by tcp_syn_backoff[] while in SYN_SENT or SYN_RECEIVED,
      * and TCP_REXMTVAL(tp) by tcp_backoff[] in every other state.
      */
 263 int     tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
 264     { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
 265
 266 int     tcp_backoff[TCP_MAXRXTSHIFT + 1] =
 267     { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
 268
     /*
      * Hand-computed total of the thirteen tcp_backoff[] entries listed above
      * (511 + 4 * 512); it must be updated by hand if that table changes.
      * Used by tcp_timer_persist() as an idle-time bound.
      */
 269 static int tcp_totbackoff = 2559;       /* sum of tcp_backoff[] */
 270
 271 /*
 272  * TCP timer processing.
 273  */
 274
 275 void
 276 tcp_timer_delack(void *xtp)
 277 {
 278         struct tcpcb *tp = xtp;
 279         struct inpcb *inp;
 280         CURVNET_SET(tp->t_vnet);
 281
 282         inp = tp->t_inpcb;
 283         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 284         INP_WLOCK(inp);
 285         if (callout_pending(&tp->t_timers->tt_delack) ||
 286             !callout_active(&tp->t_timers->tt_delack)) {
 287                 INP_WUNLOCK(inp);
 288                 CURVNET_RESTORE();
 289                 return;
 290         }
 291         callout_deactivate(&tp->t_timers->tt_delack);
 292         if ((inp->inp_flags & INP_DROPPED) != 0) {
 293                 INP_WUNLOCK(inp);
 294                 CURVNET_RESTORE();
 295                 return;
 296         }
 297         tp->t_flags |= TF_ACKNOW;
 298         TCPSTAT_INC(tcps_delack);
 299         (void) tp->t_fb->tfb_tcp_output(tp);
 300         INP_WUNLOCK(inp);
 301         CURVNET_RESTORE();
 302 }
 303
 304 /*
 305  * When a timer wants to remove a TCB it must
 306  * hold the INP_INFO_RLOCK(). The timer function
 307  * should only have grabbed the INP_WLOCK() when
 308  * it entered. To safely switch to holding both the
 309  * INP_INFO_RLOCK() and the INP_WLOCK() we must first
 310  * grab a reference on the inp, which will hold the inp
 311  * so that it can't be removed. We then unlock the INP_WLOCK(),
 312  * and grab the INP_INFO_RLOCK() lock. Once we have the INP_INFO_RLOCK()
 313  * we proceed again to get the INP_WLOCK() (this preserves proper
 314  * lock order). After acquiring the INP_WLOCK we must check if someone
 315  * else deleted the pcb i.e. the inp_flags check.
 316  * If so we return 1 otherwise we return 0.
 317  *
 318  * No matter what the tcp_inpinfo_lock_add() function
 319  * returns the caller must afterwards call tcp_inpinfo_lock_del()
 320  * to drop the locks and reference properly.
 321  */
 322
 323 int
 324 tcp_inpinfo_lock_add(struct inpcb *inp)
 325 {
 326         in_pcbref(inp);
 327         INP_WUNLOCK(inp);
 328         INP_INFO_RLOCK(&V_tcbinfo);
 329         INP_WLOCK(inp);
 330         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 331                 return(1);
 332         }
 333         return(0);
 334
 335 }
 336
 337 void
 338 tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp)
 339 {
 340         INP_INFO_RUNLOCK(&V_tcbinfo);
 341         if (inp && (tp == NULL)) {
 342                 /*
 343                  * If tcp_close/drop() gets called and tp
 344                  * returns NULL, then the function dropped
 345                  * the inp lock, we hold a reference keeping
 346                  * this around, so we must re-aquire the
 347                  * INP_WLOCK() in order to proceed with
 348                  * our dropping the inp reference.
 349                  */
 350                 INP_WLOCK(inp);
 351         }
 352         if (inp && in_pcbrele_wlocked(inp) == 0)
 353                 INP_WUNLOCK(inp);
 354 }
 355
     /*
      * Callout handler for the tt_2msl timer; 'xtp' is the owning tcpcb.
      *
      * Takes the inpcb write lock and then either returns without action
      * (callout rescheduled or stopped, inpcb dropped, inpcb in TIME_WAIT),
      * re-arms itself for another TP_KEEPINTVL(tp), or closes the connection
      * with tcp_close() after upgrading the locks via tcp_inpinfo_lock_add().
      */
 356 void
 357 tcp_timer_2msl(void *xtp)
 358 {
 359         struct tcpcb *tp = xtp;
 360         struct inpcb *inp;
 361         CURVNET_SET(tp->t_vnet);
 362 #ifdef TCPDEBUG
 363         int ostate;
 364
 365         ostate = tp->t_state;
 366 #endif
 367         inp = tp->t_inpcb;
 368         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 369         INP_WLOCK(inp);
             /* Note: done on every invocation, before the callout state is checked. */
 370         tcp_free_sackholes(tp);
             /* Rescheduled (pending) or stopped (not active) meanwhile: nothing to do. */
 371         if (callout_pending(&tp->t_timers->tt_2msl) ||
 372             !callout_active(&tp->t_timers->tt_2msl)) {
 373                 INP_WUNLOCK(tp->t_inpcb);
 374                 CURVNET_RESTORE();
 375                 return;
 376         }
 377         callout_deactivate(&tp->t_timers->tt_2msl);
 378         if ((inp->inp_flags & INP_DROPPED) != 0) {
 379                 INP_WUNLOCK(inp);
 380                 CURVNET_RESTORE();
 381                 return;
 382         }
 383         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 384                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 385         /*
 386          * 2 MSL timeout in shutdown went off.  If we're closed but
 387          * still waiting for peer to close and connection has been idle
 388          * too long delete connection control block.  Otherwise, check
 389          * again in a bit.
 390          *
 391          * If in TIME_WAIT state just ignore as this timeout is handled in
 392          * tcp_tw_2msl_scan().
 393          *
 394          * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed,
 395          * there's no point in hanging onto FIN_WAIT_2 socket. Just close it.
 396          * Ignore fact that there were recent incoming segments.
 397          */
 398         if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
 399                 INP_WUNLOCK(inp);
 400                 CURVNET_RESTORE();
 401                 return;
 402         }
 403         if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
 404             tp->t_inpcb && tp->t_inpcb->inp_socket &&
 405             (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
 406                 TCPSTAT_INC(tcps_finwait2_drops);
                     /*
                      * Non-zero from tcp_inpinfo_lock_add() means the inp became
                      * TIME_WAIT or DROPPED while its lock was released: undo the
                      * lock upgrade and leave without closing.
                      */
 407                 if (tcp_inpinfo_lock_add(inp)) {
 408                         tcp_inpinfo_lock_del(inp, tp);
 409                         goto out;
 410                 }
 411                 tp = tcp_close(tp);
                     /* tcp_inpinfo_lock_del() releases the locks, hence the jump past the unlock below. */
 412                 tcp_inpinfo_lock_del(inp, tp);
 413                 goto out;
 414         } else {
                     /* Still within the idle allowance: look again after one keepalive interval. */
 415                 if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
 416                         callout_reset(&tp->t_timers->tt_2msl,
 417                                       TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
 418                 } else {
 419                         if (tcp_inpinfo_lock_add(inp)) {
 420                                 tcp_inpinfo_lock_del(inp, tp);
 421                                 goto out;
 422                         }
 423                         tp = tcp_close(tp);
 424                         tcp_inpinfo_lock_del(inp, tp);
 425                         goto out;
 426                 }
 427        }
 428
             /*
              * Only the re-arm path above falls through to here; every path that
              * called tcp_close() has already jumped to "out".
              */
 429 #ifdef TCPDEBUG
 430         if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 431                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 432                           PRU_SLOWTIMO);
 433 #endif
 434         TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 435
 436         if (tp != NULL)
 437                 INP_WUNLOCK(inp);
 438 out:
 439         CURVNET_RESTORE();
 440 }
 441
     /*
      * Callout handler for the tt_keep (keepalive) timer; 'xtp' is the owning
      * tcpcb.
      *
      * Takes the inpcb write lock.  After the usual callout and INP_DROPPED
      * checks it either re-arms the timer (connection not idle long enough, or
      * keepalives not applicable), sends a keepalive probe and re-arms for
      * TP_KEEPINTVL(tp), or drops the connection with ETIMEDOUT.
      */
 442 void
 443 tcp_timer_keep(void *xtp)
 444 {
 445         struct tcpcb *tp = xtp;
 446         struct tcptemp *t_template;
 447         struct inpcb *inp;
 448         CURVNET_SET(tp->t_vnet);
 449 #ifdef TCPDEBUG
 450         int ostate;
 451
 452         ostate = tp->t_state;
 453 #endif
 454         inp = tp->t_inpcb;
 455         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 456         INP_WLOCK(inp);
             /* Rescheduled (pending) or stopped (not active) meanwhile: nothing to do. */
 457         if (callout_pending(&tp->t_timers->tt_keep) ||
 458             !callout_active(&tp->t_timers->tt_keep)) {
 459                 INP_WUNLOCK(inp);
 460                 CURVNET_RESTORE();
 461                 return;
 462         }
 463         callout_deactivate(&tp->t_timers->tt_keep);
 464         if ((inp->inp_flags & INP_DROPPED) != 0) {
 465                 INP_WUNLOCK(inp);
 466                 CURVNET_RESTORE();
 467                 return;
 468         }
 469         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 470                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 471
 472         /*
 473          * Because we don't regularly reset the keepalive callout in
 474          * the ESTABLISHED state, it may be that we don't actually need
 475          * to send a keepalive yet. If that occurs, schedule another
 476          * call for the next time the keepalive timer might expire.
 477          */
 478         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
 479                 u_int idletime;
 480
 481                 idletime = ticks - tp->t_rcvtime;
 482                 if (idletime < TP_KEEPIDLE(tp)) {
                             /* Re-arm for just the remainder of the idle period. */
 483                         callout_reset(&tp->t_timers->tt_keep,
 484                             TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp);
 485                         INP_WUNLOCK(inp);
 486                         CURVNET_RESTORE();
 487                         return;
 488                 }
 489         }
 490
 491         /*
 492          * Keep-alive timer went off; send something
 493          * or drop connection if idle for too long.
 494          */
 495         TCPSTAT_INC(tcps_keeptimeo);
             /* Timer fired while still in a pre-ESTABLISHED state: give up on the connection. */
 496         if (tp->t_state < TCPS_ESTABLISHED)
 497                 goto dropit;
 498         if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
 499             tp->t_state <= TCPS_CLOSING) {
                     /* Idle for the keepidle period plus the whole probing allowance: drop. */
 500                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
 501                         goto dropit;
 502                 /*
 503                  * Send a packet designed to force a response
 504                  * if the peer is up and reachable:
 505                  * either an ACK if the connection is still alive,
 506                  * or an RST if the peer has closed the connection
 507                  * due to timeout or reboot.
 508                  * Using sequence number tp->snd_una-1
 509                  * causes the transmitted zero-length segment
 510                  * to lie outside the receive window;
 511                  * by the protocol spec, this requires the
 512                  * correspondent TCP to respond.
 513                  */
 514                 TCPSTAT_INC(tcps_keepprobe);
 515                 t_template = tcpip_maketemplate(inp);
                     /* A NULL template skips this probe; the timer is still re-armed below. */
 516                 if (t_template) {
 517                         tcp_respond(tp, t_template->tt_ipgen,
 518                                     &t_template->tt_t, (struct mbuf *)NULL,
 519                                     tp->rcv_nxt, tp->snd_una - 1, 0);
 520                         free(t_template, M_TEMP);
 521                 }
 522                 callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
 523                               tcp_timer_keep, tp);
 524         } else
                     /* Keepalives not applicable right now: just check back after TP_KEEPIDLE(tp). */
 525                 callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
 526                               tcp_timer_keep, tp);
 527
 528 #ifdef TCPDEBUG
 529         if (inp->inp_socket->so_options & SO_DEBUG)
 530                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 531                           PRU_SLOWTIMO);
 532 #endif
 533         TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 534         INP_WUNLOCK(inp);
 535         CURVNET_RESTORE();
 536         return;
 537
 538 dropit:
 539         TCPSTAT_INC(tcps_keepdrops);
 540
             /*
              * Non-zero from tcp_inpinfo_lock_add() means the inp became TIME_WAIT
              * or DROPPED while its lock was released: undo the lock upgrade and
              * leave without dropping.
              */
 541         if (tcp_inpinfo_lock_add(inp)) {
 542                 tcp_inpinfo_lock_del(inp, tp);
 543                 goto out;
 544         }
 545         tp = tcp_drop(tp, ETIMEDOUT);
 546
 547 #ifdef TCPDEBUG
 548         if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 549                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 550                           PRU_SLOWTIMO);
 551 #endif
 552         TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
             /* Releases the pcbinfo lock, the inp reference and the inp lock. */
 553         tcp_inpinfo_lock_del(inp, tp);
 554 out:
 555         CURVNET_RESTORE();
 556 }
 557
     /*
      * Callout handler for the tt_persist timer; 'xtp' is the owning tcpcb.
      *
      * Takes the inpcb write lock.  After the usual callout and INP_DROPPED
      * checks it either drops the connection with ETIMEDOUT (idle too long,
      * see the two tests below) or re-arms the persist timer through
      * tcp_setpersist() and calls the output routine with TF_FORCEDATA set.
      */
 558 void
 559 tcp_timer_persist(void *xtp)
 560 {
 561         struct tcpcb *tp = xtp;
 562         struct inpcb *inp;
 563         CURVNET_SET(tp->t_vnet);
 564 #ifdef TCPDEBUG
 565         int ostate;
 566
 567         ostate = tp->t_state;
 568 #endif
 569         inp = tp->t_inpcb;
 570         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 571         INP_WLOCK(inp);
             /* Rescheduled (pending) or stopped (not active) meanwhile: nothing to do. */
 572         if (callout_pending(&tp->t_timers->tt_persist) ||
 573             !callout_active(&tp->t_timers->tt_persist)) {
 574                 INP_WUNLOCK(inp);
 575                 CURVNET_RESTORE();
 576                 return;
 577         }
 578         callout_deactivate(&tp->t_timers->tt_persist);
 579         if ((inp->inp_flags & INP_DROPPED) != 0) {
 580                 INP_WUNLOCK(inp);
 581                 CURVNET_RESTORE();
 582                 return;
 583         }
 584         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 585                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 586         /*
 587          * Persistence timer into zero window.
 588          * Force a byte to be output, if possible.
 589          */
 590         TCPSTAT_INC(tcps_persisttimeo);
 591         /*
 592          * Hack: if the peer is dead/unreachable, we do not
 593          * time out if the window is closed.  After a full
 594          * backoff, drop the connection if the idle time
 595          * (no responses to probes) reaches the maximum
 596          * backoff that we would use if retransmitting.
 597          */
 598         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
 599             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
 600              ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
 601                 TCPSTAT_INC(tcps_persistdrop);
                     /*
                      * Non-zero from tcp_inpinfo_lock_add() means the inp became
                      * TIME_WAIT or DROPPED while its lock was released: undo the
                      * lock upgrade and leave without dropping.
                      */
 602                 if (tcp_inpinfo_lock_add(inp)) {
 603                         tcp_inpinfo_lock_del(inp, tp);
 604                         goto out;
 605                 }
 606                 tp = tcp_drop(tp, ETIMEDOUT);
 607                 tcp_inpinfo_lock_del(inp, tp);
 608                 goto out;
 609         }
 610         /*
 611          * If the user has closed the socket then drop a persisting
 612          * connection after a much reduced timeout.
 613          */
             /* Note: compared against the TCPTV_PERSMAX constant, not the tcp_persmax tunable. */
 614         if (tp->t_state > TCPS_CLOSE_WAIT &&
 615             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
 616                 TCPSTAT_INC(tcps_persistdrop);
 617                 if (tcp_inpinfo_lock_add(inp)) {
 618                         tcp_inpinfo_lock_del(inp, tp);
 619                         goto out;
 620                 }
 621                 tp = tcp_drop(tp, ETIMEDOUT);
 622                 tcp_inpinfo_lock_del(inp, tp);
 623                 goto out;
 624         }
             /* Re-arm the persist timer, then force output; TF_FORCEDATA is set only around the call. */
 625         tcp_setpersist(tp);
 626         tp->t_flags |= TF_FORCEDATA;
 627         (void) tp->t_fb->tfb_tcp_output(tp);
 628         tp->t_flags &= ~TF_FORCEDATA;
 629
 630 #ifdef TCPDEBUG
 631         if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
 632                 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
 633 #endif
 634         TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 635         INP_WUNLOCK(inp);
 636 out:
             /* Drop paths arrive here with the locks already released by tcp_inpinfo_lock_del(). */
 637         CURVNET_RESTORE();
 638 }
 639
 640 void
 641 tcp_timer_rexmt(void * xtp)
 642 {
 643         struct tcpcb *tp = xtp;
 644         CURVNET_SET(tp->t_vnet);
 645         int rexmt;
 646         struct inpcb *inp;
 647 #ifdef TCPDEBUG
 648         int ostate;
 649
 650         ostate = tp->t_state;
 651 #endif
 652         inp = tp->t_inpcb;
 653         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 654         INP_WLOCK(inp);
 655         if (callout_pending(&tp->t_timers->tt_rexmt) ||
 656             !callout_active(&tp->t_timers->tt_rexmt)) {
 657                 INP_WUNLOCK(inp);
 658                 CURVNET_RESTORE();
 659                 return;
 660         }
 661         callout_deactivate(&tp->t_timers->tt_rexmt);
 662         if ((inp->inp_flags & INP_DROPPED) != 0) {
 663                 INP_WUNLOCK(inp);
 664                 CURVNET_RESTORE();
 665                 return;
 666         }
 667         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 668                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 669         tcp_free_sackholes(tp);
 670         if (tp->t_fb->tfb_tcp_rexmit_tmr) {
 671                 /* The stack has a timer action too. */
 672                 (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
 673         }
 674         /*
 675          * Retransmission timer went off.  Message has not
 676          * been acked within retransmit interval.  Back off
 677          * to a longer retransmit interval and retransmit one segment.
 678          */
 679         if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
 680                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
 681                 TCPSTAT_INC(tcps_timeoutdrop);
 682                 if (tcp_inpinfo_lock_add(inp)) {
 683                         tcp_inpinfo_lock_del(inp, tp);
 684                         goto out;
 685                 }
 686                 tp = tcp_drop(tp, tp->t_softerror ?
 687                               tp->t_softerror : ETIMEDOUT);
 688                 tcp_inpinfo_lock_del(inp, tp);
 689                 goto out;
 690         }
 691         if (tp->t_state == TCPS_SYN_SENT) {
 692                 /*
 693                  * If the SYN was retransmitted, indicate CWND to be
 694                  * limited to 1 segment in cc_conn_init().
 695                  */
 696                 tp->snd_cwnd = 1;
 697         } else if (tp->t_rxtshift == 1) {
 698                 /*
 699                  * first retransmit; record ssthresh and cwnd so they can
 700                  * be recovered if this turns out to be a "bad" retransmit.
 701                  * A retransmit is considered "bad" if an ACK for this
 702                  * segment is received within RTT/2 interval; the assumption
 703                  * here is that the ACK was already in flight.  See
 704                  * "On Estimating End-to-End Network Path Properties" by
 705                  * Allman and Paxson for more details.
 706                  */
 707                 tp->snd_cwnd_prev = tp->snd_cwnd;
 708                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
 709                 tp->snd_recover_prev = tp->snd_recover;
 710                 if (IN_FASTRECOVERY(tp->t_flags))
 711                         tp->t_flags |= TF_WASFRECOVERY;
 712                 else
 713                         tp->t_flags &= ~TF_WASFRECOVERY;
 714                 if (IN_CONGRECOVERY(tp->t_flags))
 715                         tp->t_flags |= TF_WASCRECOVERY;
 716                 else
 717                         tp->t_flags &= ~TF_WASCRECOVERY;
 718                 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
 719                 tp->t_flags |= TF_PREVVALID;
 720         } else
 721                 tp->t_flags &= ~TF_PREVVALID;
 722         TCPSTAT_INC(tcps_rexmttimeo);
 723         if ((tp->t_state == TCPS_SYN_SENT) ||
 724             (tp->t_state == TCPS_SYN_RECEIVED))
 725                 rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
 726         else
 727                 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
 728         TCPT_RANGESET(tp->t_rxtcur, rexmt,
 729                       tp->t_rttmin, TCPTV_REXMTMAX);
 730
 731         /*
 732          * We enter the path for PLMTUD if connection is established or, if
 733          * connection is FIN_WAIT_1 status, reason for the last is that if
 734          * amount of data we send is very small, we could send it in couple of
 735          * packets and process straight to FIN. In that case we won't catch
 736          * ESTABLISHED state.
 737          */
 738         if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
 739             || (tp->t_state == TCPS_FIN_WAIT_1))) {
 740 #ifdef INET6
 741                 int isipv6;
 742 #endif
 743
 744                 /*
 745                  * Idea here is that at each stage of mtu probe (usually, 1448
 746                  * -> 1188 -> 524) should be given 2 chances to recover before
 747                  *  further clamping down. 'tp->t_rxtshift % 2 == 0' should
 748                  *  take care of that.
 749                  */
 750                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
 751                     (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
 752                     (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) {
 753                         /*
 754                          * Enter Path MTU Black-hole Detection mechanism:
 755                          * - Disable Path MTU Discovery (IP "DF" bit).
 756                          * - Reduce MTU to lower value than what we
 757                          *   negotiated with peer.
 758                          */
 759                         /* Record that we may have found a black hole. */
 760                         tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
 761
 762                         /* Keep track of previous MSS. */
 763                         tp->t_pmtud_saved_maxseg = tp->t_maxseg;
 764
 765                         /*
 766                          * Reduce the MSS to blackhole value or to the default
 767                          * in an attempt to retransmit.
 768                          */
 769 #ifdef INET6
 770                         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
 771                         if (isipv6 &&
 772                             tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
 773                                 /* Use the sysctl tuneable blackhole MSS. */
 774                                 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
 775                                 V_tcp_pmtud_blackhole_activated++;
 776                         } else if (isipv6) {
 777                                 /* Use the default MSS. */
 778                                 tp->t_maxseg = V_tcp_v6mssdflt;
 779                                 /*
 780                                  * Disable Path MTU Discovery when we switch to
 781                                  * minmss.
 782                                  */
 783                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 784                                 V_tcp_pmtud_blackhole_activated_min_mss++;
 785                         }
 786 #endif
 787 #if defined(INET6) && defined(INET)
 788                         else
 789 #endif
 790 #ifdef INET
 791                         if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
 792                                 /* Use the sysctl tuneable blackhole MSS. */
 793                                 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
 794                                 V_tcp_pmtud_blackhole_activated++;
 795                         } else {
 796                                 /* Use the default MSS. */
 797                                 tp->t_maxseg = V_tcp_mssdflt;
 798                                 /*
 799                                  * Disable Path MTU Discovery when we switch to
 800                                  * minmss.
 801                                  */
 802                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 803                                 V_tcp_pmtud_blackhole_activated_min_mss++;
 804                         }
 805 #endif
 806                         /*
 807                          * Reset the slow-start flight size
 808                          * as it may depend on the new MSS.
 809                          */
 810                         if (CC_ALGO(tp)->conn_init != NULL)
 811                                 CC_ALGO(tp)->conn_init(tp->ccv);
 812                 } else {
 813                         /*
 814                          * If further retransmissions are still unsuccessful
 815                          * with a lowered MTU, maybe this isn't a blackhole and
 816                          * we restore the previous MSS and blackhole detection
 817                          * flags.
 818                          * The limit '6' is determined by giving each probe
 819                          * stage (1448, 1188, 524) 2 chances to recover.
 820                          */
 821                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
 822                             (tp->t_rxtshift > 6)) {
 823                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 824                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
 825                                 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
 826                                 V_tcp_pmtud_blackhole_failed++;
 827                                 /*
 828                                  * Reset the slow-start flight size as it
 829                                  * may depend on the new MSS.
 830                                  */
 831                                 if (CC_ALGO(tp)->conn_init != NULL)
 832                                         CC_ALGO(tp)->conn_init(tp->ccv);
 833                         }
 834                 }
 835         }
 836
 837         /*
 838          * Disable RFC1323 and SACK if we haven't got any response to
 839          * our third SYN to work-around some broken terminal servers
 840          * (most of which have hopefully been retired) that have bad VJ
 841          * header compression code which trashes TCP segments containing
 842          * unknown-to-them TCP options.
 843          */
 844         if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
 845             (tp->t_rxtshift == 3))
 846                 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
 847         /*
 848          * If we backed off this far, notify the L3 protocol that we're having
 849          * connection problems.
 850          */
 851         if (tp->t_rxtshift > TCP_RTT_INVALIDATE) {
 852 #ifdef INET6
 853                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
 854                         in6_losing(tp->t_inpcb);
 855                 else
 856 #endif
 857                         in_losing(tp->t_inpcb);
 858         }
 859         tp->snd_nxt = tp->snd_una;
 860         tp->snd_recover = tp->snd_max;
 861         /*
 862          * Force a segment to be sent.
 863          */
 864         tp->t_flags |= TF_ACKNOW;
 865         /*
 866          * If timing a segment in this window, stop the timer.
 867          */
 868         tp->t_rtttime = 0;
 869
 870         cc_cong_signal(tp, NULL, CC_RTO);
 871
 872         (void) tp->t_fb->tfb_tcp_output(tp);
 873
 874 #ifdef TCPDEBUG
 875         if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 876                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 877                           PRU_SLOWTIMO);
 878 #endif
 879         TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 880         INP_WUNLOCK(inp);
 881 out:
 882         CURVNET_RESTORE();
 883 }
 884
 885 void
 886 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
 887 {
 888         struct callout *t_callout;
 889         timeout_t *f_callout;
 890         struct inpcb *inp = tp->t_inpcb;
 891         int cpu = inp_to_cpuid(inp);
 892
 893 #ifdef TCP_OFFLOAD
 894         if (tp->t_flags & TF_TOE)
 895                 return;
 896 #endif
 897
 898         if (tp->t_timers->tt_flags & TT_STOPPED)
 899                 return;
 900
 901         switch (timer_type) {
 902                 case TT_DELACK:
 903                         t_callout = &tp->t_timers->tt_delack;
 904                         f_callout = tcp_timer_delack;
 905                         break;
 906                 case TT_REXMT:
 907                         t_callout = &tp->t_timers->tt_rexmt;
 908                         f_callout = tcp_timer_rexmt;
 909                         break;
 910                 case TT_PERSIST:
 911                         t_callout = &tp->t_timers->tt_persist;
 912                         f_callout = tcp_timer_persist;
 913                         break;
 914                 case TT_KEEP:
 915                         t_callout = &tp->t_timers->tt_keep;
 916                         f_callout = tcp_timer_keep;
 917                         break;
 918                 case TT_2MSL:
 919                         t_callout = &tp->t_timers->tt_2msl;
 920                         f_callout = tcp_timer_2msl;
 921                         break;
 922                 default:
 923                         if (tp->t_fb->tfb_tcp_timer_activate) {
 924                                 tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
 925                                 return;
 926                         }
 927                         panic("tp %p bad timer_type %#x", tp, timer_type);
 928                 }
 929         if (delta == 0) {
 930                 callout_stop(t_callout);
 931         } else {
 932                 callout_reset_on(t_callout, delta, f_callout, tp, cpu);
 933         }
 934 }
 935
 936 int
 937 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
 938 {
 939         struct callout *t_callout;
 940
 941         switch (timer_type) {
 942                 case TT_DELACK:
 943                         t_callout = &tp->t_timers->tt_delack;
 944                         break;
 945                 case TT_REXMT:
 946                         t_callout = &tp->t_timers->tt_rexmt;
 947                         break;
 948                 case TT_PERSIST:
 949                         t_callout = &tp->t_timers->tt_persist;
 950                         break;
 951                 case TT_KEEP:
 952                         t_callout = &tp->t_timers->tt_keep;
 953                         break;
 954                 case TT_2MSL:
 955                         t_callout = &tp->t_timers->tt_2msl;
 956                         break;
 957                 default:
 958                         if (tp->t_fb->tfb_tcp_timer_active) {
 959                                 return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
 960                         }
 961                         panic("tp %p bad timer_type %#x", tp, timer_type);
 962                 }
 963         return callout_active(t_callout);
 964 }
 965
 966 void
 967 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
 968 {
 969         struct callout *t_callout;
 970
 971         tp->t_timers->tt_flags |= TT_STOPPED;
 972         switch (timer_type) {
 973                 case TT_DELACK:
 974                         t_callout = &tp->t_timers->tt_delack;
 975                         break;
 976                 case TT_REXMT:
 977                         t_callout = &tp->t_timers->tt_rexmt;
 978                         break;
 979                 case TT_PERSIST:
 980                         t_callout = &tp->t_timers->tt_persist;
 981                         break;
 982                 case TT_KEEP:
 983                         t_callout = &tp->t_timers->tt_keep;
 984                         break;
 985                 case TT_2MSL:
 986                         t_callout = &tp->t_timers->tt_2msl;
 987                         break;
 988                 default:
 989                         if (tp->t_fb->tfb_tcp_timer_stop) {
 990                                 /*
 991                                  * XXXrrs we need to look at this with the
 992                                  * stop case below (flags).
 993                                  */
 994                                 tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
 995                                 return;
 996                         }
 997                         panic("tp %p bad timer_type %#x", tp, timer_type);
 998                 }
 999
1000         if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
1001                 /*
1002                  * Can't stop the callout, defer tcpcb actual deletion
1003                  * to the last one. We do this using the async drain
1004                  * function and incrementing the count in
1005                  */
1006                 tp->t_timers->tt_draincnt++;
1007         }
1008 }