sys/netinet/tcp_timer.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 4. Neither the name of the University nor the names of its contributors
  14  *    may be used to endorse or promote products derived from this software
  15  *    without specific prior written permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  *
  29  *      @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
  30  */
  31
  32 #include <sys/cdefs.h>
  33 __FBSDID("$FreeBSD$");
  34
  35 #include "opt_inet.h"
  36 #include "opt_inet6.h"
  37 #include "opt_tcpdebug.h"
  38 #include "opt_rss.h"
  39
  40 #include <sys/param.h>
  41 #include <sys/kernel.h>
  42 #include <sys/lock.h>
  43 #include <sys/mbuf.h>
  44 #include <sys/mutex.h>
  45 #include <sys/protosw.h>
  46 #include <sys/smp.h>
  47 #include <sys/socket.h>
  48 #include <sys/socketvar.h>
  49 #include <sys/sysctl.h>
  50 #include <sys/systm.h>
  51
  52 #include <net/if.h>
  53 #include <net/route.h>
  54 #include <net/rss_config.h>
  55 #include <net/vnet.h>
  56 #include <net/netisr.h>
  57
  58 #include <netinet/in.h>
  59 #include <netinet/in_kdtrace.h>
  60 #include <netinet/in_pcb.h>
  61 #include <netinet/in_rss.h>
  62 #include <netinet/in_systm.h>
  63 #ifdef INET6
  64 #include <netinet6/in6_pcb.h>
  65 #endif
  66 #include <netinet/ip_var.h>
  67 #include <netinet/tcp.h>
  68 #include <netinet/tcp_fsm.h>
  69 #include <netinet/tcp_timer.h>
  70 #include <netinet/tcp_var.h>
  71 #include <netinet/cc/cc.h>
  72 #ifdef INET6
  73 #include <netinet6/tcp6_var.h>
  74 #endif
  75 #include <netinet/tcpip.h>
  76 #ifdef TCPDEBUG
  77 #include <netinet/tcp_debug.h>
  78 #endif
  79
   80 int    tcp_persmin;      /* lower bound for the persist interval; NOTE(review): handler name sysctl_msec_to_ticks suggests msec at the sysctl, ticks in the variable - confirm */
   81 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
   82     &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");
   83
   84 int    tcp_persmax;      /* upper bound for the persist interval (same handler as persmin) */
   85 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
   86     &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");
   87
   88 int     tcp_keepinit;    /* time allowed for a connection to become established */
   89 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
   90     &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");
   91
   92 int     tcp_keepidle;    /* idle time before keepalive probing starts */
   93 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
   94     &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");
   95
   96 int     tcp_keepintvl;   /* spacing between successive keepalive probes */
   97 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
   98     &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");
   99
  100 int     tcp_delacktime;  /* delay before a delayed ACK is sent */
  101 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
  102     &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
  103     "Time before a delayed ACK is sent");
  104
  105 int     tcp_msl;         /* maximum segment lifetime */
  106 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
  107     &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
  108
  109 int     tcp_rexmit_min;  /* floor for the retransmission timeout */
  110 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
  111     &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
  112     "Minimum Retransmission Timeout");
  113
  114 int     tcp_rexmit_slop; /* slop applied to the retransmission timer */
  115 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
  116     &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
  117     "Retransmission Timer Slop");
  118
  119 static int      always_keepalive = 1;   /* when non-zero, tcp_timer_keep() probes even without SO_KEEPALIVE */
  120 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
  121     &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
  122
  123 int    tcp_fast_finwait2_recycle = 0;   /* when non-zero, tcp_timer_2msl() closes FIN_WAIT_2 sockets that can no longer receive */
  124 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
  125     &tcp_fast_finwait2_recycle, 0,
  126     "Recycle closed FIN_WAIT_2 connections faster");
  127
  128 int    tcp_finwait2_timeout;    /* FIN_WAIT_2 timeout; not referenced by the timer handlers visible in this part of the file */
  129 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
  130     &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");
  131
  132 int     tcp_keepcnt = TCPTV_KEEPCNT;    /* number of keepalive probes to send; plain integer, no unit conversion */
  133 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
  134     "Number of keepalive probes to send");
 135
  136         /* max idle time in persist state; tcp_timer_persist() compares it against (ticks - t_rcvtime) */
  137 int     tcp_maxpersistidle;
  138
  139 static int      tcp_rexmit_drop_options = 0;    /* off by default; see the sysctl description below */
  140 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
  141     &tcp_rexmit_drop_options, 0,
  142     "Drop TCP options from 3rd and later retransmitted SYN");
  143
  144 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);    /* per-vnet switch tested by tcp_timer_rexmt() before black-hole handling */
  145 #define V_tcp_pmtud_blackhole_detect    VNET(tcp_pmtud_blackhole_detect)
  146 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
  147     CTLFLAG_RW|CTLFLAG_VNET,
  148     &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
  149     "Path MTU Discovery Black Hole Detection Enabled");
  150
  151 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); /* read-only counter: MSS lowered to the black-hole MSS */
  152 #define V_tcp_pmtud_blackhole_activated \
  153     VNET(tcp_pmtud_blackhole_activated)
  154 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
  155     CTLFLAG_RD|CTLFLAG_VNET,
  156     &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
  157     "Path MTU Discovery Black Hole Detection, Activation Count");
  158
  159 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); /* read-only counter: MSS lowered to the default MSS */
  160 #define V_tcp_pmtud_blackhole_activated_min_mss \
  161     VNET(tcp_pmtud_blackhole_activated_min_mss)
  162 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
  163     CTLFLAG_RD|CTLFLAG_VNET,
  164     &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
  165     "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");
  166
  167 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);    /* read-only failure counter */
  168 #define V_tcp_pmtud_blackhole_failed    VNET(tcp_pmtud_blackhole_failed)
  169 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
  170     CTLFLAG_RD|CTLFLAG_VNET,
  171     &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
  172     "Path MTU Discovery Black Hole Detection, Failure Count");
  173
  174 #ifdef INET
  175 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;        /* IPv4 MSS used when t_maxseg exceeds it during black-hole handling */
  176 #define V_tcp_pmtud_blackhole_mss       VNET(tcp_pmtud_blackhole_mss)
  177 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
  178     CTLFLAG_RW|CTLFLAG_VNET,
  179     &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
  180     "Path MTU Discovery Black Hole Detection lowered MSS");
  181 #endif
  182
  183 #ifdef INET6
  184 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;      /* IPv6 counterpart of tcp_pmtud_blackhole_mss */
  185 #define V_tcp_v6pmtud_blackhole_mss     VNET(tcp_v6pmtud_blackhole_mss)
  186 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
  187     CTLFLAG_RW|CTLFLAG_VNET,
  188     &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
  189     "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
  190 #endif
 191
  192 #ifdef  RSS
  193 static int      per_cpu_timers = 1;     /* RSS kernels spread timers across CPUs by default */
  194 #else
  195 static int      per_cpu_timers = 0;     /* otherwise off by default; inp_to_cpuid() then returns CPU 0 */
  196 #endif
  197 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
  198     &per_cpu_timers , 0, "run tcp timers on all cpus");
  199
  200 #if 0   /* compiled out; the non-RSS branch of inp_to_cpuid() performs the same flowid-modulo mapping */
  201 #define INP_CPU(inp)    (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
  202                 ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
  203 #endif
 204
 205 /*
 206  * Map the given inp to a CPU id.
 207  *
 208  * This queries RSS if it's compiled in, else it defaults to the current
 209  * CPU ID.
 210  */
 211 static inline int
 212 inp_to_cpuid(struct inpcb *inp)
 213 {
 214         u_int cpuid;
 215
 216 #ifdef  RSS
 217         if (per_cpu_timers) {
 218                 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
 219                 if (cpuid == NETISR_CPUID_NONE)
 220                         return (curcpu);        /* XXX */
 221                 else
 222                         return (cpuid);
 223         }
 224 #else
 225         /* Legacy, pre-RSS behaviour */
 226         if (per_cpu_timers) {
 227                 /*
 228                  * We don't have a flowid -> cpuid mapping, so cheat and
 229                  * just map unknown cpuids to curcpu.  Not the best, but
 230                  * apparently better than defaulting to swi 0.
 231                  */
 232                 cpuid = inp->inp_flowid % (mp_maxid + 1);
 233                 if (! CPU_ABSENT(cpuid))
 234                         return (cpuid);
 235                 return (curcpu);
 236         }
 237 #endif
 238         /* Default for RSS and non-RSS - cpuid 0 */
 239         else {
 240                 return (0);
 241         }
 242 }
 243
 244 /*
 245  * Tcp protocol timeout routine called every 500 ms.
 246  * Updates timestamps used for TCP
 247  * causes finite state machine actions if timers expire.
 248  */
 249 void
 250 tcp_slowtimo(void)
 251 {
 252         VNET_ITERATOR_DECL(vnet_iter);
 253
 254         VNET_LIST_RLOCK_NOSLEEP();
 255         VNET_FOREACH(vnet_iter) {
 256                 CURVNET_SET(vnet_iter);
 257                 (void) tcp_tw_2msl_scan(0);
 258                 CURVNET_RESTORE();
 259         }
 260         VNET_LIST_RUNLOCK_NOSLEEP();
 261 }
 262
  263 int     tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =  /* multiplies TCPTV_RTOBASE in SYN_SENT/SYN_RECEIVED; indexed by t_rxtshift */
  264     { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
  265
  266 int     tcp_backoff[TCP_MAXRXTSHIFT + 1] =      /* multiplies TCP_REXMTVAL() in all other states; indexed by t_rxtshift */
  267     { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
  268
  269 static int tcp_totbackoff = 2559;       /* sum of tcp_backoff[]; scales TCP_REXMTVAL() for the persist-timer drop threshold */
 270
 271 /*
 272  * TCP timer processing.
 273  */
 274
 275 void
 276 tcp_timer_delack(void *xtp)
 277 {
 278         struct tcpcb *tp = xtp;
 279         struct inpcb *inp;
 280         CURVNET_SET(tp->t_vnet);
 281
 282         inp = tp->t_inpcb;
 283         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 284         INP_WLOCK(inp);
 285         if (callout_pending(&tp->t_timers->tt_delack) ||
 286             !callout_active(&tp->t_timers->tt_delack)) {
 287                 INP_WUNLOCK(inp);
 288                 CURVNET_RESTORE();
 289                 return;
 290         }
 291         callout_deactivate(&tp->t_timers->tt_delack);
 292         if ((inp->inp_flags & INP_DROPPED) != 0) {
 293                 INP_WUNLOCK(inp);
 294                 CURVNET_RESTORE();
 295                 return;
 296         }
 297         tp->t_flags |= TF_ACKNOW;
 298         TCPSTAT_INC(tcps_delack);
 299         (void) tp->t_fb->tfb_tcp_output(tp);
 300         INP_WUNLOCK(inp);
 301         CURVNET_RESTORE();
 302 }
 303
 304 /*
 305  * When a timer wants to remove a TCB it must
 306  * hold the INP_INFO_RLOCK(). The timer function
 307  * should only have grabbed the INP_WLOCK() when
 308  * it entered. To safely switch to holding both the
 309  * INP_INFO_RLOCK() and the INP_WLOCK() we must first
 310  * grab a reference on the inp, which will hold the inp
 311  * so that it can't be removed. We then unlock the INP_WLOCK(),
 312  * and grab the INP_INFO_RLOCK() lock. Once we have the INP_INFO_RLOCK()
 313  * we proceed again to get the INP_WLOCK() (this preserves proper
 314  * lock order). After acquiring the INP_WLOCK we must check if someone
 315  * else deleted the pcb i.e. the inp_flags check.
 316  * If so we return 1 otherwise we return 0.
 317  *
 318  * No matter what the tcp_inpinfo_lock_add() function
 319  * returns the caller must afterwards call tcp_inpinfo_lock_del()
 320  * to drop the locks and reference properly.
 321  */
 322
 323 int
 324 tcp_inpinfo_lock_add(struct inpcb *inp)
 325 {
 326         in_pcbref(inp);
 327         INP_WUNLOCK(inp);
 328         INP_INFO_RLOCK(&V_tcbinfo);
 329         INP_WLOCK(inp);
 330         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 331                 return(1);
 332         }
 333         return(0);
 334
 335 }
 336
 337 void
 338 tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp)
 339 {
 340         INP_INFO_RUNLOCK(&V_tcbinfo);
 341         if (inp && (tp == NULL)) {
 342                 /*
 343                  * If tcp_close/drop() gets called and tp
 344                  * returns NULL, then the function dropped
 345                  * the inp lock, we hold a reference keeping
 346                  * this around, so we must re-aquire the
 347                  * INP_WLOCK() in order to proceed with
 348                  * our dropping the inp reference.
 349                  */
 350                 INP_WLOCK(inp);
 351         }
 352         if (inp && in_pcbrele_wlocked(inp) == 0)
 353                 INP_WUNLOCK(inp);
 354 }
 355
 356 void
 357 tcp_timer_2msl(void *xtp)
 358 {
 359         struct tcpcb *tp = xtp;
 360         struct inpcb *inp;
 361         CURVNET_SET(tp->t_vnet);
 362 #ifdef TCPDEBUG
 363         int ostate;
 364
 365         ostate = tp->t_state;
 366 #endif
 367         inp = tp->t_inpcb;
 368         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 369         INP_WLOCK(inp);
 370         tcp_free_sackholes(tp);
 371         if (callout_pending(&tp->t_timers->tt_2msl) ||
 372             !callout_active(&tp->t_timers->tt_2msl)) {
 373                 INP_WUNLOCK(tp->t_inpcb);
 374                 CURVNET_RESTORE();
 375                 return;
 376         }
 377         callout_deactivate(&tp->t_timers->tt_2msl);
 378         if ((inp->inp_flags & INP_DROPPED) != 0) {
 379                 INP_WUNLOCK(inp);
 380                 CURVNET_RESTORE();
 381                 return;
 382         }
 383         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 384                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 385         /*
 386          * 2 MSL timeout in shutdown went off.  If we're closed but
 387          * still waiting for peer to close and connection has been idle
 388          * too long delete connection control block.  Otherwise, check
 389          * again in a bit.
 390          *
 391          * If in TIME_WAIT state just ignore as this timeout is handled in
 392          * tcp_tw_2msl_scan().
 393          *
 394          * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed,
 395          * there's no point in hanging onto FIN_WAIT_2 socket. Just close it.
 396          * Ignore fact that there were recent incoming segments.
 397          */
 398         if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
 399                 INP_WUNLOCK(inp);
 400                 CURVNET_RESTORE();
 401                 return;
 402         }
 403         if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
 404             tp->t_inpcb && tp->t_inpcb->inp_socket &&
 405             (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
 406                 TCPSTAT_INC(tcps_finwait2_drops);
 407                 if (tcp_inpinfo_lock_add(inp)) {
 408                         tcp_inpinfo_lock_del(inp, tp);
 409                         goto out;
 410                 }
 411                 tp = tcp_close(tp);
 412                 tcp_inpinfo_lock_del(inp, tp);
 413                 goto out;
 414         } else {
 415                 if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
 416                         callout_reset(&tp->t_timers->tt_2msl,
 417                                       TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
 418                 } else {
 419                         if (tcp_inpinfo_lock_add(inp)) {
 420                                 tcp_inpinfo_lock_del(inp, tp);
 421                                 goto out;
 422                         }
 423                         tp = tcp_close(tp);
 424                         tcp_inpinfo_lock_del(inp, tp);
 425                         goto out;
 426                 }
 427        }
 428
 429 #ifdef TCPDEBUG
 430         if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 431                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 432                           PRU_SLOWTIMO);
 433 #endif
 434         TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 435
 436         if (tp != NULL)
 437                 INP_WUNLOCK(inp);
 438 out:
 439         CURVNET_RESTORE();
 440 }
 441
 442 void
 443 tcp_timer_keep(void *xtp)
 444 {
 445         struct tcpcb *tp = xtp;
 446         struct tcptemp *t_template;
 447         struct inpcb *inp;
 448         CURVNET_SET(tp->t_vnet);
 449 #ifdef TCPDEBUG
 450         int ostate;
 451
 452         ostate = tp->t_state;
 453 #endif
 454         inp = tp->t_inpcb;
 455         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 456         INP_WLOCK(inp);
 457         if (callout_pending(&tp->t_timers->tt_keep) ||
 458             !callout_active(&tp->t_timers->tt_keep)) {
 459                 INP_WUNLOCK(inp);
 460                 CURVNET_RESTORE();
 461                 return;
 462         }
 463         callout_deactivate(&tp->t_timers->tt_keep);
 464         if ((inp->inp_flags & INP_DROPPED) != 0) {
 465                 INP_WUNLOCK(inp);
 466                 CURVNET_RESTORE();
 467                 return;
 468         }
 469         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 470                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 471         /*
 472          * Keep-alive timer went off; send something
 473          * or drop connection if idle for too long.
 474          */
 475         TCPSTAT_INC(tcps_keeptimeo);
 476         if (tp->t_state < TCPS_ESTABLISHED)
 477                 goto dropit;
 478         if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
 479             tp->t_state <= TCPS_CLOSING) {
 480                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
 481                         goto dropit;
 482                 /*
 483                  * Send a packet designed to force a response
 484                  * if the peer is up and reachable:
 485                  * either an ACK if the connection is still alive,
 486                  * or an RST if the peer has closed the connection
 487                  * due to timeout or reboot.
 488                  * Using sequence number tp->snd_una-1
 489                  * causes the transmitted zero-length segment
 490                  * to lie outside the receive window;
 491                  * by the protocol spec, this requires the
 492                  * correspondent TCP to respond.
 493                  */
 494                 TCPSTAT_INC(tcps_keepprobe);
 495                 t_template = tcpip_maketemplate(inp);
 496                 if (t_template) {
 497                         tcp_respond(tp, t_template->tt_ipgen,
 498                                     &t_template->tt_t, (struct mbuf *)NULL,
 499                                     tp->rcv_nxt, tp->snd_una - 1, 0);
 500                         free(t_template, M_TEMP);
 501                 }
 502                 callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
 503                               tcp_timer_keep, tp);
 504         } else
 505                 callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
 506                               tcp_timer_keep, tp);
 507
 508 #ifdef TCPDEBUG
 509         if (inp->inp_socket->so_options & SO_DEBUG)
 510                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 511                           PRU_SLOWTIMO);
 512 #endif
 513         TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 514         INP_WUNLOCK(inp);
 515         CURVNET_RESTORE();
 516         return;
 517
 518 dropit:
 519         TCPSTAT_INC(tcps_keepdrops);
 520
 521         if (tcp_inpinfo_lock_add(inp)) {
 522                 tcp_inpinfo_lock_del(inp, tp);
 523                 goto out;
 524         }
 525         tp = tcp_drop(tp, ETIMEDOUT);
 526
 527 #ifdef TCPDEBUG
 528         if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 529                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 530                           PRU_SLOWTIMO);
 531 #endif
 532         TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 533         tcp_inpinfo_lock_del(inp, tp);
 534 out:
 535         CURVNET_RESTORE();
 536 }
 537
 538 void
 539 tcp_timer_persist(void *xtp)
 540 {
 541         struct tcpcb *tp = xtp;
 542         struct inpcb *inp;
 543         CURVNET_SET(tp->t_vnet);
 544 #ifdef TCPDEBUG
 545         int ostate;
 546
 547         ostate = tp->t_state;
 548 #endif
 549         inp = tp->t_inpcb;
 550         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 551         INP_WLOCK(inp);
 552         if (callout_pending(&tp->t_timers->tt_persist) ||
 553             !callout_active(&tp->t_timers->tt_persist)) {
 554                 INP_WUNLOCK(inp);
 555                 CURVNET_RESTORE();
 556                 return;
 557         }
 558         callout_deactivate(&tp->t_timers->tt_persist);
 559         if ((inp->inp_flags & INP_DROPPED) != 0) {
 560                 INP_WUNLOCK(inp);
 561                 CURVNET_RESTORE();
 562                 return;
 563         }
 564         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 565                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 566         /*
 567          * Persistence timer into zero window.
 568          * Force a byte to be output, if possible.
 569          */
 570         TCPSTAT_INC(tcps_persisttimeo);
 571         /*
 572          * Hack: if the peer is dead/unreachable, we do not
 573          * time out if the window is closed.  After a full
 574          * backoff, drop the connection if the idle time
 575          * (no responses to probes) reaches the maximum
 576          * backoff that we would use if retransmitting.
 577          */
 578         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
 579             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
 580              ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
 581                 TCPSTAT_INC(tcps_persistdrop);
 582                 if (tcp_inpinfo_lock_add(inp)) {
 583                         tcp_inpinfo_lock_del(inp, tp);
 584                         goto out;
 585                 }
 586                 tp = tcp_drop(tp, ETIMEDOUT);
 587                 tcp_inpinfo_lock_del(inp, tp);
 588                 goto out;
 589         }
 590         /*
 591          * If the user has closed the socket then drop a persisting
 592          * connection after a much reduced timeout.
 593          */
 594         if (tp->t_state > TCPS_CLOSE_WAIT &&
 595             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
 596                 TCPSTAT_INC(tcps_persistdrop);
 597                 if (tcp_inpinfo_lock_add(inp)) {
 598                         tcp_inpinfo_lock_del(inp, tp);
 599                         goto out;
 600                 }
 601                 tp = tcp_drop(tp, ETIMEDOUT);
 602                 tcp_inpinfo_lock_del(inp, tp);
 603                 goto out;
 604         }
 605         tcp_setpersist(tp);
 606         tp->t_flags |= TF_FORCEDATA;
 607         (void) tp->t_fb->tfb_tcp_output(tp);
 608         tp->t_flags &= ~TF_FORCEDATA;
 609
 610 #ifdef TCPDEBUG
 611         if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
 612                 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
 613 #endif
 614         TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 615         INP_WUNLOCK(inp);
 616 out:
 617         CURVNET_RESTORE();
 618 }
 619
 620 void
 621 tcp_timer_rexmt(void * xtp)
 622 {
 623         struct tcpcb *tp = xtp;
 624         CURVNET_SET(tp->t_vnet);
 625         int rexmt;
 626         struct inpcb *inp;
 627 #ifdef TCPDEBUG
 628         int ostate;
 629
 630         ostate = tp->t_state;
 631 #endif
 632         inp = tp->t_inpcb;
 633         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 634         INP_WLOCK(inp);
 635         if (callout_pending(&tp->t_timers->tt_rexmt) ||
 636             !callout_active(&tp->t_timers->tt_rexmt)) {
 637                 INP_WUNLOCK(inp);
 638                 CURVNET_RESTORE();
 639                 return;
 640         }
 641         callout_deactivate(&tp->t_timers->tt_rexmt);
 642         if ((inp->inp_flags & INP_DROPPED) != 0) {
 643                 INP_WUNLOCK(inp);
 644                 CURVNET_RESTORE();
 645                 return;
 646         }
 647         KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 648                 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 649         tcp_free_sackholes(tp);
 650         if (tp->t_fb->tfb_tcp_rexmit_tmr) {
 651                 /* The stack has a timer action too. */
 652                 (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
 653         }
 654         /*
 655          * Retransmission timer went off.  Message has not
 656          * been acked within retransmit interval.  Back off
 657          * to a longer retransmit interval and retransmit one segment.
 658          */
 659         if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
 660                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
 661                 TCPSTAT_INC(tcps_timeoutdrop);
 662                 if (tcp_inpinfo_lock_add(inp)) {
 663                         tcp_inpinfo_lock_del(inp, tp);
 664                         goto out;
 665                 }
 666                 tp = tcp_drop(tp, tp->t_softerror ?
 667                               tp->t_softerror : ETIMEDOUT);
 668                 tcp_inpinfo_lock_del(inp, tp);
 669                 goto out;
 670         }
 671         if (tp->t_state == TCPS_SYN_SENT) {
 672                 /*
 673                  * If the SYN was retransmitted, indicate CWND to be
 674                  * limited to 1 segment in cc_conn_init().
 675                  */
 676                 tp->snd_cwnd = 1;
 677         } else if (tp->t_rxtshift == 1) {
 678                 /*
 679                  * first retransmit; record ssthresh and cwnd so they can
 680                  * be recovered if this turns out to be a "bad" retransmit.
 681                  * A retransmit is considered "bad" if an ACK for this
 682                  * segment is received within RTT/2 interval; the assumption
 683                  * here is that the ACK was already in flight.  See
 684                  * "On Estimating End-to-End Network Path Properties" by
 685                  * Allman and Paxson for more details.
 686                  */
 687                 tp->snd_cwnd_prev = tp->snd_cwnd;
 688                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
 689                 tp->snd_recover_prev = tp->snd_recover;
 690                 if (IN_FASTRECOVERY(tp->t_flags))
 691                         tp->t_flags |= TF_WASFRECOVERY;
 692                 else
 693                         tp->t_flags &= ~TF_WASFRECOVERY;
 694                 if (IN_CONGRECOVERY(tp->t_flags))
 695                         tp->t_flags |= TF_WASCRECOVERY;
 696                 else
 697                         tp->t_flags &= ~TF_WASCRECOVERY;
 698                 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
 699                 tp->t_flags |= TF_PREVVALID;
 700         } else
 701                 tp->t_flags &= ~TF_PREVVALID;
 702         TCPSTAT_INC(tcps_rexmttimeo);
 703         if ((tp->t_state == TCPS_SYN_SENT) ||
 704             (tp->t_state == TCPS_SYN_RECEIVED))
 705                 rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
 706         else
 707                 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
 708         TCPT_RANGESET(tp->t_rxtcur, rexmt,
 709                       tp->t_rttmin, TCPTV_REXMTMAX);
 710
 711         /*
 712          * We enter the path for PLMTUD if connection is established or, if
 713          * connection is FIN_WAIT_1 status, reason for the last is that if
 714          * amount of data we send is very small, we could send it in couple of
 715          * packets and process straight to FIN. In that case we won't catch
 716          * ESTABLISHED state.
 717          */
 718         if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
 719             || (tp->t_state == TCPS_FIN_WAIT_1))) {
 720 #ifdef INET6
 721                 int isipv6;
 722 #endif
 723
 724                 /*
 725                  * Idea here is that at each stage of mtu probe (usually, 1448
 726                  * -> 1188 -> 524) should be given 2 chances to recover before
 727                  *  further clamping down. 'tp->t_rxtshift % 2 == 0' should
 728                  *  take care of that.
 729                  */
 730                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
 731                     (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
 732                     (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) {
 733                         /*
 734                          * Enter Path MTU Black-hole Detection mechanism:
 735                          * - Disable Path MTU Discovery (IP "DF" bit).
 736                          * - Reduce MTU to lower value than what we
 737                          *   negotiated with peer.
 738                          */
 739                         /* Record that we may have found a black hole. */
 740                         tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
 741
 742                         /* Keep track of previous MSS. */
 743                         tp->t_pmtud_saved_maxseg = tp->t_maxseg;
 744
 745                         /*
 746                          * Reduce the MSS to blackhole value or to the default
 747                          * in an attempt to retransmit.
 748                          */
 749 #ifdef INET6
 750                         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
 751                         if (isipv6 &&
 752                             tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
 753                                 /* Use the sysctl tuneable blackhole MSS. */
 754                                 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
 755                                 V_tcp_pmtud_blackhole_activated++;
 756                         } else if (isipv6) {
 757                                 /* Use the default MSS. */
 758                                 tp->t_maxseg = V_tcp_v6mssdflt;
 759                                 /*
 760                                  * Disable Path MTU Discovery when we switch to
 761                                  * minmss.
 762                                  */
 763                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 764                                 V_tcp_pmtud_blackhole_activated_min_mss++;
 765                         }
 766 #endif
 767 #if defined(INET6) && defined(INET)
 768                         else
 769 #endif
 770 #ifdef INET
 771                         if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
 772                                 /* Use the sysctl tuneable blackhole MSS. */
 773                                 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
 774                                 V_tcp_pmtud_blackhole_activated++;
 775                         } else {
 776                                 /* Use the default MSS. */
 777                                 tp->t_maxseg = V_tcp_mssdflt;
 778                                 /*
 779                                  * Disable Path MTU Discovery when we switch to
 780                                  * minmss.
 781                                  */
 782                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 783                                 V_tcp_pmtud_blackhole_activated_min_mss++;
 784                         }
 785 #endif
 786                         /*
 787                          * Reset the slow-start flight size
 788                          * as it may depend on the new MSS.
 789                          */
 790                         if (CC_ALGO(tp)->conn_init != NULL)
 791                                 CC_ALGO(tp)->conn_init(tp->ccv);
 792                 } else {
 793                         /*
 794                          * If further retransmissions are still unsuccessful
 795                          * with a lowered MTU, maybe this isn't a blackhole and
 796                          * we restore the previous MSS and blackhole detection
 797                          * flags.
 798                          * The limit '6' is determined by giving each probe
 799                          * stage (1448, 1188, 524) 2 chances to recover.
 800                          */
 801                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
 802                             (tp->t_rxtshift > 6)) {
 803                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 804                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
 805                                 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
 806                                 V_tcp_pmtud_blackhole_failed++;
 807                                 /*
 808                                  * Reset the slow-start flight size as it
 809                                  * may depend on the new MSS.
 810                                  */
 811                                 if (CC_ALGO(tp)->conn_init != NULL)
 812                                         CC_ALGO(tp)->conn_init(tp->ccv);
 813                         }
 814                 }
 815         }
 816
 817         /*
 818          * Disable RFC1323 and SACK if we haven't got any response to
 819          * our third SYN to work-around some broken terminal servers
 820          * (most of which have hopefully been retired) that have bad VJ
 821          * header compression code which trashes TCP segments containing
 822          * unknown-to-them TCP options.
 823          */
 824         if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
 825             (tp->t_rxtshift == 3))
 826                 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
 827         /*
 828          * If we backed off this far, our srtt estimate is probably bogus.
 829          * Clobber it so we'll take the next rtt measurement as our srtt;
 830          * move the current srtt into rttvar to keep the current
 831          * retransmit times until then.
 832          */
 833         if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
 834 #ifdef INET6
 835                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
 836                         in6_losing(tp->t_inpcb);
 837                 else
 838 #endif
 839                         in_losing(tp->t_inpcb);
 840                 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
 841                 tp->t_srtt = 0;
 842         }
 843         tp->snd_nxt = tp->snd_una;
 844         tp->snd_recover = tp->snd_max;
 845         /*
 846          * Force a segment to be sent.
 847          */
 848         tp->t_flags |= TF_ACKNOW;
 849         /*
 850          * If timing a segment in this window, stop the timer.
 851          */
 852         tp->t_rtttime = 0;
 853
 854         cc_cong_signal(tp, NULL, CC_RTO);
 855
 856         (void) tp->t_fb->tfb_tcp_output(tp);
 857
 858 #ifdef TCPDEBUG
 859         if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 860                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 861                           PRU_SLOWTIMO);
 862 #endif
 863         TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 864         INP_WUNLOCK(inp);
 865 out:
 866         CURVNET_RESTORE();
 867 }
 868
 869 void
 870 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
 871 {
 872         struct callout *t_callout;
 873         timeout_t *f_callout;
 874         struct inpcb *inp = tp->t_inpcb;
 875         int cpu = inp_to_cpuid(inp);
 876
 877 #ifdef TCP_OFFLOAD
 878         if (tp->t_flags & TF_TOE)
 879                 return;
 880 #endif
 881
 882         if (tp->t_timers->tt_flags & TT_STOPPED)
 883                 return;
 884
 885         switch (timer_type) {
 886                 case TT_DELACK:
 887                         t_callout = &tp->t_timers->tt_delack;
 888                         f_callout = tcp_timer_delack;
 889                         break;
 890                 case TT_REXMT:
 891                         t_callout = &tp->t_timers->tt_rexmt;
 892                         f_callout = tcp_timer_rexmt;
 893                         break;
 894                 case TT_PERSIST:
 895                         t_callout = &tp->t_timers->tt_persist;
 896                         f_callout = tcp_timer_persist;
 897                         break;
 898                 case TT_KEEP:
 899                         t_callout = &tp->t_timers->tt_keep;
 900                         f_callout = tcp_timer_keep;
 901                         break;
 902                 case TT_2MSL:
 903                         t_callout = &tp->t_timers->tt_2msl;
 904                         f_callout = tcp_timer_2msl;
 905                         break;
 906                 default:
 907                         if (tp->t_fb->tfb_tcp_timer_activate) {
 908                                 tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
 909                                 return;
 910                         }
 911                         panic("tp %p bad timer_type %#x", tp, timer_type);
 912                 }
 913         if (delta == 0) {
 914                 callout_stop(t_callout);
 915         } else {
 916                 callout_reset_on(t_callout, delta, f_callout, tp, cpu);
 917         }
 918 }
 919
 920 int
 921 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
 922 {
 923         struct callout *t_callout;
 924
 925         switch (timer_type) {
 926                 case TT_DELACK:
 927                         t_callout = &tp->t_timers->tt_delack;
 928                         break;
 929                 case TT_REXMT:
 930                         t_callout = &tp->t_timers->tt_rexmt;
 931                         break;
 932                 case TT_PERSIST:
 933                         t_callout = &tp->t_timers->tt_persist;
 934                         break;
 935                 case TT_KEEP:
 936                         t_callout = &tp->t_timers->tt_keep;
 937                         break;
 938                 case TT_2MSL:
 939                         t_callout = &tp->t_timers->tt_2msl;
 940                         break;
 941                 default:
 942                         if (tp->t_fb->tfb_tcp_timer_active) {
 943                                 return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
 944                         }
 945                         panic("tp %p bad timer_type %#x", tp, timer_type);
 946                 }
 947         return callout_active(t_callout);
 948 }
 949
 950 void
 951 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
 952 {
 953         struct callout *t_callout;
 954
 955         tp->t_timers->tt_flags |= TT_STOPPED;
 956         switch (timer_type) {
 957                 case TT_DELACK:
 958                         t_callout = &tp->t_timers->tt_delack;
 959                         break;
 960                 case TT_REXMT:
 961                         t_callout = &tp->t_timers->tt_rexmt;
 962                         break;
 963                 case TT_PERSIST:
 964                         t_callout = &tp->t_timers->tt_persist;
 965                         break;
 966                 case TT_KEEP:
 967                         t_callout = &tp->t_timers->tt_keep;
 968                         break;
 969                 case TT_2MSL:
 970                         t_callout = &tp->t_timers->tt_2msl;
 971                         break;
 972                 default:
 973                         if (tp->t_fb->tfb_tcp_timer_stop) {
 974                                 /*
 975                                  * XXXrrs we need to look at this with the
 976                                  * stop case below (flags).
 977                                  */
 978                                 tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
 979                                 return;
 980                         }
 981                         panic("tp %p bad timer_type %#x", tp, timer_type);
 982                 }
 983
 984         if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
 985                 /*
 986                  * Can't stop the callout, defer tcpcb actual deletion
 987                  * to the last one. We do this using the async drain
 988                  * function and incrementing the count in
 989                  */
 990                 tp->t_timers->tt_draincnt++;
 991         }
 992 }
 993
 994 #define ticks_to_msecs(t)       (1000*(t) / hz)
 995
 996 void
 997 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
 998     struct xtcp_timer *xtimer)
 999 {
1000         sbintime_t now;
1001
1002         bzero(xtimer, sizeof(*xtimer));
1003         if (timer == NULL)
1004                 return;
1005         now = getsbinuptime();
1006         if (callout_active(&timer->tt_delack))
1007                 xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
1008         if (callout_active(&timer->tt_rexmt))
1009                 xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
1010         if (callout_active(&timer->tt_persist))
1011                 xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
1012         if (callout_active(&timer->tt_keep))
1013                 xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
1014         if (callout_active(&timer->tt_2msl))
1015                 xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
1016         xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
1017 }