sys/netinet/tcp_stacks/rack_bbr_common.c

   1 /*-
   2  * Copyright (c) 2016-9
   3  *      Netflix Inc.
   4  *      All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  25  * SUCH DAMAGE.
  26  *
  27  */
  28 /*
  29  * Author: Randall Stewart <rrs@netflix.com>
  30  * This work is based on the ACM Queue paper
  31  * BBR - Congestion Based Congestion Control
  32  * and also numerous discussions with Neal, Yuchung and Van.
  33  */
  34
  35 #include <sys/cdefs.h>
  36 __FBSDID("$FreeBSD$");
  37
  38 #include "opt_inet.h"
  39 #include "opt_inet6.h"
  40 #include "opt_ipsec.h"
  41 #include "opt_tcpdebug.h"
  42 #include "opt_ratelimit.h"
  43 #include "opt_kern_tls.h"
  44 #include <sys/param.h>
  45 #include <sys/arb.h>
  46 #include <sys/module.h>
  47 #include <sys/kernel.h>
  48 #ifdef TCP_HHOOK
  49 #include <sys/hhook.h>
  50 #endif
  51 #include <sys/malloc.h>
  52 #include <sys/mbuf.h>
  53 #include <sys/proc.h>
  54 #include <sys/qmath.h>
  55 #include <sys/socket.h>
  56 #include <sys/socketvar.h>
  57 #ifdef KERN_TLS
  58 #include <sys/ktls.h>
  59 #endif
  60 #include <sys/sysctl.h>
  61 #include <sys/systm.h>
  62 #include <sys/tree.h>
  63 #ifdef NETFLIX_STATS
  64 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
  65 #endif
  66 #include <sys/refcount.h>
  67 #include <sys/queue.h>
  68 #include <sys/smp.h>
  69 #include <sys/kthread.h>
  70 #include <sys/lock.h>
  71 #include <sys/mutex.h>
  72 #include <sys/tim_filter.h>
  73 #include <sys/time.h>
  74 #include <vm/uma.h>
  75 #include <sys/kern_prefetch.h>
  76
  77 #include <net/route.h>
  78 #include <net/vnet.h>
  79 #include <net/ethernet.h>
  80 #include <net/bpf.h>
  81
  82 #define TCPSTATES               /* for logging */
  83
  84 #include <netinet/in.h>
  85 #include <netinet/in_kdtrace.h>
  86 #include <netinet/in_pcb.h>
  87 #include <netinet/ip.h>
  88 #include <netinet/ip_icmp.h>    /* required for icmp_var.h */
  89 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
  90 #include <netinet/ip_var.h>
  91 #include <netinet/ip6.h>
  92 #include <netinet6/in6_pcb.h>
  93 #include <netinet6/ip6_var.h>
  94 #define TCPOUTFLAGS
  95 #include <netinet/tcp.h>
  96 #include <netinet/tcp_fsm.h>
  97 #include <netinet/tcp_seq.h>
  98 #include <netinet/tcp_timer.h>
  99 #include <netinet/tcp_var.h>
 100 #include <netinet/tcpip.h>
 101 #include <netinet/tcp_hpts.h>
 102 #include <netinet/cc/cc.h>
 103 #include <netinet/tcp_log_buf.h>
 104 #ifdef TCPDEBUG
 105 #include <netinet/tcp_debug.h>
 106 #endif                          /* TCPDEBUG */
 107 #ifdef TCP_OFFLOAD
 108 #include <netinet/tcp_offload.h>
 109 #endif
 110 #ifdef INET6
 111 #include <netinet6/tcp6_var.h>
 112 #endif
 113 #include <netinet/tcp_fastopen.h>
 114
 115 #include <netipsec/ipsec_support.h>
 116 #include <net/if.h>
 117 #include <net/if_var.h>
 118
 119 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 120 #include <netipsec/ipsec.h>
 121 #include <netipsec/ipsec6.h>
 122 #endif                          /* IPSEC */
 123
 124 #include <netinet/udp.h>
 125 #include <netinet/udp_var.h>
 126 #include <machine/in_cksum.h>
 127
 128 #ifdef MAC
 129 #include <security/mac/mac_framework.h>
 130 #endif
 131 #include "rack_bbr_common.h"
 132
 133 /*
 * Common TCP Functions - These are shared by both
 * rack and BBR.
 136  */
 137 #ifdef KERN_TLS
 138 uint32_t
 139 ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
 140 {
 141         struct ktls_session *tls;
 142         uint32_t len;
 143
 144 again:
 145         tls = so->so_snd.sb_tls_info;
 146         len = tls->params.max_frame_len;         /* max tls payload */
 147         len += tls->params.tls_hlen;      /* tls header len  */
 148         len += tls->params.tls_tlen;      /* tls trailer len */
 149         if ((len * 4) > rwnd) {
 150                 /*
 151                  * Stroke this will suck counter and what
 152                  * else should we do Drew? From the
 153                  * TCP perspective I am not sure
 154                  * what should be done...
 155                  */
 156                 if (tls->params.max_frame_len > 4096) {
 157                         tls->params.max_frame_len -= 4096;
 158                         if (tls->params.max_frame_len < 4096)
 159                                 tls->params.max_frame_len = 4096;
 160                         goto again;
 161                 }
 162         }
 163         return (len);
 164 }
 165 #endif
 166
 167
 168 /*
 169  * The function ctf_process_inbound_raw() is used by
 170  * transport developers to do the steps needed to
 171  * support MBUF Queuing i.e. the flags in
 172  * inp->inp_flags2:
 173  *
 174  * - INP_SUPPORTS_MBUFQ
 175  * - INP_MBUF_QUEUE_READY
 176  * - INP_DONT_SACK_QUEUE
 177  *
 178  * These flags help control how LRO will deliver
 179  * packets to the transport. You first set in inp_flags2
 180  * the INP_SUPPORTS_MBUFQ to tell the LRO code that you
 181  * will gladly take a queue of packets instead of a compressed
 182  * single packet. You also set in your t_fb pointer the
 183  * tfb_do_queued_segments to point to ctf_process_inbound_raw.
 184  *
 185  * This then gets you lists of inbound ACK's/Data instead
 186  * of a condensed compressed ACK/DATA packet. Why would you
 187  * want that? This will get you access to all the arrival
 188  * times of at least LRO and possibly at the Hardware (if
 189  * the interface card supports that) of the actual ACK/DATA.
 190  * In some transport designs this is important since knowing
 191  * the actual time we got the packet is useful information.
 192  *
 193  * Now there are some interesting Caveats that the transport
 194  * designer needs to take into account when using this feature.
 195  *
 196  * 1) It is used with HPTS and pacing, when the pacing timer
 197  *    for output calls it will first call the input.
 198  * 2) When you set INP_MBUF_QUEUE_READY this tells LRO
 199  *    queue normal packets, I am busy pacing out data and
 200  *    will process the queued packets before my tfb_tcp_output
 201  *    call from pacing. If a non-normal packet arrives, (e.g. sack)
 202  *    you will be awoken immediately.
 203  * 3) Finally you can add the INP_DONT_SACK_QUEUE to not even
 204  *    be awoken if a SACK has arrived. You would do this when
 205  *    you were not only running a pacing for output timer
 206  *    but a Rack timer as well i.e. you know you are in recovery
 207  *    and are in the process (via the timers) of dealing with
 208  *    the loss.
 209  *
 * Now a critical thing you must be aware of here is that the
 * use of the flags has a far greater scope than just your
 * typical LRO. Why? Well, that's because in the normal compressed
 * LRO case, at the end of a driver interrupt all packets are going
 * to get presented to the transport no matter if there is one
 * or 100. With the MBUF_QUEUE model, this is not true. You will
 * only be awoken to process the queue of packets when:
 217  *     a) The flags discussed above allow it.
 218  *          <or>
 219  *     b) You exceed a ack or data limit (by default the
 220  *        ack limit is infinity (64k acks) and the data
 221  *        limit is 64k of new TCP data)
 222  *         <or>
 223  *     c) The push bit has been set by the peer
 224  */
 225
 226 int
 227 ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt)
 228 {
 229         /*
 230          * We are passed a raw change of mbuf packets
 231          * that arrived in LRO. They are linked via
 232          * the m_nextpkt link in the pkt-headers.
 233          *
 234          * We process each one by:
 235          * a) saving off the next
 236          * b) stripping off the ether-header
 237          * c) formulating the arguments for
 238          *    the tfb_tcp_hpts_do_segment
 239          * d) calling each mbuf to tfb_tcp_hpts_do_segment
 240          *    after adjusting the time to match the arrival time.
 241          * Note that the LRO code assures no IP options are present.
 242          *
 243          * The symantics for calling tfb_tcp_hpts_do_segment are the
 244          * following:
 245          * 1) It returns 0 if all went well and you (the caller) need
 246          *    to release the lock.
 247          * 2) If nxt_pkt is set, then the function will surpress calls
 248          *    to tfb_tcp_output() since you are promising to call again
 249          *    with another packet.
 250          * 3) If it returns 1, then you must free all the packets being
 251          *    shipped in, the tcb has been destroyed (or about to be destroyed).
 252          */
 253         struct mbuf *m_save;
 254         struct ether_header *eh;
 255         struct tcphdr *th;
 256 #ifdef INET6
 257         struct ip6_hdr *ip6 = NULL;     /* Keep compiler happy. */
 258 #endif
 259 #ifdef INET
 260         struct ip *ip = NULL;           /* Keep compiler happy. */
 261 #endif
 262         struct ifnet *ifp;
 263         struct timeval tv;
 264         int32_t retval, nxt_pkt, tlen, off;
 265         uint16_t etype;
 266         uint16_t drop_hdrlen;
 267         uint8_t iptos, no_vn=0, bpf_req=0;
 268
 269         NET_EPOCH_ASSERT();
 270
 271         if (m && m->m_pkthdr.rcvif)
 272                 ifp = m->m_pkthdr.rcvif;
 273         else
 274                 ifp = NULL;
 275         if (ifp) {
 276                 bpf_req = bpf_peers_present(ifp->if_bpf);
 277         } else  {
 278                 /*
 279                  * We probably should not work around
 280                  * but kassert, since lro alwasy sets rcvif.
 281                  */
 282                 no_vn = 1;
 283                 goto skip_vnet;
 284         }
 285         CURVNET_SET(ifp->if_vnet);
 286 skip_vnet:
 287         while (m) {
 288                 m_save = m->m_nextpkt;
 289                 m->m_nextpkt = NULL;
 290                 /* Now lets get the ether header */
 291                 eh = mtod(m, struct ether_header *);
 292                 etype = ntohs(eh->ether_type);
 293                 /* Let the BPF see the packet */
 294                 if (bpf_req && ifp)
 295                         ETHER_BPF_MTAP(ifp, m);
 296                 m_adj(m,  sizeof(*eh));
 297                 /* Trim off the ethernet header */
 298                 switch (etype) {
 299 #ifdef INET6
 300                 case ETHERTYPE_IPV6:
 301                 {
 302                         if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
 303                                 m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
 304                                 if (m == NULL) {
 305                                         TCPSTAT_INC(tcps_rcvshort);
 306                                         m_freem(m);
 307                                         goto skipped_pkt;
 308                                 }
 309                         }
 310                         ip6 = (struct ip6_hdr *)(eh + 1);
 311                         th = (struct tcphdr *)(ip6 + 1);
 312                         tlen = ntohs(ip6->ip6_plen);
 313                         drop_hdrlen = sizeof(*ip6);
 314                         if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
 315                                 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 316                                         th->th_sum = m->m_pkthdr.csum_data;
 317                                 else
 318                                         th->th_sum = in6_cksum_pseudo(ip6, tlen,
 319                                                                       IPPROTO_TCP, m->m_pkthdr.csum_data);
 320                                 th->th_sum ^= 0xffff;
 321                         } else
 322                                 th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
 323                         if (th->th_sum) {
 324                                 TCPSTAT_INC(tcps_rcvbadsum);
 325                                 m_freem(m);
 326                                 goto skipped_pkt;
 327                         }
 328                         /*
 329                          * Be proactive about unspecified IPv6 address in source.
 330                          * As we use all-zero to indicate unbounded/unconnected pcb,
 331                          * unspecified IPv6 address can be used to confuse us.
 332                          *
 333                          * Note that packets with unspecified IPv6 destination is
 334                          * already dropped in ip6_input.
 335                          */
 336                         if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
 337                                 /* XXX stat */
 338                                 m_freem(m);
 339                                 goto skipped_pkt;
 340                         }
 341                         iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
 342                         break;
 343                 }
 344 #endif
 345 #ifdef INET
 346                 case ETHERTYPE_IP:
 347                 {
 348                         if (m->m_len < sizeof (struct tcpiphdr)) {
 349                                 if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
 350                                     == NULL) {
 351                                         TCPSTAT_INC(tcps_rcvshort);
 352                                         m_freem(m);
 353                                         goto skipped_pkt;
 354                                 }
 355                         }
 356                         ip = (struct ip *)(eh + 1);
 357                         th = (struct tcphdr *)(ip + 1);
 358                         drop_hdrlen = sizeof(*ip);
 359                         iptos = ip->ip_tos;
 360                         tlen = ntohs(ip->ip_len) - sizeof(struct ip);
 361                         if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
 362                                 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 363                                         th->th_sum = m->m_pkthdr.csum_data;
 364                                 else
 365                                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
 366                                                                ip->ip_dst.s_addr,
 367                                                                htonl(m->m_pkthdr.csum_data + tlen +
 368                                                                      IPPROTO_TCP));
 369                                 th->th_sum ^= 0xffff;
 370                         } else {
 371                                 int len;
 372                                 struct ipovly *ipov = (struct ipovly *)ip;
 373                                 /*
 374                                  * Checksum extended TCP header and data.
 375                                  */
 376                                 len = drop_hdrlen + tlen;
 377                                 bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
 378                                 ipov->ih_len = htons(tlen);
 379                                 th->th_sum = in_cksum(m, len);
 380                                 /* Reset length for SDT probes. */
 381                                 ip->ip_len = htons(len);
 382                                 /* Reset TOS bits */
 383                                 ip->ip_tos = iptos;
 384                                 /* Re-initialization for later version check */
 385                                 ip->ip_v = IPVERSION;
 386                                 ip->ip_hl = sizeof(*ip) >> 2;
 387                         }
 388                         if (th->th_sum) {
 389                                 TCPSTAT_INC(tcps_rcvbadsum);
 390                                 m_freem(m);
 391                                 goto skipped_pkt;
 392                         }
 393                         break;
 394                 }
 395 #endif
 396                 }
 397                 /*
 398                  * Convert TCP protocol specific fields to host format.
 399                  */
 400                 tcp_fields_to_host(th);
 401
 402                 off = th->th_off << 2;
 403                 if (off < sizeof (struct tcphdr) || off > tlen) {
 404                         TCPSTAT_INC(tcps_rcvbadoff);
 405                                 m_freem(m);
 406                                 goto skipped_pkt;
 407                 }
 408                 tlen -= off;
 409                 drop_hdrlen += off;
 410                 /*
 411                  * Now lets setup the timeval to be when we should
 412                  * have been called (if we can).
 413                  */
 414                 m->m_pkthdr.lro_nsegs = 1;
 415                 if (m->m_flags & M_TSTMP_LRO) {
 416                         tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
 417                         tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
 418                 } else {
 419                         /* Should not be should we kassert instead? */
 420                         tcp_get_usecs(&tv);
 421                 }
 422                 /* Now what about next packet? */
 423                 if (m_save || has_pkt)
 424                         nxt_pkt = 1;
 425                 else
 426                         nxt_pkt = 0;
 427                 retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen,
 428                                                               iptos, nxt_pkt, &tv);
 429                 if (retval) {
 430                         /* We lost the lock and tcb probably */
 431                         m = m_save;
 432                         while(m) {
 433                                 m_save = m->m_nextpkt;
 434                                 m->m_nextpkt = NULL;
 435                                 m_freem(m);
 436                                 m = m_save;
 437                         }
 438                         if (no_vn == 0)
 439                                 CURVNET_RESTORE();
 440                         return(retval);
 441                 }
 442 skipped_pkt:
 443                 m = m_save;
 444         }
 445         if (no_vn == 0)
 446                 CURVNET_RESTORE();
 447         return(retval);
 448 }
 449
 450 int
 451 ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt)
 452 {
 453         struct mbuf *m;
 454
 455         /* First lets see if we have old packets */
 456         if (tp->t_in_pkt) {
 457                 m = tp->t_in_pkt;
 458                 tp->t_in_pkt = NULL;
 459                 tp->t_tail_pkt = NULL;
 460                 if (ctf_process_inbound_raw(tp, so, m, have_pkt)) {
 461                         /* We lost the tcpcb (maybe a RST came in)? */
 462                         return(1);
 463                 }
 464         }
 465         return (0);
 466 }
 467
 468 uint32_t
 469 ctf_outstanding(struct tcpcb *tp)
 470 {
 471         return(tp->snd_max - tp->snd_una);
 472 }
 473
 474 uint32_t
 475 ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked)
 476 {
 477         if (rc_sacked <= ctf_outstanding(tp))
 478                 return(ctf_outstanding(tp) - rc_sacked);
 479         else {
 480                 /* TSNH */
 481 #ifdef INVARIANTS
 482                 panic("tp:%p rc_sacked:%d > out:%d",
 483                       tp, rc_sacked, ctf_outstanding(tp));
 484 #endif
 485                 return (0);
 486         }
 487 }
 488
 489 void
 490 ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
 491     int32_t rstreason, int32_t tlen)
 492 {
 493         if (tp != NULL) {
 494                 tcp_dropwithreset(m, th, tp, tlen, rstreason);
 495                 INP_WUNLOCK(tp->t_inpcb);
 496         } else
 497                 tcp_dropwithreset(m, th, NULL, tlen, rstreason);
 498 }
 499
 500 /*
 501  * ctf_drop_checks returns 1 for you should not proceed. It places
 502  * in ret_val what should be returned 1/0 by the caller. The 1 indicates
 503  * that the TCB is unlocked and probably dropped. The 0 indicates the
 504  * TCB is still valid and locked.
 505  */
 506 int
 507 ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp,  int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
 508 {
 509         int32_t todrop;
 510         int32_t thflags;
 511         int32_t tlen;
 512
 513         thflags = *thf;
 514         tlen = *tlenp;
 515         todrop = tp->rcv_nxt - th->th_seq;
 516         if (todrop > 0) {
 517                 if (thflags & TH_SYN) {
 518                         thflags &= ~TH_SYN;
 519                         th->th_seq++;
 520                         if (th->th_urp > 1)
 521                                 th->th_urp--;
 522                         else
 523                                 thflags &= ~TH_URG;
 524                         todrop--;
 525                 }
 526                 /*
 527                  * Following if statement from Stevens, vol. 2, p. 960.
 528                  */
 529                 if (todrop > tlen
 530                     || (todrop == tlen && (thflags & TH_FIN) == 0)) {
 531                         /*
 532                          * Any valid FIN must be to the left of the window.
 533                          * At this point the FIN must be a duplicate or out
 534                          * of sequence; drop it.
 535                          */
 536                         thflags &= ~TH_FIN;
 537                         /*
 538                          * Send an ACK to resynchronize and drop any data.
 539                          * But keep on processing for RST or ACK.
 540                          */
 541                         tp->t_flags |= TF_ACKNOW;
 542                         todrop = tlen;
 543                         TCPSTAT_INC(tcps_rcvduppack);
 544                         TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
 545                 } else {
 546                         TCPSTAT_INC(tcps_rcvpartduppack);
 547                         TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
 548                 }
 549                 /*
 550                  * DSACK - add SACK block for dropped range
 551                  */
 552                 if (tp->t_flags & TF_SACK_PERMIT) {
 553                         tcp_update_sack_list(tp, th->th_seq,
 554                             th->th_seq + todrop);
 555                         /*
 556                          * ACK now, as the next in-sequence segment
 557                          * will clear the DSACK block again
 558                          */
 559                         tp->t_flags |= TF_ACKNOW;
 560                 }
 561                 *drop_hdrlen += todrop; /* drop from the top afterwards */
 562                 th->th_seq += todrop;
 563                 tlen -= todrop;
 564                 if (th->th_urp > todrop)
 565                         th->th_urp -= todrop;
 566                 else {
 567                         thflags &= ~TH_URG;
 568                         th->th_urp = 0;
 569                 }
 570         }
 571         /*
 572          * If segment ends after window, drop trailing data (and PUSH and
 573          * FIN); if nothing left, just ACK.
 574          */
 575         todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
 576         if (todrop > 0) {
 577                 TCPSTAT_INC(tcps_rcvpackafterwin);
 578                 if (todrop >= tlen) {
 579                         TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
 580                         /*
 581                          * If window is closed can only take segments at
 582                          * window edge, and have to drop data and PUSH from
 583                          * incoming segments.  Continue processing, but
 584                          * remember to ack.  Otherwise, drop segment and
 585                          * ack.
 586                          */
 587                         if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
 588                                 tp->t_flags |= TF_ACKNOW;
 589                                 TCPSTAT_INC(tcps_rcvwinprobe);
 590                         } else {
 591                                 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
 592                                 return (1);
 593                         }
 594                 } else
 595                         TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 596                 m_adj(m, -todrop);
 597                 tlen -= todrop;
 598                 thflags &= ~(TH_PUSH | TH_FIN);
 599         }
 600         *thf = thflags;
 601         *tlenp = tlen;
 602         return (0);
 603 }
 604
 605 /*
 606  * The value in ret_val informs the caller
 607  * if we dropped the tcb (and lock) or not.
 608  * 1 = we dropped it, 0 = the TCB is still locked
 609  * and valid.
 610  */
 611 void
 612 ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
 613 {
 614         /*
 615          * Generate an ACK dropping incoming segment if it occupies sequence
 616          * space, where the ACK reflects our state.
 617          *
 618          * We can now skip the test for the RST flag since all paths to this
 619          * code happen after packets containing RST have been dropped.
 620          *
 621          * In the SYN-RECEIVED state, don't send an ACK unless the segment
 622          * we received passes the SYN-RECEIVED ACK test. If it fails send a
 623          * RST.  This breaks the loop in the "LAND" DoS attack, and also
 624          * prevents an ACK storm between two listening ports that have been
 625          * sent forged SYN segments, each with the source address of the
 626          * other.
 627          */
 628         if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
 629             (SEQ_GT(tp->snd_una, th->th_ack) ||
 630             SEQ_GT(th->th_ack, tp->snd_max))) {
 631                 *ret_val = 1;
 632                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 633                 return;
 634         } else
 635                 *ret_val = 0;
 636         tp->t_flags |= TF_ACKNOW;
 637         if (m)
 638                 m_freem(m);
 639 }
 640
 641 void
 642 ctf_do_drop(struct mbuf *m, struct tcpcb *tp)
 643 {
 644
 645         /*
 646          * Drop space held by incoming segment and return.
 647          */
 648         if (tp != NULL)
 649                 INP_WUNLOCK(tp->t_inpcb);
 650         if (m)
 651                 m_freem(m);
 652 }
 653
 654 int
 655 ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp)
 656 {
 657         /*
 658          * RFC5961 Section 3.2
 659          *
 660          * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in
 661          * window, we send challenge ACK.
 662          *
 663          * Note: to take into account delayed ACKs, we should test against
 664          * last_ack_sent instead of rcv_nxt. Note 2: we handle special case
 665          * of closed window, not covered by the RFC.
 666          */
 667         int dropped = 0;
 668
 669         if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
 670             SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
 671             (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
 672
 673                 KASSERT(tp->t_state != TCPS_SYN_SENT,
 674                     ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
 675                     __func__, th, tp));
 676
 677                 if (V_tcp_insecure_rst ||
 678                     (tp->last_ack_sent == th->th_seq) ||
 679                     (tp->rcv_nxt == th->th_seq) ||
 680                     ((tp->last_ack_sent - 1) == th->th_seq)) {
 681                         TCPSTAT_INC(tcps_drops);
 682                         /* Drop the connection. */
 683                         switch (tp->t_state) {
 684                         case TCPS_SYN_RECEIVED:
 685                                 so->so_error = ECONNREFUSED;
 686                                 goto close;
 687                         case TCPS_ESTABLISHED:
 688                         case TCPS_FIN_WAIT_1:
 689                         case TCPS_FIN_WAIT_2:
 690                         case TCPS_CLOSE_WAIT:
 691                         case TCPS_CLOSING:
 692                         case TCPS_LAST_ACK:
 693                                 so->so_error = ECONNRESET;
 694                 close:
 695                                 tcp_state_change(tp, TCPS_CLOSED);
 696                                 /* FALLTHROUGH */
 697                         default:
 698                                 tp = tcp_close(tp);
 699                         }
 700                         dropped = 1;
 701                         ctf_do_drop(m, tp);
 702                 } else {
 703                         TCPSTAT_INC(tcps_badrst);
 704                         /* Send challenge ACK. */
 705                         tcp_respond(tp, mtod(m, void *), th, m,
 706                             tp->rcv_nxt, tp->snd_nxt, TH_ACK);
 707                         tp->last_ack_sent = tp->rcv_nxt;
 708                 }
 709         } else {
 710                 m_freem(m);
 711         }
 712         return (dropped);
 713 }
 714
 715 /*
 716  * The value in ret_val informs the caller
 717  * if we dropped the tcb (and lock) or not.
 718  * 1 = we dropped it, 0 = the TCB is still locked
 719  * and valid.
 720  */
 721 void
 722 ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val)
 723 {
 724
 725         NET_EPOCH_ASSERT();
 726
 727         TCPSTAT_INC(tcps_badsyn);
 728         if (V_tcp_insecure_syn &&
 729             SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
 730             SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
 731                 tp = tcp_drop(tp, ECONNRESET);
 732                 *ret_val = 1;
 733                 ctf_do_drop(m, tp);
 734         } else {
 735                 /* Send challenge ACK. */
 736                 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
 737                     tp->snd_nxt, TH_ACK);
 738                 tp->last_ack_sent = tp->rcv_nxt;
 739                 m = NULL;
 740                 *ret_val = 0;
 741                 ctf_do_drop(m, NULL);
 742         }
 743 }
 744
 745 /*
 * ctf_ts_check returns 1 for you should not proceed, the state
 747  * machine should return. It places in ret_val what should
 748  * be returned 1/0 by the caller (hpts_do_segment). The 1 indicates
 749  * that the TCB is unlocked and probably dropped. The 0 indicates the
 750  * TCB is still valid and locked.
 751  */
 752 int
 753 ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
 754     int32_t tlen, int32_t thflags, int32_t * ret_val)
 755 {
 756
 757         if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
 758                 /*
 759                  * Invalidate ts_recent.  If this segment updates ts_recent,
 760                  * the age will be reset later and ts_recent will get a
 761                  * valid value.  If it does not, setting ts_recent to zero
 762                  * will at least satisfy the requirement that zero be placed
 763                  * in the timestamp echo reply when ts_recent isn't valid.
 764                  * The age isn't reset until we get a valid ts_recent
 765                  * because we don't want out-of-order segments to be dropped
 766                  * when ts_recent is old.
 767                  */
 768                 tp->ts_recent = 0;
 769         } else {
 770                 TCPSTAT_INC(tcps_rcvduppack);
 771                 TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
 772                 TCPSTAT_INC(tcps_pawsdrop);
 773                 *ret_val = 0;
 774                 if (tlen) {
 775                         ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
 776                 } else {
 777                         ctf_do_drop(m, NULL);
 778                 }
 779                 return (1);
 780         }
 781         return (0);
 782 }
 783
 784 void
 785 ctf_calc_rwin(struct socket *so, struct tcpcb *tp)
 786 {
 787         int32_t win;
 788
 789         /*
 790          * Calculate amount of space in receive window, and then do TCP
 791          * input processing. Receive window is amount of space in rcv queue,
 792          * but not less than advertised window.
 793          */
 794         win = sbspace(&so->so_rcv);
 795         if (win < 0)
 796                 win = 0;
 797         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 798 }
 799
 800 void
 801 ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
 802     int32_t rstreason, int32_t tlen)
 803 {
 804
 805         if (tp->t_inpcb) {
 806                 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
 807         }
 808         tcp_dropwithreset(m, th, tp, tlen, rstreason);
 809         INP_WUNLOCK(tp->t_inpcb);
 810 }
 811
 812 uint32_t
 813 ctf_fixed_maxseg(struct tcpcb *tp)
 814 {
 815         int optlen;
 816
 817         if (tp->t_flags & TF_NOOPT)
 818                 return (tp->t_maxseg);
 819
 820         /*
 821          * Here we have a simplified code from tcp_addoptions(),
 822          * without a proper loop, and having most of paddings hardcoded.
 823          * We only consider fixed options that we would send every
 824          * time I.e. SACK is not considered.
 825          *
 826          */
 827 #define PAD(len)        ((((len) / 4) + !!((len) % 4)) * 4)
 828         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
 829                 if (tp->t_flags & TF_RCVD_TSTMP)
 830                         optlen = TCPOLEN_TSTAMP_APPA;
 831                 else
 832                         optlen = 0;
 833 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 834                 if (tp->t_flags & TF_SIGNATURE)
 835                         optlen += PAD(TCPOLEN_SIGNATURE);
 836 #endif
 837         } else {
 838                 if (tp->t_flags & TF_REQ_TSTMP)
 839                         optlen = TCPOLEN_TSTAMP_APPA;
 840                 else
 841                         optlen = PAD(TCPOLEN_MAXSEG);
 842                 if (tp->t_flags & TF_REQ_SCALE)
 843                         optlen += PAD(TCPOLEN_WINDOW);
 844 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 845                 if (tp->t_flags & TF_SIGNATURE)
 846                         optlen += PAD(TCPOLEN_SIGNATURE);
 847 #endif
 848                 if (tp->t_flags & TF_SACK_PERMIT)
 849                         optlen += PAD(TCPOLEN_SACK_PERMITTED);
 850         }
 851 #undef PAD
 852         optlen = min(optlen, TCP_MAXOLEN);
 853         return (tp->t_maxseg - optlen);
 854 }
 855
 856 void
 857 ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks)
 858 {
 859         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 860                 union tcp_log_stackspecific log;
 861                 struct timeval tv;
 862
 863                 memset(&log, 0, sizeof(log));
 864                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 865                 log.u_bbr.flex8 = num_sack_blks;
 866                 if (num_sack_blks > 0) {
 867                         log.u_bbr.flex1 = sack_blocks[0].start;
 868                         log.u_bbr.flex2 = sack_blocks[0].end;
 869                 }
 870                 if (num_sack_blks > 1) {
 871                         log.u_bbr.flex3 = sack_blocks[1].start;
 872                         log.u_bbr.flex4 = sack_blocks[1].end;
 873                 }
 874                 if (num_sack_blks > 2) {
 875                         log.u_bbr.flex5 = sack_blocks[2].start;
 876                         log.u_bbr.flex6 = sack_blocks[2].end;
 877                 }
 878                 if (num_sack_blks > 3) {
 879                         log.u_bbr.applimited = sack_blocks[3].start;
 880                         log.u_bbr.pkts_out = sack_blocks[3].end;
 881                 }
 882                 TCP_LOG_EVENTP(tp, NULL,
 883                     &tp->t_inpcb->inp_socket->so_rcv,
 884                     &tp->t_inpcb->inp_socket->so_snd,
 885                     TCP_SACK_FILTER_RES, 0,
 886                     0, &log, false, &tv);
 887         }
 888 }
 889
 890 uint32_t
 891 ctf_decay_count(uint32_t count, uint32_t decay)
 892 {
 893         /*
 894          * Given a count, decay it by a set percentage. The
 895          * percentage is in thousands i.e. 100% = 1000,
 896          * 19.3% = 193.
 897          */
 898         uint64_t perc_count, decay_per;
 899         uint32_t decayed_count;
 900         if (decay > 1000) {
 901                 /* We don't raise it */
 902                 return (count);
 903         }
 904         perc_count = count;
 905         decay_per = decay;
 906         perc_count *= decay_per;
 907         perc_count /= 1000;
 908         /*
 909          * So now perc_count holds the
 910          * count decay value.
 911          */
 912         decayed_count = count - (uint32_t)perc_count;
 913         return(decayed_count);
 914 }