sys/netinet/tcp_sack.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
   3  *      The Regents of the University of California.
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  * 4. Neither the name of the University nor the names of its contributors
  15  *    may be used to endorse or promote products derived from this software
  16  *    without specific prior written permission.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  28  * SUCH DAMAGE.
  29  *
  30  *      @(#)tcp_sack.c  8.12 (Berkeley) 5/24/95
  31  */
  32
  33 /*-
  34  *      @@(#)COPYRIGHT  1.1 (NRL) 17 January 1995
  35  *
  36  * NRL grants permission for redistribution and use in source and binary
  37  * forms, with or without modification, of the software and documentation
  38  * created at NRL provided that the following conditions are met:
  39  *
  40  * 1. Redistributions of source code must retain the above copyright
  41  *    notice, this list of conditions and the following disclaimer.
  42  * 2. Redistributions in binary form must reproduce the above copyright
  43  *    notice, this list of conditions and the following disclaimer in the
  44  *    documentation and/or other materials provided with the distribution.
  45  * 3. All advertising materials mentioning features or use of this software
  46  *    must display the following acknowledgements:
  47  *      This product includes software developed by the University of
  48  *      California, Berkeley and its contributors.
  49  *      This product includes software developed at the Information
  50  *      Technology Division, US Naval Research Laboratory.
  51  * 4. Neither the name of the NRL nor the names of its contributors
  52  *    may be used to endorse or promote products derived from this software
  53  *    without specific prior written permission.
  54  *
  55  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
  56  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  57  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  58  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
  59  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  60  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  61  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  62  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  63  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  64  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  65  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  66  *
  67  * The views and conclusions contained in the software and documentation
  68  * are those of the authors and should not be interpreted as representing
  69  * official policies, either expressed or implied, of the US Naval
  70  * Research Laboratory (NRL).
  71  */
  72
  73 #include <sys/cdefs.h>
  74 __FBSDID("$FreeBSD$");
  75
  76 #include "opt_inet.h"
  77 #include "opt_inet6.h"
  78 #include "opt_tcpdebug.h"
  79
  80 #include <sys/param.h>
  81 #include <sys/systm.h>
  82 #include <sys/kernel.h>
  83 #include <sys/sysctl.h>
  84 #include <sys/malloc.h>
  85 #include <sys/mbuf.h>
  86 #include <sys/proc.h>           /* for proc0 declaration */
  87 #include <sys/protosw.h>
  88 #include <sys/socket.h>
  89 #include <sys/socketvar.h>
  90 #include <sys/syslog.h>
  91 #include <sys/systm.h>
  92
  93 #include <machine/cpu.h>        /* before tcp_seq.h, for tcp_random18() */
  94
  95 #include <vm/uma.h>
  96
  97 #include <net/if.h>
  98 #include <net/if_var.h>
  99 #include <net/route.h>
 100 #include <net/vnet.h>
 101
 102 #include <netinet/in.h>
 103 #include <netinet/in_systm.h>
 104 #include <netinet/ip.h>
 105 #include <netinet/in_var.h>
 106 #include <netinet/in_pcb.h>
 107 #include <netinet/ip_var.h>
 108 #include <netinet/ip6.h>
 109 #include <netinet/icmp6.h>
 110 #include <netinet6/nd6.h>
 111 #include <netinet6/ip6_var.h>
 112 #include <netinet6/in6_pcb.h>
 113 #include <netinet/tcp.h>
 114 #include <netinet/tcp_fsm.h>
 115 #include <netinet/tcp_seq.h>
 116 #include <netinet/tcp_timer.h>
 117 #include <netinet/tcp_var.h>
 118 #include <netinet6/tcp6_var.h>
 119 #include <netinet/tcpip.h>
 120 #ifdef TCPDEBUG
 121 #include <netinet/tcp_debug.h>
 122 #endif /* TCPDEBUG */
 123
 124 #include <machine/in_cksum.h>
 125
 126 VNET_DECLARE(struct uma_zone *, sack_hole_zone);
 127 #define V_sack_hole_zone                VNET(sack_hole_zone)
 128
 129 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK");
 130 VNET_DEFINE(int, tcp_do_sack) = 1;
 131 #define V_tcp_do_sack                   VNET(tcp_do_sack)
 132 SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
 133     &VNET_NAME(tcp_do_sack), 0, "Enable/Disable TCP SACK support");
 134
 135 VNET_DEFINE(int, tcp_sack_maxholes) = 128;
 136 #define V_tcp_sack_maxholes             VNET(tcp_sack_maxholes)
 137 SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_VNET | CTLFLAG_RW,
 138     &VNET_NAME(tcp_sack_maxholes), 0,
 139     "Maximum number of TCP SACK holes allowed per connection");
 140
 141 VNET_DEFINE(int, tcp_sack_globalmaxholes) = 65536;
 142 #define V_tcp_sack_globalmaxholes       VNET(tcp_sack_globalmaxholes)
 143 SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_VNET | CTLFLAG_RW,
 144     &VNET_NAME(tcp_sack_globalmaxholes), 0,
 145     "Global maximum number of TCP SACK holes");
 146
 147 VNET_DEFINE(int, tcp_sack_globalholes) = 0;
 148 #define V_tcp_sack_globalholes          VNET(tcp_sack_globalholes)
 149 SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_VNET | CTLFLAG_RD,
 150     &VNET_NAME(tcp_sack_globalholes), 0,
 151     "Global number of TCP SACK holes currently allocated");
 152
 153 /*
 154  * This function is called upon receipt of new valid data (while not in
 155  * header prediction mode), and it updates the ordered list of sacks.
 156  */
 157 void
 158 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
 159 {
 160         /*
 161          * First reported block MUST be the most recent one.  Subsequent
 162          * blocks SHOULD be in the order in which they arrived at the
 163          * receiver.  These two conditions make the implementation fully
 164          * compliant with RFC 2018.
 165          */
 166         struct sackblk head_blk, saved_blks[MAX_SACK_BLKS];
 167         int num_head, num_saved, i;
 168
 169         INP_WLOCK_ASSERT(tp->t_inpcb);
 170
 171         /* Check arguments. */
 172         KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end"));
 173
 174         /* SACK block for the received segment. */
 175         head_blk.start = rcv_start;
 176         head_blk.end = rcv_end;
 177
 178         /*
 179          * Merge updated SACK blocks into head_blk, and save unchanged SACK
 180          * blocks into saved_blks[].  num_saved will have the number of the
 181          * saved SACK blocks.
 182          */
 183         num_saved = 0;
 184         for (i = 0; i < tp->rcv_numsacks; i++) {
 185                 tcp_seq start = tp->sackblks[i].start;
 186                 tcp_seq end = tp->sackblks[i].end;
 187                 if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
 188                         /*
 189                          * Discard this SACK block.
 190                          */
 191                 } else if (SEQ_LEQ(head_blk.start, end) &&
 192                            SEQ_GEQ(head_blk.end, start)) {
 193                         /*
 194                          * Merge this SACK block into head_blk.  This SACK
 195                          * block itself will be discarded.
 196                          */
 197                         if (SEQ_GT(head_blk.start, start))
 198                                 head_blk.start = start;
 199                         if (SEQ_LT(head_blk.end, end))
 200                                 head_blk.end = end;
 201                 } else {
 202                         /*
 203                          * Save this SACK block.
 204                          */
 205                         saved_blks[num_saved].start = start;
 206                         saved_blks[num_saved].end = end;
 207                         num_saved++;
 208                 }
 209         }
 210
 211         /*
 212          * Update SACK list in tp->sackblks[].
 213          */
 214         num_head = 0;
 215         if (SEQ_GT(head_blk.start, tp->rcv_nxt)) {
 216                 /*
 217                  * The received data segment is an out-of-order segment.  Put
 218                  * head_blk at the top of SACK list.
 219                  */
 220                 tp->sackblks[0] = head_blk;
 221                 num_head = 1;
 222                 /*
 223                  * If the number of saved SACK blocks exceeds its limit,
 224                  * discard the last SACK block.
 225                  */
 226                 if (num_saved >= MAX_SACK_BLKS)
 227                         num_saved--;
 228         }
 229         if (num_saved > 0) {
 230                 /*
 231                  * Copy the saved SACK blocks back.
 232                  */
 233                 bcopy(saved_blks, &tp->sackblks[num_head],
 234                       sizeof(struct sackblk) * num_saved);
 235         }
 236
 237         /* Save the number of SACK blocks. */
 238         tp->rcv_numsacks = num_head + num_saved;
 239 }
 240
 241 /*
 242  * Delete all receiver-side SACK information.
 243  */
 244 void
 245 tcp_clean_sackreport(struct tcpcb *tp)
 246 {
 247         int i;
 248
 249         INP_WLOCK_ASSERT(tp->t_inpcb);
 250         tp->rcv_numsacks = 0;
 251         for (i = 0; i < MAX_SACK_BLKS; i++)
 252                 tp->sackblks[i].start = tp->sackblks[i].end=0;
 253 }
 254
 255 /*
 256  * Allocate struct sackhole.
 257  */
 258 static struct sackhole *
 259 tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end)
 260 {
 261         struct sackhole *hole;
 262
 263         if (tp->snd_numholes >= V_tcp_sack_maxholes ||
 264             V_tcp_sack_globalholes >= V_tcp_sack_globalmaxholes) {
 265                 TCPSTAT_INC(tcps_sack_sboverflow);
 266                 return NULL;
 267         }
 268
 269         hole = (struct sackhole *)uma_zalloc(V_sack_hole_zone, M_NOWAIT);
 270         if (hole == NULL)
 271                 return NULL;
 272
 273         hole->start = start;
 274         hole->end = end;
 275         hole->rxmit = start;
 276
 277         tp->snd_numholes++;
 278         atomic_add_int(&V_tcp_sack_globalholes, 1);
 279
 280         return hole;
 281 }
 282
 283 /*
 284  * Free struct sackhole.
 285  */
 286 static void
 287 tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole)
 288 {
 289
 290         uma_zfree(V_sack_hole_zone, hole);
 291
 292         tp->snd_numholes--;
 293         atomic_subtract_int(&V_tcp_sack_globalholes, 1);
 294
 295         KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0"));
 296         KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0"));
 297 }
 298
 299 /*
 300  * Insert new SACK hole into scoreboard.
 301  */
 302 static struct sackhole *
 303 tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end,
 304     struct sackhole *after)
 305 {
 306         struct sackhole *hole;
 307
 308         /* Allocate a new SACK hole. */
 309         hole = tcp_sackhole_alloc(tp, start, end);
 310         if (hole == NULL)
 311                 return NULL;
 312
 313         /* Insert the new SACK hole into scoreboard. */
 314         if (after != NULL)
 315                 TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink);
 316         else
 317                 TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink);
 318
 319         /* Update SACK hint. */
 320         if (tp->sackhint.nexthole == NULL)
 321                 tp->sackhint.nexthole = hole;
 322
 323         return hole;
 324 }
 325
 326 /*
 327  * Remove SACK hole from scoreboard.
 328  */
 329 static void
 330 tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole)
 331 {
 332
 333         /* Update SACK hint. */
 334         if (tp->sackhint.nexthole == hole)
 335                 tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink);
 336
 337         /* Remove this SACK hole. */
 338         TAILQ_REMOVE(&tp->snd_holes, hole, scblink);
 339
 340         /* Free this SACK hole. */
 341         tcp_sackhole_free(tp, hole);
 342 }
 343
 344 /*
 345  * Process cumulative ACK and the TCP SACK option to update the scoreboard.
 346  * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
 347  * the sequence space).
 348  * Returns 1 if incoming ACK has previously unknown SACK information,
 349  * 0 otherwise. Note: We treat (snd_una, th_ack) as a sack block so any changes
 350  * to that (i.e. left edge moving) would also be considered a change in SACK
 351  * information which is slightly different than rfc6675.
 352  */
 353 int
 354 tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
 355 {
 356         struct sackhole *cur, *temp;
 357         struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
 358         int i, j, num_sack_blks, sack_changed;
 359
 360         INP_WLOCK_ASSERT(tp->t_inpcb);
 361
 362         num_sack_blks = 0;
 363         sack_changed = 0;
 364         /*
 365          * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist,
 366          * treat [SND.UNA, SEG.ACK) as if it is a SACK block.
 367          */
 368         if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
 369                 sack_blocks[num_sack_blks].start = tp->snd_una;
 370                 sack_blocks[num_sack_blks++].end = th_ack;
 371         }
 372         /*
 373          * Append received valid SACK blocks to sack_blocks[], but only if we
 374          * received new blocks from the other side.
 375          */
 376         if (to->to_flags & TOF_SACK) {
 377                 tp->sackhint.sacked_bytes = 0;  /* reset */
 378                 for (i = 0; i < to->to_nsacks; i++) {
 379                         bcopy((to->to_sacks + i * TCPOLEN_SACK),
 380                             &sack, sizeof(sack));
 381                         sack.start = ntohl(sack.start);
 382                         sack.end = ntohl(sack.end);
 383                         if (SEQ_GT(sack.end, sack.start) &&
 384                             SEQ_GT(sack.start, tp->snd_una) &&
 385                             SEQ_GT(sack.start, th_ack) &&
 386                             SEQ_LT(sack.start, tp->snd_max) &&
 387                             SEQ_GT(sack.end, tp->snd_una) &&
 388                             SEQ_LEQ(sack.end, tp->snd_max)) {
 389                                 sack_blocks[num_sack_blks++] = sack;
 390                                 tp->sackhint.sacked_bytes +=
 391                                     (sack.end-sack.start);
 392                         }
 393                 }
 394         }
 395         /*
 396          * Return if SND.UNA is not advanced and no valid SACK block is
 397          * received.
 398          */
 399         if (num_sack_blks == 0)
 400                 return (sack_changed);
 401
 402         /*
 403          * Sort the SACK blocks so we can update the scoreboard with just one
 404          * pass. The overhead of sorting up to 4+1 elements is less than
 405          * making up to 4+1 passes over the scoreboard.
 406          */
 407         for (i = 0; i < num_sack_blks; i++) {
 408                 for (j = i + 1; j < num_sack_blks; j++) {
 409                         if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
 410                                 sack = sack_blocks[i];
 411                                 sack_blocks[i] = sack_blocks[j];
 412                                 sack_blocks[j] = sack;
 413                         }
 414                 }
 415         }
 416         if (TAILQ_EMPTY(&tp->snd_holes))
 417                 /*
 418                  * Empty scoreboard. Need to initialize snd_fack (it may be
 419                  * uninitialized or have a bogus value). Scoreboard holes
 420                  * (from the sack blocks received) are created later below
 421                  * (in the logic that adds holes to the tail of the
 422                  * scoreboard).
 423                  */
 424                 tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
 425         /*
 426          * In the while-loop below, incoming SACK blocks (sack_blocks[]) and
 427          * SACK holes (snd_holes) are traversed from their tails with just
 428          * one pass in order to reduce the number of compares especially when
 429          * the bandwidth-delay product is large.
 430          *
 431          * Note: Typically, in the first RTT of SACK recovery, the highest
 432          * three or four SACK blocks with the same ack number are received.
 433          * In the second RTT, if retransmitted data segments are not lost,
 434          * the highest three or four SACK blocks with ack number advancing
 435          * are received.
 436          */
 437         sblkp = &sack_blocks[num_sack_blks - 1];        /* Last SACK block */
 438         tp->sackhint.last_sack_ack = sblkp->end;
 439         if (SEQ_LT(tp->snd_fack, sblkp->start)) {
 440                 /*
 441                  * The highest SACK block is beyond fack.  Append new SACK
 442                  * hole at the tail.  If the second or later highest SACK
 443                  * blocks are also beyond the current fack, they will be
 444                  * inserted by way of hole splitting in the while-loop below.
 445                  */
 446                 temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL);
 447                 if (temp != NULL) {
 448                         tp->snd_fack = sblkp->end;
 449                         /* Go to the previous sack block. */
 450                         sblkp--;
 451                         sack_changed = 1;
 452                 } else {
 453                         /*
 454                          * We failed to add a new hole based on the current
 455                          * sack block.  Skip over all the sack blocks that
 456                          * fall completely to the right of snd_fack and
 457                          * proceed to trim the scoreboard based on the
 458                          * remaining sack blocks.  This also trims the
 459                          * scoreboard for th_ack (which is sack_blocks[0]).
 460                          */
 461                         while (sblkp >= sack_blocks &&
 462                                SEQ_LT(tp->snd_fack, sblkp->start))
 463                                 sblkp--;
 464                         if (sblkp >= sack_blocks &&
 465                             SEQ_LT(tp->snd_fack, sblkp->end))
 466                                 tp->snd_fack = sblkp->end;
 467                 }
 468         } else if (SEQ_LT(tp->snd_fack, sblkp->end)) {
 469                 /* fack is advanced. */
 470                 tp->snd_fack = sblkp->end;
 471                 sack_changed = 1;
 472         }
 473         cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */
 474         /*
 475          * Since the incoming sack blocks are sorted, we can process them
 476          * making one sweep of the scoreboard.
 477          */
 478         while (sblkp >= sack_blocks  && cur != NULL) {
 479                 if (SEQ_GEQ(sblkp->start, cur->end)) {
 480                         /*
 481                          * SACKs data beyond the current hole.  Go to the
 482                          * previous sack block.
 483                          */
 484                         sblkp--;
 485                         continue;
 486                 }
 487                 if (SEQ_LEQ(sblkp->end, cur->start)) {
 488                         /*
 489                          * SACKs data before the current hole.  Go to the
 490                          * previous hole.
 491                          */
 492                         cur = TAILQ_PREV(cur, sackhole_head, scblink);
 493                         continue;
 494                 }
 495                 tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start);
 496                 KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
 497                     ("sackhint bytes rtx >= 0"));
 498                 sack_changed = 1;
 499                 if (SEQ_LEQ(sblkp->start, cur->start)) {
 500                         /* Data acks at least the beginning of hole. */
 501                         if (SEQ_GEQ(sblkp->end, cur->end)) {
 502                                 /* Acks entire hole, so delete hole. */
 503                                 temp = cur;
 504                                 cur = TAILQ_PREV(cur, sackhole_head, scblink);
 505                                 tcp_sackhole_remove(tp, temp);
 506                                 /*
 507                                  * The sack block may ack all or part of the
 508                                  * next hole too, so continue onto the next
 509                                  * hole.
 510                                  */
 511                                 continue;
 512                         } else {
 513                                 /* Move start of hole forward. */
 514                                 cur->start = sblkp->end;
 515                                 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
 516                         }
 517                 } else {
 518                         /* Data acks at least the end of hole. */
 519                         if (SEQ_GEQ(sblkp->end, cur->end)) {
 520                                 /* Move end of hole backward. */
 521                                 cur->end = sblkp->start;
 522                                 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
 523                         } else {
 524                                 /*
 525                                  * ACKs some data in middle of a hole; need
 526                                  * to split current hole
 527                                  */
 528                                 temp = tcp_sackhole_insert(tp, sblkp->end,
 529                                     cur->end, cur);
 530                                 if (temp != NULL) {
 531                                         if (SEQ_GT(cur->rxmit, temp->rxmit)) {
 532                                                 temp->rxmit = cur->rxmit;
 533                                                 tp->sackhint.sack_bytes_rexmit
 534                                                     += (temp->rxmit
 535                                                     - temp->start);
 536                                         }
 537                                         cur->end = sblkp->start;
 538                                         cur->rxmit = SEQ_MIN(cur->rxmit,
 539                                             cur->end);
 540                                 }
 541                         }
 542                 }
 543                 tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start);
 544                 /*
 545                  * Testing sblkp->start against cur->start tells us whether
 546                  * we're done with the sack block or the sack hole.
 547                  * Accordingly, we advance one or the other.
 548                  */
 549                 if (SEQ_LEQ(sblkp->start, cur->start))
 550                         cur = TAILQ_PREV(cur, sackhole_head, scblink);
 551                 else
 552                         sblkp--;
 553         }
 554         return (sack_changed);
 555 }
 556
 557 /*
 558  * Free all SACK holes to clear the scoreboard.
 559  */
 560 void
 561 tcp_free_sackholes(struct tcpcb *tp)
 562 {
 563         struct sackhole *q;
 564
 565         INP_WLOCK_ASSERT(tp->t_inpcb);
 566         while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL)
 567                 tcp_sackhole_remove(tp, q);
 568         tp->sackhint.sack_bytes_rexmit = 0;
 569
 570         KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes == 0"));
 571         KASSERT(tp->sackhint.nexthole == NULL,
 572                 ("tp->sackhint.nexthole == NULL"));
 573 }
 574
 575 /*
 576  * Partial ack handling within a sack recovery episode.  Keeping this very
 577  * simple for now.  When a partial ack is received, force snd_cwnd to a value
 578  * that will allow the sender to transmit no more than 2 segments.  If
 579  * necessary, a better scheme can be adopted at a later point, but for now,
 580  * the goal is to prevent the sender from bursting a large amount of data in
 581  * the midst of sack recovery.
 582  */
 583 void
 584 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
 585 {
 586         int num_segs = 1;
 587
 588         INP_WLOCK_ASSERT(tp->t_inpcb);
 589         tcp_timer_activate(tp, TT_REXMT, 0);
 590         tp->t_rtttime = 0;
 591         /* Send one or 2 segments based on how much new data was acked. */
 592         if ((BYTES_THIS_ACK(tp, th) / tp->t_maxseg) >= 2)
 593                 num_segs = 2;
 594         tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
 595             (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg);
 596         if (tp->snd_cwnd > tp->snd_ssthresh)
 597                 tp->snd_cwnd = tp->snd_ssthresh;
 598         tp->t_flags |= TF_ACKNOW;
 599         (void) tp->t_fb->tfb_tcp_output(tp);
 600 }
 601
 602 #if 0
 603 /*
 604  * Debug version of tcp_sack_output() that walks the scoreboard.  Used for
 605  * now to sanity check the hint.
 606  */
 607 static struct sackhole *
 608 tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt)
 609 {
 610         struct sackhole *p;
 611
 612         INP_WLOCK_ASSERT(tp->t_inpcb);
 613         *sack_bytes_rexmt = 0;
 614         TAILQ_FOREACH(p, &tp->snd_holes, scblink) {
 615                 if (SEQ_LT(p->rxmit, p->end)) {
 616                         if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
 617                                 continue;
 618                         }
 619                         *sack_bytes_rexmt += (p->rxmit - p->start);
 620                         break;
 621                 }
 622                 *sack_bytes_rexmt += (p->rxmit - p->start);
 623         }
 624         return (p);
 625 }
 626 #endif
 627
 628 /*
 629  * Returns the next hole to retransmit and the number of retransmitted bytes
 630  * from the scoreboard.  We store both the next hole and the number of
 631  * retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK
 632  * reception).  This avoids scoreboard traversals completely.
 633  *
 634  * The loop here will traverse *at most* one link.  Here's the argument.  For
 635  * the loop to traverse more than 1 link before finding the next hole to
 636  * retransmit, we would need to have at least 1 node following the current
 637  * hint with (rxmit == end).  But, for all holes following the current hint,
 638  * (start == rxmit), since we have not yet retransmitted from them.
 639  * Therefore, in order to traverse more 1 link in the loop below, we need to
 640  * have at least one node following the current hint with (start == rxmit ==
 641  * end).  But that can't happen, (start == end) means that all the data in
 642  * that hole has been sacked, in which case, the hole would have been removed
 643  * from the scoreboard.
 644  */
 645 struct sackhole *
 646 tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
 647 {
 648         struct sackhole *hole = NULL;
 649
 650         INP_WLOCK_ASSERT(tp->t_inpcb);
 651         *sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit;
 652         hole = tp->sackhint.nexthole;
 653         if (hole == NULL || SEQ_LT(hole->rxmit, hole->end))
 654                 goto out;
 655         while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) {
 656                 if (SEQ_LT(hole->rxmit, hole->end)) {
 657                         tp->sackhint.nexthole = hole;
 658                         break;
 659                 }
 660         }
 661 out:
 662         return (hole);
 663 }
 664
 665 /*
 666  * After a timeout, the SACK list may be rebuilt.  This SACK information
 667  * should be used to avoid retransmitting SACKed data.  This function
 668  * traverses the SACK list to see if snd_nxt should be moved forward.
 669  */
 670 void
 671 tcp_sack_adjust(struct tcpcb *tp)
 672 {
 673         struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes);
 674
 675         INP_WLOCK_ASSERT(tp->t_inpcb);
 676         if (cur == NULL)
 677                 return; /* No holes */
 678         if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack))
 679                 return; /* We're already beyond any SACKed blocks */
 680         /*-
 681          * Two cases for which we want to advance snd_nxt:
 682          * i) snd_nxt lies between end of one hole and beginning of another
 683          * ii) snd_nxt lies between end of last hole and snd_fack
 684          */
 685         while ((p = TAILQ_NEXT(cur, scblink)) != NULL) {
 686                 if (SEQ_LT(tp->snd_nxt, cur->end))
 687                         return;
 688                 if (SEQ_GEQ(tp->snd_nxt, p->start))
 689                         cur = p;
 690                 else {
 691                         tp->snd_nxt = p->start;
 692                         return;
 693                 }
 694         }
 695         if (SEQ_LT(tp->snd_nxt, cur->end))
 696                 return;
 697         tp->snd_nxt = tp->snd_fack;
 698 }