2 * Copyright (c) 2017 Netflix, Inc.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 #include <sys/types.h>
29 #include <sys/queue.h>
30 #include <sys/socket.h>
32 #include <sys/sockopt.h>
33 #include <netinet/tcp.h>
34 #include <netinet/tcp_var.h>
35 #include <netinet/tcp_seq.h>
45 #include "sack_filter.h"
48 * Sack filter is used to filter out sacks
49 * that have already been processed. The idea
50 * is pretty simple really, consider two sacks
60 * The previous sack information (B-C) is repeated
61 * in SACK 2. If the receiver gets SACK 1 and then
62 * SACK 2 then any work associated with B-C as already
63 * been completed. This only effects where we may have
64 * (as in bbr or rack) cases where we walk a linked list.
66 * Now the utility trys to keep everything in a single
67 * cache line. This means that its not perfect and
68 * it could be that so big of sack's come that a
69 * "remembered" processed sack falls off the list and
70 * so gets re-processed. Thats ok, it just means we
71 * did some extra work. We could of course take more
72 * cache line hits by expanding the size of this
73 * structure, but then that would cost more.
77 int detailed_dump = 0;
78 uint64_t cnt_skipped_oldsack = 0;
79 uint64_t cnt_used_oldsack = 0;
88 #define sack_blk_used(sf, i) ((1 << i) & sf->sf_bits)
89 #define sack_blk_set(sf, i) ((1 << i) | sf->sf_bits)
90 #define sack_blk_clr(sf, i) (~(1 << i) & sf->sf_bits)
96 sack_filter_clear(struct sack_filter *sf, tcp_seq seq)
104 * Given a previous sack filter block, filter out
105 * any entries where the cum-ack moves over them
106 * fully or partially.
109 sack_filter_prune(struct sack_filter *sf, tcp_seq th_ack)
112 /* start with the oldest */
113 for (i = 0; i < SACK_FILTER_BLOCKS; i++) {
114 if (sack_blk_used(sf, i)) {
115 if (SEQ_GT(th_ack, sf->sf_blks[i].end)) {
116 /* This block is consumed */
117 sf->sf_bits = sack_blk_clr(sf, i);
119 } else if (SEQ_GT(th_ack, sf->sf_blks[i].start)) {
120 /* Some of it is acked */
121 sf->sf_blks[i].start = th_ack;
122 /* We could in theory break here, but
123 * there are some broken implementations
124 * that send multiple blocks. We want
125 * to catch them all with similar seq's.
134 * Return true if you find that
135 * the sackblock b is on the score
136 * board. Update it along the way
137 * if part of it is on the board.
140 is_sack_on_board(struct sack_filter *sf, struct sackblk *b)
143 for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) {
144 if (sack_blk_used(sf, i)) {
145 if (SEQ_LT(b->start, sf->sf_ack)) {
146 /* Behind cum-ack update */
147 b->start = sf->sf_ack;
149 if (SEQ_LT(b->end, sf->sf_ack)) {
150 /* End back behind too */
153 if (b->start == b->end)
155 /* Jonathans Rule 1 */
156 if (SEQ_LEQ(sf->sf_blks[i].start, b->start) &&
157 SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
159 * Our board has this entirely in
162 * board |-------------|
163 * sack |-------------|
165 * board |-------------|
171 /* Jonathans Rule 2 */
172 if(SEQ_LT(sf->sf_blks[i].end, b->start)) {
174 * Not near each other:
181 /* Jonathans Rule 3 */
182 if (SEQ_GT(sf->sf_blks[i].start, b->end)) {
184 * Not near each other:
191 if (SEQ_LEQ(sf->sf_blks[i].start, b->start)) {
193 * The board block partial meets:
199 * sack |--------------|
201 * up with this one (we have part of it).
202 * 1) Update the board block to the new end
204 * 2) Update the start of this block to my end.
206 b->start = sf->sf_blks[i].end;
207 sf->sf_blks[i].end = b->end;
210 if (SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
212 * The board block partial meets:
219 * 1) Update the board block to the new start
221 * 2) Update the start of this block to my end.
223 b->end = sf->sf_blks[i].start;
224 sf->sf_blks[i].start = b->start;
230 i %= SACK_FILTER_BLOCKS;
232 /* Did we totally consume it in pieces? */
233 if (b->start != b->end)
240 sack_filter_old(struct sack_filter *sf, struct sackblk *in, int numblks)
243 struct sackblk blkboard[TCP_MAX_SACK];
245 * An old sack has arrived. It may contain data
246 * we do not have. We might not have it since
247 * we could have had a lost ack <or> we might have the
248 * entire thing on our current board. We want to prune
249 * off anything we have. With this function though we
250 * won't add to the board.
252 for( i = 0, num = 0; i<numblks; i++ ) {
253 if (is_sack_on_board(sf, &in[i])) {
255 cnt_skipped_oldsack++;
259 /* Did not find it (or found only
260 * a piece of it). Copy it to
261 * our outgoing board.
263 memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
270 memcpy(in, blkboard, (num * sizeof(struct sackblk)));
276 * Given idx its used but there is space available
277 * move the entry to the next free slot
280 sack_move_to_empty(struct sack_filter *sf, uint32_t idx)
284 i = (idx + 1) % SACK_FILTER_BLOCKS;
285 for (cnt=0; cnt <(SACK_FILTER_BLOCKS-1); cnt++) {
286 if (sack_blk_used(sf, i) == 0) {
287 memcpy(&sf->sf_blks[i], &sf->sf_blks[idx], sizeof(struct sackblk));
288 sf->sf_bits = sack_blk_clr(sf, idx);
289 sf->sf_bits = sack_blk_set(sf, i);
293 i %= SACK_FILTER_BLOCKS;
298 sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack)
300 struct sackblk blkboard[TCP_MAX_SACK];
303 * First lets trim the old and possibly
304 * throw any away we have.
306 for(i=0, num=0; i<numblks; i++) {
307 if (is_sack_on_board(sf, &in[i]))
309 memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
315 /* Now what we are left is either
316 * completely merged on to the board
317 * from the above steps, or are new
318 * and need to be added to the board
319 * with the last one updated to current.
321 * First copy it out we want to return that
322 * to our caller for processing.
324 memcpy(in, blkboard, (num * sizeof(struct sackblk)));
326 /* Now go through and add to our board as needed */
327 for(i=(num-1); i>=0; i--) {
328 if (is_sack_on_board(sf, &blkboard[i]))
330 /* Add this guy its not listed */
332 sf->sf_cur %= SACK_FILTER_BLOCKS;
333 if ((sack_blk_used(sf, sf->sf_cur)) &&
334 (sf->sf_used < SACK_FILTER_BLOCKS)) {
335 sack_move_to_empty(sf, sf->sf_cur);
338 if (sack_blk_used(sf, sf->sf_cur)) {
340 if (sf->sf_used < SACK_FILTER_BLOCKS)
344 memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
345 if (sack_blk_used(sf, sf->sf_cur) == 0) {
348 if (sf->sf_used > highest_used)
349 highest_used = sf->sf_used;
351 sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
358 * Given a sack block on the board (the skip index) see if
359 * any other used entries overlap or meet, if so return the index.
362 sack_blocks_overlap_or_meet(struct sack_filter *sf, struct sackblk *sb, uint32_t skip)
366 for(i=0; i<SACK_FILTER_BLOCKS; i++) {
367 if (sack_blk_used(sf, i) == 0)
371 if (SEQ_GEQ(sf->sf_blks[i].end, sb->start) &&
372 SEQ_LEQ(sf->sf_blks[i].end, sb->end) &&
373 SEQ_LEQ(sf->sf_blks[i].start, sb->start)) {
375 * The two board blocks meet:
378 * board2 |----------|
381 * board2 |--------------|
388 if (SEQ_LEQ(sf->sf_blks[i].start, sb->end) &&
389 SEQ_GEQ(sf->sf_blks[i].start, sb->start) &&
390 SEQ_GEQ(sf->sf_blks[i].end, sb->end)) {
392 * The board block partial meets:
399 * 1) Update the board block to the new start
401 * 2) Update the start of this block to my end.
410 * Collapse entry src into entry into
411 * and free up the src entry afterwards.
414 sack_collapse(struct sack_filter *sf, int32_t src, int32_t into)
416 if (SEQ_LT(sf->sf_blks[src].start, sf->sf_blks[into].start)) {
417 /* src has a lower starting point */
418 sf->sf_blks[into].start = sf->sf_blks[src].start;
420 if (SEQ_GT(sf->sf_blks[src].end, sf->sf_blks[into].end)) {
421 /* src has a higher ending point */
422 sf->sf_blks[into].end = sf->sf_blks[src].end;
424 sf->sf_bits = sack_blk_clr(sf, src);
429 sack_board_collapse(struct sack_filter *sf)
431 int32_t i, j, i_d, j_d;
433 for(i=0; i<SACK_FILTER_BLOCKS; i++) {
434 if (sack_blk_used(sf, i) == 0)
437 * Look at all other blocks but this guy
438 * to see if they overlap. If so we collapse
439 * the two blocks together.
441 j = sack_blocks_overlap_or_meet(sf, &sf->sf_blks[i], i);
447 * Ok j and i overlap with each other, collapse the
448 * one out furthest away from the current position.
451 i_d = sf->sf_cur - i;
453 i_d = i - sf->sf_cur;
455 j_d = sf->sf_cur - j;
457 j_d = j - sf->sf_cur;
459 sack_collapse(sf, j, i);
461 sack_collapse(sf, i, j);
469 sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack)
473 if (numblks > TCP_MAX_SACK) {
474 panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n",
479 if ((sf->sf_used == 0) && numblks) {
481 * We are brand new add the blocks in
482 * reverse order. Note we can see more
483 * than one in new, since ack's could be lost.
486 for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) {
487 memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
488 sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
490 sf->sf_cur %= SACK_FILTER_BLOCKS;
493 if (sf->sf_used > highest_used)
494 highest_used = sf->sf_used;
501 if (SEQ_GT(th_ack, sf->sf_ack)) {
502 sack_filter_prune(sf, th_ack);
505 if (SEQ_GEQ(th_ack, sf->sf_ack)) {
506 ret = sack_filter_new(sf, in, numblks, th_ack);
508 ret = sack_filter_old(sf, in, numblks);
513 if ((sf->sf_used > 1) && (no_collapse == 0))
514 sack_board_collapse(sf);
518 sack_board_collapse(sf);
526 uint64_t tot_sack_blks=0;
529 sack_filter_dump(FILE *out, struct sack_filter *sf)
532 fprintf(out, " sf_ack:%u sf_bits:0x%x c:%d used:%d\n",
533 sf->sf_ack, sf->sf_bits,
534 sf->sf_cur, sf->sf_used);
536 for(i=0; i<SACK_FILTER_BLOCKS; i++) {
537 if (sack_blk_used(sf, i)) {
538 fprintf(out, "Entry:%d start:%u end:%u\n", i,
539 sf->sf_blks[i].start,
546 main(int argc, char **argv)
549 struct sackblk blks[TCP_MAX_SACK];
551 tcp_seq th_ack, snd_una;
552 struct sack_filter sf;
556 int invalid_sack_print = 0;
557 uint32_t chg_remembered=0;
559 char line_buf[10][256];
564 while ((i = getopt(argc, argv, "ndIi:o:?h")) != -1) {
573 invalid_sack_print = 1;
576 in = fopen(optarg, "r");
578 fprintf(stderr, "Fatal error can't open %s for input\n", optarg);
583 out = fopen(optarg, "w");
585 fprintf(stderr, "Fatal error can't open %s for output\n", optarg);
592 fprintf(stderr, "Use %s [ -i infile -o outfile -I]\n", argv[0]);
597 sack_filter_clear(&sf, 0);
598 memset(buffer, 0, sizeof(buffer));
599 memset(blks, 0, sizeof(blks));
601 fprintf(out, "************************************\n");
602 while (fgets(buffer, sizeof(buffer), in) != NULL) {
603 sprintf(line_buf[line_buf_at], "%s", buffer);
605 if (strncmp(buffer, "QUIT", 4) == 0) {
607 } else if (strncmp(buffer, "DONE", 4) == 0) {
610 uint32_t szof, tot_chg;
611 for(ii=0; ii<line_buf_at; ii++) {
612 fprintf(out, "%s", line_buf[ii]);
614 fprintf(out, "------------------------------------\n");
615 nn = sack_filter_blks(&sf, blks, numblks, th_ack);
616 saved += numblks - nn;
617 tot_sack_blks += numblks;
618 fprintf(out, "ACK:%u\n", sf.sf_ack);
619 for(ii=0, tot_chg=0; ii<nn; ii++) {
620 szof = blks[ii].end - blks[ii].start;
622 fprintf(out, "SACK:%u:%u [%u]\n",
626 fprintf(out,"************************************\n");
627 chg_remembered = tot_chg;
629 sack_filter_dump(out, &sf);
630 fprintf(out,"************************************\n");
633 memset(blks, 0, sizeof(blks));
634 memset(line_buf, 0, sizeof(line_buf));
637 } else if (strncmp(buffer, "CHG:", 4) == 0) {
638 sack_chg = strtoul(&buffer[4], NULL, 0);
639 if ((sack_chg != chg_remembered) &&
640 (sack_chg > chg_remembered)){
641 fprintf(out,"***WARNING WILL RODGERS DANGER!! sack_chg:%u last:%u\n",
642 sack_chg, chg_remembered
645 sack_chg = chg_remembered = 0;
646 } else if (strncmp(buffer, "RXT", 3) == 0) {
647 sack_filter_clear(&sf, snd_una);
648 } else if (strncmp(buffer, "ACK:", 4) == 0) {
649 th_ack = strtoul(&buffer[4], NULL, 0);
650 if (snd_una_set == 0) {
653 } else if (SEQ_GT(th_ack, snd_una)) {
656 } else if (strncmp(buffer, "EXIT", 4) == 0) {
657 sack_filter_clear(&sf, snd_una);
658 sack_chg = chg_remembered = 0;
659 } else if (strncmp(buffer, "SACK:", 5) == 0) {
663 start = strtoul(&buffer[5], &end, 0);
665 endv = strtoul(&end[1], NULL, 0);
667 fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start);
670 if (SEQ_LT(endv, start)) {
671 fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start);
674 if (numblks == TCP_MAX_SACK) {
675 fprintf(out, "--Exceeded max %d\n", numblks);
678 blks[numblks].start = start;
679 blks[numblks].end = endv;
682 memset(buffer, 0, sizeof(buffer));
691 b = tot_sack_blks * 1.0;
700 fprintf(err, "Saved %lu sack blocks out of %lu (%2.3f%%) old_skip:%lu old_usd:%lu high_cnt:%d ow:%d ea:%d\n",
701 saved, tot_sack_blks, c, cnt_skipped_oldsack, cnt_used_oldsack, highest_used, over_written, empty_avail);