2 * Copyright (c) 2017-9 Netflix, Inc.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
31 #include <sys/types.h>
32 #include <sys/queue.h>
33 #include <sys/socket.h>
36 #include <sys/sockopt.h>
38 #include <netinet/tcp.h>
39 #include <netinet/tcp_var.h>
40 #include <netinet/tcp_seq.h>
50 #include "sack_filter.h"
53 * Sack filter is used to filter out sacks
54 * that have already been processed. The idea
55 * is pretty simple really, consider two sacks
65 * The previous sack information (B-C) is repeated
66 * in SACK 2. If the receiver gets SACK 1 and then
67 * SACK 2 then any work associated with B-C has already
68 * been completed. This only affects cases (as in bbr or
69 * rack) where we walk a linked list.
71 * Now the utility tries to keep everything in a single
72 * cache line. This means that it's not perfect and
73 * it could be that sacks so big arrive that a
74 * "remembered" processed sack falls off the list and
75 * so gets re-processed. That's ok, it just means we
76 * did some extra work. We could of course take more
77 * cache line hits by expanding the size of this
78 * structure, but then that would cost more.
/*
 * Diagnostic globals. cnt_skipped_oldsack is bumped in
 * sack_filter_old() for blocks that is_sack_on_board() reports as
 * already known; both counters are printed in main()'s closing
 * summary line.
 * NOTE(review): no use of detailed_dump and no increment of
 * cnt_used_oldsack is visible in this listing -- confirm against the
 * complete source.
 */
82 int detailed_dump = 0;
83 uint64_t cnt_skipped_oldsack = 0;
84 uint64_t cnt_used_oldsack = 0;
/*
 * Helpers for the sf_bits occupancy mask: bit i corresponds to slot i
 * of sf_blks[].
 *
 *  sack_blk_used(sf, i) - non-zero when bit i is set.
 *  sack_blk_set(sf, i)  - value of the mask with bit i set.
 *  sack_blk_clr(sf, i)  - value of the mask with bit i cleared.
 *
 * set/clr do not modify the filter; callers assign the result back,
 * e.g. sf->sf_bits = sack_blk_clr(sf, i).
 *
 * Both arguments are parenthesized so that expressions such as
 * "&filter" or "a | b" expand with the intended precedence.
 */
#define sack_blk_used(sf, i) ((1 << (i)) & (sf)->sf_bits)
#define sack_blk_set(sf, i) ((1 << (i)) | (sf)->sf_bits)
#define sack_blk_clr(sf, i) (~(1 << (i)) & (sf)->sf_bits)
/*
 * NOTE(review): only the signature survives in this listing; the body
 * is not visible. main() calls this with seq 0 before reading input
 * and with snd_una for the RXT and EXIT commands, so it presumably
 * empties the filter and records seq as the cum-ack -- confirm
 * against the complete source.
 */
101 sack_filter_clear(struct sack_filter *sf, tcp_seq seq)
109 * Given a previous sack filter block, filter out
110 * any entries where the cum-ack moves over them
111 * fully or partially.
/*
 * Walk every slot and drop or trim remembered blocks against the new
 * cumulative ack th_ack. A used block whose end is below th_ack has
 * its bit cleared in sf_bits; a block that th_ack only partly covers
 * (start < th_ack <= end) keeps its slot and has its start pulled up
 * to th_ack, which leaves start == end when th_ack equals the end.
 * The scan does not stop at the first trimmed block.
 * NOTE(review): lines are missing from this listing, so any sf_used
 * bookkeeping done here is not visible.
 */
114 sack_filter_prune(struct sack_filter *sf, tcp_seq th_ack)
117 /* start with the oldest */
118 for (i = 0; i < SACK_FILTER_BLOCKS; i++) {
119 if (sack_blk_used(sf, i)) {
120 if (SEQ_GT(th_ack, sf->sf_blks[i].end)) {
121 /* This block is consumed */
122 sf->sf_bits = sack_blk_clr(sf, i);
124 } else if (SEQ_GT(th_ack, sf->sf_blks[i].start)) {
125 /* Some of it is acked */
126 sf->sf_blks[i].start = th_ack;
127 /* We could in theory break here, but
128 * there are some broken implementations
129 * that send multiple blocks. We want
130 * to catch them all with similar seq's.
139 * Return true if you find that
140 * the sackblock b is on the score
141 * board. Update it along the way
142 * if part of it is on the board.
/*
 * Compare sack block b with every used slot, starting at sf_cur and
 * wrapping modulo SACK_FILTER_BLOCKS. b is modified in place: its
 * start is raised to sf_ack when it lies behind the cum-ack, and on a
 * partial overlap the part already on the board is cut out of b while
 * the board slot is stretched to b's far edge. The result is that b
 * shrinks to the portion the filter has not seen and the board grows
 * to cover it.
 * NOTE(review): the statements taken for each "Jonathans Rule" and
 * all return statements are missing from this listing; going by the
 * comment text preceding this function, a true return presumably
 * means b is entirely on the board -- confirm against the complete
 * source.
 */
145 is_sack_on_board(struct sack_filter *sf, struct sackblk *b)
149 for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) {
150 if (sack_blk_used(sf, i)) {
151 if (SEQ_LT(b->start, sf->sf_ack)) {
152 /* Behind cum-ack update */
153 b->start = sf->sf_ack;
155 if (SEQ_LT(b->end, sf->sf_ack)) {
156 /* End back behind too */
159 if (b->start == b->end) {
162 /* Jonathans Rule 1 */
163 if (SEQ_LEQ(sf->sf_blks[i].start, b->start) &&
164 SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
166 * Our board has this entirely in
169 * board |-------------|
170 * sack |-------------|
172 * board |-------------|
178 /* Jonathans Rule 2 */
179 if(SEQ_LT(sf->sf_blks[i].end, b->start)) {
181 * Not near each other:
188 /* Jonathans Rule 3 */
189 if (SEQ_GT(sf->sf_blks[i].start, b->end)) {
191 * Not near each other:
198 if (SEQ_LEQ(sf->sf_blks[i].start, b->start)) {
200 * The board block partial meets:
206 * sack |--------------|
208 * up with this one (we have part of it).
209 * 1) Update the board block to the new end
211 * 2) Update the start of this block to my end.
/* Order matters: b->start takes the board's OLD end before the board end is overwritten. */
213 b->start = sf->sf_blks[i].end;
214 sf->sf_blks[i].end = b->end;
217 if (SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
219 * The board block partial meets:
226 * 1) Update the board block to the new start
228 * 2) Update the start of this block to my end.
/* Despite the wording just above, this branch moves b's END down to the board's old start. */
230 b->end = sf->sf_blks[i].start;
231 sf->sf_blks[i].start = b->start;
237 i %= SACK_FILTER_BLOCKS;
239 /* Did we totally consume it in pieces? */
240 if (b->start != b->end)
/*
 * Filter the numblks entries of in[] for a sack that
 * sack_filter_blks() routes here instead of to sack_filter_new().
 * Each entry is run through is_sack_on_board(); hits bump
 * cnt_skipped_oldsack, the rest are gathered in blkboard[] and copied
 * back over the front of in[]. No visible line claims a new slot on
 * the board; the board only changes through the merging done inside
 * is_sack_on_board().
 * NOTE(review): the return statement is missing from this listing;
 * sack_filter_blks() stores the result in ret, so it is presumably
 * num -- confirm.
 */
247 sack_filter_old(struct sack_filter *sf, struct sackblk *in, int numblks)
250 struct sackblk blkboard[TCP_MAX_SACK];
252 * An old sack has arrived. It may contain data
253 * we do not have. We might not have it since
254 * we could have had a lost ack <or> we might have the
255 * entire thing on our current board. We want to prune
256 * off anything we have. With this function though we
257 * won't add to the board.
259 for( i = 0, num = 0; i<numblks; i++ ) {
260 if (is_sack_on_board(sf, &in[i])) {
262 cnt_skipped_oldsack++;
266 /* Did not find it (or found only
267 * a piece of it). Copy it to
268 * our outgoing board.
270 memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
277 memcpy(in, blkboard, (num * sizeof(struct sackblk)));
283 * Given that idx is in use but there is space available,
284 * move the entry to the next free slot.
/*
 * Relocate the block held in slot idx to the first unused slot found
 * when probing forward from idx + 1 (wrapping modulo
 * SACK_FILTER_BLOCKS, at most SACK_FILTER_BLOCKS - 1 probes). The
 * entry is copied, idx's bit is cleared and the destination's bit is
 * set. sack_filter_new() only calls this while sf_used is below
 * SACK_FILTER_BLOCKS.
 * NOTE(review): the loop exit and the index advance are missing from
 * this listing.
 */
287 sack_move_to_empty(struct sack_filter *sf, uint32_t idx)
291 i = (idx + 1) % SACK_FILTER_BLOCKS;
292 for (cnt=0; cnt <(SACK_FILTER_BLOCKS-1); cnt++) {
293 if (sack_blk_used(sf, i) == 0) {
294 memcpy(&sf->sf_blks[i], &sf->sf_blks[idx], sizeof(struct sackblk));
295 sf->sf_bits = sack_blk_clr(sf, idx);
296 sf->sf_bits = sack_blk_set(sf, i);
300 i %= SACK_FILTER_BLOCKS;
/*
 * Filter the numblks entries of in[] and remember what is new.
 * Pass 1 runs every entry through is_sack_on_board() and collects the
 * survivors in blkboard[], which is then copied back to the front of
 * in[] for the caller. Pass 2 walks the survivors from last to first
 * and re-checks each with is_sack_on_board(); a block to be added is
 * stored in slot sf_cur and the slot's bit is set in sf_bits. If that
 * slot is occupied while sf_used is below SACK_FILTER_BLOCKS, its old
 * entry is first relocated with sack_move_to_empty().
 * NOTE(review): the sf_cur advance, the sf_used accounting, the
 * statements skipped on an is_sack_on_board() hit and the return
 * statement are missing from this listing; th_ack is not referenced
 * in any visible line.
 */
305 sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack)
307 struct sackblk blkboard[TCP_MAX_SACK];
310 * First lets trim the old and possibly
311 * throw any away we have.
313 for(i=0, num=0; i<numblks; i++) {
314 if (is_sack_on_board(sf, &in[i]))
316 memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
322 /* Now what we are left with is either
323 * completely merged on to the board
324 * from the above steps, or is new
325 * and need to be added to the board
326 * with the last one updated to current.
328 * First copy it out, we want to return that
329 * to our caller for processing.
331 memcpy(in, blkboard, (num * sizeof(struct sackblk)));
333 /* Now go through and add to our board as needed */
334 for(i=(num-1); i>=0; i--) {
335 if (is_sack_on_board(sf, &blkboard[i])) {
338 /* Add this guy its not listed */
340 sf->sf_cur %= SACK_FILTER_BLOCKS;
341 if ((sack_blk_used(sf, sf->sf_cur)) &&
342 (sf->sf_used < SACK_FILTER_BLOCKS)) {
343 sack_move_to_empty(sf, sf->sf_cur);
346 if (sack_blk_used(sf, sf->sf_cur)) {
348 if (sf->sf_used < SACK_FILTER_BLOCKS)
/* The copy source is in[i], not blkboard[i]: pass 2 may have trimmed blkboard[i], in[i] still holds the pass-1 result. */
352 memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
353 if (sack_blk_used(sf, sf->sf_cur) == 0) {
356 if (sf->sf_used > highest_used)
357 highest_used = sf->sf_used;
359 sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
366 * Given a sack block on the board (the skip index) see if
367 * any other used entries overlap or meet, if so return the index.
/*
 * Scan all slots for a used entry that touches or overlaps sb on
 * either side: one whose end falls inside [sb->start, sb->end] while
 * it starts at or before sb->start, or one whose start falls inside
 * that range while it ends at or after sb->end. An entry lying
 * strictly inside sb satisfies neither test.
 * NOTE(review): the check against skip and the return statements are
 * missing from this listing; sack_board_collapse() uses the result as
 * a slot index -- confirm the "no match" value against the complete
 * source.
 */
370 sack_blocks_overlap_or_meet(struct sack_filter *sf, struct sackblk *sb, uint32_t skip)
374 for(i=0; i<SACK_FILTER_BLOCKS; i++) {
375 if (sack_blk_used(sf, i) == 0)
379 if (SEQ_GEQ(sf->sf_blks[i].end, sb->start) &&
380 SEQ_LEQ(sf->sf_blks[i].end, sb->end) &&
381 SEQ_LEQ(sf->sf_blks[i].start, sb->start)) {
383 * The two board blocks meet:
386 * board2 |----------|
389 * board2 |--------------|
396 if (SEQ_LEQ(sf->sf_blks[i].start, sb->end) &&
397 SEQ_GEQ(sf->sf_blks[i].start, sb->start) &&
398 SEQ_GEQ(sf->sf_blks[i].end, sb->end)) {
400 * The board block partial meets:
407 * 1) Update the board block to the new start
409 * 2) Update the start of this block to my end.
418 * Collapse entry src into entry into
419 * and free up the src entry afterwards.
/*
 * Merge slot src into slot into: into's start is lowered and its end
 * raised as needed so that it spans both ranges, then src's bit is
 * cleared in sf_bits.
 * NOTE(review): any sf_used adjustment is not visible in this listing.
 */
422 sack_collapse(struct sack_filter *sf, int32_t src, int32_t into)
424 if (SEQ_LT(sf->sf_blks[src].start, sf->sf_blks[into].start)) {
425 /* src has a lower starting point */
426 sf->sf_blks[into].start = sf->sf_blks[src].start;
428 if (SEQ_GT(sf->sf_blks[src].end, sf->sf_blks[into].end)) {
429 /* src has a higher ending point */
430 sf->sf_blks[into].end = sf->sf_blks[src].end;
432 sf->sf_bits = sack_blk_clr(sf, src);
/*
 * For each used slot i, ask sack_blocks_overlap_or_meet() for a
 * partner slot j and merge the pair with sack_collapse(). i_d and j_d
 * hold each slot's distance from sf_cur (computed in whichever
 * direction the index lies) and decide which of the two slots is
 * folded into the other.
 * NOTE(review): the "no partner" test and the branch conditions that
 * choose between the two sack_collapse() calls are missing from this
 * listing.
 */
437 sack_board_collapse(struct sack_filter *sf)
439 int32_t i, j, i_d, j_d;
441 for(i=0; i<SACK_FILTER_BLOCKS; i++) {
442 if (sack_blk_used(sf, i) == 0)
445 * Look at all other blocks but this guy
446 * to see if they overlap. If so we collapse
447 * the two blocks together.
449 j = sack_blocks_overlap_or_meet(sf, &sf->sf_blks[i], i);
455 * Ok j and i overlap with each other, collapse the
456 * one out furthest away from the current position.
459 i_d = sf->sf_cur - i;
461 i_d = i - sf->sf_cur;
463 j_d = sf->sf_cur - j;
465 j_d = j - sf->sf_cur;
467 sack_collapse(sf, j, i);
469 sack_collapse(sf, i, j);
/* Running total of sack blocks main() submits on COMMIT; reported in its closing summary line. */
475 uint64_t tot_sack_blks=0;
/*
 * Write a human-readable snapshot of the filter to out: one header
 * line with sf_ack, sf_bits, sf_cur and sf_used, then one "Entry"
 * line for each slot whose bit is set in sf_bits.
 */
478 sack_filter_dump(FILE *out, struct sack_filter *sf)
481 fprintf(out, " sf_ack:%u sf_bits:0x%x c:%d used:%d\n",
482 sf->sf_ack, sf->sf_bits,
483 sf->sf_cur, sf->sf_used);
485 for(i=0; i<SACK_FILTER_BLOCKS; i++) {
486 if (sack_blk_used(sf, i)) {
487 fprintf(out, "Entry:%d start:%u end:%u\n", i,
488 sf->sf_blks[i].start,
/*
 * Entry point: filter the numblks sack blocks in in[] that arrived
 * with cum-ack th_ack. More than TCP_MAX_SACK blocks triggers panic().
 * sack_board_collapse() is run up front (two forms of the call are
 * visible, presumably selected by preprocessor lines missing here).
 * An empty filter (sf_used == 0) is seeded with the blocks in reverse
 * order starting at slot 0. A th_ack beyond sf_ack prunes the board;
 * the blocks then go to sack_filter_new() when th_ack >= sf_ack, or
 * to sack_filter_old().
 * NOTE(review): the rest of the parameter list, the return statement,
 * the sf_ack / sf_used updates and the control flow between these
 * steps are missing from this listing; main() treats the return value
 * as the number of blocks left in in[].
 */
499 sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks,
504 if (numblks > TCP_MAX_SACK) {
506 panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n",
513 if ((sf->sf_used > 1) && (no_collapse == 0))
514 sack_board_collapse(sf);
518 sack_board_collapse(sf);
520 if ((sf->sf_used == 0) && numblks) {
522 * We are brand new add the blocks in
523 * reverse order. Note we can see more
524 * than one in new, since ack's could be lost.
529 for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) {
530 memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
531 sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
533 sf->sf_cur %= SACK_FILTER_BLOCKS;
537 if (sf->sf_used > highest_used)
538 highest_used = sf->sf_used;
546 if (SEQ_GT(th_ack, sf->sf_ack)) {
547 sack_filter_prune(sf, th_ack);
550 if (SEQ_GEQ(th_ack, sf->sf_ack)) {
551 ret = sack_filter_new(sf, in, numblks, th_ack);
553 ret = sack_filter_old(sf, in, numblks);
/*
 * Take back a block the filter had let through. Every used slot that
 * shares an edge with in is cut back: a shared end moves the slot's
 * end down to in->start, a shared start moves the slot's start up to
 * in->end. When in reaches across the whole slot, the slot's bit is
 * cleared in sf_bits instead. Slots that share neither edge with in
 * are left alone.
 * NOTE(review): statements between and after the two edge tests (for
 * example a continue or an sf_used update) are missing from this
 * listing.
 */
561 sack_filter_reject(struct sack_filter *sf, struct sackblk *in)
564 * Given a specified block (that had made
565 * it past the sack filter). Reject that
566 * block triming it off any sack-filter block
567 * that has it. Usually because the block was
568 * too small and did not cover a whole send.
570 * This function will only "undo" sack-blocks
571 * that are fresh and touch the edges of
572 * blocks in our filter.
576 for(i=0; i<SACK_FILTER_BLOCKS; i++) {
577 if (sack_blk_used(sf, i) == 0)
580 * Now given the sack-filter block does it touch
581 * with one of the ends
583 if (sf->sf_blks[i].end == in->end) {
584 /* The end moves back to start */
585 if (SEQ_GT(in->start, sf->sf_blks[i].start))
587 /* sf-blk |---------| */
588 sf->sf_blks[i].end = in->start;
590 /* It consumes this block */
591 /* in-blk |---------| */
592 /* sf-blk |------| */
594 /* sf-blk |---------| */
595 sf->sf_bits = sack_blk_clr(sf, i);
600 if (sf->sf_blks[i].start == in->start) {
601 if (SEQ_LT(in->end, sf->sf_blks[i].end)) {
603 /* sf-blk |---------| */
604 sf->sf_blks[i].start = in->end;
606 /* It consumes this block */
607 /* in-blk |----------| */
608 /* sf-blk |-------| */
610 /* sf-blk |----------| */
611 sf->sf_bits = sack_blk_clr(sf, i);
622 main(int argc, char **argv)
625 struct sackblk blks[TCP_MAX_SACK];
627 tcp_seq th_ack, snd_una, snd_max = 0;
628 struct sack_filter sf;
632 int invalid_sack_print = 0;
633 uint32_t chg_remembered=0;
635 char line_buf[10][256];
640 while ((i = getopt(argc, argv, "ndIi:o:?h")) != -1) {
649 invalid_sack_print = 1;
652 in = fopen(optarg, "r");
654 fprintf(stderr, "Fatal error can't open %s for input\n", optarg);
659 out = fopen(optarg, "w");
661 fprintf(stderr, "Fatal error can't open %s for output\n", optarg);
668 fprintf(stderr, "Use %s [ -i infile -o outfile -I]\n", argv[0]);
673 sack_filter_clear(&sf, 0);
674 memset(buffer, 0, sizeof(buffer));
675 memset(blks, 0, sizeof(blks));
677 fprintf(out, "************************************\n");
678 while (fgets(buffer, sizeof(buffer), in) != NULL) {
679 sprintf(line_buf[line_buf_at], "%s", buffer);
681 if (strncmp(buffer, "QUIT", 4) == 0) {
683 } else if (strncmp(buffer, "DUMP", 4) == 0) {
684 sack_filter_dump(out, &sf);
685 } else if (strncmp(buffer, "MAX:", 4) == 0) {
686 snd_max = strtoul(&buffer[4], NULL, 0);
687 } else if (strncmp(buffer, "COMMIT", 6) == 0) {
690 uint32_t szof, tot_chg;
691 for(ii=0; ii<line_buf_at; ii++) {
692 fprintf(out, "%s", line_buf[ii]);
694 fprintf(out, "------------------------------------\n");
695 nn = sack_filter_blks(&sf, blks, numblks, th_ack);
696 saved += numblks - nn;
697 tot_sack_blks += numblks;
698 fprintf(out, "ACK:%u\n", sf.sf_ack);
699 for(ii=0, tot_chg=0; ii<nn; ii++) {
700 szof = blks[ii].end - blks[ii].start;
702 fprintf(out, "SACK:%u:%u [%u]\n",
706 fprintf(out,"************************************\n");
707 chg_remembered = tot_chg;
709 sack_filter_dump(out, &sf);
710 fprintf(out,"************************************\n");
713 memset(blks, 0, sizeof(blks));
714 memset(line_buf, 0, sizeof(line_buf));
717 } else if (strncmp(buffer, "CHG:", 4) == 0) {
718 sack_chg = strtoul(&buffer[4], NULL, 0);
719 if ((sack_chg != chg_remembered) &&
720 (sack_chg > chg_remembered)){
721 fprintf(out,"***WARNING WILL RODGERS DANGER!! sack_chg:%u last:%u\n",
722 sack_chg, chg_remembered
725 sack_chg = chg_remembered = 0;
726 } else if (strncmp(buffer, "RXT", 3) == 0) {
727 sack_filter_clear(&sf, snd_una);
728 } else if (strncmp(buffer, "ACK:", 4) == 0) {
729 th_ack = strtoul(&buffer[4], NULL, 0);
730 if (snd_una_set == 0) {
733 } else if (SEQ_GT(th_ack, snd_una)) {
736 } else if (strncmp(buffer, "EXIT", 4) == 0) {
737 sack_filter_clear(&sf, snd_una);
738 sack_chg = chg_remembered = 0;
739 } else if (strncmp(buffer, "SACK:", 5) == 0) {
744 start = strtoul(&buffer[5], &end, 0);
746 endv = strtoul(&end[1], NULL, 0);
748 fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start);
751 if (SEQ_GT(endv, snd_max))
753 if (SEQ_LT(endv, start)) {
754 fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start);
757 if (numblks == TCP_MAX_SACK) {
758 fprintf(out, "--Exceeded max %d\n", numblks);
761 blks[numblks].start = start;
762 blks[numblks].end = endv;
764 } else if (strncmp(buffer, "REJ:n:n", 4) == 0) {
768 in.start = strtoul(&buffer[4], &end, 0);
770 in.end = strtoul(&end[1], NULL, 0);
771 sack_filter_reject(&sf, &in);
773 fprintf(out, "Invalid input END:A:B\n");
774 } else if (strncmp(buffer, "HELP", 4) == 0) {
775 fprintf(out, "You can input:\n");
776 fprintf(out, "SACK:S:E -- to define a sack block\n");
777 fprintf(out, "RXT -- to clear the filter without changing the remembered\n");
778 fprintf(out, "EXIT -- To clear the sack filter and start all fresh\n");
779 fprintf(out, "ACK:N -- To advance the cum-ack to N\n");
780 fprintf(out, "MAX:N -- To set send-max to N\n");
781 fprintf(out, "COMMIT -- To apply the sack you built to the filter and dump the filter\n");
782 fprintf(out, "DUMP -- To display the current contents of the sack filter\n");
783 fprintf(out, "QUIT -- To exit this program\n");
785 fprintf(out, "Command %s unknown\n", buffer);
787 memset(buffer, 0, sizeof(buffer));
796 b = tot_sack_blks * 1.0;
805 fprintf(err, "Saved %lu sack blocks out of %lu (%2.3f%%) old_skip:%lu old_usd:%lu high_cnt:%d ow:%d ea:%d\n",
806 saved, tot_sack_blks, c, cnt_skipped_oldsack, cnt_used_oldsack, highest_used, over_written, empty_avail);