2 * Copyright (c) 2017-9 Netflix, Inc.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 #include <sys/types.h>
29 #include <sys/queue.h>
30 #include <sys/socket.h>
32 #include <sys/sockopt.h>
33 #include <netinet/tcp.h>
34 #include <netinet/tcp_var.h>
35 #include <netinet/tcp_seq.h>
45 #include "sack_filter.h"
48 * Sack filter is used to filter out sacks
49 * that have already been processed. The idea
50 * is pretty simple really, consider two sacks
60 * The previous sack information (B-C) is repeated
61 * in SACK 2. If the receiver gets SACK 1 and then
62 * SACK 2 then any work associated with B-C as already
63 * been completed. This only effects where we may have
64 * (as in bbr or rack) cases where we walk a linked list.
66 * Now the utility trys to keep everything in a single
67 * cache line. This means that its not perfect and
68 * it could be that so big of sack's come that a
69 * "remembered" processed sack falls off the list and
70 * so gets re-processed. Thats ok, it just means we
71 * did some extra work. We could of course take more
72 * cache line hits by expanding the size of this
73 * structure, but then that would cost more.
77 int detailed_dump = 0;
78 uint64_t cnt_skipped_oldsack = 0;
79 uint64_t cnt_used_oldsack = 0;
88 #define sack_blk_used(sf, i) ((1 << i) & sf->sf_bits)
89 #define sack_blk_set(sf, i) ((1 << i) | sf->sf_bits)
90 #define sack_blk_clr(sf, i) (~(1 << i) & sf->sf_bits)
96 sack_filter_clear(struct sack_filter *sf, tcp_seq seq)
104 * Given a previous sack filter block, filter out
105 * any entries where the cum-ack moves over them
106 * fully or partially.
109 sack_filter_prune(struct sack_filter *sf, tcp_seq th_ack)
112 /* start with the oldest */
113 for (i = 0; i < SACK_FILTER_BLOCKS; i++) {
114 if (sack_blk_used(sf, i)) {
115 if (SEQ_GT(th_ack, sf->sf_blks[i].end)) {
116 /* This block is consumed */
117 sf->sf_bits = sack_blk_clr(sf, i);
119 } else if (SEQ_GT(th_ack, sf->sf_blks[i].start)) {
120 /* Some of it is acked */
121 sf->sf_blks[i].start = th_ack;
122 /* We could in theory break here, but
123 * there are some broken implementations
124 * that send multiple blocks. We want
125 * to catch them all with similar seq's.
134 * Return true if you find that
135 * the sackblock b is on the score
136 * board. Update it along the way
137 * if part of it is on the board.
140 is_sack_on_board(struct sack_filter *sf, struct sackblk *b)
144 for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) {
145 if (sack_blk_used(sf, i)) {
146 if (SEQ_LT(b->start, sf->sf_ack)) {
147 /* Behind cum-ack update */
148 b->start = sf->sf_ack;
150 if (SEQ_LT(b->end, sf->sf_ack)) {
151 /* End back behind too */
154 if (b->start == b->end) {
157 /* Jonathans Rule 1 */
158 if (SEQ_LEQ(sf->sf_blks[i].start, b->start) &&
159 SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
161 * Our board has this entirely in
164 * board |-------------|
165 * sack |-------------|
167 * board |-------------|
173 /* Jonathans Rule 2 */
174 if(SEQ_LT(sf->sf_blks[i].end, b->start)) {
176 * Not near each other:
183 /* Jonathans Rule 3 */
184 if (SEQ_GT(sf->sf_blks[i].start, b->end)) {
186 * Not near each other:
193 if (SEQ_LEQ(sf->sf_blks[i].start, b->start)) {
195 * The board block partial meets:
201 * sack |--------------|
203 * up with this one (we have part of it).
204 * 1) Update the board block to the new end
206 * 2) Update the start of this block to my end.
208 b->start = sf->sf_blks[i].end;
209 sf->sf_blks[i].end = b->end;
212 if (SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
214 * The board block partial meets:
221 * 1) Update the board block to the new start
223 * 2) Update the start of this block to my end.
225 b->end = sf->sf_blks[i].start;
226 sf->sf_blks[i].start = b->start;
232 i %= SACK_FILTER_BLOCKS;
234 /* Did we totally consume it in pieces? */
235 if (b->start != b->end)
242 sack_filter_old(struct sack_filter *sf, struct sackblk *in, int numblks)
245 struct sackblk blkboard[TCP_MAX_SACK];
247 * An old sack has arrived. It may contain data
248 * we do not have. We might not have it since
249 * we could have had a lost ack <or> we might have the
250 * entire thing on our current board. We want to prune
251 * off anything we have. With this function though we
252 * won't add to the board.
254 for( i = 0, num = 0; i<numblks; i++ ) {
255 if (is_sack_on_board(sf, &in[i])) {
257 cnt_skipped_oldsack++;
261 /* Did not find it (or found only
262 * a piece of it). Copy it to
263 * our outgoing board.
265 memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
272 memcpy(in, blkboard, (num * sizeof(struct sackblk)));
278 * Given idx its used but there is space available
279 * move the entry to the next free slot
282 sack_move_to_empty(struct sack_filter *sf, uint32_t idx)
286 i = (idx + 1) % SACK_FILTER_BLOCKS;
287 for (cnt=0; cnt <(SACK_FILTER_BLOCKS-1); cnt++) {
288 if (sack_blk_used(sf, i) == 0) {
289 memcpy(&sf->sf_blks[i], &sf->sf_blks[idx], sizeof(struct sackblk));
290 sf->sf_bits = sack_blk_clr(sf, idx);
291 sf->sf_bits = sack_blk_set(sf, i);
295 i %= SACK_FILTER_BLOCKS;
300 sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack)
302 struct sackblk blkboard[TCP_MAX_SACK];
305 * First lets trim the old and possibly
306 * throw any away we have.
308 for(i=0, num=0; i<numblks; i++) {
309 if (is_sack_on_board(sf, &in[i]))
311 memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
317 /* Now what we are left with is either
318 * completely merged on to the board
319 * from the above steps, or is new
320 * and need to be added to the board
321 * with the last one updated to current.
323 * First copy it out, we want to return that
324 * to our caller for processing.
326 memcpy(in, blkboard, (num * sizeof(struct sackblk)));
328 /* Now go through and add to our board as needed */
329 for(i=(num-1); i>=0; i--) {
330 if (is_sack_on_board(sf, &blkboard[i])) {
333 /* Add this guy its not listed */
335 sf->sf_cur %= SACK_FILTER_BLOCKS;
336 if ((sack_blk_used(sf, sf->sf_cur)) &&
337 (sf->sf_used < SACK_FILTER_BLOCKS)) {
338 sack_move_to_empty(sf, sf->sf_cur);
341 if (sack_blk_used(sf, sf->sf_cur)) {
343 if (sf->sf_used < SACK_FILTER_BLOCKS)
347 memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
348 if (sack_blk_used(sf, sf->sf_cur) == 0) {
351 if (sf->sf_used > highest_used)
352 highest_used = sf->sf_used;
354 sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
361 * Given a sack block on the board (the skip index) see if
362 * any other used entries overlap or meet, if so return the index.
365 sack_blocks_overlap_or_meet(struct sack_filter *sf, struct sackblk *sb, uint32_t skip)
369 for(i=0; i<SACK_FILTER_BLOCKS; i++) {
370 if (sack_blk_used(sf, i) == 0)
374 if (SEQ_GEQ(sf->sf_blks[i].end, sb->start) &&
375 SEQ_LEQ(sf->sf_blks[i].end, sb->end) &&
376 SEQ_LEQ(sf->sf_blks[i].start, sb->start)) {
378 * The two board blocks meet:
381 * board2 |----------|
384 * board2 |--------------|
391 if (SEQ_LEQ(sf->sf_blks[i].start, sb->end) &&
392 SEQ_GEQ(sf->sf_blks[i].start, sb->start) &&
393 SEQ_GEQ(sf->sf_blks[i].end, sb->end)) {
395 * The board block partial meets:
402 * 1) Update the board block to the new start
404 * 2) Update the start of this block to my end.
413 * Collapse entry src into entry into
414 * and free up the src entry afterwards.
417 sack_collapse(struct sack_filter *sf, int32_t src, int32_t into)
419 if (SEQ_LT(sf->sf_blks[src].start, sf->sf_blks[into].start)) {
420 /* src has a lower starting point */
421 sf->sf_blks[into].start = sf->sf_blks[src].start;
423 if (SEQ_GT(sf->sf_blks[src].end, sf->sf_blks[into].end)) {
424 /* src has a higher ending point */
425 sf->sf_blks[into].end = sf->sf_blks[src].end;
427 sf->sf_bits = sack_blk_clr(sf, src);
432 sack_board_collapse(struct sack_filter *sf)
434 int32_t i, j, i_d, j_d;
436 for(i=0; i<SACK_FILTER_BLOCKS; i++) {
437 if (sack_blk_used(sf, i) == 0)
440 * Look at all other blocks but this guy
441 * to see if they overlap. If so we collapse
442 * the two blocks together.
444 j = sack_blocks_overlap_or_meet(sf, &sf->sf_blks[i], i);
450 * Ok j and i overlap with each other, collapse the
451 * one out furthest away from the current position.
454 i_d = sf->sf_cur - i;
456 i_d = i - sf->sf_cur;
458 j_d = sf->sf_cur - j;
460 j_d = j - sf->sf_cur;
462 sack_collapse(sf, j, i);
464 sack_collapse(sf, i, j);
470 uint64_t tot_sack_blks=0;
473 sack_filter_dump(FILE *out, struct sack_filter *sf)
476 fprintf(out, " sf_ack:%u sf_bits:0x%x c:%d used:%d\n",
477 sf->sf_ack, sf->sf_bits,
478 sf->sf_cur, sf->sf_used);
480 for(i=0; i<SACK_FILTER_BLOCKS; i++) {
481 if (sack_blk_used(sf, i)) {
482 fprintf(out, "Entry:%d start:%u end:%u\n", i,
483 sf->sf_blks[i].start,
494 sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks,
499 if (numblks > TCP_MAX_SACK) {
501 panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n",
508 if ((sf->sf_used > 1) && (no_collapse == 0))
509 sack_board_collapse(sf);
513 sack_board_collapse(sf);
515 if ((sf->sf_used == 0) && numblks) {
517 * We are brand new add the blocks in
518 * reverse order. Note we can see more
519 * than one in new, since ack's could be lost.
524 for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) {
525 memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
526 sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
528 sf->sf_cur %= SACK_FILTER_BLOCKS;
532 if (sf->sf_used > highest_used)
533 highest_used = sf->sf_used;
541 if (SEQ_GT(th_ack, sf->sf_ack)) {
542 sack_filter_prune(sf, th_ack);
545 if (SEQ_GEQ(th_ack, sf->sf_ack)) {
546 ret = sack_filter_new(sf, in, numblks, th_ack);
548 ret = sack_filter_old(sf, in, numblks);
556 sack_filter_reject(struct sack_filter *sf, struct sackblk *in)
559 * Given a specified block (that had made
560 * it past the sack filter). Reject that
561 * block triming it off any sack-filter block
562 * that has it. Usually because the block was
563 * too small and did not cover a whole send.
565 * This function will only "undo" sack-blocks
566 * that are fresh and touch the edges of
567 * blocks in our filter.
571 for(i=0; i<SACK_FILTER_BLOCKS; i++) {
572 if (sack_blk_used(sf, i) == 0)
575 * Now given the sack-filter block does it touch
576 * with one of the ends
578 if (sf->sf_blks[i].end == in->end) {
579 /* The end moves back to start */
580 if (SEQ_GT(in->start, sf->sf_blks[i].start))
582 /* sf-blk |---------| */
583 sf->sf_blks[i].end = in->start;
585 /* It consumes this block */
586 /* in-blk |---------| */
587 /* sf-blk |------| */
589 /* sf-blk |---------| */
590 sf->sf_bits = sack_blk_clr(sf, i);
595 if (sf->sf_blks[i].start == in->start) {
596 if (SEQ_LT(in->end, sf->sf_blks[i].end)) {
598 /* sf-blk |---------| */
599 sf->sf_blks[i].start = in->end;
601 /* It consumes this block */
602 /* in-blk |----------| */
603 /* sf-blk |-------| */
605 /* sf-blk |----------| */
606 sf->sf_bits = sack_blk_clr(sf, i);
617 main(int argc, char **argv)
620 struct sackblk blks[TCP_MAX_SACK];
622 tcp_seq th_ack, snd_una, snd_max = 0;
623 struct sack_filter sf;
627 int invalid_sack_print = 0;
628 uint32_t chg_remembered=0;
630 char line_buf[10][256];
635 while ((i = getopt(argc, argv, "ndIi:o:?h")) != -1) {
644 invalid_sack_print = 1;
647 in = fopen(optarg, "r");
649 fprintf(stderr, "Fatal error can't open %s for input\n", optarg);
654 out = fopen(optarg, "w");
656 fprintf(stderr, "Fatal error can't open %s for output\n", optarg);
663 fprintf(stderr, "Use %s [ -i infile -o outfile -I]\n", argv[0]);
668 sack_filter_clear(&sf, 0);
669 memset(buffer, 0, sizeof(buffer));
670 memset(blks, 0, sizeof(blks));
672 fprintf(out, "************************************\n");
673 while (fgets(buffer, sizeof(buffer), in) != NULL) {
674 sprintf(line_buf[line_buf_at], "%s", buffer);
676 if (strncmp(buffer, "QUIT", 4) == 0) {
678 } else if (strncmp(buffer, "DUMP", 4) == 0) {
679 sack_filter_dump(out, &sf);
680 } else if (strncmp(buffer, "MAX:", 4) == 0) {
681 snd_max = strtoul(&buffer[4], NULL, 0);
682 } else if (strncmp(buffer, "COMMIT", 6) == 0) {
685 uint32_t szof, tot_chg;
686 for(ii=0; ii<line_buf_at; ii++) {
687 fprintf(out, "%s", line_buf[ii]);
689 fprintf(out, "------------------------------------\n");
690 nn = sack_filter_blks(&sf, blks, numblks, th_ack);
691 saved += numblks - nn;
692 tot_sack_blks += numblks;
693 fprintf(out, "ACK:%u\n", sf.sf_ack);
694 for(ii=0, tot_chg=0; ii<nn; ii++) {
695 szof = blks[ii].end - blks[ii].start;
697 fprintf(out, "SACK:%u:%u [%u]\n",
701 fprintf(out,"************************************\n");
702 chg_remembered = tot_chg;
704 sack_filter_dump(out, &sf);
705 fprintf(out,"************************************\n");
708 memset(blks, 0, sizeof(blks));
709 memset(line_buf, 0, sizeof(line_buf));
712 } else if (strncmp(buffer, "CHG:", 4) == 0) {
713 sack_chg = strtoul(&buffer[4], NULL, 0);
714 if ((sack_chg != chg_remembered) &&
715 (sack_chg > chg_remembered)){
716 fprintf(out,"***WARNING WILL RODGERS DANGER!! sack_chg:%u last:%u\n",
717 sack_chg, chg_remembered
720 sack_chg = chg_remembered = 0;
721 } else if (strncmp(buffer, "RXT", 3) == 0) {
722 sack_filter_clear(&sf, snd_una);
723 } else if (strncmp(buffer, "ACK:", 4) == 0) {
724 th_ack = strtoul(&buffer[4], NULL, 0);
725 if (snd_una_set == 0) {
728 } else if (SEQ_GT(th_ack, snd_una)) {
731 } else if (strncmp(buffer, "EXIT", 4) == 0) {
732 sack_filter_clear(&sf, snd_una);
733 sack_chg = chg_remembered = 0;
734 } else if (strncmp(buffer, "SACK:", 5) == 0) {
739 start = strtoul(&buffer[5], &end, 0);
741 endv = strtoul(&end[1], NULL, 0);
743 fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start);
746 if (SEQ_GT(endv, snd_max))
748 if (SEQ_LT(endv, start)) {
749 fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start);
752 if (numblks == TCP_MAX_SACK) {
753 fprintf(out, "--Exceeded max %d\n", numblks);
756 blks[numblks].start = start;
757 blks[numblks].end = endv;
759 } else if (strncmp(buffer, "REJ:n:n", 4) == 0) {
763 in.start = strtoul(&buffer[4], &end, 0);
765 in.end = strtoul(&end[1], NULL, 0);
766 sack_filter_reject(&sf, &in);
768 fprintf(out, "Invalid input END:A:B\n");
769 } else if (strncmp(buffer, "HELP", 4) == 0) {
770 fprintf(out, "You can input:\n");
771 fprintf(out, "SACK:S:E -- to define a sack block\n");
772 fprintf(out, "RXT -- to clear the filter without changing the remembered\n");
773 fprintf(out, "EXIT -- To clear the sack filter and start all fresh\n");
774 fprintf(out, "ACK:N -- To advance the cum-ack to N\n");
775 fprintf(out, "MAX:N -- To set send-max to N\n");
776 fprintf(out, "COMMIT -- To apply the sack you built to the filter and dump the filter\n");
777 fprintf(out, "DUMP -- To display the current contents of the sack filter\n");
778 fprintf(out, "QUIT -- To exit this program\n");
780 fprintf(out, "Command %s unknown\n", buffer);
782 memset(buffer, 0, sizeof(buffer));
791 b = tot_sack_blks * 1.0;
800 fprintf(err, "Saved %lu sack blocks out of %lu (%2.3f%%) old_skip:%lu old_usd:%lu high_cnt:%d ow:%d ea:%d\n",
801 saved, tot_sack_blks, c, cnt_skipped_oldsack, cnt_used_oldsack, highest_used, over_written, empty_avail);