6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include <sys/types.h>
31 #include <sys/queue.h>
32 #include <sys/socket.h>
34 #include <sys/sockopt.h>
35 #include <netinet/tcp.h>
36 #include <netinet/tcp_var.h>
37 #include <netinet/tcp_seq.h>
47 #include "sack_filter.h"
50 * Sack filter is used to filter out sacks
51 * that have already been processed. The idea
52 * is pretty simple really, consider two sacks
62 * The previous sack information (B-C) is repeated
63 * in SACK 2. If the receiver gets SACK 1 and then
64 * SACK 2 then any work associated with B-C as already
65 * been completed. This only effects where we may have
66 * (as in bbr or rack) cases where we walk a linked list.
68 * Now the utility trys to keep everything in a single
69 * cache line. This means that its not perfect and
70 * it could be that so big of sack's come that a
71 * "remembered" processed sack falls off the list and
72 * so gets re-processed. Thats ok, it just means we
73 * did some extra work. We could of course take more
74 * cache line hits by expanding the size of this
75 * structure, but then that would cost more.
79 int detailed_dump = 0;
80 uint64_t cnt_skipped_oldsack = 0;
81 uint64_t cnt_used_oldsack = 0;
90 #define sack_blk_used(sf, i) ((1 << i) & sf->sf_bits)
91 #define sack_blk_set(sf, i) ((1 << i) | sf->sf_bits)
92 #define sack_blk_clr(sf, i) (~(1 << i) & sf->sf_bits)
98 sack_filter_clear(struct sack_filter *sf, tcp_seq seq)
106 * Given a previous sack filter block, filter out
107 * any entries where the cum-ack moves over them
108 * fully or partially.
111 sack_filter_prune(struct sack_filter *sf, tcp_seq th_ack)
114 /* start with the oldest */
115 for (i = 0; i < SACK_FILTER_BLOCKS; i++) {
116 if (sack_blk_used(sf, i)) {
117 if (SEQ_GT(th_ack, sf->sf_blks[i].end)) {
118 /* This block is consumed */
119 sf->sf_bits = sack_blk_clr(sf, i);
121 } else if (SEQ_GT(th_ack, sf->sf_blks[i].start)) {
122 /* Some of it is acked */
123 sf->sf_blks[i].start = th_ack;
124 /* We could in theory break here, but
125 * there are some broken implementations
126 * that send multiple blocks. We want
127 * to catch them all with similar seq's.
136 * Return true if you find that
137 * the sackblock b is on the score
138 * board. Update it along the way
139 * if part of it is on the board.
142 is_sack_on_board(struct sack_filter *sf, struct sackblk *b)
145 for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) {
146 if (sack_blk_used(sf, i)) {
147 if (SEQ_LT(b->start, sf->sf_ack)) {
148 /* Behind cum-ack update */
149 b->start = sf->sf_ack;
151 if (SEQ_LT(b->end, sf->sf_ack)) {
152 /* End back behind too */
155 if (b->start == b->end)
157 /* Jonathans Rule 1 */
158 if (SEQ_LEQ(sf->sf_blks[i].start, b->start) &&
159 SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
161 * Our board has this entirely in
164 * board |-------------|
165 * sack |-------------|
167 * board |-------------|
173 /* Jonathans Rule 2 */
174 if(SEQ_LT(sf->sf_blks[i].end, b->start)) {
176 * Not near each other:
183 /* Jonathans Rule 3 */
184 if (SEQ_GT(sf->sf_blks[i].start, b->end)) {
186 * Not near each other:
193 if (SEQ_LEQ(sf->sf_blks[i].start, b->start)) {
195 * The board block partial meets:
201 * sack |--------------|
203 * up with this one (we have part of it).
204 * 1) Update the board block to the new end
206 * 2) Update the start of this block to my end.
208 b->start = sf->sf_blks[i].end;
209 sf->sf_blks[i].end = b->end;
212 if (SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
214 * The board block partial meets:
221 * 1) Update the board block to the new start
223 * 2) Update the start of this block to my end.
225 b->end = sf->sf_blks[i].start;
226 sf->sf_blks[i].start = b->start;
232 i %= SACK_FILTER_BLOCKS;
234 /* Did we totally consume it in pieces? */
235 if (b->start != b->end)
242 sack_filter_old(struct sack_filter *sf, struct sackblk *in, int numblks)
245 struct sackblk blkboard[TCP_MAX_SACK];
247 * An old sack has arrived. It may contain data
248 * we do not have. We might not have it since
249 * we could have had a lost ack <or> we might have the
250 * entire thing on our current board. We want to prune
251 * off anything we have. With this function though we
252 * won't add to the board.
254 for( i = 0, num = 0; i<numblks; i++ ) {
255 if (is_sack_on_board(sf, &in[i])) {
257 cnt_skipped_oldsack++;
261 /* Did not find it (or found only
262 * a piece of it). Copy it to
263 * our outgoing board.
265 memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
272 memcpy(in, blkboard, (num * sizeof(struct sackblk)));
278 * Given idx its used but there is space available
279 * move the entry to the next free slot
282 sack_move_to_empty(struct sack_filter *sf, uint32_t idx)
286 i = (idx + 1) % SACK_FILTER_BLOCKS;
287 for (cnt=0; cnt <(SACK_FILTER_BLOCKS-1); cnt++) {
288 if (sack_blk_used(sf, i) == 0) {
289 memcpy(&sf->sf_blks[i], &sf->sf_blks[idx], sizeof(struct sackblk));
290 sf->sf_bits = sack_blk_clr(sf, idx);
291 sf->sf_bits = sack_blk_set(sf, i);
295 i %= SACK_FILTER_BLOCKS;
300 sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack)
302 struct sackblk blkboard[TCP_MAX_SACK];
305 * First lets trim the old and possibly
306 * throw any away we have.
308 for(i=0, num=0; i<numblks; i++) {
309 if (is_sack_on_board(sf, &in[i]))
311 memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
317 /* Now what we are left is either
318 * completely merged on to the board
319 * from the above steps, or are new
320 * and need to be added to the board
321 * with the last one updated to current.
323 * First copy it out we want to return that
324 * to our caller for processing.
326 memcpy(in, blkboard, (num * sizeof(struct sackblk)));
328 /* Now go through and add to our board as needed */
329 for(i=(num-1); i>=0; i--) {
330 if (is_sack_on_board(sf, &blkboard[i]))
332 /* Add this guy its not listed */
334 sf->sf_cur %= SACK_FILTER_BLOCKS;
335 if ((sack_blk_used(sf, sf->sf_cur)) &&
336 (sf->sf_used < SACK_FILTER_BLOCKS)) {
337 sack_move_to_empty(sf, sf->sf_cur);
340 if (sack_blk_used(sf, sf->sf_cur)) {
342 if (sf->sf_used < SACK_FILTER_BLOCKS)
346 memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
347 if (sack_blk_used(sf, sf->sf_cur) == 0) {
350 if (sf->sf_used > highest_used)
351 highest_used = sf->sf_used;
353 sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
360 * Given a sack block on the board (the skip index) see if
361 * any other used entries overlap or meet, if so return the index.
364 sack_blocks_overlap_or_meet(struct sack_filter *sf, struct sackblk *sb, uint32_t skip)
368 for(i=0; i<SACK_FILTER_BLOCKS; i++) {
369 if (sack_blk_used(sf, i) == 0)
373 if (SEQ_GEQ(sf->sf_blks[i].end, sb->start) &&
374 SEQ_LEQ(sf->sf_blks[i].end, sb->end) &&
375 SEQ_LEQ(sf->sf_blks[i].start, sb->start)) {
377 * The two board blocks meet:
380 * board2 |----------|
383 * board2 |--------------|
390 if (SEQ_LEQ(sf->sf_blks[i].start, sb->end) &&
391 SEQ_GEQ(sf->sf_blks[i].start, sb->start) &&
392 SEQ_GEQ(sf->sf_blks[i].end, sb->end)) {
394 * The board block partial meets:
401 * 1) Update the board block to the new start
403 * 2) Update the start of this block to my end.
412 * Collapse entry src into entry into
413 * and free up the src entry afterwards.
416 sack_collapse(struct sack_filter *sf, int32_t src, int32_t into)
418 if (SEQ_LT(sf->sf_blks[src].start, sf->sf_blks[into].start)) {
419 /* src has a lower starting point */
420 sf->sf_blks[into].start = sf->sf_blks[src].start;
422 if (SEQ_GT(sf->sf_blks[src].end, sf->sf_blks[into].end)) {
423 /* src has a higher ending point */
424 sf->sf_blks[into].end = sf->sf_blks[src].end;
426 sf->sf_bits = sack_blk_clr(sf, src);
431 sack_board_collapse(struct sack_filter *sf)
433 int32_t i, j, i_d, j_d;
435 for(i=0; i<SACK_FILTER_BLOCKS; i++) {
436 if (sack_blk_used(sf, i) == 0)
439 * Look at all other blocks but this guy
440 * to see if they overlap. If so we collapse
441 * the two blocks together.
443 j = sack_blocks_overlap_or_meet(sf, &sf->sf_blks[i], i);
449 * Ok j and i overlap with each other, collapse the
450 * one out furthest away from the current position.
453 i_d = sf->sf_cur - i;
455 i_d = i - sf->sf_cur;
457 j_d = sf->sf_cur - j;
459 j_d = j - sf->sf_cur;
461 sack_collapse(sf, j, i);
463 sack_collapse(sf, i, j);
471 sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack)
475 if (numblks > TCP_MAX_SACK) {
476 panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n",
481 if ((sf->sf_used == 0) && numblks) {
483 * We are brand new add the blocks in
484 * reverse order. Note we can see more
485 * than one in new, since ack's could be lost.
488 for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) {
489 memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
490 sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
492 sf->sf_cur %= SACK_FILTER_BLOCKS;
495 if (sf->sf_used > highest_used)
496 highest_used = sf->sf_used;
503 if (SEQ_GT(th_ack, sf->sf_ack)) {
504 sack_filter_prune(sf, th_ack);
507 if (SEQ_GEQ(th_ack, sf->sf_ack)) {
508 ret = sack_filter_new(sf, in, numblks, th_ack);
510 ret = sack_filter_old(sf, in, numblks);
515 if ((sf->sf_used > 1) && (no_collapse == 0))
516 sack_board_collapse(sf);
520 sack_board_collapse(sf);
528 uint64_t tot_sack_blks=0;
531 sack_filter_dump(FILE *out, struct sack_filter *sf)
534 fprintf(out, " sf_ack:%u sf_bits:0x%x c:%d used:%d\n",
535 sf->sf_ack, sf->sf_bits,
536 sf->sf_cur, sf->sf_used);
538 for(i=0; i<SACK_FILTER_BLOCKS; i++) {
539 if (sack_blk_used(sf, i)) {
540 fprintf(out, "Entry:%d start:%u end:%u\n", i,
541 sf->sf_blks[i].start,
548 main(int argc, char **argv)
551 struct sackblk blks[TCP_MAX_SACK];
553 tcp_seq th_ack, snd_una;
554 struct sack_filter sf;
558 int invalid_sack_print = 0;
559 uint32_t chg_remembered=0;
561 char line_buf[10][256];
566 while ((i = getopt(argc, argv, "ndIi:o:?h")) != -1) {
575 invalid_sack_print = 1;
578 in = fopen(optarg, "r");
580 fprintf(stderr, "Fatal error can't open %s for input\n", optarg);
585 out = fopen(optarg, "w");
587 fprintf(stderr, "Fatal error can't open %s for output\n", optarg);
594 fprintf(stderr, "Use %s [ -i infile -o outfile -I]\n", argv[0]);
599 sack_filter_clear(&sf, 0);
600 memset(buffer, 0, sizeof(buffer));
601 memset(blks, 0, sizeof(blks));
603 fprintf(out, "************************************\n");
604 while (fgets(buffer, sizeof(buffer), in) != NULL) {
605 sprintf(line_buf[line_buf_at], "%s", buffer);
607 if (strncmp(buffer, "QUIT", 4) == 0) {
609 } else if (strncmp(buffer, "DONE", 4) == 0) {
612 uint32_t szof, tot_chg;
613 for(ii=0; ii<line_buf_at; ii++) {
614 fprintf(out, "%s", line_buf[ii]);
616 fprintf(out, "------------------------------------\n");
617 nn = sack_filter_blks(&sf, blks, numblks, th_ack);
618 saved += numblks - nn;
619 tot_sack_blks += numblks;
620 fprintf(out, "ACK:%u\n", sf.sf_ack);
621 for(ii=0, tot_chg=0; ii<nn; ii++) {
622 szof = blks[ii].end - blks[ii].start;
624 fprintf(out, "SACK:%u:%u [%u]\n",
628 fprintf(out,"************************************\n");
629 chg_remembered = tot_chg;
631 sack_filter_dump(out, &sf);
632 fprintf(out,"************************************\n");
635 memset(blks, 0, sizeof(blks));
636 memset(line_buf, 0, sizeof(line_buf));
639 } else if (strncmp(buffer, "CHG:", 4) == 0) {
640 sack_chg = strtoul(&buffer[4], NULL, 0);
641 if ((sack_chg != chg_remembered) &&
642 (sack_chg > chg_remembered)){
643 fprintf(out,"***WARNING WILL RODGERS DANGER!! sack_chg:%u last:%u\n",
644 sack_chg, chg_remembered
647 sack_chg = chg_remembered = 0;
648 } else if (strncmp(buffer, "RXT", 3) == 0) {
649 sack_filter_clear(&sf, snd_una);
650 } else if (strncmp(buffer, "ACK:", 4) == 0) {
651 th_ack = strtoul(&buffer[4], NULL, 0);
652 if (snd_una_set == 0) {
655 } else if (SEQ_GT(th_ack, snd_una)) {
658 } else if (strncmp(buffer, "EXIT", 4) == 0) {
659 sack_filter_clear(&sf, snd_una);
660 sack_chg = chg_remembered = 0;
661 } else if (strncmp(buffer, "SACK:", 5) == 0) {
665 start = strtoul(&buffer[5], &end, 0);
667 endv = strtoul(&end[1], NULL, 0);
669 fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start);
672 if (SEQ_LT(endv, start)) {
673 fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start);
676 if (numblks == TCP_MAX_SACK) {
677 fprintf(out, "--Exceeded max %d\n", numblks);
680 blks[numblks].start = start;
681 blks[numblks].end = endv;
684 memset(buffer, 0, sizeof(buffer));
693 b = tot_sack_blks * 1.0;
702 fprintf(err, "Saved %lu sack blocks out of %lu (%2.3f%%) old_skip:%lu old_usd:%lu high_cnt:%d ow:%d ea:%d\n",
703 saved, tot_sack_blks, c, cnt_skipped_oldsack, cnt_used_oldsack, highest_used, over_written, empty_avail);