2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2009-2010
5 * Swinburne University of Technology, Melbourne, Australia
6 * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
7 * Copyright (c) 2010-2011 The FreeBSD Foundation
10 * This software was developed at the Centre for Advanced Internet
11 * Architectures, Swinburne University of Technology, by David Hayes, made
12 * possible in part by a grant from the Cisco University Research Program Fund
13 * at Community Foundation Silicon Valley.
15 * Portions of this software were developed at the Centre for Advanced
16 * Internet Architectures, Swinburne University of Technology, Melbourne,
17 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
19 * Redistribution and use in source and binary forms, with or without
20 * modification, are permitted provided that the following conditions
22 * 1. Redistributions of source code must retain the above copyright
23 * notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 * notice, this list of conditions and the following disclaimer in the
26 * documentation and/or other materials provided with the distribution.
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
44 #include <sys/param.h>
45 #include <sys/kernel.h>
47 #include <sys/module.h>
48 #include <sys/hhook.h>
49 #include <sys/khelp.h>
50 #include <sys/module_khelp.h>
51 #include <sys/socket.h>
52 #include <sys/sockopt.h>
56 #include <netinet/in.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/tcp_seq.h>
59 #include <netinet/tcp_var.h>
61 #include <netinet/khelp/h_ertt.h>
65 uma_zone_t txseginfo_zone;
67 /* Smoothing factor for delayed ack guess. */
68 #define DLYACK_SMOOTH 5
70 /* Max number of time stamp errors allowed in a session. */
73 static int ertt_packet_measurement_hook(int hhook_type, int hhook_id,
74 void *udata, void *ctx_data, void *hdata, struct osd *hosd);
75 static int ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id,
76 void *udata, void *ctx_data, void *hdata, struct osd *hosd);
77 static int ertt_mod_init(void);
78 static int ertt_mod_destroy(void);
79 static int ertt_uma_ctor(void *mem, int size, void *arg, int flags);
80 static void ertt_uma_dtor(void *mem, int size, void *arg);
83 * Contains information about the sent segment for comparison with the
89 /* Segment sequence number. */
91 /* Time stamp indicating when the packet was sent. */
93 /* Last received receiver ts (if the TCP option is used). */
96 TAILQ_ENTRY (txseginfo) txsegi_lnk;
99 /* Flags for struct txseginfo. */
100 #define TXSI_TSO 0x01 /* TSO was used for this entry. */
101 #define TXSI_RTT_MEASURE_START 0x02 /* Start a per RTT measurement. */
102 #define TXSI_RX_MEASURE_END 0x04 /* Measure the rx rate until this txsi. */
104 struct helper ertt_helper = {
105 .mod_init = ertt_mod_init,
106 .mod_destroy = ertt_mod_destroy,
107 .h_flags = HELPER_NEEDS_OSD,
108 .h_classes = HELPER_CLASS_TCP
111 /* Define the helper hook info required by ERTT. */
112 struct hookinfo ertt_hooks[] = {
114 .hook_type = HHOOK_TYPE_TCP,
115 .hook_id = HHOOK_TCP_EST_IN,
117 .hook_func = &ertt_packet_measurement_hook
120 .hook_type = HHOOK_TYPE_TCP,
121 .hook_id = HHOOK_TCP_EST_OUT,
123 .hook_func = &ertt_add_tx_segment_info_hook
127 /* Flags to indicate how marked_packet_rtt should handle this txsi. */
128 #define MULTI_ACK 0x01 /* More than this txsi is acked. */
129 #define OLD_TXSI 0x02 /* TXSI is old according to timestamps. */
130 #define CORRECT_ACK 0X04 /* Acks this TXSI. */
131 #define FORCED_MEASUREMENT 0X08 /* Force an RTT measurement. */
134 * This function measures the RTT of a particular segment/ack pair, or the next
135 * closest if this will yield an inaccurate result due to delayed acking or
139 marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp,
140 uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust,
145 * If we can't measure this one properly due to delayed acking adjust
146 * byte counters and flag to measure next txsi. Note that since the
147 * marked packet's transmitted bytes are measured we need to subtract the
148 * transmitted bytes. Then pretend the next txsi was marked.
150 if (mflag & (MULTI_ACK|OLD_TXSI)) {
151 *pmeasurenext = txsi->tx_ts;
152 *pmeasurenext_len = txsi->len;
153 *prtt_bytes_adjust += *pmeasurenext_len;
155 if (mflag & FORCED_MEASUREMENT) {
156 e_t->markedpkt_rtt = tcp_ts_getticks() -
158 e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
159 *pmeasurenext_len - *prtt_bytes_adjust;
161 e_t->markedpkt_rtt = tcp_ts_getticks() -
163 e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
166 e_t->marked_snd_cwnd = tp->snd_cwnd;
169 * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to
170 * add_tx_segment_info that a new measurement should be started.
172 e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS;
174 * Set ERTT_NEW_MEASUREMENT to tell the congestion control
175 * algorithm that a new marked RTT measurement has been made
176 * and is available for use.
178 e_t->flags |= ERTT_NEW_MEASUREMENT;
180 if (tp->t_flags & TF_TSO) {
181 /* Temporarily disable TSO to aid a new measurement. */
182 tp->t_flags &= ~TF_TSO;
183 /* Keep track that we've disabled it. */
184 e_t->flags |= ERTT_TSO_DISABLED;
190 * ertt_packet_measurement_hook uses a small amount of state kept on each
191 * packet sent to match incoming acknowledgements. This enables more accurate
192 * and secure round trip time measurements. The resulting measurement is used
193 * for congestion control algorithms which require a more accurate time.
194 * ertt_packet_measurement_hook is called via the helper hook in tcp_input.c
197 ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
198 void *ctx_data, void *hdata, struct osd *hosd)
204 struct tcp_hhook_data *thdp;
205 struct txseginfo *txsi;
206 int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
207 uint32_t measurenext, rts;
210 KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
211 KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
213 e_t = (struct ertt *)hdata;
218 new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
219 measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
220 acked = th->th_ack - tp->snd_una;
222 INP_WLOCK_ASSERT(tp->t_inpcb);
224 /* Packet has provided new acknowledgements. */
225 if (acked > 0 || new_sacked_bytes) {
226 if (acked == 0 && new_sacked_bytes) {
227 /* Use last sacked data. */
228 ack = tp->sackhint.last_sack_ack;
232 txsi = TAILQ_FIRST(&e_t->txsegi_q);
233 while (txsi != NULL) {
236 /* Acknowledgement is acking more than this txsi. */
237 if (SEQ_GT(ack, txsi->seq + txsi->len)) {
238 if (txsi->flags & TXSI_RTT_MEASURE_START ||
240 marked_packet_rtt(txsi, e_t, tp,
241 &measurenext, &measurenext_len,
242 &rtt_bytes_adjust, MULTI_ACK);
244 TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
245 uma_zfree(txseginfo_zone, txsi);
246 txsi = TAILQ_FIRST(&e_t->txsegi_q);
251 * Guess if delayed acks are being used by the receiver.
253 * XXXDH: A simple heuristic that could be improved
255 if (!new_sacked_bytes) {
256 if (acked > tp->t_maxseg) {
258 (e_t->dlyack_rx < DLYACK_SMOOTH) ?
261 } else if (acked > txsi->len) {
264 (e_t->dlyack_rx < DLYACK_SMOOTH) ?
266 } else if (acked == tp->t_maxseg ||
267 acked == txsi->len) {
269 (e_t->dlyack_rx > 0) ? 1 : 0;
271 /* Otherwise leave dlyack_rx the way it was. */
275 * Time stamps are only to help match the txsi with the
276 * received acknowledgements.
278 if (e_t->timestamp_errors < MAX_TS_ERR &&
279 (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
281 * Note: All packets sent with the offload will
282 * have the same time stamp. If we are sending
283 * on a fast interface and the t_maxseg is much
284 * smaller than one tick, this will be fine. The
285 * time stamp would be the same whether we were
286 * using tso or not. However, if the interface
287 * is slow, this will cause problems with the
288 * calculations. If the interface is slow, there
289 * is no reason to be using tso, and it should
293 * If there are too many time stamp errors, time
294 * stamps won't be trusted
297 /* Before this packet. */
298 if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
299 /* When delayed acking is used, the
300 * reflected time stamp is of the first
301 * packet and thus may be before
305 if (TSTMP_GT(rts, txsi->tx_ts)) {
307 * If reflected time stamp is later than
308 * tx_ts, then this txsi is old.
310 if (txsi->flags & TXSI_RTT_MEASURE_START
312 marked_packet_rtt(txsi, e_t, tp,
313 &measurenext, &measurenext_len,
314 &rtt_bytes_adjust, OLD_TXSI);
316 TAILQ_REMOVE(&e_t->txsegi_q, txsi,
318 uma_zfree(txseginfo_zone, txsi);
319 txsi = TAILQ_FIRST(&e_t->txsegi_q);
322 if (rts == txsi->tx_ts &&
323 TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
325 * Segment received before sent!
326 * Something is wrong with the received
327 * timestamps so increment errors. If
328 * this keeps up we will ignore
331 e_t->timestamp_errors++;
335 * Acknowledging a sequence number before this txsi.
336 * If it is an old txsi that may have had the same seq
337 * numbers, it should have been removed if time stamps
340 if (SEQ_LEQ(ack, txsi->seq))
341 break; /* Before first packet in txsi. */
344 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
347 * If delayed acks are being used, an acknowledgement
348 * for a single segment will have been delayed by the
349 * receiver and will yield an inaccurate measurement. In
350 * this case, we only make the measurement if more than
351 * one segment is being acknowledged or sack is
352 * currently being used.
354 if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
355 /* Make an accurate new measurement. */
356 e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;
358 if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
359 e_t->minrtt = e_t->rtt;
361 if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
362 e_t->maxrtt = e_t->rtt;
365 if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
366 marked_packet_rtt(txsi, e_t, tp,
367 &measurenext, &measurenext_len,
368 &rtt_bytes_adjust, CORRECT_ACK);
370 if (txsi->flags & TXSI_TSO) {
371 if (txsi->len > acked) {
374 * This presumes ack for first bytes in
375 * txsi, this may not be true but it
376 * shouldn't cause problems for the
379 * We remeasure RTT even though we only
380 * have a single txsi. The rationale
381 * behind this is that it is better to
382 * have a slightly inaccurate
383 * measurement than no additional
384 * measurement for the rest of the bulk
385 * transfer. Since TSO is only used on
386 * high speed interface cards, the
387 * packets should be transmitted at line
388 * rate back to back with little
389 * difference in transmission times (in
394 * Reset txsi measure flag so we don't
395 * use it for another RTT measurement.
397 txsi->flags &= ~TXSI_RTT_MEASURE_START;
399 * There is still more data to be acked
400 * from tso bulk transmission, so we
401 * won't remove it from the TAILQ yet.
408 TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
409 uma_zfree(txseginfo_zone, txsi);
415 * We need to do a RTT measurement. It won't be the best
418 marked_packet_rtt(txsi, e_t, tp,
419 &measurenext, &measurenext_len,
420 &rtt_bytes_adjust, FORCED_MEASUREMENT);
428 * Add information about a transmitted segment to a list.
429 * This is called via the helper hook in tcp_output.c
432 ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata,
433 void *ctx_data, void *hdata, struct osd *hosd)
439 struct tcp_hhook_data *thdp;
440 struct txseginfo *txsi;
444 KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
445 KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
447 e_t = (struct ertt *)hdata;
455 INP_WLOCK_ASSERT(tp->t_inpcb);
458 txsi = uma_zalloc(txseginfo_zone, M_NOWAIT);
460 /* Construct txsi setting the necessary flags. */
461 txsi->flags = 0; /* Needs to be initialised. */
462 txsi->seq = ntohl(th->th_seq);
465 txsi->flags |= TXSI_TSO;
466 else if (e_t->flags & ERTT_TSO_DISABLED) {
467 tp->t_flags |= TF_TSO;
468 e_t->flags &= ~ERTT_TSO_DISABLED;
471 if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) {
472 e_t->bytes_tx_in_rtt += len;
474 txsi->flags |= TXSI_RTT_MEASURE_START;
475 e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS;
476 e_t->bytes_tx_in_rtt = len;
479 if (((tp->t_flags & TF_NOOPT) == 0) &&
480 (to->to_flags & TOF_TS)) {
481 txsi->tx_ts = ntohl(to->to_tsval) -
483 txsi->rx_ts = ntohl(to->to_tsecr);
485 txsi->tx_ts = tcp_ts_getticks();
486 txsi->rx_ts = 0; /* No received time stamp. */
488 TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
499 txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo),
500 NULL, NULL, NULL, NULL, 0, 0);
506 ertt_mod_destroy(void)
509 uma_zdestroy(txseginfo_zone);
515 ertt_uma_ctor(void *mem, int size, void *arg, int flags)
521 TAILQ_INIT(&e_t->txsegi_q);
522 e_t->timestamp_errors = 0;
528 e_t->bytes_tx_in_rtt = 0;
529 e_t->markedpkt_rtt = 0;
535 ertt_uma_dtor(void *mem, int size, void *arg)
538 struct txseginfo *n_txsi, *txsi;
541 txsi = TAILQ_FIRST(&e_t->txsegi_q);
542 while (txsi != NULL) {
543 n_txsi = TAILQ_NEXT(txsi, txsegi_lnk);
544 uma_zfree(txseginfo_zone, txsi);
549 KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt),
550 ertt_uma_ctor, ertt_uma_dtor);