2 * SPDX-License-Identifier: BSD-3-Clause
4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
5 * The Regents of the University of California. All rights reserved.
6 * Copyright (c) 2007-2008,2010
7 * Swinburne University of Technology, Melbourne, Australia.
8 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
9 * Copyright (c) 2010 The FreeBSD Foundation
10 * Copyright (c) 2010-2011 Juniper Networks, Inc.
11 * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com>
12 * All rights reserved.
14 * Portions of this software were developed at the Centre for Advanced Internet
15 * Architectures, Swinburne University of Technology, by Lawrence Stewart,
16 * James Healy and David Hayes, made possible in part by a grant from the Cisco
17 * University Research Program Fund at Community Foundation Silicon Valley.
19 * Portions of this software were developed at the Centre for Advanced
20 * Internet Architectures, Swinburne University of Technology, Melbourne,
21 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
23 * Portions of this software were developed by Robert N. M. Watson under
24 * contract to Juniper Networks, Inc.
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
29 * 1. Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * 2. Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in the
33 * documentation and/or other materials provided with the distribution.
34 * 3. Neither the name of the University nor the names of its contributors
35 * may be used to endorse or promote products derived from this software
36 * without specific prior written permission.
38 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
39 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
41 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
42 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
43 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
44 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
46 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
47 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * @(#)tcp_ecn.c 8.12 (Berkeley) 5/24/95
54 * Utility functions to deal with Explicit Congestion Notification in TCP
55 * implementing the essential parts of the Accurate ECN extension
56 * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
63 #include "opt_inet6.h"
64 #include "opt_tcpdebug.h"
66 #include <sys/param.h>
67 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/sysctl.h>
70 #include <sys/malloc.h>
72 #include <sys/socket.h>
73 #include <sys/socketvar.h>
75 #include <machine/cpu.h>
80 #include <net/if_var.h>
81 #include <net/route.h>
84 #include <netinet/in.h>
85 #include <netinet/in_systm.h>
86 #include <netinet/ip.h>
87 #include <netinet/in_var.h>
88 #include <netinet/in_pcb.h>
89 #include <netinet/ip_var.h>
90 #include <netinet/ip6.h>
91 #include <netinet/icmp6.h>
92 #include <netinet6/nd6.h>
93 #include <netinet6/ip6_var.h>
94 #include <netinet6/in6_pcb.h>
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_fsm.h>
97 #include <netinet/tcp_seq.h>
98 #include <netinet/tcp_var.h>
99 #include <netinet/tcp_syncache.h>
100 #include <netinet/tcp_timer.h>
101 #include <netinet/tcpip.h>
102 #include <netinet/tcp_ecn.h>
106 * Process incoming SYN,ACK packet
/*
 * tcp_ecn_input_syn_sent: evaluate the ECN-related bits of a received
 * SYN,ACK and record the negotiated mode in tp->t_flags2.
 *
 *   tp      - connection control block; t_flags2 is updated and, in one
 *             branch, snd_cwnd is reduced
 *   thflags - TCP header flags; TH_AE, TH_CWR and TH_ECE are examined
 *   iptos   - IP TOS byte; only the IPTOS_ECN_MASK bits are examined
 *
 * V_tcp_do_ecn selects the path: 1 or 2 takes the RFC3168 branch, 3 or 4
 * the Accurate ECN (AccECN) branch.
 *
 * NOTE(review): several short lines of this function (return type, braces,
 * some case labels and statements) are not visible in this view; the
 * comments below describe only what is shown.
 */
109 tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
/* ECN switched off: the guarded statement is not visible (presumably an early return - confirm). */
112 if (V_tcp_do_ecn == 0)
114 if ((V_tcp_do_ecn == 1) ||
115 (V_tcp_do_ecn == 2)) {
116 /* RFC3168 ECN handling */
/* ECE set with CWR clear: enable RFC3168 ECN, disable AccECN, bump tcps_ecn_shs. */
117 if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
118 tp->t_flags2 |= TF2_ECN_PERMIT;
119 tp->t_flags2 &= ~TF2_ACE_PERMIT;
120 TCPSTAT_INC(tcps_ecn_shs);
123 /* decoding Accurate ECN according to table in section 3.1.1 */
124 if ((V_tcp_do_ecn == 3) ||
125 (V_tcp_do_ecn == 4)) {
127 * on the SYN,ACK, process the AccECN
128 * flags indicating the state the SYN
130 * Reactions to Path ECN mangling can
/* Dispatch on the combination of the AE, CWR and ECE header bits. */
133 switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
/* Case label not visible: this branch falls back to RFC3168 ECN and clears AccECN. */
136 tp->t_flags2 |= TF2_ECN_PERMIT;
137 tp->t_flags2 &= ~TF2_ACE_PERMIT;
138 TCPSTAT_INC(tcps_ecn_shs);
/* Case label not visible: AccECN is enabled, RFC3168 ECN cleared; counted as tcps_ace_nect. */
142 tp->t_flags2 |= TF2_ACE_PERMIT;
143 tp->t_flags2 &= ~TF2_ECN_PERMIT;
145 TCPSTAT_INC(tcps_ecn_shs);
146 TCPSTAT_INC(tcps_ace_nect);
/* Case label not visible: AccECN is enabled; counted as tcps_ace_ect0. */
150 tp->t_flags2 |= TF2_ACE_PERMIT;
151 tp->t_flags2 &= ~TF2_ECN_PERMIT;
153 TCPSTAT_INC(tcps_ecn_shs);
154 TCPSTAT_INC(tcps_ace_ect0);
/* CWR and ECE set, AE clear: AccECN is enabled; counted as tcps_ace_ect1. */
157 case (0|TH_CWR|TH_ECE):
158 tp->t_flags2 |= TF2_ACE_PERMIT;
159 tp->t_flags2 &= ~TF2_ECN_PERMIT;
161 TCPSTAT_INC(tcps_ecn_shs);
162 TCPSTAT_INC(tcps_ace_ect1);
/* AE and CWR set, ECE clear: AccECN is enabled and the congestion window is cut to two segments. */
165 case (TH_AE|TH_CWR|0):
166 tp->t_flags2 |= TF2_ACE_PERMIT;
167 tp->t_flags2 &= ~TF2_ECN_PERMIT;
170 * reduce the IW to 2 MSS (to
171 * account for delayed acks) if
172 * the SYN,ACK was CE marked
174 tp->snd_cwnd = 2 * tcp_maxseg(tp);
175 TCPSTAT_INC(tcps_ecn_shs);
/*
 * NOTE(review): the comment in this branch speaks of a CE marked SYN,ACK,
 * yet the counter bumped is tcps_ace_nect, the same one used by the first
 * AccECN branch above; tcp_ecn_syncache_respond pairs TH_AE|TH_CWR with
 * tcps_ace_ce. Confirm whether tcps_ace_ce was intended here.
 */
176 TCPSTAT_INC(tcps_ace_nect);
/* Label not visible (presumably the default case): neither ECN nor AccECN is used. */
179 tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
183 * Set the AccECN Codepoints on
184 * the outgoing <ACK> to the ECN
185 * state of the <SYN,ACK>
186 * according to table 3 in the
/* Dispatch on the IP ECN codepoint of the segment; the per-case statements are not visible in this view. */
189 switch (iptos & IPTOS_ECN_MASK) {
190 case (IPTOS_ECN_NOTECT):
193 case (IPTOS_ECN_ECT0):
196 case (IPTOS_ECN_ECT1):
207 * Handle parallel SYN for ECN
/*
 * tcp_ecn_input_parallel_syn: ECN negotiation for a SYN received on a
 * connection that is itself in the middle of opening (the header comment
 * calls this a parallel SYN).
 *
 *   tp      - connection control block; t_flags2 is updated
 *   thflags - TCP header flags; TH_ACK, TH_AE, TH_CWR and TH_ECE are examined
 *   iptos   - IP TOS byte; only the IPTOS_ECN_MASK bits are examined
 *
 * V_tcp_do_ecn selects the path: 1 or 2 takes the RFC3168 branch, 3 or 4
 * the AccECN branch.
 *
 * NOTE(review): several short lines (return type, braces, some case labels
 * and statements) are not visible in this view.
 */
210 tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos)
/* Segments carrying ACK are singled out first; the guarded statement is not visible (presumably an early return - confirm). */
212 if (thflags & TH_ACK)
/* ECN switched off: guarded statement not visible (presumably an early return - confirm). */
214 if (V_tcp_do_ecn == 0)
216 if ((V_tcp_do_ecn == 1) ||
217 (V_tcp_do_ecn == 2)) {
218 /* RFC3168 ECN handling */
/* Both CWR and ECE set: enable RFC3168 ECN, clear AccECN, and flag that ECE is to be sent. */
219 if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
220 tp->t_flags2 |= TF2_ECN_PERMIT;
221 tp->t_flags2 &= ~TF2_ACE_PERMIT;
222 tp->t_flags2 |= TF2_ECN_SND_ECE;
223 TCPSTAT_INC(tcps_ecn_shs);
226 if ((V_tcp_do_ecn == 3) ||
227 (V_tcp_do_ecn == 4)) {
228 /* AccECN handling */
/* Dispatch on the combination of the AE, CWR and ECE header bits. */
229 switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
/* Case label not visible: this branch disables both RFC3168 ECN and AccECN. */
232 tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
/* CWR and ECE without AE: same outcome as the RFC3168 branch above. */
234 case (0|TH_CWR|TH_ECE):
235 tp->t_flags2 |= TF2_ECN_PERMIT;
236 tp->t_flags2 &= ~TF2_ACE_PERMIT;
237 tp->t_flags2 |= TF2_ECN_SND_ECE;
238 TCPSTAT_INC(tcps_ecn_shs);
/* All three bits set: enable AccECN and clear RFC3168 ECN. */
240 case (TH_AE|TH_CWR|TH_ECE):
241 tp->t_flags2 |= TF2_ACE_PERMIT;
242 tp->t_flags2 &= ~TF2_ECN_PERMIT;
243 TCPSTAT_INC(tcps_ecn_shs);
245 * Set the AccECN Codepoints on
246 * the outgoing <ACK> to the ECN
247 * state of the <SYN,ACK>
248 * according to table 3 in the
/* Dispatch on the IP ECN codepoint; the per-case statements are not visible in this view. */
251 switch (iptos & IPTOS_ECN_MASK) {
252 case (IPTOS_ECN_NOTECT):
255 case (IPTOS_ECN_ECT0):
258 case (IPTOS_ECN_ECT1):
271 * TCP ECN processing.
/*
 * tcp_ecn_input_segment: per-segment ECN input processing for a connection
 * that has RFC3168 ECN or AccECN enabled.
 *
 *   tp      - connection control block; t_flags2, t_flags, t_scep and
 *             snd_cwnd may be updated
 *   thflags - TCP header flags of the received segment
 *   iptos   - IP TOS byte; only the IPTOS_ECN_MASK bits are examined
 *
 * NOTE(review): the return type, the declaration and final use of
 * delta_ace, the else keywords joining the branches, and several case
 * labels are not visible in this view.
 */
274 tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int iptos)
/* Nothing below runs unless one of the two ECN modes was negotiated. */
278 if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
/* Per-codepoint receive statistics; the case labels are not visible. */
279 switch (iptos & IPTOS_ECN_MASK) {
281 TCPSTAT_INC(tcps_ecn_ce);
284 TCPSTAT_INC(tcps_ecn_ect0);
287 TCPSTAT_INC(tcps_ecn_ect1);
/* AccECN path. */
291 if (tp->t_flags2 & TF2_ACE_PERMIT) {
/* CE marked segment: the guarded statement is not visible in this view. */
292 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
/*
 * With TF2_ECN_PERMIT also set: delta_ace is the difference, modulo 8,
 * between the ACE value extracted from thflags and the low three bits of
 * tp->t_scep; t_scep is then advanced by that amount.
 */
294 if (tp->t_flags2 & TF2_ECN_PERMIT) {
295 delta_ace = (tcp_ecn_get_ace(thflags) + 8 -
296 (tp->t_scep & 0x07)) & 0x07;
297 tp->t_scep += delta_ace;
300 * process the final ACK of the 3WHS
301 * see table 3 in draft-ietf-tcpm-accurate-ecn
/* Dispatch on the ACE value of the segment; labels and most case bodies are not visible. */
303 switch (tcp_ecn_get_ace(thflags)) {
305 /* nonECT SYN or SYN,ACK */
308 /* ECT1 SYN or SYN,ACK */
311 /* ECT0 SYN or SYN,ACK */
315 /* CE SYN or SYN,ACK */
/* In the CE branch the congestion window is cut to two segments. */
317 tp->snd_cwnd = 2 * tcp_maxseg(tp);
320 /* mangled AccECN handshake */
/* After this handshake step TF2_ECN_PERMIT is set alongside TF2_ACE_PERMIT. */
324 tp->t_flags2 |= TF2_ECN_PERMIT;
327 /* RFC3168 ECN handling */
/* ECE on a non-SYN segment: the guarded statement is not visible in this view. */
328 if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE)
/* CWR received: stop sending ECE and request an immediate ACK. */
330 if (thflags & TH_CWR) {
331 tp->t_flags2 &= ~TF2_ECN_SND_ECE;
332 tp->t_flags |= TF_ACKNOW;
/* CE marked segment: flag that ECE is to be sent. */
334 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
335 tp->t_flags2 |= TF2_ECN_SND_ECE;
338 /* Process a packet differently from RFC3168. */
339 cc_ecnpkt_handler_flags(tp, thflags, iptos);
346 * Send ECN setup <SYN> packet header flags
/*
 * tcp_ecn_output_syn_sent: choose the ECN setup flags for an outgoing SYN.
 *
 *   tp - connection control block; t_rxtshift is read
 *
 * thflags starts at 0 and is filled in according to V_tcp_do_ecn:
 * 1 selects TH_ECE|TH_CWR, 3 selects TH_ECE|TH_CWR|TH_AE.
 *
 * NOTE(review): the return type, the return statements and the else
 * keywords are not visible in this view; presumably thflags is the return
 * value - confirm.
 */
349 tcp_ecn_output_syn_sent(struct tcpcb *tp)
351 uint16_t thflags = 0;
/* ECN switched off: guarded statement not visible (presumably returns the zero flags - confirm). */
353 if (V_tcp_do_ecn == 0)
355 if (V_tcp_do_ecn == 1) {
356 /* Send a RFC3168 ECN setup <SYN> packet */
/* When t_rxtshift is at least 1, the flags are set only while it does not exceed V_tcp_ecn_maxretries. */
357 if (tp->t_rxtshift >= 1) {
358 if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
359 thflags = TH_ECE|TH_CWR;
/* Presumably the else branch (t_rxtshift == 0); the joining else is not visible. */
361 thflags = TH_ECE|TH_CWR;
363 if (V_tcp_do_ecn == 3) {
364 /* Send an Accurate ECN setup <SYN> packet */
/* Same retry limit as above, with TH_AE added to the flag set. */
365 if (tp->t_rxtshift >= 1) {
366 if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
367 thflags = TH_ECE|TH_CWR|TH_AE;
/* Presumably the else branch (t_rxtshift == 0); the joining else is not visible. */
369 thflags = TH_ECE|TH_CWR|TH_AE;
376 * output processing of ECN feature
377 * returning IP ECN header codepoint
/*
 * tcp_ecn_output_established: ECN output processing for a segment about to
 * be sent; per the header comment it yields the IP ECN codepoint to use.
 *
 *   tp      - connection control block; t_flags2 may be updated
 *   thflags - in/out TCP header flags; in the AccECN path TH_AE, TH_CWR
 *             and TH_ECE are cleared before being rebuilt
 *   len     - payload length of the segment
 *   rxmit   - retransmission indicator (its use is on a line not visible
 *             in this view)
 *
 * NOTE(review): the return type, the declaration of newdata, part of the
 * newdata expression, and the statements guarded by several tests below
 * are not visible in this view.
 */
380 tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit)
/* Default codepoint: not ECN capable. */
382 int ipecn = IPTOS_ECN_NOTECT;
386 * If the peer has ECN, mark data packets with
387 * ECN capable transmission (ECT).
388 * Ignore pure control packets, retransmissions
/*
 * newdata requires a non-empty payload, snd_nxt at or beyond snd_max, and
 * excludes a one byte segment sent with TF_FORCEDATA set; one term of the
 * expression is on a line not visible here.
 */
391 newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
393 !((tp->t_flags & TF_FORCEDATA) && len == 1));
394 /* RFC3168 ECN marking, only new data segments */
/* The guarding condition is not visible (presumably newdata - confirm). */
396 ipecn = IPTOS_ECN_ECT0;
397 TCPSTAT_INC(tcps_ecn_ect0);
400 * Reply with proper ECN notifications.
/* AccECN path: clear the three flag bits, then test bits 0, 1 and 2 of t_rcep (guarded statements not visible). */
402 if (tp->t_flags2 & TF2_ACE_PERMIT) {
403 *thflags &= ~(TH_AE|TH_CWR|TH_ECE);
404 if (tp->t_rcep & 0x01)
406 if (tp->t_rcep & 0x02)
408 if (tp->t_rcep & 0x04)
/* AccECN without TF2_ECN_PERMIT yet: special handling, after which TF2_ECN_PERMIT is set. */
410 if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
412 * here we process the final
415 if (tp->t_rcep == 0b110) {
420 tp->t_flags2 |= TF2_ECN_PERMIT;
/* Pending CWR indication is consumed here; the first half of this condition is not visible. */
424 (tp->t_flags2 & TF2_ECN_SND_CWR)) {
426 tp->t_flags2 &= ~TF2_ECN_SND_CWR;
/* Pending ECE indication: guarded statement not visible in this view. */
428 if (tp->t_flags2 & TF2_ECN_SND_ECE)
436 * Set up the ECN related tcpcb fields from
/*
 * tcp_ecn_syncache_socket: copy the ECN mode recorded in a syncache entry
 * into the connection control block.
 *
 *   tp - connection control block; t_flags2 gains TF2_ECN_PERMIT or
 *        TF2_ACE_PERMIT
 *   sc - syncache entry; only sc_flags & SCF_ECN_MASK is read
 *
 * NOTE(review): the return type, the case labels and any further
 * statements inside the cases are not visible in this view.
 */
440 tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc)
/* Nothing is done when no ECN bits are recorded in the entry. */
442 if (sc->sc_flags & SCF_ECN_MASK) {
443 switch (sc->sc_flags & SCF_ECN_MASK) {
/* One case (label not visible) enables RFC3168 ECN. */
445 tp->t_flags2 |= TF2_ECN_PERMIT;
/* Two visible cases (labels not visible) enable AccECN. */
452 tp->t_flags2 |= TF2_ACE_PERMIT;
457 tp->t_flags2 |= TF2_ACE_PERMIT;
461 /* undefined SCF codepoint */
469 * Process a <SYN> packet's ECN information, and provide the
470 * syncache with the relevant information.
/*
 * tcp_ecn_syncache_add: derive the syncache ECN flags (scflags) from the
 * header flags and IP ECN codepoint of a received SYN.
 *
 *   thflags - TCP header flags; TH_AE, TH_CWR and TH_ECE are examined
 *   iptos   - IP TOS byte; only the IPTOS_ECN_MASK bits are examined
 *
 * NOTE(review): the return type, the declaration of scflags, most case
 * labels and most assignments are not visible in this view; presumably
 * scflags is the return value - confirm.
 */
473 tcp_ecn_syncache_add(uint16_t thflags, int iptos)
/* Dispatch on the combination of the AE, CWR and ECE header bits. */
477 switch (thflags & (TH_AE|TH_CWR|TH_ECE)) {
/* CWR and ECE without AE: the case body is not visible in this view. */
482 case (0|TH_CWR|TH_ECE):
/* All three bits set: when V_tcp_do_ecn is 3 or 4 the result depends on the IP ECN codepoint. */
486 case (TH_AE|TH_CWR|TH_ECE):
487 if ((V_tcp_do_ecn == 3) ||
488 (V_tcp_do_ecn == 4)) {
489 switch (iptos & IPTOS_ECN_MASK) {
/* Label not visible (presumably the CE codepoint - confirm). */
491 scflags = SCF_ACE_CE;
499 case IPTOS_ECN_NOTECT:
506 /* Default Case (section 3.1.2) */
/* The default branch repeats the same codepoint dispatch under the same V_tcp_do_ecn test. */
508 if ((V_tcp_do_ecn == 3) ||
509 (V_tcp_do_ecn == 4)) {
510 switch (iptos & IPTOS_ECN_MASK) {
/* Label not visible (presumably the CE codepoint - confirm). */
512 scflags = SCF_ACE_CE;
520 case IPTOS_ECN_NOTECT:
531 * Set up the ECN information for the <SYN,ACK> from
532 * syncache information.
/*
 * tcp_ecn_syncache_respond: add the ECN setup bits for an outgoing SYN,ACK
 * to thflags, based on the ECN state stored in the syncache entry.
 *
 *   thflags - TCP header flags being built; modified only when TH_SYN is set
 *   sc      - syncache entry; only sc_flags & SCF_ECN_MASK is read
 *
 * Each visible case ORs one flag combination into thflags and bumps the
 * matching statistics counters.
 *
 * NOTE(review): the return type, the case labels and the return statement
 * are not visible in this view; presumably the updated thflags is
 * returned - confirm.
 */
535 tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc)
/* Only SYN segments of entries with recorded ECN state are touched. */
537 if ((thflags & TH_SYN) &&
538 (sc->sc_flags & SCF_ECN_MASK)) {
539 switch (sc->sc_flags & SCF_ECN_MASK) {
/* ECE only. */
541 thflags |= (0 | 0 | TH_ECE);
542 TCPSTAT_INC(tcps_ecn_shs);
/* CWR only, counted as tcps_ace_nect. */
545 thflags |= (0 | TH_CWR | 0);
546 TCPSTAT_INC(tcps_ecn_shs);
547 TCPSTAT_INC(tcps_ace_nect);
/* AE only, counted as tcps_ace_ect0. */
550 thflags |= (TH_AE | 0 | 0);
551 TCPSTAT_INC(tcps_ecn_shs);
552 TCPSTAT_INC(tcps_ace_ect0);
/* ECE and CWR, counted as tcps_ace_ect1. */
555 thflags |= (0 | TH_ECE | TH_CWR);
556 TCPSTAT_INC(tcps_ecn_shs);
557 TCPSTAT_INC(tcps_ace_ect1);
/* AE and CWR, counted as tcps_ace_ce. */
560 thflags |= (TH_AE | TH_CWR | 0);
561 TCPSTAT_INC(tcps_ecn_shs);
562 TCPSTAT_INC(tcps_ace_ce);
564 /* undefined SCF codepoint */
573 tcp_ecn_get_ace(uint16_t thflags)
577 if (thflags & TH_ECE)
579 if (thflags & TH_CWR)