]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/tcp_ecn.c
ssh: update to OpenSSH 9.1p1
[FreeBSD/FreeBSD.git] / sys / netinet / tcp_ecn.c
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
5  *      The Regents of the University of California.  All rights reserved.
6  * Copyright (c) 2007-2008,2010
7  *      Swinburne University of Technology, Melbourne, Australia.
8  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
9  * Copyright (c) 2010 The FreeBSD Foundation
10  * Copyright (c) 2010-2011 Juniper Networks, Inc.
11  * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com>
12  * All rights reserved.
13  *
14  * Portions of this software were developed at the Centre for Advanced Internet
15  * Architectures, Swinburne University of Technology, by Lawrence Stewart,
16  * James Healy and David Hayes, made possible in part by a grant from the Cisco
17  * University Research Program Fund at Community Foundation Silicon Valley.
18  *
19  * Portions of this software were developed at the Centre for Advanced
20  * Internet Architectures, Swinburne University of Technology, Melbourne,
21  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
22  *
23  * Portions of this software were developed by Robert N. M. Watson under
24  * contract to Juniper Networks, Inc.
25  *
26  * Redistribution and use in source and binary forms, with or without
27  * modification, are permitted provided that the following conditions
28  * are met:
29  * 1. Redistributions of source code must retain the above copyright
30  *    notice, this list of conditions and the following disclaimer.
31  * 2. Redistributions in binary form must reproduce the above copyright
32  *    notice, this list of conditions and the following disclaimer in the
33  *    documentation and/or other materials provided with the distribution.
34  * 3. Neither the name of the University nor the names of its contributors
35  *    may be used to endorse or promote products derived from this software
36  *    without specific prior written permission.
37  *
38  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
39  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
41  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
42  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
43  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
44  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
46  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
47  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48  * SUCH DAMAGE.
49  *
50  *      @(#)tcp_ecn.c 8.12 (Berkeley) 5/24/95
51  */
52
53 /*
54  * Utility functions to deal with Explicit Congestion Notification in TCP
55  * implementing the essential parts of the Accurate ECN extension
56  * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include "opt_inet.h"
63 #include "opt_inet6.h"
64 #include "opt_tcpdebug.h"
65
66 #include <sys/param.h>
67 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/sysctl.h>
70 #include <sys/malloc.h>
71 #include <sys/mbuf.h>
72 #include <sys/socket.h>
73 #include <sys/socketvar.h>
74
75 #include <machine/cpu.h>
76
77 #include <vm/uma.h>
78
79 #include <net/if.h>
80 #include <net/if_var.h>
81 #include <net/route.h>
82 #include <net/vnet.h>
83
84 #include <netinet/in.h>
85 #include <netinet/in_systm.h>
86 #include <netinet/ip.h>
87 #include <netinet/in_var.h>
88 #include <netinet/in_pcb.h>
89 #include <netinet/ip_var.h>
90 #include <netinet/ip6.h>
91 #include <netinet/icmp6.h>
92 #include <netinet6/nd6.h>
93 #include <netinet6/ip6_var.h>
94 #include <netinet6/in6_pcb.h>
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_fsm.h>
97 #include <netinet/tcp_seq.h>
98 #include <netinet/tcp_var.h>
99 #include <netinet/tcp_syncache.h>
100 #include <netinet/tcp_timer.h>
101 #include <netinet/tcpip.h>
102 #include <netinet/tcp_ecn.h>
103
104
105 /*
106  * Process incoming SYN,ACK packet
107  */
108 void
109 tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
110 {
111
112         if (V_tcp_do_ecn == 0)
113                 return;
114         if ((V_tcp_do_ecn == 1) ||
115             (V_tcp_do_ecn == 2)) {
116                 /* RFC3168 ECN handling */
117                 if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
118                         tp->t_flags2 |= TF2_ECN_PERMIT;
119                         TCPSTAT_INC(tcps_ecn_shs);
120                 }
121         } else
122         /* decoding Accurate ECN according to table in section 3.1.1 */
123         if ((V_tcp_do_ecn == 3) ||
124             (V_tcp_do_ecn == 4)) {
125                 /*
126                  * on the SYN,ACK, process the AccECN
127                  * flags indicating the state the SYN
128                  * was delivered.
129                  * Reactions to Path ECN mangling can
130                  * come here.
131                  */
132                 switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
133                 /* RFC3168 SYN */
134                 case (0|0|TH_ECE):
135                         tp->t_flags2 |= TF2_ECN_PERMIT;
136                         TCPSTAT_INC(tcps_ecn_shs);
137                         break;
138                 /* non-ECT SYN */
139                 case (0|TH_CWR|0):
140                         tp->t_flags2 |= TF2_ACE_PERMIT;
141                         tp->t_scep = 5;
142                         TCPSTAT_INC(tcps_ecn_shs);
143                         TCPSTAT_INC(tcps_ace_nect);
144                         break;
145                 /* ECT0 SYN */
146                 case (TH_AE|0|0):
147                         tp->t_flags2 |= TF2_ACE_PERMIT;
148                         tp->t_scep = 5;
149                         TCPSTAT_INC(tcps_ecn_shs);
150                         TCPSTAT_INC(tcps_ace_ect0);
151                         break;
152                 /* ECT1 SYN */
153                 case (0|TH_CWR|TH_ECE):
154                         tp->t_flags2 |= TF2_ACE_PERMIT;
155                         tp->t_scep = 5;
156                         TCPSTAT_INC(tcps_ecn_shs);
157                         TCPSTAT_INC(tcps_ace_ect1);
158                         break;
159                 /* CE SYN */
160                 case (TH_AE|TH_CWR|0):
161                         tp->t_flags2 |= TF2_ACE_PERMIT;
162                         tp->t_scep = 6;
163                         /*
164                          * reduce the IW to 2 MSS (to
165                          * account for delayed acks) if
166                          * the SYN,ACK was CE marked
167                          */
168                         tp->snd_cwnd = 2 * tcp_maxseg(tp);
169                         TCPSTAT_INC(tcps_ecn_shs);
170                         TCPSTAT_INC(tcps_ace_nect);
171                         break;
172                 default:
173                         break;
174                 }
175                 /*
176                  * Set the AccECN Codepoints on
177                  * the outgoing <ACK> to the ECN
178                  * state of the <SYN,ACK>
179                  * according to table 3 in the
180                  * AccECN draft
181                  */
182                 switch (iptos & IPTOS_ECN_MASK) {
183                 case (IPTOS_ECN_NOTECT):
184                         tp->t_rcep = 0b010;
185                         break;
186                 case (IPTOS_ECN_ECT0):
187                         tp->t_rcep = 0b100;
188                         break;
189                 case (IPTOS_ECN_ECT1):
190                         tp->t_rcep = 0b011;
191                         break;
192                 case (IPTOS_ECN_CE):
193                         tp->t_rcep = 0b110;
194                         break;
195                 }
196         }
197 }
198
199 /*
200  * Handle parallel SYN for ECN
201  */
202 void
203 tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos)
204 {
205         if (thflags & TH_ACK)
206                 return;
207         if (V_tcp_do_ecn == 0)
208                 return;
209         if ((V_tcp_do_ecn == 1) ||
210             (V_tcp_do_ecn == 2)) {
211                 /* RFC3168 ECN handling */
212                 if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
213                         tp->t_flags2 |= TF2_ECN_PERMIT;
214                         tp->t_flags2 |= TF2_ECN_SND_ECE;
215                         TCPSTAT_INC(tcps_ecn_shs);
216                 }
217         } else
218         if ((V_tcp_do_ecn == 3) ||
219             (V_tcp_do_ecn == 4)) {
220                 /* AccECN handling */
221                 switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
222                 default:
223                 case (0|0|0):
224                         break;
225                 case (0|TH_CWR|TH_ECE):
226                         tp->t_flags2 |= TF2_ECN_PERMIT;
227                         tp->t_flags2 |= TF2_ECN_SND_ECE;
228                         TCPSTAT_INC(tcps_ecn_shs);
229                         break;
230                 case (TH_AE|TH_CWR|TH_ECE):
231                         tp->t_flags2 |= TF2_ACE_PERMIT;
232                         TCPSTAT_INC(tcps_ecn_shs);
233                         /*
234                          * Set the AccECN Codepoints on
235                          * the outgoing <ACK> to the ECN
236                          * state of the <SYN,ACK>
237                          * according to table 3 in the
238                          * AccECN draft
239                          */
240                         switch (iptos & IPTOS_ECN_MASK) {
241                         case (IPTOS_ECN_NOTECT):
242                                 tp->t_rcep = 0b010;
243                                 break;
244                         case (IPTOS_ECN_ECT0):
245                                 tp->t_rcep = 0b100;
246                                 break;
247                         case (IPTOS_ECN_ECT1):
248                                 tp->t_rcep = 0b011;
249                                 break;
250                         case (IPTOS_ECN_CE):
251                                 tp->t_rcep = 0b110;
252                                 break;
253                         }
254                         break;
255                 }
256         }
257 }
258
259 /*
260  * TCP ECN processing.
261  */
262 int
263 tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int iptos)
264 {
265         int delta_ace = 0;
266
267         if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
268                 switch (iptos & IPTOS_ECN_MASK) {
269                 case IPTOS_ECN_CE:
270                         TCPSTAT_INC(tcps_ecn_ce);
271                         break;
272                 case IPTOS_ECN_ECT0:
273                         TCPSTAT_INC(tcps_ecn_ect0);
274                         break;
275                 case IPTOS_ECN_ECT1:
276                         TCPSTAT_INC(tcps_ecn_ect1);
277                         break;
278                 }
279
280                 if (tp->t_flags2 & TF2_ACE_PERMIT) {
281                         if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
282                                 tp->t_rcep += 1;
283                         if (tp->t_flags2 & TF2_ECN_PERMIT) {
284                                 delta_ace = (tcp_ecn_get_ace(thflags) + 8 -
285                                             (tp->t_scep & 0x07)) & 0x07;
286                                 tp->t_scep += delta_ace;
287                         } else {
288                                 /*
289                                  * process the final ACK of the 3WHS
290                                  * see table 3 in draft-ietf-tcpm-accurate-ecn
291                                  */
292                                 switch (tcp_ecn_get_ace(thflags)) {
293                                 case 0b010:
294                                         /* nonECT SYN or SYN,ACK */
295                                         /* Fallthrough */
296                                 case 0b011:
297                                         /* ECT1 SYN or SYN,ACK */
298                                         /* Fallthrough */
299                                 case 0b100:
300                                         /* ECT0 SYN or SYN,ACK */
301                                         tp->t_scep = 5;
302                                         break;
303                                 case 0b110:
304                                         /* CE SYN or SYN,ACK */
305                                         tp->t_scep = 6;
306                                         tp->snd_cwnd = 2 * tcp_maxseg(tp);
307                                         break;
308                                 default:
309                                         /* mangled AccECN handshake */
310                                         tp->t_scep = 5;
311                                         break;
312                                 }
313                                 tp->t_flags2 |= TF2_ECN_PERMIT;
314                         }
315                 } else {
316                         /* RFC3168 ECN handling */
317                         if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE)
318                                 delta_ace = 1;
319                         if (thflags & TH_CWR) {
320                                 tp->t_flags2 &= ~TF2_ECN_SND_ECE;
321                                 tp->t_flags |= TF_ACKNOW;
322                         }
323                         if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
324                                 tp->t_flags2 |= TF2_ECN_SND_ECE;
325                 }
326
327                 /* Process a packet differently from RFC3168. */
328                 cc_ecnpkt_handler_flags(tp, thflags, iptos);
329         }
330
331         return delta_ace;
332 }
333
334 /*
335  * Send ECN setup <SYN> packet header flags
336  */
337 uint16_t
338 tcp_ecn_output_syn_sent(struct tcpcb *tp)
339 {
340         uint16_t thflags = 0;
341
342         if (V_tcp_do_ecn == 0)
343                 return thflags;
344         if (V_tcp_do_ecn == 1) {
345                 /* Send a RFC3168 ECN setup <SYN> packet */
346                 if (tp->t_rxtshift >= 1) {
347                         if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
348                                 thflags = TH_ECE|TH_CWR;
349                 } else
350                         thflags = TH_ECE|TH_CWR;
351         } else
352         if (V_tcp_do_ecn == 3) {
353                 /* Send an Accurate ECN setup <SYN> packet */
354                 if (tp->t_rxtshift >= 1) {
355                         if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
356                                 thflags = TH_ECE|TH_CWR|TH_AE;
357                 } else
358                         thflags = TH_ECE|TH_CWR|TH_AE;
359         }
360
361         return thflags;
362 }
363
364 /*
365  * output processing of ECN feature
366  * returning IP ECN header codepoint
367  */
368 int
369 tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit)
370 {
371         int ipecn = IPTOS_ECN_NOTECT;
372         bool newdata;
373
374         /*
375          * If the peer has ECN, mark data packets with
376          * ECN capable transmission (ECT).
377          * Ignore pure control packets, retransmissions
378          * and window probes.
379          */
380         newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
381                     !rxmit &&
382                     !((tp->t_flags & TF_FORCEDATA) && len == 1));
383         /* RFC3168 ECN marking, only new data segments */
384         if (newdata) {
385                 ipecn = IPTOS_ECN_ECT0;
386                 TCPSTAT_INC(tcps_ecn_ect0);
387         }
388         /*
389          * Reply with proper ECN notifications.
390          */
391         if (tp->t_flags2 & TF2_ACE_PERMIT) {
392                 *thflags &= ~(TH_AE|TH_CWR|TH_ECE);
393                 if (tp->t_rcep & 0x01)
394                         *thflags |= TH_ECE;
395                 if (tp->t_rcep & 0x02)
396                         *thflags |= TH_CWR;
397                 if (tp->t_rcep & 0x04)
398                         *thflags |= TH_AE;
399                 if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
400                         /*
401                          * here we process the final
402                          * ACK of the 3WHS
403                          */
404                         if (tp->t_rcep == 0b110) {
405                                 tp->t_rcep = 6;
406                         } else {
407                                 tp->t_rcep = 5;
408                         }
409                         tp->t_flags2 |= TF2_ECN_PERMIT;
410                 }
411         } else {
412                 if (newdata &&
413                     (tp->t_flags2 & TF2_ECN_SND_CWR)) {
414                         *thflags |= TH_CWR;
415                         tp->t_flags2 &= ~TF2_ECN_SND_CWR;
416                 }
417                 if (tp->t_flags2 & TF2_ECN_SND_ECE)
418                         *thflags |= TH_ECE;
419         }
420
421         return ipecn;
422 }
423
424 /*
425  * Set up the ECN related tcpcb fields from
426  * a syncache entry
427  */
428 void
429 tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc)
430 {
431         if (sc->sc_flags & SCF_ECN_MASK) {
432                 switch (sc->sc_flags & SCF_ECN_MASK) {
433                 case SCF_ECN:
434                         tp->t_flags2 |= TF2_ECN_PERMIT;
435                         break;
436                 case SCF_ACE_N:
437                         /* Fallthrough */
438                 case SCF_ACE_0:
439                         /* Fallthrough */
440                 case SCF_ACE_1:
441                         tp->t_flags2 |= TF2_ACE_PERMIT;
442                         tp->t_scep = 5;
443                         tp->t_rcep = 5;
444                         break;
445                 case SCF_ACE_CE:
446                         tp->t_flags2 |= TF2_ACE_PERMIT;
447                         tp->t_scep = 6;
448                         tp->t_rcep = 6;
449                         break;
450                 /* undefined SCF codepoint */
451                 default:
452                         break;
453                 }
454         }
455 }
456
457 /*
458  * Process a <SYN> packets ECN information, and provide the
459  * syncache with the relevant information.
460  */
461 int
462 tcp_ecn_syncache_add(uint16_t thflags, int iptos)
463 {
464         int scflags = 0;
465
466         switch (thflags & (TH_AE|TH_CWR|TH_ECE)) {
467         /* no ECN */
468         case (0|0|0):
469                 break;
470         /* legacy ECN */
471         case (0|TH_CWR|TH_ECE):
472                 scflags = SCF_ECN;
473                 break;
474         /* Accurate ECN */
475         case (TH_AE|TH_CWR|TH_ECE):
476                 if ((V_tcp_do_ecn == 3) ||
477                     (V_tcp_do_ecn == 4)) {
478                         switch (iptos & IPTOS_ECN_MASK) {
479                         case IPTOS_ECN_CE:
480                                 scflags = SCF_ACE_CE;
481                                 break;
482                         case IPTOS_ECN_ECT0:
483                                 scflags = SCF_ACE_0;
484                                 break;
485                         case IPTOS_ECN_ECT1:
486                                 scflags = SCF_ACE_1;
487                                 break;
488                         case IPTOS_ECN_NOTECT:
489                                 scflags = SCF_ACE_N;
490                                 break;
491                         }
492                 } else
493                         scflags = SCF_ECN;
494                 break;
495         /* Default Case (section 3.1.2) */
496         default:
497                 if ((V_tcp_do_ecn == 3) ||
498                     (V_tcp_do_ecn == 4)) {
499                         switch (iptos & IPTOS_ECN_MASK) {
500                         case IPTOS_ECN_CE:
501                                 scflags = SCF_ACE_CE;
502                                 break;
503                         case IPTOS_ECN_ECT0:
504                                 scflags = SCF_ACE_0;
505                                 break;
506                         case IPTOS_ECN_ECT1:
507                                 scflags = SCF_ACE_1;
508                                 break;
509                         case IPTOS_ECN_NOTECT:
510                                 scflags = SCF_ACE_N;
511                                 break;
512                         }
513                 }
514                 break;
515         }
516         return scflags;
517 }
518
519 /*
520  * Set up the ECN information for the <SYN,ACK> from
521  * syncache information.
522  */
523 uint16_t
524 tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc)
525 {
526         if ((thflags & TH_SYN) &&
527             (sc->sc_flags & SCF_ECN_MASK)) {
528                 switch (sc->sc_flags & SCF_ECN_MASK) {
529                 case SCF_ECN:
530                         thflags |= (0 | 0 | TH_ECE);
531                         TCPSTAT_INC(tcps_ecn_shs);
532                         break;
533                 case SCF_ACE_N:
534                         thflags |= (0 | TH_CWR | 0);
535                         TCPSTAT_INC(tcps_ecn_shs);
536                         TCPSTAT_INC(tcps_ace_nect);
537                         break;
538                 case SCF_ACE_0:
539                         thflags |= (TH_AE | 0 | 0);
540                         TCPSTAT_INC(tcps_ecn_shs);
541                         TCPSTAT_INC(tcps_ace_ect0);
542                         break;
543                 case SCF_ACE_1:
544                         thflags |= (0 | TH_ECE | TH_CWR);
545                         TCPSTAT_INC(tcps_ecn_shs);
546                         TCPSTAT_INC(tcps_ace_ect1);
547                         break;
548                 case SCF_ACE_CE:
549                         thflags |= (TH_AE | TH_CWR | 0);
550                         TCPSTAT_INC(tcps_ecn_shs);
551                         TCPSTAT_INC(tcps_ace_ce);
552                         break;
553                 /* undefined SCF codepoint */
554                 default:
555                         break;
556                 }
557         }
558         return thflags;
559 }
560
561 int
562 tcp_ecn_get_ace(uint16_t thflags)
563 {
564         int ace = 0;
565
566         if (thflags & TH_ECE)
567                 ace += 1;
568         if (thflags & TH_CWR)
569                 ace += 2;
570         if (thflags & TH_AE)
571                 ace += 4;
572         return ace;
573 }