]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/tcp_ecn.c
Merge commit '93bf91b4012a28610672d2266366dfa0a663b70f' into HEAD
[FreeBSD/FreeBSD.git] / sys / netinet / tcp_ecn.c
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
5  *      The Regents of the University of California.  All rights reserved.
6  * Copyright (c) 2007-2008,2010
7  *      Swinburne University of Technology, Melbourne, Australia.
8  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
9  * Copyright (c) 2010 The FreeBSD Foundation
10  * Copyright (c) 2010-2011 Juniper Networks, Inc.
11  * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com>
12  * All rights reserved.
13  *
14  * Portions of this software were developed at the Centre for Advanced Internet
15  * Architectures, Swinburne University of Technology, by Lawrence Stewart,
16  * James Healy and David Hayes, made possible in part by a grant from the Cisco
17  * University Research Program Fund at Community Foundation Silicon Valley.
18  *
19  * Portions of this software were developed at the Centre for Advanced
20  * Internet Architectures, Swinburne University of Technology, Melbourne,
21  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
22  *
23  * Portions of this software were developed by Robert N. M. Watson under
24  * contract to Juniper Networks, Inc.
25  *
26  * Redistribution and use in source and binary forms, with or without
27  * modification, are permitted provided that the following conditions
28  * are met:
29  * 1. Redistributions of source code must retain the above copyright
30  *    notice, this list of conditions and the following disclaimer.
31  * 2. Redistributions in binary form must reproduce the above copyright
32  *    notice, this list of conditions and the following disclaimer in the
33  *    documentation and/or other materials provided with the distribution.
34  * 3. Neither the name of the University nor the names of its contributors
35  *    may be used to endorse or promote products derived from this software
36  *    without specific prior written permission.
37  *
38  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
39  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
41  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
42  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
43  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
44  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
46  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
47  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48  * SUCH DAMAGE.
49  *
50  *      @(#)tcp_ecn.c 8.12 (Berkeley) 5/24/95
51  */
52
53 /*
54  * Utility functions to deal with Explicit Congestion Notification in TCP
55  * implementing the essential parts of the Accurate ECN extension
56  * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include "opt_inet.h"
63 #include "opt_inet6.h"
64 #include "opt_tcpdebug.h"
65
66 #include <sys/param.h>
67 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/sysctl.h>
70 #include <sys/malloc.h>
71 #include <sys/mbuf.h>
72 #include <sys/socket.h>
73 #include <sys/socketvar.h>
74
75 #include <machine/cpu.h>
76
77 #include <vm/uma.h>
78
79 #include <net/if.h>
80 #include <net/if_var.h>
81 #include <net/route.h>
82 #include <net/vnet.h>
83
84 #include <netinet/in.h>
85 #include <netinet/in_systm.h>
86 #include <netinet/ip.h>
87 #include <netinet/in_var.h>
88 #include <netinet/in_pcb.h>
89 #include <netinet/ip_var.h>
90 #include <netinet/ip6.h>
91 #include <netinet/icmp6.h>
92 #include <netinet6/nd6.h>
93 #include <netinet6/ip6_var.h>
94 #include <netinet6/in6_pcb.h>
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_fsm.h>
97 #include <netinet/tcp_seq.h>
98 #include <netinet/tcp_var.h>
99 #include <netinet/tcp_syncache.h>
100 #include <netinet/tcp_timer.h>
101 #include <netinet/tcpip.h>
102 #include <netinet/tcp_ecn.h>
103
104
105 /*
106  * Process incoming SYN,ACK packet
107  */
108 void
109 tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
110 {
111
112         if (V_tcp_do_ecn == 0)
113                 return;
114         if ((V_tcp_do_ecn == 1) ||
115             (V_tcp_do_ecn == 2)) {
116                 /* RFC3168 ECN handling */
117                 if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
118                         tp->t_flags2 |= TF2_ECN_PERMIT;
119                         tp->t_flags2 &= ~TF2_ACE_PERMIT;
120                         TCPSTAT_INC(tcps_ecn_shs);
121                 }
122         } else
123         /* decoding Accurate ECN according to table in section 3.1.1 */
124         if ((V_tcp_do_ecn == 3) ||
125             (V_tcp_do_ecn == 4)) {
126                 /*
127                  * on the SYN,ACK, process the AccECN
128                  * flags indicating the state the SYN
129                  * was delivered.
130                  * Reactions to Path ECN mangling can
131                  * come here.
132                  */
133                 switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
134                 /* RFC3168 SYN */
135                 case (0|0|TH_ECE):
136                         tp->t_flags2 |= TF2_ECN_PERMIT;
137                         tp->t_flags2 &= ~TF2_ACE_PERMIT;
138                         TCPSTAT_INC(tcps_ecn_shs);
139                         break;
140                 /* non-ECT SYN */
141                 case (0|TH_CWR|0):
142                         tp->t_flags2 |= TF2_ACE_PERMIT;
143                         tp->t_flags2 &= ~TF2_ECN_PERMIT;
144                         tp->t_scep = 5;
145                         TCPSTAT_INC(tcps_ecn_shs);
146                         TCPSTAT_INC(tcps_ace_nect);
147                         break;
148                 /* ECT0 SYN */
149                 case (TH_AE|0|0):
150                         tp->t_flags2 |= TF2_ACE_PERMIT;
151                         tp->t_flags2 &= ~TF2_ECN_PERMIT;
152                         tp->t_scep = 5;
153                         TCPSTAT_INC(tcps_ecn_shs);
154                         TCPSTAT_INC(tcps_ace_ect0);
155                         break;
156                 /* ECT1 SYN */
157                 case (0|TH_CWR|TH_ECE):
158                         tp->t_flags2 |= TF2_ACE_PERMIT;
159                         tp->t_flags2 &= ~TF2_ECN_PERMIT;
160                         tp->t_scep = 5;
161                         TCPSTAT_INC(tcps_ecn_shs);
162                         TCPSTAT_INC(tcps_ace_ect1);
163                         break;
164                 /* CE SYN */
165                 case (TH_AE|TH_CWR|0):
166                         tp->t_flags2 |= TF2_ACE_PERMIT;
167                         tp->t_flags2 &= ~TF2_ECN_PERMIT;
168                         tp->t_scep = 6;
169                         /*
170                          * reduce the IW to 2 MSS (to
171                          * account for delayed acks) if
172                          * the SYN,ACK was CE marked
173                          */
174                         tp->snd_cwnd = 2 * tcp_maxseg(tp);
175                         TCPSTAT_INC(tcps_ecn_shs);
176                         TCPSTAT_INC(tcps_ace_nect);
177                         break;
178                 default:
179                         tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
180                         break;
181                 }
182                 /*
183                  * Set the AccECN Codepoints on
184                  * the outgoing <ACK> to the ECN
185                  * state of the <SYN,ACK>
186                  * according to table 3 in the
187                  * AccECN draft
188                  */
189                 switch (iptos & IPTOS_ECN_MASK) {
190                 case (IPTOS_ECN_NOTECT):
191                         tp->t_rcep = 0b010;
192                         break;
193                 case (IPTOS_ECN_ECT0):
194                         tp->t_rcep = 0b100;
195                         break;
196                 case (IPTOS_ECN_ECT1):
197                         tp->t_rcep = 0b011;
198                         break;
199                 case (IPTOS_ECN_CE):
200                         tp->t_rcep = 0b110;
201                         break;
202                 }
203         }
204 }
205
206 /*
207  * Handle parallel SYN for ECN
208  */
209 void
210 tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos)
211 {
212         if (thflags & TH_ACK)
213                 return;
214         if (V_tcp_do_ecn == 0)
215                 return;
216         if ((V_tcp_do_ecn == 1) ||
217             (V_tcp_do_ecn == 2)) {
218                 /* RFC3168 ECN handling */
219                 if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
220                         tp->t_flags2 |= TF2_ECN_PERMIT;
221                         tp->t_flags2 &= ~TF2_ACE_PERMIT;
222                         tp->t_flags2 |= TF2_ECN_SND_ECE;
223                         TCPSTAT_INC(tcps_ecn_shs);
224                 }
225         } else
226         if ((V_tcp_do_ecn == 3) ||
227             (V_tcp_do_ecn == 4)) {
228                 /* AccECN handling */
229                 switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
230                 default:
231                 case (0|0|0):
232                         tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
233                         break;
234                 case (0|TH_CWR|TH_ECE):
235                         tp->t_flags2 |= TF2_ECN_PERMIT;
236                         tp->t_flags2 &= ~TF2_ACE_PERMIT;
237                         tp->t_flags2 |= TF2_ECN_SND_ECE;
238                         TCPSTAT_INC(tcps_ecn_shs);
239                         break;
240                 case (TH_AE|TH_CWR|TH_ECE):
241                         tp->t_flags2 |= TF2_ACE_PERMIT;
242                         tp->t_flags2 &= ~TF2_ECN_PERMIT;
243                         TCPSTAT_INC(tcps_ecn_shs);
244                         /*
245                          * Set the AccECN Codepoints on
246                          * the outgoing <ACK> to the ECN
247                          * state of the <SYN,ACK>
248                          * according to table 3 in the
249                          * AccECN draft
250                          */
251                         switch (iptos & IPTOS_ECN_MASK) {
252                         case (IPTOS_ECN_NOTECT):
253                                 tp->t_rcep = 0b010;
254                                 break;
255                         case (IPTOS_ECN_ECT0):
256                                 tp->t_rcep = 0b100;
257                                 break;
258                         case (IPTOS_ECN_ECT1):
259                                 tp->t_rcep = 0b011;
260                                 break;
261                         case (IPTOS_ECN_CE):
262                                 tp->t_rcep = 0b110;
263                                 break;
264                         }
265                         break;
266                 }
267         }
268 }
269
270 /*
271  * TCP ECN processing.
272  */
273 int
274 tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int iptos)
275 {
276         int delta_ace = 0;
277
278         if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
279                 switch (iptos & IPTOS_ECN_MASK) {
280                 case IPTOS_ECN_CE:
281                         TCPSTAT_INC(tcps_ecn_ce);
282                         break;
283                 case IPTOS_ECN_ECT0:
284                         TCPSTAT_INC(tcps_ecn_ect0);
285                         break;
286                 case IPTOS_ECN_ECT1:
287                         TCPSTAT_INC(tcps_ecn_ect1);
288                         break;
289                 }
290
291                 if (tp->t_flags2 & TF2_ACE_PERMIT) {
292                         if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
293                                 tp->t_rcep += 1;
294                         if (tp->t_flags2 & TF2_ECN_PERMIT) {
295                                 delta_ace = (tcp_ecn_get_ace(thflags) + 8 -
296                                             (tp->t_scep & 0x07)) & 0x07;
297                                 tp->t_scep += delta_ace;
298                         } else {
299                                 /*
300                                  * process the final ACK of the 3WHS
301                                  * see table 3 in draft-ietf-tcpm-accurate-ecn
302                                  */
303                                 switch (tcp_ecn_get_ace(thflags)) {
304                                 case 0b010:
305                                         /* nonECT SYN or SYN,ACK */
306                                         /* Fallthrough */
307                                 case 0b011:
308                                         /* ECT1 SYN or SYN,ACK */
309                                         /* Fallthrough */
310                                 case 0b100:
311                                         /* ECT0 SYN or SYN,ACK */
312                                         tp->t_scep = 5;
313                                         break;
314                                 case 0b110:
315                                         /* CE SYN or SYN,ACK */
316                                         tp->t_scep = 6;
317                                         tp->snd_cwnd = 2 * tcp_maxseg(tp);
318                                         break;
319                                 default:
320                                         /* mangled AccECN handshake */
321                                         tp->t_scep = 5;
322                                         break;
323                                 }
324                                 tp->t_flags2 |= TF2_ECN_PERMIT;
325                         }
326                 } else {
327                         /* RFC3168 ECN handling */
328                         if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE)
329                                 delta_ace = 1;
330                         if (thflags & TH_CWR) {
331                                 tp->t_flags2 &= ~TF2_ECN_SND_ECE;
332                                 tp->t_flags |= TF_ACKNOW;
333                         }
334                         if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
335                                 tp->t_flags2 |= TF2_ECN_SND_ECE;
336                 }
337
338                 /* Process a packet differently from RFC3168. */
339                 cc_ecnpkt_handler_flags(tp, thflags, iptos);
340         }
341
342         return delta_ace;
343 }
344
345 /*
346  * Send ECN setup <SYN> packet header flags
347  */
348 uint16_t
349 tcp_ecn_output_syn_sent(struct tcpcb *tp)
350 {
351         uint16_t thflags = 0;
352
353         if (V_tcp_do_ecn == 0)
354                 return thflags;
355         if (V_tcp_do_ecn == 1) {
356                 /* Send a RFC3168 ECN setup <SYN> packet */
357                 if (tp->t_rxtshift >= 1) {
358                         if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
359                                 thflags = TH_ECE|TH_CWR;
360                 } else
361                         thflags = TH_ECE|TH_CWR;
362         } else
363         if (V_tcp_do_ecn == 3) {
364                 /* Send an Accurate ECN setup <SYN> packet */
365                 if (tp->t_rxtshift >= 1) {
366                         if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
367                                 thflags = TH_ECE|TH_CWR|TH_AE;
368                 } else
369                         thflags = TH_ECE|TH_CWR|TH_AE;
370         }
371
372         return thflags;
373 }
374
375 /*
376  * output processing of ECN feature
377  * returning IP ECN header codepoint
378  */
379 int
380 tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit)
381 {
382         int ipecn = IPTOS_ECN_NOTECT;
383         bool newdata;
384
385         /*
386          * If the peer has ECN, mark data packets with
387          * ECN capable transmission (ECT).
388          * Ignore pure control packets, retransmissions
389          * and window probes.
390          */
391         newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
392                     !rxmit &&
393                     !((tp->t_flags & TF_FORCEDATA) && len == 1));
394         /* RFC3168 ECN marking, only new data segments */
395         if (newdata) {
396                 ipecn = IPTOS_ECN_ECT0;
397                 TCPSTAT_INC(tcps_ecn_ect0);
398         }
399         /*
400          * Reply with proper ECN notifications.
401          */
402         if (tp->t_flags2 & TF2_ACE_PERMIT) {
403                 *thflags &= ~(TH_AE|TH_CWR|TH_ECE);
404                 if (tp->t_rcep & 0x01)
405                         *thflags |= TH_ECE;
406                 if (tp->t_rcep & 0x02)
407                         *thflags |= TH_CWR;
408                 if (tp->t_rcep & 0x04)
409                         *thflags |= TH_AE;
410                 if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
411                         /*
412                          * here we process the final
413                          * ACK of the 3WHS
414                          */
415                         if (tp->t_rcep == 0b110) {
416                                 tp->t_rcep = 6;
417                         } else {
418                                 tp->t_rcep = 5;
419                         }
420                         tp->t_flags2 |= TF2_ECN_PERMIT;
421                 }
422         } else {
423                 if (newdata &&
424                     (tp->t_flags2 & TF2_ECN_SND_CWR)) {
425                         *thflags |= TH_CWR;
426                         tp->t_flags2 &= ~TF2_ECN_SND_CWR;
427                 }
428                 if (tp->t_flags2 & TF2_ECN_SND_ECE)
429                         *thflags |= TH_ECE;
430         }
431
432         return ipecn;
433 }
434
435 /*
436  * Set up the ECN related tcpcb fields from
437  * a syncache entry
438  */
439 void
440 tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc)
441 {
442         if (sc->sc_flags & SCF_ECN_MASK) {
443                 switch (sc->sc_flags & SCF_ECN_MASK) {
444                 case SCF_ECN:
445                         tp->t_flags2 |= TF2_ECN_PERMIT;
446                         break;
447                 case SCF_ACE_N:
448                         /* Fallthrough */
449                 case SCF_ACE_0:
450                         /* Fallthrough */
451                 case SCF_ACE_1:
452                         tp->t_flags2 |= TF2_ACE_PERMIT;
453                         tp->t_scep = 5;
454                         tp->t_rcep = 5;
455                         break;
456                 case SCF_ACE_CE:
457                         tp->t_flags2 |= TF2_ACE_PERMIT;
458                         tp->t_scep = 6;
459                         tp->t_rcep = 6;
460                         break;
461                 /* undefined SCF codepoint */
462                 default:
463                         break;
464                 }
465         }
466 }
467
468 /*
469  * Process a <SYN> packets ECN information, and provide the
470  * syncache with the relevant information.
471  */
472 int
473 tcp_ecn_syncache_add(uint16_t thflags, int iptos)
474 {
475         int scflags = 0;
476
477         switch (thflags & (TH_AE|TH_CWR|TH_ECE)) {
478         /* no ECN */
479         case (0|0|0):
480                 break;
481         /* legacy ECN */
482         case (0|TH_CWR|TH_ECE):
483                 scflags = SCF_ECN;
484                 break;
485         /* Accurate ECN */
486         case (TH_AE|TH_CWR|TH_ECE):
487                 if ((V_tcp_do_ecn == 3) ||
488                     (V_tcp_do_ecn == 4)) {
489                         switch (iptos & IPTOS_ECN_MASK) {
490                         case IPTOS_ECN_CE:
491                                 scflags = SCF_ACE_CE;
492                                 break;
493                         case IPTOS_ECN_ECT0:
494                                 scflags = SCF_ACE_0;
495                                 break;
496                         case IPTOS_ECN_ECT1:
497                                 scflags = SCF_ACE_1;
498                                 break;
499                         case IPTOS_ECN_NOTECT:
500                                 scflags = SCF_ACE_N;
501                                 break;
502                         }
503                 } else
504                         scflags = SCF_ECN;
505                 break;
506         /* Default Case (section 3.1.2) */
507         default:
508                 if ((V_tcp_do_ecn == 3) ||
509                     (V_tcp_do_ecn == 4)) {
510                         switch (iptos & IPTOS_ECN_MASK) {
511                         case IPTOS_ECN_CE:
512                                 scflags = SCF_ACE_CE;
513                                 break;
514                         case IPTOS_ECN_ECT0:
515                                 scflags = SCF_ACE_0;
516                                 break;
517                         case IPTOS_ECN_ECT1:
518                                 scflags = SCF_ACE_1;
519                                 break;
520                         case IPTOS_ECN_NOTECT:
521                                 scflags = SCF_ACE_N;
522                                 break;
523                         }
524                 }
525                 break;
526         }
527         return scflags;
528 }
529
530 /*
531  * Set up the ECN information for the <SYN,ACK> from
532  * syncache information.
533  */
534 uint16_t
535 tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc)
536 {
537         if ((thflags & TH_SYN) &&
538             (sc->sc_flags & SCF_ECN_MASK)) {
539                 switch (sc->sc_flags & SCF_ECN_MASK) {
540                 case SCF_ECN:
541                         thflags |= (0 | 0 | TH_ECE);
542                         TCPSTAT_INC(tcps_ecn_shs);
543                         break;
544                 case SCF_ACE_N:
545                         thflags |= (0 | TH_CWR | 0);
546                         TCPSTAT_INC(tcps_ecn_shs);
547                         TCPSTAT_INC(tcps_ace_nect);
548                         break;
549                 case SCF_ACE_0:
550                         thflags |= (TH_AE | 0 | 0);
551                         TCPSTAT_INC(tcps_ecn_shs);
552                         TCPSTAT_INC(tcps_ace_ect0);
553                         break;
554                 case SCF_ACE_1:
555                         thflags |= (0 | TH_ECE | TH_CWR);
556                         TCPSTAT_INC(tcps_ecn_shs);
557                         TCPSTAT_INC(tcps_ace_ect1);
558                         break;
559                 case SCF_ACE_CE:
560                         thflags |= (TH_AE | TH_CWR | 0);
561                         TCPSTAT_INC(tcps_ecn_shs);
562                         TCPSTAT_INC(tcps_ace_ce);
563                         break;
564                 /* undefined SCF codepoint */
565                 default:
566                         break;
567                 }
568         }
569         return thflags;
570 }
571
572 int
573 tcp_ecn_get_ace(uint16_t thflags)
574 {
575         int ace = 0;
576
577         if (thflags & TH_ECE)
578                 ace += 1;
579         if (thflags & TH_CWR)
580                 ace += 2;
581         if (thflags & TH_AE)
582                 ace += 4;
583         return ace;
584 }