]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/tcp_ecn.c
ip_reass: make stray callout assertion more verbose
[FreeBSD/FreeBSD.git] / sys / netinet / tcp_ecn.c
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
5  *      The Regents of the University of California.  All rights reserved.
6  * Copyright (c) 2007-2008,2010
7  *      Swinburne University of Technology, Melbourne, Australia.
8  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
9  * Copyright (c) 2010 The FreeBSD Foundation
10  * Copyright (c) 2010-2011 Juniper Networks, Inc.
11  * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com>
12  * All rights reserved.
13  *
14  * Portions of this software were developed at the Centre for Advanced Internet
15  * Architectures, Swinburne University of Technology, by Lawrence Stewart,
16  * James Healy and David Hayes, made possible in part by a grant from the Cisco
17  * University Research Program Fund at Community Foundation Silicon Valley.
18  *
19  * Portions of this software were developed at the Centre for Advanced
20  * Internet Architectures, Swinburne University of Technology, Melbourne,
21  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
22  *
23  * Portions of this software were developed by Robert N. M. Watson under
24  * contract to Juniper Networks, Inc.
25  *
26  * Redistribution and use in source and binary forms, with or without
27  * modification, are permitted provided that the following conditions
28  * are met:
29  * 1. Redistributions of source code must retain the above copyright
30  *    notice, this list of conditions and the following disclaimer.
31  * 2. Redistributions in binary form must reproduce the above copyright
32  *    notice, this list of conditions and the following disclaimer in the
33  *    documentation and/or other materials provided with the distribution.
34  * 3. Neither the name of the University nor the names of its contributors
35  *    may be used to endorse or promote products derived from this software
36  *    without specific prior written permission.
37  *
38  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
39  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
41  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
42  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
43  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
44  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
46  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
47  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48  * SUCH DAMAGE.
49  *
50  *      @(#)tcp_ecn.c 8.12 (Berkeley) 5/24/95
51  */
52
53 /*
54  * Utility functions to deal with Explicit Congestion Notification in TCP
55  * implementing the essential parts of the Accurate ECN extension
56  * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include "opt_inet.h"
63 #include "opt_inet6.h"
64 #include "opt_tcpdebug.h"
65
66 #include <sys/param.h>
67 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/sysctl.h>
70 #include <sys/malloc.h>
71 #include <sys/mbuf.h>
72 #include <sys/socket.h>
73 #include <sys/socketvar.h>
74
75 #include <machine/cpu.h>
76
77 #include <vm/uma.h>
78
79 #include <net/if.h>
80 #include <net/if_var.h>
81 #include <net/route.h>
82 #include <net/vnet.h>
83
84 #include <netinet/in.h>
85 #include <netinet/in_systm.h>
86 #include <netinet/ip.h>
87 #include <netinet/in_var.h>
88 #include <netinet/in_pcb.h>
89 #include <netinet/ip_var.h>
90 #include <netinet/ip6.h>
91 #include <netinet/icmp6.h>
92 #include <netinet6/nd6.h>
93 #include <netinet6/ip6_var.h>
94 #include <netinet6/in6_pcb.h>
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_fsm.h>
97 #include <netinet/tcp_seq.h>
98 #include <netinet/tcp_var.h>
99 #include <netinet/tcp_syncache.h>
100 #include <netinet/tcp_timer.h>
101 #include <netinet6/tcp6_var.h>
102 #include <netinet/tcpip.h>
103 #include <netinet/tcp_ecn.h>
104
105
106 /*
107  * Process incoming SYN,ACK packet
108  */
109 void
110 tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
111 {
112
113         if (V_tcp_do_ecn == 0)
114                 return;
115         if ((V_tcp_do_ecn == 1) ||
116             (V_tcp_do_ecn == 2)) {
117                 /* RFC3168 ECN handling */
118                 if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
119                         tp->t_flags2 |= TF2_ECN_PERMIT;
120                         TCPSTAT_INC(tcps_ecn_shs);
121                 }
122         } else
123         /* decoding Accurate ECN according to table in section 3.1.1 */
124         if ((V_tcp_do_ecn == 3) ||
125             (V_tcp_do_ecn == 4)) {
126                 /*
127                  * on the SYN,ACK, process the AccECN
128                  * flags indicating the state the SYN
129                  * was delivered.
130                  * Reactions to Path ECN mangling can
131                  * come here.
132                  */
133                 switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
134                 /* RFC3168 SYN */
135                 case (0|0|TH_ECE):
136                         tp->t_flags2 |= TF2_ECN_PERMIT;
137                         TCPSTAT_INC(tcps_ecn_shs);
138                         break;
139                 /* non-ECT SYN */
140                 case (0|TH_CWR|0):
141                         tp->t_flags2 |= TF2_ACE_PERMIT;
142                         tp->t_scep = 5;
143                         TCPSTAT_INC(tcps_ecn_shs);
144                         TCPSTAT_INC(tcps_ace_nect);
145                         break;
146                 /* ECT0 SYN */
147                 case (TH_AE|0|0):
148                         tp->t_flags2 |= TF2_ACE_PERMIT;
149                         tp->t_scep = 5;
150                         TCPSTAT_INC(tcps_ecn_shs);
151                         TCPSTAT_INC(tcps_ace_ect0);
152                         break;
153                 /* ECT1 SYN */
154                 case (0|TH_CWR|TH_ECE):
155                         tp->t_flags2 |= TF2_ACE_PERMIT;
156                         tp->t_scep = 5;
157                         TCPSTAT_INC(tcps_ecn_shs);
158                         TCPSTAT_INC(tcps_ace_ect1);
159                         break;
160                 /* CE SYN */
161                 case (TH_AE|TH_CWR|0):
162                         tp->t_flags2 |= TF2_ACE_PERMIT;
163                         tp->t_scep = 6;
164                         /*
165                          * reduce the IW to 2 MSS (to
166                          * account for delayed acks) if
167                          * the SYN,ACK was CE marked
168                          */
169                         tp->snd_cwnd = 2 * tcp_maxseg(tp);
170                         TCPSTAT_INC(tcps_ecn_shs);
171                         TCPSTAT_INC(tcps_ace_nect);
172                         break;
173                 default:
174                         break;
175                 }
176                 /*
177                  * Set the AccECN Codepoints on
178                  * the outgoing <ACK> to the ECN
179                  * state of the <SYN,ACK>
180                  * according to table 3 in the
181                  * AccECN draft
182                  */
183                 switch (iptos & IPTOS_ECN_MASK) {
184                 case (IPTOS_ECN_NOTECT):
185                         tp->t_rcep = 0b010;
186                         break;
187                 case (IPTOS_ECN_ECT0):
188                         tp->t_rcep = 0b100;
189                         break;
190                 case (IPTOS_ECN_ECT1):
191                         tp->t_rcep = 0b011;
192                         break;
193                 case (IPTOS_ECN_CE):
194                         tp->t_rcep = 0b110;
195                         break;
196                 }
197         }
198 }
199
200 /*
201  * Handle parallel SYN for ECN
202  */
203 void
204 tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos)
205 {
206         if (thflags & TH_ACK)
207                 return;
208         if (V_tcp_do_ecn == 0)
209                 return;
210         if ((V_tcp_do_ecn == 1) ||
211             (V_tcp_do_ecn == 2)) {
212                 /* RFC3168 ECN handling */
213                 if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
214                         tp->t_flags2 |= TF2_ECN_PERMIT;
215                         tp->t_flags2 |= TF2_ECN_SND_ECE;
216                         TCPSTAT_INC(tcps_ecn_shs);
217                 }
218         } else
219         if ((V_tcp_do_ecn == 3) ||
220             (V_tcp_do_ecn == 4)) {
221                 /* AccECN handling */
222                 switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
223                 default:
224                 case (0|0|0):
225                         break;
226                 case (0|TH_CWR|TH_ECE):
227                         tp->t_flags2 |= TF2_ECN_PERMIT;
228                         tp->t_flags2 |= TF2_ECN_SND_ECE;
229                         TCPSTAT_INC(tcps_ecn_shs);
230                         break;
231                 case (TH_AE|TH_CWR|TH_ECE):
232                         tp->t_flags2 |= TF2_ACE_PERMIT;
233                         TCPSTAT_INC(tcps_ecn_shs);
234                         /*
235                          * Set the AccECN Codepoints on
236                          * the outgoing <ACK> to the ECN
237                          * state of the <SYN,ACK>
238                          * according to table 3 in the
239                          * AccECN draft
240                          */
241                         switch (iptos & IPTOS_ECN_MASK) {
242                         case (IPTOS_ECN_NOTECT):
243                                 tp->t_rcep = 0b010;
244                                 break;
245                         case (IPTOS_ECN_ECT0):
246                                 tp->t_rcep = 0b100;
247                                 break;
248                         case (IPTOS_ECN_ECT1):
249                                 tp->t_rcep = 0b011;
250                                 break;
251                         case (IPTOS_ECN_CE):
252                                 tp->t_rcep = 0b110;
253                                 break;
254                         }
255                         break;
256                 }
257         }
258 }
259
260 /*
261  * TCP ECN processing.
262  */
263 int
264 tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int iptos)
265 {
266         int delta_ace = 0;
267
268         if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
269                 switch (iptos & IPTOS_ECN_MASK) {
270                 case IPTOS_ECN_CE:
271                         TCPSTAT_INC(tcps_ecn_ce);
272                         break;
273                 case IPTOS_ECN_ECT0:
274                         TCPSTAT_INC(tcps_ecn_ect0);
275                         break;
276                 case IPTOS_ECN_ECT1:
277                         TCPSTAT_INC(tcps_ecn_ect1);
278                         break;
279                 }
280
281                 if (tp->t_flags2 & TF2_ACE_PERMIT) {
282                         if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
283                                 tp->t_rcep += 1;
284                         if (tp->t_flags2 & TF2_ECN_PERMIT) {
285                                 delta_ace = (tcp_ecn_get_ace(thflags) + 8 -
286                                             (tp->t_scep & 0x07)) & 0x07;
287                                 tp->t_scep += delta_ace;
288                         } else {
289                                 /*
290                                  * process the final ACK of the 3WHS
291                                  * see table 3 in draft-ietf-tcpm-accurate-ecn
292                                  */
293                                 switch (tcp_ecn_get_ace(thflags)) {
294                                 case 0b010:
295                                         /* nonECT SYN or SYN,ACK */
296                                         /* Fallthrough */
297                                 case 0b011:
298                                         /* ECT1 SYN or SYN,ACK */
299                                         /* Fallthrough */
300                                 case 0b100:
301                                         /* ECT0 SYN or SYN,ACK */
302                                         tp->t_scep = 5;
303                                         break;
304                                 case 0b110:
305                                         /* CE SYN or SYN,ACK */
306                                         tp->t_scep = 6;
307                                         tp->snd_cwnd = 2 * tcp_maxseg(tp);
308                                         break;
309                                 default:
310                                         /* mangled AccECN handshake */
311                                         tp->t_scep = 5;
312                                         break;
313                                 }
314                                 tp->t_flags2 |= TF2_ECN_PERMIT;
315                         }
316                 } else {
317                         /* RFC3168 ECN handling */
318                         if (thflags & TH_ECE)
319                                 delta_ace = 1;
320                         if (thflags & TH_CWR) {
321                                 tp->t_flags2 &= ~TF2_ECN_SND_ECE;
322                                 tp->t_flags |= TF_ACKNOW;
323                         }
324                         if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
325                                 tp->t_flags2 |= TF2_ECN_SND_ECE;
326                 }
327
328                 /* Process a packet differently from RFC3168. */
329                 cc_ecnpkt_handler_flags(tp, thflags, iptos);
330         }
331
332         return delta_ace;
333 }
334
335 /*
336  * Send ECN setup <SYN> packet header flags
337  */
338 uint16_t
339 tcp_ecn_output_syn_sent(struct tcpcb *tp)
340 {
341         uint16_t thflags = 0;
342
343         if (V_tcp_do_ecn == 0)
344                 return thflags;
345         if (V_tcp_do_ecn == 1) {
346                 /* Send a RFC3168 ECN setup <SYN> packet */
347                 if (tp->t_rxtshift >= 1) {
348                         if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
349                                 thflags = TH_ECE|TH_CWR;
350                 } else
351                         thflags = TH_ECE|TH_CWR;
352         } else
353         if (V_tcp_do_ecn == 3) {
354                 /* Send an Accurate ECN setup <SYN> packet */
355                 if (tp->t_rxtshift >= 1) {
356                         if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
357                                 thflags = TH_ECE|TH_CWR|TH_AE;
358                 } else
359                         thflags = TH_ECE|TH_CWR|TH_AE;
360         }
361
362         return thflags;
363 }
364
365 /*
366  * output processing of ECN feature
367  * returning IP ECN header codepoint
368  */
369 int
370 tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit)
371 {
372         int ipecn = IPTOS_ECN_NOTECT;
373         bool newdata;
374
375         /*
376          * If the peer has ECN, mark data packets with
377          * ECN capable transmission (ECT).
378          * Ignore pure control packets, retransmissions
379          * and window probes.
380          */
381         newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
382                     !rxmit &&
383                     !((tp->t_flags & TF_FORCEDATA) && len == 1));
384         /* RFC3168 ECN marking, only new data segments */
385         if (newdata) {
386                 ipecn = IPTOS_ECN_ECT0;
387                 TCPSTAT_INC(tcps_ecn_ect0);
388         }
389         /*
390          * Reply with proper ECN notifications.
391          */
392         if (tp->t_flags2 & TF2_ACE_PERMIT) {
393                 *thflags &= ~(TH_AE|TH_CWR|TH_ECE);
394                 if (tp->t_rcep & 0x01)
395                         *thflags |= TH_ECE;
396                 if (tp->t_rcep & 0x02)
397                         *thflags |= TH_CWR;
398                 if (tp->t_rcep & 0x04)
399                         *thflags |= TH_AE;
400                 if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
401                         /*
402                          * here we process the final
403                          * ACK of the 3WHS
404                          */
405                         if (tp->t_rcep == 0b110) {
406                                 tp->t_rcep = 6;
407                         } else {
408                                 tp->t_rcep = 5;
409                         }
410                         tp->t_flags2 |= TF2_ECN_PERMIT;
411                 }
412         } else {
413                 if (newdata &&
414                     (tp->t_flags2 & TF2_ECN_SND_CWR)) {
415                         *thflags |= TH_CWR;
416                         tp->t_flags2 &= ~TF2_ECN_SND_CWR;
417                 }
418                 if (tp->t_flags2 & TF2_ECN_SND_ECE)
419                         *thflags |= TH_ECE;
420         }
421
422         return ipecn;
423 }
424
425 /*
426  * Set up the ECN related tcpcb fields from
427  * a syncache entry
428  */
429 void
430 tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc)
431 {
432         if (sc->sc_flags & SCF_ECN_MASK) {
433                 switch (sc->sc_flags & SCF_ECN_MASK) {
434                 case SCF_ECN:
435                         tp->t_flags2 |= TF2_ECN_PERMIT;
436                         break;
437                 case SCF_ACE_N:
438                         /* Fallthrough */
439                 case SCF_ACE_0:
440                         /* Fallthrough */
441                 case SCF_ACE_1:
442                         tp->t_flags2 |= TF2_ACE_PERMIT;
443                         tp->t_scep = 5;
444                         tp->t_rcep = 5;
445                         break;
446                 case SCF_ACE_CE:
447                         tp->t_flags2 |= TF2_ACE_PERMIT;
448                         tp->t_scep = 6;
449                         tp->t_rcep = 6;
450                         break;
451                 /* undefined SCF codepoint */
452                 default:
453                         break;
454                 }
455         }
456 }
457
458 /*
459  * Process a <SYN> packets ECN information, and provide the
460  * syncache with the relevant information.
461  */
462 int
463 tcp_ecn_syncache_add(uint16_t thflags, int iptos)
464 {
465         int scflags = 0;
466
467         switch (thflags & (TH_AE|TH_CWR|TH_ECE)) {
468         /* no ECN */
469         case (0|0|0):
470                 break;
471         /* legacy ECN */
472         case (0|TH_CWR|TH_ECE):
473                 scflags = SCF_ECN;
474                 break;
475         /* Accurate ECN */
476         case (TH_AE|TH_CWR|TH_ECE):
477                 if ((V_tcp_do_ecn == 3) ||
478                     (V_tcp_do_ecn == 4)) {
479                         switch (iptos & IPTOS_ECN_MASK) {
480                         case IPTOS_ECN_CE:
481                                 scflags = SCF_ACE_CE;
482                                 break;
483                         case IPTOS_ECN_ECT0:
484                                 scflags = SCF_ACE_0;
485                                 break;
486                         case IPTOS_ECN_ECT1:
487                                 scflags = SCF_ACE_1;
488                                 break;
489                         case IPTOS_ECN_NOTECT:
490                                 scflags = SCF_ACE_N;
491                                 break;
492                         }
493                 } else
494                         scflags = SCF_ECN;
495                 break;
496         /* Default Case (section 3.1.2) */
497         default:
498                 if ((V_tcp_do_ecn == 3) ||
499                     (V_tcp_do_ecn == 4)) {
500                         switch (iptos & IPTOS_ECN_MASK) {
501                         case IPTOS_ECN_CE:
502                                 scflags = SCF_ACE_CE;
503                                 break;
504                         case IPTOS_ECN_ECT0:
505                                 scflags = SCF_ACE_0;
506                                 break;
507                         case IPTOS_ECN_ECT1:
508                                 scflags = SCF_ACE_1;
509                                 break;
510                         case IPTOS_ECN_NOTECT:
511                                 scflags = SCF_ACE_N;
512                                 break;
513                         }
514                 }
515                 break;
516         }
517         return scflags;
518 }
519
520 /*
521  * Set up the ECN information for the <SYN,ACK> from
522  * syncache information.
523  */
524 uint16_t
525 tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc)
526 {
527         if ((thflags & TH_SYN) &&
528             (sc->sc_flags & SCF_ECN_MASK)) {
529                 switch (sc->sc_flags & SCF_ECN_MASK) {
530                 case SCF_ECN:
531                         thflags |= (0 | 0 | TH_ECE);
532                         TCPSTAT_INC(tcps_ecn_shs);
533                         break;
534                 case SCF_ACE_N:
535                         thflags |= (0 | TH_CWR | 0);
536                         TCPSTAT_INC(tcps_ecn_shs);
537                         TCPSTAT_INC(tcps_ace_nect);
538                         break;
539                 case SCF_ACE_0:
540                         thflags |= (TH_AE | 0 | 0);
541                         TCPSTAT_INC(tcps_ecn_shs);
542                         TCPSTAT_INC(tcps_ace_ect0);
543                         break;
544                 case SCF_ACE_1:
545                         thflags |= (0 | TH_ECE | TH_CWR);
546                         TCPSTAT_INC(tcps_ecn_shs);
547                         TCPSTAT_INC(tcps_ace_ect1);
548                         break;
549                 case SCF_ACE_CE:
550                         thflags |= (TH_AE | TH_CWR | 0);
551                         TCPSTAT_INC(tcps_ecn_shs);
552                         TCPSTAT_INC(tcps_ace_ce);
553                         break;
554                 /* undefined SCF codepoint */
555                 default:
556                         break;
557                 }
558         }
559         return thflags;
560 }
561
562 int
563 tcp_ecn_get_ace(uint16_t thflags)
564 {
565         int ace = 0;
566
567         if (thflags & TH_ECE)
568                 ace += 1;
569         if (thflags & TH_CWR)
570                 ace += 2;
571         if (thflags & TH_AE)
572                 ace += 4;
573         return ace;
574 }