]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/tcp_usrreq.c
Merge llvm-project main llvmorg-15-init-15358-g53dc0f10787
[FreeBSD/FreeBSD.git] / sys / netinet / tcp_usrreq.c
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1993
5  *      The Regents of the University of California.
6  * Copyright (c) 2006-2007 Robert N. M. Watson
7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
8  * All rights reserved.
9  *
10  * Portions of this software were developed by Robert N. M. Watson under
11  * contract to Juniper Networks, Inc.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *      From: @(#)tcp_usrreq.c  8.2 (Berkeley) 1/3/94
38  */
39
40 #include <sys/cdefs.h>
41 __FBSDID("$FreeBSD$");
42
43 #include "opt_ddb.h"
44 #include "opt_inet.h"
45 #include "opt_inet6.h"
46 #include "opt_ipsec.h"
47 #include "opt_kern_tls.h"
48
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/arb.h>
52 #include <sys/limits.h>
53 #include <sys/malloc.h>
54 #include <sys/refcount.h>
55 #include <sys/kernel.h>
56 #include <sys/ktls.h>
57 #include <sys/qmath.h>
58 #include <sys/sysctl.h>
59 #include <sys/mbuf.h>
60 #ifdef INET6
61 #include <sys/domain.h>
62 #endif /* INET6 */
63 #include <sys/socket.h>
64 #include <sys/socketvar.h>
65 #include <sys/protosw.h>
66 #include <sys/proc.h>
67 #include <sys/jail.h>
68 #include <sys/stats.h>
69
70 #ifdef DDB
71 #include <ddb/ddb.h>
72 #endif
73
74 #include <net/if.h>
75 #include <net/if_var.h>
76 #include <net/route.h>
77 #include <net/vnet.h>
78
79 #include <netinet/in.h>
80 #include <netinet/in_kdtrace.h>
81 #include <netinet/in_pcb.h>
82 #include <netinet/in_systm.h>
83 #include <netinet/in_var.h>
84 #include <netinet/ip.h>
85 #include <netinet/ip_var.h>
86 #ifdef INET6
87 #include <netinet/ip6.h>
88 #include <netinet6/in6_pcb.h>
89 #include <netinet6/ip6_var.h>
90 #include <netinet6/scope6_var.h>
91 #endif
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_fsm.h>
94 #include <netinet/tcp_seq.h>
95 #include <netinet/tcp_timer.h>
96 #include <netinet/tcp_var.h>
97 #include <netinet/tcp_log_buf.h>
98 #include <netinet/tcpip.h>
99 #include <netinet/cc/cc.h>
100 #include <netinet/tcp_fastopen.h>
101 #include <netinet/tcp_hpts.h>
102 #ifdef TCPPCAP
103 #include <netinet/tcp_pcap.h>
104 #endif
105 #ifdef TCP_OFFLOAD
106 #include <netinet/tcp_offload.h>
107 #endif
108 #include <netipsec/ipsec_support.h>
109
110 #include <vm/vm.h>
111 #include <vm/vm_param.h>
112 #include <vm/pmap.h>
113 #include <vm/vm_extern.h>
114 #include <vm/vm_map.h>
115 #include <vm/vm_page.h>
116
117 /*
118  * TCP protocol interface to socket abstraction.
119  */
120 #ifdef INET
121 static int      tcp_connect(struct tcpcb *, struct sockaddr_in *,
122                     struct thread *td);
123 #endif /* INET */
124 #ifdef INET6
125 static int      tcp6_connect(struct tcpcb *, struct sockaddr_in6 *,
126                     struct thread *td);
127 #endif /* INET6 */
128 static void     tcp_disconnect(struct tcpcb *);
129 static void     tcp_usrclosed(struct tcpcb *);
130 static void     tcp_fill_info(struct tcpcb *, struct tcp_info *);
131
132 static int      tcp_pru_options_support(struct tcpcb *tp, int flags);
133
134 /*
135  * TCP attaches to socket via pru_attach(), reserving space,
136  * and an internet control block.
137  */
138 static int
139 tcp_usr_attach(struct socket *so, int proto, struct thread *td)
140 {
141         struct inpcb *inp;
142         struct tcpcb *tp = NULL;
143         int error;
144
145         inp = sotoinpcb(so);
146         KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));
147
148         error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace);
149         if (error)
150                 goto out;
151
152         so->so_rcv.sb_flags |= SB_AUTOSIZE;
153         so->so_snd.sb_flags |= SB_AUTOSIZE;
154         error = in_pcballoc(so, &V_tcbinfo);
155         if (error)
156                 goto out;
157         inp = sotoinpcb(so);
158         tp = tcp_newtcpcb(inp);
159         if (tp == NULL) {
160                 error = ENOBUFS;
161                 in_pcbdetach(inp);
162                 in_pcbfree(inp);
163                 goto out;
164         }
165         tp->t_state = TCPS_CLOSED;
166         INP_WUNLOCK(inp);
167         TCPSTATES_INC(TCPS_CLOSED);
168 out:
169         TCP_PROBE2(debug__user, tp, PRU_ATTACH);
170         return (error);
171 }
172
173 /*
174  * tcp_usr_detach is called when the socket layer loses its final reference
175  * to the socket, be it a file descriptor reference, a reference from TCP,
176  * etc.  At this point, there is only one case in which we will keep around
177  * inpcb state: time wait.
178  */
179 static void
180 tcp_usr_detach(struct socket *so)
181 {
182         struct inpcb *inp;
183         struct tcpcb *tp;
184
185         inp = sotoinpcb(so);
186         KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
187         INP_WLOCK(inp);
188         KASSERT(so->so_pcb == inp && inp->inp_socket == so,
189                 ("%s: socket %p inp %p mismatch", __func__, so, inp));
190
191         tp = intotcpcb(inp);
192
193         KASSERT(inp->inp_flags & INP_DROPPED ||
194             tp->t_state < TCPS_SYN_SENT,
195             ("%s: inp %p not dropped or embryonic", __func__, inp));
196
197         tcp_discardcb(tp);
198         in_pcbdetach(inp);
199         in_pcbfree(inp);
200 }
201
202 #ifdef INET
203 /*
204  * Give the socket an address.
205  */
206 static int
207 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
208 {
209         int error = 0;
210         struct inpcb *inp;
211 #ifdef KDTRACE_HOOKS
212         struct tcpcb *tp = NULL;
213 #endif
214         struct sockaddr_in *sinp;
215
216         sinp = (struct sockaddr_in *)nam;
217         if (nam->sa_family != AF_INET) {
218                 /*
219                  * Preserve compatibility with old programs.
220                  */
221                 if (nam->sa_family != AF_UNSPEC ||
222                     nam->sa_len < offsetof(struct sockaddr_in, sin_zero) ||
223                     sinp->sin_addr.s_addr != INADDR_ANY)
224                         return (EAFNOSUPPORT);
225                 nam->sa_family = AF_INET;
226         }
227         if (nam->sa_len != sizeof(*sinp))
228                 return (EINVAL);
229
230         /*
231          * Must check for multicast addresses and disallow binding
232          * to them.
233          */
234         if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
235                 return (EAFNOSUPPORT);
236
237         inp = sotoinpcb(so);
238         KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
239         INP_WLOCK(inp);
240         if (inp->inp_flags & INP_DROPPED) {
241                 error = EINVAL;
242                 goto out;
243         }
244 #ifdef KDTRACE_HOOKS
245         tp = intotcpcb(inp);
246 #endif
247         INP_HASH_WLOCK(&V_tcbinfo);
248         error = in_pcbbind(inp, nam, td->td_ucred);
249         INP_HASH_WUNLOCK(&V_tcbinfo);
250 out:
251         TCP_PROBE2(debug__user, tp, PRU_BIND);
252         INP_WUNLOCK(inp);
253
254         return (error);
255 }
256 #endif /* INET */
257
258 #ifdef INET6
259 static int
260 tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
261 {
262         int error = 0;
263         struct inpcb *inp;
264 #ifdef KDTRACE_HOOKS
265         struct tcpcb *tp = NULL;
266 #endif
267         struct sockaddr_in6 *sin6;
268         u_char vflagsav;
269
270         sin6 = (struct sockaddr_in6 *)nam;
271         if (nam->sa_family != AF_INET6)
272                 return (EAFNOSUPPORT);
273         if (nam->sa_len != sizeof(*sin6))
274                 return (EINVAL);
275
276         /*
277          * Must check for multicast addresses and disallow binding
278          * to them.
279          */
280         if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
281                 return (EAFNOSUPPORT);
282
283         inp = sotoinpcb(so);
284         KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
285         INP_WLOCK(inp);
286         vflagsav = inp->inp_vflag;
287         if (inp->inp_flags & INP_DROPPED) {
288                 error = EINVAL;
289                 goto out;
290         }
291 #ifdef KDTRACE_HOOKS
292         tp = intotcpcb(inp);
293 #endif
294         INP_HASH_WLOCK(&V_tcbinfo);
295         inp->inp_vflag &= ~INP_IPV4;
296         inp->inp_vflag |= INP_IPV6;
297 #ifdef INET
298         if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
299                 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
300                         inp->inp_vflag |= INP_IPV4;
301                 else if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
302                         struct sockaddr_in sin;
303
304                         in6_sin6_2_sin(&sin, sin6);
305                         if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
306                                 error = EAFNOSUPPORT;
307                                 INP_HASH_WUNLOCK(&V_tcbinfo);
308                                 goto out;
309                         }
310                         inp->inp_vflag |= INP_IPV4;
311                         inp->inp_vflag &= ~INP_IPV6;
312                         error = in_pcbbind(inp, (struct sockaddr *)&sin,
313                             td->td_ucred);
314                         INP_HASH_WUNLOCK(&V_tcbinfo);
315                         goto out;
316                 }
317         }
318 #endif
319         error = in6_pcbbind(inp, nam, td->td_ucred);
320         INP_HASH_WUNLOCK(&V_tcbinfo);
321 out:
322         if (error != 0)
323                 inp->inp_vflag = vflagsav;
324         TCP_PROBE2(debug__user, tp, PRU_BIND);
325         INP_WUNLOCK(inp);
326         return (error);
327 }
328 #endif /* INET6 */
329
330 #ifdef INET
331 /*
332  * Prepare to accept connections.
333  */
334 static int
335 tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
336 {
337         int error = 0;
338         struct inpcb *inp;
339         struct tcpcb *tp = NULL;
340
341         inp = sotoinpcb(so);
342         KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
343         INP_WLOCK(inp);
344         if (inp->inp_flags & INP_DROPPED) {
345                 error = EINVAL;
346                 goto out;
347         }
348         tp = intotcpcb(inp);
349         SOCK_LOCK(so);
350         error = solisten_proto_check(so);
351         if (error != 0) {
352                 SOCK_UNLOCK(so);
353                 goto out;
354         }
355         if (inp->inp_lport == 0) {
356                 INP_HASH_WLOCK(&V_tcbinfo);
357                 error = in_pcbbind(inp, NULL, td->td_ucred);
358                 INP_HASH_WUNLOCK(&V_tcbinfo);
359         }
360         if (error == 0) {
361                 tcp_state_change(tp, TCPS_LISTEN);
362                 solisten_proto(so, backlog);
363 #ifdef TCP_OFFLOAD
364                 if ((so->so_options & SO_NO_OFFLOAD) == 0)
365                         tcp_offload_listen_start(tp);
366 #endif
367         } else {
368                 solisten_proto_abort(so);
369         }
370         SOCK_UNLOCK(so);
371
372         if (IS_FASTOPEN(tp->t_flags))
373                 tp->t_tfo_pending = tcp_fastopen_alloc_counter();
374
375 out:
376         TCP_PROBE2(debug__user, tp, PRU_LISTEN);
377         INP_WUNLOCK(inp);
378         return (error);
379 }
380 #endif /* INET */
381
382 #ifdef INET6
383 static int
384 tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
385 {
386         int error = 0;
387         struct inpcb *inp;
388         struct tcpcb *tp = NULL;
389         u_char vflagsav;
390
391         inp = sotoinpcb(so);
392         KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
393         INP_WLOCK(inp);
394         if (inp->inp_flags & INP_DROPPED) {
395                 error = EINVAL;
396                 goto out;
397         }
398         vflagsav = inp->inp_vflag;
399         tp = intotcpcb(inp);
400         SOCK_LOCK(so);
401         error = solisten_proto_check(so);
402         if (error != 0) {
403                 SOCK_UNLOCK(so);
404                 goto out;
405         }
406         INP_HASH_WLOCK(&V_tcbinfo);
407         if (inp->inp_lport == 0) {
408                 inp->inp_vflag &= ~INP_IPV4;
409                 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
410                         inp->inp_vflag |= INP_IPV4;
411                 error = in6_pcbbind(inp, NULL, td->td_ucred);
412         }
413         INP_HASH_WUNLOCK(&V_tcbinfo);
414         if (error == 0) {
415                 tcp_state_change(tp, TCPS_LISTEN);
416                 solisten_proto(so, backlog);
417 #ifdef TCP_OFFLOAD
418                 if ((so->so_options & SO_NO_OFFLOAD) == 0)
419                         tcp_offload_listen_start(tp);
420 #endif
421         } else {
422                 solisten_proto_abort(so);
423         }
424         SOCK_UNLOCK(so);
425
426         if (IS_FASTOPEN(tp->t_flags))
427                 tp->t_tfo_pending = tcp_fastopen_alloc_counter();
428
429         if (error != 0)
430                 inp->inp_vflag = vflagsav;
431
432 out:
433         TCP_PROBE2(debug__user, tp, PRU_LISTEN);
434         INP_WUNLOCK(inp);
435         return (error);
436 }
437 #endif /* INET6 */
438
439 #ifdef INET
/*
 * Initiate an IPv4 connection to the peer named by nam.
 *
 * The destination is validated first: it must be a full-sized AF_INET
 * address, must be neither multicast nor the limited broadcast address,
 * and must pass the jail check.  tcp_connect() then sets the connection
 * up; unless a TOE device takes the connection over, the keep timer is
 * armed with the connection-establishment timeout and tcp_output() is
 * asked to send the initial segment.
 *
 * Returns 0 or an errno value (ECONNREFUSED for a dropped connection,
 * EOPNOTSUPP for a listening socket).
 */
static int
tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
        struct sockaddr_in *dst = (struct sockaddr_in *)nam;
        struct epoch_tracker et;
        struct tcpcb *tp = NULL;
        struct inpcb *inp;
        int error;

        if (nam->sa_family != AF_INET)
                return (EAFNOSUPPORT);
        if (nam->sa_len != sizeof(*dst))
                return (EINVAL);

        /*
         * TCP ``connections'' to multicast addresses must be refused.
         */
        if (IN_MULTICAST(ntohl(dst->sin_addr.s_addr)))
                return (EAFNOSUPPORT);
        if (ntohl(dst->sin_addr.s_addr) == INADDR_BROADCAST)
                return (EACCES);
        error = prison_remote_ip4(td->td_ucred, &dst->sin_addr);
        if (error != 0)
                return (error);

        inp = sotoinpcb(so);
        KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
        INP_WLOCK(inp);
        if ((inp->inp_flags & INP_DROPPED) != 0) {
                error = ECONNREFUSED;
                goto unlock;
        }
        if (SOLISTENING(so)) {
                error = EOPNOTSUPP;
                goto unlock;
        }
        tp = intotcpcb(inp);

        NET_EPOCH_ENTER(et);
        error = tcp_connect(tp, dst, td);
        if (error != 0)
                goto leave_epoch;
#ifdef TCP_OFFLOAD
        /*
         * Give a TOE device the chance to take the connection; if it
         * declines, fall through to the software path (the error it
         * returned is overwritten by tcp_output() below).
         */
        if (registered_toedevs > 0 &&
            (so->so_options & SO_NO_OFFLOAD) == 0) {
                error = tcp_offload_connect(so, nam);
                if (error == 0)
                        goto leave_epoch;
        }
#endif
        tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
        error = tcp_output(tp);
        /* A negative return here is asserted never to happen. */
        KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()"
            ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error));
leave_epoch:
        NET_EPOCH_EXIT(et);
unlock:
        TCP_PROBE2(debug__user, tp, PRU_CONNECT);
        INP_WUNLOCK(inp);
        return (error);
}
504 #endif /* INET */
505
506 #ifdef INET6
/*
 * Initiate a connection on an AF_INET6 socket.
 *
 * nam must be a full-sized, non-multicast sockaddr_in6.  When IPv4 is
 * compiled in, a v4-mapped destination is converted to a sockaddr_in
 * and connected through tcp_connect(); any other destination goes
 * through tcp6_connect().  In both cases inp_vflag (and, for native
 * IPv6, inc_flags) is adjusted before the connect; if the call fails
 * while the pcb still has no local port, the values saved on entry are
 * restored.
 *
 * Returns 0 or an errno value.
 */
static int
tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
        struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)nam;
        struct epoch_tracker et;
        struct tcpcb *tp = NULL;
        struct inpcb *inp;
        u_int8_t saved_incflags;
        u_char saved_vflag;
        int error = 0;

        if (nam->sa_family != AF_INET6)
                return (EAFNOSUPPORT);
        if (nam->sa_len != sizeof(*dst6))
                return (EINVAL);

        /*
         * TCP ``connections'' to multicast addresses must be refused.
         */
        if (IN6_IS_ADDR_MULTICAST(&dst6->sin6_addr))
                return (EAFNOSUPPORT);

        inp = sotoinpcb(so);
        KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
        INP_WLOCK(inp);
        saved_vflag = inp->inp_vflag;
        saved_incflags = inp->inp_inc.inc_flags;
        if ((inp->inp_flags & INP_DROPPED) != 0) {
                error = ECONNREFUSED;
                goto out;
        }
        if (SOLISTENING(so)) {
                /*
                 * NOTE(review): tcp_usr_connect() returns EOPNOTSUPP for
                 * a listening socket; confirm whether EINVAL is intended
                 * here.
                 */
                error = EINVAL;
                goto out;
        }
        tp = intotcpcb(inp);
#ifdef INET
        /*
         * XXXRW: Some confusion: V4/V6 flags relate to binding, and
         * therefore probably require the hash lock, which isn't held here.
         * Is this a significant problem?
         */
        if (IN6_IS_ADDR_V4MAPPED(&dst6->sin6_addr)) {
                struct sockaddr_in dst4;

                if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
                        error = EINVAL;
                        goto out;
                }
                if ((inp->inp_vflag & INP_IPV4) == 0) {
                        error = EAFNOSUPPORT;
                        goto out;
                }

                /* Apply the IPv4 destination checks to the mapped address. */
                in6_sin6_2_sin(&dst4, dst6);
                if (IN_MULTICAST(ntohl(dst4.sin_addr.s_addr))) {
                        error = EAFNOSUPPORT;
                        goto out;
                }
                if (ntohl(dst4.sin_addr.s_addr) == INADDR_BROADCAST) {
                        error = EACCES;
                        goto out;
                }
                error = prison_remote_ip4(td->td_ucred, &dst4.sin_addr);
                if (error != 0)
                        goto out;

                /* From here on the pcb is an IPv4 pcb. */
                inp->inp_vflag |= INP_IPV4;
                inp->inp_vflag &= ~INP_IPV6;
                NET_EPOCH_ENTER(et);
                error = tcp_connect(tp, &dst4, td);
                if (error != 0)
                        goto out_in_epoch;
#ifdef TCP_OFFLOAD
                if (registered_toedevs > 0 &&
                    (so->so_options & SO_NO_OFFLOAD) == 0) {
                        error = tcp_offload_connect(so, nam);
                        if (error == 0)
                                goto out_in_epoch;
                }
#endif
                /*
                 * NOTE(review): unlike the native IPv6 path below and
                 * tcp_usr_connect(), TT_KEEP is not armed on this path --
                 * confirm this is intended.
                 */
                error = tcp_output(tp);
                goto out_in_epoch;
        } else if ((inp->inp_vflag & INP_IPV6) == 0) {
                error = EAFNOSUPPORT;
                goto out;
        }
#endif
        error = prison_remote_ip6(td->td_ucred, &dst6->sin6_addr);
        if (error != 0)
                goto out;

        /* Native IPv6 destination: the pcb becomes IPv6-only. */
        inp->inp_vflag &= ~INP_IPV4;
        inp->inp_vflag |= INP_IPV6;
        inp->inp_inc.inc_flags |= INC_ISIPV6;
        NET_EPOCH_ENTER(et);
        error = tcp6_connect(tp, dst6, td);
        if (error != 0)
                goto out_in_epoch;
#ifdef TCP_OFFLOAD
        if (registered_toedevs > 0 &&
            (so->so_options & SO_NO_OFFLOAD) == 0) {
                error = tcp_offload_connect(so, nam);
                if (error == 0)
                        goto out_in_epoch;
        }
#endif
        tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
        error = tcp_output(tp);
out_in_epoch:
        NET_EPOCH_EXIT(et);
out:
        /*
         * tp may still be NULL when we get here from the early error
         * paths; the message arguments are only used if the assertion
         * fails, and those paths all carry a positive error.
         */
        KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()"
            ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error));
        /*
         * If the implicit bind in the connect call fails, restore
         * the flags we modified.
         */
        if (error != 0 && inp->inp_lport == 0) {
                inp->inp_vflag = saved_vflag;
                inp->inp_inc.inc_flags = saved_incflags;
        }

        TCP_PROBE2(debug__user, tp, PRU_CONNECT);
        INP_WUNLOCK(inp);
        return (error);
}
628 #endif /* INET6 */
629
630 /*
631  * Initiate disconnect from peer.
632  * If connection never passed embryonic stage, just drop;
633  * else if don't need to let data drain, then can just drop anyways,
634  * else have to begin TCP shutdown process: mark socket disconnecting,
635  * drain unread data, state switch to reflect user close, and
636  * send segment (e.g. FIN) to peer.  Socket will be really disconnected
637  * when peer sends FIN and acks ours.
638  *
639  * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
640  */
641 static int
642 tcp_usr_disconnect(struct socket *so)
643 {
644         struct inpcb *inp;
645         struct tcpcb *tp = NULL;
646         struct epoch_tracker et;
647         int error = 0;
648
649         NET_EPOCH_ENTER(et);
650         inp = sotoinpcb(so);
651         KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
652         INP_WLOCK(inp);
653         if (inp->inp_flags & INP_DROPPED) {
654                 error = ECONNRESET;
655                 goto out;
656         }
657         tp = intotcpcb(inp);
658         tcp_disconnect(tp);
659 out:
660         TCP_PROBE2(debug__user, tp, PRU_DISCONNECT);
661         INP_WUNLOCK(inp);
662         NET_EPOCH_EXIT(et);
663         return (error);
664 }
665
666 #ifdef INET
667 /*
668  * Accept a connection.  Essentially all the work is done at higher levels;
669  * just return the address of the peer, storing through addr.
670  */
671 static int
672 tcp_usr_accept(struct socket *so, struct sockaddr **nam)
673 {
674         int error = 0;
675         struct inpcb *inp = NULL;
676 #ifdef KDTRACE_HOOKS
677         struct tcpcb *tp = NULL;
678 #endif
679         struct in_addr addr;
680         in_port_t port = 0;
681
682         if (so->so_state & SS_ISDISCONNECTED)
683                 return (ECONNABORTED);
684
685         inp = sotoinpcb(so);
686         KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
687         INP_WLOCK(inp);
688         if (inp->inp_flags & INP_DROPPED) {
689                 error = ECONNABORTED;
690                 goto out;
691         }
692 #ifdef KDTRACE_HOOKS
693         tp = intotcpcb(inp);
694 #endif
695
696         /*
697          * We inline in_getpeeraddr and COMMON_END here, so that we can
698          * copy the data of interest and defer the malloc until after we
699          * release the lock.
700          */
701         port = inp->inp_fport;
702         addr = inp->inp_faddr;
703
704 out:
705         TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
706         INP_WUNLOCK(inp);
707         if (error == 0)
708                 *nam = in_sockaddr(port, &addr);
709         return error;
710 }
711 #endif /* INET */
712
713 #ifdef INET6
714 static int
715 tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
716 {
717         struct inpcb *inp = NULL;
718         int error = 0;
719 #ifdef KDTRACE_HOOKS
720         struct tcpcb *tp = NULL;
721 #endif
722         struct in_addr addr;
723         struct in6_addr addr6;
724         struct epoch_tracker et;
725         in_port_t port = 0;
726         int v4 = 0;
727
728         if (so->so_state & SS_ISDISCONNECTED)
729                 return (ECONNABORTED);
730
731         inp = sotoinpcb(so);
732         KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
733         NET_EPOCH_ENTER(et);
734         INP_WLOCK(inp);
735         if (inp->inp_flags & INP_DROPPED) {
736                 error = ECONNABORTED;
737                 goto out;
738         }
739 #ifdef KDTRACE_HOOKS
740         tp = intotcpcb(inp);
741 #endif
742
743         /*
744          * We inline in6_mapped_peeraddr and COMMON_END here, so that we can
745          * copy the data of interest and defer the malloc until after we
746          * release the lock.
747          */
748         if (inp->inp_vflag & INP_IPV4) {
749                 v4 = 1;
750                 port = inp->inp_fport;
751                 addr = inp->inp_faddr;
752         } else {
753                 port = inp->inp_fport;
754                 addr6 = inp->in6p_faddr;
755         }
756
757 out:
758         TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
759         INP_WUNLOCK(inp);
760         NET_EPOCH_EXIT(et);
761         if (error == 0) {
762                 if (v4)
763                         *nam = in6_v4mapsin6_sockaddr(port, &addr);
764                 else
765                         *nam = in6_sockaddr(port, &addr6);
766         }
767         return error;
768 }
769 #endif /* INET6 */
770
771 /*
772  * Mark the connection as being incapable of further output.
773  */
774 static int
775 tcp_usr_shutdown(struct socket *so)
776 {
777         int error = 0;
778         struct inpcb *inp;
779         struct tcpcb *tp = NULL;
780         struct epoch_tracker et;
781
782         inp = sotoinpcb(so);
783         KASSERT(inp != NULL, ("inp == NULL"));
784         INP_WLOCK(inp);
785         if (inp->inp_flags & INP_DROPPED) {
786                 INP_WUNLOCK(inp);
787                 return (ECONNRESET);
788         }
789         tp = intotcpcb(inp);
790         NET_EPOCH_ENTER(et);
791         socantsendmore(so);
792         tcp_usrclosed(tp);
793         if (!(inp->inp_flags & INP_DROPPED))
794                 error = tcp_output_nodrop(tp);
795         TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN);
796         error = tcp_unlock_or_drop(tp, error);
797         NET_EPOCH_EXIT(et);
798
799         return (error);
800 }
801
802 /*
803  * After a receive, possibly send window update to peer.
804  */
805 static int
806 tcp_usr_rcvd(struct socket *so, int flags)
807 {
808         struct epoch_tracker et;
809         struct inpcb *inp;
810         struct tcpcb *tp = NULL;
811         int outrv = 0, error = 0;
812
813         inp = sotoinpcb(so);
814         KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
815         INP_WLOCK(inp);
816         if (inp->inp_flags & INP_DROPPED) {
817                 INP_WUNLOCK(inp);
818                 return (ECONNRESET);
819         }
820         tp = intotcpcb(inp);
821         NET_EPOCH_ENTER(et);
822         /*
823          * For passively-created TFO connections, don't attempt a window
824          * update while still in SYN_RECEIVED as this may trigger an early
825          * SYN|ACK.  It is preferable to have the SYN|ACK be sent along with
826          * application response data, or failing that, when the DELACK timer
827          * expires.
828          */
829         if (IS_FASTOPEN(tp->t_flags) &&
830             (tp->t_state == TCPS_SYN_RECEIVED))
831                 goto out;
832 #ifdef TCP_OFFLOAD
833         if (tp->t_flags & TF_TOE)
834                 tcp_offload_rcvd(tp);
835         else
836 #endif
837                 outrv = tcp_output_nodrop(tp);
838 out:
839         TCP_PROBE2(debug__user, tp, PRU_RCVD);
840         (void) tcp_unlock_or_drop(tp, outrv);
841         NET_EPOCH_EXIT(et);
842         return (error);
843 }
844
845 /*
846  * Do a send by putting data in output queue and updating urgent
847  * marker if URG set.  Possibly send more data.  Unlike the other
848  * pru_*() routines, the mbuf chains are our responsibility.  We
849  * must either enqueue them or free them.  The other pru_* routines
850  * generally are caller-frees.
851  */
852 static int
853 tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
854     struct sockaddr *nam, struct mbuf *control, struct thread *td)
855 {
856         struct epoch_tracker et;
857         int error = 0;
858         struct inpcb *inp;
859         struct tcpcb *tp = NULL;
860 #ifdef INET
861 #ifdef INET6
862         struct sockaddr_in sin;
863 #endif
864         struct sockaddr_in *sinp;
865 #endif
866 #ifdef INET6
867         struct sockaddr_in6 *sin6;
868         int isipv6;
869 #endif
870         u_int8_t incflagsav;
871         u_char vflagsav;
872         bool restoreflags;
873
874         if (control != NULL) {
875                 /* TCP doesn't do control messages (rights, creds, etc) */
876                 if (control->m_len) {
877                         m_freem(control);
878                         return (EINVAL);
879                 }
880                 m_freem(control);       /* empty control, just free it */
881         }
882
883         inp = sotoinpcb(so);
884         KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
885         INP_WLOCK(inp);
886         if (inp->inp_flags & INP_DROPPED) {
887                 if (m != NULL && (flags & PRUS_NOTREADY) == 0)
888                         m_freem(m);
889                 INP_WUNLOCK(inp);
890                 return (ECONNRESET);
891         }
892
893         vflagsav = inp->inp_vflag;
894         incflagsav = inp->inp_inc.inc_flags;
895         restoreflags = false;
896         tp = intotcpcb(inp);
897
898         NET_EPOCH_ENTER(et);
899         if ((flags & PRUS_OOB) != 0 &&
900             (error = tcp_pru_options_support(tp, PRUS_OOB)) != 0)
901                 goto out;
902
903         if (nam != NULL && tp->t_state < TCPS_SYN_SENT) {
904                 if (tp->t_state == TCPS_LISTEN) {
905                         error = EINVAL;
906                         goto out;
907                 }
908                 switch (nam->sa_family) {
909 #ifdef INET
910                 case AF_INET:
911                         sinp = (struct sockaddr_in *)nam;
912                         if (sinp->sin_len != sizeof(struct sockaddr_in)) {
913                                 error = EINVAL;
914                                 goto out;
915                         }
916                         if ((inp->inp_vflag & INP_IPV6) != 0) {
917                                 error = EAFNOSUPPORT;
918                                 goto out;
919                         }
920                         if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
921                                 error = EAFNOSUPPORT;
922                                 goto out;
923                         }
924                         if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST) {
925                                 error = EACCES;
926                                 goto out;
927                         }
928                         if ((error = prison_remote_ip4(td->td_ucred,
929                             &sinp->sin_addr)))
930                                 goto out;
931 #ifdef INET6
932                         isipv6 = 0;
933 #endif
934                         break;
935 #endif /* INET */
936 #ifdef INET6
937                 case AF_INET6:
938                         sin6 = (struct sockaddr_in6 *)nam;
939                         if (sin6->sin6_len != sizeof(*sin6)) {
940                                 error = EINVAL;
941                                 goto out;
942                         }
943                         if ((inp->inp_vflag & INP_IPV6PROTO) == 0) {
944                                 error = EAFNOSUPPORT;
945                                 goto out;
946                         }
947                         if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
948                                 error = EAFNOSUPPORT;
949                                 goto out;
950                         }
951                         if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
952 #ifdef INET
953                                 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
954                                         error = EINVAL;
955                                         goto out;
956                                 }
957                                 if ((inp->inp_vflag & INP_IPV4) == 0) {
958                                         error = EAFNOSUPPORT;
959                                         goto out;
960                                 }
961                                 restoreflags = true;
962                                 inp->inp_vflag &= ~INP_IPV6;
963                                 sinp = &sin;
964                                 in6_sin6_2_sin(sinp, sin6);
965                                 if (IN_MULTICAST(
966                                     ntohl(sinp->sin_addr.s_addr))) {
967                                         error = EAFNOSUPPORT;
968                                         goto out;
969                                 }
970                                 if ((error = prison_remote_ip4(td->td_ucred,
971                                     &sinp->sin_addr)))
972                                         goto out;
973                                 isipv6 = 0;
974 #else /* !INET */
975                                 error = EAFNOSUPPORT;
976                                 goto out;
977 #endif /* INET */
978                         } else {
979                                 if ((inp->inp_vflag & INP_IPV6) == 0) {
980                                         error = EAFNOSUPPORT;
981                                         goto out;
982                                 }
983                                 restoreflags = true;
984                                 inp->inp_vflag &= ~INP_IPV4;
985                                 inp->inp_inc.inc_flags |= INC_ISIPV6;
986                                 if ((error = prison_remote_ip6(td->td_ucred,
987                                     &sin6->sin6_addr)))
988                                         goto out;
989                                 isipv6 = 1;
990                         }
991                         break;
992 #endif /* INET6 */
993                 default:
994                         error = EAFNOSUPPORT;
995                         goto out;
996                 }
997         }
998         if (!(flags & PRUS_OOB)) {
999                 if (tp->t_acktime == 0)
1000                         tp->t_acktime = ticks;
1001                 sbappendstream(&so->so_snd, m, flags);
1002                 m = NULL;
1003                 if (nam && tp->t_state < TCPS_SYN_SENT) {
1004                         KASSERT(tp->t_state == TCPS_CLOSED,
1005                             ("%s: tp %p is listening", __func__, tp));
1006
1007                         /*
1008                          * Do implied connect if not yet connected,
1009                          * initialize window to default value, and
1010                          * initialize maxseg using peer's cached MSS.
1011                          */
1012 #ifdef INET6
1013                         if (isipv6)
1014                                 error = tcp6_connect(tp, sin6, td);
1015 #endif /* INET6 */
1016 #if defined(INET6) && defined(INET)
1017                         else
1018 #endif
1019 #ifdef INET
1020                                 error = tcp_connect(tp, sinp, td);
1021 #endif
1022                         /*
1023                          * The bind operation in tcp_connect succeeded. We
1024                          * no longer want to restore the flags if later
1025                          * operations fail.
1026                          */
1027                         if (error == 0 || inp->inp_lport != 0)
1028                                 restoreflags = false;
1029
1030                         if (error) {
1031                                 /* m is freed if PRUS_NOTREADY is unset. */
1032                                 sbflush(&so->so_snd);
1033                                 goto out;
1034                         }
1035                         if (IS_FASTOPEN(tp->t_flags))
1036                                 tcp_fastopen_connect(tp);
1037                         else {
1038                                 tp->snd_wnd = TTCP_CLIENT_SND_WND;
1039                                 tcp_mss(tp, -1);
1040                         }
1041                 }
1042                 if (flags & PRUS_EOF) {
1043                         /*
1044                          * Close the send side of the connection after
1045                          * the data is sent.
1046                          */
1047                         socantsendmore(so);
1048                         tcp_usrclosed(tp);
1049                 }
1050                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
1051                     ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
1052                     (tp->t_fbyte_out == 0) &&
1053                     (so->so_snd.sb_ccc > 0)) {
1054                         tp->t_fbyte_out = ticks;
1055                         if (tp->t_fbyte_out == 0)
1056                                 tp->t_fbyte_out = 1;
1057                         if (tp->t_fbyte_out && tp->t_fbyte_in)
1058                                 tp->t_flags2 |= TF2_FBYTES_COMPLETE;
1059                 }
1060                 if (!(inp->inp_flags & INP_DROPPED) &&
1061                     !(flags & PRUS_NOTREADY)) {
1062                         if (flags & PRUS_MORETOCOME)
1063                                 tp->t_flags |= TF_MORETOCOME;
1064                         error = tcp_output_nodrop(tp);
1065                         if (flags & PRUS_MORETOCOME)
1066                                 tp->t_flags &= ~TF_MORETOCOME;
1067                 }
1068         } else {
1069                 /*
1070                  * XXXRW: PRUS_EOF not implemented with PRUS_OOB?
1071                  */
1072                 SOCKBUF_LOCK(&so->so_snd);
1073                 if (sbspace(&so->so_snd) < -512) {
1074                         SOCKBUF_UNLOCK(&so->so_snd);
1075                         error = ENOBUFS;
1076                         goto out;
1077                 }
1078                 /*
1079                  * According to RFC961 (Assigned Protocols),
1080                  * the urgent pointer points to the last octet
1081                  * of urgent data.  We continue, however,
1082                  * to consider it to indicate the first octet
1083                  * of data past the urgent section.
1084                  * Otherwise, snd_up should be one lower.
1085                  */
1086                 if (tp->t_acktime == 0)
1087                         tp->t_acktime = ticks;
1088                 sbappendstream_locked(&so->so_snd, m, flags);
1089                 SOCKBUF_UNLOCK(&so->so_snd);
1090                 m = NULL;
1091                 if (nam && tp->t_state < TCPS_SYN_SENT) {
1092                         /*
1093                          * Do implied connect if not yet connected,
1094                          * initialize window to default value, and
1095                          * initialize maxseg using peer's cached MSS.
1096                          */
1097
1098                         /*
1099                          * Not going to contemplate SYN|URG
1100                          */
1101                         if (IS_FASTOPEN(tp->t_flags))
1102                                 tp->t_flags &= ~TF_FASTOPEN;
1103 #ifdef INET6
1104                         if (isipv6)
1105                                 error = tcp6_connect(tp, sin6, td);
1106 #endif /* INET6 */
1107 #if defined(INET6) && defined(INET)
1108                         else
1109 #endif
1110 #ifdef INET
1111                                 error = tcp_connect(tp, sinp, td);
1112 #endif
1113                         /*
1114                          * The bind operation in tcp_connect succeeded. We
1115                          * no longer want to restore the flags if later
1116                          * operations fail.
1117                          */
1118                         if (error == 0 || inp->inp_lport != 0)
1119                                 restoreflags = false;
1120
1121                         if (error != 0) {
1122                                 /* m is freed if PRUS_NOTREADY is unset. */
1123                                 sbflush(&so->so_snd);
1124                                 goto out;
1125                         }
1126                         tp->snd_wnd = TTCP_CLIENT_SND_WND;
1127                         tcp_mss(tp, -1);
1128                 }
1129                 tp->snd_up = tp->snd_una + sbavail(&so->so_snd);
1130                 if ((flags & PRUS_NOTREADY) == 0) {
1131                         tp->t_flags |= TF_FORCEDATA;
1132                         error = tcp_output_nodrop(tp);
1133                         tp->t_flags &= ~TF_FORCEDATA;
1134                 }
1135         }
1136         TCP_LOG_EVENT(tp, NULL,
1137             &inp->inp_socket->so_rcv,
1138             &inp->inp_socket->so_snd,
1139             TCP_LOG_USERSEND, error,
1140             0, NULL, false);
1141
1142 out:
1143         /*
1144          * In case of PRUS_NOTREADY, the caller or tcp_usr_ready() is
1145          * responsible for freeing memory.
1146          */
1147         if (m != NULL && (flags & PRUS_NOTREADY) == 0)
1148                 m_freem(m);
1149
1150         /*
1151          * If the request was unsuccessful and we changed flags,
1152          * restore the original flags.
1153          */
1154         if (error != 0 && restoreflags) {
1155                 inp->inp_vflag = vflagsav;
1156                 inp->inp_inc.inc_flags = incflagsav;
1157         }
1158         TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB :
1159                    ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
1160         error = tcp_unlock_or_drop(tp, error);
1161         NET_EPOCH_EXIT(et);
1162         return (error);
1163 }
1164
1165 static int
1166 tcp_usr_ready(struct socket *so, struct mbuf *m, int count)
1167 {
1168         struct epoch_tracker et;
1169         struct inpcb *inp;
1170         struct tcpcb *tp;
1171         int error;
1172
1173         inp = sotoinpcb(so);
1174         INP_WLOCK(inp);
1175         if (inp->inp_flags & INP_DROPPED) {
1176                 INP_WUNLOCK(inp);
1177                 mb_free_notready(m, count);
1178                 return (ECONNRESET);
1179         }
1180         tp = intotcpcb(inp);
1181
1182         SOCKBUF_LOCK(&so->so_snd);
1183         error = sbready(&so->so_snd, m, count);
1184         SOCKBUF_UNLOCK(&so->so_snd);
1185         if (error) {
1186                 INP_WUNLOCK(inp);
1187                 return (error);
1188         }
1189         NET_EPOCH_ENTER(et);
1190         error = tcp_output_unlock(tp);
1191         NET_EPOCH_EXIT(et);
1192
1193         return (error);
1194 }
1195
1196 /*
1197  * Abort the TCP.  Drop the connection abruptly.
1198  */
1199 static void
1200 tcp_usr_abort(struct socket *so)
1201 {
1202         struct inpcb *inp;
1203         struct tcpcb *tp = NULL;
1204         struct epoch_tracker et;
1205
1206         inp = sotoinpcb(so);
1207         KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
1208
1209         NET_EPOCH_ENTER(et);
1210         INP_WLOCK(inp);
1211         KASSERT(inp->inp_socket != NULL,
1212             ("tcp_usr_abort: inp_socket == NULL"));
1213
1214         /*
1215          * If we still have full TCP state, and we're not dropped, drop.
1216          */
1217         if (!(inp->inp_flags & INP_DROPPED)) {
1218                 tp = intotcpcb(inp);
1219                 tp = tcp_drop(tp, ECONNABORTED);
1220                 if (tp == NULL)
1221                         goto dropped;
1222                 TCP_PROBE2(debug__user, tp, PRU_ABORT);
1223         }
1224         if (!(inp->inp_flags & INP_DROPPED)) {
1225                 soref(so);
1226                 inp->inp_flags |= INP_SOCKREF;
1227         }
1228         INP_WUNLOCK(inp);
1229 dropped:
1230         NET_EPOCH_EXIT(et);
1231 }
1232
1233 /*
1234  * TCP socket is closed.  Start friendly disconnect.
1235  */
1236 static void
1237 tcp_usr_close(struct socket *so)
1238 {
1239         struct inpcb *inp;
1240         struct tcpcb *tp = NULL;
1241         struct epoch_tracker et;
1242
1243         inp = sotoinpcb(so);
1244         KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
1245
1246         NET_EPOCH_ENTER(et);
1247         INP_WLOCK(inp);
1248         KASSERT(inp->inp_socket != NULL,
1249             ("tcp_usr_close: inp_socket == NULL"));
1250
1251         /*
1252          * If we still have full TCP state, and we're not dropped, initiate
1253          * a disconnect.
1254          */
1255         if (!(inp->inp_flags & INP_DROPPED)) {
1256                 tp = intotcpcb(inp);
1257                 tp->t_flags |= TF_CLOSED;
1258                 tcp_disconnect(tp);
1259                 TCP_PROBE2(debug__user, tp, PRU_CLOSE);
1260         }
1261         if (!(inp->inp_flags & INP_DROPPED)) {
1262                 soref(so);
1263                 inp->inp_flags |= INP_SOCKREF;
1264         }
1265         INP_WUNLOCK(inp);
1266         NET_EPOCH_EXIT(et);
1267 }
1268
1269 static int
1270 tcp_pru_options_support(struct tcpcb *tp, int flags)
1271 {
1272         /*
1273          * If the specific TCP stack has a pru_options
1274          * specified then it does not always support
1275          * all the PRU_XX options and we must ask it.
1276          * If the function is not specified then all
1277          * of the PRU_XX options are supported.
1278          */
1279         int ret = 0;
1280
1281         if (tp->t_fb->tfb_pru_options) {
1282                 ret = (*tp->t_fb->tfb_pru_options)(tp, flags);
1283         }
1284         return (ret);
1285 }
1286
1287 /*
1288  * Receive out-of-band data.
1289  */
1290 static int
1291 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
1292 {
1293         int error = 0;
1294         struct inpcb *inp;
1295         struct tcpcb *tp = NULL;
1296
1297         inp = sotoinpcb(so);
1298         KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL"));
1299         INP_WLOCK(inp);
1300         if (inp->inp_flags & INP_DROPPED) {
1301                 error = ECONNRESET;
1302                 goto out;
1303         }
1304         tp = intotcpcb(inp);
1305         error = tcp_pru_options_support(tp, PRUS_OOB);
1306         if (error) {
1307                 goto out;
1308         }
1309         if ((so->so_oobmark == 0 &&
1310              (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
1311             so->so_options & SO_OOBINLINE ||
1312             tp->t_oobflags & TCPOOB_HADDATA) {
1313                 error = EINVAL;
1314                 goto out;
1315         }
1316         if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
1317                 error = EWOULDBLOCK;
1318                 goto out;
1319         }
1320         m->m_len = 1;
1321         *mtod(m, caddr_t) = tp->t_iobc;
1322         if ((flags & MSG_PEEK) == 0)
1323                 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1324
1325 out:
1326         TCP_PROBE2(debug__user, tp, PRU_RCVOOB);
1327         INP_WUNLOCK(inp);
1328         return (error);
1329 }
1330
#ifdef INET
/*
 * Protocol switch for TCP when built with INET.  Entries are grouped by
 * role; with designated initializers the order carries no meaning.
 */
struct protosw tcp_protosw = {
        /* Protocol identity and socket-layer flags. */
        .pr_type =              SOCK_STREAM,
        .pr_protocol =          IPPROTO_TCP,
        .pr_flags =             PR_CONNREQUIRED | PR_IMPLOPCL | PR_WANTRCVD |
                                    PR_CAPATTACH,

        /* Socket lifetime. */
        .pr_attach =            tcp_usr_attach,
        .pr_detach =            tcp_usr_detach,
        .pr_close =             tcp_usr_close,
        .pr_abort =             tcp_usr_abort,

        /* Connection setup and teardown. */
        .pr_bind =              tcp_usr_bind,
        .pr_listen =            tcp_usr_listen,
        .pr_accept =            tcp_usr_accept,
        .pr_connect =           tcp_usr_connect,
        .pr_disconnect =        tcp_usr_disconnect,
        .pr_shutdown =          tcp_usr_shutdown,

        /* Data transfer. */
        .pr_send =              tcp_usr_send,
        .pr_ready =             tcp_usr_ready,
        .pr_rcvd =              tcp_usr_rcvd,
        .pr_rcvoob =            tcp_usr_rcvoob,

        /* Addresses, options and control. */
        .pr_sockaddr =          in_getsockaddr,
        .pr_peeraddr =          in_getpeeraddr,
        .pr_ctloutput =         tcp_ctloutput,
        .pr_control =           in_control,
        .pr_sosetlabel =        in_pcbsosetlabel,
};
#endif /* INET */
1358
#ifdef INET6
/*
 * Protocol switch for TCP when built with INET6.  Entries are grouped by
 * role; with designated initializers the order carries no meaning.  The
 * accept, bind, connect and listen handlers are the tcp6_ variants, and the
 * address and control handlers are the in6_ ones.
 */
struct protosw tcp6_protosw = {
        /* Protocol identity and socket-layer flags. */
        .pr_type =              SOCK_STREAM,
        .pr_protocol =          IPPROTO_TCP,
        .pr_flags =             PR_CONNREQUIRED | PR_IMPLOPCL | PR_WANTRCVD |
                                    PR_CAPATTACH,

        /* Socket lifetime. */
        .pr_attach =            tcp_usr_attach,
        .pr_detach =            tcp_usr_detach,
        .pr_close =             tcp_usr_close,
        .pr_abort =             tcp_usr_abort,

        /* Connection setup and teardown. */
        .pr_bind =              tcp6_usr_bind,
        .pr_listen =            tcp6_usr_listen,
        .pr_accept =            tcp6_usr_accept,
        .pr_connect =           tcp6_usr_connect,
        .pr_disconnect =        tcp_usr_disconnect,
        .pr_shutdown =          tcp_usr_shutdown,

        /* Data transfer. */
        .pr_send =              tcp_usr_send,
        .pr_ready =             tcp_usr_ready,
        .pr_rcvd =              tcp_usr_rcvd,
        .pr_rcvoob =            tcp_usr_rcvoob,

        /* Addresses, options and control. */
        .pr_sockaddr =          in6_mapped_sockaddr,
        .pr_peeraddr =          in6_mapped_peeraddr,
        .pr_ctloutput =         tcp_ctloutput,
        .pr_control =           in6_control,
        .pr_sosetlabel =        in_pcbsosetlabel,
};
#endif /* INET6 */
1386
#ifdef INET
/*
 * Common subroutine to open a TCP connection to the remote host given by a
 * struct sockaddr_in.  in_pcbconnect() is called under the tcbinfo hash
 * write lock to choose the local address, assign a local port and install
 * the inpcb into the hash.  On success the connection parameters are
 * initialized and the tcpcb enters SYN-SENT.
 *
 * Must be called inside the network epoch with the inpcb write-locked.
 * Returns 0 on success or the error from in_pcbconnect().
 */
static int
tcp_connect(struct tcpcb *tp, struct sockaddr_in *sin, struct thread *td)
{
        struct inpcb *inp;
        int error;

        inp = tptoinpcb(tp);
        NET_EPOCH_ASSERT();
        INP_WLOCK_ASSERT(inp);

        INP_HASH_WLOCK(&V_tcbinfo);
        error = in_pcbconnect(inp, sin, td->td_ucred, true);
        INP_HASH_WUNLOCK(&V_tcbinfo);
        if (error != 0)
                return (error);

        /*
         * Compute window scaling to request:
         * Scale to fit into sweet spot.  See tcp_syncache.c.
         * XXX: This should move to tcp_output().
         */
        for (; tp->request_r_scale < TCP_MAX_WINSHIFT &&
            (TCP_MAXWIN << tp->request_r_scale) < sb_max;
            tp->request_r_scale++)
                continue;

        soisconnecting(tptosocket(tp));
        TCPSTAT_INC(tcps_connattempt);
        tcp_state_change(tp, TCPS_SYN_SENT);

        /* Pick the initial send sequence and, if requested, TS offset. */
        tp->iss = tcp_new_isn(&inp->inp_inc);
        if ((tp->t_flags & TF_REQ_TSTMP) != 0)
                tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc);
        tcp_sendseqinit(tp);

        return (0);
}
#endif /* INET */
1430
#ifdef INET6
/*
 * IPv6 counterpart of tcp_connect(): open a TCP connection to the remote
 * host given by a struct sockaddr_in6.  in6_pcbconnect() is called inside
 * a network epoch section entered here and under the tcbinfo hash write
 * lock.  On success the connection parameters are initialized and the
 * tcpcb enters SYN-SENT.
 *
 * Must be called with the inpcb write-locked.  Returns 0 on success or the
 * error from in6_pcbconnect().
 */
static int
tcp6_connect(struct tcpcb *tp, struct sockaddr_in6 *sin6, struct thread *td)
{
        struct epoch_tracker et;
        struct inpcb *inp;
        int error;

        inp = tptoinpcb(tp);
        INP_WLOCK_ASSERT(inp);

        NET_EPOCH_ENTER(et);
        INP_HASH_WLOCK(&V_tcbinfo);
        error = in6_pcbconnect(inp, sin6, td->td_ucred, true);
        INP_HASH_WUNLOCK(&V_tcbinfo);
        NET_EPOCH_EXIT(et);
        if (error != 0)
                return (error);

        /* Compute window scaling to request.  */
        for (; tp->request_r_scale < TCP_MAX_WINSHIFT &&
            (TCP_MAXWIN << tp->request_r_scale) < sb_max;
            tp->request_r_scale++)
                continue;

        soisconnecting(inp->inp_socket);
        TCPSTAT_INC(tcps_connattempt);
        tcp_state_change(tp, TCPS_SYN_SENT);

        /* Pick the initial send sequence and, if requested, TS offset. */
        tp->iss = tcp_new_isn(&inp->inp_inc);
        if ((tp->t_flags & TF_REQ_TSTMP) != 0)
                tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc);
        tcp_sendseqinit(tp);

        return (0);
}
#endif /* INET6 */
1465
1466 /*
1467  * Export TCP internal state information via a struct tcp_info, based on the
1468  * Linux 2.6 API.  Not ABI compatible as our constants are mapped differently
1469  * (TCP state machine, etc).  We export all information using FreeBSD-native
1470  * constants -- for example, the numeric values for tcpi_state will differ
1471  * from Linux.
1472  */
1473 static void
1474 tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
1475 {
1476
1477         INP_WLOCK_ASSERT(tptoinpcb(tp));
1478         bzero(ti, sizeof(*ti));
1479
1480         ti->tcpi_state = tp->t_state;
1481         if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
1482                 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1483         if (tp->t_flags & TF_SACK_PERMIT)
1484                 ti->tcpi_options |= TCPI_OPT_SACK;
1485         if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
1486                 ti->tcpi_options |= TCPI_OPT_WSCALE;
1487                 ti->tcpi_snd_wscale = tp->snd_scale;
1488                 ti->tcpi_rcv_wscale = tp->rcv_scale;
1489         }
1490         if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
1491                 ti->tcpi_options |= TCPI_OPT_ECN;
1492
1493         ti->tcpi_rto = tp->t_rxtcur * tick;
1494         ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
1495         ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
1496         ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;
1497
1498         ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
1499         ti->tcpi_snd_cwnd = tp->snd_cwnd;
1500
1501         /*
1502          * FreeBSD-specific extension fields for tcp_info.
1503          */
1504         ti->tcpi_rcv_space = tp->rcv_wnd;
1505         ti->tcpi_rcv_nxt = tp->rcv_nxt;
1506         ti->tcpi_snd_wnd = tp->snd_wnd;
1507         ti->tcpi_snd_bwnd = 0;          /* Unused, kept for compat. */
1508         ti->tcpi_snd_nxt = tp->snd_nxt;
1509         ti->tcpi_snd_mss = tp->t_maxseg;
1510         ti->tcpi_rcv_mss = tp->t_maxseg;
1511         ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
1512         ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
1513         ti->tcpi_snd_zerowin = tp->t_sndzerowin;
1514 #ifdef TCP_OFFLOAD
1515         if (tp->t_flags & TF_TOE) {
1516                 ti->tcpi_options |= TCPI_OPT_TOE;
1517                 tcp_offload_tcp_info(tp, ti);
1518         }
1519 #endif
1520         /*
1521          * AccECN related counters.
1522          */
1523         if ((tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) ==
1524             (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
1525                 /*
1526                  * Internal counter starts at 5 for AccECN
1527                  * but 0 for RFC3168 ECN.
1528                  */
1529                 ti->tcpi_delivered_ce = tp->t_scep - 5;
1530         else
1531                 ti->tcpi_delivered_ce = tp->t_scep;
1532         ti->tcpi_received_ce = tp->t_rcep;
1533 }
1534
/*
 * tcp_ctloutput() must drop the inpcb lock before performing copyin on
 * socket option arguments.  When it re-acquires the lock after the copy, it
 * has to revalidate that the connection is still valid for the socket
 * option.
 *
 * INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) write-locks inp again.  If the
 * inpcb has been marked INP_DROPPED in the meantime, it unlocks, runs the
 * cleanup statement and makes the *enclosing function* return ECONNRESET.
 * Otherwise it reloads the caller's local variable tp from inp and leaves
 * the lock held.  The enclosing function therefore has to return int and
 * have a variable named tp in scope.
 *
 * The inp argument is expanded several times, so it must be free of side
 * effects.  It is parenthesized where it is dereferenced so that an
 * expression argument (e.g. &pcb) expands correctly.
 *
 * INP_WLOCK_RECHECK(inp) is the same without a cleanup statement.
 */
#define INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do {                    \
        INP_WLOCK(inp);                                                 \
        if (((inp)->inp_flags & INP_DROPPED) != 0) {                    \
                INP_WUNLOCK(inp);                                       \
                cleanup;                                                \
                return (ECONNRESET);                                    \
        }                                                               \
        tp = intotcpcb(inp);                                            \
} while (0)
#define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */)
1551
/*
 * Handle a SOPT_SET socket option request on a TCP inpcb.
 *
 * Entry conditions (asserted below): sopt->sopt_dir is SOPT_SET, the inpcb
 * is write-locked, it is not marked INP_DROPPED and it still has a socket.
 *
 * Three cases are distinguished:
 *
 *  - Options of a level other than IPPROTO_TCP are first given to
 *    ip6_ctloutput() or ip_ctloutput() with the inpcb unlocked.  For
 *    IPV6_TCLASS, IPV6_USE_MIN_MTU, IP_TOS and IP_TTL the inpcb is then
 *    locked again and the request is additionally passed to the TCP stack;
 *    for everything else the IP-level result is returned.
 *  - TCP_FUNCTION_BLK is handled entirely here: the requested function
 *    block is looked up and the connection is switched over to it.
 *  - All remaining IPPROTO_TCP options go to the stack's tfb_tcp_ctloutput
 *    method.
 *
 * On every return path visible in this function the inpcb lock has been
 * released; on the tfb_tcp_ctloutput path the callee releases it.
 *
 * Returns 0 or an errno value; ECONNRESET if the inpcb is found dropped
 * after the lock had to be re-acquired.
 */
1552 int
1553 tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
1554 {
1555         struct socket *so = inp->inp_socket;
1556         struct tcpcb *tp = intotcpcb(inp);
1557         int error = 0;
1558
1559         MPASS(sopt->sopt_dir == SOPT_SET);
1560         INP_WLOCK_ASSERT(inp);
1561         KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1562             ("inp_flags == %x", inp->inp_flags));
1563         KASSERT(so != NULL, ("inp_socket == NULL"));
1564
1565         if (sopt->sopt_level != IPPROTO_TCP) {
                  /* The IP-level handlers are called with the inpcb unlocked. */
1566                 INP_WUNLOCK(inp);
1567 #ifdef INET6
1568                 if (inp->inp_vflag & INP_IPV6PROTO)
1569                         error = ip6_ctloutput(so, sopt);
1570 #endif
1571 #if defined(INET6) && defined(INET)
1572                 else
1573 #endif
1574 #ifdef INET
1575                         error = ip_ctloutput(so, sopt);
1576 #endif
1577                 /*
1578                  * When an IP-level socket option affects TCP, pass control
1579                  * down to stack tfb_tcp_ctloutput, otherwise return what
1580                  * IP level returned.
1581                  */
1582                 switch (sopt->sopt_level) {
1583 #ifdef INET6
1584                 case IPPROTO_IPV6:
1585                         if ((inp->inp_vflag & INP_IPV6PROTO) == 0)
1586                                 return (error);
1587                         switch (sopt->sopt_name) {
1588                         case IPV6_TCLASS:
1589                                 /* Notify tcp stacks that care (e.g. RACK). */
1590                                 break;
1591                         case IPV6_USE_MIN_MTU:
1592                                 /* Update t_maxseg accordingly. */
1593                                 break;
1594                         default:
1595                                 return (error);
1596                         }
1597                         break;
1598 #endif
1599 #ifdef INET
1600                 case IPPROTO_IP:
1601                         switch (sopt->sopt_name) {
1602                         case IP_TOS:
                                  /*
                                   * Strip the ECN bits from the stored TOS.
                                   * NOTE(review): the inpcb lock is not held
                                   * here; it is only re-acquired after this
                                   * switch.  Confirm that is intended.
                                   */
1603                                 inp->inp_ip_tos &= ~IPTOS_ECN_MASK;
1604                                 break;
1605                         case IP_TTL:
1606                                 /* Notify tcp stacks that care (e.g. RACK). */
1607                                 break;
1608                         default:
1609                                 return (error);
1610                         }
1611                         break;
1612 #endif
1613                 default:
1614                         return (error);
1615                 }
                  /*
                   * Relock and revalidate before handing the option to the
                   * TCP stack at the end of the function.
                   * NOTE(review): the IP-level error is not checked on this
                   * path; the stack's return value replaces it.
                   */
1616                 INP_WLOCK(inp);
1617                 if (inp->inp_flags & INP_DROPPED) {
1618                         INP_WUNLOCK(inp);
1619                         return (ECONNRESET);
1620                 }
1621         } else if (sopt->sopt_name == TCP_FUNCTION_BLK) {
1622                 /*
1623                  * Protect the TCP option TCP_FUNCTION_BLK so
1624                  * that a sub-function can *never* overwrite this.
1625                  */
1626                 struct tcp_function_set fsn;
1627                 struct tcp_function_block *blk;
1628
                  /* Copy in the argument with the inpcb lock dropped. */
1629                 INP_WUNLOCK(inp);
1630                 error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn);
1631                 if (error)
1632                         return (error);
1633
1634                 INP_WLOCK(inp);
1635                 if (inp->inp_flags & INP_DROPPED) {
1636                         INP_WUNLOCK(inp);
1637                         return (ECONNRESET);
1638                 }
1639                 tp = intotcpcb(inp);
1640
                  /*
                   * From here on a reference on blk is held; every exit
                   * that does not install blk has to release it again.
                   */
1641                 blk = find_and_ref_tcp_functions(&fsn);
1642                 if (blk == NULL) {
1643                         INP_WUNLOCK(inp);
1644                         return (ENOENT);
1645                 }
1646                 if (tp->t_fb == blk) {
1647                         /* You already have this */
1648                         refcount_release(&blk->tfb_refcnt);
1649                         INP_WUNLOCK(inp);
1650                         return (0);
1651                 }
1652                 if (tp->t_state != TCPS_CLOSED) {
1653                         /*
1654                          * The user has advanced the state
1655                          * past the initial point, we may not
1656                          * be able to switch.
1657                          */
1658                         if (blk->tfb_tcp_handoff_ok != NULL) {
1659                                 /*
1660                                  * Does the stack provide a
1661                                  * query mechanism, if so it may
1662                                  * still be possible?
1663                                  */
1664                                 error = (*blk->tfb_tcp_handoff_ok)(tp);
1665                         } else
1666                                 error = EINVAL;
1667                         if (error) {
1668                                 refcount_release(&blk->tfb_refcnt);
1669                                 INP_WUNLOCK(inp);
1670                                 return(error);
1671                         }
1672                 }
1673                 if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
1674                         refcount_release(&blk->tfb_refcnt);
1675                         INP_WUNLOCK(inp);
1676                         return (ENOENT);
1677                 }
1678                 /*
1679                  * Release the old refcnt, the
1680                  * lookup acquired a ref on the
1681                  * new one already.
1682                  */
1683                 if (tp->t_fb->tfb_tcp_fb_fini) {
1684                         struct epoch_tracker et;
1685                         /*
1686                          * Tell the stack to cleanup with 0 i.e.
1687                          * the tcb is not going away.
1688                          */
1689                         NET_EPOCH_ENTER(et);
1690                         (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
1691                         NET_EPOCH_EXIT(et);
1692                 }
1693 #ifdef TCPHPTS
1694                 /* Assure that we are not on any hpts */
1695                 tcp_hpts_remove(tptoinpcb(tp));
1696 #endif
1697                 if (blk->tfb_tcp_fb_init) {
1698                         error = (*blk->tfb_tcp_fb_init)(tp);
1699                         if (error) {
                                  /*
                                   * The new stack refused the connection:
                                   * drop its reference and try to bring the
                                   * old stack (still in tp->t_fb) back up.
                                   */
1700                                 refcount_release(&blk->tfb_refcnt);
1701                                 if (tp->t_fb->tfb_tcp_fb_init) {
1702                                         if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0)  {
1703                                                 /* Fall back failed, drop the connection */
1704                                                 INP_WUNLOCK(inp);
1705                                                 soabort(so);
1706                                                 return (error);
1707                                         }
1708                                 }
1709                                 goto err_out;
1710                         }
1711                 }
                  /* Commit the switch: drop the old stack's ref, install blk. */
1712                 refcount_release(&tp->t_fb->tfb_refcnt);
1713                 tp->t_fb = blk;
1714 #ifdef TCP_OFFLOAD
1715                 if (tp->t_flags & TF_TOE) {
1716                         tcp_offload_ctloutput(tp, sopt->sopt_dir,
1717                              sopt->sopt_name);
1718                 }
1719 #endif
1720 err_out:
1721                 INP_WUNLOCK(inp);
1722                 return (error);
1723         }
1724
1725         /* Pass in the INP locked, callee must unlock it. */
1726         return (tp->t_fb->tfb_tcp_ctloutput(inp, sopt));
1727 }
1728
1729 static int
1730 tcp_ctloutput_get(struct inpcb *inp, struct sockopt *sopt)
1731 {
1732         struct socket *so = inp->inp_socket;
1733         struct tcpcb *tp = intotcpcb(inp);
1734         int error = 0;
1735
1736         MPASS(sopt->sopt_dir == SOPT_GET);
1737         INP_WLOCK_ASSERT(inp);
1738         KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1739             ("inp_flags == %x", inp->inp_flags));
1740         KASSERT(so != NULL, ("inp_socket == NULL"));
1741
1742         if (sopt->sopt_level != IPPROTO_TCP) {
1743                 INP_WUNLOCK(inp);
1744 #ifdef INET6
1745                 if (inp->inp_vflag & INP_IPV6PROTO)
1746                         error = ip6_ctloutput(so, sopt);
1747 #endif /* INET6 */
1748 #if defined(INET6) && defined(INET)
1749                 else
1750 #endif
1751 #ifdef INET
1752                         error = ip_ctloutput(so, sopt);
1753 #endif
1754                 return (error);
1755         }
1756         if (((sopt->sopt_name == TCP_FUNCTION_BLK) ||
1757              (sopt->sopt_name == TCP_FUNCTION_ALIAS))) {
1758                 struct tcp_function_set fsn;
1759
1760                 if (sopt->sopt_name == TCP_FUNCTION_ALIAS) {
1761                         memset(&fsn, 0, sizeof(fsn));
1762                         find_tcp_function_alias(tp->t_fb, &fsn);
1763                 } else {
1764                         strncpy(fsn.function_set_name,
1765                             tp->t_fb->tfb_tcp_block_name,
1766                             TCP_FUNCTION_NAME_LEN_MAX);
1767                         fsn.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
1768                 }
1769                 fsn.pcbcnt = tp->t_fb->tfb_refcnt;
1770                 INP_WUNLOCK(inp);
1771                 error = sooptcopyout(sopt, &fsn, sizeof fsn);
1772                 return (error);
1773         }
1774
1775         /* Pass in the INP locked, callee must unlock it. */
1776         return (tp->t_fb->tfb_tcp_ctloutput(inp, sopt));
1777 }
1778
1779 int
1780 tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1781 {
1782         struct  inpcb *inp;
1783
1784         inp = sotoinpcb(so);
1785         KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
1786
1787         INP_WLOCK(inp);
1788         if (inp->inp_flags & INP_DROPPED) {
1789                 INP_WUNLOCK(inp);
1790                 return (ECONNRESET);
1791         }
1792         if (sopt->sopt_dir == SOPT_SET)
1793                 return (tcp_ctloutput_set(inp, sopt));
1794         else if (sopt->sopt_dir == SOPT_GET)
1795                 return (tcp_ctloutput_get(inp, sopt));
1796         else
1797                 panic("%s: sopt_dir $%d", __func__, sopt->sopt_dir);
1798 }
1799
/*
 * If either of these asserts becomes untrue, the size of the buf variable
 * in tcp_default_ctloutput() (declared as TCP_LOG_ID_LEN bytes) needs to
 * change.
 */
1804 #ifdef CTASSERT
1805 CTASSERT(TCP_CA_NAME_MAX <= TCP_LOG_ID_LEN);
1806 CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN);
1807 #endif
1808
#ifdef KERN_TLS
/*
 * Copy a TLS enable request in from a socket option.
 *
 * When the supplied option size matches struct tls_enable_v0, the request
 * is read in that layout and converted: *tls is zeroed and the v0 fields
 * are copied across one by one, so any field of struct tls_enable without
 * a v0 counterpart stays zero.  Any other size is copied in directly as a
 * struct tls_enable, with sooptcopyin() enforcing the full size.
 *
 * Returns 0 on success or the error from sooptcopyin().
 */
static int
copyin_tls_enable(struct sockopt *sopt, struct tls_enable *tls)
{
	struct tls_enable_v0 old;
	int rv;

	/* Anything that is not exactly v0-sized is taken as current layout. */
	if (sopt->sopt_valsize != sizeof(old))
		return (sooptcopyin(sopt, tls, sizeof(*tls), sizeof(*tls)));

	rv = sooptcopyin(sopt, &old, sizeof(old), sizeof(old));
	if (rv != 0)
		return (rv);

	memset(tls, 0, sizeof(*tls));

	/* Key material pointers and their lengths. */
	tls->cipher_key = old.cipher_key;
	tls->cipher_key_len = old.cipher_key_len;
	tls->iv = old.iv;
	tls->iv_len = old.iv_len;
	tls->auth_key = old.auth_key;
	tls->auth_key_len = old.auth_key_len;

	/* Algorithm selection, flags and protocol version. */
	tls->cipher_algorithm = old.cipher_algorithm;
	tls->auth_algorithm = old.auth_algorithm;
	tls->flags = old.flags;
	tls->tls_vmajor = old.tls_vmajor;
	tls->tls_vminor = old.tls_vminor;

	return (0);
}
#endif
1839
1840 extern struct cc_algo newreno_cc_algo;
1841
1842 static int
1843 tcp_set_cc_mod(struct inpcb *inp, struct sockopt *sopt)
1844 {
1845         struct cc_algo *algo;
1846         void *ptr = NULL;
1847         struct tcpcb *tp;
1848         struct cc_var cc_mem;
1849         char    buf[TCP_CA_NAME_MAX];
1850         size_t mem_sz;
1851         int error;
1852
1853         INP_WUNLOCK(inp);
1854         error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1);
1855         if (error)
1856                 return(error);
1857         buf[sopt->sopt_valsize] = '\0';
1858         CC_LIST_RLOCK();
1859         STAILQ_FOREACH(algo, &cc_list, entries) {
1860                 if (strncmp(buf, algo->name,
1861                             TCP_CA_NAME_MAX) == 0) {
1862                         if (algo->flags & CC_MODULE_BEING_REMOVED) {
1863                                 /* We can't "see" modules being unloaded */
1864                                 continue;
1865                         }
1866                         break;
1867                 }
1868         }
1869         if (algo == NULL) {
1870                 CC_LIST_RUNLOCK();
1871                 return(ESRCH);
1872         }
1873         /* 
1874          * With a reference the algorithm cannot be removed
1875          * so we hold a reference through the change process.
1876          */
1877         cc_refer(algo);
1878         CC_LIST_RUNLOCK();
1879         if (algo->cb_init != NULL) {
1880                 /* We can now pre-get the memory for the CC */
1881                 mem_sz = (*algo->cc_data_sz)();
1882                 if (mem_sz == 0) {
1883                         goto no_mem_needed;
1884                 }
1885                 ptr = malloc(mem_sz, M_CC_MEM, M_WAITOK);
1886         } else {
1887 no_mem_needed:
1888                 mem_sz = 0;
1889                 ptr = NULL;
1890         }
1891         /*
1892          * Make sure its all clean and zero and also get
1893          * back the inplock.
1894          */
1895         memset(&cc_mem, 0, sizeof(cc_mem));
1896         INP_WLOCK(inp);
1897         if (inp->inp_flags & INP_DROPPED) {
1898                 INP_WUNLOCK(inp);
1899                 if (ptr)
1900                         free(ptr, M_CC_MEM);
1901                 /* Release our temp reference */
1902                 CC_LIST_RLOCK();
1903                 cc_release(algo);
1904                 CC_LIST_RUNLOCK();
1905                 return (ECONNRESET);
1906         }
1907         tp = intotcpcb(inp);
1908         if (ptr != NULL)
1909                 memset(ptr, 0, mem_sz);
1910         cc_mem.ccvc.tcp = tp;
1911         /*
1912          * We once again hold a write lock over the tcb so it's
1913          * safe to do these things without ordering concerns.
1914          * Note here we init into stack memory.
1915          */
1916         if (algo->cb_init != NULL)
1917                 error = algo->cb_init(&cc_mem, ptr);
1918         else
1919                 error = 0;
1920         /*
1921          * The CC algorithms, when given their memory
1922          * should not fail we could in theory have a
1923          * KASSERT here.
1924          */
1925         if (error == 0) {
1926                 /*
1927                  * Touchdown, lets go ahead and move the
1928                  * connection to the new CC module by
1929                  * copying in the cc_mem after we call
1930                  * the old ones cleanup (if any).
1931                  */
1932                 if (CC_ALGO(tp)->cb_destroy != NULL)
1933                         CC_ALGO(tp)->cb_destroy(&tp->t_ccv);
1934                 /* Detach the old CC from the tcpcb  */
1935                 cc_detach(tp);
1936                 /* Copy in our temp memory that was inited */
1937                 memcpy(&tp->t_ccv, &cc_mem, sizeof(struct cc_var));
1938                 /* Now attach the new, which takes a reference */
1939                 cc_attach(tp, algo);
1940                 /* Ok now are we where we have gotten past any conn_init? */
1941                 if (TCPS_HAVEESTABLISHED(tp->t_state) && (CC_ALGO(tp)->conn_init != NULL)) {
1942                         /* Yep run the connection init for the new CC */
1943                         CC_ALGO(tp)->conn_init(&tp->t_ccv);
1944                 }
1945         } else if (ptr)
1946                 free(ptr, M_CC_MEM);
1947         INP_WUNLOCK(inp);
1948         /* Now lets release our temp reference */
1949         CC_LIST_RLOCK();
1950         cc_release(algo);
1951         CC_LIST_RUNLOCK();
1952         return (error);
1953 }
1954
1955 int
1956 tcp_default_ctloutput(struct inpcb *inp, struct sockopt *sopt)
1957 {
1958         struct tcpcb *tp = intotcpcb(inp);
1959         int     error, opt, optval;
1960         u_int   ui;
1961         struct  tcp_info ti;
1962 #ifdef KERN_TLS
1963         struct tls_enable tls;
1964         struct socket *so = inp->inp_socket;
1965 #endif
1966         char    *pbuf, buf[TCP_LOG_ID_LEN];
1967 #ifdef STATS
1968         struct statsblob *sbp;
1969 #endif
1970         size_t  len;
1971
1972         INP_WLOCK_ASSERT(inp);
1973         KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1974             ("inp_flags == %x", inp->inp_flags));
1975         KASSERT(inp->inp_socket != NULL, ("inp_socket == NULL"));
1976
1977         switch (sopt->sopt_level) {
1978 #ifdef INET6
1979         case IPPROTO_IPV6:
1980                 MPASS(inp->inp_vflag & INP_IPV6PROTO);
1981                 switch (sopt->sopt_name) {
1982                 case IPV6_USE_MIN_MTU:
1983                         tcp6_use_min_mtu(tp);
1984                         /* FALLTHROUGH */
1985                 }
1986                 INP_WUNLOCK(inp);
1987                 return (0);
1988 #endif
1989 #ifdef INET
1990         case IPPROTO_IP:
1991                 INP_WUNLOCK(inp);
1992                 return (0);
1993 #endif
1994         }
1995
1996         /*
1997          * For TCP_CCALGOOPT forward the control to CC module, for both
1998          * SOPT_SET and SOPT_GET.
1999          */
2000         switch (sopt->sopt_name) {
2001         case TCP_CCALGOOPT:
2002                 INP_WUNLOCK(inp);
2003                 if (sopt->sopt_valsize > CC_ALGOOPT_LIMIT)
2004                         return (EINVAL);
2005                 pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO);
2006                 error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize,
2007                     sopt->sopt_valsize);
2008                 if (error) {
2009                         free(pbuf, M_TEMP);
2010                         return (error);
2011                 }
2012                 INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP));
2013                 if (CC_ALGO(tp)->ctl_output != NULL)
2014                         error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, sopt, pbuf);
2015                 else
2016                         error = ENOENT;
2017                 INP_WUNLOCK(inp);
2018                 if (error == 0 && sopt->sopt_dir == SOPT_GET)
2019                         error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize);
2020                 free(pbuf, M_TEMP);
2021                 return (error);
2022         }
2023
2024         switch (sopt->sopt_dir) {
2025         case SOPT_SET:
2026                 switch (sopt->sopt_name) {
2027 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
2028                 case TCP_MD5SIG:
2029                         INP_WUNLOCK(inp);
2030                         if (!TCPMD5_ENABLED())
2031                                 return (ENOPROTOOPT);
2032                         error = TCPMD5_PCBCTL(inp, sopt);
2033                         if (error)
2034                                 return (error);
2035                         INP_WLOCK_RECHECK(inp);
2036                         goto unlock_and_done;
2037 #endif /* IPSEC */
2038
2039                 case TCP_NODELAY:
2040                 case TCP_NOOPT:
2041                 case TCP_LRD:
2042                         INP_WUNLOCK(inp);
2043                         error = sooptcopyin(sopt, &optval, sizeof optval,
2044                             sizeof optval);
2045                         if (error)
2046                                 return (error);
2047
2048                         INP_WLOCK_RECHECK(inp);
2049                         switch (sopt->sopt_name) {
2050                         case TCP_NODELAY:
2051                                 opt = TF_NODELAY;
2052                                 break;
2053                         case TCP_NOOPT:
2054                                 opt = TF_NOOPT;
2055                                 break;
2056                         case TCP_LRD:
2057                                 opt = TF_LRD;
2058                                 break;
2059                         default:
2060                                 opt = 0; /* dead code to fool gcc */
2061                                 break;
2062                         }
2063
2064                         if (optval)
2065                                 tp->t_flags |= opt;
2066                         else
2067                                 tp->t_flags &= ~opt;
2068 unlock_and_done:
2069 #ifdef TCP_OFFLOAD
2070                         if (tp->t_flags & TF_TOE) {
2071                                 tcp_offload_ctloutput(tp, sopt->sopt_dir,
2072                                     sopt->sopt_name);
2073                         }
2074 #endif
2075                         INP_WUNLOCK(inp);
2076                         break;
2077
2078                 case TCP_NOPUSH:
2079                         INP_WUNLOCK(inp);
2080                         error = sooptcopyin(sopt, &optval, sizeof optval,
2081                             sizeof optval);
2082                         if (error)
2083                                 return (error);
2084
2085                         INP_WLOCK_RECHECK(inp);
2086                         if (optval)
2087                                 tp->t_flags |= TF_NOPUSH;
2088                         else if (tp->t_flags & TF_NOPUSH) {
2089                                 tp->t_flags &= ~TF_NOPUSH;
2090                                 if (TCPS_HAVEESTABLISHED(tp->t_state)) {
2091                                         struct epoch_tracker et;
2092
2093                                         NET_EPOCH_ENTER(et);
2094                                         error = tcp_output_nodrop(tp);
2095                                         NET_EPOCH_EXIT(et);
2096                                 }
2097                         }
2098                         goto unlock_and_done;
2099
2100                 case TCP_REMOTE_UDP_ENCAPS_PORT:
2101                         INP_WUNLOCK(inp);
2102                         error = sooptcopyin(sopt, &optval, sizeof optval,
2103                             sizeof optval);
2104                         if (error)
2105                                 return (error);
2106                         if ((optval < TCP_TUNNELING_PORT_MIN) ||
2107                             (optval > TCP_TUNNELING_PORT_MAX)) {
2108                                 /* Its got to be in range */
2109                                 return (EINVAL);
2110                         }
2111                         if ((V_tcp_udp_tunneling_port == 0) && (optval != 0)) {
2112                                 /* You have to have enabled a UDP tunneling port first */
2113                                 return (EINVAL);
2114                         }
2115                         INP_WLOCK_RECHECK(inp);
2116                         if (tp->t_state != TCPS_CLOSED) {
2117                                 /* You can't change after you are connected */
2118                                 error = EINVAL;
2119                         } else {
2120                                 /* Ok we are all good set the port */
2121                                 tp->t_port = htons(optval);
2122                         }
2123                         goto unlock_and_done;
2124
2125                 case TCP_MAXSEG:
2126                         INP_WUNLOCK(inp);
2127                         error = sooptcopyin(sopt, &optval, sizeof optval,
2128                             sizeof optval);
2129                         if (error)
2130                                 return (error);
2131
2132                         INP_WLOCK_RECHECK(inp);
2133                         if (optval > 0 && optval <= tp->t_maxseg &&
2134                             optval + 40 >= V_tcp_minmss)
2135                                 tp->t_maxseg = optval;
2136                         else
2137                                 error = EINVAL;
2138                         goto unlock_and_done;
2139
2140                 case TCP_INFO:
2141                         INP_WUNLOCK(inp);
2142                         error = EINVAL;
2143                         break;
2144
2145                 case TCP_STATS:
2146                         INP_WUNLOCK(inp);
2147 #ifdef STATS
2148                         error = sooptcopyin(sopt, &optval, sizeof optval,
2149                             sizeof optval);
2150                         if (error)
2151                                 return (error);
2152
2153                         if (optval > 0)
2154                                 sbp = stats_blob_alloc(
2155                                     V_tcp_perconn_stats_dflt_tpl, 0);
2156                         else
2157                                 sbp = NULL;
2158
2159                         INP_WLOCK_RECHECK(inp);
2160                         if ((tp->t_stats != NULL && sbp == NULL) ||
2161                             (tp->t_stats == NULL && sbp != NULL)) {
2162                                 struct statsblob *t = tp->t_stats;
2163                                 tp->t_stats = sbp;
2164                                 sbp = t;
2165                         }
2166                         INP_WUNLOCK(inp);
2167
2168                         stats_blob_destroy(sbp);
2169 #else
2170                         return (EOPNOTSUPP);
2171 #endif /* !STATS */
2172                         break;
2173
2174                 case TCP_CONGESTION:
2175                         error = tcp_set_cc_mod(inp, sopt);
2176                         break;
2177
2178                 case TCP_REUSPORT_LB_NUMA:
2179                         INP_WUNLOCK(inp);
2180                         error = sooptcopyin(sopt, &optval, sizeof(optval),
2181                             sizeof(optval));
2182                         INP_WLOCK_RECHECK(inp);
2183                         if (!error)
2184                                 error = in_pcblbgroup_numa(inp, optval);
2185                         INP_WUNLOCK(inp);
2186                         break;
2187
2188 #ifdef KERN_TLS
2189                 case TCP_TXTLS_ENABLE:
2190                         INP_WUNLOCK(inp);
2191                         error = copyin_tls_enable(sopt, &tls);
2192                         if (error)
2193                                 break;
2194                         error = ktls_enable_tx(so, &tls);
2195                         break;
2196                 case TCP_TXTLS_MODE:
2197                         INP_WUNLOCK(inp);
2198                         error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
2199                         if (error)
2200                                 return (error);
2201
2202                         INP_WLOCK_RECHECK(inp);
2203                         error = ktls_set_tx_mode(so, ui);
2204                         INP_WUNLOCK(inp);
2205                         break;
2206                 case TCP_RXTLS_ENABLE:
2207                         INP_WUNLOCK(inp);
2208                         error = sooptcopyin(sopt, &tls, sizeof(tls),
2209                             sizeof(tls));
2210                         if (error)
2211                                 break;
2212                         error = ktls_enable_rx(so, &tls);
2213                         break;
2214 #endif
2215                 case TCP_MAXUNACKTIME:
2216                 case TCP_KEEPIDLE:
2217                 case TCP_KEEPINTVL:
2218                 case TCP_KEEPINIT:
2219                         INP_WUNLOCK(inp);
2220                         error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
2221                         if (error)
2222                                 return (error);
2223
2224                         if (ui > (UINT_MAX / hz)) {
2225                                 error = EINVAL;
2226                                 break;
2227                         }
2228                         ui *= hz;
2229
2230                         INP_WLOCK_RECHECK(inp);
2231                         switch (sopt->sopt_name) {
2232                         case TCP_MAXUNACKTIME:
2233                                 tp->t_maxunacktime = ui;
2234                                 break;
2235
2236                         case TCP_KEEPIDLE:
2237                                 tp->t_keepidle = ui;
2238                                 /*
2239                                  * XXX: better check current remaining
2240                                  * timeout and "merge" it with new value.
2241                                  */
2242                                 if ((tp->t_state > TCPS_LISTEN) &&
2243                                     (tp->t_state <= TCPS_CLOSING))
2244                                         tcp_timer_activate(tp, TT_KEEP,
2245                                             TP_KEEPIDLE(tp));
2246                                 break;
2247                         case TCP_KEEPINTVL:
2248                                 tp->t_keepintvl = ui;
2249                                 if ((tp->t_state == TCPS_FIN_WAIT_2) &&
2250                                     (TP_MAXIDLE(tp) > 0))
2251                                         tcp_timer_activate(tp, TT_2MSL,
2252                                             TP_MAXIDLE(tp));
2253                                 break;
2254                         case TCP_KEEPINIT:
2255                                 tp->t_keepinit = ui;
2256                                 if (tp->t_state == TCPS_SYN_RECEIVED ||
2257                                     tp->t_state == TCPS_SYN_SENT)
2258                                         tcp_timer_activate(tp, TT_KEEP,
2259                                             TP_KEEPINIT(tp));
2260                                 break;
2261                         }
2262                         goto unlock_and_done;
2263
2264                 case TCP_KEEPCNT:
2265                         INP_WUNLOCK(inp);
2266                         error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
2267                         if (error)
2268                                 return (error);
2269
2270                         INP_WLOCK_RECHECK(inp);
2271                         tp->t_keepcnt = ui;
2272                         if ((tp->t_state == TCPS_FIN_WAIT_2) &&
2273                             (TP_MAXIDLE(tp) > 0))
2274                                 tcp_timer_activate(tp, TT_2MSL,
2275                                     TP_MAXIDLE(tp));
2276                         goto unlock_and_done;
2277
2278 #ifdef TCPPCAP
2279                 case TCP_PCAP_OUT:
2280                 case TCP_PCAP_IN:
2281                         INP_WUNLOCK(inp);
2282                         error = sooptcopyin(sopt, &optval, sizeof optval,
2283                             sizeof optval);
2284                         if (error)
2285                                 return (error);
2286
2287                         INP_WLOCK_RECHECK(inp);
2288                         if (optval >= 0)
2289                                 tcp_pcap_set_sock_max(TCP_PCAP_OUT ?
2290                                         &(tp->t_outpkts) : &(tp->t_inpkts),
2291                                         optval);
2292                         else
2293                                 error = EINVAL;
2294                         goto unlock_and_done;
2295 #endif
2296
2297                 case TCP_FASTOPEN: {
2298                         struct tcp_fastopen tfo_optval;
2299
2300                         INP_WUNLOCK(inp);
2301                         if (!V_tcp_fastopen_client_enable &&
2302                             !V_tcp_fastopen_server_enable)
2303                                 return (EPERM);
2304
2305                         error = sooptcopyin(sopt, &tfo_optval,
2306                                     sizeof(tfo_optval), sizeof(int));
2307                         if (error)
2308                                 return (error);
2309
2310                         INP_WLOCK_RECHECK(inp);
2311                         if ((tp->t_state != TCPS_CLOSED) &&
2312                             (tp->t_state != TCPS_LISTEN)) {
2313                                 error = EINVAL;
2314                                 goto unlock_and_done;
2315                         }
2316                         if (tfo_optval.enable) {
2317                                 if (tp->t_state == TCPS_LISTEN) {
2318                                         if (!V_tcp_fastopen_server_enable) {
2319                                                 error = EPERM;
2320                                                 goto unlock_and_done;
2321                                         }
2322
2323                                         if (tp->t_tfo_pending == NULL)
2324                                                 tp->t_tfo_pending =
2325                                                     tcp_fastopen_alloc_counter();
2326                                 } else {
2327                                         /*
2328                                          * If a pre-shared key was provided,
2329                                          * stash it in the client cookie
2330                                          * field of the tcpcb for use during
2331                                          * connect.
2332                                          */
2333                                         if (sopt->sopt_valsize ==
2334                                             sizeof(tfo_optval)) {
2335                                                 memcpy(tp->t_tfo_cookie.client,
2336                                                        tfo_optval.psk,
2337                                                        TCP_FASTOPEN_PSK_LEN);
2338                                                 tp->t_tfo_client_cookie_len =
2339                                                     TCP_FASTOPEN_PSK_LEN;
2340                                         }
2341                                 }
2342                                 tp->t_flags |= TF_FASTOPEN;
2343                         } else
2344                                 tp->t_flags &= ~TF_FASTOPEN;
2345                         goto unlock_and_done;
2346                 }
2347
2348 #ifdef TCP_BLACKBOX
2349                 case TCP_LOG:
2350                         INP_WUNLOCK(inp);
2351                         error = sooptcopyin(sopt, &optval, sizeof optval,
2352                             sizeof optval);
2353                         if (error)
2354                                 return (error);
2355
2356                         INP_WLOCK_RECHECK(inp);
2357                         error = tcp_log_state_change(tp, optval);
2358                         goto unlock_and_done;
2359
2360                 case TCP_LOGBUF:
2361                         INP_WUNLOCK(inp);
2362                         error = EINVAL;
2363                         break;
2364
2365                 case TCP_LOGID:
2366                         INP_WUNLOCK(inp);
2367                         error = sooptcopyin(sopt, buf, TCP_LOG_ID_LEN - 1, 0);
2368                         if (error)
2369                                 break;
2370                         buf[sopt->sopt_valsize] = '\0';
2371                         INP_WLOCK_RECHECK(inp);
2372                         error = tcp_log_set_id(tp, buf);
2373                         /* tcp_log_set_id() unlocks the INP. */
2374                         break;
2375
2376                 case TCP_LOGDUMP:
2377                 case TCP_LOGDUMPID:
2378                         INP_WUNLOCK(inp);
2379                         error =
2380                             sooptcopyin(sopt, buf, TCP_LOG_REASON_LEN - 1, 0);
2381                         if (error)
2382                                 break;
2383                         buf[sopt->sopt_valsize] = '\0';
2384                         INP_WLOCK_RECHECK(inp);
2385                         if (sopt->sopt_name == TCP_LOGDUMP) {
2386                                 error = tcp_log_dump_tp_logbuf(tp, buf,
2387                                     M_WAITOK, true);
2388                                 INP_WUNLOCK(inp);
2389                         } else {
2390                                 tcp_log_dump_tp_bucket_logbufs(tp, buf);
2391                                 /*
2392                                  * tcp_log_dump_tp_bucket_logbufs() drops the
2393                                  * INP lock.
2394                                  */
2395                         }
2396                         break;
2397 #endif
2398
2399                 default:
2400                         INP_WUNLOCK(inp);
2401                         error = ENOPROTOOPT;
2402                         break;
2403                 }
2404                 break;
2405
2406         case SOPT_GET:
2407                 tp = intotcpcb(inp);
2408                 switch (sopt->sopt_name) {
2409 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
2410                 case TCP_MD5SIG:
2411                         INP_WUNLOCK(inp);
2412                         if (!TCPMD5_ENABLED())
2413                                 return (ENOPROTOOPT);
2414                         error = TCPMD5_PCBCTL(inp, sopt);
2415                         break;
2416 #endif
2417
2418                 case TCP_NODELAY:
2419                         optval = tp->t_flags & TF_NODELAY;
2420                         INP_WUNLOCK(inp);
2421                         error = sooptcopyout(sopt, &optval, sizeof optval);
2422                         break;
2423                 case TCP_MAXSEG:
2424                         optval = tp->t_maxseg;
2425                         INP_WUNLOCK(inp);
2426                         error = sooptcopyout(sopt, &optval, sizeof optval);
2427                         break;
2428                 case TCP_REMOTE_UDP_ENCAPS_PORT:
2429                         optval = ntohs(tp->t_port);
2430                         INP_WUNLOCK(inp);
2431                         error = sooptcopyout(sopt, &optval, sizeof optval);
2432                         break;
2433                 case TCP_NOOPT:
2434                         optval = tp->t_flags & TF_NOOPT;
2435                         INP_WUNLOCK(inp);
2436                         error = sooptcopyout(sopt, &optval, sizeof optval);
2437                         break;
2438                 case TCP_NOPUSH:
2439                         optval = tp->t_flags & TF_NOPUSH;
2440                         INP_WUNLOCK(inp);
2441                         error = sooptcopyout(sopt, &optval, sizeof optval);
2442                         break;
2443                 case TCP_INFO:
2444                         tcp_fill_info(tp, &ti);
2445                         INP_WUNLOCK(inp);
2446                         error = sooptcopyout(sopt, &ti, sizeof ti);
2447                         break;
2448                 case TCP_STATS:
2449                         {
2450 #ifdef STATS
2451                         int nheld;
2452                         TYPEOF_MEMBER(struct statsblob, flags) sbflags = 0;
2453
2454                         error = 0;
2455                         socklen_t outsbsz = sopt->sopt_valsize;
2456                         if (tp->t_stats == NULL)
2457                                 error = ENOENT;
2458                         else if (outsbsz >= tp->t_stats->cursz)
2459                                 outsbsz = tp->t_stats->cursz;
2460                         else if (outsbsz >= sizeof(struct statsblob))
2461                                 outsbsz = sizeof(struct statsblob);
2462                         else
2463                                 error = EINVAL;
2464                         INP_WUNLOCK(inp);
2465                         if (error)
2466                                 break;
2467
2468                         sbp = sopt->sopt_val;
2469                         nheld = atop(round_page(((vm_offset_t)sbp) +
2470                             (vm_size_t)outsbsz) - trunc_page((vm_offset_t)sbp));
2471                         vm_page_t ma[nheld];
2472                         if (vm_fault_quick_hold_pages(
2473                             &curproc->p_vmspace->vm_map, (vm_offset_t)sbp,
2474                             outsbsz, VM_PROT_READ | VM_PROT_WRITE, ma,
2475                             nheld) < 0) {
2476                                 error = EFAULT;
2477                                 break;
2478                         }
2479
2480                         if ((error = copyin_nofault(&(sbp->flags), &sbflags,
2481                             SIZEOF_MEMBER(struct statsblob, flags))))
2482                                 goto unhold;
2483
2484                         INP_WLOCK_RECHECK(inp);
2485                         error = stats_blob_snapshot(&sbp, outsbsz, tp->t_stats,
2486                             sbflags | SB_CLONE_USRDSTNOFAULT);
2487                         INP_WUNLOCK(inp);
2488                         sopt->sopt_valsize = outsbsz;
2489 unhold:
2490                         vm_page_unhold_pages(ma, nheld);
2491 #else
2492                         INP_WUNLOCK(inp);
2493                         error = EOPNOTSUPP;
2494 #endif /* !STATS */
2495                         break;
2496                         }
2497                 case TCP_CONGESTION:
2498                         len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
2499                         INP_WUNLOCK(inp);
2500                         error = sooptcopyout(sopt, buf, len + 1);
2501                         break;
2502                 case TCP_MAXUNACKTIME:
2503                 case TCP_KEEPIDLE:
2504                 case TCP_KEEPINTVL:
2505                 case TCP_KEEPINIT:
2506                 case TCP_KEEPCNT:
2507                         switch (sopt->sopt_name) {
2508                         case TCP_MAXUNACKTIME:
2509                                 ui = TP_MAXUNACKTIME(tp) / hz;
2510                                 break;
2511                         case TCP_KEEPIDLE:
2512                                 ui = TP_KEEPIDLE(tp) / hz;
2513                                 break;
2514                         case TCP_KEEPINTVL:
2515                                 ui = TP_KEEPINTVL(tp) / hz;
2516                                 break;
2517                         case TCP_KEEPINIT:
2518                                 ui = TP_KEEPINIT(tp) / hz;
2519                                 break;
2520                         case TCP_KEEPCNT:
2521                                 ui = TP_KEEPCNT(tp);
2522                                 break;
2523                         }
2524                         INP_WUNLOCK(inp);
2525                         error = sooptcopyout(sopt, &ui, sizeof(ui));
2526                         break;
2527 #ifdef TCPPCAP
2528                 case TCP_PCAP_OUT:
2529                 case TCP_PCAP_IN:
2530                         optval = tcp_pcap_get_sock_max(TCP_PCAP_OUT ?
2531                                         &(tp->t_outpkts) : &(tp->t_inpkts));
2532                         INP_WUNLOCK(inp);
2533                         error = sooptcopyout(sopt, &optval, sizeof optval);
2534                         break;
2535 #endif
2536                 case TCP_FASTOPEN:
2537                         optval = tp->t_flags & TF_FASTOPEN;
2538                         INP_WUNLOCK(inp);
2539                         error = sooptcopyout(sopt, &optval, sizeof optval);
2540                         break;
2541 #ifdef TCP_BLACKBOX
2542                 case TCP_LOG:
2543                         optval = tp->t_logstate;
2544                         INP_WUNLOCK(inp);
2545                         error = sooptcopyout(sopt, &optval, sizeof(optval));
2546                         break;
2547                 case TCP_LOGBUF:
2548                         /* tcp_log_getlogbuf() does INP_WUNLOCK(inp) */
2549                         error = tcp_log_getlogbuf(sopt, tp);
2550                         break;
2551                 case TCP_LOGID:
2552                         len = tcp_log_get_id(tp, buf);
2553                         INP_WUNLOCK(inp);
2554                         error = sooptcopyout(sopt, buf, len + 1);
2555                         break;
2556                 case TCP_LOGDUMP:
2557                 case TCP_LOGDUMPID:
2558                         INP_WUNLOCK(inp);
2559                         error = EINVAL;
2560                         break;
2561 #endif
2562 #ifdef KERN_TLS
2563                 case TCP_TXTLS_MODE:
2564                         error = ktls_get_tx_mode(so, &optval);
2565                         INP_WUNLOCK(inp);
2566                         if (error == 0)
2567                                 error = sooptcopyout(sopt, &optval,
2568                                     sizeof(optval));
2569                         break;
2570                 case TCP_RXTLS_MODE:
2571                         error = ktls_get_rx_mode(so, &optval);
2572                         INP_WUNLOCK(inp);
2573                         if (error == 0)
2574                                 error = sooptcopyout(sopt, &optval,
2575                                     sizeof(optval));
2576                         break;
2577 #endif
2578                 case TCP_LRD:
2579                         optval = tp->t_flags & TF_LRD;
2580                         INP_WUNLOCK(inp);
2581                         error = sooptcopyout(sopt, &optval, sizeof optval);
2582                         break;
2583                 default:
2584                         INP_WUNLOCK(inp);
2585                         error = ENOPROTOOPT;
2586                         break;
2587                 }
2588                 break;
2589         }
2590         return (error);
2591 }
2592 #undef INP_WLOCK_RECHECK
2593 #undef INP_WLOCK_RECHECK_CLEANUP
2594
2595 /*
2596  * Initiate (or continue) disconnect.
2597  * If embryonic state, just send reset (once).
2598  * If in ``let data drain'' option and linger null, just drop.
2599  * Otherwise (hard), mark socket disconnecting and drop
2600  * current input data; switch states based on user close, and
2601  * send segment to peer (with FIN).
2602  */
2603 static void
2604 tcp_disconnect(struct tcpcb *tp)
2605 {
2606         struct inpcb *inp = tptoinpcb(tp);
2607         struct socket *so = tptosocket(tp);
2608
2609         NET_EPOCH_ASSERT();
2610         INP_WLOCK_ASSERT(inp);
2611
2612         /*
2613          * Neither tcp_close() nor tcp_drop() should return NULL, as the
2614          * socket is still open.
2615          */
2616         if (tp->t_state < TCPS_ESTABLISHED &&
2617             !(tp->t_state > TCPS_LISTEN && IS_FASTOPEN(tp->t_flags))) {
2618                 tp = tcp_close(tp);
2619                 KASSERT(tp != NULL,
2620                     ("tcp_disconnect: tcp_close() returned NULL"));
2621         } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
2622                 tp = tcp_drop(tp, 0);
2623                 KASSERT(tp != NULL,
2624                     ("tcp_disconnect: tcp_drop() returned NULL"));
2625         } else {
2626                 soisdisconnecting(so);
2627                 sbflush(&so->so_rcv);
2628                 tcp_usrclosed(tp);
2629                 if (!(inp->inp_flags & INP_DROPPED))
2630                         /* Ignore stack's drop request, we already at it. */
2631                         (void)tcp_output_nodrop(tp);
2632         }
2633 }
2634
2635 /*
2636  * User issued close, and wish to trail through shutdown states:
2637  * if never received SYN, just forget it.  If got a SYN from peer,
2638  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
2639  * If already got a FIN from peer, then almost done; go to LAST_ACK
2640  * state.  In all other cases, have already sent FIN to peer (e.g.
2641  * after PRU_SHUTDOWN), and just have to play tedious game waiting
2642  * for peer to send FIN or not respond to keep-alives, etc.
2643  * We can let the user exit from the close as soon as the FIN is acked.
2644  */
2645 static void
2646 tcp_usrclosed(struct tcpcb *tp)
2647 {
2648
2649         NET_EPOCH_ASSERT();
2650         INP_WLOCK_ASSERT(tptoinpcb(tp));
2651
2652         switch (tp->t_state) {
2653         case TCPS_LISTEN:
2654 #ifdef TCP_OFFLOAD
2655                 tcp_offload_listen_stop(tp);
2656 #endif
2657                 tcp_state_change(tp, TCPS_CLOSED);
2658                 /* FALLTHROUGH */
2659         case TCPS_CLOSED:
2660                 tp = tcp_close(tp);
2661                 /*
2662                  * tcp_close() should never return NULL here as the socket is
2663                  * still open.
2664                  */
2665                 KASSERT(tp != NULL,
2666                     ("tcp_usrclosed: tcp_close() returned NULL"));
2667                 break;
2668
2669         case TCPS_SYN_SENT:
2670         case TCPS_SYN_RECEIVED:
2671                 tp->t_flags |= TF_NEEDFIN;
2672                 break;
2673
2674         case TCPS_ESTABLISHED:
2675                 tcp_state_change(tp, TCPS_FIN_WAIT_1);
2676                 break;
2677
2678         case TCPS_CLOSE_WAIT:
2679                 tcp_state_change(tp, TCPS_LAST_ACK);
2680                 break;
2681         }
2682         if (tp->t_acktime == 0)
2683                 tp->t_acktime = ticks;
2684         if (tp->t_state >= TCPS_FIN_WAIT_2) {
2685                 soisdisconnected(tptosocket(tp));
2686                 /* Prevent the connection hanging in FIN_WAIT_2 forever. */
2687                 if (tp->t_state == TCPS_FIN_WAIT_2) {
2688                         int timeout;
2689
2690                         timeout = (tcp_fast_finwait2_recycle) ?
2691                             tcp_finwait2_timeout : TP_MAXIDLE(tp);
2692                         tcp_timer_activate(tp, TT_2MSL, timeout);
2693                 }
2694         }
2695 }
2696
2697 #ifdef DDB
/*
 * Emit `indent' space characters on the debugger console.  A zero or
 * negative count prints nothing.
 */
static void
db_print_indent(int indent)
{

        for (; indent > 0; indent--)
                db_printf(" ");
}
2706
2707 static void
2708 db_print_tstate(int t_state)
2709 {
2710
2711         switch (t_state) {
2712         case TCPS_CLOSED:
2713                 db_printf("TCPS_CLOSED");
2714                 return;
2715
2716         case TCPS_LISTEN:
2717                 db_printf("TCPS_LISTEN");
2718                 return;
2719
2720         case TCPS_SYN_SENT:
2721                 db_printf("TCPS_SYN_SENT");
2722                 return;
2723
2724         case TCPS_SYN_RECEIVED:
2725                 db_printf("TCPS_SYN_RECEIVED");
2726                 return;
2727
2728         case TCPS_ESTABLISHED:
2729                 db_printf("TCPS_ESTABLISHED");
2730                 return;
2731
2732         case TCPS_CLOSE_WAIT:
2733                 db_printf("TCPS_CLOSE_WAIT");
2734                 return;
2735
2736         case TCPS_FIN_WAIT_1:
2737                 db_printf("TCPS_FIN_WAIT_1");
2738                 return;
2739
2740         case TCPS_CLOSING:
2741                 db_printf("TCPS_CLOSING");
2742                 return;
2743
2744         case TCPS_LAST_ACK:
2745                 db_printf("TCPS_LAST_ACK");
2746                 return;
2747
2748         case TCPS_FIN_WAIT_2:
2749                 db_printf("TCPS_FIN_WAIT_2");
2750                 return;
2751
2752         case TCPS_TIME_WAIT:
2753                 db_printf("TCPS_TIME_WAIT");
2754                 return;
2755
2756         default:
2757                 db_printf("unknown");
2758                 return;
2759         }
2760 }
2761
2762 static void
2763 db_print_tflags(u_int t_flags)
2764 {
2765         int comma;
2766
2767         comma = 0;
2768         if (t_flags & TF_ACKNOW) {
2769                 db_printf("%sTF_ACKNOW", comma ? ", " : "");
2770                 comma = 1;
2771         }
2772         if (t_flags & TF_DELACK) {
2773                 db_printf("%sTF_DELACK", comma ? ", " : "");
2774                 comma = 1;
2775         }
2776         if (t_flags & TF_NODELAY) {
2777                 db_printf("%sTF_NODELAY", comma ? ", " : "");
2778                 comma = 1;
2779         }
2780         if (t_flags & TF_NOOPT) {
2781                 db_printf("%sTF_NOOPT", comma ? ", " : "");
2782                 comma = 1;
2783         }
2784         if (t_flags & TF_SENTFIN) {
2785                 db_printf("%sTF_SENTFIN", comma ? ", " : "");
2786                 comma = 1;
2787         }
2788         if (t_flags & TF_REQ_SCALE) {
2789                 db_printf("%sTF_REQ_SCALE", comma ? ", " : "");
2790                 comma = 1;
2791         }
2792         if (t_flags & TF_RCVD_SCALE) {
2793                 db_printf("%sTF_RECVD_SCALE", comma ? ", " : "");
2794                 comma = 1;
2795         }
2796         if (t_flags & TF_REQ_TSTMP) {
2797                 db_printf("%sTF_REQ_TSTMP", comma ? ", " : "");
2798                 comma = 1;
2799         }
2800         if (t_flags & TF_RCVD_TSTMP) {
2801                 db_printf("%sTF_RCVD_TSTMP", comma ? ", " : "");
2802                 comma = 1;
2803         }
2804         if (t_flags & TF_SACK_PERMIT) {
2805                 db_printf("%sTF_SACK_PERMIT", comma ? ", " : "");
2806                 comma = 1;
2807         }
2808         if (t_flags & TF_NEEDSYN) {
2809                 db_printf("%sTF_NEEDSYN", comma ? ", " : "");
2810                 comma = 1;
2811         }
2812         if (t_flags & TF_NEEDFIN) {
2813                 db_printf("%sTF_NEEDFIN", comma ? ", " : "");
2814                 comma = 1;
2815         }
2816         if (t_flags & TF_NOPUSH) {
2817                 db_printf("%sTF_NOPUSH", comma ? ", " : "");
2818                 comma = 1;
2819         }
2820         if (t_flags & TF_PREVVALID) {
2821                 db_printf("%sTF_PREVVALID", comma ? ", " : "");
2822                 comma = 1;
2823         }
2824         if (t_flags & TF_MORETOCOME) {
2825                 db_printf("%sTF_MORETOCOME", comma ? ", " : "");
2826                 comma = 1;
2827         }
2828         if (t_flags & TF_SONOTCONN) {
2829                 db_printf("%sTF_SONOTCONN", comma ? ", " : "");
2830                 comma = 1;
2831         }
2832         if (t_flags & TF_LASTIDLE) {
2833                 db_printf("%sTF_LASTIDLE", comma ? ", " : "");
2834                 comma = 1;
2835         }
2836         if (t_flags & TF_RXWIN0SENT) {
2837                 db_printf("%sTF_RXWIN0SENT", comma ? ", " : "");
2838                 comma = 1;
2839         }
2840         if (t_flags & TF_FASTRECOVERY) {
2841                 db_printf("%sTF_FASTRECOVERY", comma ? ", " : "");
2842                 comma = 1;
2843         }
2844         if (t_flags & TF_CONGRECOVERY) {
2845                 db_printf("%sTF_CONGRECOVERY", comma ? ", " : "");
2846                 comma = 1;
2847         }
2848         if (t_flags & TF_WASFRECOVERY) {
2849                 db_printf("%sTF_WASFRECOVERY", comma ? ", " : "");
2850                 comma = 1;
2851         }
2852         if (t_flags & TF_WASCRECOVERY) {
2853                 db_printf("%sTF_WASCRECOVERY", comma ? ", " : "");
2854                 comma = 1;
2855         }
2856         if (t_flags & TF_SIGNATURE) {
2857                 db_printf("%sTF_SIGNATURE", comma ? ", " : "");
2858                 comma = 1;
2859         }
2860         if (t_flags & TF_FORCEDATA) {
2861                 db_printf("%sTF_FORCEDATA", comma ? ", " : "");
2862                 comma = 1;
2863         }
2864         if (t_flags & TF_TSO) {
2865                 db_printf("%sTF_TSO", comma ? ", " : "");
2866                 comma = 1;
2867         }
2868         if (t_flags & TF_FASTOPEN) {
2869                 db_printf("%sTF_FASTOPEN", comma ? ", " : "");
2870                 comma = 1;
2871         }
2872 }
2873
2874 static void
2875 db_print_tflags2(u_int t_flags2)
2876 {
2877         int comma;
2878
2879         comma = 0;
2880         if (t_flags2 & TF2_PLPMTU_BLACKHOLE) {
2881                 db_printf("%sTF2_PLPMTU_BLACKHOLE", comma ? ", " : "");
2882                 comma = 1;
2883         }
2884         if (t_flags2 & TF2_PLPMTU_PMTUD) {
2885                 db_printf("%sTF2_PLPMTU_PMTUD", comma ? ", " : "");
2886                 comma = 1;
2887         }
2888         if (t_flags2 & TF2_PLPMTU_MAXSEGSNT) {
2889                 db_printf("%sTF2_PLPMTU_MAXSEGSNT", comma ? ", " : "");
2890                 comma = 1;
2891         }
2892         if (t_flags2 & TF2_LOG_AUTO) {
2893                 db_printf("%sTF2_LOG_AUTO", comma ? ", " : "");
2894                 comma = 1;
2895         }
2896         if (t_flags2 & TF2_DROP_AF_DATA) {
2897                 db_printf("%sTF2_DROP_AF_DATA", comma ? ", " : "");
2898                 comma = 1;
2899         }
2900         if (t_flags2 & TF2_ECN_PERMIT) {
2901                 db_printf("%sTF2_ECN_PERMIT", comma ? ", " : "");
2902                 comma = 1;
2903         }
2904         if (t_flags2 & TF2_ECN_SND_CWR) {
2905                 db_printf("%sTF2_ECN_SND_CWR", comma ? ", " : "");
2906                 comma = 1;
2907         }
2908         if (t_flags2 & TF2_ECN_SND_ECE) {
2909                 db_printf("%sTF2_ECN_SND_ECE", comma ? ", " : "");
2910                 comma = 1;
2911         }
2912         if (t_flags2 & TF2_ACE_PERMIT) {
2913                 db_printf("%sTF2_ACE_PERMIT", comma ? ", " : "");
2914                 comma = 1;
2915         }
2916         if (t_flags2 & TF2_FBYTES_COMPLETE) {
2917                 db_printf("%sTF2_FBYTES_COMPLETE", comma ? ", " : "");
2918                 comma = 1;
2919         }
2920 }
2921
2922 static void
2923 db_print_toobflags(char t_oobflags)
2924 {
2925         int comma;
2926
2927         comma = 0;
2928         if (t_oobflags & TCPOOB_HAVEDATA) {
2929                 db_printf("%sTCPOOB_HAVEDATA", comma ? ", " : "");
2930                 comma = 1;
2931         }
2932         if (t_oobflags & TCPOOB_HADDATA) {
2933                 db_printf("%sTCPOOB_HADDATA", comma ? ", " : "");
2934                 comma = 1;
2935         }
2936 }
2937
2938 static void
2939 db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
2940 {
2941
2942         db_print_indent(indent);
2943         db_printf("%s at %p\n", name, tp);
2944
2945         indent += 2;
2946
2947         db_print_indent(indent);
2948         db_printf("t_segq first: %p   t_segqlen: %d   t_dupacks: %d\n",
2949            TAILQ_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
2950
2951         db_print_indent(indent);
2952         db_printf("t_callout: %p   t_timers: %p\n",
2953             &tp->t_callout, &tp->t_timers);
2954
2955         db_print_indent(indent);
2956         db_printf("t_state: %d (", tp->t_state);
2957         db_print_tstate(tp->t_state);
2958         db_printf(")\n");
2959
2960         db_print_indent(indent);
2961         db_printf("t_flags: 0x%x (", tp->t_flags);
2962         db_print_tflags(tp->t_flags);
2963         db_printf(")\n");
2964
2965         db_print_indent(indent);
2966         db_printf("t_flags2: 0x%x (", tp->t_flags2);
2967         db_print_tflags2(tp->t_flags2);
2968         db_printf(")\n");
2969
2970         db_print_indent(indent);
2971         db_printf("snd_una: 0x%08x   snd_max: 0x%08x   snd_nxt: 0x%08x\n",
2972             tp->snd_una, tp->snd_max, tp->snd_nxt);
2973
2974         db_print_indent(indent);
2975         db_printf("snd_up: 0x%08x   snd_wl1: 0x%08x   snd_wl2: 0x%08x\n",
2976            tp->snd_up, tp->snd_wl1, tp->snd_wl2);
2977
2978         db_print_indent(indent);
2979         db_printf("iss: 0x%08x   irs: 0x%08x   rcv_nxt: 0x%08x\n",
2980             tp->iss, tp->irs, tp->rcv_nxt);
2981
2982         db_print_indent(indent);
2983         db_printf("rcv_adv: 0x%08x   rcv_wnd: %u   rcv_up: 0x%08x\n",
2984             tp->rcv_adv, tp->rcv_wnd, tp->rcv_up);
2985
2986         db_print_indent(indent);
2987         db_printf("snd_wnd: %u   snd_cwnd: %u\n",
2988            tp->snd_wnd, tp->snd_cwnd);
2989
2990         db_print_indent(indent);
2991         db_printf("snd_ssthresh: %u   snd_recover: "
2992             "0x%08x\n", tp->snd_ssthresh, tp->snd_recover);
2993
2994         db_print_indent(indent);
2995         db_printf("t_rcvtime: %u   t_startime: %u\n",
2996             tp->t_rcvtime, tp->t_starttime);
2997
2998         db_print_indent(indent);
2999         db_printf("t_rttime: %u   t_rtsq: 0x%08x\n",
3000             tp->t_rtttime, tp->t_rtseq);
3001
3002         db_print_indent(indent);
3003         db_printf("t_rxtcur: %d   t_maxseg: %u   t_srtt: %d\n",
3004             tp->t_rxtcur, tp->t_maxseg, tp->t_srtt);
3005
3006         db_print_indent(indent);
3007         db_printf("t_rttvar: %d   t_rxtshift: %d   t_rttmin: %u\n",
3008             tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin);
3009
3010         db_print_indent(indent);
3011         db_printf("t_rttupdated: %u   max_sndwnd: %u   t_softerror: %d\n",
3012             tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror);
3013
3014         db_print_indent(indent);
3015         db_printf("t_oobflags: 0x%x (", tp->t_oobflags);
3016         db_print_toobflags(tp->t_oobflags);
3017         db_printf(")   t_iobc: 0x%02x\n", tp->t_iobc);
3018
3019         db_print_indent(indent);
3020         db_printf("snd_scale: %u   rcv_scale: %u   request_r_scale: %u\n",
3021             tp->snd_scale, tp->rcv_scale, tp->request_r_scale);
3022
3023         db_print_indent(indent);
3024         db_printf("ts_recent: %u   ts_recent_age: %u\n",
3025             tp->ts_recent, tp->ts_recent_age);
3026
3027         db_print_indent(indent);
3028         db_printf("ts_offset: %u   last_ack_sent: 0x%08x   snd_cwnd_prev: "
3029             "%u\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev);
3030
3031         db_print_indent(indent);
3032         db_printf("snd_ssthresh_prev: %u   snd_recover_prev: 0x%08x   "
3033             "t_badrxtwin: %u\n", tp->snd_ssthresh_prev,
3034             tp->snd_recover_prev, tp->t_badrxtwin);
3035
3036         db_print_indent(indent);
3037         db_printf("snd_numholes: %d  snd_holes first: %p\n",
3038             tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes));
3039
3040         db_print_indent(indent);
3041         db_printf("snd_fack: 0x%08x   rcv_numsacks: %d\n",
3042             tp->snd_fack, tp->rcv_numsacks);
3043
3044         /* Skip sackblks, sackhint. */
3045
3046         db_print_indent(indent);
3047         db_printf("t_rttlow: %d   rfbuf_ts: %u   rfbuf_cnt: %d\n",
3048             tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt);
3049 }
3050
3051 DB_SHOW_COMMAND(tcpcb, db_show_tcpcb)
3052 {
3053         struct tcpcb *tp;
3054
3055         if (!have_addr) {
3056                 db_printf("usage: show tcpcb <addr>\n");
3057                 return;
3058         }
3059         tp = (struct tcpcb *)addr;
3060
3061         db_print_tcpcb(tp, "tcpcb", 0);
3062 }
3063 #endif