]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/in_pcb.c
unix: make unp_connect2() void
[FreeBSD/FreeBSD.git] / sys / netinet / in_pcb.c
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1991, 1993, 1995
5  *      The Regents of the University of California.
6  * Copyright (c) 2007-2009 Robert N. M. Watson
7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
8  * All rights reserved.
9  *
10  * Portions of this software were developed by Robert N. M. Watson under
11  * contract to Juniper Networks, Inc.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *      @(#)in_pcb.c    8.4 (Berkeley) 5/24/95
38  */
39
40 #include <sys/cdefs.h>
41 __FBSDID("$FreeBSD$");
42
43 #include "opt_ddb.h"
44 #include "opt_ipsec.h"
45 #include "opt_inet.h"
46 #include "opt_inet6.h"
47 #include "opt_ratelimit.h"
48 #include "opt_route.h"
49 #include "opt_rss.h"
50
51 #include <sys/param.h>
52 #include <sys/hash.h>
53 #include <sys/systm.h>
54 #include <sys/libkern.h>
55 #include <sys/lock.h>
56 #include <sys/malloc.h>
57 #include <sys/mbuf.h>
58 #include <sys/callout.h>
59 #include <sys/eventhandler.h>
60 #include <sys/domain.h>
61 #include <sys/protosw.h>
62 #include <sys/smp.h>
63 #include <sys/socket.h>
64 #include <sys/socketvar.h>
65 #include <sys/sockio.h>
66 #include <sys/priv.h>
67 #include <sys/proc.h>
68 #include <sys/refcount.h>
69 #include <sys/jail.h>
70 #include <sys/kernel.h>
71 #include <sys/sysctl.h>
72
73 #ifdef DDB
74 #include <ddb/ddb.h>
75 #endif
76
77 #include <vm/uma.h>
78 #include <vm/vm.h>
79
80 #include <net/if.h>
81 #include <net/if_var.h>
82 #include <net/if_types.h>
83 #include <net/if_llatbl.h>
84 #include <net/route.h>
85 #include <net/rss_config.h>
86 #include <net/vnet.h>
87
88 #if defined(INET) || defined(INET6)
89 #include <netinet/in.h>
90 #include <netinet/in_pcb.h>
91 #include <netinet/in_pcb_var.h>
92 #ifdef INET
93 #include <netinet/in_var.h>
94 #include <netinet/in_fib.h>
95 #endif
96 #include <netinet/ip_var.h>
97 #include <netinet/tcp_var.h>
98 #ifdef TCPHPTS
99 #include <netinet/tcp_hpts.h>
100 #endif
101 #include <netinet/udp.h>
102 #include <netinet/udp_var.h>
103 #ifdef INET6
104 #include <netinet/ip6.h>
105 #include <netinet6/in6_pcb.h>
106 #include <netinet6/in6_var.h>
107 #include <netinet6/ip6_var.h>
108 #endif /* INET6 */
109 #include <net/route/nhop.h>
110 #endif
111
112 #include <netipsec/ipsec_support.h>
113
114 #include <security/mac/mac_framework.h>
115
116 #define INPCBLBGROUP_SIZMIN     8
117 #define INPCBLBGROUP_SIZMAX     256
118 #define INP_FREED       0x00000200      /* See in_pcb.h. */
119
120 static struct callout   ipport_tick_callout;
121
122 /*
123  * These configure the range of local port addresses assigned to
124  * "unspecified" outgoing connections/packets/whatever.
125  */
126 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;    /* 1023 */
127 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;    /* 600 */
128 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;     /* 10000 */
129 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;       /* 65535 */
130 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;      /* 49152 */
131 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;        /* 65535 */
132
133 /*
134  * Reserved ports accessible only to root. There are significant
135  * security considerations that must be accounted for when changing these,
136  * but the security benefits can be great. Please be careful.
137  */
138 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;    /* 1023 */
139 VNET_DEFINE(int, ipport_reservedlow);
140
141 /* Variables dealing with random ephemeral port allocation. */
142 VNET_DEFINE(int, ipport_randomized) = 1;        /* user controlled via sysctl */
143 VNET_DEFINE(int, ipport_randomcps) = 10;        /* user controlled via sysctl */
144 VNET_DEFINE(int, ipport_randomtime) = 45;       /* user controlled via sysctl */
145 VNET_DEFINE(int, ipport_stoprandom);            /* toggled by ipport_tick */
146 VNET_DEFINE(int, ipport_tcpallocs);
147 VNET_DEFINE_STATIC(int, ipport_tcplastcount);
148
149 #define V_ipport_tcplastcount           VNET(ipport_tcplastcount)
150
151 #ifdef INET
152 static struct inpcb     *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
153                             struct in_addr faddr, u_int fport_arg,
154                             struct in_addr laddr, u_int lport_arg,
155                             int lookupflags, struct ifnet *ifp,
156                             uint8_t numa_domain);
157
158 #define RANGECHK(var, min, max) \
159         if ((var) < (min)) { (var) = (min); } \
160         else if ((var) > (max)) { (var) = (max); }
161
162 static int
163 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
164 {
165         int error;
166
167         error = sysctl_handle_int(oidp, arg1, arg2, req);
168         if (error == 0) {
169                 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
170                 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
171                 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
172                 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
173                 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
174                 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
175         }
176         return (error);
177 }
178
179 #undef RANGECHK
180
181 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
182     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
183     "IP Ports");
184
185 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
186     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
187     &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
188     "");
189 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
190     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
191     &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
192     "");
193 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
194     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
195     &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
196     "");
197 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
198     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
199     &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
200     "");
201 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
202     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
203     &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
204     "");
205 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
206     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
207     &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
208     "");
209 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
210         CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
211         &VNET_NAME(ipport_reservedhigh), 0, "");
212 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
213         CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
214 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
215         CTLFLAG_VNET | CTLFLAG_RW,
216         &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
217 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
218         CTLFLAG_VNET | CTLFLAG_RW,
219         &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
220         "allocations before switching to a sequential one");
221 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
222         CTLFLAG_VNET | CTLFLAG_RW,
223         &VNET_NAME(ipport_randomtime), 0,
224         "Minimum time to keep sequential port "
225         "allocation before switching to a random one");
226
#ifdef RATELIMIT
/* Rate-limit statistics, exported read-only below net.inet.ip.rl. */
counter_u64_t rate_limit_new;
counter_u64_t rate_limit_chg;
counter_u64_t rate_limit_active;
counter_u64_t rate_limit_alloc_fail;
counter_u64_t rate_limit_set_ok;

static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl,
    CTLFLAG_MPSAFE | CTLFLAG_RD, 0, "IP Rate Limiting");

SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
    &rate_limit_active,
    "Active rate limited connections");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
    &rate_limit_alloc_fail,
    "Rate limited connection failures");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
    &rate_limit_set_ok,
    "Rate limited setting succeeded");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
    &rate_limit_new,
    "Total Rate limit new attempts");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
    &rate_limit_chg,
    "Total Rate limited change attempts");

#endif /* RATELIMIT */
248
249 #endif /* INET */
250
251 VNET_DEFINE(uint32_t, in_pcbhashseed);
252 static void
253 in_pcbhashseed_init(void)
254 {
255
256         V_in_pcbhashseed = arc4random();
257 }
258 VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
259     in_pcbhashseed_init, 0);
260
261 /*
262  * in_pcb.c: manage the Protocol Control Blocks.
263  *
264  * NOTE: It is assumed that most of these functions will be called with
265  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
266  * functions often modify hash chains or addresses in pcbs.
267  */
268
269 static struct inpcblbgroup *
270 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
271     uint16_t port, const union in_dependaddr *addr, int size,
272     uint8_t numa_domain)
273 {
274         struct inpcblbgroup *grp;
275         size_t bytes;
276
277         bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
278         grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
279         if (!grp)
280                 return (NULL);
281         grp->il_vflag = vflag;
282         grp->il_lport = port;
283         grp->il_numa_domain = numa_domain;
284         grp->il_dependladdr = *addr;
285         grp->il_inpsiz = size;
286         CK_LIST_INSERT_HEAD(hdr, grp, il_list);
287         return (grp);
288 }
289
290 static void
291 in_pcblbgroup_free_deferred(epoch_context_t ctx)
292 {
293         struct inpcblbgroup *grp;
294
295         grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
296         free(grp, M_PCB);
297 }
298
299 static void
300 in_pcblbgroup_free(struct inpcblbgroup *grp)
301 {
302
303         CK_LIST_REMOVE(grp, il_list);
304         NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
305 }
306
307 static struct inpcblbgroup *
308 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
309     struct inpcblbgroup *old_grp, int size)
310 {
311         struct inpcblbgroup *grp;
312         int i;
313
314         grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
315             old_grp->il_lport, &old_grp->il_dependladdr, size,
316             old_grp->il_numa_domain);
317         if (grp == NULL)
318                 return (NULL);
319
320         KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
321             ("invalid new local group size %d and old local group count %d",
322              grp->il_inpsiz, old_grp->il_inpcnt));
323
324         for (i = 0; i < old_grp->il_inpcnt; ++i)
325                 grp->il_inp[i] = old_grp->il_inp[i];
326         grp->il_inpcnt = old_grp->il_inpcnt;
327         in_pcblbgroup_free(old_grp);
328         return (grp);
329 }
330
331 /*
332  * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
333  * and shrink group if possible.
334  */
335 static void
336 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
337     int i)
338 {
339         struct inpcblbgroup *grp, *new_grp;
340
341         grp = *grpp;
342         for (; i + 1 < grp->il_inpcnt; ++i)
343                 grp->il_inp[i] = grp->il_inp[i + 1];
344         grp->il_inpcnt--;
345
346         if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
347             grp->il_inpcnt <= grp->il_inpsiz / 4) {
348                 /* Shrink this group. */
349                 new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
350                 if (new_grp != NULL)
351                         *grpp = new_grp;
352         }
353 }
354
355 /*
356  * Add PCB to load balance group for SO_REUSEPORT_LB option.
357  */
358 static int
359 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
360 {
361         const static struct timeval interval = { 60, 0 };
362         static struct timeval lastprint;
363         struct inpcbinfo *pcbinfo;
364         struct inpcblbgrouphead *hdr;
365         struct inpcblbgroup *grp;
366         uint32_t idx;
367
368         pcbinfo = inp->inp_pcbinfo;
369
370         INP_WLOCK_ASSERT(inp);
371         INP_HASH_WLOCK_ASSERT(pcbinfo);
372
373         /*
374          * Don't allow jailed socket to join local group.
375          */
376         if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred))
377                 return (0);
378
379 #ifdef INET6
380         /*
381          * Don't allow IPv4 mapped INET6 wild socket.
382          */
383         if ((inp->inp_vflag & INP_IPV4) &&
384             inp->inp_laddr.s_addr == INADDR_ANY &&
385             INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
386                 return (0);
387         }
388 #endif
389
390         idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
391         hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
392         CK_LIST_FOREACH(grp, hdr, il_list) {
393                 if (grp->il_vflag == inp->inp_vflag &&
394                     grp->il_lport == inp->inp_lport &&
395                     grp->il_numa_domain == numa_domain &&
396                     memcmp(&grp->il_dependladdr,
397                     &inp->inp_inc.inc_ie.ie_dependladdr,
398                     sizeof(grp->il_dependladdr)) == 0)
399                         break;
400         }
401         if (grp == NULL) {
402                 /* Create new load balance group. */
403                 grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
404                     inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
405                     INPCBLBGROUP_SIZMIN, numa_domain);
406                 if (grp == NULL)
407                         return (ENOBUFS);
408         } else if (grp->il_inpcnt == grp->il_inpsiz) {
409                 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
410                         if (ratecheck(&lastprint, &interval))
411                                 printf("lb group port %d, limit reached\n",
412                                     ntohs(grp->il_lport));
413                         return (0);
414                 }
415
416                 /* Expand this local group. */
417                 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
418                 if (grp == NULL)
419                         return (ENOBUFS);
420         }
421
422         KASSERT(grp->il_inpcnt < grp->il_inpsiz,
423             ("invalid local group size %d and count %d", grp->il_inpsiz,
424             grp->il_inpcnt));
425
426         grp->il_inp[grp->il_inpcnt] = inp;
427         grp->il_inpcnt++;
428         return (0);
429 }
430
431 /*
432  * Remove PCB from load balance group.
433  */
434 static void
435 in_pcbremlbgrouphash(struct inpcb *inp)
436 {
437         struct inpcbinfo *pcbinfo;
438         struct inpcblbgrouphead *hdr;
439         struct inpcblbgroup *grp;
440         int i;
441
442         pcbinfo = inp->inp_pcbinfo;
443
444         INP_WLOCK_ASSERT(inp);
445         INP_HASH_WLOCK_ASSERT(pcbinfo);
446
447         hdr = &pcbinfo->ipi_lbgrouphashbase[
448             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
449         CK_LIST_FOREACH(grp, hdr, il_list) {
450                 for (i = 0; i < grp->il_inpcnt; ++i) {
451                         if (grp->il_inp[i] != inp)
452                                 continue;
453
454                         if (grp->il_inpcnt == 1) {
455                                 /* We are the last, free this local group. */
456                                 in_pcblbgroup_free(grp);
457                         } else {
458                                 /* Pull up inpcbs, shrink group if possible. */
459                                 in_pcblbgroup_reorder(hdr, &grp, i);
460                         }
461                         return;
462                 }
463         }
464 }
465
466 int
467 in_pcblbgroup_numa(struct inpcb *inp, int arg)
468 {
469         struct inpcbinfo *pcbinfo;
470         struct inpcblbgrouphead *hdr;
471         struct inpcblbgroup *grp;
472         int err, i;
473         uint8_t numa_domain;
474
475         switch (arg) {
476         case TCP_REUSPORT_LB_NUMA_NODOM:
477                 numa_domain = M_NODOM;
478                 break;
479         case TCP_REUSPORT_LB_NUMA_CURDOM:
480                 numa_domain = PCPU_GET(domain);
481                 break;
482         default:
483                 if (arg < 0 || arg >= vm_ndomains)
484                         return (EINVAL);
485                 numa_domain = arg;
486         }
487
488         err = 0;
489         pcbinfo = inp->inp_pcbinfo;
490         INP_WLOCK_ASSERT(inp);
491         INP_HASH_WLOCK(pcbinfo);
492         hdr = &pcbinfo->ipi_lbgrouphashbase[
493             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
494         CK_LIST_FOREACH(grp, hdr, il_list) {
495                 for (i = 0; i < grp->il_inpcnt; ++i) {
496                         if (grp->il_inp[i] != inp)
497                                 continue;
498
499                         if (grp->il_numa_domain == numa_domain) {
500                                 goto abort_with_hash_wlock;
501                         }
502
503                         /* Remove it from the old group. */
504                         in_pcbremlbgrouphash(inp);
505
506                         /* Add it to the new group based on numa domain. */
507                         in_pcbinslbgrouphash(inp, numa_domain);
508                         goto abort_with_hash_wlock;
509                 }
510         }
511         err = ENOENT;
512 abort_with_hash_wlock:
513         INP_HASH_WUNLOCK(pcbinfo);
514         return (err);
515 }
516
517 /* Make sure it is safe to use hashinit(9) on CK_LIST. */
518 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));
519
520 /*
521  * Initialize an inpcbinfo - a per-VNET instance of connections db.
522  */
523 void
524 in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
525     u_int hash_nelements, u_int porthash_nelements)
526 {
527
528         mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
529         mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
530             NULL, MTX_DEF);
531 #ifdef VIMAGE
532         pcbinfo->ipi_vnet = curvnet;
533 #endif
534         CK_LIST_INIT(&pcbinfo->ipi_listhead);
535         pcbinfo->ipi_count = 0;
536         pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
537             &pcbinfo->ipi_hashmask);
538         porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
539         pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
540             &pcbinfo->ipi_porthashmask);
541         pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
542             &pcbinfo->ipi_lbgrouphashmask);
543         pcbinfo->ipi_zone = pcbstor->ips_zone;
544         pcbinfo->ipi_portzone = pcbstor->ips_portzone;
545         pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
546 }
547
548 /*
549  * Destroy an inpcbinfo.
550  */
551 void
552 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
553 {
554
555         KASSERT(pcbinfo->ipi_count == 0,
556             ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
557
558         hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
559         hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
560             pcbinfo->ipi_porthashmask);
561         hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
562             pcbinfo->ipi_lbgrouphashmask);
563         mtx_destroy(&pcbinfo->ipi_hash_lock);
564         mtx_destroy(&pcbinfo->ipi_lock);
565 }
566
567 /*
568  * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
569  */
570 static void inpcb_dtor(void *, int, void *);
571 static void inpcb_fini(void *, int);
572 void
573 in_pcbstorage_init(void *arg)
574 {
575         struct inpcbstorage *pcbstor = arg;
576
577         pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
578             sizeof(struct inpcb), NULL, inpcb_dtor, pcbstor->ips_pcbinit,
579             inpcb_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR);
580         pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name,
581             sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
582         uma_zone_set_smr(pcbstor->ips_portzone,
583             uma_zone_get_smr(pcbstor->ips_zone));
584 }
585
586 /*
587  * Destroy a pcbstorage - used by unloadable protocols.
588  */
589 void
590 in_pcbstorage_destroy(void *arg)
591 {
592         struct inpcbstorage *pcbstor = arg;
593
594         uma_zdestroy(pcbstor->ips_zone);
595         uma_zdestroy(pcbstor->ips_portzone);
596 }
597
598 /*
599  * Allocate a PCB and associate it with the socket.
600  * On success return with the PCB locked.
601  */
602 int
603 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
604 {
605         struct inpcb *inp;
606 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
607         int error;
608 #endif
609
610         inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
611         if (inp == NULL)
612                 return (ENOBUFS);
613         bzero(&inp->inp_start_zero, inp_zero_size);
614 #ifdef NUMA
615         inp->inp_numa_domain = M_NODOM;
616 #endif
617         inp->inp_pcbinfo = pcbinfo;
618         inp->inp_socket = so;
619         inp->inp_cred = crhold(so->so_cred);
620         inp->inp_inc.inc_fibnum = so->so_fibnum;
621 #ifdef MAC
622         error = mac_inpcb_init(inp, M_NOWAIT);
623         if (error != 0)
624                 goto out;
625         mac_inpcb_create(so, inp);
626 #endif
627 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
628         error = ipsec_init_pcbpolicy(inp);
629         if (error != 0) {
630 #ifdef MAC
631                 mac_inpcb_destroy(inp);
632 #endif
633                 goto out;
634         }
635 #endif /*IPSEC*/
636 #ifdef INET6
637         if (INP_SOCKAF(so) == AF_INET6) {
638                 inp->inp_vflag |= INP_IPV6PROTO;
639                 if (V_ip6_v6only)
640                         inp->inp_flags |= IN6P_IPV6_V6ONLY;
641         }
642         if (V_ip6_auto_flowlabel)
643                 inp->inp_flags |= IN6P_AUTOFLOWLABEL;
644 #endif
645         /*
646          * Routes in inpcb's can cache L2 as well; they are guaranteed
647          * to be cleaned up.
648          */
649         inp->inp_route.ro_flags = RT_LLE_CACHE;
650 #ifdef TCPHPTS
651         /*
652          * If using hpts lets drop a random number in so
653          * not all new connections fall on the same CPU.
654          */
655         inp->inp_hpts_cpu = hpts_random_cpu(inp);
656 #endif
657         refcount_init(&inp->inp_refcount, 1);   /* Reference from socket. */
658         INP_WLOCK(inp);
659         INP_INFO_WLOCK(pcbinfo);
660         pcbinfo->ipi_count++;
661         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
662         CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
663         INP_INFO_WUNLOCK(pcbinfo);
664         so->so_pcb = inp;
665
666         return (0);
667
668 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
669 out:
670         uma_zfree_smr(pcbinfo->ipi_zone, inp);
671         return (error);
672 #endif
673 }
674
#ifdef INET
/*
 * Bind an unbound IPv4 PCB to the address/port in 'nam' (or to an
 * automatically chosen port when 'nam' is NULL or names port 0) and
 * enter it into the hash.
 *
 * Asserts that both the inpcb write lock and the pcbinfo hash write
 * lock are held.
 *
 * Returns EINVAL if the PCB already has a local port or address, the
 * error from in_pcbbind_setup(), EAGAIN if hashing fails (the local
 * address and port are reset in that case), or 0 on success.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
{
	int anonport, error;

	KASSERT(nam == NULL || nam->sa_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, nam));
	KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in),
	    ("%s: invalid address length for %p", __func__, nam));
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	/* Refuse to rebind. */
	if (inp->inp_laddr.s_addr != INADDR_ANY || inp->inp_lport != 0)
		return (EINVAL);

	/* Remember whether the port will be picked for the caller. */
	if (nam == NULL)
		anonport = 1;
	else
		anonport = (((struct sockaddr_in *)nam)->sin_port == 0);

	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
	    &inp->inp_lport, cred);
	if (error != 0)
		return (error);

	if (in_pcbinshash(inp) != 0) {
		/* Roll back the binding. */
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		return (EAGAIN);
	}

	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
#endif
705
706 #if defined(INET) || defined(INET6)
707 /*
708  * Assign a local port like in_pcb_lport(), but also used with connect()
709  * and a foreign address and port.  If fsa is non-NULL, choose a local port
710  * that is unused with those, otherwise one that is completely unused.
711  * lsa can be NULL for IPv6.
712  */
713 int
714 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
715     struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
716 {
717         struct inpcbinfo *pcbinfo;
718         struct inpcb *tmpinp;
719         unsigned short *lastport;
720         int count, dorandom, error;
721         u_short aux, first, last, lport;
722 #ifdef INET
723         struct in_addr laddr, faddr;
724 #endif
725 #ifdef INET6
726         struct in6_addr *laddr6, *faddr6;
727 #endif
728
729         pcbinfo = inp->inp_pcbinfo;
730
731         /*
732          * Because no actual state changes occur here, a global write lock on
733          * the pcbinfo isn't required.
734          */
735         INP_LOCK_ASSERT(inp);
736         INP_HASH_LOCK_ASSERT(pcbinfo);
737
738         if (inp->inp_flags & INP_HIGHPORT) {
739                 first = V_ipport_hifirstauto;   /* sysctl */
740                 last  = V_ipport_hilastauto;
741                 lastport = &pcbinfo->ipi_lasthi;
742         } else if (inp->inp_flags & INP_LOWPORT) {
743                 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
744                 if (error)
745                         return (error);
746                 first = V_ipport_lowfirstauto;  /* 1023 */
747                 last  = V_ipport_lowlastauto;   /* 600 */
748                 lastport = &pcbinfo->ipi_lastlow;
749         } else {
750                 first = V_ipport_firstauto;     /* sysctl */
751                 last  = V_ipport_lastauto;
752                 lastport = &pcbinfo->ipi_lastport;
753         }
754         /*
755          * For UDP(-Lite), use random port allocation as long as the user
756          * allows it.  For TCP (and as of yet unknown) connections,
757          * use random port allocation only if the user allows it AND
758          * ipport_tick() allows it.
759          */
760         if (V_ipport_randomized &&
761                 (!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
762                 pcbinfo == &V_ulitecbinfo))
763                 dorandom = 1;
764         else
765                 dorandom = 0;
766         /*
767          * It makes no sense to do random port allocation if
768          * we have the only port available.
769          */
770         if (first == last)
771                 dorandom = 0;
772         /* Make sure to not include UDP(-Lite) packets in the count. */
773         if (pcbinfo != &V_udbinfo && pcbinfo != &V_ulitecbinfo)
774                 V_ipport_tcpallocs++;
775         /*
776          * Instead of having two loops further down counting up or down
777          * make sure that first is always <= last and go with only one
778          * code path implementing all logic.
779          */
780         if (first > last) {
781                 aux = first;
782                 first = last;
783                 last = aux;
784         }
785
786 #ifdef INET
787         laddr.s_addr = INADDR_ANY;
788         if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
789                 if (lsa != NULL)
790                         laddr = ((struct sockaddr_in *)lsa)->sin_addr;
791                 if (fsa != NULL)
792                         faddr = ((struct sockaddr_in *)fsa)->sin_addr;
793         }
794 #endif
795 #ifdef INET6
796         laddr6 = NULL;
797         if ((inp->inp_vflag & INP_IPV6) != 0) {
798                 if (lsa != NULL)
799                         laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
800                 if (fsa != NULL)
801                         faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
802         }
803 #endif
804
805         tmpinp = NULL;
806         lport = *lportp;
807
808         if (dorandom)
809                 *lastport = first + (arc4random() % (last - first));
810
811         count = last - first;
812
813         do {
814                 if (count-- < 0)        /* completely used? */
815                         return (EADDRNOTAVAIL);
816                 ++*lastport;
817                 if (*lastport < first || *lastport > last)
818                         *lastport = first;
819                 lport = htons(*lastport);
820
821                 if (fsa != NULL) {
822 #ifdef INET
823                         if (lsa->sa_family == AF_INET) {
824                                 tmpinp = in_pcblookup_hash_locked(pcbinfo,
825                                     faddr, fport, laddr, lport, lookupflags,
826                                     NULL, M_NODOM);
827                         }
828 #endif
829 #ifdef INET6
830                         if (lsa->sa_family == AF_INET6) {
831                                 tmpinp = in6_pcblookup_hash_locked(pcbinfo,
832                                     faddr6, fport, laddr6, lport, lookupflags,
833                                     NULL, M_NODOM);
834                         }
835 #endif
836                 } else {
837 #ifdef INET6
838                         if ((inp->inp_vflag & INP_IPV6) != 0)
839                                 tmpinp = in6_pcblookup_local(pcbinfo,
840                                     &inp->in6p_laddr, lport, lookupflags, cred);
841 #endif
842 #if defined(INET) && defined(INET6)
843                         else
844 #endif
845 #ifdef INET
846                                 tmpinp = in_pcblookup_local(pcbinfo, laddr,
847                                     lport, lookupflags, cred);
848 #endif
849                 }
850         } while (tmpinp != NULL);
851
852         *lportp = lport;
853
854         return (0);
855 }
856
857 /*
858  * Select a local port (number) to use.
859  */
860 int
861 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
862     struct ucred *cred, int lookupflags)
863 {
864         struct sockaddr_in laddr;
865
866         if (laddrp) {
867                 bzero(&laddr, sizeof(laddr));
868                 laddr.sin_family = AF_INET;
869                 laddr.sin_addr = *laddrp;
870         }
871         return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
872             NULL, lportp, NULL, 0, cred, lookupflags));
873 }
874
875 /*
876  * Return cached socket options.
877  */
878 int
879 inp_so_options(const struct inpcb *inp)
880 {
881         int so_options;
882
883         so_options = 0;
884
885         if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
886                 so_options |= SO_REUSEPORT_LB;
887         if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
888                 so_options |= SO_REUSEPORT;
889         if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
890                 so_options |= SO_REUSEADDR;
891         return (so_options);
892 }
893 #endif /* INET || INET6 */
894
895 /*
896  * Check if a new BINDMULTI socket is allowed to be created.
897  *
898  * ni points to the new inp.
899  * oi points to the existing inp.
900  *
901  * This checks whether the existing inp also has BINDMULTI and
902  * whether the credentials match.
903  */
904 int
905 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
906 {
907         /* Check permissions match */
908         if ((ni->inp_flags2 & INP_BINDMULTI) &&
909             (ni->inp_cred->cr_uid !=
910             oi->inp_cred->cr_uid))
911                 return (0);
912
913         /* Check the existing inp has BINDMULTI set */
914         if ((ni->inp_flags2 & INP_BINDMULTI) &&
915             ((oi->inp_flags2 & INP_BINDMULTI) == 0))
916                 return (0);
917
918         /*
919          * We're okay - either INP_BINDMULTI isn't set on ni, or
920          * it is and it matches the checks.
921          */
922         return (1);
923 }
924
925 #ifdef INET
926 /*
927  * Set up a bind operation on a PCB, performing port allocation
928  * as required, but do not actually modify the PCB. Callers can
929  * either complete the bind by setting inp_laddr/inp_lport and
930  * calling in_pcbinshash(), or they can just use the resulting
931  * port and address to authorise the sending of a once-off packet.
932  *
933  * On error, the values of *laddrp and *lportp are not changed.
934  */
935 int
936 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
937     u_short *lportp, struct ucred *cred)
938 {
939         struct socket *so = inp->inp_socket;
940         struct sockaddr_in *sin;
941         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
942         struct in_addr laddr;
943         u_short lport = 0;
944         int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
945         int error;
946
947         /*
948          * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
949          * so that we don't have to add to the (already messy) code below.
950          */
951         int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
952
953         /*
954          * No state changes, so read locks are sufficient here.
955          */
956         INP_LOCK_ASSERT(inp);
957         INP_HASH_LOCK_ASSERT(pcbinfo);
958
959         laddr.s_addr = *laddrp;
960         if (nam != NULL && laddr.s_addr != INADDR_ANY)
961                 return (EINVAL);
962         if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
963                 lookupflags = INPLOOKUP_WILDCARD;
964         if (nam == NULL) {
965                 if ((error = prison_local_ip4(cred, &laddr)) != 0)
966                         return (error);
967         } else {
968                 sin = (struct sockaddr_in *)nam;
969                 KASSERT(sin->sin_family == AF_INET,
970                     ("%s: invalid family for address %p", __func__, sin));
971                 KASSERT(sin->sin_len == sizeof(*sin),
972                     ("%s: invalid length for address %p", __func__, sin));
973
974                 error = prison_local_ip4(cred, &sin->sin_addr);
975                 if (error)
976                         return (error);
977                 if (sin->sin_port != *lportp) {
978                         /* Don't allow the port to change. */
979                         if (*lportp != 0)
980                                 return (EINVAL);
981                         lport = sin->sin_port;
982                 }
983                 /* NB: lport is left as 0 if the port isn't being changed. */
984                 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
985                         /*
986                          * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
987                          * allow complete duplication of binding if
988                          * SO_REUSEPORT is set, or if SO_REUSEADDR is set
989                          * and a multicast address is bound on both
990                          * new and duplicated sockets.
991                          */
992                         if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
993                                 reuseport = SO_REUSEADDR|SO_REUSEPORT;
994                         /*
995                          * XXX: How to deal with SO_REUSEPORT_LB here?
996                          * Treat same as SO_REUSEPORT for now.
997                          */
998                         if ((so->so_options &
999                             (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
1000                                 reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
1001                 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
1002                         sin->sin_port = 0;              /* yech... */
1003                         bzero(&sin->sin_zero, sizeof(sin->sin_zero));
1004                         /*
1005                          * Is the address a local IP address?
1006                          * If INP_BINDANY is set, then the socket may be bound
1007                          * to any endpoint address, local or not.
1008                          */
1009                         if ((inp->inp_flags & INP_BINDANY) == 0 &&
1010                             ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
1011                                 return (EADDRNOTAVAIL);
1012                 }
1013                 laddr = sin->sin_addr;
1014                 if (lport) {
1015                         struct inpcb *t;
1016                         struct tcptw *tw;
1017
1018                         /* GROSS */
1019                         if (ntohs(lport) <= V_ipport_reservedhigh &&
1020                             ntohs(lport) >= V_ipport_reservedlow &&
1021                             priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
1022                                 return (EACCES);
1023                         if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
1024                             priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
1025                                 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
1026                                     lport, INPLOOKUP_WILDCARD, cred);
1027         /*
1028          * XXX
1029          * This entire block sorely needs a rewrite.
1030          */
1031                                 if (t &&
1032                                     ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
1033                                     ((t->inp_flags & INP_TIMEWAIT) == 0) &&
1034                                     (so->so_type != SOCK_STREAM ||
1035                                      ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
1036                                     (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
1037                                      ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
1038                                      (t->inp_flags2 & INP_REUSEPORT) ||
1039                                      (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
1040                                     (inp->inp_cred->cr_uid !=
1041                                      t->inp_cred->cr_uid))
1042                                         return (EADDRINUSE);
1043
1044                                 /*
1045                                  * If the socket is a BINDMULTI socket, then
1046                                  * the credentials need to match and the
1047                                  * original socket also has to have been bound
1048                                  * with BINDMULTI.
1049                                  */
1050                                 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
1051                                         return (EADDRINUSE);
1052                         }
1053                         t = in_pcblookup_local(pcbinfo, sin->sin_addr,
1054                             lport, lookupflags, cred);
1055                         if (t && (t->inp_flags & INP_TIMEWAIT)) {
1056                                 /*
1057                                  * XXXRW: If an incpb has had its timewait
1058                                  * state recycled, we treat the address as
1059                                  * being in use (for now).  This is better
1060                                  * than a panic, but not desirable.
1061                                  */
1062                                 tw = intotw(t);
1063                                 if (tw == NULL ||
1064                                     ((reuseport & tw->tw_so_options) == 0 &&
1065                                         (reuseport_lb &
1066                                             tw->tw_so_options) == 0)) {
1067                                         return (EADDRINUSE);
1068                                 }
1069                         } else if (t &&
1070                                    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
1071                                    (reuseport & inp_so_options(t)) == 0 &&
1072                                    (reuseport_lb & inp_so_options(t)) == 0) {
1073 #ifdef INET6
1074                                 if (ntohl(sin->sin_addr.s_addr) !=
1075                                     INADDR_ANY ||
1076                                     ntohl(t->inp_laddr.s_addr) !=
1077                                     INADDR_ANY ||
1078                                     (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
1079                                     (t->inp_vflag & INP_IPV6PROTO) == 0)
1080 #endif
1081                                                 return (EADDRINUSE);
1082                                 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
1083                                         return (EADDRINUSE);
1084                         }
1085                 }
1086         }
1087         if (*lportp != 0)
1088                 lport = *lportp;
1089         if (lport == 0) {
1090                 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
1091                 if (error != 0)
1092                         return (error);
1093         }
1094         *laddrp = laddr.s_addr;
1095         *lportp = lport;
1096         return (0);
1097 }
1098
1099 /*
1100  * Connect from a socket to a specified address.
1101  * Both address and port must be specified in argument sin.
1102  * If don't have a local address for this socket yet,
1103  * then pick one.
1104  */
1105 int
1106 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred,
1107     bool rehash)
1108 {
1109         u_short lport, fport;
1110         in_addr_t laddr, faddr;
1111         int anonport, error;
1112
1113         INP_WLOCK_ASSERT(inp);
1114         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
1115
1116         lport = inp->inp_lport;
1117         laddr = inp->inp_laddr.s_addr;
1118         anonport = (lport == 0);
1119         error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
1120             NULL, cred);
1121         if (error)
1122                 return (error);
1123
1124         /* Do the initial binding of the local address if required. */
1125         if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
1126                 KASSERT(rehash == true,
1127                     ("Rehashing required for unbound inps"));
1128                 inp->inp_lport = lport;
1129                 inp->inp_laddr.s_addr = laddr;
1130                 if (in_pcbinshash(inp) != 0) {
1131                         inp->inp_laddr.s_addr = INADDR_ANY;
1132                         inp->inp_lport = 0;
1133                         return (EAGAIN);
1134                 }
1135         }
1136
1137         /* Commit the remaining changes. */
1138         inp->inp_lport = lport;
1139         inp->inp_laddr.s_addr = laddr;
1140         inp->inp_faddr.s_addr = faddr;
1141         inp->inp_fport = fport;
1142         if (rehash) {
1143                 in_pcbrehash(inp);
1144         } else {
1145                 in_pcbinshash(inp);
1146         }
1147
1148         if (anonport)
1149                 inp->inp_flags |= INP_ANONPORT;
1150         return (0);
1151 }
1152
1153 /*
1154  * Do proper source address selection on an unbound socket in case
1155  * of connect. Take jails into account as well.
1156  */
1157 int
1158 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
1159     struct ucred *cred)
1160 {
1161         struct ifaddr *ifa;
1162         struct sockaddr *sa;
1163         struct sockaddr_in *sin, dst;
1164         struct nhop_object *nh;
1165         int error;
1166
1167         NET_EPOCH_ASSERT();
1168         KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
1169         /*
1170          * Bypass source address selection and use the primary jail IP
1171          * if requested.
1172          */
1173         if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
1174                 return (0);
1175
1176         error = 0;
1177
1178         nh = NULL;
1179         bzero(&dst, sizeof(dst));
1180         sin = &dst;
1181         sin->sin_family = AF_INET;
1182         sin->sin_len = sizeof(struct sockaddr_in);
1183         sin->sin_addr.s_addr = faddr->s_addr;
1184
1185         /*
1186          * If route is known our src addr is taken from the i/f,
1187          * else punt.
1188          *
1189          * Find out route to destination.
1190          */
1191         if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
1192                 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
1193                     0, NHR_NONE, 0);
1194
1195         /*
1196          * If we found a route, use the address corresponding to
1197          * the outgoing interface.
1198          *
1199          * Otherwise assume faddr is reachable on a directly connected
1200          * network and try to find a corresponding interface to take
1201          * the source address from.
1202          */
1203         if (nh == NULL || nh->nh_ifp == NULL) {
1204                 struct in_ifaddr *ia;
1205                 struct ifnet *ifp;
1206
1207                 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
1208                                         inp->inp_socket->so_fibnum));
1209                 if (ia == NULL) {
1210                         ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
1211                                                 inp->inp_socket->so_fibnum));
1212                 }
1213                 if (ia == NULL) {
1214                         error = ENETUNREACH;
1215                         goto done;
1216                 }
1217
1218                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1219                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1220                         goto done;
1221                 }
1222
1223                 ifp = ia->ia_ifp;
1224                 ia = NULL;
1225                 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1226                         sa = ifa->ifa_addr;
1227                         if (sa->sa_family != AF_INET)
1228                                 continue;
1229                         sin = (struct sockaddr_in *)sa;
1230                         if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1231                                 ia = (struct in_ifaddr *)ifa;
1232                                 break;
1233                         }
1234                 }
1235                 if (ia != NULL) {
1236                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1237                         goto done;
1238                 }
1239
1240                 /* 3. As a last resort return the 'default' jail address. */
1241                 error = prison_get_ip4(cred, laddr);
1242                 goto done;
1243         }
1244
1245         /*
1246          * If the outgoing interface on the route found is not
1247          * a loopback interface, use the address from that interface.
1248          * In case of jails do those three steps:
1249          * 1. check if the interface address belongs to the jail. If so use it.
1250          * 2. check if we have any address on the outgoing interface
1251          *    belonging to this jail. If so use it.
1252          * 3. as a last resort return the 'default' jail address.
1253          */
1254         if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
1255                 struct in_ifaddr *ia;
1256                 struct ifnet *ifp;
1257
1258                 /* If not jailed, use the default returned. */
1259                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1260                         ia = (struct in_ifaddr *)nh->nh_ifa;
1261                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1262                         goto done;
1263                 }
1264
1265                 /* Jailed. */
1266                 /* 1. Check if the iface address belongs to the jail. */
1267                 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
1268                 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1269                         ia = (struct in_ifaddr *)nh->nh_ifa;
1270                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1271                         goto done;
1272                 }
1273
1274                 /*
1275                  * 2. Check if we have any address on the outgoing interface
1276                  *    belonging to this jail.
1277                  */
1278                 ia = NULL;
1279                 ifp = nh->nh_ifp;
1280                 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1281                         sa = ifa->ifa_addr;
1282                         if (sa->sa_family != AF_INET)
1283                                 continue;
1284                         sin = (struct sockaddr_in *)sa;
1285                         if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1286                                 ia = (struct in_ifaddr *)ifa;
1287                                 break;
1288                         }
1289                 }
1290                 if (ia != NULL) {
1291                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1292                         goto done;
1293                 }
1294
1295                 /* 3. As a last resort return the 'default' jail address. */
1296                 error = prison_get_ip4(cred, laddr);
1297                 goto done;
1298         }
1299
1300         /*
1301          * The outgoing interface is marked with 'loopback net', so a route
1302          * to ourselves is here.
1303          * Try to find the interface of the destination address and then
1304          * take the address from there. That interface is not necessarily
1305          * a loopback interface.
1306          * In case of jails, check that it is an address of the jail
1307          * and if we cannot find, fall back to the 'default' jail address.
1308          */
1309         if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
1310                 struct in_ifaddr *ia;
1311
1312                 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
1313                                         inp->inp_socket->so_fibnum));
1314                 if (ia == NULL)
1315                         ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
1316                                                 inp->inp_socket->so_fibnum));
1317                 if (ia == NULL)
1318                         ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));
1319
1320                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1321                         if (ia == NULL) {
1322                                 error = ENETUNREACH;
1323                                 goto done;
1324                         }
1325                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1326                         goto done;
1327                 }
1328
1329                 /* Jailed. */
1330                 if (ia != NULL) {
1331                         struct ifnet *ifp;
1332
1333                         ifp = ia->ia_ifp;
1334                         ia = NULL;
1335                         CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1336                                 sa = ifa->ifa_addr;
1337                                 if (sa->sa_family != AF_INET)
1338                                         continue;
1339                                 sin = (struct sockaddr_in *)sa;
1340                                 if (prison_check_ip4(cred,
1341                                     &sin->sin_addr) == 0) {
1342                                         ia = (struct in_ifaddr *)ifa;
1343                                         break;
1344                                 }
1345                         }
1346                         if (ia != NULL) {
1347                                 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1348                                 goto done;
1349                         }
1350                 }
1351
1352                 /* 3. As a last resort return the 'default' jail address. */
1353                 error = prison_get_ip4(cred, laddr);
1354                 goto done;
1355         }
1356
1357 done:
1358         return (error);
1359 }
1360
1361 /*
1362  * Set up for a connect from a socket to the specified address.
1363  * On entry, *laddrp and *lportp should contain the current local
1364  * address and port for the PCB; these are updated to the values
1365  * that should be placed in inp_laddr and inp_lport to complete
1366  * the connect.
1367  *
1368  * On success, *faddrp and *fportp will be set to the remote address
1369  * and port. These are not updated in the error case.
1370  *
1371  * If the operation fails because the connection already exists,
1372  * *oinpp will be set to the PCB of that connection so that the
1373  * caller can decide to override it. In all other cases, *oinpp
1374  * is set to NULL.
1375  */
1376 int
1377 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
1378     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
1379     struct inpcb **oinpp, struct ucred *cred)
1380 {
1381         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
1382         struct in_ifaddr *ia;
1383         struct inpcb *oinp;
1384         struct in_addr laddr, faddr;
1385         u_short lport, fport;
1386         int error;
1387
1388         KASSERT(sin->sin_family == AF_INET,
1389             ("%s: invalid address family for %p", __func__, sin));
1390         KASSERT(sin->sin_len == sizeof(*sin),
1391             ("%s: invalid address length for %p", __func__, sin));
1392
1393         /*
1394          * Because a global state change doesn't actually occur here, a read
1395          * lock is sufficient.
1396          */
1397         NET_EPOCH_ASSERT();
1398         INP_LOCK_ASSERT(inp);
1399         INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
1400
1401         if (oinpp != NULL)
1402                 *oinpp = NULL;
1403         if (sin->sin_port == 0)
1404                 return (EADDRNOTAVAIL);
1405         laddr.s_addr = *laddrp;
1406         lport = *lportp;
1407         faddr = sin->sin_addr;
1408         fport = sin->sin_port;
1409 #ifdef ROUTE_MPATH
1410         if (CALC_FLOWID_OUTBOUND) {
1411                 uint32_t hash_val, hash_type;
1412
1413                 hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
1414                     inp->inp_socket->so_proto->pr_protocol, &hash_type);
1415
1416                 inp->inp_flowid = hash_val;
1417                 inp->inp_flowtype = hash_type;
1418         }
1419 #endif
1420         if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
1421                 /*
1422                  * If the destination address is INADDR_ANY,
1423                  * use the primary local address.
1424                  * If the supplied address is INADDR_BROADCAST,
1425                  * and the primary interface supports broadcast,
1426                  * choose the broadcast address for that interface.
1427                  */
1428                 if (faddr.s_addr == INADDR_ANY) {
1429                         faddr =
1430                             IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
1431                         if (cred != NULL &&
1432                             (error = prison_get_ip4(cred, &faddr)) != 0)
1433                                 return (error);
1434                 } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
1435                         if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
1436                             IFF_BROADCAST)
1437                                 faddr = satosin(&CK_STAILQ_FIRST(
1438                                     &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
1439                 }
1440         }
1441         if (laddr.s_addr == INADDR_ANY) {
1442                 error = in_pcbladdr(inp, &faddr, &laddr, cred);
1443                 /*
1444                  * If the destination address is multicast and an outgoing
1445                  * interface has been set as a multicast option, prefer the
1446                  * address of that interface as our source address.
1447                  */
1448                 if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
1449                     inp->inp_moptions != NULL) {
1450                         struct ip_moptions *imo;
1451                         struct ifnet *ifp;
1452
1453                         imo = inp->inp_moptions;
1454                         if (imo->imo_multicast_ifp != NULL) {
1455                                 ifp = imo->imo_multicast_ifp;
1456                                 CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
1457                                         if ((ia->ia_ifp == ifp) &&
1458                                             (cred == NULL ||
1459                                             prison_check_ip4(cred,
1460                                             &ia->ia_addr.sin_addr) == 0))
1461                                                 break;
1462                                 }
1463                                 if (ia == NULL)
1464                                         error = EADDRNOTAVAIL;
1465                                 else {
1466                                         laddr = ia->ia_addr.sin_addr;
1467                                         error = 0;
1468                                 }
1469                         }
1470                 }
1471                 if (error)
1472                         return (error);
1473         }
1474
1475         if (lport != 0) {
1476                 oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
1477                     fport, laddr, lport, 0, NULL, M_NODOM);
1478                 if (oinp != NULL) {
1479                         if (oinpp != NULL)
1480                                 *oinpp = oinp;
1481                         return (EADDRINUSE);
1482                 }
1483         } else {
1484                 struct sockaddr_in lsin, fsin;
1485
1486                 bzero(&lsin, sizeof(lsin));
1487                 bzero(&fsin, sizeof(fsin));
1488                 lsin.sin_family = AF_INET;
1489                 lsin.sin_addr = laddr;
1490                 fsin.sin_family = AF_INET;
1491                 fsin.sin_addr = faddr;
1492                 error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
1493                     &lport, (struct sockaddr *)& fsin, fport, cred,
1494                     INPLOOKUP_WILDCARD);
1495                 if (error)
1496                         return (error);
1497         }
1498         *laddrp = laddr.s_addr;
1499         *lportp = lport;
1500         *faddrp = faddr.s_addr;
1501         *fportp = fport;
1502         return (0);
1503 }
1504
1505 void
1506 in_pcbdisconnect(struct inpcb *inp)
1507 {
1508
1509         INP_WLOCK_ASSERT(inp);
1510         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
1511
1512         inp->inp_faddr.s_addr = INADDR_ANY;
1513         inp->inp_fport = 0;
1514         in_pcbrehash(inp);
1515 }
1516 #endif /* INET */
1517
1518 /*
1519  * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
1520  * For most protocols, this will be invoked immediately prior to calling
1521  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
1522  * socket, in which case in_pcbfree() is deferred.
1523  */
1524 void
1525 in_pcbdetach(struct inpcb *inp)
1526 {
1527
1528         KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
1529
1530 #ifdef RATELIMIT
1531         if (inp->inp_snd_tag != NULL)
1532                 in_pcbdetach_txrtlmt(inp);
1533 #endif
1534         inp->inp_socket->so_pcb = NULL;
1535         inp->inp_socket = NULL;
1536 }
1537
1538 /*
1539  * inpcb hash lookups are protected by SMR section.
1540  *
1541  * Once desired pcb has been found, switching from SMR section to a pcb
1542  * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
1543  * here because SMR is a critical section.
1544  * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
1545  */
1546 static inline void
1547 inp_lock(struct inpcb *inp, const inp_lookup_t lock)
1548 {
1549
1550         lock == INPLOOKUP_RLOCKPCB ?
1551             rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
1552 }
1553
1554 static inline void
1555 inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
1556 {
1557
1558         lock == INPLOOKUP_RLOCKPCB ?
1559             rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
1560 }
1561
1562 static inline int
1563 inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
1564 {
1565
1566         return (lock == INPLOOKUP_RLOCKPCB ?
1567             rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
1568 }
1569
1570 static inline bool
1571 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
1572 {
1573
1574         return (lock == INPLOOKUP_RLOCKPCB ?
1575             in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
1576 }
1577
1578 bool
1579 inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
1580 {
1581
1582         MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
1583         SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);
1584
1585         if (__predict_true(inp_trylock(inp, lock))) {
1586                 if (__predict_false(inp->inp_flags & INP_FREED)) {
1587                         smr_exit(inp->inp_pcbinfo->ipi_smr);
1588                         inp_unlock(inp, lock);
1589                         return (false);
1590                 }
1591                 smr_exit(inp->inp_pcbinfo->ipi_smr);
1592                 return (true);
1593         }
1594
1595         if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
1596                 smr_exit(inp->inp_pcbinfo->ipi_smr);
1597                 inp_lock(inp, lock);
1598                 if (__predict_false(in_pcbrele(inp, lock)))
1599                         return (false);
1600                 /*
1601                  * inp acquired through refcount & lock for sure didn't went
1602                  * through uma_zfree().  However, it may have already went
1603                  * through in_pcbfree() and has another reference, that
1604                  * prevented its release by our in_pcbrele().
1605                  */
1606                 if (__predict_false(inp->inp_flags & INP_FREED)) {
1607                         inp_unlock(inp, lock);
1608                         return (false);
1609                 }
1610                 return (true);
1611         } else {
1612                 smr_exit(inp->inp_pcbinfo->ipi_smr);
1613                 return (false);
1614         }
1615 }
1616
1617 /*
1618  * inp_next() - inpcb hash/list traversal iterator
1619  *
1620  * Requires initialized struct inpcb_iterator for context.
1621  * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
1622  *
1623  * - Iterator can have either write-lock or read-lock semantics, that can not
1624  *   be changed later.
1625  * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
1626  *   a single hash slot.  Note: only rip_input() does the latter.
1627  * - Iterator may have optional bool matching function.  The matching function
1628  *   will be executed for each inpcb in the SMR context, so it can not acquire
1629  *   locks and can safely access only immutable fields of inpcb.
1630  *
1631  * A fresh initialized iterator has NULL inpcb in its context and that
1632  * means that inp_next() call would return the very first inpcb on the list
1633  * locked with desired semantic.  In all following calls the context pointer
1634  * shall hold the current inpcb pointer.  The KPI user is not supposed to
1635  * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
1636  * and write NULL to its context.  After end of traversal an iterator can be
1637  * reused.
1638  *
1639  * List traversals have the following features/constraints:
1640  * - New entries won't be seen, as they are always added to the head of a list.
1641  * - Removed entries won't stop traversal as long as they are not added to
1642  *   a different list. This is violated by in_pcbrehash().
1643  */
1644 #define II_LIST_FIRST(ipi, hash)                                        \
1645                 (((hash) == INP_ALL_LIST) ?                             \
1646                     CK_LIST_FIRST(&(ipi)->ipi_listhead) :               \
1647                     CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)]))
1648 #define II_LIST_NEXT(inp, hash)                                         \
1649                 (((hash) == INP_ALL_LIST) ?                             \
1650                     CK_LIST_NEXT((inp), inp_list) :                     \
1651                     CK_LIST_NEXT((inp), inp_hash))
1652 #define II_LOCK_ASSERT(inp, lock)                                       \
1653                 rw_assert(&(inp)->inp_lock,                             \
1654                     (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
1655 struct inpcb *
1656 inp_next(struct inpcb_iterator *ii)
1657 {
1658         const struct inpcbinfo *ipi = ii->ipi;
1659         inp_match_t *match = ii->match;
1660         void *ctx = ii->ctx;
1661         inp_lookup_t lock = ii->lock;
1662         int hash = ii->hash;
1663         struct inpcb *inp;
1664
1665         if (ii->inp == NULL) {          /* First call. */
1666                 smr_enter(ipi->ipi_smr);
1667                 /* This is unrolled CK_LIST_FOREACH(). */
1668                 for (inp = II_LIST_FIRST(ipi, hash);
1669                     inp != NULL;
1670                     inp = II_LIST_NEXT(inp, hash)) {
1671                         if (match != NULL && (match)(inp, ctx) == false)
1672                                 continue;
1673                         if (__predict_true(inp_smr_lock(inp, lock)))
1674                                 break;
1675                         else {
1676                                 smr_enter(ipi->ipi_smr);
1677                                 MPASS(inp != II_LIST_FIRST(ipi, hash));
1678                                 inp = II_LIST_FIRST(ipi, hash);
1679                                 if (inp == NULL)
1680                                         break;
1681                         }
1682                 }
1683
1684                 if (inp == NULL)
1685                         smr_exit(ipi->ipi_smr);
1686                 else
1687                         ii->inp = inp;
1688
1689                 return (inp);
1690         }
1691
1692         /* Not a first call. */
1693         smr_enter(ipi->ipi_smr);
1694 restart:
1695         inp = ii->inp;
1696         II_LOCK_ASSERT(inp, lock);
1697 next:
1698         inp = II_LIST_NEXT(inp, hash);
1699         if (inp == NULL) {
1700                 smr_exit(ipi->ipi_smr);
1701                 goto found;
1702         }
1703
1704         if (match != NULL && (match)(inp, ctx) == false)
1705                 goto next;
1706
1707         if (__predict_true(inp_trylock(inp, lock))) {
1708                 if (__predict_false(inp->inp_flags & INP_FREED)) {
1709                         /*
1710                          * Entries are never inserted in middle of a list, thus
1711                          * as long as we are in SMR, we can continue traversal.
1712                          * Jump to 'restart' should yield in the same result,
1713                          * but could produce unnecessary looping.  Could this
1714                          * looping be unbound?
1715                          */
1716                         inp_unlock(inp, lock);
1717                         goto next;
1718                 } else {
1719                         smr_exit(ipi->ipi_smr);
1720                         goto found;
1721                 }
1722         }
1723
1724         /*
1725          * Can't obtain lock immediately, thus going hard.  Once we exit the
1726          * SMR section we can no longer jump to 'next', and our only stable
1727          * anchoring point is ii->inp, which we keep locked for this case, so
1728          * we jump to 'restart'.
1729          */
1730         if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
1731                 smr_exit(ipi->ipi_smr);
1732                 inp_lock(inp, lock);
1733                 if (__predict_false(in_pcbrele(inp, lock))) {
1734                         smr_enter(ipi->ipi_smr);
1735                         goto restart;
1736                 }
1737                 /*
1738                  * See comment in inp_smr_lock().
1739                  */
1740                 if (__predict_false(inp->inp_flags & INP_FREED)) {
1741                         inp_unlock(inp, lock);
1742                         smr_enter(ipi->ipi_smr);
1743                         goto restart;
1744                 }
1745         } else
1746                 goto next;
1747
1748 found:
1749         inp_unlock(ii->inp, lock);
1750         ii->inp = inp;
1751
1752         return (ii->inp);
1753 }
1754
1755 /*
1756  * in_pcbref() bumps the reference count on an inpcb in order to maintain
1757  * stability of an inpcb pointer despite the inpcb lock being released or
1758  * SMR section exited.
1759  *
1760  * To free a reference later in_pcbrele_(r|w)locked() must be performed.
1761  */
1762 void
1763 in_pcbref(struct inpcb *inp)
1764 {
1765         u_int old __diagused;
1766
1767         old = refcount_acquire(&inp->inp_refcount);
1768         KASSERT(old > 0, ("%s: refcount 0", __func__));
1769 }
1770
1771 /*
1772  * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
1773  * freeing the pcb, if the reference was very last.
1774  */
1775 bool
1776 in_pcbrele_rlocked(struct inpcb *inp)
1777 {
1778
1779         INP_RLOCK_ASSERT(inp);
1780
1781         if (refcount_release(&inp->inp_refcount) == 0)
1782                 return (false);
1783
1784         MPASS(inp->inp_flags & INP_FREED);
1785         MPASS(inp->inp_socket == NULL);
1786         MPASS(inp->inp_in_hpts == 0);
1787         INP_RUNLOCK(inp);
1788         uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
1789         return (true);
1790 }
1791
1792 bool
1793 in_pcbrele_wlocked(struct inpcb *inp)
1794 {
1795
1796         INP_WLOCK_ASSERT(inp);
1797
1798         if (refcount_release(&inp->inp_refcount) == 0)
1799                 return (false);
1800
1801         MPASS(inp->inp_flags & INP_FREED);
1802         MPASS(inp->inp_socket == NULL);
1803         MPASS(inp->inp_in_hpts == 0);
1804         INP_WUNLOCK(inp);
1805         uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
1806         return (true);
1807 }
1808
1809 /*
1810  * Unconditionally schedule an inpcb to be freed by decrementing its
1811  * reference count, which should occur only after the inpcb has been detached
1812  * from its socket.  If another thread holds a temporary reference (acquired
1813  * using in_pcbref()) then the free is deferred until that reference is
1814  * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
1815  *  Almost all work, including removal from global lists, is done in this
1816  * context, where the pcbinfo lock is held.
1817  */
1818 void
1819 in_pcbfree(struct inpcb *inp)
1820 {
1821         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1822 #ifdef INET
1823         struct ip_moptions *imo;
1824 #endif
1825 #ifdef INET6
1826         struct ip6_moptions *im6o;
1827 #endif
1828
1829         INP_WLOCK_ASSERT(inp);
1830         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1831         KASSERT((inp->inp_flags & INP_FREED) == 0,
1832             ("%s: called twice for pcb %p", __func__, inp));
1833
1834         inp->inp_flags |= INP_FREED;
1835         INP_INFO_WLOCK(pcbinfo);
1836         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
1837         pcbinfo->ipi_count--;
1838         CK_LIST_REMOVE(inp, inp_list);
1839         INP_INFO_WUNLOCK(pcbinfo);
1840
1841         if (inp->inp_flags & INP_INHASHLIST) {
1842                 struct inpcbport *phd = inp->inp_phd;
1843
1844                 INP_HASH_WLOCK(pcbinfo);
1845                 /* XXX: Only do if SO_REUSEPORT_LB set? */
1846                 in_pcbremlbgrouphash(inp);
1847
1848                 CK_LIST_REMOVE(inp, inp_hash);
1849                 CK_LIST_REMOVE(inp, inp_portlist);
1850                 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
1851                         CK_LIST_REMOVE(phd, phd_hash);
1852                         uma_zfree_smr(pcbinfo->ipi_portzone, phd);
1853                 }
1854                 INP_HASH_WUNLOCK(pcbinfo);
1855                 inp->inp_flags &= ~INP_INHASHLIST;
1856         }
1857
1858         RO_INVALIDATE_CACHE(&inp->inp_route);
1859 #ifdef MAC
1860         mac_inpcb_destroy(inp);
1861 #endif
1862 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1863         if (inp->inp_sp != NULL)
1864                 ipsec_delete_pcbpolicy(inp);
1865 #endif
1866 #ifdef INET
1867         if (inp->inp_options)
1868                 (void)m_free(inp->inp_options);
1869         imo = inp->inp_moptions;
1870 #endif
1871 #ifdef INET6
1872         if (inp->inp_vflag & INP_IPV6PROTO) {
1873                 ip6_freepcbopts(inp->in6p_outputopts);
1874                 im6o = inp->in6p_moptions;
1875         } else
1876                 im6o = NULL;
1877 #endif
1878
1879         if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
1880                 INP_WUNLOCK(inp);
1881         }
1882 #ifdef INET6
1883         ip6_freemoptions(im6o);
1884 #endif
1885 #ifdef INET
1886         inp_freemoptions(imo);
1887 #endif
1888         /* Destruction is finalized in inpcb_dtor(). */
1889 }
1890
1891 static void
1892 inpcb_dtor(void *mem, int size, void *arg)
1893 {
1894         struct inpcb *inp = mem;
1895
1896         crfree(inp->inp_cred);
1897 #ifdef INVARIANTS
1898         inp->inp_cred = NULL;
1899 #endif
1900 }
1901
/*
 * Different protocols initialize their inpcbs differently, giving a
 * different name to the lock, but all of them are disposed of the same way:
 * the inpcb lock is destroyed.  'size' is not used.
 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *inp;

	inp = mem;
	INP_LOCK_DESTROY(inp);
}
1913
1914 /*
1915  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
1916  * port reservation, and preventing it from being returned by inpcb lookups.
1917  *
1918  * It is used by TCP to mark an inpcb as unused and avoid future packet
1919  * delivery or event notification when a socket remains open but TCP has
1920  * closed.  This might occur as a result of a shutdown()-initiated TCP close
1921  * or a RST on the wire, and allows the port binding to be reused while still
1922  * maintaining the invariant that so_pcb always points to a valid inpcb until
1923  * in_pcbdetach().
1924  *
1925  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
1926  * in_pcbnotifyall() and in_pcbpurgeif0()?
1927  */
1928 void
1929 in_pcbdrop(struct inpcb *inp)
1930 {
1931
1932         INP_WLOCK_ASSERT(inp);
1933 #ifdef INVARIANTS
1934         if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
1935                 MPASS(inp->inp_refcount > 1);
1936 #endif
1937
1938         /*
1939          * XXXRW: Possibly we should protect the setting of INP_DROPPED with
1940          * the hash lock...?
1941          */
1942         inp->inp_flags |= INP_DROPPED;
1943         if (inp->inp_flags & INP_INHASHLIST) {
1944                 struct inpcbport *phd = inp->inp_phd;
1945
1946                 INP_HASH_WLOCK(inp->inp_pcbinfo);
1947                 in_pcbremlbgrouphash(inp);
1948                 CK_LIST_REMOVE(inp, inp_hash);
1949                 CK_LIST_REMOVE(inp, inp_portlist);
1950                 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
1951                         CK_LIST_REMOVE(phd, phd_hash);
1952                         uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
1953                 }
1954                 INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1955                 inp->inp_flags &= ~INP_INHASHLIST;
1956         }
1957 }
1958
1959 #ifdef INET
1960 /*
1961  * Common routines to return the socket addresses associated with inpcbs.
1962  */
1963 struct sockaddr *
1964 in_sockaddr(in_port_t port, struct in_addr *addr_p)
1965 {
1966         struct sockaddr_in *sin;
1967
1968         sin = malloc(sizeof *sin, M_SONAME,
1969                 M_WAITOK | M_ZERO);
1970         sin->sin_family = AF_INET;
1971         sin->sin_len = sizeof(*sin);
1972         sin->sin_addr = *addr_p;
1973         sin->sin_port = port;
1974
1975         return (struct sockaddr *)sin;
1976 }
1977
1978 int
1979 in_getsockaddr(struct socket *so, struct sockaddr **nam)
1980 {
1981         struct inpcb *inp;
1982         struct in_addr addr;
1983         in_port_t port;
1984
1985         inp = sotoinpcb(so);
1986         KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1987
1988         INP_RLOCK(inp);
1989         port = inp->inp_lport;
1990         addr = inp->inp_laddr;
1991         INP_RUNLOCK(inp);
1992
1993         *nam = in_sockaddr(port, &addr);
1994         return 0;
1995 }
1996
1997 int
1998 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
1999 {
2000         struct inpcb *inp;
2001         struct in_addr addr;
2002         in_port_t port;
2003
2004         inp = sotoinpcb(so);
2005         KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
2006
2007         INP_RLOCK(inp);
2008         port = inp->inp_fport;
2009         addr = inp->inp_faddr;
2010         INP_RUNLOCK(inp);
2011
2012         *nam = in_sockaddr(port, &addr);
2013         return 0;
2014 }
2015
2016 void
2017 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
2018     struct inpcb *(*notify)(struct inpcb *, int))
2019 {
2020         struct inpcb *inp, *inp_temp;
2021
2022         INP_INFO_WLOCK(pcbinfo);
2023         CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) {
2024                 INP_WLOCK(inp);
2025 #ifdef INET6
2026                 if ((inp->inp_vflag & INP_IPV4) == 0) {
2027                         INP_WUNLOCK(inp);
2028                         continue;
2029                 }
2030 #endif
2031                 if (inp->inp_faddr.s_addr != faddr.s_addr ||
2032                     inp->inp_socket == NULL) {
2033                         INP_WUNLOCK(inp);
2034                         continue;
2035                 }
2036                 if ((*notify)(inp, errno))
2037                         INP_WUNLOCK(inp);
2038         }
2039         INP_INFO_WUNLOCK(pcbinfo);
2040 }
2041
2042 static bool
2043 inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
2044 {
2045
2046         if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
2047                 return (true);
2048         else
2049                 return (false);
2050 }
2051
2052 void
2053 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
2054 {
2055         struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
2056             inp_v4_multi_match, NULL);
2057         struct inpcb *inp;
2058         struct in_multi *inm;
2059         struct in_mfilter *imf;
2060         struct ip_moptions *imo;
2061
2062         IN_MULTI_LOCK_ASSERT();
2063
2064         while ((inp = inp_next(&inpi)) != NULL) {
2065                 INP_WLOCK_ASSERT(inp);
2066
2067                 imo = inp->inp_moptions;
2068                 /*
2069                  * Unselect the outgoing interface if it is being
2070                  * detached.
2071                  */
2072                 if (imo->imo_multicast_ifp == ifp)
2073                         imo->imo_multicast_ifp = NULL;
2074
2075                 /*
2076                  * Drop multicast group membership if we joined
2077                  * through the interface being detached.
2078                  *
2079                  * XXX This can all be deferred to an epoch_call
2080                  */
2081 restart:
2082                 IP_MFILTER_FOREACH(imf, &imo->imo_head) {
2083                         if ((inm = imf->imf_inm) == NULL)
2084                                 continue;
2085                         if (inm->inm_ifp != ifp)
2086                                 continue;
2087                         ip_mfilter_remove(&imo->imo_head, imf);
2088                         in_leavegroup_locked(inm, NULL);
2089                         ip_mfilter_free(imf);
2090                         goto restart;
2091                 }
2092         }
2093 }
2094
2095 /*
2096  * Lookup a PCB based on the local address and port.  Caller must hold the
2097  * hash lock.  No inpcb locks or references are acquired.
2098  */
2099 #define INP_LOOKUP_MAPPED_PCB_COST      3
2100 struct inpcb *
2101 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
2102     u_short lport, int lookupflags, struct ucred *cred)
2103 {
2104         struct inpcb *inp;
2105 #ifdef INET6
2106         int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
2107 #else
2108         int matchwild = 3;
2109 #endif
2110         int wildcard;
2111
2112         KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
2113             ("%s: invalid lookup flags %d", __func__, lookupflags));
2114         INP_HASH_LOCK_ASSERT(pcbinfo);
2115
2116         if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
2117                 struct inpcbhead *head;
2118                 /*
2119                  * Look for an unconnected (wildcard foreign addr) PCB that
2120                  * matches the local address and port we're looking for.
2121                  */
2122                 head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
2123                     pcbinfo->ipi_hashmask)];
2124                 CK_LIST_FOREACH(inp, head, inp_hash) {
2125 #ifdef INET6
2126                         /* XXX inp locking */
2127                         if ((inp->inp_vflag & INP_IPV4) == 0)
2128                                 continue;
2129 #endif
2130                         if (inp->inp_faddr.s_addr == INADDR_ANY &&
2131                             inp->inp_laddr.s_addr == laddr.s_addr &&
2132                             inp->inp_lport == lport) {
2133                                 /*
2134                                  * Found?
2135                                  */
2136                                 if (cred == NULL ||
2137                                     prison_equal_ip4(cred->cr_prison,
2138                                         inp->inp_cred->cr_prison))
2139                                         return (inp);
2140                         }
2141                 }
2142                 /*
2143                  * Not found.
2144                  */
2145                 return (NULL);
2146         } else {
2147                 struct inpcbporthead *porthash;
2148                 struct inpcbport *phd;
2149                 struct inpcb *match = NULL;
2150                 /*
2151                  * Best fit PCB lookup.
2152                  *
2153                  * First see if this local port is in use by looking on the
2154                  * port hash list.
2155                  */
2156                 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
2157                     pcbinfo->ipi_porthashmask)];
2158                 CK_LIST_FOREACH(phd, porthash, phd_hash) {
2159                         if (phd->phd_port == lport)
2160                                 break;
2161                 }
2162                 if (phd != NULL) {
2163                         /*
2164                          * Port is in use by one or more PCBs. Look for best
2165                          * fit.
2166                          */
2167                         CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
2168                                 wildcard = 0;
2169                                 if (cred != NULL &&
2170                                     !prison_equal_ip4(inp->inp_cred->cr_prison,
2171                                         cred->cr_prison))
2172                                         continue;
2173 #ifdef INET6
2174                                 /* XXX inp locking */
2175                                 if ((inp->inp_vflag & INP_IPV4) == 0)
2176                                         continue;
2177                                 /*
2178                                  * We never select the PCB that has
2179                                  * INP_IPV6 flag and is bound to :: if
2180                                  * we have another PCB which is bound
2181                                  * to 0.0.0.0.  If a PCB has the
2182                                  * INP_IPV6 flag, then we set its cost
2183                                  * higher than IPv4 only PCBs.
2184                                  *
2185                                  * Note that the case only happens
2186                                  * when a socket is bound to ::, under
2187                                  * the condition that the use of the
2188                                  * mapped address is allowed.
2189                                  */
2190                                 if ((inp->inp_vflag & INP_IPV6) != 0)
2191                                         wildcard += INP_LOOKUP_MAPPED_PCB_COST;
2192 #endif
2193                                 if (inp->inp_faddr.s_addr != INADDR_ANY)
2194                                         wildcard++;
2195                                 if (inp->inp_laddr.s_addr != INADDR_ANY) {
2196                                         if (laddr.s_addr == INADDR_ANY)
2197                                                 wildcard++;
2198                                         else if (inp->inp_laddr.s_addr != laddr.s_addr)
2199                                                 continue;
2200                                 } else {
2201                                         if (laddr.s_addr != INADDR_ANY)
2202                                                 wildcard++;
2203                                 }
2204                                 if (wildcard < matchwild) {
2205                                         match = inp;
2206                                         matchwild = wildcard;
2207                                         if (matchwild == 0)
2208                                                 break;
2209                                 }
2210                         }
2211                 }
2212                 return (match);
2213         }
2214 }
2215 #undef INP_LOOKUP_MAPPED_PCB_COST
2216
2217 static struct inpcb *
2218 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
2219     const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
2220     uint16_t fport, int lookupflags, int numa_domain)
2221 {
2222         struct inpcb *local_wild, *numa_wild;
2223         const struct inpcblbgrouphead *hdr;
2224         struct inpcblbgroup *grp;
2225         uint32_t idx;
2226
2227         INP_HASH_LOCK_ASSERT(pcbinfo);
2228
2229         hdr = &pcbinfo->ipi_lbgrouphashbase[
2230             INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
2231
2232         /*
2233          * Order of socket selection:
2234          * 1. non-wild.
2235          * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
2236          *
2237          * NOTE:
2238          * - Load balanced group does not contain jailed sockets
2239          * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
2240          */
2241         local_wild = NULL;
2242         numa_wild = NULL;
2243         CK_LIST_FOREACH(grp, hdr, il_list) {
2244 #ifdef INET6
2245                 if (!(grp->il_vflag & INP_IPV4))
2246                         continue;
2247 #endif
2248                 if (grp->il_lport != lport)
2249                         continue;
2250
2251                 idx = INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
2252                     grp->il_inpcnt;
2253                 if (grp->il_laddr.s_addr == laddr->s_addr) {
2254                         if (numa_domain == M_NODOM ||
2255                             grp->il_numa_domain == numa_domain) {
2256                                 return (grp->il_inp[idx]);
2257                         } else {
2258                                 numa_wild = grp->il_inp[idx];
2259                         }
2260                 }
2261                 if (grp->il_laddr.s_addr == INADDR_ANY &&
2262                     (lookupflags & INPLOOKUP_WILDCARD) != 0 &&
2263                     (local_wild == NULL || numa_domain == M_NODOM ||
2264                         grp->il_numa_domain == numa_domain)) {
2265                         local_wild = grp->il_inp[idx];
2266                 }
2267         }
2268         if (numa_wild != NULL)
2269                 return (numa_wild);
2270
2271         return (local_wild);
2272 }
2273
2274 /*
2275  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
2276  * that the caller has either locked the hash list, which usually happens
2277  * for bind(2) operations, or is in SMR section, which happens when sorting
2278  * out incoming packets.
2279  */
2280 static struct inpcb *
2281 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2282     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
2283     struct ifnet *ifp, uint8_t numa_domain)
2284 {
2285         struct inpcbhead *head;
2286         struct inpcb *inp, *tmpinp;
2287         u_short fport = fport_arg, lport = lport_arg;
2288
2289         KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
2290             ("%s: invalid lookup flags %d", __func__, lookupflags));
2291         INP_HASH_LOCK_ASSERT(pcbinfo);
2292
2293         /*
2294          * First look for an exact match.
2295          */
2296         tmpinp = NULL;
2297         head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&faddr, lport, fport,
2298             pcbinfo->ipi_hashmask)];
2299         CK_LIST_FOREACH(inp, head, inp_hash) {
2300 #ifdef INET6
2301                 /* XXX inp locking */
2302                 if ((inp->inp_vflag & INP_IPV4) == 0)
2303                         continue;
2304 #endif
2305                 if (inp->inp_faddr.s_addr == faddr.s_addr &&
2306                     inp->inp_laddr.s_addr == laddr.s_addr &&
2307                     inp->inp_fport == fport &&
2308                     inp->inp_lport == lport) {
2309                         /*
2310                          * XXX We should be able to directly return
2311                          * the inp here, without any checks.
2312                          * Well unless both bound with SO_REUSEPORT?
2313                          */
2314                         if (prison_flag(inp->inp_cred, PR_IP4))
2315                                 return (inp);
2316                         if (tmpinp == NULL)
2317                                 tmpinp = inp;
2318                 }
2319         }
2320         if (tmpinp != NULL)
2321                 return (tmpinp);
2322
2323         /*
2324          * Then look in lb group (for wildcard match).
2325          */
2326         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2327                 inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
2328                     fport, lookupflags, numa_domain);
2329                 if (inp != NULL)
2330                         return (inp);
2331         }
2332
2333         /*
2334          * Then look for a wildcard match, if requested.
2335          */
2336         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2337                 struct inpcb *local_wild = NULL, *local_exact = NULL;
2338 #ifdef INET6
2339                 struct inpcb *local_wild_mapped = NULL;
2340 #endif
2341                 struct inpcb *jail_wild = NULL;
2342                 int injail;
2343
2344                 /*
2345                  * Order of socket selection - we always prefer jails.
2346                  *      1. jailed, non-wild.
2347                  *      2. jailed, wild.
2348                  *      3. non-jailed, non-wild.
2349                  *      4. non-jailed, wild.
2350                  */
2351
2352                 head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
2353                     pcbinfo->ipi_hashmask)];
2354                 CK_LIST_FOREACH(inp, head, inp_hash) {
2355 #ifdef INET6
2356                         /* XXX inp locking */
2357                         if ((inp->inp_vflag & INP_IPV4) == 0)
2358                                 continue;
2359 #endif
2360                         if (inp->inp_faddr.s_addr != INADDR_ANY ||
2361                             inp->inp_lport != lport)
2362                                 continue;
2363
2364                         injail = prison_flag(inp->inp_cred, PR_IP4);
2365                         if (injail) {
2366                                 if (prison_check_ip4_locked(
2367                                     inp->inp_cred->cr_prison, &laddr) != 0)
2368                                         continue;
2369                         } else {
2370                                 if (local_exact != NULL)
2371                                         continue;
2372                         }
2373
2374                         if (inp->inp_laddr.s_addr == laddr.s_addr) {
2375                                 if (injail)
2376                                         return (inp);
2377                                 else
2378                                         local_exact = inp;
2379                         } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
2380 #ifdef INET6
2381                                 /* XXX inp locking, NULL check */
2382                                 if (inp->inp_vflag & INP_IPV6PROTO)
2383                                         local_wild_mapped = inp;
2384                                 else
2385 #endif
2386                                         if (injail)
2387                                                 jail_wild = inp;
2388                                         else
2389                                                 local_wild = inp;
2390                         }
2391                 } /* LIST_FOREACH */
2392                 if (jail_wild != NULL)
2393                         return (jail_wild);
2394                 if (local_exact != NULL)
2395                         return (local_exact);
2396                 if (local_wild != NULL)
2397                         return (local_wild);
2398 #ifdef INET6
2399                 if (local_wild_mapped != NULL)
2400                         return (local_wild_mapped);
2401 #endif
2402         } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
2403
2404         return (NULL);
2405 }
2406
2407 /*
2408  * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
2409  * hash list lock, and will return the inpcb locked (i.e., requires
2410  * INPLOOKUP_LOCKPCB).
2411  */
2412 static struct inpcb *
2413 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2414     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2415     struct ifnet *ifp, uint8_t numa_domain)
2416 {
2417         struct inpcb *inp;
2418
2419         smr_enter(pcbinfo->ipi_smr);
2420         inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
2421             lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
2422         if (inp != NULL) {
2423                 if (__predict_false(inp_smr_lock(inp,
2424                     (lookupflags & INPLOOKUP_LOCKMASK)) == false))
2425                         inp = NULL;
2426         } else
2427                 smr_exit(pcbinfo->ipi_smr);
2428
2429         return (inp);
2430 }
2431
2432 /*
2433  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2434  * from which a pre-calculated hash value may be extracted.
2435  */
2436 struct inpcb *
2437 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2438     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
2439 {
2440
2441         KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2442             ("%s: invalid lookup flags %d", __func__, lookupflags));
2443         KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2444             ("%s: LOCKPCB not set", __func__));
2445
2446         return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2447             lookupflags, ifp, M_NODOM));
2448 }
2449
2450 struct inpcb *
2451 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2452     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2453     struct ifnet *ifp, struct mbuf *m)
2454 {
2455
2456         KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2457             ("%s: invalid lookup flags %d", __func__, lookupflags));
2458         KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2459             ("%s: LOCKPCB not set", __func__));
2460
2461         return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2462             lookupflags, ifp, m->m_pkthdr.numa_domain));
2463 }
2464 #endif /* INET */
2465
2466 /*
2467  * Insert PCB onto various hash lists.
2468  */
2469 int
2470 in_pcbinshash(struct inpcb *inp)
2471 {
2472         struct inpcbhead *pcbhash;
2473         struct inpcbporthead *pcbporthash;
2474         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2475         struct inpcbport *phd;
2476         int so_options;
2477
2478         INP_WLOCK_ASSERT(inp);
2479         INP_HASH_WLOCK_ASSERT(pcbinfo);
2480
2481         KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
2482             ("in_pcbinshash: INP_INHASHLIST"));
2483
2484 #ifdef INET6
2485         if (inp->inp_vflag & INP_IPV6)
2486                 pcbhash = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
2487                     inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2488         else
2489 #endif
2490                 pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
2491                     inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2492
2493         pcbporthash = &pcbinfo->ipi_porthashbase[
2494             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
2495
2496         /*
2497          * Add entry to load balance group.
2498          * Only do this if SO_REUSEPORT_LB is set.
2499          */
2500         so_options = inp_so_options(inp);
2501         if (so_options & SO_REUSEPORT_LB) {
2502                 int ret = in_pcbinslbgrouphash(inp, M_NODOM);
2503                 if (ret) {
2504                         /* pcb lb group malloc fail (ret=ENOBUFS). */
2505                         return (ret);
2506                 }
2507         }
2508
2509         /*
2510          * Go through port list and look for a head for this lport.
2511          */
2512         CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
2513                 if (phd->phd_port == inp->inp_lport)
2514                         break;
2515         }
2516         /*
2517          * If none exists, malloc one and tack it on.
2518          */
2519         if (phd == NULL) {
2520                 phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT);
2521                 if (phd == NULL) {
2522                         return (ENOBUFS); /* XXX */
2523                 }
2524                 phd->phd_port = inp->inp_lport;
2525                 CK_LIST_INIT(&phd->phd_pcblist);
2526                 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
2527         }
2528         inp->inp_phd = phd;
2529         CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
2530         CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
2531         inp->inp_flags |= INP_INHASHLIST;
2532
2533         return (0);
2534 }
2535
2536 /*
2537  * Move PCB to the proper hash bucket when { faddr, fport } have  been
2538  * changed. NOTE: This does not handle the case of the lport changing (the
2539  * hashed port list would have to be updated as well), so the lport must
2540  * not change after in_pcbinshash() has been called.
2541  *
2542  * XXXGL: a race between this function and SMR-protected hash iterator
2543  * will lead to iterator traversing a possibly wrong hash list. However,
2544  * this race should have been here since change from rwlock to epoch.
2545  */
2546 void
2547 in_pcbrehash(struct inpcb *inp)
2548 {
2549         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2550         struct inpcbhead *head;
2551
2552         INP_WLOCK_ASSERT(inp);
2553         INP_HASH_WLOCK_ASSERT(pcbinfo);
2554
2555         KASSERT(inp->inp_flags & INP_INHASHLIST,
2556             ("in_pcbrehash: !INP_INHASHLIST"));
2557
2558 #ifdef INET6
2559         if (inp->inp_vflag & INP_IPV6)
2560                 head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
2561                     inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2562         else
2563 #endif
2564                 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
2565                     inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2566
2567         CK_LIST_REMOVE(inp, inp_hash);
2568         CK_LIST_INSERT_HEAD(head, inp, inp_hash);
2569 }
2570
2571 /*
2572  * Check for alternatives when higher level complains
2573  * about service problems.  For now, invalidate cached
2574  * routing information.  If the route was created dynamically
2575  * (by a redirect), time to try a default gateway again.
2576  */
2577 void
2578 in_losing(struct inpcb *inp)
2579 {
2580
2581         RO_INVALIDATE_CACHE(&inp->inp_route);
2582         return;
2583 }
2584
/*
 * Propagate a label change made at the socket layer into the in_pcb that
 * backs the socket.  Compiles to an empty function without MAC.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	/* The inpcb lock is taken first, the socket lock second. */
	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2605
2606 /*
2607  * ipport_tick runs once per second, determining if random port allocation
2608  * should be continued.  If more than ipport_randomcps ports have been
2609  * allocated in the last second, then we return to sequential port
2610  * allocation. We return to random allocation only once we drop below
2611  * ipport_randomcps for at least ipport_randomtime seconds.
2612  */
2613 static void
2614 ipport_tick(void *xtp)
2615 {
2616         VNET_ITERATOR_DECL(vnet_iter);
2617
2618         VNET_LIST_RLOCK_NOSLEEP();
2619         VNET_FOREACH(vnet_iter) {
2620                 CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */
2621                 if (V_ipport_tcpallocs - V_ipport_tcplastcount <=
2622                     V_ipport_randomcps) {
2623                         if (V_ipport_stoprandom > 0)
2624                                 V_ipport_stoprandom--;
2625                 } else
2626                         V_ipport_stoprandom = V_ipport_randomtime;
2627                 V_ipport_tcplastcount = V_ipport_tcpallocs;
2628                 CURVNET_RESTORE();
2629         }
2630         VNET_LIST_RUNLOCK_NOSLEEP();
2631         callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
2632 }
2633
2634 static void
2635 ip_fini(void *xtp)
2636 {
2637
2638         callout_stop(&ipport_tick_callout);
2639 }
2640
2641 /*
2642  * The ipport_callout should start running at about the time we attach the
2643  * inet or inet6 domains.
2644  */
2645 static void
2646 ipport_tick_init(const void *unused __unused)
2647 {
2648
2649         /* Start ipport_tick. */
2650         callout_init(&ipport_tick_callout, 1);
2651         callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
2652         EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
2653                 SHUTDOWN_PRI_DEFAULT);
2654 }
2655 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
2656     ipport_tick_init, NULL);
2657
/* Function wrapper around the INP_WLOCK() macro. */
void
inp_wlock(struct inpcb *inp)
{
	INP_WLOCK(inp);
}
2664
/* Function wrapper around the INP_WUNLOCK() macro. */
void
inp_wunlock(struct inpcb *inp)
{
	INP_WUNLOCK(inp);
}
2671
/* Function wrapper around the INP_RLOCK() macro. */
void
inp_rlock(struct inpcb *inp)
{
	INP_RLOCK(inp);
}
2678
/* Function wrapper around the INP_RUNLOCK() macro. */
void
inp_runlock(struct inpcb *inp)
{
	INP_RUNLOCK(inp);
}
2685
2686 #ifdef INVARIANT_SUPPORT
/*
 * Function wrapper around INP_WLOCK_ASSERT().  Despite the generic name,
 * this asserts the write lock specifically.
 */
void
inp_lock_assert(struct inpcb *inp)
{
	INP_WLOCK_ASSERT(inp);
}
2693
/* Function wrapper around the INP_UNLOCK_ASSERT() macro. */
void
inp_unlock_assert(struct inpcb *inp)
{
	INP_UNLOCK_ASSERT(inp);
}
2700 #endif
2701
2702 void
2703 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
2704 {
2705         struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
2706             INPLOOKUP_WLOCKPCB);
2707         struct inpcb *inp;
2708
2709         while ((inp = inp_next(&inpi)) != NULL)
2710                 func(inp, arg);
2711 }
2712
2713 struct socket *
2714 inp_inpcbtosocket(struct inpcb *inp)
2715 {
2716
2717         INP_WLOCK_ASSERT(inp);
2718         return (inp->inp_socket);
2719 }
2720
2721 struct tcpcb *
2722 inp_inpcbtotcpcb(struct inpcb *inp)
2723 {
2724
2725         INP_WLOCK_ASSERT(inp);
2726         return ((struct tcpcb *)inp->inp_ppcb);
2727 }
2728
2729 int
2730 inp_ip_tos_get(const struct inpcb *inp)
2731 {
2732
2733         return (inp->inp_ip_tos);
2734 }
2735
2736 void
2737 inp_ip_tos_set(struct inpcb *inp, int val)
2738 {
2739
2740         inp->inp_ip_tos = val;
2741 }
2742
2743 void
2744 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
2745     uint32_t *faddr, uint16_t *fp)
2746 {
2747
2748         INP_LOCK_ASSERT(inp);
2749         *laddr = inp->inp_laddr.s_addr;
2750         *faddr = inp->inp_faddr.s_addr;
2751         *lp = inp->inp_lport;
2752         *fp = inp->inp_fport;
2753 }
2754
/* Function wrapper around the sotoinpcb() conversion. */
struct inpcb *
so_sotoinpcb(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	return (inp);
}
2761
/* Function wrapper around the sototcpcb() conversion. */
struct tcpcb *
so_sototcpcb(struct socket *so)
{
	struct tcpcb *tp;

	tp = sototcpcb(so);
	return (tp);
}
2768
2769 /*
2770  * Create an external-format (``xinpcb'') structure using the information in
2771  * the kernel-format in_pcb structure pointed to by inp.  This is done to
2772  * reduce the spew of irrelevant information over this interface, to isolate
2773  * user code from changes in the kernel structure, and potentially to provide
2774  * information-hiding if we decide that some of this information should be
2775  * hidden from users.
2776  */
2777 void
2778 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
2779 {
2780
2781         bzero(xi, sizeof(*xi));
2782         xi->xi_len = sizeof(struct xinpcb);
2783         if (inp->inp_socket)
2784                 sotoxsocket(inp->inp_socket, &xi->xi_socket);
2785         bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
2786         xi->inp_gencnt = inp->inp_gencnt;
2787         xi->inp_ppcb = (uintptr_t)inp->inp_ppcb;
2788         xi->inp_flow = inp->inp_flow;
2789         xi->inp_flowid = inp->inp_flowid;
2790         xi->inp_flowtype = inp->inp_flowtype;
2791         xi->inp_flags = inp->inp_flags;
2792         xi->inp_flags2 = inp->inp_flags2;
2793         xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket;
2794         xi->in6p_cksum = inp->in6p_cksum;
2795         xi->in6p_hops = inp->in6p_hops;
2796         xi->inp_ip_tos = inp->inp_ip_tos;
2797         xi->inp_vflag = inp->inp_vflag;
2798         xi->inp_ip_ttl = inp->inp_ip_ttl;
2799         xi->inp_ip_p = inp->inp_ip_p;
2800         xi->inp_ip_minttl = inp->inp_ip_minttl;
2801 }
2802
2803 int
2804 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
2805     int (*ctloutput_set)(struct inpcb *, struct sockopt *))
2806 {
2807         struct sockopt sopt;
2808         struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
2809             INPLOOKUP_WLOCKPCB);
2810         struct inpcb *inp;
2811         struct sockopt_parameters *params;
2812         struct socket *so;
2813         int error;
2814         char buf[1024];
2815
2816         if (req->oldptr != NULL || req->oldlen != 0)
2817                 return (EINVAL);
2818         if (req->newptr == NULL)
2819                 return (EPERM);
2820         if (req->newlen > sizeof(buf))
2821                 return (ENOMEM);
2822         error = SYSCTL_IN(req, buf, req->newlen);
2823         if (error != 0)
2824                 return (error);
2825         if (req->newlen < sizeof(struct sockopt_parameters))
2826                 return (EINVAL);
2827         params = (struct sockopt_parameters *)buf;
2828         sopt.sopt_level = params->sop_level;
2829         sopt.sopt_name = params->sop_optname;
2830         sopt.sopt_dir = SOPT_SET;
2831         sopt.sopt_val = params->sop_optval;
2832         sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters);
2833         sopt.sopt_td = NULL;
2834 #ifdef INET6
2835         if (params->sop_inc.inc_flags & INC_ISIPV6) {
2836                 if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_laddr))
2837                         params->sop_inc.inc6_laddr.s6_addr16[1] =
2838                             htons(params->sop_inc.inc6_zoneid & 0xffff);
2839                 if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_faddr))
2840                         params->sop_inc.inc6_faddr.s6_addr16[1] =
2841                             htons(params->sop_inc.inc6_zoneid & 0xffff);
2842         }
2843 #endif
2844         if (params->sop_inc.inc_lport != htons(0)) {
2845                 if (params->sop_inc.inc_fport == htons(0))
2846                         inpi.hash = INP_PCBHASH_WILD(params->sop_inc.inc_lport,
2847                             pcbinfo->ipi_hashmask);
2848                 else
2849 #ifdef INET6
2850                         if (params->sop_inc.inc_flags & INC_ISIPV6)
2851                                 inpi.hash = INP6_PCBHASH(
2852                                     &params->sop_inc.inc6_faddr,
2853                                     params->sop_inc.inc_lport,
2854                                     params->sop_inc.inc_fport,
2855                                     pcbinfo->ipi_hashmask);
2856                         else
2857 #endif
2858                                 inpi.hash = INP_PCBHASH(
2859                                     &params->sop_inc.inc_faddr,
2860                                     params->sop_inc.inc_lport,
2861                                     params->sop_inc.inc_fport,
2862                                     pcbinfo->ipi_hashmask);
2863         }
2864         while ((inp = inp_next(&inpi)) != NULL)
2865                 if (inp->inp_gencnt == params->sop_id) {
2866                         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
2867                                 INP_WUNLOCK(inp);
2868                                 return (ECONNRESET);
2869                         }
2870                         so = inp->inp_socket;
2871                         KASSERT(so != NULL, ("inp_socket == NULL"));
2872                         soref(so);
2873                         error = (*ctloutput_set)(inp, &sopt);
2874                         sorele(so);
2875                         break;
2876                 }
2877         if (inp == NULL)
2878                 error = ESRCH;
2879         return (error);
2880 }
2881
2882 #ifdef DDB
/*
 * Emit indent spaces on the debugger console; prints nothing when indent
 * is zero or negative.
 */
static void
db_print_indent(int indent)
{
	int remaining;

	for (remaining = indent; remaining > 0; remaining--)
		db_printf(" ");
}
2891
2892 static void
2893 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
2894 {
2895         char faddr_str[48], laddr_str[48];
2896
2897         db_print_indent(indent);
2898         db_printf("%s at %p\n", name, inc);
2899
2900         indent += 2;
2901
2902 #ifdef INET6
2903         if (inc->inc_flags & INC_ISIPV6) {
2904                 /* IPv6. */
2905                 ip6_sprintf(laddr_str, &inc->inc6_laddr);
2906                 ip6_sprintf(faddr_str, &inc->inc6_faddr);
2907         } else
2908 #endif
2909         {
2910                 /* IPv4. */
2911                 inet_ntoa_r(inc->inc_laddr, laddr_str);
2912                 inet_ntoa_r(inc->inc_faddr, faddr_str);
2913         }
2914         db_print_indent(indent);
2915         db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
2916             ntohs(inc->inc_lport));
2917         db_print_indent(indent);
2918         db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
2919             ntohs(inc->inc_fport));
2920 }
2921
2922 static void
2923 db_print_inpflags(int inp_flags)
2924 {
2925         int comma;
2926
2927         comma = 0;
2928         if (inp_flags & INP_RECVOPTS) {
2929                 db_printf("%sINP_RECVOPTS", comma ? ", " : "");
2930                 comma = 1;
2931         }
2932         if (inp_flags & INP_RECVRETOPTS) {
2933                 db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
2934                 comma = 1;
2935         }
2936         if (inp_flags & INP_RECVDSTADDR) {
2937                 db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
2938                 comma = 1;
2939         }
2940         if (inp_flags & INP_ORIGDSTADDR) {
2941                 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
2942                 comma = 1;
2943         }
2944         if (inp_flags & INP_HDRINCL) {
2945                 db_printf("%sINP_HDRINCL", comma ? ", " : "");
2946                 comma = 1;
2947         }
2948         if (inp_flags & INP_HIGHPORT) {
2949                 db_printf("%sINP_HIGHPORT", comma ? ", " : "");
2950                 comma = 1;
2951         }
2952         if (inp_flags & INP_LOWPORT) {
2953                 db_printf("%sINP_LOWPORT", comma ? ", " : "");
2954                 comma = 1;
2955         }
2956         if (inp_flags & INP_ANONPORT) {
2957                 db_printf("%sINP_ANONPORT", comma ? ", " : "");
2958                 comma = 1;
2959         }
2960         if (inp_flags & INP_RECVIF) {
2961                 db_printf("%sINP_RECVIF", comma ? ", " : "");
2962                 comma = 1;
2963         }
2964         if (inp_flags & INP_MTUDISC) {
2965                 db_printf("%sINP_MTUDISC", comma ? ", " : "");
2966                 comma = 1;
2967         }
2968         if (inp_flags & INP_RECVTTL) {
2969                 db_printf("%sINP_RECVTTL", comma ? ", " : "");
2970                 comma = 1;
2971         }
2972         if (inp_flags & INP_DONTFRAG) {
2973                 db_printf("%sINP_DONTFRAG", comma ? ", " : "");
2974                 comma = 1;
2975         }
2976         if (inp_flags & INP_RECVTOS) {
2977                 db_printf("%sINP_RECVTOS", comma ? ", " : "");
2978                 comma = 1;
2979         }
2980         if (inp_flags & IN6P_IPV6_V6ONLY) {
2981                 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
2982                 comma = 1;
2983         }
2984         if (inp_flags & IN6P_PKTINFO) {
2985                 db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
2986                 comma = 1;
2987         }
2988         if (inp_flags & IN6P_HOPLIMIT) {
2989                 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
2990                 comma = 1;
2991         }
2992         if (inp_flags & IN6P_HOPOPTS) {
2993                 db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
2994                 comma = 1;
2995         }
2996         if (inp_flags & IN6P_DSTOPTS) {
2997                 db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
2998                 comma = 1;
2999         }
3000         if (inp_flags & IN6P_RTHDR) {
3001                 db_printf("%sIN6P_RTHDR", comma ? ", " : "");
3002                 comma = 1;
3003         }
3004         if (inp_flags & IN6P_RTHDRDSTOPTS) {
3005                 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
3006                 comma = 1;
3007         }
3008         if (inp_flags & IN6P_TCLASS) {
3009                 db_printf("%sIN6P_TCLASS", comma ? ", " : "");
3010                 comma = 1;
3011         }
3012         if (inp_flags & IN6P_AUTOFLOWLABEL) {
3013                 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
3014                 comma = 1;
3015         }
3016         if (inp_flags & INP_TIMEWAIT) {
3017                 db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
3018                 comma  = 1;
3019         }
3020         if (inp_flags & INP_ONESBCAST) {
3021                 db_printf("%sINP_ONESBCAST", comma ? ", " : "");
3022                 comma  = 1;
3023         }
3024         if (inp_flags & INP_DROPPED) {
3025                 db_printf("%sINP_DROPPED", comma ? ", " : "");
3026                 comma  = 1;
3027         }
3028         if (inp_flags & INP_SOCKREF) {
3029                 db_printf("%sINP_SOCKREF", comma ? ", " : "");
3030                 comma  = 1;
3031         }
3032         if (inp_flags & IN6P_RFC2292) {
3033                 db_printf("%sIN6P_RFC2292", comma ? ", " : "");
3034                 comma = 1;
3035         }
3036         if (inp_flags & IN6P_MTU) {
3037                 db_printf("IN6P_MTU%s", comma ? ", " : "");
3038                 comma = 1;
3039         }
3040 }
3041
3042 static void
3043 db_print_inpvflag(u_char inp_vflag)
3044 {
3045         int comma;
3046
3047         comma = 0;
3048         if (inp_vflag & INP_IPV4) {
3049                 db_printf("%sINP_IPV4", comma ? ", " : "");
3050                 comma  = 1;
3051         }
3052         if (inp_vflag & INP_IPV6) {
3053                 db_printf("%sINP_IPV6", comma ? ", " : "");
3054                 comma  = 1;
3055         }
3056         if (inp_vflag & INP_IPV6PROTO) {
3057                 db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
3058                 comma  = 1;
3059         }
3060 }
3061
3062 static void
3063 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
3064 {
3065
3066         db_print_indent(indent);
3067         db_printf("%s at %p\n", name, inp);
3068
3069         indent += 2;
3070
3071         db_print_indent(indent);
3072         db_printf("inp_flow: 0x%x\n", inp->inp_flow);
3073
3074         db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
3075
3076         db_print_indent(indent);
3077         db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
3078             inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
3079
3080         db_print_indent(indent);
3081         db_printf("inp_label: %p   inp_flags: 0x%x (",
3082            inp->inp_label, inp->inp_flags);
3083         db_print_inpflags(inp->inp_flags);
3084         db_printf(")\n");
3085
3086         db_print_indent(indent);
3087         db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
3088             inp->inp_vflag);
3089         db_print_inpvflag(inp->inp_vflag);
3090         db_printf(")\n");
3091
3092         db_print_indent(indent);
3093         db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
3094             inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
3095
3096         db_print_indent(indent);
3097 #ifdef INET6
3098         if (inp->inp_vflag & INP_IPV6) {
3099                 db_printf("in6p_options: %p   in6p_outputopts: %p   "
3100                     "in6p_moptions: %p\n", inp->in6p_options,
3101                     inp->in6p_outputopts, inp->in6p_moptions);
3102                 db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
3103                     "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
3104                     inp->in6p_hops);
3105         } else
3106 #endif
3107         {
3108                 db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
3109                     "inp_ip_moptions: %p\n", inp->inp_ip_tos,
3110                     inp->inp_options, inp->inp_moptions);
3111         }
3112
3113         db_print_indent(indent);
3114         db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
3115             (uintmax_t)inp->inp_gencnt);
3116 }
3117
3118 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
3119 {
3120         struct inpcb *inp;
3121
3122         if (!have_addr) {
3123                 db_printf("usage: show inpcb <addr>\n");
3124                 return;
3125         }
3126         inp = (struct inpcb *)addr;
3127
3128         db_print_inpcb(inp, "inpcb", 0);
3129 }
3130 #endif /* DDB */
3131
3132 #ifdef RATELIMIT
3133 /*
3134  * Modify TX rate limit based on the existing "inp->inp_snd_tag",
3135  * if any.
3136  */
3137 int
3138 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3139 {
3140         union if_snd_tag_modify_params params = {
3141                 .rate_limit.max_rate = max_pacing_rate,
3142                 .rate_limit.flags = M_NOWAIT,
3143         };
3144         struct m_snd_tag *mst;
3145         int error;
3146
3147         mst = inp->inp_snd_tag;
3148         if (mst == NULL)
3149                 return (EINVAL);
3150
3151         if (mst->sw->snd_tag_modify == NULL) {
3152                 error = EOPNOTSUPP;
3153         } else {
3154                 error = mst->sw->snd_tag_modify(mst, &params);
3155         }
3156         return (error);
3157 }
3158
3159 /*
3160  * Query existing TX rate limit based on the existing
3161  * "inp->inp_snd_tag", if any.
3162  */
3163 int
3164 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3165 {
3166         union if_snd_tag_query_params params = { };
3167         struct m_snd_tag *mst;
3168         int error;
3169
3170         mst = inp->inp_snd_tag;
3171         if (mst == NULL)
3172                 return (EINVAL);
3173
3174         if (mst->sw->snd_tag_query == NULL) {
3175                 error = EOPNOTSUPP;
3176         } else {
3177                 error = mst->sw->snd_tag_query(mst, &params);
3178                 if (error == 0 && p_max_pacing_rate != NULL)
3179                         *p_max_pacing_rate = params.rate_limit.max_rate;
3180         }
3181         return (error);
3182 }
3183
3184 /*
3185  * Query existing TX queue level based on the existing
3186  * "inp->inp_snd_tag", if any.
3187  */
3188 int
3189 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
3190 {
3191         union if_snd_tag_query_params params = { };
3192         struct m_snd_tag *mst;
3193         int error;
3194
3195         mst = inp->inp_snd_tag;
3196         if (mst == NULL)
3197                 return (EINVAL);
3198
3199         if (mst->sw->snd_tag_query == NULL)
3200                 return (EOPNOTSUPP);
3201
3202         error = mst->sw->snd_tag_query(mst, &params);
3203         if (error == 0 && p_txqueue_level != NULL)
3204                 *p_txqueue_level = params.rate_limit.queue_level;
3205         return (error);
3206 }
3207
3208 /*
3209  * Allocate a new TX rate limit send tag from the network interface
3210  * given by the "ifp" argument and save it in "inp->inp_snd_tag":
3211  */
3212 int
3213 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
3214     uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
3215
3216 {
3217         union if_snd_tag_alloc_params params = {
3218                 .rate_limit.hdr.type = (max_pacing_rate == -1U) ?
3219                     IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
3220                 .rate_limit.hdr.flowid = flowid,
3221                 .rate_limit.hdr.flowtype = flowtype,
3222                 .rate_limit.hdr.numa_domain = inp->inp_numa_domain,
3223                 .rate_limit.max_rate = max_pacing_rate,
3224                 .rate_limit.flags = M_NOWAIT,
3225         };
3226         int error;
3227
3228         INP_WLOCK_ASSERT(inp);
3229
3230         /*
3231          * If there is already a send tag, or the INP is being torn
3232          * down, allocating a new send tag is not allowed. Else send
3233          * tags may leak.
3234          */
3235         if (*st != NULL || (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0)
3236                 return (EINVAL);
3237
3238         error = m_snd_tag_alloc(ifp, &params, st);
3239 #ifdef INET
3240         if (error == 0) {
3241                 counter_u64_add(rate_limit_set_ok, 1);
3242                 counter_u64_add(rate_limit_active, 1);
3243         } else if (error != EOPNOTSUPP)
3244                   counter_u64_add(rate_limit_alloc_fail, 1);
3245 #endif
3246         return (error);
3247 }
3248
/*
 * Release a send tag reference and, with INET compiled in, decrement the
 * count of active rate limits.  mst is passed to m_snd_tag_rele()
 * unconditionally.
 */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{
	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}
3258
3259 /*
3260  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3261  * if any:
3262  */
3263 void
3264 in_pcbdetach_txrtlmt(struct inpcb *inp)
3265 {
3266         struct m_snd_tag *mst;
3267
3268         INP_WLOCK_ASSERT(inp);
3269
3270         mst = inp->inp_snd_tag;
3271         inp->inp_snd_tag = NULL;
3272
3273         if (mst == NULL)
3274                 return;
3275
3276         m_snd_tag_rele(mst);
3277 #ifdef INET
3278         counter_u64_add(rate_limit_active, -1);
3279 #endif
3280 }
3281
3282 int
3283 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
3284 {
3285         int error;
3286
3287         /*
3288          * If the existing send tag is for the wrong interface due to
3289          * a route change, first drop the existing tag.  Set the
3290          * CHANGED flag so that we will keep trying to allocate a new
3291          * tag if we fail to allocate one this time.
3292          */
3293         if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
3294                 in_pcbdetach_txrtlmt(inp);
3295                 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3296         }
3297
3298         /*
3299          * NOTE: When attaching to a network interface a reference is
3300          * made to ensure the network interface doesn't go away until
3301          * all ratelimit connections are gone. The network interface
3302          * pointers compared below represent valid network interfaces,
3303          * except when comparing towards NULL.
3304          */
3305         if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
3306                 error = 0;
3307         } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
3308                 if (inp->inp_snd_tag != NULL)
3309                         in_pcbdetach_txrtlmt(inp);
3310                 error = 0;
3311         } else if (inp->inp_snd_tag == NULL) {
3312                 /*
3313                  * In order to utilize packet pacing with RSS, we need
3314                  * to wait until there is a valid RSS hash before we
3315                  * can proceed:
3316                  */
3317                 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
3318                         error = EAGAIN;
3319                 } else {
3320                         error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
3321                             mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
3322                 }
3323         } else {
3324                 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
3325         }
3326         if (error == 0 || error == EOPNOTSUPP)
3327                 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
3328
3329         return (error);
3330 }
3331
3332 /*
3333  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
3334  * is set in the fast path and will attach/detach/modify the TX rate
3335  * limit send tag based on the socket's so_max_pacing_rate value.
3336  */
3337 void
3338 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3339 {
3340         struct socket *socket;
3341         uint32_t max_pacing_rate;
3342         bool did_upgrade;
3343
3344         if (inp == NULL)
3345                 return;
3346
3347         socket = inp->inp_socket;
3348         if (socket == NULL)
3349                 return;
3350
3351         if (!INP_WLOCKED(inp)) {
3352                 /*
3353                  * NOTE: If the write locking fails, we need to bail
3354                  * out and use the non-ratelimited ring for the
3355                  * transmit until there is a new chance to get the
3356                  * write lock.
3357                  */
3358                 if (!INP_TRY_UPGRADE(inp))
3359                         return;
3360                 did_upgrade = 1;
3361         } else {
3362                 did_upgrade = 0;
3363         }
3364
3365         /*
3366          * NOTE: The so_max_pacing_rate value is read unlocked,
3367          * because atomic updates are not required since the variable
3368          * is checked at every mbuf we send. It is assumed that the
3369          * variable read itself will be atomic.
3370          */
3371         max_pacing_rate = socket->so_max_pacing_rate;
3372
3373         in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
3374
3375         if (did_upgrade)
3376                 INP_DOWNGRADE(inp);
3377 }
3378
3379 /*
3380  * Track route changes for TX rate limiting.
3381  */
3382 void
3383 in_pcboutput_eagain(struct inpcb *inp)
3384 {
3385         bool did_upgrade;
3386
3387         if (inp == NULL)
3388                 return;
3389
3390         if (inp->inp_snd_tag == NULL)
3391                 return;
3392
3393         if (!INP_WLOCKED(inp)) {
3394                 /*
3395                  * NOTE: If the write locking fails, we need to bail
3396                  * out and use the non-ratelimited ring for the
3397                  * transmit until there is a new chance to get the
3398                  * write lock.
3399                  */
3400                 if (!INP_TRY_UPGRADE(inp))
3401                         return;
3402                 did_upgrade = 1;
3403         } else {
3404                 did_upgrade = 0;
3405         }
3406
3407         /* detach rate limiting */
3408         in_pcbdetach_txrtlmt(inp);
3409
3410         /* make sure new mbuf send tag allocation is made */
3411         inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3412
3413         if (did_upgrade)
3414                 INP_DOWNGRADE(inp);
3415 }
3416
#ifdef INET
/*
 * Allocate the counters that hold the TX rate limit statistics.  The
 * function is registered through SYSINIT below; its argument is not
 * used.
 */
static void
rl_init(void *st)
{
	/* Counters are allocated in the order listed here. */
	counter_u64_t *const counters[] = {
		&rate_limit_new,
		&rate_limit_chg,
		&rate_limit_active,
		&rate_limit_alloc_fail,
		&rate_limit_set_ok,
	};
	size_t i;

	for (i = 0; i < sizeof(counters) / sizeof(counters[0]); i++)
		*counters[i] = counter_u64_alloc(M_WAITOK);
}

SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
#endif
3430 #endif /* RATELIMIT */