]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/in_pcb.c
zfs: merge openzfs/zfs@6a6bd4939 (zfs-2.1-release) into stable/13
[FreeBSD/FreeBSD.git] / sys / netinet / in_pcb.c
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1991, 1993, 1995
5  *      The Regents of the University of California.
6  * Copyright (c) 2007-2009 Robert N. M. Watson
7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
8  * All rights reserved.
9  *
10  * Portions of this software were developed by Robert N. M. Watson under
11  * contract to Juniper Networks, Inc.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *      @(#)in_pcb.c    8.4 (Berkeley) 5/24/95
38  */
39
40 #include <sys/cdefs.h>
41 __FBSDID("$FreeBSD$");
42
43 #include "opt_ddb.h"
44 #include "opt_ipsec.h"
45 #include "opt_inet.h"
46 #include "opt_inet6.h"
47 #include "opt_ratelimit.h"
48 #include "opt_pcbgroup.h"
49 #include "opt_route.h"
50 #include "opt_rss.h"
51
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/lock.h>
55 #include <sys/malloc.h>
56 #include <sys/mbuf.h>
57 #include <sys/callout.h>
58 #include <sys/eventhandler.h>
59 #include <sys/domain.h>
60 #include <sys/protosw.h>
61 #include <sys/rmlock.h>
62 #include <sys/smp.h>
63 #include <sys/socket.h>
64 #include <sys/socketvar.h>
65 #include <sys/sockio.h>
66 #include <sys/priv.h>
67 #include <sys/proc.h>
68 #include <sys/refcount.h>
69 #include <sys/jail.h>
70 #include <sys/kernel.h>
71 #include <sys/sysctl.h>
72
73 #ifdef DDB
74 #include <ddb/ddb.h>
75 #endif
76
77 #include <vm/uma.h>
78 #include <vm/vm.h>
79
80 #include <net/if.h>
81 #include <net/if_var.h>
82 #include <net/if_types.h>
83 #include <net/if_llatbl.h>
84 #include <net/route.h>
85 #include <net/rss_config.h>
86 #include <net/vnet.h>
87
88 #if defined(INET) || defined(INET6)
89 #include <netinet/in.h>
90 #include <netinet/in_pcb.h>
91 #ifdef INET
92 #include <netinet/in_var.h>
93 #include <netinet/in_fib.h>
94 #endif
95 #include <netinet/ip_var.h>
96 #include <netinet/tcp_var.h>
97 #ifdef TCPHPTS
98 #include <netinet/tcp_hpts.h>
99 #endif
100 #include <netinet/udp.h>
101 #include <netinet/udp_var.h>
102 #ifdef INET6
103 #include <netinet/ip6.h>
104 #include <netinet6/in6_pcb.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/ip6_var.h>
107 #endif /* INET6 */
108 #include <net/route/nhop.h>
109 #endif
110
111 #include <netipsec/ipsec_support.h>
112
113 #include <security/mac/mac_framework.h>
114
115 #define INPCBLBGROUP_SIZMIN     8
116 #define INPCBLBGROUP_SIZMAX     256
117
118 static struct callout   ipport_tick_callout;
119
120 /*
121  * These configure the range of local port addresses assigned to
122  * "unspecified" outgoing connections/packets/whatever.
123  */
124 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;    /* 1023 */
125 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;    /* 600 */
126 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;     /* 10000 */
127 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;       /* 65535 */
128 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;      /* 49152 */
129 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;        /* 65535 */
130
131 /*
132  * Reserved ports accessible only to root. There are significant
133  * security considerations that must be accounted for when changing these,
134  * but the security benefits can be great. Please be careful.
135  */
136 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;    /* 1023 */
137 VNET_DEFINE(int, ipport_reservedlow);
138
139 /* Variables dealing with random ephemeral port allocation. */
140 VNET_DEFINE(int, ipport_randomized) = 1;        /* user controlled via sysctl */
141 VNET_DEFINE(int, ipport_randomcps) = 10;        /* user controlled via sysctl */
142 VNET_DEFINE(int, ipport_randomtime) = 45;       /* user controlled via sysctl */
143 VNET_DEFINE(int, ipport_stoprandom);            /* toggled by ipport_tick */
144 VNET_DEFINE(int, ipport_tcpallocs);
145 VNET_DEFINE_STATIC(int, ipport_tcplastcount);
146
147 #define V_ipport_tcplastcount           VNET(ipport_tcplastcount)
148
149 static void     in_pcbremlists(struct inpcb *inp);
150 #ifdef INET
151 static struct inpcb     *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
152                             struct in_addr faddr, u_int fport_arg,
153                             struct in_addr laddr, u_int lport_arg,
154                             int lookupflags, struct ifnet *ifp,
155                             uint8_t numa_domain);
156
157 #define RANGECHK(var, min, max) \
158         if ((var) < (min)) { (var) = (min); } \
159         else if ((var) > (max)) { (var) = (max); }
160
161 static int
162 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
163 {
164         int error;
165
166         error = sysctl_handle_int(oidp, arg1, arg2, req);
167         if (error == 0) {
168                 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
169                 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
170                 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
171                 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
172                 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
173                 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
174         }
175         return (error);
176 }
177
178 #undef RANGECHK
179
180 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
181     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
182     "IP Ports");
183
184 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
185     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
186     &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
187     "");
188 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
189     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
190     &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
191     "");
192 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
193     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
194     &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
195     "");
196 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
197     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
198     &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
199     "");
200 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
201     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
202     &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
203     "");
204 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
205     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
206     &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
207     "");
208 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
209         CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
210         &VNET_NAME(ipport_reservedhigh), 0, "");
211 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
212         CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
213 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
214         CTLFLAG_VNET | CTLFLAG_RW,
215         &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
216 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
217         CTLFLAG_VNET | CTLFLAG_RW,
218         &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
219         "allocations before switching to a sequential one");
220 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
221         CTLFLAG_VNET | CTLFLAG_RW,
222         &VNET_NAME(ipport_randomtime), 0,
223         "Minimum time to keep sequential port "
224         "allocation before switching to a random one");
225
226 #ifdef RATELIMIT
227 counter_u64_t rate_limit_new;
228 counter_u64_t rate_limit_chg;
229 counter_u64_t rate_limit_active;
230 counter_u64_t rate_limit_alloc_fail;
231 counter_u64_t rate_limit_set_ok;
232
233 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
234     "IP Rate Limiting");
235 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
236     &rate_limit_active, "Active rate limited connections");
237 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
238    &rate_limit_alloc_fail, "Rate limited connection failures");
239 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
240    &rate_limit_set_ok, "Rate limited setting succeeded");
241 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
242    &rate_limit_new, "Total Rate limit new attempts");
243 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
244    &rate_limit_chg, "Total Rate limited change attempts");
245
246 #endif /* RATELIMIT */
247
248 #endif /* INET */
249
250 /*
251  * in_pcb.c: manage the Protocol Control Blocks.
252  *
253  * NOTE: It is assumed that most of these functions will be called with
254  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
255  * functions often modify hash chains or addresses in pcbs.
256  */
257
258 static struct inpcblbgroup *
259 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
260     uint16_t port, const union in_dependaddr *addr, int size,
261     uint8_t numa_domain)
262 {
263         struct inpcblbgroup *grp;
264         size_t bytes;
265
266         bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
267         grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
268         if (!grp)
269                 return (NULL);
270         grp->il_vflag = vflag;
271         grp->il_lport = port;
272         grp->il_numa_domain = numa_domain;
273         grp->il_dependladdr = *addr;
274         grp->il_inpsiz = size;
275         CK_LIST_INSERT_HEAD(hdr, grp, il_list);
276         return (grp);
277 }
278
279 static void
280 in_pcblbgroup_free_deferred(epoch_context_t ctx)
281 {
282         struct inpcblbgroup *grp;
283
284         grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
285         free(grp, M_PCB);
286 }
287
288 static void
289 in_pcblbgroup_free(struct inpcblbgroup *grp)
290 {
291
292         CK_LIST_REMOVE(grp, il_list);
293         NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
294 }
295
296 static struct inpcblbgroup *
297 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
298     struct inpcblbgroup *old_grp, int size)
299 {
300         struct inpcblbgroup *grp;
301         int i;
302
303         grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
304             old_grp->il_lport, &old_grp->il_dependladdr, size,
305             old_grp->il_numa_domain);
306         if (grp == NULL)
307                 return (NULL);
308
309         KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
310             ("invalid new local group size %d and old local group count %d",
311              grp->il_inpsiz, old_grp->il_inpcnt));
312
313         for (i = 0; i < old_grp->il_inpcnt; ++i)
314                 grp->il_inp[i] = old_grp->il_inp[i];
315         grp->il_inpcnt = old_grp->il_inpcnt;
316         in_pcblbgroup_free(old_grp);
317         return (grp);
318 }
319
320 /*
321  * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
322  * and shrink group if possible.
323  */
324 static void
325 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
326     int i)
327 {
328         struct inpcblbgroup *grp, *new_grp;
329
330         grp = *grpp;
331         for (; i + 1 < grp->il_inpcnt; ++i)
332                 grp->il_inp[i] = grp->il_inp[i + 1];
333         grp->il_inpcnt--;
334
335         if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
336             grp->il_inpcnt <= grp->il_inpsiz / 4) {
337                 /* Shrink this group. */
338                 new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
339                 if (new_grp != NULL)
340                         *grpp = new_grp;
341         }
342 }
343
344 /*
345  * Add PCB to load balance group for SO_REUSEPORT_LB option.
346  */
347 static int
348 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
349 {
350         const static struct timeval interval = { 60, 0 };
351         static struct timeval lastprint;
352         struct inpcbinfo *pcbinfo;
353         struct inpcblbgrouphead *hdr;
354         struct inpcblbgroup *grp;
355         uint32_t idx;
356
357         pcbinfo = inp->inp_pcbinfo;
358
359         INP_WLOCK_ASSERT(inp);
360         INP_HASH_WLOCK_ASSERT(pcbinfo);
361
362         /*
363          * Don't allow jailed socket to join local group.
364          */
365         if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred))
366                 return (0);
367
368 #ifdef INET6
369         /*
370          * Don't allow IPv4 mapped INET6 wild socket.
371          */
372         if ((inp->inp_vflag & INP_IPV4) &&
373             inp->inp_laddr.s_addr == INADDR_ANY &&
374             INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
375                 return (0);
376         }
377 #endif
378
379         idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
380         hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
381         CK_LIST_FOREACH(grp, hdr, il_list) {
382                 if (grp->il_vflag == inp->inp_vflag &&
383                     grp->il_lport == inp->inp_lport &&
384                     grp->il_numa_domain == numa_domain &&
385                     memcmp(&grp->il_dependladdr,
386                     &inp->inp_inc.inc_ie.ie_dependladdr,
387                     sizeof(grp->il_dependladdr)) == 0)
388                         break;
389         }
390         if (grp == NULL) {
391                 /* Create new load balance group. */
392                 grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
393                     inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
394                     INPCBLBGROUP_SIZMIN, numa_domain);
395                 if (grp == NULL)
396                         return (ENOBUFS);
397         } else if (grp->il_inpcnt == grp->il_inpsiz) {
398                 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
399                         if (ratecheck(&lastprint, &interval))
400                                 printf("lb group port %d, limit reached\n",
401                                     ntohs(grp->il_lport));
402                         return (0);
403                 }
404
405                 /* Expand this local group. */
406                 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
407                 if (grp == NULL)
408                         return (ENOBUFS);
409         }
410
411         KASSERT(grp->il_inpcnt < grp->il_inpsiz,
412             ("invalid local group size %d and count %d", grp->il_inpsiz,
413             grp->il_inpcnt));
414
415         grp->il_inp[grp->il_inpcnt] = inp;
416         grp->il_inpcnt++;
417         return (0);
418 }
419
420 /*
421  * Remove PCB from load balance group.
422  */
423 static void
424 in_pcbremlbgrouphash(struct inpcb *inp)
425 {
426         struct inpcbinfo *pcbinfo;
427         struct inpcblbgrouphead *hdr;
428         struct inpcblbgroup *grp;
429         int i;
430
431         pcbinfo = inp->inp_pcbinfo;
432
433         INP_WLOCK_ASSERT(inp);
434         INP_HASH_WLOCK_ASSERT(pcbinfo);
435
436         hdr = &pcbinfo->ipi_lbgrouphashbase[
437             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
438         CK_LIST_FOREACH(grp, hdr, il_list) {
439                 for (i = 0; i < grp->il_inpcnt; ++i) {
440                         if (grp->il_inp[i] != inp)
441                                 continue;
442
443                         if (grp->il_inpcnt == 1) {
444                                 /* We are the last, free this local group. */
445                                 in_pcblbgroup_free(grp);
446                         } else {
447                                 /* Pull up inpcbs, shrink group if possible. */
448                                 in_pcblbgroup_reorder(hdr, &grp, i);
449                         }
450                         return;
451                 }
452         }
453 }
454
455 int
456 in_pcblbgroup_numa(struct inpcb *inp, int arg)
457 {
458         struct inpcbinfo *pcbinfo;
459         struct inpcblbgrouphead *hdr;
460         struct inpcblbgroup *grp;
461         int err, i;
462         uint8_t numa_domain;
463
464         switch (arg) {
465         case TCP_REUSPORT_LB_NUMA_NODOM:
466                 numa_domain = M_NODOM;
467                 break;
468         case TCP_REUSPORT_LB_NUMA_CURDOM:
469                 numa_domain = PCPU_GET(domain);
470                 break;
471         default:
472                 if (arg < 0 || arg >= vm_ndomains)
473                         return (EINVAL);
474                 numa_domain = arg;
475         }
476
477         err = 0;
478         pcbinfo = inp->inp_pcbinfo;
479         INP_WLOCK_ASSERT(inp);
480         INP_HASH_WLOCK(pcbinfo);
481         hdr = &pcbinfo->ipi_lbgrouphashbase[
482             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
483         CK_LIST_FOREACH(grp, hdr, il_list) {
484                 for (i = 0; i < grp->il_inpcnt; ++i) {
485                         if (grp->il_inp[i] != inp)
486                                 continue;
487
488                         if (grp->il_numa_domain == numa_domain) {
489                                 goto abort_with_hash_wlock;
490                         }
491
492                         /* Remove it from the old group. */
493                         in_pcbremlbgrouphash(inp);
494
495                         /* Add it to the new group based on numa domain. */
496                         in_pcbinslbgrouphash(inp, numa_domain);
497                         goto abort_with_hash_wlock;
498                 }
499         }
500         err = ENOENT;
501 abort_with_hash_wlock:
502         INP_HASH_WUNLOCK(pcbinfo);
503         return (err);
504 }
505
/*
 * UMA fini routine for inpcb zones.  Each protocol supplies its own zone
 * init routine (and thereby its own lock name), but teardown is the same
 * for all of them: destroy the inpcb lock.
 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *pcb;

	pcb = mem;
	INP_LOCK_DESTROY(pcb);
}
517
518 /*
519  * Initialize an inpcbinfo -- we should be able to reduce the number of
520  * arguments in time.
521  */
522 void
523 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
524     struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
525     char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields)
526 {
527
528         porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
529
530         INP_INFO_LOCK_INIT(pcbinfo, name);
531         INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash");     /* XXXRW: argument? */
532         INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
533 #ifdef VIMAGE
534         pcbinfo->ipi_vnet = curvnet;
535 #endif
536         pcbinfo->ipi_listhead = listhead;
537         CK_LIST_INIT(pcbinfo->ipi_listhead);
538         pcbinfo->ipi_count = 0;
539         pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
540             &pcbinfo->ipi_hashmask);
541         pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
542             &pcbinfo->ipi_porthashmask);
543         pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
544             &pcbinfo->ipi_lbgrouphashmask);
545 #ifdef PCBGROUP
546         in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
547 #endif
548         pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
549             NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0);
550         uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
551         uma_zone_set_warning(pcbinfo->ipi_zone,
552             "kern.ipc.maxsockets limit reached");
553 }
554
555 /*
556  * Destroy an inpcbinfo.
557  */
558 void
559 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
560 {
561
562         KASSERT(pcbinfo->ipi_count == 0,
563             ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
564
565         hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
566         hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
567             pcbinfo->ipi_porthashmask);
568         hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
569             pcbinfo->ipi_lbgrouphashmask);
570 #ifdef PCBGROUP
571         in_pcbgroup_destroy(pcbinfo);
572 #endif
573         uma_zdestroy(pcbinfo->ipi_zone);
574         INP_LIST_LOCK_DESTROY(pcbinfo);
575         INP_HASH_LOCK_DESTROY(pcbinfo);
576         INP_INFO_LOCK_DESTROY(pcbinfo);
577 }
578
/*
 * Allocate a PCB and associate it with the socket.
 * On success return with the PCB locked.
 *
 * The PCB comes from pcbinfo's UMA zone (M_NOWAIT).  It inherits the
 * socket's credential (a reference is taken) and FIB number, is linked
 * at the head of the pcbinfo list and is stored in so->so_pcb.
 *
 * Returns 0 on success, ENOBUFS if the zone allocation fails, or the
 * error from mac_inpcb_init() / ipsec_init_pcbpolicy() when those are
 * compiled in.  On those later failures the credential reference is
 * dropped and the PCB is returned to the zone.
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
	int error;

	error = 0;
	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/* Clear inp_zero_size bytes starting at the inp_start_zero marker. */
	bzero(&inp->inp_start_zero, inp_zero_size);
#ifdef NUMA
	inp->inp_numa_domain = M_NODOM;
#endif
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
#ifdef MAC
		/* Undo the MAC label set up above before bailing out. */
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
	}
#endif
	/*
	 * The inpcb write lock taken here is not released in this
	 * function; it is what the caller gets back on success.
	 */
	INP_WLOCK(inp);
	INP_LIST_WLOCK(pcbinfo);
	CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
	pcbinfo->ipi_count++;
	so->so_pcb = (caddr_t)inp;
#ifdef INET6
	if (V_ip6_auto_flowlabel)
		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	refcount_init(&inp->inp_refcount, 1);	/* Reference from inpcbinfo */

	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
	INP_LIST_WUNLOCK(pcbinfo);
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	/* The success path also falls through here, with error == 0. */
	if (error != 0) {
		crfree(inp->inp_cred);
		uma_zfree(pcbinfo->ipi_zone, inp);
	}
#endif
	return (error);
}
650
651 #ifdef INET
652 int
653 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
654 {
655         int anonport, error;
656
657         KASSERT(nam == NULL || nam->sa_family == AF_INET,
658             ("%s: invalid address family for %p", __func__, nam));
659         KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in),
660             ("%s: invalid address length for %p", __func__, nam));
661         INP_WLOCK_ASSERT(inp);
662         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
663
664         if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
665                 return (EINVAL);
666         anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0;
667         error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
668             &inp->inp_lport, cred);
669         if (error)
670                 return (error);
671         if (in_pcbinshash(inp) != 0) {
672                 inp->inp_laddr.s_addr = INADDR_ANY;
673                 inp->inp_lport = 0;
674                 return (EAGAIN);
675         }
676         if (anonport)
677                 inp->inp_flags |= INP_ANONPORT;
678         return (0);
679 }
680 #endif
681
682 #if defined(INET) || defined(INET6)
683 /*
684  * Assign a local port like in_pcb_lport(), but also used with connect()
685  * and a foreign address and port.  If fsa is non-NULL, choose a local port
686  * that is unused with those, otherwise one that is completely unused.
687  * lsa can be NULL for IPv6.
688  */
689 int
690 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
691     struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
692 {
693         struct inpcbinfo *pcbinfo;
694         struct inpcb *tmpinp;
695         unsigned short *lastport;
696         int count, dorandom, error;
697         u_short aux, first, last, lport;
698 #ifdef INET
699         struct in_addr laddr, faddr;
700 #endif
701 #ifdef INET6
702         struct in6_addr *laddr6, *faddr6;
703 #endif
704
705         pcbinfo = inp->inp_pcbinfo;
706
707         /*
708          * Because no actual state changes occur here, a global write lock on
709          * the pcbinfo isn't required.
710          */
711         INP_LOCK_ASSERT(inp);
712         INP_HASH_LOCK_ASSERT(pcbinfo);
713
714         if (inp->inp_flags & INP_HIGHPORT) {
715                 first = V_ipport_hifirstauto;   /* sysctl */
716                 last  = V_ipport_hilastauto;
717                 lastport = &pcbinfo->ipi_lasthi;
718         } else if (inp->inp_flags & INP_LOWPORT) {
719                 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
720                 if (error)
721                         return (error);
722                 first = V_ipport_lowfirstauto;  /* 1023 */
723                 last  = V_ipport_lowlastauto;   /* 600 */
724                 lastport = &pcbinfo->ipi_lastlow;
725         } else {
726                 first = V_ipport_firstauto;     /* sysctl */
727                 last  = V_ipport_lastauto;
728                 lastport = &pcbinfo->ipi_lastport;
729         }
730         /*
731          * For UDP(-Lite), use random port allocation as long as the user
732          * allows it.  For TCP (and as of yet unknown) connections,
733          * use random port allocation only if the user allows it AND
734          * ipport_tick() allows it.
735          */
736         if (V_ipport_randomized &&
737                 (!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
738                 pcbinfo == &V_ulitecbinfo))
739                 dorandom = 1;
740         else
741                 dorandom = 0;
742         /*
743          * It makes no sense to do random port allocation if
744          * we have the only port available.
745          */
746         if (first == last)
747                 dorandom = 0;
748         /* Make sure to not include UDP(-Lite) packets in the count. */
749         if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo)
750                 V_ipport_tcpallocs++;
751         /*
752          * Instead of having two loops further down counting up or down
753          * make sure that first is always <= last and go with only one
754          * code path implementing all logic.
755          */
756         if (first > last) {
757                 aux = first;
758                 first = last;
759                 last = aux;
760         }
761
762 #ifdef INET
763         laddr.s_addr = INADDR_ANY;
764         if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
765                 if (lsa != NULL)
766                         laddr = ((struct sockaddr_in *)lsa)->sin_addr;
767                 if (fsa != NULL)
768                         faddr = ((struct sockaddr_in *)fsa)->sin_addr;
769         }
770 #endif
771 #ifdef INET6
772         laddr6 = NULL;
773         if ((inp->inp_vflag & INP_IPV6) != 0) {
774                 if (lsa != NULL)
775                         laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
776                 if (fsa != NULL)
777                         faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
778         }
779 #endif
780
781         tmpinp = NULL;
782         lport = *lportp;
783
784         if (dorandom)
785                 *lastport = first + (arc4random() % (last - first));
786
787         count = last - first;
788
789         do {
790                 if (count-- < 0)        /* completely used? */
791                         return (EADDRNOTAVAIL);
792                 ++*lastport;
793                 if (*lastport < first || *lastport > last)
794                         *lastport = first;
795                 lport = htons(*lastport);
796
797                 if (fsa != NULL) {
798 #ifdef INET
799                         if (lsa->sa_family == AF_INET) {
800                                 tmpinp = in_pcblookup_hash_locked(pcbinfo,
801                                     faddr, fport, laddr, lport, lookupflags,
802                                     NULL, M_NODOM);
803                         }
804 #endif
805 #ifdef INET6
806                         if (lsa->sa_family == AF_INET6) {
807                                 tmpinp = in6_pcblookup_hash_locked(pcbinfo,
808                                     faddr6, fport, laddr6, lport, lookupflags,
809                                     NULL, M_NODOM);
810                         }
811 #endif
812                 } else {
813 #ifdef INET6
814                         if ((inp->inp_vflag & INP_IPV6) != 0)
815                                 tmpinp = in6_pcblookup_local(pcbinfo,
816                                     &inp->in6p_laddr, lport, lookupflags, cred);
817 #endif
818 #if defined(INET) && defined(INET6)
819                         else
820 #endif
821 #ifdef INET
822                                 tmpinp = in_pcblookup_local(pcbinfo, laddr,
823                                     lport, lookupflags, cred);
824 #endif
825                 }
826         } while (tmpinp != NULL);
827
828         *lportp = lport;
829
830         return (0);
831 }
832
833 /*
834  * Select a local port (number) to use.
835  */
836 int
837 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
838     struct ucred *cred, int lookupflags)
839 {
840         struct sockaddr_in laddr;
841
842         if (laddrp) {
843                 bzero(&laddr, sizeof(laddr));
844                 laddr.sin_family = AF_INET;
845                 laddr.sin_addr = *laddrp;
846         }
847         return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
848             NULL, lportp, NULL, 0, cred, lookupflags));
849 }
850
851 /*
852  * Return cached socket options.
853  */
854 int
855 inp_so_options(const struct inpcb *inp)
856 {
857         int so_options;
858
859         so_options = 0;
860
861         if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
862                 so_options |= SO_REUSEPORT_LB;
863         if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
864                 so_options |= SO_REUSEPORT;
865         if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
866                 so_options |= SO_REUSEADDR;
867         return (so_options);
868 }
869 #endif /* INET || INET6 */
870
871 /*
872  * Check if a new BINDMULTI socket is allowed to be created.
873  *
874  * ni points to the new inp.
875  * oi points to the existing inp.
876  *
877  * This checks whether the existing inp also has BINDMULTI and
878  * whether the credentials match.
879  */
880 int
881 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
882 {
883         /* Check permissions match */
884         if ((ni->inp_flags2 & INP_BINDMULTI) &&
885             (ni->inp_cred->cr_uid !=
886             oi->inp_cred->cr_uid))
887                 return (0);
888
889         /* Check the existing inp has BINDMULTI set */
890         if ((ni->inp_flags2 & INP_BINDMULTI) &&
891             ((oi->inp_flags2 & INP_BINDMULTI) == 0))
892                 return (0);
893
894         /*
895          * We're okay - either INP_BINDMULTI isn't set on ni, or
896          * it is and it matches the checks.
897          */
898         return (1);
899 }
900
901 #ifdef INET
902 /*
903  * Set up a bind operation on a PCB, performing port allocation
904  * as required, but do not actually modify the PCB. Callers can
905  * either complete the bind by setting inp_laddr/inp_lport and
906  * calling in_pcbinshash(), or they can just use the resulting
907  * port and address to authorise the sending of a once-off packet.
908  *
909  * On error, the values of *laddrp and *lportp are not changed.
910  */
911 int
912 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
913     u_short *lportp, struct ucred *cred)
914 {
915         struct socket *so = inp->inp_socket;
916         struct sockaddr_in *sin;
917         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
918         struct in_addr laddr;
919         u_short lport = 0;
920         int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
921         int error;
922
923         /*
924          * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
925          * so that we don't have to add to the (already messy) code below.
926          */
927         int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
928
929         /*
930          * No state changes, so read locks are sufficient here.
931          */
932         INP_LOCK_ASSERT(inp);
933         INP_HASH_LOCK_ASSERT(pcbinfo);
934
935         laddr.s_addr = *laddrp;
936         if (nam != NULL && laddr.s_addr != INADDR_ANY)
937                 return (EINVAL);
938         if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
939                 lookupflags = INPLOOKUP_WILDCARD;
940         if (nam == NULL) {
941                 if ((error = prison_local_ip4(cred, &laddr)) != 0)
942                         return (error);
943         } else {
944                 sin = (struct sockaddr_in *)nam;
945                 KASSERT(sin->sin_family == AF_INET,
946                     ("%s: invalid family for address %p", __func__, sin));
947                 KASSERT(sin->sin_len == sizeof(*sin),
948                     ("%s: invalid length for address %p", __func__, sin));
949
950                 error = prison_local_ip4(cred, &sin->sin_addr);
951                 if (error)
952                         return (error);
953                 if (sin->sin_port != *lportp) {
954                         /* Don't allow the port to change. */
955                         if (*lportp != 0)
956                                 return (EINVAL);
957                         lport = sin->sin_port;
958                 }
959                 /* NB: lport is left as 0 if the port isn't being changed. */
960                 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
961                         /*
962                          * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
963                          * allow complete duplication of binding if
964                          * SO_REUSEPORT is set, or if SO_REUSEADDR is set
965                          * and a multicast address is bound on both
966                          * new and duplicated sockets.
967                          */
968                         if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
969                                 reuseport = SO_REUSEADDR|SO_REUSEPORT;
970                         /*
971                          * XXX: How to deal with SO_REUSEPORT_LB here?
972                          * Treat same as SO_REUSEPORT for now.
973                          */
974                         if ((so->so_options &
975                             (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
976                                 reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
977                 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
978                         sin->sin_port = 0;              /* yech... */
979                         bzero(&sin->sin_zero, sizeof(sin->sin_zero));
980                         /*
981                          * Is the address a local IP address?
982                          * If INP_BINDANY is set, then the socket may be bound
983                          * to any endpoint address, local or not.
984                          */
985                         if ((inp->inp_flags & INP_BINDANY) == 0 &&
986                             ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
987                                 return (EADDRNOTAVAIL);
988                 }
989                 laddr = sin->sin_addr;
990                 if (lport) {
991                         struct inpcb *t;
992                         struct tcptw *tw;
993
994                         /* GROSS */
995                         if (ntohs(lport) <= V_ipport_reservedhigh &&
996                             ntohs(lport) >= V_ipport_reservedlow &&
997                             priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
998                                 return (EACCES);
999                         if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
1000                             priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
1001                                 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
1002                                     lport, INPLOOKUP_WILDCARD, cred);
1003         /*
1004          * XXX
1005          * This entire block sorely needs a rewrite.
1006          */
1007                                 if (t &&
1008                                     ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
1009                                     ((t->inp_flags & INP_TIMEWAIT) == 0) &&
1010                                     (so->so_type != SOCK_STREAM ||
1011                                      ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
1012                                     (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
1013                                      ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
1014                                      (t->inp_flags2 & INP_REUSEPORT) ||
1015                                      (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
1016                                     (inp->inp_cred->cr_uid !=
1017                                      t->inp_cred->cr_uid))
1018                                         return (EADDRINUSE);
1019
1020                                 /*
1021                                  * If the socket is a BINDMULTI socket, then
1022                                  * the credentials need to match and the
1023                                  * original socket also has to have been bound
1024                                  * with BINDMULTI.
1025                                  */
1026                                 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
1027                                         return (EADDRINUSE);
1028                         }
1029                         t = in_pcblookup_local(pcbinfo, sin->sin_addr,
1030                             lport, lookupflags, cred);
1031                         if (t && (t->inp_flags & INP_TIMEWAIT)) {
1032                                 /*
1033                                  * XXXRW: If an incpb has had its timewait
1034                                  * state recycled, we treat the address as
1035                                  * being in use (for now).  This is better
1036                                  * than a panic, but not desirable.
1037                                  */
1038                                 tw = intotw(t);
1039                                 if (tw == NULL ||
1040                                     ((reuseport & tw->tw_so_options) == 0 &&
1041                                         (reuseport_lb &
1042                                             tw->tw_so_options) == 0)) {
1043                                         return (EADDRINUSE);
1044                                 }
1045                         } else if (t &&
1046                                    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
1047                                    (reuseport & inp_so_options(t)) == 0 &&
1048                                    (reuseport_lb & inp_so_options(t)) == 0) {
1049 #ifdef INET6
1050                                 if (ntohl(sin->sin_addr.s_addr) !=
1051                                     INADDR_ANY ||
1052                                     ntohl(t->inp_laddr.s_addr) !=
1053                                     INADDR_ANY ||
1054                                     (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
1055                                     (t->inp_vflag & INP_IPV6PROTO) == 0)
1056 #endif
1057                                                 return (EADDRINUSE);
1058                                 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
1059                                         return (EADDRINUSE);
1060                         }
1061                 }
1062         }
1063         if (*lportp != 0)
1064                 lport = *lportp;
1065         if (lport == 0) {
1066                 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
1067                 if (error != 0)
1068                         return (error);
1069         }
1070         *laddrp = laddr.s_addr;
1071         *lportp = lport;
1072         return (0);
1073 }
1074
1075 /*
1076  * Connect from a socket to a specified address.
1077  * Both address and port must be specified in argument sin.
1078  * If don't have a local address for this socket yet,
1079  * then pick one.
1080  */
1081 int
1082 in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
1083     struct ucred *cred, struct mbuf *m, bool rehash)
1084 {
1085         u_short lport, fport;
1086         in_addr_t laddr, faddr;
1087         int anonport, error;
1088
1089         INP_WLOCK_ASSERT(inp);
1090         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
1091
1092         lport = inp->inp_lport;
1093         laddr = inp->inp_laddr.s_addr;
1094         anonport = (lport == 0);
1095         error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
1096             NULL, cred);
1097         if (error)
1098                 return (error);
1099
1100         /* Do the initial binding of the local address if required. */
1101         if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
1102                 KASSERT(rehash == true,
1103                     ("Rehashing required for unbound inps"));
1104                 inp->inp_lport = lport;
1105                 inp->inp_laddr.s_addr = laddr;
1106                 if (in_pcbinshash(inp) != 0) {
1107                         inp->inp_laddr.s_addr = INADDR_ANY;
1108                         inp->inp_lport = 0;
1109                         return (EAGAIN);
1110                 }
1111         }
1112
1113         /* Commit the remaining changes. */
1114         inp->inp_lport = lport;
1115         inp->inp_laddr.s_addr = laddr;
1116         inp->inp_faddr.s_addr = faddr;
1117         inp->inp_fport = fport;
1118         if (rehash) {
1119                 in_pcbrehash_mbuf(inp, m);
1120         } else {
1121                 in_pcbinshash_mbuf(inp, m);
1122         }
1123
1124         if (anonport)
1125                 inp->inp_flags |= INP_ANONPORT;
1126         return (0);
1127 }
1128
1129 int
1130 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
1131 {
1132
1133         return (in_pcbconnect_mbuf(inp, nam, cred, NULL, true));
1134 }
1135
1136 /*
1137  * Do proper source address selection on an unbound socket in case
1138  * of connect. Take jails into account as well.
1139  */
1140 int
1141 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
1142     struct ucred *cred)
1143 {
1144         struct ifaddr *ifa;
1145         struct sockaddr *sa;
1146         struct sockaddr_in *sin, dst;
1147         struct nhop_object *nh;
1148         int error;
1149
1150         NET_EPOCH_ASSERT();
1151         KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
1152         /*
1153          * Bypass source address selection and use the primary jail IP
1154          * if requested.
1155          */
1156         if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
1157                 return (0);
1158
1159         error = 0;
1160
1161         nh = NULL;
1162         bzero(&dst, sizeof(dst));
1163         sin = &dst;
1164         sin->sin_family = AF_INET;
1165         sin->sin_len = sizeof(struct sockaddr_in);
1166         sin->sin_addr.s_addr = faddr->s_addr;
1167
1168         /*
1169          * If route is known our src addr is taken from the i/f,
1170          * else punt.
1171          *
1172          * Find out route to destination.
1173          */
1174         if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
1175                 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
1176                     0, NHR_NONE, 0);
1177
1178         /*
1179          * If we found a route, use the address corresponding to
1180          * the outgoing interface.
1181          *
1182          * Otherwise assume faddr is reachable on a directly connected
1183          * network and try to find a corresponding interface to take
1184          * the source address from.
1185          */
1186         if (nh == NULL || nh->nh_ifp == NULL) {
1187                 struct in_ifaddr *ia;
1188                 struct ifnet *ifp;
1189
1190                 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
1191                                         inp->inp_socket->so_fibnum));
1192                 if (ia == NULL) {
1193                         ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
1194                                                 inp->inp_socket->so_fibnum));
1195                 }
1196                 if (ia == NULL) {
1197                         error = ENETUNREACH;
1198                         goto done;
1199                 }
1200
1201                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1202                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1203                         goto done;
1204                 }
1205
1206                 ifp = ia->ia_ifp;
1207                 ia = NULL;
1208                 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1209                         sa = ifa->ifa_addr;
1210                         if (sa->sa_family != AF_INET)
1211                                 continue;
1212                         sin = (struct sockaddr_in *)sa;
1213                         if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1214                                 ia = (struct in_ifaddr *)ifa;
1215                                 break;
1216                         }
1217                 }
1218                 if (ia != NULL) {
1219                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1220                         goto done;
1221                 }
1222
1223                 /* 3. As a last resort return the 'default' jail address. */
1224                 error = prison_get_ip4(cred, laddr);
1225                 goto done;
1226         }
1227
1228         /*
1229          * If the outgoing interface on the route found is not
1230          * a loopback interface, use the address from that interface.
1231          * In case of jails do those three steps:
1232          * 1. check if the interface address belongs to the jail. If so use it.
1233          * 2. check if we have any address on the outgoing interface
1234          *    belonging to this jail. If so use it.
1235          * 3. as a last resort return the 'default' jail address.
1236          */
1237         if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
1238                 struct in_ifaddr *ia;
1239                 struct ifnet *ifp;
1240
1241                 /* If not jailed, use the default returned. */
1242                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1243                         ia = (struct in_ifaddr *)nh->nh_ifa;
1244                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1245                         goto done;
1246                 }
1247
1248                 /* Jailed. */
1249                 /* 1. Check if the iface address belongs to the jail. */
1250                 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
1251                 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1252                         ia = (struct in_ifaddr *)nh->nh_ifa;
1253                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1254                         goto done;
1255                 }
1256
1257                 /*
1258                  * 2. Check if we have any address on the outgoing interface
1259                  *    belonging to this jail.
1260                  */
1261                 ia = NULL;
1262                 ifp = nh->nh_ifp;
1263                 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1264                         sa = ifa->ifa_addr;
1265                         if (sa->sa_family != AF_INET)
1266                                 continue;
1267                         sin = (struct sockaddr_in *)sa;
1268                         if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1269                                 ia = (struct in_ifaddr *)ifa;
1270                                 break;
1271                         }
1272                 }
1273                 if (ia != NULL) {
1274                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1275                         goto done;
1276                 }
1277
1278                 /* 3. As a last resort return the 'default' jail address. */
1279                 error = prison_get_ip4(cred, laddr);
1280                 goto done;
1281         }
1282
1283         /*
1284          * The outgoing interface is marked with 'loopback net', so a route
1285          * to ourselves is here.
1286          * Try to find the interface of the destination address and then
1287          * take the address from there. That interface is not necessarily
1288          * a loopback interface.
1289          * In case of jails, check that it is an address of the jail
1290          * and if we cannot find, fall back to the 'default' jail address.
1291          */
1292         if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
1293                 struct in_ifaddr *ia;
1294
1295                 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
1296                                         inp->inp_socket->so_fibnum));
1297                 if (ia == NULL)
1298                         ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
1299                                                 inp->inp_socket->so_fibnum));
1300                 if (ia == NULL)
1301                         ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));
1302
1303                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1304                         if (ia == NULL) {
1305                                 error = ENETUNREACH;
1306                                 goto done;
1307                         }
1308                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1309                         goto done;
1310                 }
1311
1312                 /* Jailed. */
1313                 if (ia != NULL) {
1314                         struct ifnet *ifp;
1315
1316                         ifp = ia->ia_ifp;
1317                         ia = NULL;
1318                         CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1319                                 sa = ifa->ifa_addr;
1320                                 if (sa->sa_family != AF_INET)
1321                                         continue;
1322                                 sin = (struct sockaddr_in *)sa;
1323                                 if (prison_check_ip4(cred,
1324                                     &sin->sin_addr) == 0) {
1325                                         ia = (struct in_ifaddr *)ifa;
1326                                         break;
1327                                 }
1328                         }
1329                         if (ia != NULL) {
1330                                 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1331                                 goto done;
1332                         }
1333                 }
1334
1335                 /* 3. As a last resort return the 'default' jail address. */
1336                 error = prison_get_ip4(cred, laddr);
1337                 goto done;
1338         }
1339
1340 done:
1341         return (error);
1342 }
1343
1344 /*
1345  * Set up for a connect from a socket to the specified address.
1346  * On entry, *laddrp and *lportp should contain the current local
1347  * address and port for the PCB; these are updated to the values
1348  * that should be placed in inp_laddr and inp_lport to complete
1349  * the connect.
1350  *
1351  * On success, *faddrp and *fportp will be set to the remote address
1352  * and port. These are not updated in the error case.
1353  *
1354  * If the operation fails because the connection already exists,
1355  * *oinpp will be set to the PCB of that connection so that the
1356  * caller can decide to override it. In all other cases, *oinpp
1357  * is set to NULL.
1358  */
1359 int
1360 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
1361     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
1362     struct inpcb **oinpp, struct ucred *cred)
1363 {
1364         struct rm_priotracker in_ifa_tracker;
1365         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
1366         struct in_ifaddr *ia;
1367         struct inpcb *oinp;
1368         struct in_addr laddr, faddr;
1369         u_short lport, fport;
1370         int error;
1371
1372         KASSERT(sin->sin_family == AF_INET,
1373             ("%s: invalid address family for %p", __func__, sin));
1374         KASSERT(sin->sin_len == sizeof(*sin),
1375             ("%s: invalid address length for %p", __func__, sin));
1376
1377         /*
1378          * Because a global state change doesn't actually occur here, a read
1379          * lock is sufficient.
1380          */
1381         NET_EPOCH_ASSERT();
1382         INP_LOCK_ASSERT(inp);
1383         INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
1384
1385         if (oinpp != NULL)
1386                 *oinpp = NULL;
1387         if (sin->sin_port == 0)
1388                 return (EADDRNOTAVAIL);
1389         laddr.s_addr = *laddrp;
1390         lport = *lportp;
1391         faddr = sin->sin_addr;
1392         fport = sin->sin_port;
1393 #ifdef ROUTE_MPATH
1394         if (CALC_FLOWID_OUTBOUND) {
1395                 uint32_t hash_val, hash_type;
1396
1397                 hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
1398                     inp->inp_socket->so_proto->pr_protocol, &hash_type);
1399
1400                 inp->inp_flowid = hash_val;
1401                 inp->inp_flowtype = hash_type;
1402         }
1403 #endif
1404         if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
1405                 /*
1406                  * If the destination address is INADDR_ANY,
1407                  * use the primary local address.
1408                  * If the supplied address is INADDR_BROADCAST,
1409                  * and the primary interface supports broadcast,
1410                  * choose the broadcast address for that interface.
1411                  */
1412                 if (faddr.s_addr == INADDR_ANY) {
1413                         IN_IFADDR_RLOCK(&in_ifa_tracker);
1414                         faddr =
1415                             IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
1416                         IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1417                         if (cred != NULL &&
1418                             (error = prison_get_ip4(cred, &faddr)) != 0)
1419                                 return (error);
1420                 } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
1421                         IN_IFADDR_RLOCK(&in_ifa_tracker);
1422                         if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
1423                             IFF_BROADCAST)
1424                                 faddr = satosin(&CK_STAILQ_FIRST(
1425                                     &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
1426                         IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1427                 }
1428         }
1429         if (laddr.s_addr == INADDR_ANY) {
1430                 error = in_pcbladdr(inp, &faddr, &laddr, cred);
1431                 /*
1432                  * If the destination address is multicast and an outgoing
1433                  * interface has been set as a multicast option, prefer the
1434                  * address of that interface as our source address.
1435                  */
1436                 if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
1437                     inp->inp_moptions != NULL) {
1438                         struct ip_moptions *imo;
1439                         struct ifnet *ifp;
1440
1441                         imo = inp->inp_moptions;
1442                         if (imo->imo_multicast_ifp != NULL) {
1443                                 ifp = imo->imo_multicast_ifp;
1444                                 IN_IFADDR_RLOCK(&in_ifa_tracker);
1445                                 CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
1446                                         if ((ia->ia_ifp == ifp) &&
1447                                             (cred == NULL ||
1448                                             prison_check_ip4(cred,
1449                                             &ia->ia_addr.sin_addr) == 0))
1450                                                 break;
1451                                 }
1452                                 if (ia == NULL)
1453                                         error = EADDRNOTAVAIL;
1454                                 else {
1455                                         laddr = ia->ia_addr.sin_addr;
1456                                         error = 0;
1457                                 }
1458                                 IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1459                         }
1460                 }
1461                 if (error)
1462                         return (error);
1463         }
1464
1465         if (lport != 0) {
1466                 oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
1467                     fport, laddr, lport, 0, NULL, M_NODOM);
1468                 if (oinp != NULL) {
1469                         if (oinpp != NULL)
1470                                 *oinpp = oinp;
1471                         return (EADDRINUSE);
1472                 }
1473         } else {
1474                 struct sockaddr_in lsin, fsin;
1475
1476                 bzero(&lsin, sizeof(lsin));
1477                 bzero(&fsin, sizeof(fsin));
1478                 lsin.sin_family = AF_INET;
1479                 lsin.sin_addr = laddr;
1480                 fsin.sin_family = AF_INET;
1481                 fsin.sin_addr = faddr;
1482                 error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
1483                     &lport, (struct sockaddr *)& fsin, fport, cred,
1484                     INPLOOKUP_WILDCARD);
1485                 if (error)
1486                         return (error);
1487         }
1488         *laddrp = laddr.s_addr;
1489         *lportp = lport;
1490         *faddrp = faddr.s_addr;
1491         *fportp = fport;
1492         return (0);
1493 }
1494
1495 void
1496 in_pcbdisconnect(struct inpcb *inp)
1497 {
1498
1499         INP_WLOCK_ASSERT(inp);
1500         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
1501
1502         inp->inp_faddr.s_addr = INADDR_ANY;
1503         inp->inp_fport = 0;
1504         in_pcbrehash(inp);
1505 }
1506 #endif /* INET */
1507
1508 /*
1509  * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
1510  * For most protocols, this will be invoked immediately prior to calling
1511  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
1512  * socket, in which case in_pcbfree() is deferred.
1513  */
1514 void
1515 in_pcbdetach(struct inpcb *inp)
1516 {
1517
1518         KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
1519
1520 #ifdef RATELIMIT
1521         if (inp->inp_snd_tag != NULL)
1522                 in_pcbdetach_txrtlmt(inp);
1523 #endif
1524         inp->inp_socket->so_pcb = NULL;
1525         inp->inp_socket = NULL;
1526 }
1527
1528 /*
1529  * in_pcbref() bumps the reference count on an inpcb in order to maintain
1530  * stability of an inpcb pointer despite the inpcb lock being released.  This
1531  * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
1532  * but where the inpcb lock may already held, or when acquiring a reference
1533  * via a pcbgroup.
1534  *
1535  * in_pcbref() should be used only to provide brief memory stability, and
1536  * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
1537  * garbage collect the inpcb if it has been in_pcbfree()'d from another
1538  * context.  Until in_pcbrele() has returned that the inpcb is still valid,
1539  * lock and rele are the *only* safe operations that may be performed on the
1540  * inpcb.
1541  *
1542  * While the inpcb will not be freed, releasing the inpcb lock means that the
1543  * connection's state may change, so the caller should be careful to
1544  * revalidate any cached state on reacquiring the lock.  Drop the reference
1545  * using in_pcbrele().
1546  */
1547 void
1548 in_pcbref(struct inpcb *inp)
1549 {
1550
1551         KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1552
1553         refcount_acquire(&inp->inp_refcount);
1554 }
1555
1556 /*
1557  * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
1558  * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
1559  * return a flag indicating whether or not the inpcb remains valid.  If it is
1560  * valid, we return with the inpcb lock held.
1561  *
1562  * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
1563  * reference on an inpcb.  Historically more work was done here (actually, in
1564  * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
1565  * need for the pcbinfo lock in in_pcbrele().  Deferring the free is entirely
1566  * about memory stability (and continued use of the write lock).
1567  */
1568 int
1569 in_pcbrele_rlocked(struct inpcb *inp)
1570 {
1571         struct inpcbinfo *pcbinfo;
1572
1573         KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1574
1575         INP_RLOCK_ASSERT(inp);
1576
1577         if (refcount_release(&inp->inp_refcount) == 0) {
1578                 /*
1579                  * If the inpcb has been freed, let the caller know, even if
1580                  * this isn't the last reference.
1581                  */
1582                 if (inp->inp_flags2 & INP_FREED) {
1583                         INP_RUNLOCK(inp);
1584                         return (1);
1585                 }
1586                 return (0);
1587         }
1588
1589         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1590 #ifdef TCPHPTS
1591         if (inp->inp_in_hpts || inp->inp_in_input) {
1592                 struct tcp_hpts_entry *hpts;
1593                 /*
1594                  * We should not be on the hpts at
1595                  * this point in any form. we must
1596                  * get the lock to be sure.
1597                  */
1598                 hpts = tcp_hpts_lock(inp);
1599                 if (inp->inp_in_hpts)
1600                         panic("Hpts:%p inp:%p at free still on hpts",
1601                               hpts, inp);
1602                 mtx_unlock(&hpts->p_mtx);
1603                 hpts = tcp_input_lock(inp);
1604                 if (inp->inp_in_input)
1605                         panic("Hpts:%p inp:%p at free still on input hpts",
1606                               hpts, inp);
1607                 mtx_unlock(&hpts->p_mtx);
1608         }
1609 #endif
1610         INP_RUNLOCK(inp);
1611         pcbinfo = inp->inp_pcbinfo;
1612         uma_zfree(pcbinfo->ipi_zone, inp);
1613         return (1);
1614 }
1615
1616 int
1617 in_pcbrele_wlocked(struct inpcb *inp)
1618 {
1619         struct inpcbinfo *pcbinfo;
1620
1621         KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1622
1623         INP_WLOCK_ASSERT(inp);
1624
1625         if (refcount_release(&inp->inp_refcount) == 0) {
1626                 /*
1627                  * If the inpcb has been freed, let the caller know, even if
1628                  * this isn't the last reference.
1629                  */
1630                 if (inp->inp_flags2 & INP_FREED) {
1631                         INP_WUNLOCK(inp);
1632                         return (1);
1633                 }
1634                 return (0);
1635         }
1636
1637         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1638 #ifdef TCPHPTS
1639         if (inp->inp_in_hpts || inp->inp_in_input) {
1640                 struct tcp_hpts_entry *hpts;
1641                 /*
1642                  * We should not be on the hpts at
1643                  * this point in any form. we must
1644                  * get the lock to be sure.
1645                  */
1646                 hpts = tcp_hpts_lock(inp);
1647                 if (inp->inp_in_hpts)
1648                         panic("Hpts:%p inp:%p at free still on hpts",
1649                               hpts, inp);
1650                 mtx_unlock(&hpts->p_mtx);
1651                 hpts = tcp_input_lock(inp);
1652                 if (inp->inp_in_input)
1653                         panic("Hpts:%p inp:%p at free still on input hpts",
1654                               hpts, inp);
1655                 mtx_unlock(&hpts->p_mtx);
1656         }
1657 #endif
1658         INP_WUNLOCK(inp);
1659         pcbinfo = inp->inp_pcbinfo;
1660         uma_zfree(pcbinfo->ipi_zone, inp);
1661         return (1);
1662 }
1663
/*
 * Temporary compatibility wrapper: forwards to in_pcbrele_wlocked() and
 * returns its result unchanged.
 */
int
in_pcbrele(struct inpcb *inp)
{
	int freed;

	freed = in_pcbrele_wlocked(inp);
	return (freed);
}
1673
1674 void
1675 in_pcblist_rele_rlocked(epoch_context_t ctx)
1676 {
1677         struct in_pcblist *il;
1678         struct inpcb *inp;
1679         struct inpcbinfo *pcbinfo;
1680         int i, n;
1681
1682         il = __containerof(ctx, struct in_pcblist, il_epoch_ctx);
1683         pcbinfo = il->il_pcbinfo;
1684         n = il->il_count;
1685         INP_INFO_WLOCK(pcbinfo);
1686         for (i = 0; i < n; i++) {
1687                 inp = il->il_inp_list[i];
1688                 INP_RLOCK(inp);
1689                 if (!in_pcbrele_rlocked(inp))
1690                         INP_RUNLOCK(inp);
1691         }
1692         INP_INFO_WUNLOCK(pcbinfo);
1693         free(il, M_TEMP);
1694 }
1695
1696 static void
1697 inpcbport_free(epoch_context_t ctx)
1698 {
1699         struct inpcbport *phd;
1700
1701         phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx);
1702         free(phd, M_PCB);
1703 }
1704
1705 static void
1706 in_pcbfree_deferred(epoch_context_t ctx)
1707 {
1708         struct inpcb *inp;
1709         int released __unused;
1710
1711         inp = __containerof(ctx, struct inpcb, inp_epoch_ctx);
1712
1713         INP_WLOCK(inp);
1714         CURVNET_SET(inp->inp_vnet);
1715 #ifdef INET
1716         struct ip_moptions *imo = inp->inp_moptions;
1717         inp->inp_moptions = NULL;
1718 #endif
1719         /* XXXRW: Do as much as possible here. */
1720 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1721         if (inp->inp_sp != NULL)
1722                 ipsec_delete_pcbpolicy(inp);
1723 #endif
1724 #ifdef INET6
1725         struct ip6_moptions *im6o = NULL;
1726         if (inp->inp_vflag & INP_IPV6PROTO) {
1727                 ip6_freepcbopts(inp->in6p_outputopts);
1728                 im6o = inp->in6p_moptions;
1729                 inp->in6p_moptions = NULL;
1730         }
1731 #endif
1732         if (inp->inp_options)
1733                 (void)m_free(inp->inp_options);
1734         inp->inp_vflag = 0;
1735         crfree(inp->inp_cred);
1736 #ifdef MAC
1737         mac_inpcb_destroy(inp);
1738 #endif
1739         released = in_pcbrele_wlocked(inp);
1740         MPASS(released);
1741 #ifdef INET6
1742         ip6_freemoptions(im6o);
1743 #endif
1744 #ifdef INET
1745         inp_freemoptions(imo);
1746 #endif
1747         CURVNET_RESTORE();
1748 }
1749
1750 /*
1751  * Unconditionally schedule an inpcb to be freed by decrementing its
1752  * reference count, which should occur only after the inpcb has been detached
1753  * from its socket.  If another thread holds a temporary reference (acquired
1754  * using in_pcbref()) then the free is deferred until that reference is
1755  * released using in_pcbrele(), but the inpcb is still unlocked.  Almost all
1756  * work, including removal from global lists, is done in this context, where
1757  * the pcbinfo lock is held.
1758  */
1759 void
1760 in_pcbfree(struct inpcb *inp)
1761 {
1762         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1763
1764         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1765         KASSERT((inp->inp_flags2 & INP_FREED) == 0,
1766             ("%s: called twice for pcb %p", __func__, inp));
1767         if (inp->inp_flags2 & INP_FREED) {
1768                 INP_WUNLOCK(inp);
1769                 return;
1770         }
1771
1772         INP_WLOCK_ASSERT(inp);
1773         INP_LIST_WLOCK(pcbinfo);
1774         in_pcbremlists(inp);
1775         INP_LIST_WUNLOCK(pcbinfo);
1776         RO_INVALIDATE_CACHE(&inp->inp_route);
1777         /* mark as destruction in progress */
1778         inp->inp_flags2 |= INP_FREED;
1779         INP_WUNLOCK(inp);
1780         NET_EPOCH_CALL(in_pcbfree_deferred, &inp->inp_epoch_ctx);
1781 }
1782
1783 /*
1784  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
1785  * port reservation, and preventing it from being returned by inpcb lookups.
1786  *
1787  * It is used by TCP to mark an inpcb as unused and avoid future packet
1788  * delivery or event notification when a socket remains open but TCP has
1789  * closed.  This might occur as a result of a shutdown()-initiated TCP close
1790  * or a RST on the wire, and allows the port binding to be reused while still
1791  * maintaining the invariant that so_pcb always points to a valid inpcb until
1792  * in_pcbdetach().
1793  *
1794  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
1795  * in_pcbnotifyall() and in_pcbpurgeif0()?
1796  */
1797 void
1798 in_pcbdrop(struct inpcb *inp)
1799 {
1800
1801         INP_WLOCK_ASSERT(inp);
1802 #ifdef INVARIANTS
1803         if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
1804                 MPASS(inp->inp_refcount > 1);
1805 #endif
1806
1807         /*
1808          * XXXRW: Possibly we should protect the setting of INP_DROPPED with
1809          * the hash lock...?
1810          */
1811         inp->inp_flags |= INP_DROPPED;
1812         if (inp->inp_flags & INP_INHASHLIST) {
1813                 struct inpcbport *phd = inp->inp_phd;
1814
1815                 INP_HASH_WLOCK(inp->inp_pcbinfo);
1816                 in_pcbremlbgrouphash(inp);
1817                 CK_LIST_REMOVE(inp, inp_hash);
1818                 CK_LIST_REMOVE(inp, inp_portlist);
1819                 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
1820                         CK_LIST_REMOVE(phd, phd_hash);
1821                         NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx);
1822                 }
1823                 INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1824                 inp->inp_flags &= ~INP_INHASHLIST;
1825 #ifdef PCBGROUP
1826                 in_pcbgroup_remove(inp);
1827 #endif
1828         }
1829 }
1830
1831 #ifdef INET
1832 /*
1833  * Common routines to return the socket addresses associated with inpcbs.
1834  */
1835 struct sockaddr *
1836 in_sockaddr(in_port_t port, struct in_addr *addr_p)
1837 {
1838         struct sockaddr_in *sin;
1839
1840         sin = malloc(sizeof *sin, M_SONAME,
1841                 M_WAITOK | M_ZERO);
1842         sin->sin_family = AF_INET;
1843         sin->sin_len = sizeof(*sin);
1844         sin->sin_addr = *addr_p;
1845         sin->sin_port = port;
1846
1847         return (struct sockaddr *)sin;
1848 }
1849
1850 int
1851 in_getsockaddr(struct socket *so, struct sockaddr **nam)
1852 {
1853         struct inpcb *inp;
1854         struct in_addr addr;
1855         in_port_t port;
1856
1857         inp = sotoinpcb(so);
1858         KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1859
1860         INP_RLOCK(inp);
1861         port = inp->inp_lport;
1862         addr = inp->inp_laddr;
1863         INP_RUNLOCK(inp);
1864
1865         *nam = in_sockaddr(port, &addr);
1866         return 0;
1867 }
1868
1869 int
1870 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
1871 {
1872         struct inpcb *inp;
1873         struct in_addr addr;
1874         in_port_t port;
1875
1876         inp = sotoinpcb(so);
1877         KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1878
1879         INP_RLOCK(inp);
1880         port = inp->inp_fport;
1881         addr = inp->inp_faddr;
1882         INP_RUNLOCK(inp);
1883
1884         *nam = in_sockaddr(port, &addr);
1885         return 0;
1886 }
1887
1888 void
1889 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
1890     struct inpcb *(*notify)(struct inpcb *, int))
1891 {
1892         struct inpcb *inp, *inp_temp;
1893
1894         INP_INFO_WLOCK(pcbinfo);
1895         CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
1896                 INP_WLOCK(inp);
1897 #ifdef INET6
1898                 if ((inp->inp_vflag & INP_IPV4) == 0) {
1899                         INP_WUNLOCK(inp);
1900                         continue;
1901                 }
1902 #endif
1903                 if (inp->inp_faddr.s_addr != faddr.s_addr ||
1904                     inp->inp_socket == NULL) {
1905                         INP_WUNLOCK(inp);
1906                         continue;
1907                 }
1908                 if ((*notify)(inp, errno))
1909                         INP_WUNLOCK(inp);
1910         }
1911         INP_INFO_WUNLOCK(pcbinfo);
1912 }
1913
1914 void
1915 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
1916 {
1917         struct inpcb *inp;
1918         struct in_multi *inm;
1919         struct in_mfilter *imf;
1920         struct ip_moptions *imo;
1921
1922         INP_INFO_WLOCK(pcbinfo);
1923         CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1924                 INP_WLOCK(inp);
1925                 imo = inp->inp_moptions;
1926                 if ((inp->inp_vflag & INP_IPV4) &&
1927                     imo != NULL) {
1928                         /*
1929                          * Unselect the outgoing interface if it is being
1930                          * detached.
1931                          */
1932                         if (imo->imo_multicast_ifp == ifp)
1933                                 imo->imo_multicast_ifp = NULL;
1934
1935                         /*
1936                          * Drop multicast group membership if we joined
1937                          * through the interface being detached.
1938                          *
1939                          * XXX This can all be deferred to an epoch_call
1940                          */
1941 restart:
1942                         IP_MFILTER_FOREACH(imf, &imo->imo_head) {
1943                                 if ((inm = imf->imf_inm) == NULL)
1944                                         continue;
1945                                 if (inm->inm_ifp != ifp)
1946                                         continue;
1947                                 ip_mfilter_remove(&imo->imo_head, imf);
1948                                 IN_MULTI_LOCK_ASSERT();
1949                                 in_leavegroup_locked(inm, NULL);
1950                                 ip_mfilter_free(imf);
1951                                 goto restart;
1952                         }
1953                 }
1954                 INP_WUNLOCK(inp);
1955         }
1956         INP_INFO_WUNLOCK(pcbinfo);
1957 }
1958
1959 /*
1960  * Lookup a PCB based on the local address and port.  Caller must hold the
1961  * hash lock.  No inpcb locks or references are acquired.
1962  */
1963 #define INP_LOOKUP_MAPPED_PCB_COST      3
1964 struct inpcb *
1965 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
1966     u_short lport, int lookupflags, struct ucred *cred)
1967 {
1968         struct inpcb *inp;
1969 #ifdef INET6
1970         int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
1971 #else
1972         int matchwild = 3;
1973 #endif
1974         int wildcard;
1975
1976         KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
1977             ("%s: invalid lookup flags %d", __func__, lookupflags));
1978
1979         INP_HASH_LOCK_ASSERT(pcbinfo);
1980
1981         if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
1982                 struct inpcbhead *head;
1983                 /*
1984                  * Look for an unconnected (wildcard foreign addr) PCB that
1985                  * matches the local address and port we're looking for.
1986                  */
1987                 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
1988                     0, pcbinfo->ipi_hashmask)];
1989                 CK_LIST_FOREACH(inp, head, inp_hash) {
1990 #ifdef INET6
1991                         /* XXX inp locking */
1992                         if ((inp->inp_vflag & INP_IPV4) == 0)
1993                                 continue;
1994 #endif
1995                         if (inp->inp_faddr.s_addr == INADDR_ANY &&
1996                             inp->inp_laddr.s_addr == laddr.s_addr &&
1997                             inp->inp_lport == lport) {
1998                                 /*
1999                                  * Found?
2000                                  */
2001                                 if (cred == NULL ||
2002                                     prison_equal_ip4(cred->cr_prison,
2003                                         inp->inp_cred->cr_prison))
2004                                         return (inp);
2005                         }
2006                 }
2007                 /*
2008                  * Not found.
2009                  */
2010                 return (NULL);
2011         } else {
2012                 struct inpcbporthead *porthash;
2013                 struct inpcbport *phd;
2014                 struct inpcb *match = NULL;
2015                 /*
2016                  * Best fit PCB lookup.
2017                  *
2018                  * First see if this local port is in use by looking on the
2019                  * port hash list.
2020                  */
2021                 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
2022                     pcbinfo->ipi_porthashmask)];
2023                 CK_LIST_FOREACH(phd, porthash, phd_hash) {
2024                         if (phd->phd_port == lport)
2025                                 break;
2026                 }
2027                 if (phd != NULL) {
2028                         /*
2029                          * Port is in use by one or more PCBs. Look for best
2030                          * fit.
2031                          */
2032                         CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
2033                                 wildcard = 0;
2034                                 if (cred != NULL &&
2035                                     !prison_equal_ip4(inp->inp_cred->cr_prison,
2036                                         cred->cr_prison))
2037                                         continue;
2038 #ifdef INET6
2039                                 /* XXX inp locking */
2040                                 if ((inp->inp_vflag & INP_IPV4) == 0)
2041                                         continue;
2042                                 /*
2043                                  * We never select the PCB that has
2044                                  * INP_IPV6 flag and is bound to :: if
2045                                  * we have another PCB which is bound
2046                                  * to 0.0.0.0.  If a PCB has the
2047                                  * INP_IPV6 flag, then we set its cost
2048                                  * higher than IPv4 only PCBs.
2049                                  *
2050                                  * Note that the case only happens
2051                                  * when a socket is bound to ::, under
2052                                  * the condition that the use of the
2053                                  * mapped address is allowed.
2054                                  */
2055                                 if ((inp->inp_vflag & INP_IPV6) != 0)
2056                                         wildcard += INP_LOOKUP_MAPPED_PCB_COST;
2057 #endif
2058                                 if (inp->inp_faddr.s_addr != INADDR_ANY)
2059                                         wildcard++;
2060                                 if (inp->inp_laddr.s_addr != INADDR_ANY) {
2061                                         if (laddr.s_addr == INADDR_ANY)
2062                                                 wildcard++;
2063                                         else if (inp->inp_laddr.s_addr != laddr.s_addr)
2064                                                 continue;
2065                                 } else {
2066                                         if (laddr.s_addr != INADDR_ANY)
2067                                                 wildcard++;
2068                                 }
2069                                 if (wildcard < matchwild) {
2070                                         match = inp;
2071                                         matchwild = wildcard;
2072                                         if (matchwild == 0)
2073                                                 break;
2074                                 }
2075                         }
2076                 }
2077                 return (match);
2078         }
2079 }
2080 #undef INP_LOOKUP_MAPPED_PCB_COST
2081
2082 static struct inpcb *
2083 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
2084     const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
2085     uint16_t fport, int lookupflags, int numa_domain)
2086 {
2087         struct inpcb *local_wild, *numa_wild;
2088         const struct inpcblbgrouphead *hdr;
2089         struct inpcblbgroup *grp;
2090         uint32_t idx;
2091
2092         INP_HASH_LOCK_ASSERT(pcbinfo);
2093
2094         hdr = &pcbinfo->ipi_lbgrouphashbase[
2095             INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
2096
2097         /*
2098          * Order of socket selection:
2099          * 1. non-wild.
2100          * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
2101          *
2102          * NOTE:
2103          * - Load balanced group does not contain jailed sockets
2104          * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
2105          */
2106         local_wild = NULL;
2107         numa_wild = NULL;
2108         CK_LIST_FOREACH(grp, hdr, il_list) {
2109 #ifdef INET6
2110                 if (!(grp->il_vflag & INP_IPV4))
2111                         continue;
2112 #endif
2113                 if (grp->il_lport != lport)
2114                         continue;
2115
2116                 idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) %
2117                     grp->il_inpcnt;
2118                 if (grp->il_laddr.s_addr == laddr->s_addr) {
2119                         if (numa_domain == M_NODOM ||
2120                             grp->il_numa_domain == numa_domain) {
2121                                 return (grp->il_inp[idx]);
2122                         } else {
2123                                 numa_wild = grp->il_inp[idx];
2124                         }
2125                 }
2126                 if (grp->il_laddr.s_addr == INADDR_ANY &&
2127                     (lookupflags & INPLOOKUP_WILDCARD) != 0 &&
2128                     (local_wild == NULL || numa_domain == M_NODOM ||
2129                         grp->il_numa_domain == numa_domain)) {
2130                         local_wild = grp->il_inp[idx];
2131                 }
2132         }
2133         if (numa_wild != NULL)
2134                 return (numa_wild);
2135
2136         return (local_wild);
2137 }
2138
#ifdef PCBGROUP
/*
 * Lookup PCB in hash list, using pcbgroup tables.
 *
 * The search runs with the pcbgroup lock held and proceeds in stages:
 *   1. exact 4-tuple match in the group's hash table;
 *   2. (RSS only) wildcard match in the group's hash table;
 *   3. wildcard match in the pcbinfo-wide wildcard table.
 * Stages 2 and 3 are only attempted when lookupflags contains
 * INPLOOKUP_WILDCARD.
 *
 * lookupflags must contain INPLOOKUP_WLOCKPCB or INPLOOKUP_RLOCKPCB; the
 * function panics otherwise once a candidate has been found.  On success
 * the inpcb is returned locked in the requested mode.  NULL is returned
 * when nothing matches or when the match turned out to be freed.  The
 * pcbgroup lock is released on every return path.
 *
 * The ifp argument is not used by this function.
 */
static struct inpcb *
in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
    struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
    u_int lport_arg, int lookupflags, struct ifnet *ifp)
{
	struct inpcbhead *head;
	struct inpcb *inp, *tmpinp;
	u_short fport = fport_arg, lport = lport_arg;
	bool locked;

	/*
	 * First look for an exact match.
	 */
	tmpinp = NULL;
	INP_GROUP_LOCK(pcbgroup);
	head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
	    pcbgroup->ipg_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
#ifdef INET6
		/* XXX inp locking */
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			/*
			 * XXX We should be able to directly return
			 * the inp here, without any checks.
			 * Well unless both bound with SO_REUSEPORT?
			 */
			if (prison_flag(inp->inp_cred, PR_IP4))
				goto found;
			/* Remember the first non-jailed exact match. */
			if (tmpinp == NULL)
				tmpinp = inp;
		}
	}
	if (tmpinp != NULL) {
		inp = tmpinp;
		goto found;
	}

#ifdef	RSS
	/*
	 * For incoming connections, we may wish to do a wildcard
	 * match for an RSS-local socket.
	 */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		struct inpcb *local_wild = NULL, *local_exact = NULL;
#ifdef INET6
		struct inpcb *local_wild_mapped = NULL;
#endif
		struct inpcb *jail_wild = NULL;
		int injail;

		/*
		 * Order of socket selection - we always prefer jails.
		 *      1. jailed, non-wild.
		 *      2. jailed, wild.
		 *      3. non-jailed, non-wild.
		 *      4. non-jailed, wild.
		 */

		head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
		    lport, 0, pcbgroup->ipg_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr != INADDR_ANY ||
			    inp->inp_lport != lport)
				continue;

			injail = prison_flag(inp->inp_cred, PR_IP4);
			if (injail) {
				if (prison_check_ip4(inp->inp_cred,
				    &laddr) != 0)
					continue;
			} else {
				if (local_exact != NULL)
					continue;
			}

			if (inp->inp_laddr.s_addr == laddr.s_addr) {
				if (injail)
					goto found;
				else
					local_exact = inp;
			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#ifdef INET6
				/* XXX inp locking, NULL check */
				if (inp->inp_vflag & INP_IPV6PROTO)
					local_wild_mapped = inp;
				else
#endif
					if (injail)
						jail_wild = inp;
					else
						local_wild = inp;
			}
		} /* LIST_FOREACH */

		inp = jail_wild;
		if (inp == NULL)
			inp = local_exact;
		if (inp == NULL)
			inp = local_wild;
#ifdef INET6
		if (inp == NULL)
			inp = local_wild_mapped;
#endif
		if (inp != NULL)
			goto found;
	}
#endif

	/*
	 * Then look for a wildcard match, if requested.
	 */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		struct inpcb *local_wild = NULL, *local_exact = NULL;
#ifdef INET6
		struct inpcb *local_wild_mapped = NULL;
#endif
		struct inpcb *jail_wild = NULL;
		int injail;

		/*
		 * Order of socket selection - we always prefer jails.
		 *      1. jailed, non-wild.
		 *      2. jailed, wild.
		 *      3. non-jailed, non-wild.
		 *      4. non-jailed, wild.
		 */
		head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
		    0, pcbinfo->ipi_wildmask)];
		CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr != INADDR_ANY ||
			    inp->inp_lport != lport)
				continue;

			injail = prison_flag(inp->inp_cred, PR_IP4);
			if (injail) {
				if (prison_check_ip4(inp->inp_cred,
				    &laddr) != 0)
					continue;
			} else {
				if (local_exact != NULL)
					continue;
			}

			if (inp->inp_laddr.s_addr == laddr.s_addr) {
				if (injail)
					goto found;
				else
					local_exact = inp;
			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#ifdef INET6
				/* XXX inp locking, NULL check */
				if (inp->inp_vflag & INP_IPV6PROTO)
					local_wild_mapped = inp;
				else
#endif
					if (injail)
						jail_wild = inp;
					else
						local_wild = inp;
			}
		} /* LIST_FOREACH */
		inp = jail_wild;
		if (inp == NULL)
			inp = local_exact;
		if (inp == NULL)
			inp = local_wild;
#ifdef INET6
		if (inp == NULL)
			inp = local_wild_mapped;
#endif
		if (inp != NULL)
			goto found;
	} /* if (lookupflags & INPLOOKUP_WILDCARD) */
	INP_GROUP_UNLOCK(pcbgroup);
	return (NULL);

found:
	/*
	 * Try to take the requested inpcb lock without blocking while the
	 * group lock is still held.
	 */
	if (lookupflags & INPLOOKUP_WLOCKPCB)
		locked = INP_TRY_WLOCK(inp);
	else if (lookupflags & INPLOOKUP_RLOCKPCB)
		locked = INP_TRY_RLOCK(inp);
	else
		panic("%s: locking bug", __func__);
	if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) {
		/*
		 * The match is already being destroyed.  Drop the inpcb
		 * lock and the group lock before reporting a miss, so the
		 * group lock is not left held on this return path.
		 */
		if (lookupflags & INPLOOKUP_WLOCKPCB)
			INP_WUNLOCK(inp);
		else
			INP_RUNLOCK(inp);
		INP_GROUP_UNLOCK(pcbgroup);
		return (NULL);
	} else if (!locked)
		in_pcbref(inp);	/* keep the memory stable while we block */
	INP_GROUP_UNLOCK(pcbgroup);
	if (!locked) {
		/*
		 * The try-lock failed: block on the inpcb lock now that the
		 * group lock is gone, then trade the temporary reference
		 * back.  A non-zero return from the release means the inpcb
		 * was freed meanwhile and its lock has been dropped.
		 */
		if (lookupflags & INPLOOKUP_WLOCKPCB) {
			INP_WLOCK(inp);
			if (in_pcbrele_wlocked(inp))
				return (NULL);
		} else {
			INP_RLOCK(inp);
			if (in_pcbrele_rlocked(inp))
				return (NULL);
		}
	}
#ifdef INVARIANTS
	if (lookupflags & INPLOOKUP_WLOCKPCB)
		INP_WLOCK_ASSERT(inp);
	else
		INP_RLOCK_ASSERT(inp);
#endif
	return (inp);
}
#endif /* PCBGROUP */
2373
2374 /*
2375  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
2376  * that the caller has locked the hash list, and will not perform any further
2377  * locking or reference operations on either the hash list or the connection.
2378  */
2379 static struct inpcb *
2380 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2381     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
2382     struct ifnet *ifp, uint8_t numa_domain)
2383 {
2384         struct inpcbhead *head;
2385         struct inpcb *inp, *tmpinp;
2386         u_short fport = fport_arg, lport = lport_arg;
2387
2388         KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
2389             ("%s: invalid lookup flags %d", __func__, lookupflags));
2390         INP_HASH_LOCK_ASSERT(pcbinfo);
2391
2392         /*
2393          * First look for an exact match.
2394          */
2395         tmpinp = NULL;
2396         head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
2397             pcbinfo->ipi_hashmask)];
2398         CK_LIST_FOREACH(inp, head, inp_hash) {
2399 #ifdef INET6
2400                 /* XXX inp locking */
2401                 if ((inp->inp_vflag & INP_IPV4) == 0)
2402                         continue;
2403 #endif
2404                 if (inp->inp_faddr.s_addr == faddr.s_addr &&
2405                     inp->inp_laddr.s_addr == laddr.s_addr &&
2406                     inp->inp_fport == fport &&
2407                     inp->inp_lport == lport) {
2408                         /*
2409                          * XXX We should be able to directly return
2410                          * the inp here, without any checks.
2411                          * Well unless both bound with SO_REUSEPORT?
2412                          */
2413                         if (prison_flag(inp->inp_cred, PR_IP4))
2414                                 return (inp);
2415                         if (tmpinp == NULL)
2416                                 tmpinp = inp;
2417                 }
2418         }
2419         if (tmpinp != NULL)
2420                 return (tmpinp);
2421
2422         /*
2423          * Then look in lb group (for wildcard match).
2424          */
2425         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2426                 inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
2427                     fport, lookupflags, numa_domain);
2428                 if (inp != NULL)
2429                         return (inp);
2430         }
2431
2432         /*
2433          * Then look for a wildcard match, if requested.
2434          */
2435         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2436                 struct inpcb *local_wild = NULL, *local_exact = NULL;
2437 #ifdef INET6
2438                 struct inpcb *local_wild_mapped = NULL;
2439 #endif
2440                 struct inpcb *jail_wild = NULL;
2441                 int injail;
2442
2443                 /*
2444                  * Order of socket selection - we always prefer jails.
2445                  *      1. jailed, non-wild.
2446                  *      2. jailed, wild.
2447                  *      3. non-jailed, non-wild.
2448                  *      4. non-jailed, wild.
2449                  */
2450
2451                 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
2452                     0, pcbinfo->ipi_hashmask)];
2453                 CK_LIST_FOREACH(inp, head, inp_hash) {
2454 #ifdef INET6
2455                         /* XXX inp locking */
2456                         if ((inp->inp_vflag & INP_IPV4) == 0)
2457                                 continue;
2458 #endif
2459                         if (inp->inp_faddr.s_addr != INADDR_ANY ||
2460                             inp->inp_lport != lport)
2461                                 continue;
2462
2463                         injail = prison_flag(inp->inp_cred, PR_IP4);
2464                         if (injail) {
2465                                 if (prison_check_ip4(inp->inp_cred,
2466                                     &laddr) != 0)
2467                                         continue;
2468                         } else {
2469                                 if (local_exact != NULL)
2470                                         continue;
2471                         }
2472
2473                         if (inp->inp_laddr.s_addr == laddr.s_addr) {
2474                                 if (injail)
2475                                         return (inp);
2476                                 else
2477                                         local_exact = inp;
2478                         } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
2479 #ifdef INET6
2480                                 /* XXX inp locking, NULL check */
2481                                 if (inp->inp_vflag & INP_IPV6PROTO)
2482                                         local_wild_mapped = inp;
2483                                 else
2484 #endif
2485                                         if (injail)
2486                                                 jail_wild = inp;
2487                                         else
2488                                                 local_wild = inp;
2489                         }
2490                 } /* LIST_FOREACH */
2491                 if (jail_wild != NULL)
2492                         return (jail_wild);
2493                 if (local_exact != NULL)
2494                         return (local_exact);
2495                 if (local_wild != NULL)
2496                         return (local_wild);
2497 #ifdef INET6
2498                 if (local_wild_mapped != NULL)
2499                         return (local_wild_mapped);
2500 #endif
2501         } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
2502
2503         return (NULL);
2504 }
2505
2506 /*
2507  * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
2508  * hash list lock, and will return the inpcb locked (i.e., requires
2509  * INPLOOKUP_LOCKPCB).
2510  */
2511 static struct inpcb *
2512 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2513     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2514     struct ifnet *ifp, uint8_t numa_domain)
2515 {
2516         struct inpcb *inp;
2517
2518         inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
2519             (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp,
2520             numa_domain);
2521         if (inp != NULL) {
2522                 if (lookupflags & INPLOOKUP_WLOCKPCB) {
2523                         INP_WLOCK(inp);
2524                         if (__predict_false(inp->inp_flags2 & INP_FREED)) {
2525                                 INP_WUNLOCK(inp);
2526                                 inp = NULL;
2527                         }
2528                 } else if (lookupflags & INPLOOKUP_RLOCKPCB) {
2529                         INP_RLOCK(inp);
2530                         if (__predict_false(inp->inp_flags2 & INP_FREED)) {
2531                                 INP_RUNLOCK(inp);
2532                                 inp = NULL;
2533                         }
2534                 } else
2535                         panic("%s: locking bug", __func__);
2536 #ifdef INVARIANTS
2537                 if (inp != NULL) {
2538                         if (lookupflags & INPLOOKUP_WLOCKPCB)
2539                                 INP_WLOCK_ASSERT(inp);
2540                         else
2541                                 INP_RLOCK_ASSERT(inp);
2542                 }
2543 #endif
2544         }
2545
2546         return (inp);
2547 }
2548
2549 /*
2550  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2551  * from which a pre-calculated hash value may be extracted.
2552  *
2553  * Possibly more of this logic should be in in_pcbgroup.c.
2554  */
2555 struct inpcb *
2556 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2557     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
2558 {
2559 #if defined(PCBGROUP) && !defined(RSS)
2560         struct inpcbgroup *pcbgroup;
2561 #endif
2562
2563         KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2564             ("%s: invalid lookup flags %d", __func__, lookupflags));
2565         KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2566             ("%s: LOCKPCB not set", __func__));
2567
2568         /*
2569          * When not using RSS, use connection groups in preference to the
2570          * reservation table when looking up 4-tuples.  When using RSS, just
2571          * use the reservation table, due to the cost of the Toeplitz hash
2572          * in software.
2573          *
2574          * XXXRW: This policy belongs in the pcbgroup code, as in principle
2575          * we could be doing RSS with a non-Toeplitz hash that is affordable
2576          * in software.
2577          */
2578 #if defined(PCBGROUP) && !defined(RSS)
2579         if (in_pcbgroup_enabled(pcbinfo)) {
2580                 pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
2581                     fport);
2582                 return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
2583                     laddr, lport, lookupflags, ifp));
2584         }
2585 #endif
2586         return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2587             lookupflags, ifp, M_NODOM));
2588 }
2589
2590 struct inpcb *
2591 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2592     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2593     struct ifnet *ifp, struct mbuf *m)
2594 {
2595 #ifdef PCBGROUP
2596         struct inpcbgroup *pcbgroup;
2597 #endif
2598
2599         KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2600             ("%s: invalid lookup flags %d", __func__, lookupflags));
2601         KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2602             ("%s: LOCKPCB not set", __func__));
2603
2604 #ifdef PCBGROUP
2605         /*
2606          * If we can use a hardware-generated hash to look up the connection
2607          * group, use that connection group to find the inpcb.  Otherwise
2608          * fall back on a software hash -- or the reservation table if we're
2609          * using RSS.
2610          *
2611          * XXXRW: As above, that policy belongs in the pcbgroup code.
2612          */
2613         if (in_pcbgroup_enabled(pcbinfo) &&
2614             !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
2615                 pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
2616                     m->m_pkthdr.flowid);
2617                 if (pcbgroup != NULL)
2618                         return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
2619                             fport, laddr, lport, lookupflags, ifp));
2620 #ifndef RSS
2621                 pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
2622                     fport);
2623                 return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
2624                     laddr, lport, lookupflags, ifp));
2625 #endif
2626         }
2627 #endif
2628         return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2629             lookupflags, ifp, m->m_pkthdr.numa_domain));
2630 }
2631 #endif /* INET */
2632
2633 /*
2634  * Insert PCB onto various hash lists.
2635  */
2636 static int
2637 in_pcbinshash_internal(struct inpcb *inp, struct mbuf *m)
2638 {
2639         struct inpcbhead *pcbhash;
2640         struct inpcbporthead *pcbporthash;
2641         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2642         struct inpcbport *phd;
2643         u_int32_t hashkey_faddr;
2644         int so_options;
2645
2646         INP_WLOCK_ASSERT(inp);
2647         INP_HASH_WLOCK_ASSERT(pcbinfo);
2648
2649         KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
2650             ("in_pcbinshash: INP_INHASHLIST"));
2651
2652 #ifdef INET6
2653         if (inp->inp_vflag & INP_IPV6)
2654                 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
2655         else
2656 #endif
2657         hashkey_faddr = inp->inp_faddr.s_addr;
2658
2659         pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
2660                  inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2661
2662         pcbporthash = &pcbinfo->ipi_porthashbase[
2663             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
2664
2665         /*
2666          * Add entry to load balance group.
2667          * Only do this if SO_REUSEPORT_LB is set.
2668          */
2669         so_options = inp_so_options(inp);
2670         if (so_options & SO_REUSEPORT_LB) {
2671                 int ret = in_pcbinslbgrouphash(inp, M_NODOM);
2672                 if (ret) {
2673                         /* pcb lb group malloc fail (ret=ENOBUFS). */
2674                         return (ret);
2675                 }
2676         }
2677
2678         /*
2679          * Go through port list and look for a head for this lport.
2680          */
2681         CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
2682                 if (phd->phd_port == inp->inp_lport)
2683                         break;
2684         }
2685         /*
2686          * If none exists, malloc one and tack it on.
2687          */
2688         if (phd == NULL) {
2689                 phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
2690                 if (phd == NULL) {
2691                         return (ENOBUFS); /* XXX */
2692                 }
2693                 bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context));
2694                 phd->phd_port = inp->inp_lport;
2695                 CK_LIST_INIT(&phd->phd_pcblist);
2696                 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
2697         }
2698         inp->inp_phd = phd;
2699         CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
2700         CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
2701         inp->inp_flags |= INP_INHASHLIST;
2702 #ifdef PCBGROUP
2703         if (m != NULL) {
2704                 in_pcbgroup_update_mbuf(inp, m);
2705         } else {
2706                 in_pcbgroup_update(inp);
2707         }
2708 #endif
2709         return (0);
2710 }
2711
2712 int
2713 in_pcbinshash(struct inpcb *inp)
2714 {
2715
2716         return (in_pcbinshash_internal(inp, NULL));
2717 }
2718
/*
 * Insert a PCB onto the hash lists, passing the mbuf m through to
 * in_pcbinshash_internal().  Returns that function's result.
 */
int
in_pcbinshash_mbuf(struct inpcb *inp, struct mbuf *m)
{
	int error;

	error = in_pcbinshash_internal(inp, m);
	return (error);
}
2725
2726 /*
2727  * Move PCB to the proper hash bucket when { faddr, fport } have  been
2728  * changed. NOTE: This does not handle the case of the lport changing (the
2729  * hashed port list would have to be updated as well), so the lport must
2730  * not change after in_pcbinshash() has been called.
2731  */
2732 void
2733 in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
2734 {
2735         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2736         struct inpcbhead *head;
2737         u_int32_t hashkey_faddr;
2738
2739         INP_WLOCK_ASSERT(inp);
2740         INP_HASH_WLOCK_ASSERT(pcbinfo);
2741
2742         KASSERT(inp->inp_flags & INP_INHASHLIST,
2743             ("in_pcbrehash: !INP_INHASHLIST"));
2744
2745 #ifdef INET6
2746         if (inp->inp_vflag & INP_IPV6)
2747                 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
2748         else
2749 #endif
2750         hashkey_faddr = inp->inp_faddr.s_addr;
2751
2752         head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
2753                 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2754
2755         CK_LIST_REMOVE(inp, inp_hash);
2756         CK_LIST_INSERT_HEAD(head, inp, inp_hash);
2757
2758 #ifdef PCBGROUP
2759         if (m != NULL)
2760                 in_pcbgroup_update_mbuf(inp, m);
2761         else
2762                 in_pcbgroup_update(inp);
2763 #endif
2764 }
2765
2766 void
2767 in_pcbrehash(struct inpcb *inp)
2768 {
2769
2770         in_pcbrehash_mbuf(inp, NULL);
2771 }
2772
2773 /*
2774  * Remove PCB from various lists.
2775  */
2776 static void
2777 in_pcbremlists(struct inpcb *inp)
2778 {
2779         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2780
2781         INP_WLOCK_ASSERT(inp);
2782         INP_LIST_WLOCK_ASSERT(pcbinfo);
2783
2784         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
2785         if (inp->inp_flags & INP_INHASHLIST) {
2786                 struct inpcbport *phd = inp->inp_phd;
2787
2788                 INP_HASH_WLOCK(pcbinfo);
2789
2790                 /* XXX: Only do if SO_REUSEPORT_LB set? */
2791                 in_pcbremlbgrouphash(inp);
2792
2793                 CK_LIST_REMOVE(inp, inp_hash);
2794                 CK_LIST_REMOVE(inp, inp_portlist);
2795                 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
2796                         CK_LIST_REMOVE(phd, phd_hash);
2797                         NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx);
2798                 }
2799                 INP_HASH_WUNLOCK(pcbinfo);
2800                 inp->inp_flags &= ~INP_INHASHLIST;
2801         }
2802         CK_LIST_REMOVE(inp, inp_list);
2803         pcbinfo->ipi_count--;
2804 #ifdef PCBGROUP
2805         in_pcbgroup_remove(inp);
2806 #endif
2807 }
2808
2809 /*
2810  * Check for alternatives when higher level complains
2811  * about service problems.  For now, invalidate cached
2812  * routing information.  If the route was created dynamically
2813  * (by a redirect), time to try a default gateway again.
2814  */
2815 void
2816 in_losing(struct inpcb *inp)
2817 {
2818
2819         RO_INVALIDATE_CACHE(&inp->inp_route);
2820         return;
2821 }
2822
/*
 * Propagate a socket-layer set-label operation into the socket's in_pcb.
 * Does nothing unless the kernel is built with MAC.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	/* Take the inpcb write lock first, then the socket lock. */
	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2843
2844 /*
2845  * ipport_tick runs once per second, determining if random port allocation
2846  * should be continued.  If more than ipport_randomcps ports have been
2847  * allocated in the last second, then we return to sequential port
2848  * allocation. We return to random allocation only once we drop below
2849  * ipport_randomcps for at least ipport_randomtime seconds.
2850  */
2851 static void
2852 ipport_tick(void *xtp)
2853 {
2854         VNET_ITERATOR_DECL(vnet_iter);
2855
2856         VNET_LIST_RLOCK_NOSLEEP();
2857         VNET_FOREACH(vnet_iter) {
2858                 CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */
2859                 if (V_ipport_tcpallocs <=
2860                     V_ipport_tcplastcount + V_ipport_randomcps) {
2861                         if (V_ipport_stoprandom > 0)
2862                                 V_ipport_stoprandom--;
2863                 } else
2864                         V_ipport_stoprandom = V_ipport_randomtime;
2865                 V_ipport_tcplastcount = V_ipport_tcpallocs;
2866                 CURVNET_RESTORE();
2867         }
2868         VNET_LIST_RUNLOCK_NOSLEEP();
2869         callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
2870 }
2871
2872 static void
2873 ip_fini(void *xtp)
2874 {
2875
2876         callout_stop(&ipport_tick_callout);
2877 }
2878
2879 /*
2880  * The ipport_callout should start running at about the time we attach the
2881  * inet or inet6 domains.
2882  */
2883 static void
2884 ipport_tick_init(const void *unused __unused)
2885 {
2886
2887         /* Start ipport_tick. */
2888         callout_init(&ipport_tick_callout, 1);
2889         callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
2890         EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
2891                 SHUTDOWN_PRI_DEFAULT);
2892 }
2893 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
2894     ipport_tick_init, NULL);
2895
/* Function form of INP_WLOCK(): write-lock the inpcb. */
void
inp_wlock(struct inpcb *inp)
{
	INP_WLOCK(inp);
}
2902
/* Function form of INP_WUNLOCK(): drop the inpcb write lock. */
void
inp_wunlock(struct inpcb *inp)
{
	INP_WUNLOCK(inp);
}
2909
/* Function form of INP_RLOCK(): read-lock the inpcb. */
void
inp_rlock(struct inpcb *inp)
{
	INP_RLOCK(inp);
}
2916
/* Function form of INP_RUNLOCK(): drop the inpcb read lock. */
void
inp_runlock(struct inpcb *inp)
{
	INP_RUNLOCK(inp);
}
2923
2924 #ifdef INVARIANT_SUPPORT
/* Function form of INP_WLOCK_ASSERT(): assert the inpcb is write-locked. */
void
inp_lock_assert(struct inpcb *inp)
{
	INP_WLOCK_ASSERT(inp);
}
2931
/* Function form of INP_UNLOCK_ASSERT() for the given inpcb. */
void
inp_unlock_assert(struct inpcb *inp)
{
	INP_UNLOCK_ASSERT(inp);
}
2938 #endif
2939
2940 void
2941 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
2942 {
2943         struct inpcb *inp;
2944
2945         INP_INFO_WLOCK(&V_tcbinfo);
2946         CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
2947                 INP_WLOCK(inp);
2948                 func(inp, arg);
2949                 INP_WUNLOCK(inp);
2950         }
2951         INP_INFO_WUNLOCK(&V_tcbinfo);
2952 }
2953
2954 struct socket *
2955 inp_inpcbtosocket(struct inpcb *inp)
2956 {
2957
2958         INP_WLOCK_ASSERT(inp);
2959         return (inp->inp_socket);
2960 }
2961
2962 struct tcpcb *
2963 inp_inpcbtotcpcb(struct inpcb *inp)
2964 {
2965
2966         INP_WLOCK_ASSERT(inp);
2967         return ((struct tcpcb *)inp->inp_ppcb);
2968 }
2969
2970 int
2971 inp_ip_tos_get(const struct inpcb *inp)
2972 {
2973
2974         return (inp->inp_ip_tos);
2975 }
2976
2977 void
2978 inp_ip_tos_set(struct inpcb *inp, int val)
2979 {
2980
2981         inp->inp_ip_tos = val;
2982 }
2983
2984 void
2985 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
2986     uint32_t *faddr, uint16_t *fp)
2987 {
2988
2989         INP_LOCK_ASSERT(inp);
2990         *laddr = inp->inp_laddr.s_addr;
2991         *faddr = inp->inp_faddr.s_addr;
2992         *lp = inp->inp_lport;
2993         *fp = inp->inp_fport;
2994 }
2995
/* Function form of sotoinpcb(). */
struct inpcb *
so_sotoinpcb(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	return (inp);
}
3002
/* Function form of sototcpcb(). */
struct tcpcb *
so_sototcpcb(struct socket *so)
{
	struct tcpcb *tp;

	tp = sototcpcb(so);
	return (tp);
}
3009
3010 /*
3011  * Create an external-format (``xinpcb'') structure using the information in
3012  * the kernel-format in_pcb structure pointed to by inp.  This is done to
3013  * reduce the spew of irrelevant information over this interface, to isolate
3014  * user code from changes in the kernel structure, and potentially to provide
3015  * information-hiding if we decide that some of this information should be
3016  * hidden from users.
3017  */
3018 void
3019 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
3020 {
3021
3022         bzero(xi, sizeof(*xi));
3023         xi->xi_len = sizeof(struct xinpcb);
3024         if (inp->inp_socket)
3025                 sotoxsocket(inp->inp_socket, &xi->xi_socket);
3026         bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
3027         xi->inp_gencnt = inp->inp_gencnt;
3028         xi->inp_ppcb = (uintptr_t)inp->inp_ppcb;
3029         xi->inp_flow = inp->inp_flow;
3030         xi->inp_flowid = inp->inp_flowid;
3031         xi->inp_flowtype = inp->inp_flowtype;
3032         xi->inp_flags = inp->inp_flags;
3033         xi->inp_flags2 = inp->inp_flags2;
3034         xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket;
3035         xi->in6p_cksum = inp->in6p_cksum;
3036         xi->in6p_hops = inp->in6p_hops;
3037         xi->inp_ip_tos = inp->inp_ip_tos;
3038         xi->inp_vflag = inp->inp_vflag;
3039         xi->inp_ip_ttl = inp->inp_ip_ttl;
3040         xi->inp_ip_p = inp->inp_ip_p;
3041         xi->inp_ip_minttl = inp->inp_ip_minttl;
3042 }
3043
3044 #ifdef DDB
/* Print indent spaces via db_printf(); prints nothing when indent <= 0. */
static void
db_print_indent(int indent)
{
	while (indent-- > 0)
		db_printf(" ");
}
3053
3054 static void
3055 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
3056 {
3057         char faddr_str[48], laddr_str[48];
3058
3059         db_print_indent(indent);
3060         db_printf("%s at %p\n", name, inc);
3061
3062         indent += 2;
3063
3064 #ifdef INET6
3065         if (inc->inc_flags & INC_ISIPV6) {
3066                 /* IPv6. */
3067                 ip6_sprintf(laddr_str, &inc->inc6_laddr);
3068                 ip6_sprintf(faddr_str, &inc->inc6_faddr);
3069         } else
3070 #endif
3071         {
3072                 /* IPv4. */
3073                 inet_ntoa_r(inc->inc_laddr, laddr_str);
3074                 inet_ntoa_r(inc->inc_faddr, faddr_str);
3075         }
3076         db_print_indent(indent);
3077         db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
3078             ntohs(inc->inc_lport));
3079         db_print_indent(indent);
3080         db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
3081             ntohs(inc->inc_fport));
3082 }
3083
3084 static void
3085 db_print_inpflags(int inp_flags)
3086 {
3087         int comma;
3088
3089         comma = 0;
3090         if (inp_flags & INP_RECVOPTS) {
3091                 db_printf("%sINP_RECVOPTS", comma ? ", " : "");
3092                 comma = 1;
3093         }
3094         if (inp_flags & INP_RECVRETOPTS) {
3095                 db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
3096                 comma = 1;
3097         }
3098         if (inp_flags & INP_RECVDSTADDR) {
3099                 db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
3100                 comma = 1;
3101         }
3102         if (inp_flags & INP_ORIGDSTADDR) {
3103                 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
3104                 comma = 1;
3105         }
3106         if (inp_flags & INP_HDRINCL) {
3107                 db_printf("%sINP_HDRINCL", comma ? ", " : "");
3108                 comma = 1;
3109         }
3110         if (inp_flags & INP_HIGHPORT) {
3111                 db_printf("%sINP_HIGHPORT", comma ? ", " : "");
3112                 comma = 1;
3113         }
3114         if (inp_flags & INP_LOWPORT) {
3115                 db_printf("%sINP_LOWPORT", comma ? ", " : "");
3116                 comma = 1;
3117         }
3118         if (inp_flags & INP_ANONPORT) {
3119                 db_printf("%sINP_ANONPORT", comma ? ", " : "");
3120                 comma = 1;
3121         }
3122         if (inp_flags & INP_RECVIF) {
3123                 db_printf("%sINP_RECVIF", comma ? ", " : "");
3124                 comma = 1;
3125         }
3126         if (inp_flags & INP_MTUDISC) {
3127                 db_printf("%sINP_MTUDISC", comma ? ", " : "");
3128                 comma = 1;
3129         }
3130         if (inp_flags & INP_RECVTTL) {
3131                 db_printf("%sINP_RECVTTL", comma ? ", " : "");
3132                 comma = 1;
3133         }
3134         if (inp_flags & INP_DONTFRAG) {
3135                 db_printf("%sINP_DONTFRAG", comma ? ", " : "");
3136                 comma = 1;
3137         }
3138         if (inp_flags & INP_RECVTOS) {
3139                 db_printf("%sINP_RECVTOS", comma ? ", " : "");
3140                 comma = 1;
3141         }
3142         if (inp_flags & IN6P_IPV6_V6ONLY) {
3143                 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
3144                 comma = 1;
3145         }
3146         if (inp_flags & IN6P_PKTINFO) {
3147                 db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
3148                 comma = 1;
3149         }
3150         if (inp_flags & IN6P_HOPLIMIT) {
3151                 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
3152                 comma = 1;
3153         }
3154         if (inp_flags & IN6P_HOPOPTS) {
3155                 db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
3156                 comma = 1;
3157         }
3158         if (inp_flags & IN6P_DSTOPTS) {
3159                 db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
3160                 comma = 1;
3161         }
3162         if (inp_flags & IN6P_RTHDR) {
3163                 db_printf("%sIN6P_RTHDR", comma ? ", " : "");
3164                 comma = 1;
3165         }
3166         if (inp_flags & IN6P_RTHDRDSTOPTS) {
3167                 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
3168                 comma = 1;
3169         }
3170         if (inp_flags & IN6P_TCLASS) {
3171                 db_printf("%sIN6P_TCLASS", comma ? ", " : "");
3172                 comma = 1;
3173         }
3174         if (inp_flags & IN6P_AUTOFLOWLABEL) {
3175                 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
3176                 comma = 1;
3177         }
3178         if (inp_flags & INP_TIMEWAIT) {
3179                 db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
3180                 comma  = 1;
3181         }
3182         if (inp_flags & INP_ONESBCAST) {
3183                 db_printf("%sINP_ONESBCAST", comma ? ", " : "");
3184                 comma  = 1;
3185         }
3186         if (inp_flags & INP_DROPPED) {
3187                 db_printf("%sINP_DROPPED", comma ? ", " : "");
3188                 comma  = 1;
3189         }
3190         if (inp_flags & INP_SOCKREF) {
3191                 db_printf("%sINP_SOCKREF", comma ? ", " : "");
3192                 comma  = 1;
3193         }
3194         if (inp_flags & IN6P_RFC2292) {
3195                 db_printf("%sIN6P_RFC2292", comma ? ", " : "");
3196                 comma = 1;
3197         }
3198         if (inp_flags & IN6P_MTU) {
3199                 db_printf("IN6P_MTU%s", comma ? ", " : "");
3200                 comma = 1;
3201         }
3202 }
3203
3204 static void
3205 db_print_inpvflag(u_char inp_vflag)
3206 {
3207         int comma;
3208
3209         comma = 0;
3210         if (inp_vflag & INP_IPV4) {
3211                 db_printf("%sINP_IPV4", comma ? ", " : "");
3212                 comma  = 1;
3213         }
3214         if (inp_vflag & INP_IPV6) {
3215                 db_printf("%sINP_IPV6", comma ? ", " : "");
3216                 comma  = 1;
3217         }
3218         if (inp_vflag & INP_IPV6PROTO) {
3219                 db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
3220                 comma  = 1;
3221         }
3222 }
3223
3224 static void
3225 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
3226 {
3227
3228         db_print_indent(indent);
3229         db_printf("%s at %p\n", name, inp);
3230
3231         indent += 2;
3232
3233         db_print_indent(indent);
3234         db_printf("inp_flow: 0x%x\n", inp->inp_flow);
3235
3236         db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
3237
3238         db_print_indent(indent);
3239         db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
3240             inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
3241
3242         db_print_indent(indent);
3243         db_printf("inp_label: %p   inp_flags: 0x%x (",
3244            inp->inp_label, inp->inp_flags);
3245         db_print_inpflags(inp->inp_flags);
3246         db_printf(")\n");
3247
3248         db_print_indent(indent);
3249         db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
3250             inp->inp_vflag);
3251         db_print_inpvflag(inp->inp_vflag);
3252         db_printf(")\n");
3253
3254         db_print_indent(indent);
3255         db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
3256             inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
3257
3258         db_print_indent(indent);
3259 #ifdef INET6
3260         if (inp->inp_vflag & INP_IPV6) {
3261                 db_printf("in6p_options: %p   in6p_outputopts: %p   "
3262                     "in6p_moptions: %p\n", inp->in6p_options,
3263                     inp->in6p_outputopts, inp->in6p_moptions);
3264                 db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
3265                     "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
3266                     inp->in6p_hops);
3267         } else
3268 #endif
3269         {
3270                 db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
3271                     "inp_ip_moptions: %p\n", inp->inp_ip_tos,
3272                     inp->inp_options, inp->inp_moptions);
3273         }
3274
3275         db_print_indent(indent);
3276         db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
3277             (uintmax_t)inp->inp_gencnt);
3278 }
3279
3280 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
3281 {
3282         struct inpcb *inp;
3283
3284         if (!have_addr) {
3285                 db_printf("usage: show inpcb <addr>\n");
3286                 return;
3287         }
3288         inp = (struct inpcb *)addr;
3289
3290         db_print_inpcb(inp, "inpcb", 0);
3291 }
3292 #endif /* DDB */
3293
3294 #ifdef RATELIMIT
3295 /*
3296  * Modify TX rate limit based on the existing "inp->inp_snd_tag",
3297  * if any.
3298  */
3299 int
3300 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3301 {
3302         union if_snd_tag_modify_params params = {
3303                 .rate_limit.max_rate = max_pacing_rate,
3304                 .rate_limit.flags = M_NOWAIT,
3305         };
3306         struct m_snd_tag *mst;
3307         struct ifnet *ifp;
3308         int error;
3309
3310         mst = inp->inp_snd_tag;
3311         if (mst == NULL)
3312                 return (EINVAL);
3313
3314         ifp = mst->ifp;
3315         if (ifp == NULL)
3316                 return (EINVAL);
3317
3318         if (ifp->if_snd_tag_modify == NULL) {
3319                 error = EOPNOTSUPP;
3320         } else {
3321                 error = ifp->if_snd_tag_modify(mst, &params);
3322         }
3323         return (error);
3324 }
3325
3326 /*
3327  * Query existing TX rate limit based on the existing
3328  * "inp->inp_snd_tag", if any.
3329  */
3330 int
3331 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3332 {
3333         union if_snd_tag_query_params params = { };
3334         struct m_snd_tag *mst;
3335         struct ifnet *ifp;
3336         int error;
3337
3338         mst = inp->inp_snd_tag;
3339         if (mst == NULL)
3340                 return (EINVAL);
3341
3342         ifp = mst->ifp;
3343         if (ifp == NULL)
3344                 return (EINVAL);
3345
3346         if (ifp->if_snd_tag_query == NULL) {
3347                 error = EOPNOTSUPP;
3348         } else {
3349                 error = ifp->if_snd_tag_query(mst, &params);
3350                 if (error == 0 &&  p_max_pacing_rate != NULL)
3351                         *p_max_pacing_rate = params.rate_limit.max_rate;
3352         }
3353         return (error);
3354 }
3355
3356 /*
3357  * Query existing TX queue level based on the existing
3358  * "inp->inp_snd_tag", if any.
3359  */
3360 int
3361 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
3362 {
3363         union if_snd_tag_query_params params = { };
3364         struct m_snd_tag *mst;
3365         struct ifnet *ifp;
3366         int error;
3367
3368         mst = inp->inp_snd_tag;
3369         if (mst == NULL)
3370                 return (EINVAL);
3371
3372         ifp = mst->ifp;
3373         if (ifp == NULL)
3374                 return (EINVAL);
3375
3376         if (ifp->if_snd_tag_query == NULL)
3377                 return (EOPNOTSUPP);
3378
3379         error = ifp->if_snd_tag_query(mst, &params);
3380         if (error == 0 &&  p_txqueue_level != NULL)
3381                 *p_txqueue_level = params.rate_limit.queue_level;
3382         return (error);
3383 }
3384
3385 /*
3386  * Allocate a new TX rate limit send tag from the network interface
3387  * given by the "ifp" argument and save it in "inp->inp_snd_tag":
3388  */
3389 int
3390 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
3391     uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
3392
3393 {
3394         union if_snd_tag_alloc_params params = {
3395                 .rate_limit.hdr.type = (max_pacing_rate == -1U) ?
3396                     IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
3397                 .rate_limit.hdr.flowid = flowid,
3398                 .rate_limit.hdr.flowtype = flowtype,
3399                 .rate_limit.hdr.numa_domain = inp->inp_numa_domain,
3400                 .rate_limit.max_rate = max_pacing_rate,
3401                 .rate_limit.flags = M_NOWAIT,
3402         };
3403         int error;
3404
3405         INP_WLOCK_ASSERT(inp);
3406
3407         /*
3408          * If there is already a send tag, or the INP is being torn
3409          * down, allocating a new send tag is not allowed. Else send
3410          * tags may leak.
3411          */
3412         if (*st != NULL || (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0)
3413                 return (EINVAL);
3414
3415         error = m_snd_tag_alloc(ifp, &params, st);
3416 #ifdef INET
3417         if (error == 0) {
3418                 counter_u64_add(rate_limit_set_ok, 1);
3419                 counter_u64_add(rate_limit_active, 1);
3420         } else if (error != EOPNOTSUPP)
3421                   counter_u64_add(rate_limit_alloc_fail, 1);
3422 #endif
3423         return (error);
3424 }
3425
/*
 * Release the send tag "mst" with m_snd_tag_rele() and, when INET is
 * configured, decrement the rate_limit_active counter.  "mst" is passed
 * to m_snd_tag_rele() unchecked.
 */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{
	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}
3435
3436 /*
3437  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3438  * if any:
3439  */
3440 void
3441 in_pcbdetach_txrtlmt(struct inpcb *inp)
3442 {
3443         struct m_snd_tag *mst;
3444
3445         INP_WLOCK_ASSERT(inp);
3446
3447         mst = inp->inp_snd_tag;
3448         inp->inp_snd_tag = NULL;
3449
3450         if (mst == NULL)
3451                 return;
3452
3453         m_snd_tag_rele(mst);
3454 #ifdef INET
3455         counter_u64_add(rate_limit_active, -1);
3456 #endif
3457 }
3458
3459 int
3460 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
3461 {
3462         int error;
3463
3464         /*
3465          * If the existing send tag is for the wrong interface due to
3466          * a route change, first drop the existing tag.  Set the
3467          * CHANGED flag so that we will keep trying to allocate a new
3468          * tag if we fail to allocate one this time.
3469          */
3470         if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
3471                 in_pcbdetach_txrtlmt(inp);
3472                 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3473         }
3474
3475         /*
3476          * NOTE: When attaching to a network interface a reference is
3477          * made to ensure the network interface doesn't go away until
3478          * all ratelimit connections are gone. The network interface
3479          * pointers compared below represent valid network interfaces,
3480          * except when comparing towards NULL.
3481          */
3482         if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
3483                 error = 0;
3484         } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
3485                 if (inp->inp_snd_tag != NULL)
3486                         in_pcbdetach_txrtlmt(inp);
3487                 error = 0;
3488         } else if (inp->inp_snd_tag == NULL) {
3489                 /*
3490                  * In order to utilize packet pacing with RSS, we need
3491                  * to wait until there is a valid RSS hash before we
3492                  * can proceed:
3493                  */
3494                 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
3495                         error = EAGAIN;
3496                 } else {
3497                         error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
3498                             mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
3499                 }
3500         } else {
3501                 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
3502         }
3503         if (error == 0 || error == EOPNOTSUPP)
3504                 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
3505
3506         return (error);
3507 }
3508
3509 /*
3510  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
3511  * is set in the fast path and will attach/detach/modify the TX rate
3512  * limit send tag based on the socket's so_max_pacing_rate value.
3513  */
3514 void
3515 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3516 {
3517         struct socket *socket;
3518         uint32_t max_pacing_rate;
3519         bool did_upgrade;
3520         int error;
3521
3522         if (inp == NULL)
3523                 return;
3524
3525         socket = inp->inp_socket;
3526         if (socket == NULL)
3527                 return;
3528
3529         if (!INP_WLOCKED(inp)) {
3530                 /*
3531                  * NOTE: If the write locking fails, we need to bail
3532                  * out and use the non-ratelimited ring for the
3533                  * transmit until there is a new chance to get the
3534                  * write lock.
3535                  */
3536                 if (!INP_TRY_UPGRADE(inp))
3537                         return;
3538                 did_upgrade = 1;
3539         } else {
3540                 did_upgrade = 0;
3541         }
3542
3543         /*
3544          * NOTE: The so_max_pacing_rate value is read unlocked,
3545          * because atomic updates are not required since the variable
3546          * is checked at every mbuf we send. It is assumed that the
3547          * variable read itself will be atomic.
3548          */
3549         max_pacing_rate = socket->so_max_pacing_rate;
3550
3551         error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
3552
3553         if (did_upgrade)
3554                 INP_DOWNGRADE(inp);
3555 }
3556
3557 /*
3558  * Track route changes for TX rate limiting.
3559  */
3560 void
3561 in_pcboutput_eagain(struct inpcb *inp)
3562 {
3563         bool did_upgrade;
3564
3565         if (inp == NULL)
3566                 return;
3567
3568         if (inp->inp_snd_tag == NULL)
3569                 return;
3570
3571         if (!INP_WLOCKED(inp)) {
3572                 /*
3573                  * NOTE: If the write locking fails, we need to bail
3574                  * out and use the non-ratelimited ring for the
3575                  * transmit until there is a new chance to get the
3576                  * write lock.
3577                  */
3578                 if (!INP_TRY_UPGRADE(inp))
3579                         return;
3580                 did_upgrade = 1;
3581         } else {
3582                 did_upgrade = 0;
3583         }
3584
3585         /* detach rate limiting */
3586         in_pcbdetach_txrtlmt(inp);
3587
3588         /* make sure new mbuf send tag allocation is made */
3589         inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3590
3591         if (did_upgrade)
3592                 INP_DOWNGRADE(inp);
3593 }
3594
3595 #ifdef INET
3596 static void
3597 rl_init(void *st)
3598 {
3599         rate_limit_new = counter_u64_alloc(M_WAITOK);
3600         rate_limit_chg = counter_u64_alloc(M_WAITOK);
3601         rate_limit_active = counter_u64_alloc(M_WAITOK);
3602         rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
3603         rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
3604 }
3605
3606 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
3607 #endif
3608 #endif /* RATELIMIT */