]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/in_pcb.c
zfs: merge openzfs/zfs@c3b60eded (zfs-2.1-release) into stable/13
[FreeBSD/FreeBSD.git] / sys / netinet / in_pcb.c
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1991, 1993, 1995
5  *      The Regents of the University of California.
6  * Copyright (c) 2007-2009 Robert N. M. Watson
7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
8  * All rights reserved.
9  *
10  * Portions of this software were developed by Robert N. M. Watson under
11  * contract to Juniper Networks, Inc.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *      @(#)in_pcb.c    8.4 (Berkeley) 5/24/95
38  */
39
40 #include <sys/cdefs.h>
41 __FBSDID("$FreeBSD$");
42
43 #include "opt_ddb.h"
44 #include "opt_ipsec.h"
45 #include "opt_inet.h"
46 #include "opt_inet6.h"
47 #include "opt_ratelimit.h"
48 #include "opt_pcbgroup.h"
49 #include "opt_route.h"
50 #include "opt_rss.h"
51
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/lock.h>
55 #include <sys/malloc.h>
56 #include <sys/mbuf.h>
57 #include <sys/callout.h>
58 #include <sys/eventhandler.h>
59 #include <sys/domain.h>
60 #include <sys/protosw.h>
61 #include <sys/rmlock.h>
62 #include <sys/smp.h>
63 #include <sys/socket.h>
64 #include <sys/socketvar.h>
65 #include <sys/sockio.h>
66 #include <sys/priv.h>
67 #include <sys/proc.h>
68 #include <sys/refcount.h>
69 #include <sys/jail.h>
70 #include <sys/kernel.h>
71 #include <sys/sysctl.h>
72
73 #ifdef DDB
74 #include <ddb/ddb.h>
75 #endif
76
77 #include <vm/uma.h>
78 #include <vm/vm.h>
79
80 #include <net/if.h>
81 #include <net/if_var.h>
82 #include <net/if_types.h>
83 #include <net/if_llatbl.h>
84 #include <net/route.h>
85 #include <net/rss_config.h>
86 #include <net/vnet.h>
87
88 #if defined(INET) || defined(INET6)
89 #include <netinet/in.h>
90 #include <netinet/in_pcb.h>
91 #ifdef INET
92 #include <netinet/in_var.h>
93 #include <netinet/in_fib.h>
94 #endif
95 #include <netinet/ip_var.h>
96 #include <netinet/tcp_var.h>
97 #ifdef TCPHPTS
98 #include <netinet/tcp_hpts.h>
99 #endif
100 #include <netinet/udp.h>
101 #include <netinet/udp_var.h>
102 #ifdef INET6
103 #include <netinet/ip6.h>
104 #include <netinet6/in6_pcb.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/ip6_var.h>
107 #endif /* INET6 */
108 #include <net/route/nhop.h>
109 #endif
110
111 #include <netipsec/ipsec_support.h>
112
113 #include <security/mac/mac_framework.h>
114
115 #define INPCBLBGROUP_SIZMIN     8
116 #define INPCBLBGROUP_SIZMAX     256
117
118 static struct callout   ipport_tick_callout;
119
120 /*
121  * These configure the range of local port addresses assigned to
122  * "unspecified" outgoing connections/packets/whatever.
123  */
124 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;    /* 1023 */
125 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;    /* 600 */
126 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;     /* 10000 */
127 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;       /* 65535 */
128 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;      /* 49152 */
129 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;        /* 65535 */
130
131 /*
132  * Reserved ports accessible only to root. There are significant
133  * security considerations that must be accounted for when changing these,
134  * but the security benefits can be great. Please be careful.
135  */
136 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;    /* 1023 */
137 VNET_DEFINE(int, ipport_reservedlow);
138
139 /* Variables dealing with random ephemeral port allocation. */
140 VNET_DEFINE(int, ipport_randomized) = 1;        /* user controlled via sysctl */
141 VNET_DEFINE(int, ipport_randomcps) = 10;        /* user controlled via sysctl */
142 VNET_DEFINE(int, ipport_randomtime) = 45;       /* user controlled via sysctl */
143 VNET_DEFINE(int, ipport_stoprandom);            /* toggled by ipport_tick */
144 VNET_DEFINE(int, ipport_tcpallocs);
145 VNET_DEFINE_STATIC(int, ipport_tcplastcount);
146
147 #define V_ipport_tcplastcount           VNET(ipport_tcplastcount)
148
149 static void     in_pcbremlists(struct inpcb *inp);
150 #ifdef INET
151 static struct inpcb     *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
152                             struct in_addr faddr, u_int fport_arg,
153                             struct in_addr laddr, u_int lport_arg,
154                             int lookupflags, struct ifnet *ifp,
155                             uint8_t numa_domain);
156
157 #define RANGECHK(var, min, max) \
158         if ((var) < (min)) { (var) = (min); } \
159         else if ((var) > (max)) { (var) = (max); }
160
161 static int
162 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
163 {
164         int error;
165
166         error = sysctl_handle_int(oidp, arg1, arg2, req);
167         if (error == 0) {
168                 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
169                 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
170                 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
171                 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
172                 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
173                 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
174         }
175         return (error);
176 }
177
178 #undef RANGECHK
179
180 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
181     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
182     "IP Ports");
183
184 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
185     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
186     &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
187     "");
188 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
189     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
190     &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
191     "");
192 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
193     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
194     &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
195     "");
196 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
197     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
198     &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
199     "");
200 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
201     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
202     &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
203     "");
204 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
205     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
206     &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
207     "");
208 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
209         CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
210         &VNET_NAME(ipport_reservedhigh), 0, "");
211 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
212         CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
213 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
214         CTLFLAG_VNET | CTLFLAG_RW,
215         &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
216 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
217         CTLFLAG_VNET | CTLFLAG_RW,
218         &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
219         "allocations before switching to a sequental one");
220 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
221         CTLFLAG_VNET | CTLFLAG_RW,
222         &VNET_NAME(ipport_randomtime), 0,
223         "Minimum time to keep sequental port "
224         "allocation before switching to a random one");
225
#ifdef RATELIMIT
/* Rate limiting statistics, exported read-only under net.inet.ip.rl. */
counter_u64_t rate_limit_new;
counter_u64_t rate_limit_chg;
counter_u64_t rate_limit_active;
counter_u64_t rate_limit_alloc_fail;
counter_u64_t rate_limit_set_ok;

static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl,
    CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "IP Rate Limiting");

SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
    &rate_limit_active,
    "Active rate limited connections");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
    &rate_limit_alloc_fail,
    "Rate limited connection failures");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
    &rate_limit_set_ok,
    "Rate limited setting succeeded");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
    &rate_limit_new,
    "Total Rate limit new attempts");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
    &rate_limit_chg,
    "Total Rate limited change attempts");

#endif /* RATELIMIT */
247
248 #endif /* INET */
249
250 /*
251  * in_pcb.c: manage the Protocol Control Blocks.
252  *
253  * NOTE: It is assumed that most of these functions will be called with
254  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
255  * functions often modify hash chains or addresses in pcbs.
256  */
257
258 static struct inpcblbgroup *
259 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
260     uint16_t port, const union in_dependaddr *addr, int size,
261     uint8_t numa_domain)
262 {
263         struct inpcblbgroup *grp;
264         size_t bytes;
265
266         bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
267         grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
268         if (!grp)
269                 return (NULL);
270         grp->il_vflag = vflag;
271         grp->il_lport = port;
272         grp->il_numa_domain = numa_domain;
273         grp->il_dependladdr = *addr;
274         grp->il_inpsiz = size;
275         CK_LIST_INSERT_HEAD(hdr, grp, il_list);
276         return (grp);
277 }
278
279 static void
280 in_pcblbgroup_free_deferred(epoch_context_t ctx)
281 {
282         struct inpcblbgroup *grp;
283
284         grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
285         free(grp, M_PCB);
286 }
287
288 static void
289 in_pcblbgroup_free(struct inpcblbgroup *grp)
290 {
291
292         CK_LIST_REMOVE(grp, il_list);
293         NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
294 }
295
296 static struct inpcblbgroup *
297 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
298     struct inpcblbgroup *old_grp, int size)
299 {
300         struct inpcblbgroup *grp;
301         int i;
302
303         grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
304             old_grp->il_lport, &old_grp->il_dependladdr, size,
305             old_grp->il_numa_domain);
306         if (grp == NULL)
307                 return (NULL);
308
309         KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
310             ("invalid new local group size %d and old local group count %d",
311              grp->il_inpsiz, old_grp->il_inpcnt));
312
313         for (i = 0; i < old_grp->il_inpcnt; ++i)
314                 grp->il_inp[i] = old_grp->il_inp[i];
315         grp->il_inpcnt = old_grp->il_inpcnt;
316         in_pcblbgroup_free(old_grp);
317         return (grp);
318 }
319
320 /*
321  * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
322  * and shrink group if possible.
323  */
324 static void
325 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
326     int i)
327 {
328         struct inpcblbgroup *grp, *new_grp;
329
330         grp = *grpp;
331         for (; i + 1 < grp->il_inpcnt; ++i)
332                 grp->il_inp[i] = grp->il_inp[i + 1];
333         grp->il_inpcnt--;
334
335         if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
336             grp->il_inpcnt <= grp->il_inpsiz / 4) {
337                 /* Shrink this group. */
338                 new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
339                 if (new_grp != NULL)
340                         *grpp = new_grp;
341         }
342 }
343
344 /*
345  * Add PCB to load balance group for SO_REUSEPORT_LB option.
346  */
347 static int
348 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
349 {
350         const static struct timeval interval = { 60, 0 };
351         static struct timeval lastprint;
352         struct inpcbinfo *pcbinfo;
353         struct inpcblbgrouphead *hdr;
354         struct inpcblbgroup *grp;
355         uint32_t idx;
356
357         pcbinfo = inp->inp_pcbinfo;
358
359         INP_WLOCK_ASSERT(inp);
360         INP_HASH_WLOCK_ASSERT(pcbinfo);
361
362         /*
363          * Don't allow jailed socket to join local group.
364          */
365         if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred))
366                 return (0);
367
368 #ifdef INET6
369         /*
370          * Don't allow IPv4 mapped INET6 wild socket.
371          */
372         if ((inp->inp_vflag & INP_IPV4) &&
373             inp->inp_laddr.s_addr == INADDR_ANY &&
374             INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
375                 return (0);
376         }
377 #endif
378
379         idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
380         hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
381         CK_LIST_FOREACH(grp, hdr, il_list) {
382                 if (grp->il_vflag == inp->inp_vflag &&
383                     grp->il_lport == inp->inp_lport &&
384                     grp->il_numa_domain == numa_domain &&
385                     memcmp(&grp->il_dependladdr,
386                     &inp->inp_inc.inc_ie.ie_dependladdr,
387                     sizeof(grp->il_dependladdr)) == 0)
388                         break;
389         }
390         if (grp == NULL) {
391                 /* Create new load balance group. */
392                 grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
393                     inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
394                     INPCBLBGROUP_SIZMIN, numa_domain);
395                 if (grp == NULL)
396                         return (ENOBUFS);
397         } else if (grp->il_inpcnt == grp->il_inpsiz) {
398                 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
399                         if (ratecheck(&lastprint, &interval))
400                                 printf("lb group port %d, limit reached\n",
401                                     ntohs(grp->il_lport));
402                         return (0);
403                 }
404
405                 /* Expand this local group. */
406                 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
407                 if (grp == NULL)
408                         return (ENOBUFS);
409         }
410
411         KASSERT(grp->il_inpcnt < grp->il_inpsiz,
412             ("invalid local group size %d and count %d", grp->il_inpsiz,
413             grp->il_inpcnt));
414
415         grp->il_inp[grp->il_inpcnt] = inp;
416         grp->il_inpcnt++;
417         return (0);
418 }
419
420 /*
421  * Remove PCB from load balance group.
422  */
423 static void
424 in_pcbremlbgrouphash(struct inpcb *inp)
425 {
426         struct inpcbinfo *pcbinfo;
427         struct inpcblbgrouphead *hdr;
428         struct inpcblbgroup *grp;
429         int i;
430
431         pcbinfo = inp->inp_pcbinfo;
432
433         INP_WLOCK_ASSERT(inp);
434         INP_HASH_WLOCK_ASSERT(pcbinfo);
435
436         hdr = &pcbinfo->ipi_lbgrouphashbase[
437             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
438         CK_LIST_FOREACH(grp, hdr, il_list) {
439                 for (i = 0; i < grp->il_inpcnt; ++i) {
440                         if (grp->il_inp[i] != inp)
441                                 continue;
442
443                         if (grp->il_inpcnt == 1) {
444                                 /* We are the last, free this local group. */
445                                 in_pcblbgroup_free(grp);
446                         } else {
447                                 /* Pull up inpcbs, shrink group if possible. */
448                                 in_pcblbgroup_reorder(hdr, &grp, i);
449                         }
450                         return;
451                 }
452         }
453 }
454
455 int
456 in_pcblbgroup_numa(struct inpcb *inp, int arg)
457 {
458         struct inpcbinfo *pcbinfo;
459         struct inpcblbgrouphead *hdr;
460         struct inpcblbgroup *grp;
461         int err, i;
462         uint8_t numa_domain;
463
464         switch (arg) {
465         case TCP_REUSPORT_LB_NUMA_NODOM:
466                 numa_domain = M_NODOM;
467                 break;
468         case TCP_REUSPORT_LB_NUMA_CURDOM:
469                 numa_domain = PCPU_GET(domain);
470                 break;
471         default:
472                 if (arg < 0 || arg >= vm_ndomains)
473                         return (EINVAL);
474                 numa_domain = arg;
475         }
476
477         err = 0;
478         pcbinfo = inp->inp_pcbinfo;
479         INP_WLOCK_ASSERT(inp);
480         INP_HASH_WLOCK(pcbinfo);
481         hdr = &pcbinfo->ipi_lbgrouphashbase[
482             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
483         CK_LIST_FOREACH(grp, hdr, il_list) {
484                 for (i = 0; i < grp->il_inpcnt; ++i) {
485                         if (grp->il_inp[i] != inp)
486                                 continue;
487
488                         if (grp->il_numa_domain == numa_domain) {
489                                 goto abort_with_hash_wlock;
490                         }
491
492                         /* Remove it from the old group. */
493                         in_pcbremlbgrouphash(inp);
494
495                         /* Add it to the new group based on numa domain. */
496                         in_pcbinslbgrouphash(inp, numa_domain);
497                         goto abort_with_hash_wlock;
498                 }
499         }
500         err = ENOENT;
501 abort_with_hash_wlock:
502         INP_HASH_WUNLOCK(pcbinfo);
503         return (err);
504 }
505
/*
 * Zone fini routine for inpcbs.  Protocols set up their inpcbs in
 * different ways (each naming the lock differently), but teardown is
 * common: destroy the inpcb lock.  'size' is unused.
 */
static void
inpcb_fini(void *mem, int size)
{
        struct inpcb *pcb;

        pcb = mem;
        INP_LOCK_DESTROY(pcb);
}
517
518 /*
519  * Initialize an inpcbinfo -- we should be able to reduce the number of
520  * arguments in time.
521  */
522 void
523 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
524     struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
525     char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields)
526 {
527
528         porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
529
530         INP_INFO_LOCK_INIT(pcbinfo, name);
531         INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash");     /* XXXRW: argument? */
532         INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
533 #ifdef VIMAGE
534         pcbinfo->ipi_vnet = curvnet;
535 #endif
536         pcbinfo->ipi_listhead = listhead;
537         CK_LIST_INIT(pcbinfo->ipi_listhead);
538         pcbinfo->ipi_count = 0;
539         pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
540             &pcbinfo->ipi_hashmask);
541         pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
542             &pcbinfo->ipi_porthashmask);
543         pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
544             &pcbinfo->ipi_lbgrouphashmask);
545 #ifdef PCBGROUP
546         in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
547 #endif
548         pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
549             NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0);
550         uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
551         uma_zone_set_warning(pcbinfo->ipi_zone,
552             "kern.ipc.maxsockets limit reached");
553 }
554
555 /*
556  * Destroy an inpcbinfo.
557  */
558 void
559 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
560 {
561
562         KASSERT(pcbinfo->ipi_count == 0,
563             ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
564
565         hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
566         hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
567             pcbinfo->ipi_porthashmask);
568         hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
569             pcbinfo->ipi_lbgrouphashmask);
570 #ifdef PCBGROUP
571         in_pcbgroup_destroy(pcbinfo);
572 #endif
573         uma_zdestroy(pcbinfo->ipi_zone);
574         INP_LIST_LOCK_DESTROY(pcbinfo);
575         INP_HASH_LOCK_DESTROY(pcbinfo);
576         INP_INFO_LOCK_DESTROY(pcbinfo);
577 }
578
579 /*
580  * Allocate a PCB and associate it with the socket.
581  * On success return with the PCB locked.
582  */
583 int
584 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
585 {
586         struct inpcb *inp;
587         int error;
588
589         error = 0;
590         inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
591         if (inp == NULL)
592                 return (ENOBUFS);
593         bzero(&inp->inp_start_zero, inp_zero_size);
594 #ifdef NUMA
595         inp->inp_numa_domain = M_NODOM;
596 #endif
597         inp->inp_pcbinfo = pcbinfo;
598         inp->inp_socket = so;
599         inp->inp_cred = crhold(so->so_cred);
600         inp->inp_inc.inc_fibnum = so->so_fibnum;
601 #ifdef MAC
602         error = mac_inpcb_init(inp, M_NOWAIT);
603         if (error != 0)
604                 goto out;
605         mac_inpcb_create(so, inp);
606 #endif
607 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
608         error = ipsec_init_pcbpolicy(inp);
609         if (error != 0) {
610 #ifdef MAC
611                 mac_inpcb_destroy(inp);
612 #endif
613                 goto out;
614         }
615 #endif /*IPSEC*/
616 #ifdef INET6
617         if (INP_SOCKAF(so) == AF_INET6) {
618                 inp->inp_vflag |= INP_IPV6PROTO;
619                 if (V_ip6_v6only)
620                         inp->inp_flags |= IN6P_IPV6_V6ONLY;
621         }
622 #endif
623         INP_WLOCK(inp);
624         INP_LIST_WLOCK(pcbinfo);
625         CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
626         pcbinfo->ipi_count++;
627         so->so_pcb = (caddr_t)inp;
628 #ifdef INET6
629         if (V_ip6_auto_flowlabel)
630                 inp->inp_flags |= IN6P_AUTOFLOWLABEL;
631 #endif
632         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
633         refcount_init(&inp->inp_refcount, 1);   /* Reference from inpcbinfo */
634
635         /*
636          * Routes in inpcb's can cache L2 as well; they are guaranteed
637          * to be cleaned up.
638          */
639         inp->inp_route.ro_flags = RT_LLE_CACHE;
640         INP_LIST_WUNLOCK(pcbinfo);
641 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
642 out:
643         if (error != 0) {
644                 crfree(inp->inp_cred);
645                 uma_zfree(pcbinfo->ipi_zone, inp);
646         }
647 #endif
648         return (error);
649 }
650
#ifdef INET
/*
 * Bind an unbound IPv4 PCB to the address/port in 'nam', or to an
 * automatically chosen port when 'nam' is NULL or carries port 0.
 *
 * Returns EINVAL if the PCB already has a local port or address, the
 * error from in_pcbbind_setup() if that fails, EAGAIN if the PCB cannot
 * be inserted into the hash (the local address and port are reset in that
 * case), and 0 on success.  INP_ANONPORT is set when the port was chosen
 * automatically.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
{
        int anonport, error;

        KASSERT(nam == NULL || nam->sa_family == AF_INET,
            ("%s: invalid address family for %p", __func__, nam));
        KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in),
            ("%s: invalid address length for %p", __func__, nam));
        INP_WLOCK_ASSERT(inp);
        INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

        /* Refuse to rebind. */
        if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
                return (EINVAL);

        /* Remember whether the caller left the port choice to us. */
        if (nam == NULL)
                anonport = 1;
        else
                anonport = (((struct sockaddr_in *)nam)->sin_port == 0);

        error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
            &inp->inp_lport, cred);
        if (error)
                return (error);

        if (in_pcbinshash(inp) != 0) {
                /* Roll the binding back. */
                inp->inp_laddr.s_addr = INADDR_ANY;
                inp->inp_lport = 0;
                return (EAGAIN);
        }

        if (anonport)
                inp->inp_flags |= INP_ANONPORT;
        return (0);
}
#endif
681
682 #if defined(INET) || defined(INET6)
683 /*
684  * Assign a local port like in_pcb_lport(), but also used with connect()
685  * and a foreign address and port.  If fsa is non-NULL, choose a local port
686  * that is unused with those, otherwise one that is completely unused.
687  * lsa can be NULL for IPv6.
688  */
689 int
690 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
691     struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
692 {
693         struct inpcbinfo *pcbinfo;
694         struct inpcb *tmpinp;
695         unsigned short *lastport;
696         int count, dorandom, error;
697         u_short aux, first, last, lport;
698 #ifdef INET
699         struct in_addr laddr, faddr;
700 #endif
701 #ifdef INET6
702         struct in6_addr *laddr6, *faddr6;
703 #endif
704
705         pcbinfo = inp->inp_pcbinfo;
706
707         /*
708          * Because no actual state changes occur here, a global write lock on
709          * the pcbinfo isn't required.
710          */
711         INP_LOCK_ASSERT(inp);
712         INP_HASH_LOCK_ASSERT(pcbinfo);
713
714         if (inp->inp_flags & INP_HIGHPORT) {
715                 first = V_ipport_hifirstauto;   /* sysctl */
716                 last  = V_ipport_hilastauto;
717                 lastport = &pcbinfo->ipi_lasthi;
718         } else if (inp->inp_flags & INP_LOWPORT) {
719                 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
720                 if (error)
721                         return (error);
722                 first = V_ipport_lowfirstauto;  /* 1023 */
723                 last  = V_ipport_lowlastauto;   /* 600 */
724                 lastport = &pcbinfo->ipi_lastlow;
725         } else {
726                 first = V_ipport_firstauto;     /* sysctl */
727                 last  = V_ipport_lastauto;
728                 lastport = &pcbinfo->ipi_lastport;
729         }
730         /*
731          * For UDP(-Lite), use random port allocation as long as the user
732          * allows it.  For TCP (and as of yet unknown) connections,
733          * use random port allocation only if the user allows it AND
734          * ipport_tick() allows it.
735          */
736         if (V_ipport_randomized &&
737                 (!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
738                 pcbinfo == &V_ulitecbinfo))
739                 dorandom = 1;
740         else
741                 dorandom = 0;
742         /*
743          * It makes no sense to do random port allocation if
744          * we have the only port available.
745          */
746         if (first == last)
747                 dorandom = 0;
748         /* Make sure to not include UDP(-Lite) packets in the count. */
749         if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo)
750                 V_ipport_tcpallocs++;
751         /*
752          * Instead of having two loops further down counting up or down
753          * make sure that first is always <= last and go with only one
754          * code path implementing all logic.
755          */
756         if (first > last) {
757                 aux = first;
758                 first = last;
759                 last = aux;
760         }
761
762 #ifdef INET
763         laddr.s_addr = INADDR_ANY;
764         if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
765                 if (lsa != NULL)
766                         laddr = ((struct sockaddr_in *)lsa)->sin_addr;
767                 if (fsa != NULL)
768                         faddr = ((struct sockaddr_in *)fsa)->sin_addr;
769         }
770 #endif
771 #ifdef INET6
772         laddr6 = NULL;
773         if ((inp->inp_vflag & INP_IPV6) != 0) {
774                 if (lsa != NULL)
775                         laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
776                 if (fsa != NULL)
777                         faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
778         }
779 #endif
780
781         tmpinp = NULL;
782         lport = *lportp;
783
784         if (dorandom)
785                 *lastport = first + (arc4random() % (last - first));
786
787         count = last - first;
788
789         do {
790                 if (count-- < 0)        /* completely used? */
791                         return (EADDRNOTAVAIL);
792                 ++*lastport;
793                 if (*lastport < first || *lastport > last)
794                         *lastport = first;
795                 lport = htons(*lastport);
796
797                 if (fsa != NULL) {
798 #ifdef INET
799                         if (lsa->sa_family == AF_INET) {
800                                 tmpinp = in_pcblookup_hash_locked(pcbinfo,
801                                     faddr, fport, laddr, lport, lookupflags,
802                                     NULL, M_NODOM);
803                         }
804 #endif
805 #ifdef INET6
806                         if (lsa->sa_family == AF_INET6) {
807                                 tmpinp = in6_pcblookup_hash_locked(pcbinfo,
808                                     faddr6, fport, laddr6, lport, lookupflags,
809                                     NULL, M_NODOM);
810                         }
811 #endif
812                 } else {
813 #ifdef INET6
814                         if ((inp->inp_vflag & INP_IPV6) != 0)
815                                 tmpinp = in6_pcblookup_local(pcbinfo,
816                                     &inp->in6p_laddr, lport, lookupflags, cred);
817 #endif
818 #if defined(INET) && defined(INET6)
819                         else
820 #endif
821 #ifdef INET
822                                 tmpinp = in_pcblookup_local(pcbinfo, laddr,
823                                     lport, lookupflags, cred);
824 #endif
825                 }
826         } while (tmpinp != NULL);
827
828         *lportp = lport;
829
830         return (0);
831 }
832
833 /*
834  * Select a local port (number) to use.
835  */
836 int
837 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
838     struct ucred *cred, int lookupflags)
839 {
840         struct sockaddr_in laddr;
841
842         if (laddrp) {
843                 bzero(&laddr, sizeof(laddr));
844                 laddr.sin_family = AF_INET;
845                 laddr.sin_addr = *laddrp;
846         }
847         return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
848             NULL, lportp, NULL, 0, cred, lookupflags));
849 }
850
851 /*
852  * Return cached socket options.
853  */
854 int
855 inp_so_options(const struct inpcb *inp)
856 {
857         int so_options;
858
859         so_options = 0;
860
861         if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
862                 so_options |= SO_REUSEPORT_LB;
863         if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
864                 so_options |= SO_REUSEPORT;
865         if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
866                 so_options |= SO_REUSEADDR;
867         return (so_options);
868 }
869 #endif /* INET || INET6 */
870
871 /*
872  * Check if a new BINDMULTI socket is allowed to be created.
873  *
874  * ni points to the new inp.
875  * oi points to the exisitng inp.
876  *
877  * This checks whether the existing inp also has BINDMULTI and
878  * whether the credentials match.
879  */
880 int
881 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
882 {
883         /* Check permissions match */
884         if ((ni->inp_flags2 & INP_BINDMULTI) &&
885             (ni->inp_cred->cr_uid !=
886             oi->inp_cred->cr_uid))
887                 return (0);
888
889         /* Check the existing inp has BINDMULTI set */
890         if ((ni->inp_flags2 & INP_BINDMULTI) &&
891             ((oi->inp_flags2 & INP_BINDMULTI) == 0))
892                 return (0);
893
894         /*
895          * We're okay - either INP_BINDMULTI isn't set on ni, or
896          * it is and it matches the checks.
897          */
898         return (1);
899 }
900
901 #ifdef INET
902 /*
903  * Set up a bind operation on a PCB, performing port allocation
904  * as required, but do not actually modify the PCB. Callers can
905  * either complete the bind by setting inp_laddr/inp_lport and
906  * calling in_pcbinshash(), or they can just use the resulting
907  * port and address to authorise the sending of a once-off packet.
908  *
909  * On error, the values of *laddrp and *lportp are not changed.
910  */
911 int
912 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
913     u_short *lportp, struct ucred *cred)
914 {
915         struct socket *so = inp->inp_socket;
916         struct sockaddr_in *sin;
917         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
918         struct in_addr laddr;
919         u_short lport = 0;
920         int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
921         int error;
922
923         /*
924          * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
925          * so that we don't have to add to the (already messy) code below.
926          */
927         int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
928
929         /*
930          * No state changes, so read locks are sufficient here.
931          */
932         INP_LOCK_ASSERT(inp);
933         INP_HASH_LOCK_ASSERT(pcbinfo);
934
935         if (CK_STAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
936                 return (EADDRNOTAVAIL);
937         laddr.s_addr = *laddrp;
938         if (nam != NULL && laddr.s_addr != INADDR_ANY)
939                 return (EINVAL);
940         if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
941                 lookupflags = INPLOOKUP_WILDCARD;
942         if (nam == NULL) {
943                 if ((error = prison_local_ip4(cred, &laddr)) != 0)
944                         return (error);
945         } else {
946                 sin = (struct sockaddr_in *)nam;
947                 KASSERT(sin->sin_family == AF_INET,
948                     ("%s: invalid family for address %p", __func__, sin));
949                 KASSERT(sin->sin_len == sizeof(*sin),
950                     ("%s: invalid length for address %p", __func__, sin));
951
952                 error = prison_local_ip4(cred, &sin->sin_addr);
953                 if (error)
954                         return (error);
955                 if (sin->sin_port != *lportp) {
956                         /* Don't allow the port to change. */
957                         if (*lportp != 0)
958                                 return (EINVAL);
959                         lport = sin->sin_port;
960                 }
961                 /* NB: lport is left as 0 if the port isn't being changed. */
962                 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
963                         /*
964                          * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
965                          * allow complete duplication of binding if
966                          * SO_REUSEPORT is set, or if SO_REUSEADDR is set
967                          * and a multicast address is bound on both
968                          * new and duplicated sockets.
969                          */
970                         if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
971                                 reuseport = SO_REUSEADDR|SO_REUSEPORT;
972                         /*
973                          * XXX: How to deal with SO_REUSEPORT_LB here?
974                          * Treat same as SO_REUSEPORT for now.
975                          */
976                         if ((so->so_options &
977                             (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
978                                 reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
979                 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
980                         sin->sin_port = 0;              /* yech... */
981                         bzero(&sin->sin_zero, sizeof(sin->sin_zero));
982                         /*
983                          * Is the address a local IP address?
984                          * If INP_BINDANY is set, then the socket may be bound
985                          * to any endpoint address, local or not.
986                          */
987                         if ((inp->inp_flags & INP_BINDANY) == 0 &&
988                             ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
989                                 return (EADDRNOTAVAIL);
990                 }
991                 laddr = sin->sin_addr;
992                 if (lport) {
993                         struct inpcb *t;
994                         struct tcptw *tw;
995
996                         /* GROSS */
997                         if (ntohs(lport) <= V_ipport_reservedhigh &&
998                             ntohs(lport) >= V_ipport_reservedlow &&
999                             priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
1000                                 return (EACCES);
1001                         if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
1002                             priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
1003                                 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
1004                                     lport, INPLOOKUP_WILDCARD, cred);
1005         /*
1006          * XXX
1007          * This entire block sorely needs a rewrite.
1008          */
1009                                 if (t &&
1010                                     ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
1011                                     ((t->inp_flags & INP_TIMEWAIT) == 0) &&
1012                                     (so->so_type != SOCK_STREAM ||
1013                                      ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
1014                                     (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
1015                                      ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
1016                                      (t->inp_flags2 & INP_REUSEPORT) ||
1017                                      (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
1018                                     (inp->inp_cred->cr_uid !=
1019                                      t->inp_cred->cr_uid))
1020                                         return (EADDRINUSE);
1021
1022                                 /*
1023                                  * If the socket is a BINDMULTI socket, then
1024                                  * the credentials need to match and the
1025                                  * original socket also has to have been bound
1026                                  * with BINDMULTI.
1027                                  */
1028                                 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
1029                                         return (EADDRINUSE);
1030                         }
1031                         t = in_pcblookup_local(pcbinfo, sin->sin_addr,
1032                             lport, lookupflags, cred);
1033                         if (t && (t->inp_flags & INP_TIMEWAIT)) {
1034                                 /*
1035                                  * XXXRW: If an incpb has had its timewait
1036                                  * state recycled, we treat the address as
1037                                  * being in use (for now).  This is better
1038                                  * than a panic, but not desirable.
1039                                  */
1040                                 tw = intotw(t);
1041                                 if (tw == NULL ||
1042                                     ((reuseport & tw->tw_so_options) == 0 &&
1043                                         (reuseport_lb &
1044                                             tw->tw_so_options) == 0)) {
1045                                         return (EADDRINUSE);
1046                                 }
1047                         } else if (t &&
1048                                    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
1049                                    (reuseport & inp_so_options(t)) == 0 &&
1050                                    (reuseport_lb & inp_so_options(t)) == 0) {
1051 #ifdef INET6
1052                                 if (ntohl(sin->sin_addr.s_addr) !=
1053                                     INADDR_ANY ||
1054                                     ntohl(t->inp_laddr.s_addr) !=
1055                                     INADDR_ANY ||
1056                                     (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
1057                                     (t->inp_vflag & INP_IPV6PROTO) == 0)
1058 #endif
1059                                                 return (EADDRINUSE);
1060                                 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
1061                                         return (EADDRINUSE);
1062                         }
1063                 }
1064         }
1065         if (*lportp != 0)
1066                 lport = *lportp;
1067         if (lport == 0) {
1068                 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
1069                 if (error != 0)
1070                         return (error);
1071         }
1072         *laddrp = laddr.s_addr;
1073         *lportp = lport;
1074         return (0);
1075 }
1076
1077 /*
1078  * Connect from a socket to a specified address.
1079  * Both address and port must be specified in argument sin.
1080  * If don't have a local address for this socket yet,
1081  * then pick one.
1082  */
1083 int
1084 in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
1085     struct ucred *cred, struct mbuf *m, bool rehash)
1086 {
1087         u_short lport, fport;
1088         in_addr_t laddr, faddr;
1089         int anonport, error;
1090
1091         INP_WLOCK_ASSERT(inp);
1092         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
1093
1094         lport = inp->inp_lport;
1095         laddr = inp->inp_laddr.s_addr;
1096         anonport = (lport == 0);
1097         error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
1098             NULL, cred);
1099         if (error)
1100                 return (error);
1101
1102         /* Do the initial binding of the local address if required. */
1103         if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
1104                 KASSERT(rehash == true,
1105                     ("Rehashing required for unbound inps"));
1106                 inp->inp_lport = lport;
1107                 inp->inp_laddr.s_addr = laddr;
1108                 if (in_pcbinshash(inp) != 0) {
1109                         inp->inp_laddr.s_addr = INADDR_ANY;
1110                         inp->inp_lport = 0;
1111                         return (EAGAIN);
1112                 }
1113         }
1114
1115         /* Commit the remaining changes. */
1116         inp->inp_lport = lport;
1117         inp->inp_laddr.s_addr = laddr;
1118         inp->inp_faddr.s_addr = faddr;
1119         inp->inp_fport = fport;
1120         if (rehash) {
1121                 in_pcbrehash_mbuf(inp, m);
1122         } else {
1123                 in_pcbinshash_mbuf(inp, m);
1124         }
1125
1126         if (anonport)
1127                 inp->inp_flags |= INP_ANONPORT;
1128         return (0);
1129 }
1130
1131 int
1132 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
1133 {
1134
1135         return (in_pcbconnect_mbuf(inp, nam, cred, NULL, true));
1136 }
1137
1138 /*
1139  * Do proper source address selection on an unbound socket in case
1140  * of connect. Take jails into account as well.
1141  */
1142 int
1143 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
1144     struct ucred *cred)
1145 {
1146         struct ifaddr *ifa;
1147         struct sockaddr *sa;
1148         struct sockaddr_in *sin, dst;
1149         struct nhop_object *nh;
1150         int error;
1151
1152         NET_EPOCH_ASSERT();
1153         KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
1154         /*
1155          * Bypass source address selection and use the primary jail IP
1156          * if requested.
1157          */
1158         if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
1159                 return (0);
1160
1161         error = 0;
1162
1163         nh = NULL;
1164         bzero(&dst, sizeof(dst));
1165         sin = &dst;
1166         sin->sin_family = AF_INET;
1167         sin->sin_len = sizeof(struct sockaddr_in);
1168         sin->sin_addr.s_addr = faddr->s_addr;
1169
1170         /*
1171          * If route is known our src addr is taken from the i/f,
1172          * else punt.
1173          *
1174          * Find out route to destination.
1175          */
1176         if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
1177                 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
1178                     0, NHR_NONE, 0);
1179
1180         /*
1181          * If we found a route, use the address corresponding to
1182          * the outgoing interface.
1183          *
1184          * Otherwise assume faddr is reachable on a directly connected
1185          * network and try to find a corresponding interface to take
1186          * the source address from.
1187          */
1188         if (nh == NULL || nh->nh_ifp == NULL) {
1189                 struct in_ifaddr *ia;
1190                 struct ifnet *ifp;
1191
1192                 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
1193                                         inp->inp_socket->so_fibnum));
1194                 if (ia == NULL) {
1195                         ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
1196                                                 inp->inp_socket->so_fibnum));
1197                 }
1198                 if (ia == NULL) {
1199                         error = ENETUNREACH;
1200                         goto done;
1201                 }
1202
1203                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1204                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1205                         goto done;
1206                 }
1207
1208                 ifp = ia->ia_ifp;
1209                 ia = NULL;
1210                 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1211                         sa = ifa->ifa_addr;
1212                         if (sa->sa_family != AF_INET)
1213                                 continue;
1214                         sin = (struct sockaddr_in *)sa;
1215                         if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1216                                 ia = (struct in_ifaddr *)ifa;
1217                                 break;
1218                         }
1219                 }
1220                 if (ia != NULL) {
1221                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1222                         goto done;
1223                 }
1224
1225                 /* 3. As a last resort return the 'default' jail address. */
1226                 error = prison_get_ip4(cred, laddr);
1227                 goto done;
1228         }
1229
1230         /*
1231          * If the outgoing interface on the route found is not
1232          * a loopback interface, use the address from that interface.
1233          * In case of jails do those three steps:
1234          * 1. check if the interface address belongs to the jail. If so use it.
1235          * 2. check if we have any address on the outgoing interface
1236          *    belonging to this jail. If so use it.
1237          * 3. as a last resort return the 'default' jail address.
1238          */
1239         if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
1240                 struct in_ifaddr *ia;
1241                 struct ifnet *ifp;
1242
1243                 /* If not jailed, use the default returned. */
1244                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1245                         ia = (struct in_ifaddr *)nh->nh_ifa;
1246                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1247                         goto done;
1248                 }
1249
1250                 /* Jailed. */
1251                 /* 1. Check if the iface address belongs to the jail. */
1252                 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
1253                 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1254                         ia = (struct in_ifaddr *)nh->nh_ifa;
1255                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1256                         goto done;
1257                 }
1258
1259                 /*
1260                  * 2. Check if we have any address on the outgoing interface
1261                  *    belonging to this jail.
1262                  */
1263                 ia = NULL;
1264                 ifp = nh->nh_ifp;
1265                 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1266                         sa = ifa->ifa_addr;
1267                         if (sa->sa_family != AF_INET)
1268                                 continue;
1269                         sin = (struct sockaddr_in *)sa;
1270                         if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1271                                 ia = (struct in_ifaddr *)ifa;
1272                                 break;
1273                         }
1274                 }
1275                 if (ia != NULL) {
1276                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1277                         goto done;
1278                 }
1279
1280                 /* 3. As a last resort return the 'default' jail address. */
1281                 error = prison_get_ip4(cred, laddr);
1282                 goto done;
1283         }
1284
1285         /*
1286          * The outgoing interface is marked with 'loopback net', so a route
1287          * to ourselves is here.
1288          * Try to find the interface of the destination address and then
1289          * take the address from there. That interface is not necessarily
1290          * a loopback interface.
1291          * In case of jails, check that it is an address of the jail
1292          * and if we cannot find, fall back to the 'default' jail address.
1293          */
1294         if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
1295                 struct in_ifaddr *ia;
1296
1297                 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
1298                                         inp->inp_socket->so_fibnum));
1299                 if (ia == NULL)
1300                         ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
1301                                                 inp->inp_socket->so_fibnum));
1302                 if (ia == NULL)
1303                         ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));
1304
1305                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1306                         if (ia == NULL) {
1307                                 error = ENETUNREACH;
1308                                 goto done;
1309                         }
1310                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1311                         goto done;
1312                 }
1313
1314                 /* Jailed. */
1315                 if (ia != NULL) {
1316                         struct ifnet *ifp;
1317
1318                         ifp = ia->ia_ifp;
1319                         ia = NULL;
1320                         CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1321                                 sa = ifa->ifa_addr;
1322                                 if (sa->sa_family != AF_INET)
1323                                         continue;
1324                                 sin = (struct sockaddr_in *)sa;
1325                                 if (prison_check_ip4(cred,
1326                                     &sin->sin_addr) == 0) {
1327                                         ia = (struct in_ifaddr *)ifa;
1328                                         break;
1329                                 }
1330                         }
1331                         if (ia != NULL) {
1332                                 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1333                                 goto done;
1334                         }
1335                 }
1336
1337                 /* 3. As a last resort return the 'default' jail address. */
1338                 error = prison_get_ip4(cred, laddr);
1339                 goto done;
1340         }
1341
1342 done:
1343         return (error);
1344 }
1345
1346 /*
1347  * Set up for a connect from a socket to the specified address.
1348  * On entry, *laddrp and *lportp should contain the current local
1349  * address and port for the PCB; these are updated to the values
1350  * that should be placed in inp_laddr and inp_lport to complete
1351  * the connect.
1352  *
1353  * On success, *faddrp and *fportp will be set to the remote address
1354  * and port. These are not updated in the error case.
1355  *
1356  * If the operation fails because the connection already exists,
1357  * *oinpp will be set to the PCB of that connection so that the
1358  * caller can decide to override it. In all other cases, *oinpp
1359  * is set to NULL.
1360  */
1361 int
1362 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
1363     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
1364     struct inpcb **oinpp, struct ucred *cred)
1365 {
1366         struct rm_priotracker in_ifa_tracker;
1367         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
1368         struct in_ifaddr *ia;
1369         struct inpcb *oinp;
1370         struct in_addr laddr, faddr;
1371         u_short lport, fport;
1372         int error;
1373
1374         KASSERT(sin->sin_family == AF_INET,
1375             ("%s: invalid address family for %p", __func__, sin));
1376         KASSERT(sin->sin_len == sizeof(*sin),
1377             ("%s: invalid address length for %p", __func__, sin));
1378
1379         /*
1380          * Because a global state change doesn't actually occur here, a read
1381          * lock is sufficient.
1382          */
1383         NET_EPOCH_ASSERT();
1384         INP_LOCK_ASSERT(inp);
1385         INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
1386
1387         if (oinpp != NULL)
1388                 *oinpp = NULL;
1389         if (sin->sin_port == 0)
1390                 return (EADDRNOTAVAIL);
1391         laddr.s_addr = *laddrp;
1392         lport = *lportp;
1393         faddr = sin->sin_addr;
1394         fport = sin->sin_port;
1395 #ifdef ROUTE_MPATH
1396         if (CALC_FLOWID_OUTBOUND) {
1397                 uint32_t hash_val, hash_type;
1398
1399                 hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
1400                     inp->inp_socket->so_proto->pr_protocol, &hash_type);
1401
1402                 inp->inp_flowid = hash_val;
1403                 inp->inp_flowtype = hash_type;
1404         }
1405 #endif
1406         if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
1407                 /*
1408                  * If the destination address is INADDR_ANY,
1409                  * use the primary local address.
1410                  * If the supplied address is INADDR_BROADCAST,
1411                  * and the primary interface supports broadcast,
1412                  * choose the broadcast address for that interface.
1413                  */
1414                 if (faddr.s_addr == INADDR_ANY) {
1415                         IN_IFADDR_RLOCK(&in_ifa_tracker);
1416                         faddr =
1417                             IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
1418                         IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1419                         if (cred != NULL &&
1420                             (error = prison_get_ip4(cred, &faddr)) != 0)
1421                                 return (error);
1422                 } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
1423                         IN_IFADDR_RLOCK(&in_ifa_tracker);
1424                         if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
1425                             IFF_BROADCAST)
1426                                 faddr = satosin(&CK_STAILQ_FIRST(
1427                                     &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
1428                         IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1429                 }
1430         }
1431         if (laddr.s_addr == INADDR_ANY) {
1432                 error = in_pcbladdr(inp, &faddr, &laddr, cred);
1433                 /*
1434                  * If the destination address is multicast and an outgoing
1435                  * interface has been set as a multicast option, prefer the
1436                  * address of that interface as our source address.
1437                  */
1438                 if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
1439                     inp->inp_moptions != NULL) {
1440                         struct ip_moptions *imo;
1441                         struct ifnet *ifp;
1442
1443                         imo = inp->inp_moptions;
1444                         if (imo->imo_multicast_ifp != NULL) {
1445                                 ifp = imo->imo_multicast_ifp;
1446                                 IN_IFADDR_RLOCK(&in_ifa_tracker);
1447                                 CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
1448                                         if ((ia->ia_ifp == ifp) &&
1449                                             (cred == NULL ||
1450                                             prison_check_ip4(cred,
1451                                             &ia->ia_addr.sin_addr) == 0))
1452                                                 break;
1453                                 }
1454                                 if (ia == NULL)
1455                                         error = EADDRNOTAVAIL;
1456                                 else {
1457                                         laddr = ia->ia_addr.sin_addr;
1458                                         error = 0;
1459                                 }
1460                                 IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1461                         }
1462                 }
1463                 if (error)
1464                         return (error);
1465         }
1466
1467         if (lport != 0) {
1468                 oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
1469                     fport, laddr, lport, 0, NULL, M_NODOM);
1470                 if (oinp != NULL) {
1471                         if (oinpp != NULL)
1472                                 *oinpp = oinp;
1473                         return (EADDRINUSE);
1474                 }
1475         } else {
1476                 struct sockaddr_in lsin, fsin;
1477
1478                 bzero(&lsin, sizeof(lsin));
1479                 bzero(&fsin, sizeof(fsin));
1480                 lsin.sin_family = AF_INET;
1481                 lsin.sin_addr = laddr;
1482                 fsin.sin_family = AF_INET;
1483                 fsin.sin_addr = faddr;
1484                 error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
1485                     &lport, (struct sockaddr *)& fsin, fport, cred,
1486                     INPLOOKUP_WILDCARD);
1487                 if (error)
1488                         return (error);
1489         }
1490         *laddrp = laddr.s_addr;
1491         *lportp = lport;
1492         *faddrp = faddr.s_addr;
1493         *fportp = fport;
1494         return (0);
1495 }
1496
1497 void
1498 in_pcbdisconnect(struct inpcb *inp)
1499 {
1500
1501         INP_WLOCK_ASSERT(inp);
1502         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
1503
1504         inp->inp_faddr.s_addr = INADDR_ANY;
1505         inp->inp_fport = 0;
1506         in_pcbrehash(inp);
1507 }
1508 #endif /* INET */
1509
1510 /*
1511  * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
1512  * For most protocols, this will be invoked immediately prior to calling
1513  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
1514  * socket, in which case in_pcbfree() is deferred.
1515  */
1516 void
1517 in_pcbdetach(struct inpcb *inp)
1518 {
1519
1520         KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
1521
1522 #ifdef RATELIMIT
1523         if (inp->inp_snd_tag != NULL)
1524                 in_pcbdetach_txrtlmt(inp);
1525 #endif
1526         inp->inp_socket->so_pcb = NULL;
1527         inp->inp_socket = NULL;
1528 }
1529
1530 /*
1531  * in_pcbref() bumps the reference count on an inpcb in order to maintain
1532  * stability of an inpcb pointer despite the inpcb lock being released.  This
1533  * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
1534  * but where the inpcb lock may already held, or when acquiring a reference
1535  * via a pcbgroup.
1536  *
1537  * in_pcbref() should be used only to provide brief memory stability, and
1538  * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
1539  * garbage collect the inpcb if it has been in_pcbfree()'d from another
1540  * context.  Until in_pcbrele() has returned that the inpcb is still valid,
1541  * lock and rele are the *only* safe operations that may be performed on the
1542  * inpcb.
1543  *
1544  * While the inpcb will not be freed, releasing the inpcb lock means that the
1545  * connection's state may change, so the caller should be careful to
1546  * revalidate any cached state on reacquiring the lock.  Drop the reference
1547  * using in_pcbrele().
1548  */
1549 void
1550 in_pcbref(struct inpcb *inp)
1551 {
1552
1553         KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1554
1555         refcount_acquire(&inp->inp_refcount);
1556 }
1557
1558 /*
1559  * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
1560  * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
1561  * return a flag indicating whether or not the inpcb remains valid.  If it is
1562  * valid, we return with the inpcb lock held.
1563  *
1564  * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
1565  * reference on an inpcb.  Historically more work was done here (actually, in
1566  * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
1567  * need for the pcbinfo lock in in_pcbrele().  Deferring the free is entirely
1568  * about memory stability (and continued use of the write lock).
1569  */
1570 int
1571 in_pcbrele_rlocked(struct inpcb *inp)
1572 {
1573         struct inpcbinfo *pcbinfo;
1574
1575         KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1576
1577         INP_RLOCK_ASSERT(inp);
1578
1579         if (refcount_release(&inp->inp_refcount) == 0) {
1580                 /*
1581                  * If the inpcb has been freed, let the caller know, even if
1582                  * this isn't the last reference.
1583                  */
1584                 if (inp->inp_flags2 & INP_FREED) {
1585                         INP_RUNLOCK(inp);
1586                         return (1);
1587                 }
1588                 return (0);
1589         }
1590
1591         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1592 #ifdef TCPHPTS
1593         if (inp->inp_in_hpts || inp->inp_in_input) {
1594                 struct tcp_hpts_entry *hpts;
1595                 /*
1596                  * We should not be on the hpts at
1597                  * this point in any form. we must
1598                  * get the lock to be sure.
1599                  */
1600                 hpts = tcp_hpts_lock(inp);
1601                 if (inp->inp_in_hpts)
1602                         panic("Hpts:%p inp:%p at free still on hpts",
1603                               hpts, inp);
1604                 mtx_unlock(&hpts->p_mtx);
1605                 hpts = tcp_input_lock(inp);
1606                 if (inp->inp_in_input)
1607                         panic("Hpts:%p inp:%p at free still on input hpts",
1608                               hpts, inp);
1609                 mtx_unlock(&hpts->p_mtx);
1610         }
1611 #endif
1612         INP_RUNLOCK(inp);
1613         pcbinfo = inp->inp_pcbinfo;
1614         uma_zfree(pcbinfo->ipi_zone, inp);
1615         return (1);
1616 }
1617
1618 int
1619 in_pcbrele_wlocked(struct inpcb *inp)
1620 {
1621         struct inpcbinfo *pcbinfo;
1622
1623         KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1624
1625         INP_WLOCK_ASSERT(inp);
1626
1627         if (refcount_release(&inp->inp_refcount) == 0) {
1628                 /*
1629                  * If the inpcb has been freed, let the caller know, even if
1630                  * this isn't the last reference.
1631                  */
1632                 if (inp->inp_flags2 & INP_FREED) {
1633                         INP_WUNLOCK(inp);
1634                         return (1);
1635                 }
1636                 return (0);
1637         }
1638
1639         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1640 #ifdef TCPHPTS
1641         if (inp->inp_in_hpts || inp->inp_in_input) {
1642                 struct tcp_hpts_entry *hpts;
1643                 /*
1644                  * We should not be on the hpts at
1645                  * this point in any form. we must
1646                  * get the lock to be sure.
1647                  */
1648                 hpts = tcp_hpts_lock(inp);
1649                 if (inp->inp_in_hpts)
1650                         panic("Hpts:%p inp:%p at free still on hpts",
1651                               hpts, inp);
1652                 mtx_unlock(&hpts->p_mtx);
1653                 hpts = tcp_input_lock(inp);
1654                 if (inp->inp_in_input)
1655                         panic("Hpts:%p inp:%p at free still on input hpts",
1656                               hpts, inp);
1657                 mtx_unlock(&hpts->p_mtx);
1658         }
1659 #endif
1660         INP_WUNLOCK(inp);
1661         pcbinfo = inp->inp_pcbinfo;
1662         uma_zfree(pcbinfo->ipi_zone, inp);
1663         return (1);
1664 }
1665
1666 /*
1667  * Temporary wrapper.
1668  */
int
in_pcbrele(struct inpcb *inp)
{
	int gone;

	/* Plain alias for the write-locked release. */
	gone = in_pcbrele_wlocked(inp);
	return (gone);
}
1675
1676 void
1677 in_pcblist_rele_rlocked(epoch_context_t ctx)
1678 {
1679         struct in_pcblist *il;
1680         struct inpcb *inp;
1681         struct inpcbinfo *pcbinfo;
1682         int i, n;
1683
1684         il = __containerof(ctx, struct in_pcblist, il_epoch_ctx);
1685         pcbinfo = il->il_pcbinfo;
1686         n = il->il_count;
1687         INP_INFO_WLOCK(pcbinfo);
1688         for (i = 0; i < n; i++) {
1689                 inp = il->il_inp_list[i];
1690                 INP_RLOCK(inp);
1691                 if (!in_pcbrele_rlocked(inp))
1692                         INP_RUNLOCK(inp);
1693         }
1694         INP_INFO_WUNLOCK(pcbinfo);
1695         free(il, M_TEMP);
1696 }
1697
1698 static void
1699 inpcbport_free(epoch_context_t ctx)
1700 {
1701         struct inpcbport *phd;
1702
1703         phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx);
1704         free(phd, M_PCB);
1705 }
1706
1707 static void
1708 in_pcbfree_deferred(epoch_context_t ctx)
1709 {
1710         struct inpcb *inp;
1711         int released __unused;
1712
1713         inp = __containerof(ctx, struct inpcb, inp_epoch_ctx);
1714
1715         INP_WLOCK(inp);
1716         CURVNET_SET(inp->inp_vnet);
1717 #ifdef INET
1718         struct ip_moptions *imo = inp->inp_moptions;
1719         inp->inp_moptions = NULL;
1720 #endif
1721         /* XXXRW: Do as much as possible here. */
1722 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1723         if (inp->inp_sp != NULL)
1724                 ipsec_delete_pcbpolicy(inp);
1725 #endif
1726 #ifdef INET6
1727         struct ip6_moptions *im6o = NULL;
1728         if (inp->inp_vflag & INP_IPV6PROTO) {
1729                 ip6_freepcbopts(inp->in6p_outputopts);
1730                 im6o = inp->in6p_moptions;
1731                 inp->in6p_moptions = NULL;
1732         }
1733 #endif
1734         if (inp->inp_options)
1735                 (void)m_free(inp->inp_options);
1736         inp->inp_vflag = 0;
1737         crfree(inp->inp_cred);
1738 #ifdef MAC
1739         mac_inpcb_destroy(inp);
1740 #endif
1741         released = in_pcbrele_wlocked(inp);
1742         MPASS(released);
1743 #ifdef INET6
1744         ip6_freemoptions(im6o);
1745 #endif
1746 #ifdef INET
1747         inp_freemoptions(imo);
1748 #endif
1749         CURVNET_RESTORE();
1750 }
1751
1752 /*
1753  * Unconditionally schedule an inpcb to be freed by decrementing its
1754  * reference count, which should occur only after the inpcb has been detached
1755  * from its socket.  If another thread holds a temporary reference (acquired
1756  * using in_pcbref()) then the free is deferred until that reference is
1757  * released using in_pcbrele(), but the inpcb is still unlocked.  Almost all
1758  * work, including removal from global lists, is done in this context, where
1759  * the pcbinfo lock is held.
1760  */
1761 void
1762 in_pcbfree(struct inpcb *inp)
1763 {
1764         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1765
1766         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1767         KASSERT((inp->inp_flags2 & INP_FREED) == 0,
1768             ("%s: called twice for pcb %p", __func__, inp));
1769         if (inp->inp_flags2 & INP_FREED) {
1770                 INP_WUNLOCK(inp);
1771                 return;
1772         }
1773
1774         INP_WLOCK_ASSERT(inp);
1775         INP_LIST_WLOCK(pcbinfo);
1776         in_pcbremlists(inp);
1777         INP_LIST_WUNLOCK(pcbinfo);
1778         RO_INVALIDATE_CACHE(&inp->inp_route);
1779         /* mark as destruction in progress */
1780         inp->inp_flags2 |= INP_FREED;
1781         INP_WUNLOCK(inp);
1782         NET_EPOCH_CALL(in_pcbfree_deferred, &inp->inp_epoch_ctx);
1783 }
1784
1785 /*
1786  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
1787  * port reservation, and preventing it from being returned by inpcb lookups.
1788  *
1789  * It is used by TCP to mark an inpcb as unused and avoid future packet
1790  * delivery or event notification when a socket remains open but TCP has
1791  * closed.  This might occur as a result of a shutdown()-initiated TCP close
1792  * or a RST on the wire, and allows the port binding to be reused while still
1793  * maintaining the invariant that so_pcb always points to a valid inpcb until
1794  * in_pcbdetach().
1795  *
1796  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
1797  * in_pcbnotifyall() and in_pcbpurgeif0()?
1798  */
1799 void
1800 in_pcbdrop(struct inpcb *inp)
1801 {
1802
1803         INP_WLOCK_ASSERT(inp);
1804 #ifdef INVARIANTS
1805         if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
1806                 MPASS(inp->inp_refcount > 1);
1807 #endif
1808
1809         /*
1810          * XXXRW: Possibly we should protect the setting of INP_DROPPED with
1811          * the hash lock...?
1812          */
1813         inp->inp_flags |= INP_DROPPED;
1814         if (inp->inp_flags & INP_INHASHLIST) {
1815                 struct inpcbport *phd = inp->inp_phd;
1816
1817                 INP_HASH_WLOCK(inp->inp_pcbinfo);
1818                 in_pcbremlbgrouphash(inp);
1819                 CK_LIST_REMOVE(inp, inp_hash);
1820                 CK_LIST_REMOVE(inp, inp_portlist);
1821                 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
1822                         CK_LIST_REMOVE(phd, phd_hash);
1823                         NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx);
1824                 }
1825                 INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1826                 inp->inp_flags &= ~INP_INHASHLIST;
1827 #ifdef PCBGROUP
1828                 in_pcbgroup_remove(inp);
1829 #endif
1830         }
1831 }
1832
1833 #ifdef INET
1834 /*
1835  * Common routines to return the socket addresses associated with inpcbs.
1836  */
1837 struct sockaddr *
1838 in_sockaddr(in_port_t port, struct in_addr *addr_p)
1839 {
1840         struct sockaddr_in *sin;
1841
1842         sin = malloc(sizeof *sin, M_SONAME,
1843                 M_WAITOK | M_ZERO);
1844         sin->sin_family = AF_INET;
1845         sin->sin_len = sizeof(*sin);
1846         sin->sin_addr = *addr_p;
1847         sin->sin_port = port;
1848
1849         return (struct sockaddr *)sin;
1850 }
1851
1852 int
1853 in_getsockaddr(struct socket *so, struct sockaddr **nam)
1854 {
1855         struct inpcb *inp;
1856         struct in_addr addr;
1857         in_port_t port;
1858
1859         inp = sotoinpcb(so);
1860         KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1861
1862         INP_RLOCK(inp);
1863         port = inp->inp_lport;
1864         addr = inp->inp_laddr;
1865         INP_RUNLOCK(inp);
1866
1867         *nam = in_sockaddr(port, &addr);
1868         return 0;
1869 }
1870
1871 int
1872 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
1873 {
1874         struct inpcb *inp;
1875         struct in_addr addr;
1876         in_port_t port;
1877
1878         inp = sotoinpcb(so);
1879         KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1880
1881         INP_RLOCK(inp);
1882         port = inp->inp_fport;
1883         addr = inp->inp_faddr;
1884         INP_RUNLOCK(inp);
1885
1886         *nam = in_sockaddr(port, &addr);
1887         return 0;
1888 }
1889
1890 void
1891 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
1892     struct inpcb *(*notify)(struct inpcb *, int))
1893 {
1894         struct inpcb *inp, *inp_temp;
1895
1896         INP_INFO_WLOCK(pcbinfo);
1897         CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
1898                 INP_WLOCK(inp);
1899 #ifdef INET6
1900                 if ((inp->inp_vflag & INP_IPV4) == 0) {
1901                         INP_WUNLOCK(inp);
1902                         continue;
1903                 }
1904 #endif
1905                 if (inp->inp_faddr.s_addr != faddr.s_addr ||
1906                     inp->inp_socket == NULL) {
1907                         INP_WUNLOCK(inp);
1908                         continue;
1909                 }
1910                 if ((*notify)(inp, errno))
1911                         INP_WUNLOCK(inp);
1912         }
1913         INP_INFO_WUNLOCK(pcbinfo);
1914 }
1915
1916 void
1917 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
1918 {
1919         struct inpcb *inp;
1920         struct in_multi *inm;
1921         struct in_mfilter *imf;
1922         struct ip_moptions *imo;
1923
1924         INP_INFO_WLOCK(pcbinfo);
1925         CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1926                 INP_WLOCK(inp);
1927                 imo = inp->inp_moptions;
1928                 if ((inp->inp_vflag & INP_IPV4) &&
1929                     imo != NULL) {
1930                         /*
1931                          * Unselect the outgoing interface if it is being
1932                          * detached.
1933                          */
1934                         if (imo->imo_multicast_ifp == ifp)
1935                                 imo->imo_multicast_ifp = NULL;
1936
1937                         /*
1938                          * Drop multicast group membership if we joined
1939                          * through the interface being detached.
1940                          *
1941                          * XXX This can all be deferred to an epoch_call
1942                          */
1943 restart:
1944                         IP_MFILTER_FOREACH(imf, &imo->imo_head) {
1945                                 if ((inm = imf->imf_inm) == NULL)
1946                                         continue;
1947                                 if (inm->inm_ifp != ifp)
1948                                         continue;
1949                                 ip_mfilter_remove(&imo->imo_head, imf);
1950                                 IN_MULTI_LOCK_ASSERT();
1951                                 in_leavegroup_locked(inm, NULL);
1952                                 ip_mfilter_free(imf);
1953                                 goto restart;
1954                         }
1955                 }
1956                 INP_WUNLOCK(inp);
1957         }
1958         INP_INFO_WUNLOCK(pcbinfo);
1959 }
1960
1961 /*
1962  * Lookup a PCB based on the local address and port.  Caller must hold the
1963  * hash lock.  No inpcb locks or references are acquired.
1964  */
1965 #define INP_LOOKUP_MAPPED_PCB_COST      3
1966 struct inpcb *
1967 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
1968     u_short lport, int lookupflags, struct ucred *cred)
1969 {
1970         struct inpcb *inp;
1971 #ifdef INET6
1972         int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
1973 #else
1974         int matchwild = 3;
1975 #endif
1976         int wildcard;
1977
1978         KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
1979             ("%s: invalid lookup flags %d", __func__, lookupflags));
1980
1981         INP_HASH_LOCK_ASSERT(pcbinfo);
1982
1983         if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
1984                 struct inpcbhead *head;
1985                 /*
1986                  * Look for an unconnected (wildcard foreign addr) PCB that
1987                  * matches the local address and port we're looking for.
1988                  */
1989                 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
1990                     0, pcbinfo->ipi_hashmask)];
1991                 CK_LIST_FOREACH(inp, head, inp_hash) {
1992 #ifdef INET6
1993                         /* XXX inp locking */
1994                         if ((inp->inp_vflag & INP_IPV4) == 0)
1995                                 continue;
1996 #endif
1997                         if (inp->inp_faddr.s_addr == INADDR_ANY &&
1998                             inp->inp_laddr.s_addr == laddr.s_addr &&
1999                             inp->inp_lport == lport) {
2000                                 /*
2001                                  * Found?
2002                                  */
2003                                 if (cred == NULL ||
2004                                     prison_equal_ip4(cred->cr_prison,
2005                                         inp->inp_cred->cr_prison))
2006                                         return (inp);
2007                         }
2008                 }
2009                 /*
2010                  * Not found.
2011                  */
2012                 return (NULL);
2013         } else {
2014                 struct inpcbporthead *porthash;
2015                 struct inpcbport *phd;
2016                 struct inpcb *match = NULL;
2017                 /*
2018                  * Best fit PCB lookup.
2019                  *
2020                  * First see if this local port is in use by looking on the
2021                  * port hash list.
2022                  */
2023                 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
2024                     pcbinfo->ipi_porthashmask)];
2025                 CK_LIST_FOREACH(phd, porthash, phd_hash) {
2026                         if (phd->phd_port == lport)
2027                                 break;
2028                 }
2029                 if (phd != NULL) {
2030                         /*
2031                          * Port is in use by one or more PCBs. Look for best
2032                          * fit.
2033                          */
2034                         CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
2035                                 wildcard = 0;
2036                                 if (cred != NULL &&
2037                                     !prison_equal_ip4(inp->inp_cred->cr_prison,
2038                                         cred->cr_prison))
2039                                         continue;
2040 #ifdef INET6
2041                                 /* XXX inp locking */
2042                                 if ((inp->inp_vflag & INP_IPV4) == 0)
2043                                         continue;
2044                                 /*
2045                                  * We never select the PCB that has
2046                                  * INP_IPV6 flag and is bound to :: if
2047                                  * we have another PCB which is bound
2048                                  * to 0.0.0.0.  If a PCB has the
2049                                  * INP_IPV6 flag, then we set its cost
2050                                  * higher than IPv4 only PCBs.
2051                                  *
2052                                  * Note that the case only happens
2053                                  * when a socket is bound to ::, under
2054                                  * the condition that the use of the
2055                                  * mapped address is allowed.
2056                                  */
2057                                 if ((inp->inp_vflag & INP_IPV6) != 0)
2058                                         wildcard += INP_LOOKUP_MAPPED_PCB_COST;
2059 #endif
2060                                 if (inp->inp_faddr.s_addr != INADDR_ANY)
2061                                         wildcard++;
2062                                 if (inp->inp_laddr.s_addr != INADDR_ANY) {
2063                                         if (laddr.s_addr == INADDR_ANY)
2064                                                 wildcard++;
2065                                         else if (inp->inp_laddr.s_addr != laddr.s_addr)
2066                                                 continue;
2067                                 } else {
2068                                         if (laddr.s_addr != INADDR_ANY)
2069                                                 wildcard++;
2070                                 }
2071                                 if (wildcard < matchwild) {
2072                                         match = inp;
2073                                         matchwild = wildcard;
2074                                         if (matchwild == 0)
2075                                                 break;
2076                                 }
2077                         }
2078                 }
2079                 return (match);
2080         }
2081 }
2082 #undef INP_LOOKUP_MAPPED_PCB_COST
2083
2084 static struct inpcb *
2085 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
2086     const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
2087     uint16_t fport, int lookupflags, int numa_domain)
2088 {
2089         struct inpcb *local_wild, *numa_wild;
2090         const struct inpcblbgrouphead *hdr;
2091         struct inpcblbgroup *grp;
2092         uint32_t idx;
2093
2094         INP_HASH_LOCK_ASSERT(pcbinfo);
2095
2096         hdr = &pcbinfo->ipi_lbgrouphashbase[
2097             INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
2098
2099         /*
2100          * Order of socket selection:
2101          * 1. non-wild.
2102          * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
2103          *
2104          * NOTE:
2105          * - Load balanced group does not contain jailed sockets
2106          * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
2107          */
2108         local_wild = NULL;
2109         numa_wild = NULL;
2110         CK_LIST_FOREACH(grp, hdr, il_list) {
2111 #ifdef INET6
2112                 if (!(grp->il_vflag & INP_IPV4))
2113                         continue;
2114 #endif
2115                 if (grp->il_lport != lport)
2116                         continue;
2117
2118                 idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) %
2119                     grp->il_inpcnt;
2120                 if (grp->il_laddr.s_addr == laddr->s_addr) {
2121                         if (numa_domain == M_NODOM ||
2122                             grp->il_numa_domain == numa_domain) {
2123                                 return (grp->il_inp[idx]);
2124                         } else {
2125                                 numa_wild = grp->il_inp[idx];
2126                         }
2127                 }
2128                 if (grp->il_laddr.s_addr == INADDR_ANY &&
2129                     (lookupflags & INPLOOKUP_WILDCARD) != 0 &&
2130                     (local_wild == NULL || numa_domain == M_NODOM ||
2131                         grp->il_numa_domain == numa_domain)) {
2132                         local_wild = grp->il_inp[idx];
2133                 }
2134         }
2135         if (numa_wild != NULL)
2136                 return (numa_wild);
2137
2138         return (local_wild);
2139 }
2140
2141 #ifdef PCBGROUP
2142 /*
2143  * Lookup PCB in hash list, using pcbgroup tables.
2144  */
2145 static struct inpcb *
2146 in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
2147     struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
2148     u_int lport_arg, int lookupflags, struct ifnet *ifp)
2149 {
2150         struct inpcbhead *head;
2151         struct inpcb *inp, *tmpinp;
2152         u_short fport = fport_arg, lport = lport_arg;
2153         bool locked;
2154
2155         /*
2156          * First look for an exact match.
2157          */
2158         tmpinp = NULL;
2159         INP_GROUP_LOCK(pcbgroup);
2160         head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
2161             pcbgroup->ipg_hashmask)];
2162         CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
2163 #ifdef INET6
2164                 /* XXX inp locking */
2165                 if ((inp->inp_vflag & INP_IPV4) == 0)
2166                         continue;
2167 #endif
2168                 if (inp->inp_faddr.s_addr == faddr.s_addr &&
2169                     inp->inp_laddr.s_addr == laddr.s_addr &&
2170                     inp->inp_fport == fport &&
2171                     inp->inp_lport == lport) {
2172                         /*
2173                          * XXX We should be able to directly return
2174                          * the inp here, without any checks.
2175                          * Well unless both bound with SO_REUSEPORT?
2176                          */
2177                         if (prison_flag(inp->inp_cred, PR_IP4))
2178                                 goto found;
2179                         if (tmpinp == NULL)
2180                                 tmpinp = inp;
2181                 }
2182         }
2183         if (tmpinp != NULL) {
2184                 inp = tmpinp;
2185                 goto found;
2186         }
2187
2188 #ifdef  RSS
2189         /*
2190          * For incoming connections, we may wish to do a wildcard
2191          * match for an RSS-local socket.
2192          */
2193         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2194                 struct inpcb *local_wild = NULL, *local_exact = NULL;
2195 #ifdef INET6
2196                 struct inpcb *local_wild_mapped = NULL;
2197 #endif
2198                 struct inpcb *jail_wild = NULL;
2199                 struct inpcbhead *head;
2200                 int injail;
2201
2202                 /*
2203                  * Order of socket selection - we always prefer jails.
2204                  *      1. jailed, non-wild.
2205                  *      2. jailed, wild.
2206                  *      3. non-jailed, non-wild.
2207                  *      4. non-jailed, wild.
2208                  */
2209
2210                 head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
2211                     lport, 0, pcbgroup->ipg_hashmask)];
2212                 CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
2213 #ifdef INET6
2214                         /* XXX inp locking */
2215                         if ((inp->inp_vflag & INP_IPV4) == 0)
2216                                 continue;
2217 #endif
2218                         if (inp->inp_faddr.s_addr != INADDR_ANY ||
2219                             inp->inp_lport != lport)
2220                                 continue;
2221
2222                         injail = prison_flag(inp->inp_cred, PR_IP4);
2223                         if (injail) {
2224                                 if (prison_check_ip4(inp->inp_cred,
2225                                     &laddr) != 0)
2226                                         continue;
2227                         } else {
2228                                 if (local_exact != NULL)
2229                                         continue;
2230                         }
2231
2232                         if (inp->inp_laddr.s_addr == laddr.s_addr) {
2233                                 if (injail)
2234                                         goto found;
2235                                 else
2236                                         local_exact = inp;
2237                         } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
2238 #ifdef INET6
2239                                 /* XXX inp locking, NULL check */
2240                                 if (inp->inp_vflag & INP_IPV6PROTO)
2241                                         local_wild_mapped = inp;
2242                                 else
2243 #endif
2244                                         if (injail)
2245                                                 jail_wild = inp;
2246                                         else
2247                                                 local_wild = inp;
2248                         }
2249                 } /* LIST_FOREACH */
2250
2251                 inp = jail_wild;
2252                 if (inp == NULL)
2253                         inp = local_exact;
2254                 if (inp == NULL)
2255                         inp = local_wild;
2256 #ifdef INET6
2257                 if (inp == NULL)
2258                         inp = local_wild_mapped;
2259 #endif
2260                 if (inp != NULL)
2261                         goto found;
2262         }
2263 #endif
2264
2265         /*
2266          * Then look for a wildcard match, if requested.
2267          */
2268         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2269                 struct inpcb *local_wild = NULL, *local_exact = NULL;
2270 #ifdef INET6
2271                 struct inpcb *local_wild_mapped = NULL;
2272 #endif
2273                 struct inpcb *jail_wild = NULL;
2274                 struct inpcbhead *head;
2275                 int injail;
2276
2277                 /*
2278                  * Order of socket selection - we always prefer jails.
2279                  *      1. jailed, non-wild.
2280                  *      2. jailed, wild.
2281                  *      3. non-jailed, non-wild.
2282                  *      4. non-jailed, wild.
2283                  */
2284                 head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
2285                     0, pcbinfo->ipi_wildmask)];
2286                 CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
2287 #ifdef INET6
2288                         /* XXX inp locking */
2289                         if ((inp->inp_vflag & INP_IPV4) == 0)
2290                                 continue;
2291 #endif
2292                         if (inp->inp_faddr.s_addr != INADDR_ANY ||
2293                             inp->inp_lport != lport)
2294                                 continue;
2295
2296                         injail = prison_flag(inp->inp_cred, PR_IP4);
2297                         if (injail) {
2298                                 if (prison_check_ip4(inp->inp_cred,
2299                                     &laddr) != 0)
2300                                         continue;
2301                         } else {
2302                                 if (local_exact != NULL)
2303                                         continue;
2304                         }
2305
2306                         if (inp->inp_laddr.s_addr == laddr.s_addr) {
2307                                 if (injail)
2308                                         goto found;
2309                                 else
2310                                         local_exact = inp;
2311                         } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
2312 #ifdef INET6
2313                                 /* XXX inp locking, NULL check */
2314                                 if (inp->inp_vflag & INP_IPV6PROTO)
2315                                         local_wild_mapped = inp;
2316                                 else
2317 #endif
2318                                         if (injail)
2319                                                 jail_wild = inp;
2320                                         else
2321                                                 local_wild = inp;
2322                         }
2323                 } /* LIST_FOREACH */
2324                 inp = jail_wild;
2325                 if (inp == NULL)
2326                         inp = local_exact;
2327                 if (inp == NULL)
2328                         inp = local_wild;
2329 #ifdef INET6
2330                 if (inp == NULL)
2331                         inp = local_wild_mapped;
2332 #endif
2333                 if (inp != NULL)
2334                         goto found;
2335         } /* if (lookupflags & INPLOOKUP_WILDCARD) */
2336         INP_GROUP_UNLOCK(pcbgroup);
2337         return (NULL);
2338
2339 found:
2340         if (lookupflags & INPLOOKUP_WLOCKPCB)
2341                 locked = INP_TRY_WLOCK(inp);
2342         else if (lookupflags & INPLOOKUP_RLOCKPCB)
2343                 locked = INP_TRY_RLOCK(inp);
2344         else
2345                 panic("%s: locking bug", __func__);
2346         if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) {
2347                 if (lookupflags & INPLOOKUP_WLOCKPCB)
2348                         INP_WUNLOCK(inp);
2349                 else
2350                         INP_RUNLOCK(inp);
2351                 return (NULL);
2352         } else if (!locked)
2353                 in_pcbref(inp);
2354         INP_GROUP_UNLOCK(pcbgroup);
2355         if (!locked) {
2356                 if (lookupflags & INPLOOKUP_WLOCKPCB) {
2357                         INP_WLOCK(inp);
2358                         if (in_pcbrele_wlocked(inp))
2359                                 return (NULL);
2360                 } else {
2361                         INP_RLOCK(inp);
2362                         if (in_pcbrele_rlocked(inp))
2363                                 return (NULL);
2364                 }
2365         }
2366 #ifdef INVARIANTS
2367         if (lookupflags & INPLOOKUP_WLOCKPCB)
2368                 INP_WLOCK_ASSERT(inp);
2369         else
2370                 INP_RLOCK_ASSERT(inp);
2371 #endif
2372         return (inp);
2373 }
2374 #endif /* PCBGROUP */
2375
2376 /*
2377  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
2378  * that the caller has locked the hash list, and will not perform any further
2379  * locking or reference operations on either the hash list or the connection.
2380  */
2381 static struct inpcb *
2382 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2383     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
2384     struct ifnet *ifp, uint8_t numa_domain)
2385 {
2386         struct inpcbhead *head;
2387         struct inpcb *inp, *tmpinp;
2388         u_short fport = fport_arg, lport = lport_arg;
2389
2390         KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
2391             ("%s: invalid lookup flags %d", __func__, lookupflags));
2392         INP_HASH_LOCK_ASSERT(pcbinfo);
2393
2394         /*
2395          * First look for an exact match.
2396          */
2397         tmpinp = NULL;
2398         head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
2399             pcbinfo->ipi_hashmask)];
2400         CK_LIST_FOREACH(inp, head, inp_hash) {
2401 #ifdef INET6
2402                 /* XXX inp locking */
2403                 if ((inp->inp_vflag & INP_IPV4) == 0)
2404                         continue;
2405 #endif
2406                 if (inp->inp_faddr.s_addr == faddr.s_addr &&
2407                     inp->inp_laddr.s_addr == laddr.s_addr &&
2408                     inp->inp_fport == fport &&
2409                     inp->inp_lport == lport) {
2410                         /*
2411                          * XXX We should be able to directly return
2412                          * the inp here, without any checks.
2413                          * Well unless both bound with SO_REUSEPORT?
2414                          */
2415                         if (prison_flag(inp->inp_cred, PR_IP4))
2416                                 return (inp);
2417                         if (tmpinp == NULL)
2418                                 tmpinp = inp;
2419                 }
2420         }
2421         if (tmpinp != NULL)
2422                 return (tmpinp);
2423
2424         /*
2425          * Then look in lb group (for wildcard match).
2426          */
2427         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2428                 inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
2429                     fport, lookupflags, numa_domain);
2430                 if (inp != NULL)
2431                         return (inp);
2432         }
2433
2434         /*
2435          * Then look for a wildcard match, if requested.
2436          */
2437         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2438                 struct inpcb *local_wild = NULL, *local_exact = NULL;
2439 #ifdef INET6
2440                 struct inpcb *local_wild_mapped = NULL;
2441 #endif
2442                 struct inpcb *jail_wild = NULL;
2443                 int injail;
2444
2445                 /*
2446                  * Order of socket selection - we always prefer jails.
2447                  *      1. jailed, non-wild.
2448                  *      2. jailed, wild.
2449                  *      3. non-jailed, non-wild.
2450                  *      4. non-jailed, wild.
2451                  */
2452
2453                 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
2454                     0, pcbinfo->ipi_hashmask)];
2455                 CK_LIST_FOREACH(inp, head, inp_hash) {
2456 #ifdef INET6
2457                         /* XXX inp locking */
2458                         if ((inp->inp_vflag & INP_IPV4) == 0)
2459                                 continue;
2460 #endif
2461                         if (inp->inp_faddr.s_addr != INADDR_ANY ||
2462                             inp->inp_lport != lport)
2463                                 continue;
2464
2465                         injail = prison_flag(inp->inp_cred, PR_IP4);
2466                         if (injail) {
2467                                 if (prison_check_ip4(inp->inp_cred,
2468                                     &laddr) != 0)
2469                                         continue;
2470                         } else {
2471                                 if (local_exact != NULL)
2472                                         continue;
2473                         }
2474
2475                         if (inp->inp_laddr.s_addr == laddr.s_addr) {
2476                                 if (injail)
2477                                         return (inp);
2478                                 else
2479                                         local_exact = inp;
2480                         } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
2481 #ifdef INET6
2482                                 /* XXX inp locking, NULL check */
2483                                 if (inp->inp_vflag & INP_IPV6PROTO)
2484                                         local_wild_mapped = inp;
2485                                 else
2486 #endif
2487                                         if (injail)
2488                                                 jail_wild = inp;
2489                                         else
2490                                                 local_wild = inp;
2491                         }
2492                 } /* LIST_FOREACH */
2493                 if (jail_wild != NULL)
2494                         return (jail_wild);
2495                 if (local_exact != NULL)
2496                         return (local_exact);
2497                 if (local_wild != NULL)
2498                         return (local_wild);
2499 #ifdef INET6
2500                 if (local_wild_mapped != NULL)
2501                         return (local_wild_mapped);
2502 #endif
2503         } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
2504
2505         return (NULL);
2506 }
2507
2508 /*
2509  * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
2510  * hash list lock, and will return the inpcb locked (i.e., requires
2511  * INPLOOKUP_LOCKPCB).
2512  */
2513 static struct inpcb *
2514 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2515     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2516     struct ifnet *ifp, uint8_t numa_domain)
2517 {
2518         struct inpcb *inp;
2519
2520         inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
2521             (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp,
2522             numa_domain);
2523         if (inp != NULL) {
2524                 if (lookupflags & INPLOOKUP_WLOCKPCB) {
2525                         INP_WLOCK(inp);
2526                         if (__predict_false(inp->inp_flags2 & INP_FREED)) {
2527                                 INP_WUNLOCK(inp);
2528                                 inp = NULL;
2529                         }
2530                 } else if (lookupflags & INPLOOKUP_RLOCKPCB) {
2531                         INP_RLOCK(inp);
2532                         if (__predict_false(inp->inp_flags2 & INP_FREED)) {
2533                                 INP_RUNLOCK(inp);
2534                                 inp = NULL;
2535                         }
2536                 } else
2537                         panic("%s: locking bug", __func__);
2538 #ifdef INVARIANTS
2539                 if (inp != NULL) {
2540                         if (lookupflags & INPLOOKUP_WLOCKPCB)
2541                                 INP_WLOCK_ASSERT(inp);
2542                         else
2543                                 INP_RLOCK_ASSERT(inp);
2544                 }
2545 #endif
2546         }
2547
2548         return (inp);
2549 }
2550
2551 /*
2552  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2553  * from which a pre-calculated hash value may be extracted.
2554  *
2555  * Possibly more of this logic should be in in_pcbgroup.c.
2556  */
2557 struct inpcb *
2558 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2559     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
2560 {
2561 #if defined(PCBGROUP) && !defined(RSS)
2562         struct inpcbgroup *pcbgroup;
2563 #endif
2564
2565         KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2566             ("%s: invalid lookup flags %d", __func__, lookupflags));
2567         KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2568             ("%s: LOCKPCB not set", __func__));
2569
2570         /*
2571          * When not using RSS, use connection groups in preference to the
2572          * reservation table when looking up 4-tuples.  When using RSS, just
2573          * use the reservation table, due to the cost of the Toeplitz hash
2574          * in software.
2575          *
2576          * XXXRW: This policy belongs in the pcbgroup code, as in principle
2577          * we could be doing RSS with a non-Toeplitz hash that is affordable
2578          * in software.
2579          */
2580 #if defined(PCBGROUP) && !defined(RSS)
2581         if (in_pcbgroup_enabled(pcbinfo)) {
2582                 pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
2583                     fport);
2584                 return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
2585                     laddr, lport, lookupflags, ifp));
2586         }
2587 #endif
2588         return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2589             lookupflags, ifp, M_NODOM));
2590 }
2591
2592 struct inpcb *
2593 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2594     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2595     struct ifnet *ifp, struct mbuf *m)
2596 {
2597 #ifdef PCBGROUP
2598         struct inpcbgroup *pcbgroup;
2599 #endif
2600
2601         KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2602             ("%s: invalid lookup flags %d", __func__, lookupflags));
2603         KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2604             ("%s: LOCKPCB not set", __func__));
2605
2606 #ifdef PCBGROUP
2607         /*
2608          * If we can use a hardware-generated hash to look up the connection
2609          * group, use that connection group to find the inpcb.  Otherwise
2610          * fall back on a software hash -- or the reservation table if we're
2611          * using RSS.
2612          *
2613          * XXXRW: As above, that policy belongs in the pcbgroup code.
2614          */
2615         if (in_pcbgroup_enabled(pcbinfo) &&
2616             !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
2617                 pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
2618                     m->m_pkthdr.flowid);
2619                 if (pcbgroup != NULL)
2620                         return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
2621                             fport, laddr, lport, lookupflags, ifp));
2622 #ifndef RSS
2623                 pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
2624                     fport);
2625                 return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
2626                     laddr, lport, lookupflags, ifp));
2627 #endif
2628         }
2629 #endif
2630         return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2631             lookupflags, ifp, m->m_pkthdr.numa_domain));
2632 }
2633 #endif /* INET */
2634
2635 /*
2636  * Insert PCB onto various hash lists.
2637  */
2638 static int
2639 in_pcbinshash_internal(struct inpcb *inp, struct mbuf *m)
2640 {
2641         struct inpcbhead *pcbhash;
2642         struct inpcbporthead *pcbporthash;
2643         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2644         struct inpcbport *phd;
2645         u_int32_t hashkey_faddr;
2646         int so_options;
2647
2648         INP_WLOCK_ASSERT(inp);
2649         INP_HASH_WLOCK_ASSERT(pcbinfo);
2650
2651         KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
2652             ("in_pcbinshash: INP_INHASHLIST"));
2653
2654 #ifdef INET6
2655         if (inp->inp_vflag & INP_IPV6)
2656                 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
2657         else
2658 #endif
2659         hashkey_faddr = inp->inp_faddr.s_addr;
2660
2661         pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
2662                  inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2663
2664         pcbporthash = &pcbinfo->ipi_porthashbase[
2665             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
2666
2667         /*
2668          * Add entry to load balance group.
2669          * Only do this if SO_REUSEPORT_LB is set.
2670          */
2671         so_options = inp_so_options(inp);
2672         if (so_options & SO_REUSEPORT_LB) {
2673                 int ret = in_pcbinslbgrouphash(inp, M_NODOM);
2674                 if (ret) {
2675                         /* pcb lb group malloc fail (ret=ENOBUFS). */
2676                         return (ret);
2677                 }
2678         }
2679
2680         /*
2681          * Go through port list and look for a head for this lport.
2682          */
2683         CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
2684                 if (phd->phd_port == inp->inp_lport)
2685                         break;
2686         }
2687         /*
2688          * If none exists, malloc one and tack it on.
2689          */
2690         if (phd == NULL) {
2691                 phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
2692                 if (phd == NULL) {
2693                         return (ENOBUFS); /* XXX */
2694                 }
2695                 bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context));
2696                 phd->phd_port = inp->inp_lport;
2697                 CK_LIST_INIT(&phd->phd_pcblist);
2698                 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
2699         }
2700         inp->inp_phd = phd;
2701         CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
2702         CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
2703         inp->inp_flags |= INP_INHASHLIST;
2704 #ifdef PCBGROUP
2705         if (m != NULL) {
2706                 in_pcbgroup_update_mbuf(inp, m);
2707         } else {
2708                 in_pcbgroup_update(inp);
2709         }
2710 #endif
2711         return (0);
2712 }
2713
2714 int
2715 in_pcbinshash(struct inpcb *inp)
2716 {
2717
2718         return (in_pcbinshash_internal(inp, NULL));
2719 }
2720
/*
 * Insert the PCB onto the hash lists, forwarding mbuf 'm' to
 * in_pcbinshash_internal(); see there for the return value.
 */
int
in_pcbinshash_mbuf(struct inpcb *inp, struct mbuf *m)
{
	int error;

	error = in_pcbinshash_internal(inp, m);
	return (error);
}
2727
2728 /*
2729  * Move PCB to the proper hash bucket when { faddr, fport } have  been
2730  * changed. NOTE: This does not handle the case of the lport changing (the
2731  * hashed port list would have to be updated as well), so the lport must
2732  * not change after in_pcbinshash() has been called.
2733  */
2734 void
2735 in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
2736 {
2737         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2738         struct inpcbhead *head;
2739         u_int32_t hashkey_faddr;
2740
2741         INP_WLOCK_ASSERT(inp);
2742         INP_HASH_WLOCK_ASSERT(pcbinfo);
2743
2744         KASSERT(inp->inp_flags & INP_INHASHLIST,
2745             ("in_pcbrehash: !INP_INHASHLIST"));
2746
2747 #ifdef INET6
2748         if (inp->inp_vflag & INP_IPV6)
2749                 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
2750         else
2751 #endif
2752         hashkey_faddr = inp->inp_faddr.s_addr;
2753
2754         head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
2755                 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2756
2757         CK_LIST_REMOVE(inp, inp_hash);
2758         CK_LIST_INSERT_HEAD(head, inp, inp_hash);
2759
2760 #ifdef PCBGROUP
2761         if (m != NULL)
2762                 in_pcbgroup_update_mbuf(inp, m);
2763         else
2764                 in_pcbgroup_update(inp);
2765 #endif
2766 }
2767
2768 void
2769 in_pcbrehash(struct inpcb *inp)
2770 {
2771
2772         in_pcbrehash_mbuf(inp, NULL);
2773 }
2774
2775 /*
2776  * Remove PCB from various lists.
2777  */
2778 static void
2779 in_pcbremlists(struct inpcb *inp)
2780 {
2781         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2782
2783         INP_WLOCK_ASSERT(inp);
2784         INP_LIST_WLOCK_ASSERT(pcbinfo);
2785
2786         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
2787         if (inp->inp_flags & INP_INHASHLIST) {
2788                 struct inpcbport *phd = inp->inp_phd;
2789
2790                 INP_HASH_WLOCK(pcbinfo);
2791
2792                 /* XXX: Only do if SO_REUSEPORT_LB set? */
2793                 in_pcbremlbgrouphash(inp);
2794
2795                 CK_LIST_REMOVE(inp, inp_hash);
2796                 CK_LIST_REMOVE(inp, inp_portlist);
2797                 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
2798                         CK_LIST_REMOVE(phd, phd_hash);
2799                         NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx);
2800                 }
2801                 INP_HASH_WUNLOCK(pcbinfo);
2802                 inp->inp_flags &= ~INP_INHASHLIST;
2803         }
2804         CK_LIST_REMOVE(inp, inp_list);
2805         pcbinfo->ipi_count--;
2806 #ifdef PCBGROUP
2807         in_pcbgroup_remove(inp);
2808 #endif
2809 }
2810
2811 /*
2812  * Check for alternatives when higher level complains
2813  * about service problems.  For now, invalidate cached
2814  * routing information.  If the route was created dynamically
2815  * (by a redirect), time to try a default gateway again.
2816  */
2817 void
2818 in_losing(struct inpcb *inp)
2819 {
2820
2821         RO_INVALIDATE_CACHE(&inp->inp_route);
2822         return;
2823 }
2824
/*
 * Propagate a socket-layer label change into the socket's inpcb.  This is
 * a no-op unless the kernel is built with MAC.  The inpcb write lock is
 * taken first, then the socket lock, around the MAC call.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2845
2846 /*
2847  * ipport_tick runs once per second, determining if random port allocation
2848  * should be continued.  If more than ipport_randomcps ports have been
2849  * allocated in the last second, then we return to sequential port
2850  * allocation. We return to random allocation only once we drop below
2851  * ipport_randomcps for at least ipport_randomtime seconds.
2852  */
2853 static void
2854 ipport_tick(void *xtp)
2855 {
2856         VNET_ITERATOR_DECL(vnet_iter);
2857
2858         VNET_LIST_RLOCK_NOSLEEP();
2859         VNET_FOREACH(vnet_iter) {
2860                 CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */
2861                 if (V_ipport_tcpallocs <=
2862                     V_ipport_tcplastcount + V_ipport_randomcps) {
2863                         if (V_ipport_stoprandom > 0)
2864                                 V_ipport_stoprandom--;
2865                 } else
2866                         V_ipport_stoprandom = V_ipport_randomtime;
2867                 V_ipport_tcplastcount = V_ipport_tcpallocs;
2868                 CURVNET_RESTORE();
2869         }
2870         VNET_LIST_RUNLOCK_NOSLEEP();
2871         callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
2872 }
2873
2874 static void
2875 ip_fini(void *xtp)
2876 {
2877
2878         callout_stop(&ipport_tick_callout);
2879 }
2880
2881 /*
2882  * The ipport_callout should start running at about the time we attach the
2883  * inet or inet6 domains.
2884  */
2885 static void
2886 ipport_tick_init(const void *unused __unused)
2887 {
2888
2889         /* Start ipport_tick. */
2890         callout_init(&ipport_tick_callout, 1);
2891         callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
2892         EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
2893                 SHUTDOWN_PRI_DEFAULT);
2894 }
2895 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
2896     ipport_tick_init, NULL);
2897
/*
 * Function form of INP_WLOCK(): write-lock the inpcb.
 */
void
inp_wlock(struct inpcb *inp)
{
	INP_WLOCK(inp);
}
2904
/*
 * Function form of INP_WUNLOCK(): drop the inpcb write lock.
 */
void
inp_wunlock(struct inpcb *inp)
{
	INP_WUNLOCK(inp);
}
2911
/*
 * Function form of INP_RLOCK(): read-lock the inpcb.
 */
void
inp_rlock(struct inpcb *inp)
{
	INP_RLOCK(inp);
}
2918
/*
 * Function form of INP_RUNLOCK(): drop the inpcb read lock.
 */
void
inp_runlock(struct inpcb *inp)
{
	INP_RUNLOCK(inp);
}
2925
2926 #ifdef INVARIANT_SUPPORT
/*
 * Function form of INP_WLOCK_ASSERT(): assert the inpcb is write-locked.
 */
void
inp_lock_assert(struct inpcb *inp)
{
	INP_WLOCK_ASSERT(inp);
}
2933
/*
 * Function form of INP_UNLOCK_ASSERT() for the given inpcb.
 */
void
inp_unlock_assert(struct inpcb *inp)
{
	INP_UNLOCK_ASSERT(inp);
}
2940 #endif
2941
2942 void
2943 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
2944 {
2945         struct inpcb *inp;
2946
2947         INP_INFO_WLOCK(&V_tcbinfo);
2948         CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
2949                 INP_WLOCK(inp);
2950                 func(inp, arg);
2951                 INP_WUNLOCK(inp);
2952         }
2953         INP_INFO_WUNLOCK(&V_tcbinfo);
2954 }
2955
2956 struct socket *
2957 inp_inpcbtosocket(struct inpcb *inp)
2958 {
2959
2960         INP_WLOCK_ASSERT(inp);
2961         return (inp->inp_socket);
2962 }
2963
2964 struct tcpcb *
2965 inp_inpcbtotcpcb(struct inpcb *inp)
2966 {
2967
2968         INP_WLOCK_ASSERT(inp);
2969         return ((struct tcpcb *)inp->inp_ppcb);
2970 }
2971
2972 int
2973 inp_ip_tos_get(const struct inpcb *inp)
2974 {
2975
2976         return (inp->inp_ip_tos);
2977 }
2978
2979 void
2980 inp_ip_tos_set(struct inpcb *inp, int val)
2981 {
2982
2983         inp->inp_ip_tos = val;
2984 }
2985
2986 void
2987 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
2988     uint32_t *faddr, uint16_t *fp)
2989 {
2990
2991         INP_LOCK_ASSERT(inp);
2992         *laddr = inp->inp_laddr.s_addr;
2993         *faddr = inp->inp_faddr.s_addr;
2994         *lp = inp->inp_lport;
2995         *fp = inp->inp_fport;
2996 }
2997
/*
 * Function form of sotoinpcb().
 */
struct inpcb *
so_sotoinpcb(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	return (inp);
}
3004
/*
 * Function form of sototcpcb().
 */
struct tcpcb *
so_sototcpcb(struct socket *so)
{
	struct tcpcb *tp;

	tp = sototcpcb(so);
	return (tp);
}
3011
3012 /*
3013  * Create an external-format (``xinpcb'') structure using the information in
3014  * the kernel-format in_pcb structure pointed to by inp.  This is done to
3015  * reduce the spew of irrelevant information over this interface, to isolate
3016  * user code from changes in the kernel structure, and potentially to provide
3017  * information-hiding if we decide that some of this information should be
3018  * hidden from users.
3019  */
3020 void
3021 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
3022 {
3023
3024         bzero(xi, sizeof(*xi));
3025         xi->xi_len = sizeof(struct xinpcb);
3026         if (inp->inp_socket)
3027                 sotoxsocket(inp->inp_socket, &xi->xi_socket);
3028         bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
3029         xi->inp_gencnt = inp->inp_gencnt;
3030         xi->inp_ppcb = (uintptr_t)inp->inp_ppcb;
3031         xi->inp_flow = inp->inp_flow;
3032         xi->inp_flowid = inp->inp_flowid;
3033         xi->inp_flowtype = inp->inp_flowtype;
3034         xi->inp_flags = inp->inp_flags;
3035         xi->inp_flags2 = inp->inp_flags2;
3036         xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket;
3037         xi->in6p_cksum = inp->in6p_cksum;
3038         xi->in6p_hops = inp->in6p_hops;
3039         xi->inp_ip_tos = inp->inp_ip_tos;
3040         xi->inp_vflag = inp->inp_vflag;
3041         xi->inp_ip_ttl = inp->inp_ip_ttl;
3042         xi->inp_ip_p = inp->inp_ip_p;
3043         xi->inp_ip_minttl = inp->inp_ip_minttl;
3044 }
3045
3046 #ifdef DDB
/*
 * Emit 'indent' spaces on the DDB console; nothing is printed when indent
 * is zero or negative.
 */
static void
db_print_indent(int indent)
{
	int remaining;

	for (remaining = indent; remaining > 0; remaining--)
		db_printf(" ");
}
3055
3056 static void
3057 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
3058 {
3059         char faddr_str[48], laddr_str[48];
3060
3061         db_print_indent(indent);
3062         db_printf("%s at %p\n", name, inc);
3063
3064         indent += 2;
3065
3066 #ifdef INET6
3067         if (inc->inc_flags & INC_ISIPV6) {
3068                 /* IPv6. */
3069                 ip6_sprintf(laddr_str, &inc->inc6_laddr);
3070                 ip6_sprintf(faddr_str, &inc->inc6_faddr);
3071         } else
3072 #endif
3073         {
3074                 /* IPv4. */
3075                 inet_ntoa_r(inc->inc_laddr, laddr_str);
3076                 inet_ntoa_r(inc->inc_faddr, faddr_str);
3077         }
3078         db_print_indent(indent);
3079         db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
3080             ntohs(inc->inc_lport));
3081         db_print_indent(indent);
3082         db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
3083             ntohs(inc->inc_fport));
3084 }
3085
3086 static void
3087 db_print_inpflags(int inp_flags)
3088 {
3089         int comma;
3090
3091         comma = 0;
3092         if (inp_flags & INP_RECVOPTS) {
3093                 db_printf("%sINP_RECVOPTS", comma ? ", " : "");
3094                 comma = 1;
3095         }
3096         if (inp_flags & INP_RECVRETOPTS) {
3097                 db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
3098                 comma = 1;
3099         }
3100         if (inp_flags & INP_RECVDSTADDR) {
3101                 db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
3102                 comma = 1;
3103         }
3104         if (inp_flags & INP_ORIGDSTADDR) {
3105                 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
3106                 comma = 1;
3107         }
3108         if (inp_flags & INP_HDRINCL) {
3109                 db_printf("%sINP_HDRINCL", comma ? ", " : "");
3110                 comma = 1;
3111         }
3112         if (inp_flags & INP_HIGHPORT) {
3113                 db_printf("%sINP_HIGHPORT", comma ? ", " : "");
3114                 comma = 1;
3115         }
3116         if (inp_flags & INP_LOWPORT) {
3117                 db_printf("%sINP_LOWPORT", comma ? ", " : "");
3118                 comma = 1;
3119         }
3120         if (inp_flags & INP_ANONPORT) {
3121                 db_printf("%sINP_ANONPORT", comma ? ", " : "");
3122                 comma = 1;
3123         }
3124         if (inp_flags & INP_RECVIF) {
3125                 db_printf("%sINP_RECVIF", comma ? ", " : "");
3126                 comma = 1;
3127         }
3128         if (inp_flags & INP_MTUDISC) {
3129                 db_printf("%sINP_MTUDISC", comma ? ", " : "");
3130                 comma = 1;
3131         }
3132         if (inp_flags & INP_RECVTTL) {
3133                 db_printf("%sINP_RECVTTL", comma ? ", " : "");
3134                 comma = 1;
3135         }
3136         if (inp_flags & INP_DONTFRAG) {
3137                 db_printf("%sINP_DONTFRAG", comma ? ", " : "");
3138                 comma = 1;
3139         }
3140         if (inp_flags & INP_RECVTOS) {
3141                 db_printf("%sINP_RECVTOS", comma ? ", " : "");
3142                 comma = 1;
3143         }
3144         if (inp_flags & IN6P_IPV6_V6ONLY) {
3145                 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
3146                 comma = 1;
3147         }
3148         if (inp_flags & IN6P_PKTINFO) {
3149                 db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
3150                 comma = 1;
3151         }
3152         if (inp_flags & IN6P_HOPLIMIT) {
3153                 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
3154                 comma = 1;
3155         }
3156         if (inp_flags & IN6P_HOPOPTS) {
3157                 db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
3158                 comma = 1;
3159         }
3160         if (inp_flags & IN6P_DSTOPTS) {
3161                 db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
3162                 comma = 1;
3163         }
3164         if (inp_flags & IN6P_RTHDR) {
3165                 db_printf("%sIN6P_RTHDR", comma ? ", " : "");
3166                 comma = 1;
3167         }
3168         if (inp_flags & IN6P_RTHDRDSTOPTS) {
3169                 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
3170                 comma = 1;
3171         }
3172         if (inp_flags & IN6P_TCLASS) {
3173                 db_printf("%sIN6P_TCLASS", comma ? ", " : "");
3174                 comma = 1;
3175         }
3176         if (inp_flags & IN6P_AUTOFLOWLABEL) {
3177                 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
3178                 comma = 1;
3179         }
3180         if (inp_flags & INP_TIMEWAIT) {
3181                 db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
3182                 comma  = 1;
3183         }
3184         if (inp_flags & INP_ONESBCAST) {
3185                 db_printf("%sINP_ONESBCAST", comma ? ", " : "");
3186                 comma  = 1;
3187         }
3188         if (inp_flags & INP_DROPPED) {
3189                 db_printf("%sINP_DROPPED", comma ? ", " : "");
3190                 comma  = 1;
3191         }
3192         if (inp_flags & INP_SOCKREF) {
3193                 db_printf("%sINP_SOCKREF", comma ? ", " : "");
3194                 comma  = 1;
3195         }
3196         if (inp_flags & IN6P_RFC2292) {
3197                 db_printf("%sIN6P_RFC2292", comma ? ", " : "");
3198                 comma = 1;
3199         }
3200         if (inp_flags & IN6P_MTU) {
3201                 db_printf("IN6P_MTU%s", comma ? ", " : "");
3202                 comma = 1;
3203         }
3204 }
3205
3206 static void
3207 db_print_inpvflag(u_char inp_vflag)
3208 {
3209         int comma;
3210
3211         comma = 0;
3212         if (inp_vflag & INP_IPV4) {
3213                 db_printf("%sINP_IPV4", comma ? ", " : "");
3214                 comma  = 1;
3215         }
3216         if (inp_vflag & INP_IPV6) {
3217                 db_printf("%sINP_IPV6", comma ? ", " : "");
3218                 comma  = 1;
3219         }
3220         if (inp_vflag & INP_IPV6PROTO) {
3221                 db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
3222                 comma  = 1;
3223         }
3224 }
3225
3226 static void
3227 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
3228 {
3229
3230         db_print_indent(indent);
3231         db_printf("%s at %p\n", name, inp);
3232
3233         indent += 2;
3234
3235         db_print_indent(indent);
3236         db_printf("inp_flow: 0x%x\n", inp->inp_flow);
3237
3238         db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
3239
3240         db_print_indent(indent);
3241         db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
3242             inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
3243
3244         db_print_indent(indent);
3245         db_printf("inp_label: %p   inp_flags: 0x%x (",
3246            inp->inp_label, inp->inp_flags);
3247         db_print_inpflags(inp->inp_flags);
3248         db_printf(")\n");
3249
3250         db_print_indent(indent);
3251         db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
3252             inp->inp_vflag);
3253         db_print_inpvflag(inp->inp_vflag);
3254         db_printf(")\n");
3255
3256         db_print_indent(indent);
3257         db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
3258             inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
3259
3260         db_print_indent(indent);
3261 #ifdef INET6
3262         if (inp->inp_vflag & INP_IPV6) {
3263                 db_printf("in6p_options: %p   in6p_outputopts: %p   "
3264                     "in6p_moptions: %p\n", inp->in6p_options,
3265                     inp->in6p_outputopts, inp->in6p_moptions);
3266                 db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
3267                     "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
3268                     inp->in6p_hops);
3269         } else
3270 #endif
3271         {
3272                 db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
3273                     "inp_ip_moptions: %p\n", inp->inp_ip_tos,
3274                     inp->inp_options, inp->inp_moptions);
3275         }
3276
3277         db_print_indent(indent);
3278         db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
3279             (uintmax_t)inp->inp_gencnt);
3280 }
3281
3282 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
3283 {
3284         struct inpcb *inp;
3285
3286         if (!have_addr) {
3287                 db_printf("usage: show inpcb <addr>\n");
3288                 return;
3289         }
3290         inp = (struct inpcb *)addr;
3291
3292         db_print_inpcb(inp, "inpcb", 0);
3293 }
3294 #endif /* DDB */
3295
3296 #ifdef RATELIMIT
3297 /*
3298  * Modify TX rate limit based on the existing "inp->inp_snd_tag",
3299  * if any.
3300  */
3301 int
3302 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3303 {
3304         union if_snd_tag_modify_params params = {
3305                 .rate_limit.max_rate = max_pacing_rate,
3306                 .rate_limit.flags = M_NOWAIT,
3307         };
3308         struct m_snd_tag *mst;
3309         struct ifnet *ifp;
3310         int error;
3311
3312         mst = inp->inp_snd_tag;
3313         if (mst == NULL)
3314                 return (EINVAL);
3315
3316         ifp = mst->ifp;
3317         if (ifp == NULL)
3318                 return (EINVAL);
3319
3320         if (ifp->if_snd_tag_modify == NULL) {
3321                 error = EOPNOTSUPP;
3322         } else {
3323                 error = ifp->if_snd_tag_modify(mst, &params);
3324         }
3325         return (error);
3326 }
3327
3328 /*
3329  * Query existing TX rate limit based on the existing
3330  * "inp->inp_snd_tag", if any.
3331  */
3332 int
3333 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3334 {
3335         union if_snd_tag_query_params params = { };
3336         struct m_snd_tag *mst;
3337         struct ifnet *ifp;
3338         int error;
3339
3340         mst = inp->inp_snd_tag;
3341         if (mst == NULL)
3342                 return (EINVAL);
3343
3344         ifp = mst->ifp;
3345         if (ifp == NULL)
3346                 return (EINVAL);
3347
3348         if (ifp->if_snd_tag_query == NULL) {
3349                 error = EOPNOTSUPP;
3350         } else {
3351                 error = ifp->if_snd_tag_query(mst, &params);
3352                 if (error == 0 &&  p_max_pacing_rate != NULL)
3353                         *p_max_pacing_rate = params.rate_limit.max_rate;
3354         }
3355         return (error);
3356 }
3357
3358 /*
3359  * Query existing TX queue level based on the existing
3360  * "inp->inp_snd_tag", if any.
3361  */
3362 int
3363 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
3364 {
3365         union if_snd_tag_query_params params = { };
3366         struct m_snd_tag *mst;
3367         struct ifnet *ifp;
3368         int error;
3369
3370         mst = inp->inp_snd_tag;
3371         if (mst == NULL)
3372                 return (EINVAL);
3373
3374         ifp = mst->ifp;
3375         if (ifp == NULL)
3376                 return (EINVAL);
3377
3378         if (ifp->if_snd_tag_query == NULL)
3379                 return (EOPNOTSUPP);
3380
3381         error = ifp->if_snd_tag_query(mst, &params);
3382         if (error == 0 &&  p_txqueue_level != NULL)
3383                 *p_txqueue_level = params.rate_limit.queue_level;
3384         return (error);
3385 }
3386
3387 /*
3388  * Allocate a new TX rate limit send tag from the network interface
3389  * given by the "ifp" argument and save it in "inp->inp_snd_tag":
3390  */
3391 int
3392 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
3393     uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
3394
3395 {
3396         union if_snd_tag_alloc_params params = {
3397                 .rate_limit.hdr.type = (max_pacing_rate == -1U) ?
3398                     IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
3399                 .rate_limit.hdr.flowid = flowid,
3400                 .rate_limit.hdr.flowtype = flowtype,
3401                 .rate_limit.hdr.numa_domain = inp->inp_numa_domain,
3402                 .rate_limit.max_rate = max_pacing_rate,
3403                 .rate_limit.flags = M_NOWAIT,
3404         };
3405         int error;
3406
3407         INP_WLOCK_ASSERT(inp);
3408
3409         /*
3410          * If there is already a send tag, or the INP is being torn
3411          * down, allocating a new send tag is not allowed. Else send
3412          * tags may leak.
3413          */
3414         if (*st != NULL || (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0)
3415                 return (EINVAL);
3416
3417         error = m_snd_tag_alloc(ifp, &params, st);
3418 #ifdef INET
3419         if (error == 0) {
3420                 counter_u64_add(rate_limit_set_ok, 1);
3421                 counter_u64_add(rate_limit_active, 1);
3422         } else if (error != EOPNOTSUPP)
3423                   counter_u64_add(rate_limit_alloc_fail, 1);
3424 #endif
3425         return (error);
3426 }
3427
/*
 * Drop one reference on the send tag "mst" and, when INET is
 * configured, decrement the count of active rate limits.
 */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{
	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}
3437
3438 /*
3439  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3440  * if any:
3441  */
3442 void
3443 in_pcbdetach_txrtlmt(struct inpcb *inp)
3444 {
3445         struct m_snd_tag *mst;
3446
3447         INP_WLOCK_ASSERT(inp);
3448
3449         mst = inp->inp_snd_tag;
3450         inp->inp_snd_tag = NULL;
3451
3452         if (mst == NULL)
3453                 return;
3454
3455         m_snd_tag_rele(mst);
3456 #ifdef INET
3457         counter_u64_add(rate_limit_active, -1);
3458 #endif
3459 }
3460
3461 int
3462 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
3463 {
3464         int error;
3465
3466         /*
3467          * If the existing send tag is for the wrong interface due to
3468          * a route change, first drop the existing tag.  Set the
3469          * CHANGED flag so that we will keep trying to allocate a new
3470          * tag if we fail to allocate one this time.
3471          */
3472         if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
3473                 in_pcbdetach_txrtlmt(inp);
3474                 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3475         }
3476
3477         /*
3478          * NOTE: When attaching to a network interface a reference is
3479          * made to ensure the network interface doesn't go away until
3480          * all ratelimit connections are gone. The network interface
3481          * pointers compared below represent valid network interfaces,
3482          * except when comparing towards NULL.
3483          */
3484         if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
3485                 error = 0;
3486         } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
3487                 if (inp->inp_snd_tag != NULL)
3488                         in_pcbdetach_txrtlmt(inp);
3489                 error = 0;
3490         } else if (inp->inp_snd_tag == NULL) {
3491                 /*
3492                  * In order to utilize packet pacing with RSS, we need
3493                  * to wait until there is a valid RSS hash before we
3494                  * can proceed:
3495                  */
3496                 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
3497                         error = EAGAIN;
3498                 } else {
3499                         error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
3500                             mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
3501                 }
3502         } else {
3503                 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
3504         }
3505         if (error == 0 || error == EOPNOTSUPP)
3506                 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
3507
3508         return (error);
3509 }
3510
3511 /*
3512  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
3513  * is set in the fast path and will attach/detach/modify the TX rate
3514  * limit send tag based on the socket's so_max_pacing_rate value.
3515  */
3516 void
3517 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3518 {
3519         struct socket *socket;
3520         uint32_t max_pacing_rate;
3521         bool did_upgrade;
3522         int error;
3523
3524         if (inp == NULL)
3525                 return;
3526
3527         socket = inp->inp_socket;
3528         if (socket == NULL)
3529                 return;
3530
3531         if (!INP_WLOCKED(inp)) {
3532                 /*
3533                  * NOTE: If the write locking fails, we need to bail
3534                  * out and use the non-ratelimited ring for the
3535                  * transmit until there is a new chance to get the
3536                  * write lock.
3537                  */
3538                 if (!INP_TRY_UPGRADE(inp))
3539                         return;
3540                 did_upgrade = 1;
3541         } else {
3542                 did_upgrade = 0;
3543         }
3544
3545         /*
3546          * NOTE: The so_max_pacing_rate value is read unlocked,
3547          * because atomic updates are not required since the variable
3548          * is checked at every mbuf we send. It is assumed that the
3549          * variable read itself will be atomic.
3550          */
3551         max_pacing_rate = socket->so_max_pacing_rate;
3552
3553         error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
3554
3555         if (did_upgrade)
3556                 INP_DOWNGRADE(inp);
3557 }
3558
3559 /*
3560  * Track route changes for TX rate limiting.
3561  */
3562 void
3563 in_pcboutput_eagain(struct inpcb *inp)
3564 {
3565         bool did_upgrade;
3566
3567         if (inp == NULL)
3568                 return;
3569
3570         if (inp->inp_snd_tag == NULL)
3571                 return;
3572
3573         if (!INP_WLOCKED(inp)) {
3574                 /*
3575                  * NOTE: If the write locking fails, we need to bail
3576                  * out and use the non-ratelimited ring for the
3577                  * transmit until there is a new chance to get the
3578                  * write lock.
3579                  */
3580                 if (!INP_TRY_UPGRADE(inp))
3581                         return;
3582                 did_upgrade = 1;
3583         } else {
3584                 did_upgrade = 0;
3585         }
3586
3587         /* detach rate limiting */
3588         in_pcbdetach_txrtlmt(inp);
3589
3590         /* make sure new mbuf send tag allocation is made */
3591         inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3592
3593         if (did_upgrade)
3594                 INP_DOWNGRADE(inp);
3595 }
3596
3597 #ifdef INET
3598 static void
3599 rl_init(void *st)
3600 {
3601         rate_limit_new = counter_u64_alloc(M_WAITOK);
3602         rate_limit_chg = counter_u64_alloc(M_WAITOK);
3603         rate_limit_active = counter_u64_alloc(M_WAITOK);
3604         rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
3605         rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
3606 }
3607
3608 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
3609 #endif
3610 #endif /* RATELIMIT */