]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/in_pcb.c
Load balance sockets with new SO_REUSEPORT_LB option
[FreeBSD/FreeBSD.git] / sys / netinet / in_pcb.c
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1991, 1993, 1995
5  *      The Regents of the University of California.
6  * Copyright (c) 2007-2009 Robert N. M. Watson
7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
8  * All rights reserved.
9  *
10  * Portions of this software were developed by Robert N. M. Watson under
11  * contract to Juniper Networks, Inc.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *      @(#)in_pcb.c    8.4 (Berkeley) 5/24/95
38  */
39
40 #include <sys/cdefs.h>
41 __FBSDID("$FreeBSD$");
42
43 #include "opt_ddb.h"
44 #include "opt_ipsec.h"
45 #include "opt_inet.h"
46 #include "opt_inet6.h"
47 #include "opt_ratelimit.h"
48 #include "opt_pcbgroup.h"
49 #include "opt_rss.h"
50
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/lock.h>
54 #include <sys/malloc.h>
55 #include <sys/mbuf.h>
56 #include <sys/callout.h>
57 #include <sys/eventhandler.h>
58 #include <sys/domain.h>
59 #include <sys/protosw.h>
60 #include <sys/rmlock.h>
61 #include <sys/smp.h>
62 #include <sys/socket.h>
63 #include <sys/socketvar.h>
64 #include <sys/sockio.h>
65 #include <sys/priv.h>
66 #include <sys/proc.h>
67 #include <sys/refcount.h>
68 #include <sys/jail.h>
69 #include <sys/kernel.h>
70 #include <sys/sysctl.h>
71
72 #ifdef DDB
73 #include <ddb/ddb.h>
74 #endif
75
76 #include <vm/uma.h>
77
78 #include <net/if.h>
79 #include <net/if_var.h>
80 #include <net/if_types.h>
81 #include <net/if_llatbl.h>
82 #include <net/route.h>
83 #include <net/rss_config.h>
84 #include <net/vnet.h>
85
86 #if defined(INET) || defined(INET6)
87 #include <netinet/in.h>
88 #include <netinet/in_pcb.h>
89 #include <netinet/ip_var.h>
90 #include <netinet/tcp_var.h>
91 #ifdef TCPHPTS
92 #include <netinet/tcp_hpts.h>
93 #endif
94 #include <netinet/udp.h>
95 #include <netinet/udp_var.h>
96 #endif
97 #ifdef INET
98 #include <netinet/in_var.h>
99 #endif
100 #ifdef INET6
101 #include <netinet/ip6.h>
102 #include <netinet6/in6_pcb.h>
103 #include <netinet6/in6_var.h>
104 #include <netinet6/ip6_var.h>
105 #endif /* INET6 */
106
107 #include <netipsec/ipsec_support.h>
108
109 #include <security/mac/mac_framework.h>
110
/*
 * Capacity bounds (number of il_inp[] slots) for a load-balance group.
 * New groups are created with INPCBLBGROUP_SIZMIN slots and are never
 * shrunk below that; a group that has reached INPCBLBGROUP_SIZMAX slots
 * is not grown any further.
 */
#define INPCBLBGROUP_SIZMIN     8
#define INPCBLBGROUP_SIZMAX     256

/* NOTE(review): presumably drives ipport_tick(); its setup is not visible here. */
static struct callout   ipport_tick_callout;

/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 */
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;    /* 1023 */
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;    /* 600 */
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;     /* 10000 */
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;       /* 65535 */
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;      /* 49152 */
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;        /* 65535 */

/*
 * Reserved ports accessible only to root. There are significant
 * security considerations that must be accounted for when changing these,
 * but the security benefits can be great. Please be careful.
 */
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;    /* 1023 */
VNET_DEFINE(int, ipport_reservedlow);

/* Variables dealing with random ephemeral port allocation. */
VNET_DEFINE(int, ipport_randomized) = 1;        /* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomcps) = 10;        /* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomtime) = 45;       /* user controlled via sysctl */
VNET_DEFINE(int, ipport_stoprandom);            /* toggled by ipport_tick */
/* Bumped by in_pcb_lport() on each automatic port allocation it counts. */
VNET_DEFINE(int, ipport_tcpallocs);
/* NOTE(review): looks like the previous ipport_tcpallocs sample; confirm. */
static VNET_DEFINE(int, ipport_tcplastcount);

#define V_ipport_tcplastcount           VNET(ipport_tcplastcount)
144
145 static void     in_pcbremlists(struct inpcb *inp);
146 #ifdef INET
147 static struct inpcb     *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
148                             struct in_addr faddr, u_int fport_arg,
149                             struct in_addr laddr, u_int lport_arg,
150                             int lookupflags, struct ifnet *ifp);
151
152 #define RANGECHK(var, min, max) \
153         if ((var) < (min)) { (var) = (min); } \
154         else if ((var) > (max)) { (var) = (max); }
155
156 static int
157 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
158 {
159         int error;
160
161         error = sysctl_handle_int(oidp, arg1, arg2, req);
162         if (error == 0) {
163                 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
164                 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
165                 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
166                 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
167                 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
168                 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
169         }
170         return (error);
171 }
172
173 #undef RANGECHK
174
175 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0,
176     "IP Ports");
177
178 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
179         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
180         &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", "");
181 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
182         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
183         &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", "");
184 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
185         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
186         &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", "");
187 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
188         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
189         &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", "");
190 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
191         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
192         &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", "");
193 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
194         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
195         &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", "");
196 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
197         CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
198         &VNET_NAME(ipport_reservedhigh), 0, "");
199 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
200         CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
201 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
202         CTLFLAG_VNET | CTLFLAG_RW,
203         &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
204 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
205         CTLFLAG_VNET | CTLFLAG_RW,
206         &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
207         "allocations before switching to a sequental one");
208 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
209         CTLFLAG_VNET | CTLFLAG_RW,
210         &VNET_NAME(ipport_randomtime), 0,
211         "Minimum time to keep sequental port "
212         "allocation before switching to a random one");
213 #endif /* INET */
214
215 /*
216  * in_pcb.c: manage the Protocol Control Blocks.
217  *
218  * NOTE: It is assumed that most of these functions will be called with
219  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
220  * functions often modify hash chains or addresses in pcbs.
221  */
222
223 static struct inpcblbgroup *
224 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
225     uint16_t port, const union in_dependaddr *addr, int size)
226 {
227         struct inpcblbgroup *grp;
228
229         size_t bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
230         grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
231         if(!grp)
232                 return NULL;
233         grp->il_vflag = vflag;
234         grp->il_lport = port;
235         grp->il_dependladdr = *addr;
236         grp->il_inpsiz = size;
237         LIST_INSERT_HEAD(hdr, grp, il_list);
238
239         return grp;
240 }
241
242 static void
243 in_pcblbgroup_free(struct inpcblbgroup *grp)
244 {
245         LIST_REMOVE(grp, il_list);
246         free(grp, M_TEMP);
247 }
248
249 static struct inpcblbgroup *
250 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
251     struct inpcblbgroup *old_grp, int size)
252 {
253         struct inpcblbgroup *grp;
254         int i;
255
256         grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
257             old_grp->il_lport, &old_grp->il_dependladdr, size);
258         if(!grp)
259                 return NULL;
260
261         KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
262             ("invalid new local group size %d and old local group count %d",
263              grp->il_inpsiz, old_grp->il_inpcnt));
264         for (i = 0; i < old_grp->il_inpcnt; ++i)
265                 grp->il_inp[i] = old_grp->il_inp[i];
266         grp->il_inpcnt = old_grp->il_inpcnt;
267
268         in_pcblbgroup_free(old_grp);
269
270         return grp;
271 }
272
273 /*
274  * Add PCB to lb group (load balance used by SO_REUSEPORT_LB)
275  */
276 static int
277 in_pcbinslbgrouphash(struct inpcb *inp, struct inpcbinfo *pcbinfo)
278 {
279         struct inpcblbgrouphead *hdr;
280         struct inpcblbgroup *grp;
281
282         uint16_t hashmask = pcbinfo->ipi_lbgrouphashmask;
283         uint16_t lport = inp->inp_lport;
284         uint32_t group_index = INP_PCBLBGROUP_PORTHASH(lport, hashmask);
285
286         hdr = &pcbinfo->ipi_lbgrouphashbase[group_index];
287
288         struct ucred *cred;
289
290         if (pcbinfo->ipi_lbgrouphashbase == NULL)
291                 return 0;
292
293         /*
294          * don't allow jailed socket to join local group
295          */
296         if (inp->inp_socket != NULL)
297                 cred = inp->inp_socket->so_cred;
298         else
299                 cred = NULL;
300         if (cred != NULL && jailed(cred))
301                 return 0;
302
303 #ifdef INET6
304         /*
305          * don't allow IPv4 mapped INET6 wild socket
306          */
307         if ((inp->inp_vflag & INP_IPV4) &&
308             inp->inp_laddr.s_addr == INADDR_ANY &&
309             INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
310                 return 0;
311         }
312 #endif
313
314         hdr = &pcbinfo->ipi_lbgrouphashbase[
315             INP_PCBLBGROUP_PORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
316
317         LIST_FOREACH(grp, hdr, il_list) {
318                 if (grp->il_vflag == inp->inp_vflag &&
319                     grp->il_lport == inp->inp_lport &&
320                     memcmp(&grp->il_dependladdr,
321                         &inp->inp_inc.inc_ie.ie_dependladdr,
322                         sizeof(grp->il_dependladdr)) == 0) {
323                         break;
324                 }
325         }
326         if (grp == NULL) {
327                 /* Create new load balance group */
328                 grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
329                     inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
330                     INPCBLBGROUP_SIZMIN);
331                 if(!grp)
332                         return (ENOBUFS);
333         } else if (grp->il_inpcnt == grp->il_inpsiz) {
334                 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
335                         static int limit_logged = 0;
336
337                         if (!limit_logged) {
338                                 limit_logged = 1;
339                                 printf("lb group port %d, "
340                                            "limit reached\n", ntohs(grp->il_lport));
341                         }
342                         return 0;
343                 }
344
345                 /* Expand this local group */
346                 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
347                 if(!grp)
348                         return (ENOBUFS);
349         }
350
351         KASSERT(grp->il_inpcnt < grp->il_inpsiz,
352                         ("invalid local group size %d and count %d",
353                          grp->il_inpsiz, grp->il_inpcnt));
354
355         grp->il_inp[grp->il_inpcnt] = inp;
356         grp->il_inpcnt++;
357         return 0;
358 }
359
360 static void
361 in_pcbremlbgrouphash(struct inpcb *inp, struct inpcbinfo *pcbinfo)
362 {
363         struct inpcblbgrouphead *hdr;
364         struct inpcblbgroup *grp;
365
366         if (pcbinfo->ipi_lbgrouphashbase == NULL)
367                 return;
368
369         hdr = &pcbinfo->ipi_lbgrouphashbase[
370             INP_PCBLBGROUP_PORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
371
372         LIST_FOREACH(grp, hdr, il_list) {
373                 int i;
374
375                 for (i = 0; i < grp->il_inpcnt; ++i) {
376                         if (grp->il_inp[i] != inp)
377                                 continue;
378
379                         if (grp->il_inpcnt == 1) {
380                                 /* Free this local group */
381                                 in_pcblbgroup_free(grp);
382                         } else {
383                                 /* Pull up inpcbs */
384                                 for (; i + 1 < grp->il_inpcnt; ++i)
385                                         grp->il_inp[i] = grp->il_inp[i + 1];
386                                 grp->il_inpcnt--;
387
388                                 if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
389                                     grp->il_inpcnt <= (grp->il_inpsiz / 4)) {
390                                         /* Shrink this local group */
391                                         struct inpcblbgroup *new_grp =
392                                                 in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
393                                         if(new_grp)
394                                                 grp = new_grp;
395                                 }
396                         }
397                         return;
398                 }
399         }
400 }
401
/*
 * Zone "fini" hook handed to uma_zcreate() by in_pcbinfo_init().
 *
 * Protocols set up their inpcbs differently (each gives the lock its own
 * name), but teardown is the same for all of them: destroy the inpcb lock.
 * The 'size' argument is not used.
 */
static void
inpcb_fini(void *mem, int size)
{
        struct inpcb *pcb;

        pcb = mem;
        INP_LOCK_DESTROY(pcb);
}
413
414 /*
415  * Initialize an inpcbinfo -- we should be able to reduce the number of
416  * arguments in time.
417  */
418 void
419 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
420     struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
421     char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields)
422 {
423
424         INP_INFO_LOCK_INIT(pcbinfo, name);
425         INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash");     /* XXXRW: argument? */
426         INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
427 #ifdef VIMAGE
428         pcbinfo->ipi_vnet = curvnet;
429 #endif
430         pcbinfo->ipi_listhead = listhead;
431         LIST_INIT(pcbinfo->ipi_listhead);
432         pcbinfo->ipi_count = 0;
433         pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
434             &pcbinfo->ipi_hashmask);
435         pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
436             &pcbinfo->ipi_porthashmask);
437         pcbinfo->ipi_lbgrouphashbase = hashinit(hash_nelements, M_PCB,
438             &pcbinfo->ipi_lbgrouphashmask);
439 #ifdef PCBGROUP
440         in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
441 #endif
442         pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
443             NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0);
444         uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
445         uma_zone_set_warning(pcbinfo->ipi_zone,
446             "kern.ipc.maxsockets limit reached");
447 }
448
449 /*
450  * Destroy an inpcbinfo.
451  */
452 void
453 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
454 {
455
456         KASSERT(pcbinfo->ipi_count == 0,
457             ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
458
459         hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
460         hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
461             pcbinfo->ipi_porthashmask);
462         hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
463             pcbinfo->ipi_lbgrouphashmask);
464 #ifdef PCBGROUP
465         in_pcbgroup_destroy(pcbinfo);
466 #endif
467         uma_zdestroy(pcbinfo->ipi_zone);
468         INP_LIST_LOCK_DESTROY(pcbinfo);
469         INP_HASH_LOCK_DESTROY(pcbinfo);
470         INP_INFO_LOCK_DESTROY(pcbinfo);
471 }
472
473 /*
474  * Allocate a PCB and associate it with the socket.
475  * On success return with the PCB locked.
476  */
477 int
478 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
479 {
480         struct inpcb *inp;
481         int error;
482
483 #ifdef INVARIANTS
484         if (pcbinfo == &V_tcbinfo) {
485                 INP_INFO_RLOCK_ASSERT(pcbinfo);
486         } else {
487                 INP_INFO_WLOCK_ASSERT(pcbinfo);
488         }
489 #endif
490
491         error = 0;
492         inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
493         if (inp == NULL)
494                 return (ENOBUFS);
495         bzero(&inp->inp_start_zero, inp_zero_size);
496         inp->inp_pcbinfo = pcbinfo;
497         inp->inp_socket = so;
498         inp->inp_cred = crhold(so->so_cred);
499         inp->inp_inc.inc_fibnum = so->so_fibnum;
500 #ifdef MAC
501         error = mac_inpcb_init(inp, M_NOWAIT);
502         if (error != 0)
503                 goto out;
504         mac_inpcb_create(so, inp);
505 #endif
506 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
507         error = ipsec_init_pcbpolicy(inp);
508         if (error != 0) {
509 #ifdef MAC
510                 mac_inpcb_destroy(inp);
511 #endif
512                 goto out;
513         }
514 #endif /*IPSEC*/
515 #ifdef INET6
516         if (INP_SOCKAF(so) == AF_INET6) {
517                 inp->inp_vflag |= INP_IPV6PROTO;
518                 if (V_ip6_v6only)
519                         inp->inp_flags |= IN6P_IPV6_V6ONLY;
520         }
521 #endif
522         INP_WLOCK(inp);
523         INP_LIST_WLOCK(pcbinfo);
524         LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
525         pcbinfo->ipi_count++;
526         so->so_pcb = (caddr_t)inp;
527 #ifdef INET6
528         if (V_ip6_auto_flowlabel)
529                 inp->inp_flags |= IN6P_AUTOFLOWLABEL;
530 #endif
531         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
532         refcount_init(&inp->inp_refcount, 1);   /* Reference from inpcbinfo */
533
534         /*
535          * Routes in inpcb's can cache L2 as well; they are guaranteed
536          * to be cleaned up.
537          */
538         inp->inp_route.ro_flags = RT_LLE_CACHE;
539         INP_LIST_WUNLOCK(pcbinfo);
540 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
541 out:
542         if (error != 0) {
543                 crfree(inp->inp_cred);
544                 uma_zfree(pcbinfo->ipi_zone, inp);
545         }
546 #endif
547         return (error);
548 }
549
550 #ifdef INET
551 int
552 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
553 {
554         int anonport, error;
555
556         INP_WLOCK_ASSERT(inp);
557         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
558
559         if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
560                 return (EINVAL);
561         anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0;
562         error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
563             &inp->inp_lport, cred);
564         if (error)
565                 return (error);
566         if (in_pcbinshash(inp) != 0) {
567                 inp->inp_laddr.s_addr = INADDR_ANY;
568                 inp->inp_lport = 0;
569                 return (EAGAIN);
570         }
571         if (anonport)
572                 inp->inp_flags |= INP_ANONPORT;
573         return (0);
574 }
575 #endif
576
577 /*
578  * Select a local port (number) to use.
579  */
580 #if defined(INET) || defined(INET6)
581 int
582 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
583     struct ucred *cred, int lookupflags)
584 {
585         struct inpcbinfo *pcbinfo;
586         struct inpcb *tmpinp;
587         unsigned short *lastport;
588         int count, dorandom, error;
589         u_short aux, first, last, lport;
590 #ifdef INET
591         struct in_addr laddr;
592 #endif
593
594         pcbinfo = inp->inp_pcbinfo;
595
596         /*
597          * Because no actual state changes occur here, a global write lock on
598          * the pcbinfo isn't required.
599          */
600         INP_LOCK_ASSERT(inp);
601         INP_HASH_LOCK_ASSERT(pcbinfo);
602
603         if (inp->inp_flags & INP_HIGHPORT) {
604                 first = V_ipport_hifirstauto;   /* sysctl */
605                 last  = V_ipport_hilastauto;
606                 lastport = &pcbinfo->ipi_lasthi;
607         } else if (inp->inp_flags & INP_LOWPORT) {
608                 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
609                 if (error)
610                         return (error);
611                 first = V_ipport_lowfirstauto;  /* 1023 */
612                 last  = V_ipport_lowlastauto;   /* 600 */
613                 lastport = &pcbinfo->ipi_lastlow;
614         } else {
615                 first = V_ipport_firstauto;     /* sysctl */
616                 last  = V_ipport_lastauto;
617                 lastport = &pcbinfo->ipi_lastport;
618         }
619         /*
620          * For UDP(-Lite), use random port allocation as long as the user
621          * allows it.  For TCP (and as of yet unknown) connections,
622          * use random port allocation only if the user allows it AND
623          * ipport_tick() allows it.
624          */
625         if (V_ipport_randomized &&
626                 (!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
627                 pcbinfo == &V_ulitecbinfo))
628                 dorandom = 1;
629         else
630                 dorandom = 0;
631         /*
632          * It makes no sense to do random port allocation if
633          * we have the only port available.
634          */
635         if (first == last)
636                 dorandom = 0;
637         /* Make sure to not include UDP(-Lite) packets in the count. */
638         if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo)
639                 V_ipport_tcpallocs++;
640         /*
641          * Instead of having two loops further down counting up or down
642          * make sure that first is always <= last and go with only one
643          * code path implementing all logic.
644          */
645         if (first > last) {
646                 aux = first;
647                 first = last;
648                 last = aux;
649         }
650
651 #ifdef INET
652         /* Make the compiler happy. */
653         laddr.s_addr = 0;
654         if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
655                 KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p",
656                     __func__, inp));
657                 laddr = *laddrp;
658         }
659 #endif
660         tmpinp = NULL;  /* Make compiler happy. */
661         lport = *lportp;
662
663         if (dorandom)
664                 *lastport = first + (arc4random() % (last - first));
665
666         count = last - first;
667
668         do {
669                 if (count-- < 0)        /* completely used? */
670                         return (EADDRNOTAVAIL);
671                 ++*lastport;
672                 if (*lastport < first || *lastport > last)
673                         *lastport = first;
674                 lport = htons(*lastport);
675
676 #ifdef INET6
677                 if ((inp->inp_vflag & INP_IPV6) != 0)
678                         tmpinp = in6_pcblookup_local(pcbinfo,
679                             &inp->in6p_laddr, lport, lookupflags, cred);
680 #endif
681 #if defined(INET) && defined(INET6)
682                 else
683 #endif
684 #ifdef INET
685                         tmpinp = in_pcblookup_local(pcbinfo, laddr,
686                             lport, lookupflags, cred);
687 #endif
688         } while (tmpinp != NULL);
689
690 #ifdef INET
691         if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4)
692                 laddrp->s_addr = laddr.s_addr;
693 #endif
694         *lportp = lport;
695
696         return (0);
697 }
698
699 /*
700  * Return cached socket options.
701  */
702 int
703 inp_so_options(const struct inpcb *inp)
704 {
705         int so_options;
706
707         so_options = 0;
708
709         if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
710                 so_options |= SO_REUSEPORT_LB;
711         if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
712                 so_options |= SO_REUSEPORT;
713         if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
714                 so_options |= SO_REUSEADDR;
715         return (so_options);
716 }
717 #endif /* INET || INET6 */
718
719 /*
720  * Check if a new BINDMULTI socket is allowed to be created.
721  *
722  * ni points to the new inp.
723  * oi points to the exisitng inp.
724  *
725  * This checks whether the existing inp also has BINDMULTI and
726  * whether the credentials match.
727  */
728 int
729 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
730 {
731         /* Check permissions match */
732         if ((ni->inp_flags2 & INP_BINDMULTI) &&
733             (ni->inp_cred->cr_uid !=
734             oi->inp_cred->cr_uid))
735                 return (0);
736
737         /* Check the existing inp has BINDMULTI set */
738         if ((ni->inp_flags2 & INP_BINDMULTI) &&
739             ((oi->inp_flags2 & INP_BINDMULTI) == 0))
740                 return (0);
741
742         /*
743          * We're okay - either INP_BINDMULTI isn't set on ni, or
744          * it is and it matches the checks.
745          */
746         return (1);
747 }
748
749 #ifdef INET
750 /*
751  * Set up a bind operation on a PCB, performing port allocation
752  * as required, but do not actually modify the PCB. Callers can
753  * either complete the bind by setting inp_laddr/inp_lport and
754  * calling in_pcbinshash(), or they can just use the resulting
755  * port and address to authorise the sending of a once-off packet.
756  *
757  * On error, the values of *laddrp and *lportp are not changed.
758  */
759 int
760 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
761     u_short *lportp, struct ucred *cred)
762 {
763         struct socket *so = inp->inp_socket;
764         struct sockaddr_in *sin;
765         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
766         struct in_addr laddr;
767         u_short lport = 0;
768         int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
769         int error;
770
771         /*
772          * XXX Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
773          * so that we don't have to add to the (already messy) code below
774          */
775         int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
776
777         /*
778          * No state changes, so read locks are sufficient here.
779          */
780         INP_LOCK_ASSERT(inp);
781         INP_HASH_LOCK_ASSERT(pcbinfo);
782
783         if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
784                 return (EADDRNOTAVAIL);
785         laddr.s_addr = *laddrp;
786         if (nam != NULL && laddr.s_addr != INADDR_ANY)
787                 return (EINVAL);
788         if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
789                 lookupflags = INPLOOKUP_WILDCARD;
790         if (nam == NULL) {
791                 if ((error = prison_local_ip4(cred, &laddr)) != 0)
792                         return (error);
793         } else {
794                 sin = (struct sockaddr_in *)nam;
795                 if (nam->sa_len != sizeof (*sin))
796                         return (EINVAL);
797 #ifdef notdef
798                 /*
799                  * We should check the family, but old programs
800                  * incorrectly fail to initialize it.
801                  */
802                 if (sin->sin_family != AF_INET)
803                         return (EAFNOSUPPORT);
804 #endif
805                 error = prison_local_ip4(cred, &sin->sin_addr);
806                 if (error)
807                         return (error);
808                 if (sin->sin_port != *lportp) {
809                         /* Don't allow the port to change. */
810                         if (*lportp != 0)
811                                 return (EINVAL);
812                         lport = sin->sin_port;
813                 }
814                 /* NB: lport is left as 0 if the port isn't being changed. */
815                 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
816                         /*
817                          * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
818                          * allow complete duplication of binding if
819                          * SO_REUSEPORT is set, or if SO_REUSEADDR is set
820                          * and a multicast address is bound on both
821                          * new and duplicated sockets.
822                          */
823                         if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
824                                 reuseport = SO_REUSEADDR|SO_REUSEPORT;
825                         // XXX: How to deal with SO_REUSEPORT_LB here?
826                         // Added equivalent treatment as SO_REUSEPORT here for now
827                         if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
828                                 reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
829                 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
830                         sin->sin_port = 0;              /* yech... */
831                         bzero(&sin->sin_zero, sizeof(sin->sin_zero));
832                         /*
833                          * Is the address a local IP address?
834                          * If INP_BINDANY is set, then the socket may be bound
835                          * to any endpoint address, local or not.
836                          */
837                         if ((inp->inp_flags & INP_BINDANY) == 0 &&
838                             ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
839                                 return (EADDRNOTAVAIL);
840                 }
841                 laddr = sin->sin_addr;
842                 if (lport) {
843                         struct inpcb *t;
844                         struct tcptw *tw;
845
846                         /* GROSS */
847                         if (ntohs(lport) <= V_ipport_reservedhigh &&
848                             ntohs(lport) >= V_ipport_reservedlow &&
849                             priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
850                             0))
851                                 return (EACCES);
852                         if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
853                             priv_check_cred(inp->inp_cred,
854                             PRIV_NETINET_REUSEPORT, 0) != 0) {
855                                 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
856                                     lport, INPLOOKUP_WILDCARD, cred);
857         /*
858          * XXX
859          * This entire block sorely needs a rewrite.
860          */
861                                 if (t &&
862                                     ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
863                                     ((t->inp_flags & INP_TIMEWAIT) == 0) &&
864                                     (so->so_type != SOCK_STREAM ||
865                                      ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
866                                     (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
867                                      ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
868                                      (t->inp_flags2 & INP_REUSEPORT) ||
869                                      (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
870                                     (inp->inp_cred->cr_uid !=
871                                      t->inp_cred->cr_uid))
872                                         return (EADDRINUSE);
873
874                                 /*
875                                  * If the socket is a BINDMULTI socket, then
876                                  * the credentials need to match and the
877                                  * original socket also has to have been bound
878                                  * with BINDMULTI.
879                                  */
880                                 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
881                                         return (EADDRINUSE);
882                         }
883                         t = in_pcblookup_local(pcbinfo, sin->sin_addr,
884                             lport, lookupflags, cred);
885                         if (t && (t->inp_flags & INP_TIMEWAIT)) {
886                                 /*
887                                  * XXXRW: If an incpb has had its timewait
888                                  * state recycled, we treat the address as
889                                  * being in use (for now).  This is better
890                                  * than a panic, but not desirable.
891                                  */
892                                 tw = intotw(t);
893                                 if (tw == NULL ||
894                                     ((reuseport & tw->tw_so_options) == 0 &&
895                                         (reuseport_lb & tw->tw_so_options) == 0)) {
896                                         return (EADDRINUSE);
897                                 }
898                         } else if (t &&
899                                    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
900                                    (reuseport & inp_so_options(t)) == 0 &&
901                                    (reuseport_lb & inp_so_options(t)) == 0) {
902 #ifdef INET6
903                                 if (ntohl(sin->sin_addr.s_addr) !=
904                                     INADDR_ANY ||
905                                     ntohl(t->inp_laddr.s_addr) !=
906                                     INADDR_ANY ||
907                                     (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
908                                     (t->inp_vflag & INP_IPV6PROTO) == 0)
909 #endif
910                                                 return (EADDRINUSE);
911                                 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
912                                         return (EADDRINUSE);
913                         }
914                 }
915         }
916         if (*lportp != 0)
917                 lport = *lportp;
918         if (lport == 0) {
919                 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
920                 if (error != 0)
921                         return (error);
922
923         }
924         *laddrp = laddr.s_addr;
925         *lportp = lport;
926         return (0);
927 }
928
929 /*
930  * Connect from a socket to a specified address.
931  * Both address and port must be specified in argument nam.
932  * If we don't have a local address for this socket yet,
933  * then pick one.
     *
     * The mbuf m is only handed on to in_pcbrehash_mbuf().  Returns 0 on
     * success, the error from in_pcbconnect_setup(), or EAGAIN when the
     * initial in_pcbinshash() fails.
934  */
935 int
936 in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
937     struct ucred *cred, struct mbuf *m)
938 {
939         u_short lport, fport;
940         in_addr_t laddr, faddr;
941         int anonport, error;
942
943         INP_WLOCK_ASSERT(inp);
944         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
945
946         lport = inp->inp_lport;
947         laddr = inp->inp_laddr.s_addr;
            /* Note whether the PCB had no local port before this connect. */
948         anonport = (lport == 0);
            /* oinpp is NULL: a conflicting PCB, if any, is not reported. */
949         error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
950             NULL, cred);
951         if (error)
952                 return (error);
953
954         /* Do the initial binding of the local address if required. */
955         if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
956                 inp->inp_lport = lport;
957                 inp->inp_laddr.s_addr = laddr;
958                 if (in_pcbinshash(inp) != 0) {
                            /* Undo the tentative binding made just above. */
959                         inp->inp_laddr.s_addr = INADDR_ANY;
960                         inp->inp_lport = 0;
961                         return (EAGAIN);
962                 }
963         }
964
965         /* Commit the remaining changes. */
966         inp->inp_lport = lport;
967         inp->inp_laddr.s_addr = laddr;
968         inp->inp_faddr.s_addr = faddr;
969         inp->inp_fport = fport;
970         in_pcbrehash_mbuf(inp, m);
971
972         if (anonport)
973                 inp->inp_flags |= INP_ANONPORT;
974         return (0);
975 }
976
977 int
978 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
979 {
980
            /* Same as in_pcbconnect_mbuf(), called with a NULL mbuf. */
981         return (in_pcbconnect_mbuf(inp, nam, cred, NULL));
982 }
983
984 /*
985  * Do proper source address selection on an unbound socket in case
986  * of connect. Take jails into account as well.
     *
     * faddr is the destination; the chosen source address is stored in
     * *laddr, which must not be NULL.  Returns 0 on success, ENETUNREACH
     * when no suitable interface address is found, or the result of
     * prison_get_ip4() when falling back to the jail's default address.
987  */
988 int
989 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
990     struct ucred *cred)
991 {
992         struct ifaddr *ifa;
993         struct sockaddr *sa;
994         struct sockaddr_in *sin;
995         struct route sro;
996         int error;
997
998         KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
999
1000         /*
1001          * Bypass source address selection and use the primary jail IP
1002          * if requested.
1003          */
1004         if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
1005                 return (0);
1006
1007         error = 0;
1008         bzero(&sro, sizeof(sro));
1009
             /* Build the route lookup key from the destination address. */
1010         sin = (struct sockaddr_in *)&sro.ro_dst;
1011         sin->sin_family = AF_INET;
1012         sin->sin_len = sizeof(struct sockaddr_in);
1013         sin->sin_addr.s_addr = faddr->s_addr;
1014
1015         /*
1016          * If route is known our src addr is taken from the i/f,
1017          * else punt.
1018          *
1019          * Find out route to destination.
1020          */
1021         if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
1022                 in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);
1023
1024         /*
1025          * If we found a route, use the address corresponding to
1026          * the outgoing interface.
1027          * 
1028          * Otherwise assume faddr is reachable on a directly connected
1029          * network and try to find a corresponding interface to take
1030          * the source address from.
1031          */
1032         if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
1033                 struct in_ifaddr *ia;
1034                 struct ifnet *ifp;
1035
1036                 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
1037                                         inp->inp_socket->so_fibnum));
1038                 if (ia == NULL)
1039                         ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
1040                                                 inp->inp_socket->so_fibnum));
1041                 if (ia == NULL) {
1042                         error = ENETUNREACH;
1043                         goto done;
1044                 }
1045
                     /* Not jailed: the address found above is used as is. */
1046                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1047                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1048                         ifa_free(&ia->ia_ifa);
1049                         goto done;
1050                 }
1051
                     /*
                      * Jailed: only the interface is kept from the lookup;
                      * the address itself is released and the interface's
                      * address list is searched for one the jail may use.
                      */
1052                 ifp = ia->ia_ifp;
1053                 ifa_free(&ia->ia_ifa);
1054                 ia = NULL;
1055                 IF_ADDR_RLOCK(ifp);
1056                 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1057
1058                         sa = ifa->ifa_addr;
1059                         if (sa->sa_family != AF_INET)
1060                                 continue;
1061                         sin = (struct sockaddr_in *)sa;
1062                         if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1063                                 ia = (struct in_ifaddr *)ifa;
1064                                 break;
1065                         }
1066                 }
1067                 if (ia != NULL) {
1068                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1069                         IF_ADDR_RUNLOCK(ifp);
1070                         goto done;
1071                 }
1072                 IF_ADDR_RUNLOCK(ifp);
1073
1074                 /* 3. As a last resort return the 'default' jail address. */
1075                 error = prison_get_ip4(cred, laddr);
1076                 goto done;
1077         }
1078
1079         /*
1080          * If the outgoing interface on the route found is not
1081          * a loopback interface, use the address from that interface.
1082          * In case of jails do those three steps:
1083          * 1. check if the interface address belongs to the jail. If so use it.
1084          * 2. check if we have any address on the outgoing interface
1085          *    belonging to this jail. If so use it.
1086          * 3. as a last resort return the 'default' jail address.
1087          */
1088         if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
1089                 struct in_ifaddr *ia;
1090                 struct ifnet *ifp;
1091
1092                 /* If not jailed, use the default returned. */
1093                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1094                         ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
1095                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1096                         goto done;
1097                 }
1098
1099                 /* Jailed. */
1100                 /* 1. Check if the iface address belongs to the jail. */
1101                 sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
1102                 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1103                         ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
1104                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1105                         goto done;
1106                 }
1107
1108                 /*
1109                  * 2. Check if we have any address on the outgoing interface
1110                  *    belonging to this jail.
1111                  */
1112                 ia = NULL;
1113                 ifp = sro.ro_rt->rt_ifp;
1114                 IF_ADDR_RLOCK(ifp);
1115                 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1116                         sa = ifa->ifa_addr;
1117                         if (sa->sa_family != AF_INET)
1118                                 continue;
1119                         sin = (struct sockaddr_in *)sa;
1120                         if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1121                                 ia = (struct in_ifaddr *)ifa;
1122                                 break;
1123                         }
1124                 }
1125                 if (ia != NULL) {
1126                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1127                         IF_ADDR_RUNLOCK(ifp);
1128                         goto done;
1129                 }
1130                 IF_ADDR_RUNLOCK(ifp);
1131
1132                 /* 3. As a last resort return the 'default' jail address. */
1133                 error = prison_get_ip4(cred, laddr);
1134                 goto done;
1135         }
1136
1137         /*
1138          * The outgoing interface is marked with 'loopback net', so a route
1139          * to ourselves is here.
1140          * Try to find the interface of the destination address and then
1141          * take the address from there. That interface is not necessarily
1142          * a loopback interface.
1143          * In case of jails, check that it is an address of the jail
1144          * and if we cannot find, fall back to the 'default' jail address.
1145          */
             /*
              * Every path through the two blocks above ends in 'goto done',
              * so this test is always true when it is reached.
              */
1146         if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
1147                 struct sockaddr_in sain;
1148                 struct in_ifaddr *ia;
1149
1150                 bzero(&sain, sizeof(struct sockaddr_in));
1151                 sain.sin_family = AF_INET;
1152                 sain.sin_len = sizeof(struct sockaddr_in);
1153                 sain.sin_addr.s_addr = faddr->s_addr;
1154
                     /* Try, in order: destination match, network match, exact match. */
1155                 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain),
1156                                         inp->inp_socket->so_fibnum));
1157                 if (ia == NULL)
1158                         ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0,
1159                                                 inp->inp_socket->so_fibnum));
1160                 if (ia == NULL)
1161                         ia = ifatoia(ifa_ifwithaddr(sintosa(&sain)));
1162
1163                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1164                         if (ia == NULL) {
1165                                 error = ENETUNREACH;
1166                                 goto done;
1167                         }
1168                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1169                         ifa_free(&ia->ia_ifa);
1170                         goto done;
1171                 }
1172
1173                 /* Jailed. */
1174                 if (ia != NULL) {
1175                         struct ifnet *ifp;
1176
                             /* Keep only the interface; release the address. */
1177                         ifp = ia->ia_ifp;
1178                         ifa_free(&ia->ia_ifa);
1179                         ia = NULL;
1180                         IF_ADDR_RLOCK(ifp);
1181                         TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1182
1183                                 sa = ifa->ifa_addr;
1184                                 if (sa->sa_family != AF_INET)
1185                                         continue;
1186                                 sin = (struct sockaddr_in *)sa;
1187                                 if (prison_check_ip4(cred,
1188                                     &sin->sin_addr) == 0) {
1189                                         ia = (struct in_ifaddr *)ifa;
1190                                         break;
1191                                 }
1192                         }
1193                         if (ia != NULL) {
1194                                 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1195                                 IF_ADDR_RUNLOCK(ifp);
1196                                 goto done;
1197                         }
1198                         IF_ADDR_RUNLOCK(ifp);
1199                 }
1200
1201                 /* 3. As a last resort return the 'default' jail address. */
1202                 error = prison_get_ip4(cred, laddr);
1203                 goto done;
1204         }
1205
1206 done:
             /* Common exit: release the route looked up above, if there is one. */
1207         if (sro.ro_rt != NULL)
1208                 RTFREE(sro.ro_rt);
1209         return (error);
1210 }
1211
1212 /*
1213  * Set up for a connect from a socket to the specified address.
1214  * On entry, *laddrp and *lportp should contain the current local
1215  * address and port for the PCB; these are updated to the values
1216  * that should be placed in inp_laddr and inp_lport to complete
1217  * the connect.
1218  *
1219  * On success, *faddrp and *fportp will be set to the remote address
1220  * and port. These are not updated in the error case.
1221  *
1222  * If the operation fails because the connection already exists,
1223  * *oinpp will be set to the PCB of that connection so that the
1224  * caller can decide to override it. In all other cases, *oinpp
1225  * is set to NULL.
      *
      * oinpp itself may be NULL, in which case nothing is stored through it.
      *
      * Returns 0 on success; EINVAL for a wrongly sized sockaddr,
      * EAFNOSUPPORT for a non-AF_INET family, EADDRNOTAVAIL for a zero
      * destination port or an unusable multicast interface, EADDRINUSE
      * when the connection already exists, or an error passed up from
      * prison_get_ip4(), in_pcbladdr() or in_pcbbind_setup().
1226  */
1227 int
1228 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
1229     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
1230     struct inpcb **oinpp, struct ucred *cred)
1231 {
1232         struct rm_priotracker in_ifa_tracker;
1233         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
1234         struct in_ifaddr *ia;
1235         struct inpcb *oinp;
1236         struct in_addr laddr, faddr;
1237         u_short lport, fport;
1238         int error;
1239
1240         /*
1241          * Because a global state change doesn't actually occur here, a read
1242          * lock is sufficient.
1243          */
1244         INP_LOCK_ASSERT(inp);
1245         INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
1246
1247         if (oinpp != NULL)
1248                 *oinpp = NULL;
             /* The destination must be an AF_INET sockaddr_in with a port. */
1249         if (nam->sa_len != sizeof (*sin))
1250                 return (EINVAL);
1251         if (sin->sin_family != AF_INET)
1252                 return (EAFNOSUPPORT);
1253         if (sin->sin_port == 0)
1254                 return (EADDRNOTAVAIL);
             /* Work on local copies; the outputs are written only on success. */
1255         laddr.s_addr = *laddrp;
1256         lport = *lportp;
1257         faddr = sin->sin_addr;
1258         fport = sin->sin_port;
1259
1260         if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
1261                 /*
1262                  * If the destination address is INADDR_ANY,
1263                  * use the primary local address.
1264                  * If the supplied address is INADDR_BROADCAST,
1265                  * and the primary interface supports broadcast,
1266                  * choose the broadcast address for that interface.
1267                  */
1268                 if (faddr.s_addr == INADDR_ANY) {
1269                         IN_IFADDR_RLOCK(&in_ifa_tracker);
1270                         faddr =
1271                             IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
1272                         IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1273                         if (cred != NULL &&
1274                             (error = prison_get_ip4(cred, &faddr)) != 0)
1275                                 return (error);
1276                 } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
1277                         IN_IFADDR_RLOCK(&in_ifa_tracker);
1278                         if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
1279                             IFF_BROADCAST)
1280                                 faddr = satosin(&TAILQ_FIRST(
1281                                     &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
1282                         IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1283                 }
1284         }
             /* No local address yet: select a source address for faddr. */
1285         if (laddr.s_addr == INADDR_ANY) {
1286                 error = in_pcbladdr(inp, &faddr, &laddr, cred);
1287                 /*
1288                  * If the destination address is multicast and an outgoing
1289                  * interface has been set as a multicast option, prefer the
1290                  * address of that interface as our source address.
1291                  */
1292                 if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
1293                     inp->inp_moptions != NULL) {
1294                         struct ip_moptions *imo;
1295                         struct ifnet *ifp;
1296
1297                         imo = inp->inp_moptions;
1298                         if (imo->imo_multicast_ifp != NULL) {
1299                                 ifp = imo->imo_multicast_ifp;
1300                                 IN_IFADDR_RLOCK(&in_ifa_tracker);
1301                                 TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
1302                                         if ((ia->ia_ifp == ifp) &&
1303                                             (cred == NULL ||
1304                                             prison_check_ip4(cred,
1305                                             &ia->ia_addr.sin_addr) == 0))
1306                                                 break;
1307                                 }
                                     /*
                                      * Either way, this replaces the result
                                      * in_pcbladdr() produced above.
                                      */
1308                                 if (ia == NULL)
1309                                         error = EADDRNOTAVAIL;
1310                                 else {
1311                                         laddr = ia->ia_addr.sin_addr;
1312                                         error = 0;
1313                                 }
1314                                 IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1315                         }
1316                 }
1317                 if (error)
1318                         return (error);
1319         }
             /* Fail if a PCB for this exact address/port pairing already exists. */
1320         oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport,
1321             laddr, lport, 0, NULL);
1322         if (oinp != NULL) {
1323                 if (oinpp != NULL)
1324                         *oinpp = oinp;
1325                 return (EADDRINUSE);
1326         }
             /* No local port yet: have in_pcbbind_setup() pick one (nam is NULL). */
1327         if (lport == 0) {
1328                 error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
1329                     cred);
1330                 if (error)
1331                         return (error);
1332         }
1333         *laddrp = laddr.s_addr;
1334         *lportp = lport;
1335         *faddrp = faddr.s_addr;
1336         *fportp = fport;
1337         return (0);
1338 }
1339
1340 void
1341 in_pcbdisconnect(struct inpcb *inp)
1342 {
1343
1344         INP_WLOCK_ASSERT(inp);
1345         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
1346
             /*
              * Clear the foreign address and port, then rehash the PCB.
              * The local address and port are left untouched.
              */
1347         inp->inp_faddr.s_addr = INADDR_ANY;
1348         inp->inp_fport = 0;
1349         in_pcbrehash(inp);
1350 }
1351 #endif /* INET */
1352
1353 /*
1354  * in_pcbdetach() is responsible for disassociating a socket from an inpcb.
1355  * For most protocols, this will be invoked immediately prior to calling
1356  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
1357  * socket, in which case in_pcbfree() is deferred.
1358  */
1359 void
1360 in_pcbdetach(struct inpcb *inp)
1361 {
1362
1363         KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
1364
1365 #ifdef RATELIMIT
1366         if (inp->inp_snd_tag != NULL)
1367                 in_pcbdetach_txrtlmt(inp);
1368 #endif
             /*
              * Break the link in both directions.  so_pcb is cleared first
              * because it is reached through inp_socket.
              */
1369         inp->inp_socket->so_pcb = NULL;
1370         inp->inp_socket = NULL;
1371 }
1372
1373 /*
1374  * in_pcbref() bumps the reference count on an inpcb in order to maintain
1375  * stability of an inpcb pointer despite the inpcb lock being released.  This
1376  * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
1377  * but where the inpcb lock may already be held, or when acquiring a reference
1378  * via a pcbgroup.
1379  *
1380  * in_pcbref() should be used only to provide brief memory stability, and
1381  * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
1382  * garbage collect the inpcb if it has been in_pcbfree()'d from another
1383  * context.  Until in_pcbrele() has returned that the inpcb is still valid,
1384  * lock and rele are the *only* safe operations that may be performed on the
1385  * inpcb.
1386  *
1387  * While the inpcb will not be freed, releasing the inpcb lock means that the
1388  * connection's state may change, so the caller should be careful to
1389  * revalidate any cached state on reacquiring the lock.  Drop the reference
1390  * using in_pcbrele().
1391  */
1392 void
1393 in_pcbref(struct inpcb *inp)
1394 {
1395
             /* The count must already be non-zero before another is taken. */
1396         KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1397
1398         refcount_acquire(&inp->inp_refcount);
1399 }
1400
1401 /*
1402  * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
1403  * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
1404  * return a flag indicating whether or not the inpcb remains valid.  If it is
1405  * valid, we return with the inpcb lock held.
1406  *
1407  * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
1408  * reference on an inpcb.  Historically more work was done here (actually, in
1409  * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
1410  * need for the pcbinfo lock in in_pcbrele().  Deferring the free is entirely
1411  * about memory stability (and continued use of the write lock).
      *
      * This variant expects the inpcb to be read-locked.  It returns 0 when
      * the inpcb is still valid (lock still held) and 1 when the inpcb has
      * been marked INP_FREED or was released here (lock dropped).
1412  */
1413 int
1414 in_pcbrele_rlocked(struct inpcb *inp)
1415 {
1416         struct inpcbinfo *pcbinfo;
1417
1418         KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1419
1420         INP_RLOCK_ASSERT(inp);
1421
1422         if (refcount_release(&inp->inp_refcount) == 0) {
1423                 /*
1424                  * If the inpcb has been freed, let the caller know, even if
1425                  * this isn't the last reference.
1426                  */
1427                 if (inp->inp_flags2 & INP_FREED) {
1428                         INP_RUNLOCK(inp);
1429                         return (1);
1430                 }
1431                 return (0);
1432         }
1433         
             /* From here on the inpcb is being released for good. */
1434         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1435 #ifdef TCPHPTS
1436         if (inp->inp_in_hpts || inp->inp_in_input) {
1437                 struct tcp_hpts_entry *hpts;
1438                 /*
1439                  * We should not be on the hpts at 
1440                  * this point in any form. we must
1441                  * get the lock to be sure.
1442                  */
1443                 hpts = tcp_hpts_lock(inp);
1444                 if (inp->inp_in_hpts)
1445                         panic("Hpts:%p inp:%p at free still on hpts",
1446                               hpts, inp);
1447                 mtx_unlock(&hpts->p_mtx);
1448                 hpts = tcp_input_lock(inp);
1449                 if (inp->inp_in_input) 
1450                         panic("Hpts:%p inp:%p at free still on input hpts",
1451                               hpts, inp);
1452                 mtx_unlock(&hpts->p_mtx);
1453         }
1454 #endif
             /* Unlock, then hand the inpcb back to its pcbinfo's UMA zone. */
1455         INP_RUNLOCK(inp);
1456         pcbinfo = inp->inp_pcbinfo;
1457         uma_zfree(pcbinfo->ipi_zone, inp);
1458         return (1);
1459 }
1460
1461 int
1462 in_pcbrele_wlocked(struct inpcb *inp)
1463 {
1464         struct inpcbinfo *pcbinfo;
1465
             /*
              * Write-locked counterpart of in_pcbrele_rlocked(): drop one
              * reference.  Returns 0 with the write lock still held when
              * the inpcb remains valid, or 1 with the lock released when
              * the inpcb is marked INP_FREED or was released here.
              */
1466         KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1467
1468         INP_WLOCK_ASSERT(inp);
1469
1470         if (refcount_release(&inp->inp_refcount) == 0) {
1471                 /*
1472                  * If the inpcb has been freed, let the caller know, even if
1473                  * this isn't the last reference.
1474                  */
1475                 if (inp->inp_flags2 & INP_FREED) {
1476                         INP_WUNLOCK(inp);
1477                         return (1);
1478                 }
1479                 return (0);
1480         }
1481
             /* From here on the inpcb is being released for good. */
1482         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1483 #ifdef TCPHPTS
1484         if (inp->inp_in_hpts || inp->inp_in_input) {
1485                 struct tcp_hpts_entry *hpts;
1486                 /*
1487                  * We should not be on the hpts at 
1488                  * this point in any form. we must
1489                  * get the lock to be sure.
1490                  */
1491                 hpts = tcp_hpts_lock(inp);
1492                 if (inp->inp_in_hpts)
1493                         panic("Hpts:%p inp:%p at free still on hpts",
1494                               hpts, inp);
1495                 mtx_unlock(&hpts->p_mtx);
1496                 hpts = tcp_input_lock(inp);
1497                 if (inp->inp_in_input) 
1498                         panic("Hpts:%p inp:%p at free still on input hpts",
1499                               hpts, inp);
1500                 mtx_unlock(&hpts->p_mtx);
1501         }
1502 #endif
             /* Unlock, then hand the inpcb back to its pcbinfo's UMA zone. */
1503         INP_WUNLOCK(inp);
1504         pcbinfo = inp->inp_pcbinfo;
1505         uma_zfree(pcbinfo->ipi_zone, inp);
1506         return (1);
1507 }
1508
1509 /*
1510  * Temporary wrapper: forwards to in_pcbrele_wlocked() and returns its
1511  * result unchanged.
1511  */
1512 int
1513 in_pcbrele(struct inpcb *inp)
1514 {
1515
1516         return (in_pcbrele_wlocked(inp));
1517 }
1518
1519 /*
1520  * Unconditionally schedule an inpcb to be freed by decrementing its
1521  * reference count, which should occur only after the inpcb has been detached
1522  * from its socket.  If another thread holds a temporary reference (acquired
1523  * using in_pcbref()) then the free is deferred until that reference is
1524  * released using in_pcbrele(), but the inpcb is still unlocked.  Almost all
1525  * work, including removal from global lists, is done in this context, where
1526  * the pcbinfo lock is held.
      *
      * The inpcb must be write-locked on entry; it is unlocked on every
      * return path.
1527  */
1528 void
1529 in_pcbfree(struct inpcb *inp)
1530 {
1531         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1532
1533         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1534
1535         KASSERT((inp->inp_flags2 & INP_FREED) == 0,
1536             ("%s: called twice for pcb %p", __func__, inp));
             /*
              * Already marked INP_FREED: unlock and return instead of
              * tearing the inpcb down a second time.
              * NOTE(review): presumably this covers kernels where the
              * KASSERT above is compiled out; confirm.
              */
1537         if (inp->inp_flags2 & INP_FREED) {
1538                 INP_WUNLOCK(inp);
1539                 return;
1540         }
1541
1542 #ifdef INVARIANTS
1543         if (pcbinfo == &V_tcbinfo) {
1544                 INP_INFO_LOCK_ASSERT(pcbinfo);
1545         } else {
1546                 INP_INFO_WLOCK_ASSERT(pcbinfo);
1547         }
1548 #endif
1549         INP_WLOCK_ASSERT(inp);
1550
1551         /* XXXRW: Do as much as possible here. */
1552 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1553         if (inp->inp_sp != NULL)
1554                 ipsec_delete_pcbpolicy(inp);
1555 #endif
             /* Bump the generation count and unlink under the list lock. */
1556         INP_LIST_WLOCK(pcbinfo);
1557         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
1558         in_pcbremlists(inp);
1559         INP_LIST_WUNLOCK(pcbinfo);
1560 #ifdef INET6
1561         if (inp->inp_vflag & INP_IPV6PROTO) {
1562                 ip6_freepcbopts(inp->in6p_outputopts);
1563                 if (inp->in6p_moptions != NULL)
1564                         ip6_freemoptions(inp->in6p_moptions);
1565         }
1566 #endif
1567         if (inp->inp_options)
1568                 (void)m_free(inp->inp_options);
1569 #ifdef INET
1570         if (inp->inp_moptions != NULL)
1571                 inp_freemoptions(inp->inp_moptions);
1572 #endif
1573         RO_INVALIDATE_CACHE(&inp->inp_route);
1574
             /* Mark the inpcb freed before dropping this reference. */
1575         inp->inp_vflag = 0;
1576         inp->inp_flags2 |= INP_FREED;
1577         crfree(inp->inp_cred);
1578 #ifdef MAC
1579         mac_inpcb_destroy(inp);
1580 #endif
             /*
              * in_pcbrele_wlocked() drops the lock itself when it returns
              * non-zero; on a zero return the lock is still held and is
              * released here.
              */
1581         if (!in_pcbrele_wlocked(inp))
1582                 INP_WUNLOCK(inp);
1583 }
1584
1585 /*
1586  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
1587  * port reservation, and preventing it from being returned by inpcb lookups.
1588  *
1589  * It is used by TCP to mark an inpcb as unused and avoid future packet
1590  * delivery or event notification when a socket remains open but TCP has
1591  * closed.  This might occur as a result of a shutdown()-initiated TCP close
1592  * or a RST on the wire, and allows the port binding to be reused while still
1593  * maintaining the invariant that so_pcb always points to a valid inpcb until
1594  * in_pcbdetach().
1595  *
1596  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
1597  * in_pcbnotifyall() and in_pcbpurgeif0()?
1598  */
1599 void
1600 in_pcbdrop(struct inpcb *inp)
1601 {
1602
1603         INP_WLOCK_ASSERT(inp);
1604
1605         /*
1606          * XXXRW: Possibly we should protect the setting of INP_DROPPED with
1607          * the hash lock...?
1608          */
1609         inp->inp_flags |= INP_DROPPED;
1610         if (inp->inp_flags & INP_INHASHLIST) {
1611                 struct inpcbport *phd = inp->inp_phd;
1612
1613                 INP_HASH_WLOCK(inp->inp_pcbinfo);
1614                 in_pcbremlbgrouphash(inp, inp->inp_pcbinfo);
1615                 LIST_REMOVE(inp, inp_hash);
1616                 LIST_REMOVE(inp, inp_portlist);
1617                 if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1618                         LIST_REMOVE(phd, phd_hash);
1619                         free(phd, M_PCB);
1620                 }
1621                 INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1622                 inp->inp_flags &= ~INP_INHASHLIST;
1623 #ifdef PCBGROUP
1624                 in_pcbgroup_remove(inp);
1625 #endif
1626         }
1627 }
1628
1629 #ifdef INET
1630 /*
1631  * Common routines to return the socket addresses associated with inpcbs.
1632  */
1633 struct sockaddr *
1634 in_sockaddr(in_port_t port, struct in_addr *addr_p)
1635 {
1636         struct sockaddr_in *sin;
1637
1638         sin = malloc(sizeof *sin, M_SONAME,
1639                 M_WAITOK | M_ZERO);
1640         sin->sin_family = AF_INET;
1641         sin->sin_len = sizeof(*sin);
1642         sin->sin_addr = *addr_p;
1643         sin->sin_port = port;
1644
1645         return (struct sockaddr *)sin;
1646 }
1647
1648 int
1649 in_getsockaddr(struct socket *so, struct sockaddr **nam)
1650 {
1651         struct inpcb *inp;
1652         struct in_addr addr;
1653         in_port_t port;
1654
1655         inp = sotoinpcb(so);
1656         KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1657
1658         INP_RLOCK(inp);
1659         port = inp->inp_lport;
1660         addr = inp->inp_laddr;
1661         INP_RUNLOCK(inp);
1662
1663         *nam = in_sockaddr(port, &addr);
1664         return 0;
1665 }
1666
1667 int
1668 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
1669 {
1670         struct inpcb *inp;
1671         struct in_addr addr;
1672         in_port_t port;
1673
1674         inp = sotoinpcb(so);
1675         KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1676
1677         INP_RLOCK(inp);
1678         port = inp->inp_fport;
1679         addr = inp->inp_faddr;
1680         INP_RUNLOCK(inp);
1681
1682         *nam = in_sockaddr(port, &addr);
1683         return 0;
1684 }
1685
1686 void
1687 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
1688     struct inpcb *(*notify)(struct inpcb *, int))
1689 {
1690         struct inpcb *inp, *inp_temp;
1691
1692         INP_INFO_WLOCK(pcbinfo);
1693         LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
1694                 INP_WLOCK(inp);
1695 #ifdef INET6
1696                 if ((inp->inp_vflag & INP_IPV4) == 0) {
1697                         INP_WUNLOCK(inp);
1698                         continue;
1699                 }
1700 #endif
1701                 if (inp->inp_faddr.s_addr != faddr.s_addr ||
1702                     inp->inp_socket == NULL) {
1703                         INP_WUNLOCK(inp);
1704                         continue;
1705                 }
1706                 if ((*notify)(inp, errno))
1707                         INP_WUNLOCK(inp);
1708         }
1709         INP_INFO_WUNLOCK(pcbinfo);
1710 }
1711
1712 void
1713 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
1714 {
1715         struct inpcb *inp;
1716         struct ip_moptions *imo;
1717         int i, gap;
1718
1719         INP_INFO_WLOCK(pcbinfo);
1720         LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1721                 INP_WLOCK(inp);
1722                 imo = inp->inp_moptions;
1723                 if ((inp->inp_vflag & INP_IPV4) &&
1724                     imo != NULL) {
1725                         /*
1726                          * Unselect the outgoing interface if it is being
1727                          * detached.
1728                          */
1729                         if (imo->imo_multicast_ifp == ifp)
1730                                 imo->imo_multicast_ifp = NULL;
1731
1732                         /*
1733                          * Drop multicast group membership if we joined
1734                          * through the interface being detached.
1735                          */
1736                         for (i = 0, gap = 0; i < imo->imo_num_memberships;
1737                             i++) {
1738                                 if (imo->imo_membership[i]->inm_ifp == ifp) {
1739                                         in_delmulti(imo->imo_membership[i]);
1740                                         gap++;
1741                                 } else if (gap != 0)
1742                                         imo->imo_membership[i - gap] =
1743                                             imo->imo_membership[i];
1744                         }
1745                         imo->imo_num_memberships -= gap;
1746                 }
1747                 INP_WUNLOCK(inp);
1748         }
1749         INP_INFO_WUNLOCK(pcbinfo);
1750 }
1751
1752 /*
1753  * Lookup a PCB based on the local address and port.  Caller must hold the
1754  * hash lock.  No inpcb locks or references are acquired.
1755  */
1756 #define INP_LOOKUP_MAPPED_PCB_COST      3
1757 struct inpcb *
1758 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
1759     u_short lport, int lookupflags, struct ucred *cred)
1760 {
1761         struct inpcb *inp;
1762 #ifdef INET6
1763         int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
1764 #else
1765         int matchwild = 3;
1766 #endif
1767         int wildcard;
1768
1769         KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
1770             ("%s: invalid lookup flags %d", __func__, lookupflags));
1771
1772         INP_HASH_LOCK_ASSERT(pcbinfo);
1773
1774         if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
1775                 struct inpcbhead *head;
1776                 /*
1777                  * Look for an unconnected (wildcard foreign addr) PCB that
1778                  * matches the local address and port we're looking for.
1779                  */
1780                 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
1781                     0, pcbinfo->ipi_hashmask)];
1782                 LIST_FOREACH(inp, head, inp_hash) {
1783 #ifdef INET6
1784                         /* XXX inp locking */
1785                         if ((inp->inp_vflag & INP_IPV4) == 0)
1786                                 continue;
1787 #endif
1788                         if (inp->inp_faddr.s_addr == INADDR_ANY &&
1789                             inp->inp_laddr.s_addr == laddr.s_addr &&
1790                             inp->inp_lport == lport) {
1791                                 /*
1792                                  * Found?
1793                                  */
1794                                 if (cred == NULL ||
1795                                     prison_equal_ip4(cred->cr_prison,
1796                                         inp->inp_cred->cr_prison))
1797                                         return (inp);
1798                         }
1799                 }
1800                 /*
1801                  * Not found.
1802                  */
1803                 return (NULL);
1804         } else {
1805                 struct inpcbporthead *porthash;
1806                 struct inpcbport *phd;
1807                 struct inpcb *match = NULL;
1808                 /*
1809                  * Best fit PCB lookup.
1810                  *
1811                  * First see if this local port is in use by looking on the
1812                  * port hash list.
1813                  */
1814                 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
1815                     pcbinfo->ipi_porthashmask)];
1816                 LIST_FOREACH(phd, porthash, phd_hash) {
1817                         if (phd->phd_port == lport)
1818                                 break;
1819                 }
1820                 if (phd != NULL) {
1821                         /*
1822                          * Port is in use by one or more PCBs. Look for best
1823                          * fit.
1824                          */
1825                         LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
1826                                 wildcard = 0;
1827                                 if (cred != NULL &&
1828                                     !prison_equal_ip4(inp->inp_cred->cr_prison,
1829                                         cred->cr_prison))
1830                                         continue;
1831 #ifdef INET6
1832                                 /* XXX inp locking */
1833                                 if ((inp->inp_vflag & INP_IPV4) == 0)
1834                                         continue;
1835                                 /*
1836                                  * We never select the PCB that has
1837                                  * INP_IPV6 flag and is bound to :: if
1838                                  * we have another PCB which is bound
1839                                  * to 0.0.0.0.  If a PCB has the
1840                                  * INP_IPV6 flag, then we set its cost
1841                                  * higher than IPv4 only PCBs.
1842                                  *
1843                                  * Note that the case only happens
1844                                  * when a socket is bound to ::, under
1845                                  * the condition that the use of the
1846                                  * mapped address is allowed.
1847                                  */
1848                                 if ((inp->inp_vflag & INP_IPV6) != 0)
1849                                         wildcard += INP_LOOKUP_MAPPED_PCB_COST;
1850 #endif
1851                                 if (inp->inp_faddr.s_addr != INADDR_ANY)
1852                                         wildcard++;
1853                                 if (inp->inp_laddr.s_addr != INADDR_ANY) {
1854                                         if (laddr.s_addr == INADDR_ANY)
1855                                                 wildcard++;
1856                                         else if (inp->inp_laddr.s_addr != laddr.s_addr)
1857                                                 continue;
1858                                 } else {
1859                                         if (laddr.s_addr != INADDR_ANY)
1860                                                 wildcard++;
1861                                 }
1862                                 if (wildcard < matchwild) {
1863                                         match = inp;
1864                                         matchwild = wildcard;
1865                                         if (matchwild == 0)
1866                                                 break;
1867                                 }
1868                         }
1869                 }
1870                 return (match);
1871         }
1872 }
1873 #undef INP_LOOKUP_MAPPED_PCB_COST
1874
1875 struct inpcb *
1876 in_pcblookup_lbgroup_last(const struct inpcb *inp)
1877 {
1878         const struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1879         const struct inpcblbgrouphead *hdr;
1880         const struct inpcblbgroup *grp;
1881         int i;
1882
1883         if (pcbinfo->ipi_lbgrouphashbase == NULL)
1884                 return NULL;
1885
1886         hdr = &pcbinfo->ipi_lbgrouphashbase[
1887             INP_PCBLBGROUP_PORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
1888
1889         LIST_FOREACH(grp, hdr, il_list) {
1890                 if (grp->il_vflag == inp->inp_vflag &&
1891                     grp->il_lport == inp->inp_lport &&
1892                     memcmp(&grp->il_dependladdr,
1893                         &inp->inp_inc.inc_ie.ie_dependladdr,
1894                         sizeof(grp->il_dependladdr)) == 0) {
1895                         break;
1896                 }
1897         }
1898         if (grp == NULL || grp->il_inpcnt == 1)
1899                 return NULL;
1900
1901         KASSERT(grp->il_inpcnt >= 2,
1902             ("invalid lbgroup inp count %d", grp->il_inpcnt));
1903         for (i = 0; i < grp->il_inpcnt; ++i) {
1904                 if (grp->il_inp[i] == inp) {
1905                         int last = grp->il_inpcnt - 1;
1906
1907                         if (i == last)
1908                                 last = grp->il_inpcnt - 2;
1909                         return grp->il_inp[last];
1910                 }
1911         }
1912         return NULL;
1913 }
1914
1915 static struct inpcb *
1916 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
1917   const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
1918   uint16_t fport, int lookupflags)
1919 {
1920         struct inpcb *local_wild = NULL;
1921         const struct inpcblbgrouphead *hdr;
1922         struct inpcblbgroup *grp;
1923         struct inpcblbgroup *grp_local_wild;
1924
1925         hdr = &pcbinfo->ipi_lbgrouphashbase[
1926                   INP_PCBLBGROUP_PORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
1927
1928         /*
1929          * Order of socket selection:
1930          * 1. non-wild.
1931          * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
1932          *
1933          * NOTE:
1934          * - Load balanced group does not contain jailed sockets
1935          * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
1936          */
1937         LIST_FOREACH(grp, hdr, il_list) {
1938 #ifdef INET6
1939                 if (!(grp->il_vflag & INP_IPV4))
1940                         continue;
1941 #endif
1942
1943                 if (grp->il_lport == lport) {
1944
1945                         uint32_t idx = 0;
1946                         int pkt_hash = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport);
1947
1948                         idx = pkt_hash % grp->il_inpcnt;
1949
1950                         if (grp->il_laddr.s_addr == laddr->s_addr) {
1951                                 return grp->il_inp[idx];
1952                         } else {
1953                                 if (grp->il_laddr.s_addr == INADDR_ANY &&
1954                                         (lookupflags & INPLOOKUP_WILDCARD)) {
1955                                         local_wild = grp->il_inp[idx];
1956                                         grp_local_wild = grp;
1957                                 }
1958                         }
1959                 }
1960         }
1961         if (local_wild != NULL) {
1962                 return local_wild;
1963         }
1964         return NULL;
1965 }
1966
1967 #ifdef PCBGROUP
1968 /*
1969  * Lookup PCB in hash list, using pcbgroup tables.
1970  */
1971 static struct inpcb *
1972 in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
1973     struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
1974     u_int lport_arg, int lookupflags, struct ifnet *ifp)
1975 {
1976         struct inpcbhead *head;
1977         struct inpcb *inp, *tmpinp;
1978         u_short fport = fport_arg, lport = lport_arg;
1979         bool locked;
1980
1981         /*
1982          * First look for an exact match.
1983          */
1984         tmpinp = NULL;
1985         INP_GROUP_LOCK(pcbgroup);
1986         head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
1987             pcbgroup->ipg_hashmask)];
1988         LIST_FOREACH(inp, head, inp_pcbgrouphash) {
1989 #ifdef INET6
1990                 /* XXX inp locking */
1991                 if ((inp->inp_vflag & INP_IPV4) == 0)
1992                         continue;
1993 #endif
1994                 if (inp->inp_faddr.s_addr == faddr.s_addr &&
1995                     inp->inp_laddr.s_addr == laddr.s_addr &&
1996                     inp->inp_fport == fport &&
1997                     inp->inp_lport == lport) {
1998                         /*
1999                          * XXX We should be able to directly return
2000                          * the inp here, without any checks.
2001                          * Well unless both bound with SO_REUSEPORT?
2002                          */
2003                         if (prison_flag(inp->inp_cred, PR_IP4))
2004                                 goto found;
2005                         if (tmpinp == NULL)
2006                                 tmpinp = inp;
2007                 }
2008         }
2009         if (tmpinp != NULL) {
2010                 inp = tmpinp;
2011                 goto found;
2012         }
2013
2014 #ifdef  RSS
2015         /*
2016          * For incoming connections, we may wish to do a wildcard
2017          * match for an RSS-local socket.
2018          */
2019         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2020                 struct inpcb *local_wild = NULL, *local_exact = NULL;
2021 #ifdef INET6
2022                 struct inpcb *local_wild_mapped = NULL;
2023 #endif
2024                 struct inpcb *jail_wild = NULL;
2025                 struct inpcbhead *head;
2026                 int injail;
2027
2028                 /*
2029                  * Order of socket selection - we always prefer jails.
2030                  *      1. jailed, non-wild.
2031                  *      2. jailed, wild.
2032                  *      3. non-jailed, non-wild.
2033                  *      4. non-jailed, wild.
2034                  */
2035
2036                 head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
2037                     lport, 0, pcbgroup->ipg_hashmask)];
2038                 LIST_FOREACH(inp, head, inp_pcbgrouphash) {
2039 #ifdef INET6
2040                         /* XXX inp locking */
2041                         if ((inp->inp_vflag & INP_IPV4) == 0)
2042                                 continue;
2043 #endif
2044                         if (inp->inp_faddr.s_addr != INADDR_ANY ||
2045                             inp->inp_lport != lport)
2046                                 continue;
2047
2048                         injail = prison_flag(inp->inp_cred, PR_IP4);
2049                         if (injail) {
2050                                 if (prison_check_ip4(inp->inp_cred,
2051                                     &laddr) != 0)
2052                                         continue;
2053                         } else {
2054                                 if (local_exact != NULL)
2055                                         continue;
2056                         }
2057
2058                         if (inp->inp_laddr.s_addr == laddr.s_addr) {
2059                                 if (injail)
2060                                         goto found;
2061                                 else
2062                                         local_exact = inp;
2063                         } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
2064 #ifdef INET6
2065                                 /* XXX inp locking, NULL check */
2066                                 if (inp->inp_vflag & INP_IPV6PROTO)
2067                                         local_wild_mapped = inp;
2068                                 else
2069 #endif
2070                                         if (injail)
2071                                                 jail_wild = inp;
2072                                         else
2073                                                 local_wild = inp;
2074                         }
2075                 } /* LIST_FOREACH */
2076
2077                 inp = jail_wild;
2078                 if (inp == NULL)
2079                         inp = local_exact;
2080                 if (inp == NULL)
2081                         inp = local_wild;
2082 #ifdef INET6
2083                 if (inp == NULL)
2084                         inp = local_wild_mapped;
2085 #endif
2086                 if (inp != NULL)
2087                         goto found;
2088         }
2089 #endif
2090
2091         /*
2092          * Then look for a wildcard match, if requested.
2093          */
2094         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2095                 struct inpcb *local_wild = NULL, *local_exact = NULL;
2096 #ifdef INET6
2097                 struct inpcb *local_wild_mapped = NULL;
2098 #endif
2099                 struct inpcb *jail_wild = NULL;
2100                 struct inpcbhead *head;
2101                 int injail;
2102
2103                 /*
2104                  * Order of socket selection - we always prefer jails.
2105                  *      1. jailed, non-wild.
2106                  *      2. jailed, wild.
2107                  *      3. non-jailed, non-wild.
2108                  *      4. non-jailed, wild.
2109                  */
2110                 head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
2111                     0, pcbinfo->ipi_wildmask)];
2112                 LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
2113 #ifdef INET6
2114                         /* XXX inp locking */
2115                         if ((inp->inp_vflag & INP_IPV4) == 0)
2116                                 continue;
2117 #endif
2118                         if (inp->inp_faddr.s_addr != INADDR_ANY ||
2119                             inp->inp_lport != lport)
2120                                 continue;
2121
2122                         injail = prison_flag(inp->inp_cred, PR_IP4);
2123                         if (injail) {
2124                                 if (prison_check_ip4(inp->inp_cred,
2125                                     &laddr) != 0)
2126                                         continue;
2127                         } else {
2128                                 if (local_exact != NULL)
2129                                         continue;
2130                         }
2131
2132                         if (inp->inp_laddr.s_addr == laddr.s_addr) {
2133                                 if (injail)
2134                                         goto found;
2135                                 else
2136                                         local_exact = inp;
2137                         } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
2138 #ifdef INET6
2139                                 /* XXX inp locking, NULL check */
2140                                 if (inp->inp_vflag & INP_IPV6PROTO)
2141                                         local_wild_mapped = inp;
2142                                 else
2143 #endif
2144                                         if (injail)
2145                                                 jail_wild = inp;
2146                                         else
2147                                                 local_wild = inp;
2148                         }
2149                 } /* LIST_FOREACH */
2150                 inp = jail_wild;
2151                 if (inp == NULL)
2152                         inp = local_exact;
2153                 if (inp == NULL)
2154                         inp = local_wild;
2155 #ifdef INET6
2156                 if (inp == NULL)
2157                         inp = local_wild_mapped;
2158 #endif
2159                 if (inp != NULL)
2160                         goto found;
2161         } /* if (lookupflags & INPLOOKUP_WILDCARD) */
2162         INP_GROUP_UNLOCK(pcbgroup);
2163         return (NULL);
2164
2165 found:
2166         if (lookupflags & INPLOOKUP_WLOCKPCB)
2167                 locked = INP_TRY_WLOCK(inp);
2168         else if (lookupflags & INPLOOKUP_RLOCKPCB)
2169                 locked = INP_TRY_RLOCK(inp);
2170         else
2171                 panic("%s: locking bug", __func__);
2172         if (!locked)
2173                 in_pcbref(inp);
2174         INP_GROUP_UNLOCK(pcbgroup);
2175         if (!locked) {
2176                 if (lookupflags & INPLOOKUP_WLOCKPCB) {
2177                         INP_WLOCK(inp);
2178                         if (in_pcbrele_wlocked(inp))
2179                                 return (NULL);
2180                 } else {
2181                         INP_RLOCK(inp);
2182                         if (in_pcbrele_rlocked(inp))
2183                                 return (NULL);
2184                 }
2185         }
2186 #ifdef INVARIANTS
2187         if (lookupflags & INPLOOKUP_WLOCKPCB)
2188                 INP_WLOCK_ASSERT(inp);
2189         else
2190                 INP_RLOCK_ASSERT(inp);
2191 #endif
2192         return (inp);
2193 }
2194 #endif /* PCBGROUP */
2195
2196 /*
2197  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
2198  * that the caller has locked the hash list, and will not perform any further
2199  * locking or reference operations on either the hash list or the connection.
2200  */
2201 static struct inpcb *
2202 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2203     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
2204     struct ifnet *ifp)
2205 {
2206         struct inpcbhead *head;
2207         struct inpcb *inp, *tmpinp;
2208         u_short fport = fport_arg, lport = lport_arg;
2209
2210         KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
2211             ("%s: invalid lookup flags %d", __func__, lookupflags));
2212
2213         INP_HASH_LOCK_ASSERT(pcbinfo);
2214
2215         /*
2216          * First look for an exact match.
2217          */
2218         tmpinp = NULL;
2219         head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
2220             pcbinfo->ipi_hashmask)];
2221         LIST_FOREACH(inp, head, inp_hash) {
2222 #ifdef INET6
2223                 /* XXX inp locking */
2224                 if ((inp->inp_vflag & INP_IPV4) == 0)
2225                         continue;
2226 #endif
2227                 if (inp->inp_faddr.s_addr == faddr.s_addr &&
2228                     inp->inp_laddr.s_addr == laddr.s_addr &&
2229                     inp->inp_fport == fport &&
2230                     inp->inp_lport == lport) {
2231                         /*
2232                          * XXX We should be able to directly return
2233                          * the inp here, without any checks.
2234                          * Well unless both bound with SO_REUSEPORT?
2235                          */
2236                         if (prison_flag(inp->inp_cred, PR_IP4))
2237                                 return (inp);
2238                         if (tmpinp == NULL)
2239                                 tmpinp = inp;
2240                 }
2241         }
2242         if (tmpinp != NULL)
2243                 return (tmpinp);
2244
2245         /*
2246          * Then look in lb group (for wildcard match)
2247          */
2248         if (pcbinfo->ipi_lbgrouphashbase != NULL &&
2249                 (lookupflags & INPLOOKUP_WILDCARD)) {
2250                 inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, fport,
2251                                                                    lookupflags);
2252                 if (inp != NULL) {
2253                         return inp;
2254                 }
2255         }
2256
2257         /*
2258          * Then look for a wildcard match, if requested.
2259          */
2260         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2261                 struct inpcb *local_wild = NULL, *local_exact = NULL;
2262 #ifdef INET6
2263                 struct inpcb *local_wild_mapped = NULL;
2264 #endif
2265                 struct inpcb *jail_wild = NULL;
2266                 int injail;
2267
2268                 /*
2269                  * Order of socket selection - we always prefer jails.
2270                  *      1. jailed, non-wild.
2271                  *      2. jailed, wild.
2272                  *      3. non-jailed, non-wild.
2273                  *      4. non-jailed, wild.
2274                  */
2275
2276                 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
2277                     0, pcbinfo->ipi_hashmask)];
2278                 LIST_FOREACH(inp, head, inp_hash) {
2279 #ifdef INET6
2280                         /* XXX inp locking */
2281                         if ((inp->inp_vflag & INP_IPV4) == 0)
2282                                 continue;
2283 #endif
2284                         if (inp->inp_faddr.s_addr != INADDR_ANY ||
2285                             inp->inp_lport != lport)
2286                                 continue;
2287
2288                         injail = prison_flag(inp->inp_cred, PR_IP4);
2289                         if (injail) {
2290                                 if (prison_check_ip4(inp->inp_cred,
2291                                     &laddr) != 0)
2292                                         continue;
2293                         } else {
2294                                 if (local_exact != NULL)
2295                                         continue;
2296                         }
2297
2298                         if (inp->inp_laddr.s_addr == laddr.s_addr) {
2299                                 if (injail)
2300                                         return (inp);
2301                                 else
2302                                         local_exact = inp;
2303                         } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
2304 #ifdef INET6
2305                                 /* XXX inp locking, NULL check */
2306                                 if (inp->inp_vflag & INP_IPV6PROTO)
2307                                         local_wild_mapped = inp;
2308                                 else
2309 #endif
2310                                         if (injail)
2311                                                 jail_wild = inp;
2312                                         else
2313                                                 local_wild = inp;
2314                         }
2315                 } /* LIST_FOREACH */
2316                 if (jail_wild != NULL)
2317                         return (jail_wild);
2318                 if (local_exact != NULL)
2319                         return (local_exact);
2320                 if (local_wild != NULL)
2321                         return (local_wild);
2322 #ifdef INET6
2323                 if (local_wild_mapped != NULL)
2324                         return (local_wild_mapped);
2325 #endif
2326         } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
2327
2328         return (NULL);
2329 }
2330
2331 /*
2332  * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
2333  * hash list lock, and will return the inpcb locked (i.e., requires
2334  * INPLOOKUP_LOCKPCB).
2335  */
2336 static struct inpcb *
2337 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2338     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2339     struct ifnet *ifp)
2340 {
2341         struct inpcb *inp;
2342         bool locked;
2343
2344         INP_HASH_RLOCK(pcbinfo);
2345         inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
2346             (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
2347         if (inp != NULL) {
2348                 if (lookupflags & INPLOOKUP_WLOCKPCB)
2349                         locked = INP_TRY_WLOCK(inp);
2350                 else if (lookupflags & INPLOOKUP_RLOCKPCB)
2351                         locked = INP_TRY_RLOCK(inp);
2352                 else
2353                         panic("%s: locking bug", __func__);
2354                 if (!locked)
2355                         in_pcbref(inp);
2356                 INP_HASH_RUNLOCK(pcbinfo);
2357                 if (!locked) {
2358                         if (lookupflags & INPLOOKUP_WLOCKPCB) {
2359                                 INP_WLOCK(inp);
2360                                 if (in_pcbrele_wlocked(inp))
2361                                         return (NULL);
2362                         } else {
2363                                 INP_RLOCK(inp);
2364                                 if (in_pcbrele_rlocked(inp))
2365                                         return (NULL);
2366                         }
2367                 }
2368 #ifdef INVARIANTS
2369                 if (lookupflags & INPLOOKUP_WLOCKPCB)
2370                         INP_WLOCK_ASSERT(inp);
2371                 else
2372                         INP_RLOCK_ASSERT(inp);
2373 #endif
2374         } else
2375                 INP_HASH_RUNLOCK(pcbinfo);
2376         return (inp);
2377 }
2378
2379 /*
2380  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2381  * from which a pre-calculated hash value may be extracted.
2382  *
2383  * Possibly more of this logic should be in in_pcbgroup.c.
2384  */
2385 struct inpcb *
2386 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2387     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
2388 {
2389 #if defined(PCBGROUP) && !defined(RSS)
2390         struct inpcbgroup *pcbgroup;
2391 #endif
2392
2393         KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2394             ("%s: invalid lookup flags %d", __func__, lookupflags));
2395         KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2396             ("%s: LOCKPCB not set", __func__));
2397
2398         /*
2399          * When not using RSS, use connection groups in preference to the
2400          * reservation table when looking up 4-tuples.  When using RSS, just
2401          * use the reservation table, due to the cost of the Toeplitz hash
2402          * in software.
2403          *
2404          * XXXRW: This policy belongs in the pcbgroup code, as in principle
2405          * we could be doing RSS with a non-Toeplitz hash that is affordable
2406          * in software.
2407          */
2408 #if defined(PCBGROUP) && !defined(RSS)
2409         if (in_pcbgroup_enabled(pcbinfo)) {
2410                 pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
2411                     fport);
2412                 return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
2413                     laddr, lport, lookupflags, ifp));
2414         }
2415 #endif
2416         return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2417             lookupflags, ifp));
2418 }
2419
2420 struct inpcb *
2421 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2422     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2423     struct ifnet *ifp, struct mbuf *m)
2424 {
2425 #ifdef PCBGROUP
2426         struct inpcbgroup *pcbgroup;
2427 #endif
2428
2429         KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2430             ("%s: invalid lookup flags %d", __func__, lookupflags));
2431         KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2432             ("%s: LOCKPCB not set", __func__));
2433
2434 #ifdef PCBGROUP
2435         /*
2436          * If we can use a hardware-generated hash to look up the connection
2437          * group, use that connection group to find the inpcb.  Otherwise
2438          * fall back on a software hash -- or the reservation table if we're
2439          * using RSS.
2440          *
2441          * XXXRW: As above, that policy belongs in the pcbgroup code.
2442          */
2443         if (in_pcbgroup_enabled(pcbinfo) &&
2444             !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
2445                 pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
2446                     m->m_pkthdr.flowid);
2447                 if (pcbgroup != NULL)
2448                         return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
2449                             fport, laddr, lport, lookupflags, ifp));
2450 #ifndef RSS
2451                 pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
2452                     fport);
2453                 return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
2454                     laddr, lport, lookupflags, ifp));
2455 #endif
2456         }
2457 #endif
2458         return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2459             lookupflags, ifp));
2460 }
2461 #endif /* INET */
2462
2463 /*
2464  * Insert PCB onto various hash lists.
2465  */
2466 static int
2467 in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update)
2468 {
2469         struct inpcbhead *pcbhash;
2470         struct inpcbporthead *pcbporthash;
2471         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2472         struct inpcbport *phd;
2473         u_int32_t hashkey_faddr;
2474         int so_options;
2475
2476         INP_WLOCK_ASSERT(inp);
2477         INP_HASH_WLOCK_ASSERT(pcbinfo);
2478
2479         KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
2480             ("in_pcbinshash: INP_INHASHLIST"));
2481
2482 #ifdef INET6
2483         if (inp->inp_vflag & INP_IPV6)
2484                 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
2485         else
2486 #endif
2487         hashkey_faddr = inp->inp_faddr.s_addr;
2488
2489         pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
2490                  inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2491
2492         pcbporthash = &pcbinfo->ipi_porthashbase[
2493             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
2494
2495
2496         /*
2497          * Add entry in lb group
2498          * Only do this if SO_REUSEPORT_LB is set
2499          */
2500         so_options = inp_so_options(inp);
2501         if(so_options & SO_REUSEPORT_LB) {
2502                 int ret = in_pcbinslbgrouphash(inp, pcbinfo);
2503                 if(ret) {
2504                         // pcb lb group malloc fail (ret=ENOBUFS)
2505                         return ret;
2506                 }
2507         }
2508
2509         /*
2510          * Go through port list and look for a head for this lport.
2511          */
2512         LIST_FOREACH(phd, pcbporthash, phd_hash) {
2513                 if (phd->phd_port == inp->inp_lport)
2514                         break;
2515         }
2516         /*
2517          * If none exists, malloc one and tack it on.
2518          */
2519         if (phd == NULL) {
2520                 phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
2521                 if (phd == NULL) {
2522                         return (ENOBUFS); /* XXX */
2523                 }
2524                 phd->phd_port = inp->inp_lport;
2525                 LIST_INIT(&phd->phd_pcblist);
2526                 LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
2527         }
2528         inp->inp_phd = phd;
2529         LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
2530         LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
2531         inp->inp_flags |= INP_INHASHLIST;
2532 #ifdef PCBGROUP
2533         if (do_pcbgroup_update)
2534                 in_pcbgroup_update(inp);
2535 #endif
2536         return (0);
2537 }
2538
2539 /*
2540  * For now, there are two public interfaces to insert an inpcb into the hash
2541  * lists -- one that does update pcbgroups, and one that doesn't.  The latter
2542  * is used only in the TCP syncache, where in_pcbinshash is called before the
2543  * full 4-tuple is set for the inpcb, and we don't want to install in the
2544  * pcbgroup until later.
2545  *
2546  * XXXRW: This seems like a misfeature.  in_pcbinshash should always update
2547  * connection groups, and partially initialised inpcbs should not be exposed
2548  * to either reservation hash tables or pcbgroups.
2549  */
/* Hash-list insertion that also updates the inpcb's connection group. */
int
in_pcbinshash(struct inpcb *inp)
{
	const int do_pcbgroup_update = 1;

	return (in_pcbinshash_internal(inp, do_pcbgroup_update));
}
2556
/* Hash-list insertion that leaves the connection group untouched. */
int
in_pcbinshash_nopcbgroup(struct inpcb *inp)
{
	const int do_pcbgroup_update = 0;

	return (in_pcbinshash_internal(inp, do_pcbgroup_update));
}
2563
2564 /*
2565  * Move PCB to the proper hash bucket when { faddr, fport } have  been
2566  * changed. NOTE: This does not handle the case of the lport changing (the
2567  * hashed port list would have to be updated as well), so the lport must
2568  * not change after in_pcbinshash() has been called.
2569  */
2570 void
2571 in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
2572 {
2573         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2574         struct inpcbhead *head;
2575         u_int32_t hashkey_faddr;
2576
2577         INP_WLOCK_ASSERT(inp);
2578         INP_HASH_WLOCK_ASSERT(pcbinfo);
2579
2580         KASSERT(inp->inp_flags & INP_INHASHLIST,
2581             ("in_pcbrehash: !INP_INHASHLIST"));
2582
2583 #ifdef INET6
2584         if (inp->inp_vflag & INP_IPV6)
2585                 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
2586         else
2587 #endif
2588         hashkey_faddr = inp->inp_faddr.s_addr;
2589
2590         head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
2591                 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2592
2593         LIST_REMOVE(inp, inp_hash);
2594         LIST_INSERT_HEAD(head, inp, inp_hash);
2595
2596 #ifdef PCBGROUP
2597         if (m != NULL)
2598                 in_pcbgroup_update_mbuf(inp, m);
2599         else
2600                 in_pcbgroup_update(inp);
2601 #endif
2602 }
2603
2604 void
2605 in_pcbrehash(struct inpcb *inp)
2606 {
2607
2608         in_pcbrehash_mbuf(inp, NULL);
2609 }
2610
2611 /*
2612  * Remove PCB from various lists.
2613  */
2614 static void
2615 in_pcbremlists(struct inpcb *inp)
2616 {
2617         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2618
2619 #ifdef INVARIANTS
2620         if (pcbinfo == &V_tcbinfo) {
2621                 INP_INFO_RLOCK_ASSERT(pcbinfo);
2622         } else {
2623                 INP_INFO_WLOCK_ASSERT(pcbinfo);
2624         }
2625 #endif
2626
2627         INP_WLOCK_ASSERT(inp);
2628         INP_LIST_WLOCK_ASSERT(pcbinfo);
2629
2630         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
2631         if (inp->inp_flags & INP_INHASHLIST) {
2632                 struct inpcbport *phd = inp->inp_phd;
2633
2634                 INP_HASH_WLOCK(pcbinfo);
2635
2636                 // XXX Only do if SO_REUSEPORT_LB set?
2637                 in_pcbremlbgrouphash(inp, pcbinfo);
2638
2639                 LIST_REMOVE(inp, inp_hash);
2640                 LIST_REMOVE(inp, inp_portlist);
2641                 if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
2642                         LIST_REMOVE(phd, phd_hash);
2643                         free(phd, M_PCB);
2644                 }
2645                 INP_HASH_WUNLOCK(pcbinfo);
2646                 inp->inp_flags &= ~INP_INHASHLIST;
2647         }
2648         LIST_REMOVE(inp, inp_list);
2649         pcbinfo->ipi_count--;
2650 #ifdef PCBGROUP
2651         in_pcbgroup_remove(inp);
2652 #endif
2653 }
2654
2655 /*
2656  * Check for alternatives when higher level complains
2657  * about service problems.  For now, invalidate cached
2658  * routing information.  If the route was created dynamically
2659  * (by a redirect), time to try a default gateway again.
2660  */
2661 void
2662 in_losing(struct inpcb *inp)
2663 {
2664
2665         RO_INVALIDATE_CACHE(&inp->inp_route);
2666         return;
2667 }
2668
/*
 * A set label operation has occurred at the socket layer; propagate the
 * label change into the in_pcb for the socket.  Does nothing unless MAC is
 * compiled in.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	/* The inpcb lock is taken before the socket lock. */
	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2689
2690 /*
2691  * ipport_tick runs once per second, determining if random port allocation
2692  * should be continued.  If more than ipport_randomcps ports have been
2693  * allocated in the last second, then we return to sequential port
2694  * allocation. We return to random allocation only once we drop below
2695  * ipport_randomcps for at least ipport_randomtime seconds.
2696  */
2697 static void
2698 ipport_tick(void *xtp)
2699 {
2700         VNET_ITERATOR_DECL(vnet_iter);
2701
2702         VNET_LIST_RLOCK_NOSLEEP();
2703         VNET_FOREACH(vnet_iter) {
2704                 CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */
2705                 if (V_ipport_tcpallocs <=
2706                     V_ipport_tcplastcount + V_ipport_randomcps) {
2707                         if (V_ipport_stoprandom > 0)
2708                                 V_ipport_stoprandom--;
2709                 } else
2710                         V_ipport_stoprandom = V_ipport_randomtime;
2711                 V_ipport_tcplastcount = V_ipport_tcpallocs;
2712                 CURVNET_RESTORE();
2713         }
2714         VNET_LIST_RUNLOCK_NOSLEEP();
2715         callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
2716 }
2717
2718 static void
2719 ip_fini(void *xtp)
2720 {
2721
2722         callout_stop(&ipport_tick_callout);
2723 }
2724
2725 /* 
2726  * The ipport_callout should start running at about the time we attach the
2727  * inet or inet6 domains.
2728  */
2729 static void
2730 ipport_tick_init(const void *unused __unused)
2731 {
2732
2733         /* Start ipport_tick. */
2734         callout_init(&ipport_tick_callout, 1);
2735         callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
2736         EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
2737                 SHUTDOWN_PRI_DEFAULT);
2738 }
2739 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, 
2740     ipport_tick_init, NULL);
2741
/* Function wrapper around the INP_WLOCK() macro. */
void
inp_wlock(struct inpcb *inp)
{
	INP_WLOCK(inp);
}
2748
/* Function wrapper around the INP_WUNLOCK() macro. */
void
inp_wunlock(struct inpcb *inp)
{
	INP_WUNLOCK(inp);
}
2755
/* Function wrapper around the INP_RLOCK() macro. */
void
inp_rlock(struct inpcb *inp)
{
	INP_RLOCK(inp);
}
2762
/* Function wrapper around the INP_RUNLOCK() macro. */
void
inp_runlock(struct inpcb *inp)
{
	INP_RUNLOCK(inp);
}
2769
2770 #ifdef INVARIANT_SUPPORT
/* Function wrapper around INP_WLOCK_ASSERT(). */
void
inp_lock_assert(struct inpcb *inp)
{
	INP_WLOCK_ASSERT(inp);
}
2777
/* Function wrapper around INP_UNLOCK_ASSERT(). */
void
inp_unlock_assert(struct inpcb *inp)
{
	INP_UNLOCK_ASSERT(inp);
}
2784 #endif
2785
2786 void
2787 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
2788 {
2789         struct inpcb *inp;
2790
2791         INP_INFO_WLOCK(&V_tcbinfo);
2792         LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
2793                 INP_WLOCK(inp);
2794                 func(inp, arg);
2795                 INP_WUNLOCK(inp);
2796         }
2797         INP_INFO_WUNLOCK(&V_tcbinfo);
2798 }
2799
2800 struct socket *
2801 inp_inpcbtosocket(struct inpcb *inp)
2802 {
2803
2804         INP_WLOCK_ASSERT(inp);
2805         return (inp->inp_socket);
2806 }
2807
2808 struct tcpcb *
2809 inp_inpcbtotcpcb(struct inpcb *inp)
2810 {
2811
2812         INP_WLOCK_ASSERT(inp);
2813         return ((struct tcpcb *)inp->inp_ppcb);
2814 }
2815
2816 int
2817 inp_ip_tos_get(const struct inpcb *inp)
2818 {
2819
2820         return (inp->inp_ip_tos);
2821 }
2822
2823 void
2824 inp_ip_tos_set(struct inpcb *inp, int val)
2825 {
2826
2827         inp->inp_ip_tos = val;
2828 }
2829
2830 void
2831 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
2832     uint32_t *faddr, uint16_t *fp)
2833 {
2834
2835         INP_LOCK_ASSERT(inp);
2836         *laddr = inp->inp_laddr.s_addr;
2837         *faddr = inp->inp_faddr.s_addr;
2838         *lp = inp->inp_lport;
2839         *fp = inp->inp_fport;
2840 }
2841
/* Function wrapper around the sotoinpcb() conversion. */
struct inpcb *
so_sotoinpcb(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	return (inp);
}
2848
/* Function wrapper around the sototcpcb() conversion. */
struct tcpcb *
so_sototcpcb(struct socket *so)
{
	struct tcpcb *tp;

	tp = sototcpcb(so);
	return (tp);
}
2855
2856 /*
2857  * Create an external-format (``xinpcb'') structure using the information in
2858  * the kernel-format in_pcb structure pointed to by inp.  This is done to
2859  * reduce the spew of irrelevant information over this interface, to isolate
2860  * user code from changes in the kernel structure, and potentially to provide
2861  * information-hiding if we decide that some of this information should be
2862  * hidden from users.
2863  */
2864 void
2865 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
2866 {
2867
2868         xi->xi_len = sizeof(struct xinpcb);
2869         if (inp->inp_socket)
2870                 sotoxsocket(inp->inp_socket, &xi->xi_socket);
2871         else
2872                 bzero(&xi->xi_socket, sizeof(struct xsocket));
2873         bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
2874         xi->inp_gencnt = inp->inp_gencnt;
2875         xi->inp_ppcb = inp->inp_ppcb;
2876         xi->inp_flow = inp->inp_flow;
2877         xi->inp_flowid = inp->inp_flowid;
2878         xi->inp_flowtype = inp->inp_flowtype;
2879         xi->inp_flags = inp->inp_flags;
2880         xi->inp_flags2 = inp->inp_flags2;
2881         xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket;
2882         xi->in6p_cksum = inp->in6p_cksum;
2883         xi->in6p_hops = inp->in6p_hops;
2884         xi->inp_ip_tos = inp->inp_ip_tos;
2885         xi->inp_vflag = inp->inp_vflag;
2886         xi->inp_ip_ttl = inp->inp_ip_ttl;
2887         xi->inp_ip_p = inp->inp_ip_p;
2888         xi->inp_ip_minttl = inp->inp_ip_minttl;
2889 }
2890
2891 #ifdef DDB
/* Print `indent' spaces to the debugger; prints nothing when indent <= 0. */
static void
db_print_indent(int indent)
{
	int remaining;

	remaining = indent;
	while (remaining > 0) {
		db_printf(" ");
		remaining--;
	}
}
2900
2901 static void
2902 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
2903 {
2904         char faddr_str[48], laddr_str[48];
2905
2906         db_print_indent(indent);
2907         db_printf("%s at %p\n", name, inc);
2908
2909         indent += 2;
2910
2911 #ifdef INET6
2912         if (inc->inc_flags & INC_ISIPV6) {
2913                 /* IPv6. */
2914                 ip6_sprintf(laddr_str, &inc->inc6_laddr);
2915                 ip6_sprintf(faddr_str, &inc->inc6_faddr);
2916         } else
2917 #endif
2918         {
2919                 /* IPv4. */
2920                 inet_ntoa_r(inc->inc_laddr, laddr_str);
2921                 inet_ntoa_r(inc->inc_faddr, faddr_str);
2922         }
2923         db_print_indent(indent);
2924         db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
2925             ntohs(inc->inc_lport));
2926         db_print_indent(indent);
2927         db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
2928             ntohs(inc->inc_fport));
2929 }
2930
2931 static void
2932 db_print_inpflags(int inp_flags)
2933 {
2934         int comma;
2935
2936         comma = 0;
2937         if (inp_flags & INP_RECVOPTS) {
2938                 db_printf("%sINP_RECVOPTS", comma ? ", " : "");
2939                 comma = 1;
2940         }
2941         if (inp_flags & INP_RECVRETOPTS) {
2942                 db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
2943                 comma = 1;
2944         }
2945         if (inp_flags & INP_RECVDSTADDR) {
2946                 db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
2947                 comma = 1;
2948         }
2949         if (inp_flags & INP_ORIGDSTADDR) {
2950                 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
2951                 comma = 1;
2952         }
2953         if (inp_flags & INP_HDRINCL) {
2954                 db_printf("%sINP_HDRINCL", comma ? ", " : "");
2955                 comma = 1;
2956         }
2957         if (inp_flags & INP_HIGHPORT) {
2958                 db_printf("%sINP_HIGHPORT", comma ? ", " : "");
2959                 comma = 1;
2960         }
2961         if (inp_flags & INP_LOWPORT) {
2962                 db_printf("%sINP_LOWPORT", comma ? ", " : "");
2963                 comma = 1;
2964         }
2965         if (inp_flags & INP_ANONPORT) {
2966                 db_printf("%sINP_ANONPORT", comma ? ", " : "");
2967                 comma = 1;
2968         }
2969         if (inp_flags & INP_RECVIF) {
2970                 db_printf("%sINP_RECVIF", comma ? ", " : "");
2971                 comma = 1;
2972         }
2973         if (inp_flags & INP_MTUDISC) {
2974                 db_printf("%sINP_MTUDISC", comma ? ", " : "");
2975                 comma = 1;
2976         }
2977         if (inp_flags & INP_RECVTTL) {
2978                 db_printf("%sINP_RECVTTL", comma ? ", " : "");
2979                 comma = 1;
2980         }
2981         if (inp_flags & INP_DONTFRAG) {
2982                 db_printf("%sINP_DONTFRAG", comma ? ", " : "");
2983                 comma = 1;
2984         }
2985         if (inp_flags & INP_RECVTOS) {
2986                 db_printf("%sINP_RECVTOS", comma ? ", " : "");
2987                 comma = 1;
2988         }
2989         if (inp_flags & IN6P_IPV6_V6ONLY) {
2990                 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
2991                 comma = 1;
2992         }
2993         if (inp_flags & IN6P_PKTINFO) {
2994                 db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
2995                 comma = 1;
2996         }
2997         if (inp_flags & IN6P_HOPLIMIT) {
2998                 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
2999                 comma = 1;
3000         }
3001         if (inp_flags & IN6P_HOPOPTS) {
3002                 db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
3003                 comma = 1;
3004         }
3005         if (inp_flags & IN6P_DSTOPTS) {
3006                 db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
3007                 comma = 1;
3008         }
3009         if (inp_flags & IN6P_RTHDR) {
3010                 db_printf("%sIN6P_RTHDR", comma ? ", " : "");
3011                 comma = 1;
3012         }
3013         if (inp_flags & IN6P_RTHDRDSTOPTS) {
3014                 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
3015                 comma = 1;
3016         }
3017         if (inp_flags & IN6P_TCLASS) {
3018                 db_printf("%sIN6P_TCLASS", comma ? ", " : "");
3019                 comma = 1;
3020         }
3021         if (inp_flags & IN6P_AUTOFLOWLABEL) {
3022                 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
3023                 comma = 1;
3024         }
3025         if (inp_flags & INP_TIMEWAIT) {
3026                 db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
3027                 comma  = 1;
3028         }
3029         if (inp_flags & INP_ONESBCAST) {
3030                 db_printf("%sINP_ONESBCAST", comma ? ", " : "");
3031                 comma  = 1;
3032         }
3033         if (inp_flags & INP_DROPPED) {
3034                 db_printf("%sINP_DROPPED", comma ? ", " : "");
3035                 comma  = 1;
3036         }
3037         if (inp_flags & INP_SOCKREF) {
3038                 db_printf("%sINP_SOCKREF", comma ? ", " : "");
3039                 comma  = 1;
3040         }
3041         if (inp_flags & IN6P_RFC2292) {
3042                 db_printf("%sIN6P_RFC2292", comma ? ", " : "");
3043                 comma = 1;
3044         }
3045         if (inp_flags & IN6P_MTU) {
3046                 db_printf("IN6P_MTU%s", comma ? ", " : "");
3047                 comma = 1;
3048         }
3049 }
3050
3051 static void
3052 db_print_inpvflag(u_char inp_vflag)
3053 {
3054         int comma;
3055
3056         comma = 0;
3057         if (inp_vflag & INP_IPV4) {
3058                 db_printf("%sINP_IPV4", comma ? ", " : "");
3059                 comma  = 1;
3060         }
3061         if (inp_vflag & INP_IPV6) {
3062                 db_printf("%sINP_IPV6", comma ? ", " : "");
3063                 comma  = 1;
3064         }
3065         if (inp_vflag & INP_IPV6PROTO) {
3066                 db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
3067                 comma  = 1;
3068         }
3069 }
3070
3071 static void
3072 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
3073 {
3074
3075         db_print_indent(indent);
3076         db_printf("%s at %p\n", name, inp);
3077
3078         indent += 2;
3079
3080         db_print_indent(indent);
3081         db_printf("inp_flow: 0x%x\n", inp->inp_flow);
3082
3083         db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
3084
3085         db_print_indent(indent);
3086         db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
3087             inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
3088
3089         db_print_indent(indent);
3090         db_printf("inp_label: %p   inp_flags: 0x%x (",
3091            inp->inp_label, inp->inp_flags);
3092         db_print_inpflags(inp->inp_flags);
3093         db_printf(")\n");
3094
3095         db_print_indent(indent);
3096         db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
3097             inp->inp_vflag);
3098         db_print_inpvflag(inp->inp_vflag);
3099         db_printf(")\n");
3100
3101         db_print_indent(indent);
3102         db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
3103             inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
3104
3105         db_print_indent(indent);
3106 #ifdef INET6
3107         if (inp->inp_vflag & INP_IPV6) {
3108                 db_printf("in6p_options: %p   in6p_outputopts: %p   "
3109                     "in6p_moptions: %p\n", inp->in6p_options,
3110                     inp->in6p_outputopts, inp->in6p_moptions);
3111                 db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
3112                     "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
3113                     inp->in6p_hops);
3114         } else
3115 #endif
3116         {
3117                 db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
3118                     "inp_ip_moptions: %p\n", inp->inp_ip_tos,
3119                     inp->inp_options, inp->inp_moptions);
3120         }
3121
3122         db_print_indent(indent);
3123         db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
3124             (uintmax_t)inp->inp_gencnt);
3125 }
3126
3127 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
3128 {
3129         struct inpcb *inp;
3130
3131         if (!have_addr) {
3132                 db_printf("usage: show inpcb <addr>\n");
3133                 return;
3134         }
3135         inp = (struct inpcb *)addr;
3136
3137         db_print_inpcb(inp, "inpcb", 0);
3138 }
3139 #endif /* DDB */
3140
3141 #ifdef RATELIMIT
3142 /*
3143  * Modify TX rate limit based on the existing "inp->inp_snd_tag",
3144  * if any.
3145  */
3146 int
3147 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3148 {
3149         union if_snd_tag_modify_params params = {
3150                 .rate_limit.max_rate = max_pacing_rate,
3151         };
3152         struct m_snd_tag *mst;
3153         struct ifnet *ifp;
3154         int error;
3155
3156         mst = inp->inp_snd_tag;
3157         if (mst == NULL)
3158                 return (EINVAL);
3159
3160         ifp = mst->ifp;
3161         if (ifp == NULL)
3162                 return (EINVAL);
3163
3164         if (ifp->if_snd_tag_modify == NULL) {
3165                 error = EOPNOTSUPP;
3166         } else {
3167                 error = ifp->if_snd_tag_modify(mst, &params);
3168         }
3169         return (error);
3170 }
3171
3172 /*
3173  * Query existing TX rate limit based on the existing
3174  * "inp->inp_snd_tag", if any.
3175  */
3176 int
3177 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3178 {
3179         union if_snd_tag_query_params params = { };
3180         struct m_snd_tag *mst;
3181         struct ifnet *ifp;
3182         int error;
3183
3184         mst = inp->inp_snd_tag;
3185         if (mst == NULL)
3186                 return (EINVAL);
3187
3188         ifp = mst->ifp;
3189         if (ifp == NULL)
3190                 return (EINVAL);
3191
3192         if (ifp->if_snd_tag_query == NULL) {
3193                 error = EOPNOTSUPP;
3194         } else {
3195                 error = ifp->if_snd_tag_query(mst, &params);
3196                 if (error == 0 &&  p_max_pacing_rate != NULL)
3197                         *p_max_pacing_rate = params.rate_limit.max_rate;
3198         }
3199         return (error);
3200 }
3201
3202 /*
3203  * Query existing TX queue level based on the existing
3204  * "inp->inp_snd_tag", if any.
3205  */
3206 int
3207 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
3208 {
3209         union if_snd_tag_query_params params = { };
3210         struct m_snd_tag *mst;
3211         struct ifnet *ifp;
3212         int error;
3213
3214         mst = inp->inp_snd_tag;
3215         if (mst == NULL)
3216                 return (EINVAL);
3217
3218         ifp = mst->ifp;
3219         if (ifp == NULL)
3220                 return (EINVAL);
3221
3222         if (ifp->if_snd_tag_query == NULL)
3223                 return (EOPNOTSUPP);
3224
3225         error = ifp->if_snd_tag_query(mst, &params);
3226         if (error == 0 &&  p_txqueue_level != NULL)
3227                 *p_txqueue_level = params.rate_limit.queue_level;
3228         return (error);
3229 }
3230
3231 /*
3232  * Allocate a new TX rate limit send tag from the network interface
3233  * given by the "ifp" argument and save it in "inp->inp_snd_tag":
3234  */
3235 int
3236 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
3237     uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
3238 {
3239         union if_snd_tag_alloc_params params = {
3240                 .rate_limit.hdr.type = (max_pacing_rate == -1U) ?
3241                     IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
3242                 .rate_limit.hdr.flowid = flowid,
3243                 .rate_limit.hdr.flowtype = flowtype,
3244                 .rate_limit.max_rate = max_pacing_rate,
3245         };
3246         int error;
3247
3248         INP_WLOCK_ASSERT(inp);
3249
3250         if (inp->inp_snd_tag != NULL)
3251                 return (EINVAL);
3252
3253         if (ifp->if_snd_tag_alloc == NULL) {
3254                 error = EOPNOTSUPP;
3255         } else {
3256                 error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
3257
3258                 /*
3259                  * At success increment the refcount on
3260                  * the send tag's network interface:
3261                  */
3262                 if (error == 0)
3263                         if_ref(inp->inp_snd_tag->ifp);
3264         }
3265         return (error);
3266 }
3267
3268 /*
3269  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3270  * if any:
3271  */
3272 void
3273 in_pcbdetach_txrtlmt(struct inpcb *inp)
3274 {
3275         struct m_snd_tag *mst;
3276         struct ifnet *ifp;
3277
3278         INP_WLOCK_ASSERT(inp);
3279
3280         mst = inp->inp_snd_tag;
3281         inp->inp_snd_tag = NULL;
3282
3283         if (mst == NULL)
3284                 return;
3285
3286         ifp = mst->ifp;
3287         if (ifp == NULL)
3288                 return;
3289
3290         /*
3291          * If the device was detached while we still had reference(s)
3292          * on the ifp, we assume if_snd_tag_free() was replaced with
3293          * stubs.
3294          */
3295         ifp->if_snd_tag_free(mst);
3296
3297         /* release reference count on network interface */
3298         if_rele(ifp);
3299 }
3300
3301 /*
3302  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
3303  * is set in the fast path and will attach/detach/modify the TX rate
3304  * limit send tag based on the socket's so_max_pacing_rate value.
3305  */
3306 void
3307 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3308 {
3309         struct socket *socket;
3310         uint32_t max_pacing_rate;
3311         bool did_upgrade;
3312         int error;
3313
3314         if (inp == NULL)
3315                 return;
3316
3317         socket = inp->inp_socket;
3318         if (socket == NULL)
3319                 return;
3320
3321         if (!INP_WLOCKED(inp)) {
3322                 /*
3323                  * NOTE: If the write locking fails, we need to bail
3324                  * out and use the non-ratelimited ring for the
3325                  * transmit until there is a new chance to get the
3326                  * write lock.
3327                  */
3328                 if (!INP_TRY_UPGRADE(inp))
3329                         return;
3330                 did_upgrade = 1;
3331         } else {
3332                 did_upgrade = 0;
3333         }
3334
3335         /*
3336          * NOTE: The so_max_pacing_rate value is read unlocked,
3337          * because atomic updates are not required since the variable
3338          * is checked at every mbuf we send. It is assumed that the
3339          * variable read itself will be atomic.
3340          */
3341         max_pacing_rate = socket->so_max_pacing_rate;
3342
3343         /*
3344          * NOTE: When attaching to a network interface a reference is
3345          * made to ensure the network interface doesn't go away until
3346          * all ratelimit connections are gone. The network interface
3347          * pointers compared below represent valid network interfaces,
3348          * except when comparing towards NULL.
3349          */
3350         if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
3351                 error = 0;
3352         } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
3353                 if (inp->inp_snd_tag != NULL)
3354                         in_pcbdetach_txrtlmt(inp);
3355                 error = 0;
3356         } else if (inp->inp_snd_tag == NULL) {
3357                 /*
3358                  * In order to utilize packet pacing with RSS, we need
3359                  * to wait until there is a valid RSS hash before we
3360                  * can proceed:
3361                  */
3362                 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
3363                         error = EAGAIN;
3364                 } else {
3365                         error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
3366                             mb->m_pkthdr.flowid, max_pacing_rate);
3367                 }
3368         } else {
3369                 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
3370         }
3371         if (error == 0 || error == EOPNOTSUPP)
3372                 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
3373         if (did_upgrade)
3374                 INP_DOWNGRADE(inp);
3375 }
3376
3377 /*
3378  * Track route changes for TX rate limiting.
3379  */
3380 void
3381 in_pcboutput_eagain(struct inpcb *inp)
3382 {
3383         struct socket *socket;
3384         bool did_upgrade;
3385
3386         if (inp == NULL)
3387                 return;
3388
3389         socket = inp->inp_socket;
3390         if (socket == NULL)
3391                 return;
3392
3393         if (inp->inp_snd_tag == NULL)
3394                 return;
3395
3396         if (!INP_WLOCKED(inp)) {
3397                 /*
3398                  * NOTE: If the write locking fails, we need to bail
3399                  * out and use the non-ratelimited ring for the
3400                  * transmit until there is a new chance to get the
3401                  * write lock.
3402                  */
3403                 if (!INP_TRY_UPGRADE(inp))
3404                         return;
3405                 did_upgrade = 1;
3406         } else {
3407                 did_upgrade = 0;
3408         }
3409
3410         /* detach rate limiting */
3411         in_pcbdetach_txrtlmt(inp);
3412
3413         /* make sure new mbuf send tag allocation is made */
3414         inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3415
3416         if (did_upgrade)
3417                 INP_DOWNGRADE(inp);
3418 }
3419 #endif /* RATELIMIT */