]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/in_pcb.c
Merge ^/head r317281 through r317502.
[FreeBSD/FreeBSD.git] / sys / netinet / in_pcb.c
1 /*-
2  * Copyright (c) 1982, 1986, 1991, 1993, 1995
3  *      The Regents of the University of California.
4  * Copyright (c) 2007-2009 Robert N. M. Watson
5  * Copyright (c) 2010-2011 Juniper Networks, Inc.
6  * All rights reserved.
7  *
8  * Portions of this software were developed by Robert N. M. Watson under
9  * contract to Juniper Networks, Inc.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *      @(#)in_pcb.c    8.4 (Berkeley) 5/24/95
36  */
37
38 #include <sys/cdefs.h>
39 __FBSDID("$FreeBSD$");
40
41 #include "opt_ddb.h"
42 #include "opt_ipsec.h"
43 #include "opt_inet.h"
44 #include "opt_inet6.h"
45 #include "opt_ratelimit.h"
46 #include "opt_pcbgroup.h"
47 #include "opt_rss.h"
48
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/lock.h>
52 #include <sys/malloc.h>
53 #include <sys/mbuf.h>
54 #include <sys/callout.h>
55 #include <sys/eventhandler.h>
56 #include <sys/domain.h>
57 #include <sys/protosw.h>
58 #include <sys/rmlock.h>
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/sockio.h>
62 #include <sys/priv.h>
63 #include <sys/proc.h>
64 #include <sys/refcount.h>
65 #include <sys/jail.h>
66 #include <sys/kernel.h>
67 #include <sys/sysctl.h>
68
69 #ifdef DDB
70 #include <ddb/ddb.h>
71 #endif
72
73 #include <vm/uma.h>
74
75 #include <net/if.h>
76 #include <net/if_var.h>
77 #include <net/if_types.h>
78 #include <net/if_llatbl.h>
79 #include <net/route.h>
80 #include <net/rss_config.h>
81 #include <net/vnet.h>
82
83 #if defined(INET) || defined(INET6)
84 #include <netinet/in.h>
85 #include <netinet/in_pcb.h>
86 #include <netinet/ip_var.h>
87 #include <netinet/tcp_var.h>
88 #include <netinet/udp.h>
89 #include <netinet/udp_var.h>
90 #endif
91 #ifdef INET
92 #include <netinet/in_var.h>
93 #endif
94 #ifdef INET6
95 #include <netinet/ip6.h>
96 #include <netinet6/in6_pcb.h>
97 #include <netinet6/in6_var.h>
98 #include <netinet6/ip6_var.h>
99 #endif /* INET6 */
100
101 #include <netipsec/ipsec_support.h>
102
103 #include <security/mac/mac_framework.h>
104
105 static struct callout   ipport_tick_callout;
106
107 /*
108  * These configure the range of local port addresses assigned to
109  * "unspecified" outgoing connections/packets/whatever.
110  */
111 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;    /* 1023 */
112 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;    /* 600 */
113 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;     /* 10000 */
114 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;       /* 65535 */
115 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;      /* 49152 */
116 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;        /* 65535 */
117
118 /*
119  * Reserved ports accessible only to root. There are significant
120  * security considerations that must be accounted for when changing these,
121  * but the security benefits can be great. Please be careful.
122  */
123 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;    /* 1023 */
124 VNET_DEFINE(int, ipport_reservedlow);
125
126 /* Variables dealing with random ephemeral port allocation. */
127 VNET_DEFINE(int, ipport_randomized) = 1;        /* user controlled via sysctl */
128 VNET_DEFINE(int, ipport_randomcps) = 10;        /* user controlled via sysctl */
129 VNET_DEFINE(int, ipport_randomtime) = 45;       /* user controlled via sysctl */
130 VNET_DEFINE(int, ipport_stoprandom);            /* toggled by ipport_tick */
131 VNET_DEFINE(int, ipport_tcpallocs);
132 static VNET_DEFINE(int, ipport_tcplastcount);
133
134 #define V_ipport_tcplastcount           VNET(ipport_tcplastcount)
135
136 static void     in_pcbremlists(struct inpcb *inp);
137 #ifdef INET
138 static struct inpcb     *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
139                             struct in_addr faddr, u_int fport_arg,
140                             struct in_addr laddr, u_int lport_arg,
141                             int lookupflags, struct ifnet *ifp);
142
143 #define RANGECHK(var, min, max) \
144         if ((var) < (min)) { (var) = (min); } \
145         else if ((var) > (max)) { (var) = (max); }
146
147 static int
148 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
149 {
150         int error;
151
152         error = sysctl_handle_int(oidp, arg1, arg2, req);
153         if (error == 0) {
154                 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
155                 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
156                 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
157                 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
158                 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
159                 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
160         }
161         return (error);
162 }
163
164 #undef RANGECHK
165
166 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0,
167     "IP Ports");
168
169 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
170         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
171         &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", "");
172 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
173         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
174         &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", "");
175 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
176         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
177         &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", "");
178 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
179         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
180         &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", "");
181 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
182         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
183         &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", "");
184 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
185         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
186         &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", "");
187 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
188         CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
189         &VNET_NAME(ipport_reservedhigh), 0, "");
190 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
191         CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
192 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
193         CTLFLAG_VNET | CTLFLAG_RW,
194         &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
195 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
196         CTLFLAG_VNET | CTLFLAG_RW,
197         &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
198         "allocations before switching to a sequental one");
199 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
200         CTLFLAG_VNET | CTLFLAG_RW,
201         &VNET_NAME(ipport_randomtime), 0,
202         "Minimum time to keep sequental port "
203         "allocation before switching to a random one");
204 #endif /* INET */
205
206 /*
207  * in_pcb.c: manage the Protocol Control Blocks.
208  *
209  * NOTE: It is assumed that most of these functions will be called with
210  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
211  * functions often modify hash chains or addresses in pcbs.
212  */
213
214 /*
215  * Initialize an inpcbinfo -- we should be able to reduce the number of
216  * arguments in time.
217  */
218 void
219 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
220     struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
221     char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini,
222     uint32_t inpcbzone_flags, u_int hashfields)
223 {
224
225         INP_INFO_LOCK_INIT(pcbinfo, name);
226         INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash");     /* XXXRW: argument? */
227         INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
228 #ifdef VIMAGE
229         pcbinfo->ipi_vnet = curvnet;
230 #endif
231         pcbinfo->ipi_listhead = listhead;
232         LIST_INIT(pcbinfo->ipi_listhead);
233         pcbinfo->ipi_count = 0;
234         pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
235             &pcbinfo->ipi_hashmask);
236         pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
237             &pcbinfo->ipi_porthashmask);
238 #ifdef PCBGROUP
239         in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
240 #endif
241         pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
242             NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR,
243             inpcbzone_flags);
244         uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
245         uma_zone_set_warning(pcbinfo->ipi_zone,
246             "kern.ipc.maxsockets limit reached");
247 }
248
249 /*
250  * Destroy an inpcbinfo.
251  */
252 void
253 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
254 {
255
256         KASSERT(pcbinfo->ipi_count == 0,
257             ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
258
259         hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
260         hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
261             pcbinfo->ipi_porthashmask);
262 #ifdef PCBGROUP
263         in_pcbgroup_destroy(pcbinfo);
264 #endif
265         uma_zdestroy(pcbinfo->ipi_zone);
266         INP_LIST_LOCK_DESTROY(pcbinfo);
267         INP_HASH_LOCK_DESTROY(pcbinfo);
268         INP_INFO_LOCK_DESTROY(pcbinfo);
269 }
270
271 /*
272  * Allocate a PCB and associate it with the socket.
273  * On success return with the PCB locked.
274  */
275 int
276 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
277 {
278         struct inpcb *inp;
279         int error;
280
281 #ifdef INVARIANTS
282         if (pcbinfo == &V_tcbinfo) {
283                 INP_INFO_RLOCK_ASSERT(pcbinfo);
284         } else {
285                 INP_INFO_WLOCK_ASSERT(pcbinfo);
286         }
287 #endif
288
289         error = 0;
290         inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
291         if (inp == NULL)
292                 return (ENOBUFS);
293         bzero(inp, inp_zero_size);
294         inp->inp_pcbinfo = pcbinfo;
295         inp->inp_socket = so;
296         inp->inp_cred = crhold(so->so_cred);
297         inp->inp_inc.inc_fibnum = so->so_fibnum;
298 #ifdef MAC
299         error = mac_inpcb_init(inp, M_NOWAIT);
300         if (error != 0)
301                 goto out;
302         mac_inpcb_create(so, inp);
303 #endif
304 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
305         error = ipsec_init_pcbpolicy(inp);
306         if (error != 0) {
307 #ifdef MAC
308                 mac_inpcb_destroy(inp);
309 #endif
310                 goto out;
311         }
312 #endif /*IPSEC*/
313 #ifdef INET6
314         if (INP_SOCKAF(so) == AF_INET6) {
315                 inp->inp_vflag |= INP_IPV6PROTO;
316                 if (V_ip6_v6only)
317                         inp->inp_flags |= IN6P_IPV6_V6ONLY;
318         }
319 #endif
320         INP_WLOCK(inp);
321         INP_LIST_WLOCK(pcbinfo);
322         LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
323         pcbinfo->ipi_count++;
324         so->so_pcb = (caddr_t)inp;
325 #ifdef INET6
326         if (V_ip6_auto_flowlabel)
327                 inp->inp_flags |= IN6P_AUTOFLOWLABEL;
328 #endif
329         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
330         refcount_init(&inp->inp_refcount, 1);   /* Reference from inpcbinfo */
331
332         /*
333          * Routes in inpcb's can cache L2 as well; they are guaranteed
334          * to be cleaned up.
335          */
336         inp->inp_route.ro_flags = RT_LLE_CACHE;
337         INP_LIST_WUNLOCK(pcbinfo);
338 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
339 out:
340         if (error != 0) {
341                 crfree(inp->inp_cred);
342                 uma_zfree(pcbinfo->ipi_zone, inp);
343         }
344 #endif
345         return (error);
346 }
347
#ifdef INET
/*
 * Bind an as yet unbound IPv4 PCB to the address/port in 'nam' (or to
 * an automatically chosen one when nam is NULL or carries port 0) and
 * insert it into the hash tables.
 *
 * Requires the inpcb write lock and the pcbinfo hash write lock.
 * Returns EINVAL if the PCB already has a local address or port, the
 * error from in_pcbbind_setup(), EAGAIN if the hash insertion fails
 * (the local binding is reset in that case), or 0 on success.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
{
        int anonport, error;

        INP_WLOCK_ASSERT(inp);
        INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

        if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
                return (EINVAL);

        /* Record whether the caller asked for a specific port. */
        if (nam == NULL)
                anonport = 1;
        else
                anonport = (((struct sockaddr_in *)nam)->sin_port == 0);

        error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
            &inp->inp_lport, cred);
        if (error != 0)
                return (error);

        if (in_pcbinshash(inp) != 0) {
                /* Roll the binding back to the unbound state. */
                inp->inp_laddr.s_addr = INADDR_ANY;
                inp->inp_lport = 0;
                return (EAGAIN);
        }
        if (anonport != 0)
                inp->inp_flags |= INP_ANONPORT;
        return (0);
}
#endif
374
375 /*
376  * Select a local port (number) to use.
377  */
378 #if defined(INET) || defined(INET6)
379 int
380 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
381     struct ucred *cred, int lookupflags)
382 {
383         struct inpcbinfo *pcbinfo;
384         struct inpcb *tmpinp;
385         unsigned short *lastport;
386         int count, dorandom, error;
387         u_short aux, first, last, lport;
388 #ifdef INET
389         struct in_addr laddr;
390 #endif
391
392         pcbinfo = inp->inp_pcbinfo;
393
394         /*
395          * Because no actual state changes occur here, a global write lock on
396          * the pcbinfo isn't required.
397          */
398         INP_LOCK_ASSERT(inp);
399         INP_HASH_LOCK_ASSERT(pcbinfo);
400
401         if (inp->inp_flags & INP_HIGHPORT) {
402                 first = V_ipport_hifirstauto;   /* sysctl */
403                 last  = V_ipport_hilastauto;
404                 lastport = &pcbinfo->ipi_lasthi;
405         } else if (inp->inp_flags & INP_LOWPORT) {
406                 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
407                 if (error)
408                         return (error);
409                 first = V_ipport_lowfirstauto;  /* 1023 */
410                 last  = V_ipport_lowlastauto;   /* 600 */
411                 lastport = &pcbinfo->ipi_lastlow;
412         } else {
413                 first = V_ipport_firstauto;     /* sysctl */
414                 last  = V_ipport_lastauto;
415                 lastport = &pcbinfo->ipi_lastport;
416         }
417         /*
418          * For UDP(-Lite), use random port allocation as long as the user
419          * allows it.  For TCP (and as of yet unknown) connections,
420          * use random port allocation only if the user allows it AND
421          * ipport_tick() allows it.
422          */
423         if (V_ipport_randomized &&
424                 (!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
425                 pcbinfo == &V_ulitecbinfo))
426                 dorandom = 1;
427         else
428                 dorandom = 0;
429         /*
430          * It makes no sense to do random port allocation if
431          * we have the only port available.
432          */
433         if (first == last)
434                 dorandom = 0;
435         /* Make sure to not include UDP(-Lite) packets in the count. */
436         if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo)
437                 V_ipport_tcpallocs++;
438         /*
439          * Instead of having two loops further down counting up or down
440          * make sure that first is always <= last and go with only one
441          * code path implementing all logic.
442          */
443         if (first > last) {
444                 aux = first;
445                 first = last;
446                 last = aux;
447         }
448
449 #ifdef INET
450         /* Make the compiler happy. */
451         laddr.s_addr = 0;
452         if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
453                 KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p",
454                     __func__, inp));
455                 laddr = *laddrp;
456         }
457 #endif
458         tmpinp = NULL;  /* Make compiler happy. */
459         lport = *lportp;
460
461         if (dorandom)
462                 *lastport = first + (arc4random() % (last - first));
463
464         count = last - first;
465
466         do {
467                 if (count-- < 0)        /* completely used? */
468                         return (EADDRNOTAVAIL);
469                 ++*lastport;
470                 if (*lastport < first || *lastport > last)
471                         *lastport = first;
472                 lport = htons(*lastport);
473
474 #ifdef INET6
475                 if ((inp->inp_vflag & INP_IPV6) != 0)
476                         tmpinp = in6_pcblookup_local(pcbinfo,
477                             &inp->in6p_laddr, lport, lookupflags, cred);
478 #endif
479 #if defined(INET) && defined(INET6)
480                 else
481 #endif
482 #ifdef INET
483                         tmpinp = in_pcblookup_local(pcbinfo, laddr,
484                             lport, lookupflags, cred);
485 #endif
486         } while (tmpinp != NULL);
487
488 #ifdef INET
489         if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4)
490                 laddrp->s_addr = laddr.s_addr;
491 #endif
492         *lportp = lport;
493
494         return (0);
495 }
496
497 /*
498  * Return cached socket options.
499  */
500 short
501 inp_so_options(const struct inpcb *inp)
502 {
503    short so_options;
504
505    so_options = 0;
506
507    if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
508            so_options |= SO_REUSEPORT;
509    if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
510            so_options |= SO_REUSEADDR;
511    return (so_options);
512 }
513 #endif /* INET || INET6 */
514
515 /*
516  * Check if a new BINDMULTI socket is allowed to be created.
517  *
518  * ni points to the new inp.
519  * oi points to the exisitng inp.
520  *
521  * This checks whether the existing inp also has BINDMULTI and
522  * whether the credentials match.
523  */
524 int
525 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
526 {
527         /* Check permissions match */
528         if ((ni->inp_flags2 & INP_BINDMULTI) &&
529             (ni->inp_cred->cr_uid !=
530             oi->inp_cred->cr_uid))
531                 return (0);
532
533         /* Check the existing inp has BINDMULTI set */
534         if ((ni->inp_flags2 & INP_BINDMULTI) &&
535             ((oi->inp_flags2 & INP_BINDMULTI) == 0))
536                 return (0);
537
538         /*
539          * We're okay - either INP_BINDMULTI isn't set on ni, or
540          * it is and it matches the checks.
541          */
542         return (1);
543 }
544
545 #ifdef INET
546 /*
547  * Set up a bind operation on a PCB, performing port allocation
548  * as required, but do not actually modify the PCB. Callers can
549  * either complete the bind by setting inp_laddr/inp_lport and
550  * calling in_pcbinshash(), or they can just use the resulting
551  * port and address to authorise the sending of a once-off packet.
552  *
553  * On error, the values of *laddrp and *lportp are not changed.
554  */
555 int
556 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
557     u_short *lportp, struct ucred *cred)
558 {
559         struct socket *so = inp->inp_socket;
560         struct sockaddr_in *sin;
561         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
562         struct in_addr laddr;
563         u_short lport = 0;
564         int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
565         int error;
566
567         /*
568          * No state changes, so read locks are sufficient here.
569          */
570         INP_LOCK_ASSERT(inp);
571         INP_HASH_LOCK_ASSERT(pcbinfo);
572
573         if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
574                 return (EADDRNOTAVAIL);
575         laddr.s_addr = *laddrp;
576         if (nam != NULL && laddr.s_addr != INADDR_ANY)
577                 return (EINVAL);
578         if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
579                 lookupflags = INPLOOKUP_WILDCARD;
580         if (nam == NULL) {
581                 if ((error = prison_local_ip4(cred, &laddr)) != 0)
582                         return (error);
583         } else {
584                 sin = (struct sockaddr_in *)nam;
585                 if (nam->sa_len != sizeof (*sin))
586                         return (EINVAL);
587 #ifdef notdef
588                 /*
589                  * We should check the family, but old programs
590                  * incorrectly fail to initialize it.
591                  */
592                 if (sin->sin_family != AF_INET)
593                         return (EAFNOSUPPORT);
594 #endif
595                 error = prison_local_ip4(cred, &sin->sin_addr);
596                 if (error)
597                         return (error);
598                 if (sin->sin_port != *lportp) {
599                         /* Don't allow the port to change. */
600                         if (*lportp != 0)
601                                 return (EINVAL);
602                         lport = sin->sin_port;
603                 }
604                 /* NB: lport is left as 0 if the port isn't being changed. */
605                 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
606                         /*
607                          * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
608                          * allow complete duplication of binding if
609                          * SO_REUSEPORT is set, or if SO_REUSEADDR is set
610                          * and a multicast address is bound on both
611                          * new and duplicated sockets.
612                          */
613                         if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
614                                 reuseport = SO_REUSEADDR|SO_REUSEPORT;
615                 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
616                         sin->sin_port = 0;              /* yech... */
617                         bzero(&sin->sin_zero, sizeof(sin->sin_zero));
618                         /*
619                          * Is the address a local IP address? 
620                          * If INP_BINDANY is set, then the socket may be bound
621                          * to any endpoint address, local or not.
622                          */
623                         if ((inp->inp_flags & INP_BINDANY) == 0 &&
624                             ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) 
625                                 return (EADDRNOTAVAIL);
626                 }
627                 laddr = sin->sin_addr;
628                 if (lport) {
629                         struct inpcb *t;
630                         struct tcptw *tw;
631
632                         /* GROSS */
633                         if (ntohs(lport) <= V_ipport_reservedhigh &&
634                             ntohs(lport) >= V_ipport_reservedlow &&
635                             priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
636                             0))
637                                 return (EACCES);
638                         if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
639                             priv_check_cred(inp->inp_cred,
640                             PRIV_NETINET_REUSEPORT, 0) != 0) {
641                                 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
642                                     lport, INPLOOKUP_WILDCARD, cred);
643         /*
644          * XXX
645          * This entire block sorely needs a rewrite.
646          */
647                                 if (t &&
648                                     ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
649                                     ((t->inp_flags & INP_TIMEWAIT) == 0) &&
650                                     (so->so_type != SOCK_STREAM ||
651                                      ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
652                                     (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
653                                      ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
654                                      (t->inp_flags2 & INP_REUSEPORT) == 0) &&
655                                     (inp->inp_cred->cr_uid !=
656                                      t->inp_cred->cr_uid))
657                                         return (EADDRINUSE);
658
659                                 /*
660                                  * If the socket is a BINDMULTI socket, then
661                                  * the credentials need to match and the
662                                  * original socket also has to have been bound
663                                  * with BINDMULTI.
664                                  */
665                                 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
666                                         return (EADDRINUSE);
667                         }
668                         t = in_pcblookup_local(pcbinfo, sin->sin_addr,
669                             lport, lookupflags, cred);
670                         if (t && (t->inp_flags & INP_TIMEWAIT)) {
671                                 /*
672                                  * XXXRW: If an incpb has had its timewait
673                                  * state recycled, we treat the address as
674                                  * being in use (for now).  This is better
675                                  * than a panic, but not desirable.
676                                  */
677                                 tw = intotw(t);
678                                 if (tw == NULL ||
679                                     (reuseport & tw->tw_so_options) == 0)
680                                         return (EADDRINUSE);
681                         } else if (t &&
682                             ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
683                             (reuseport & inp_so_options(t)) == 0) {
684 #ifdef INET6
685                                 if (ntohl(sin->sin_addr.s_addr) !=
686                                     INADDR_ANY ||
687                                     ntohl(t->inp_laddr.s_addr) !=
688                                     INADDR_ANY ||
689                                     (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
690                                     (t->inp_vflag & INP_IPV6PROTO) == 0)
691 #endif
692                                 return (EADDRINUSE);
693                                 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
694                                         return (EADDRINUSE);
695                         }
696                 }
697         }
698         if (*lportp != 0)
699                 lport = *lportp;
700         if (lport == 0) {
701                 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
702                 if (error != 0)
703                         return (error);
704
705         }
706         *laddrp = laddr.s_addr;
707         *lportp = lport;
708         return (0);
709 }
710
711 /*
712  * Connect from a socket to a specified address.
713  * Both address and port must be specified in argument sin.
714  * If don't have a local address for this socket yet,
715  * then pick one.
716  */
717 int
718 in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
719     struct ucred *cred, struct mbuf *m)
720 {
721         u_short lport, fport;
722         in_addr_t laddr, faddr;
723         int anonport, error;
724
725         INP_WLOCK_ASSERT(inp);
726         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
727
728         lport = inp->inp_lport;
729         laddr = inp->inp_laddr.s_addr;
730         anonport = (lport == 0);
731         error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
732             NULL, cred);
733         if (error)
734                 return (error);
735
736         /* Do the initial binding of the local address if required. */
737         if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
738                 inp->inp_lport = lport;
739                 inp->inp_laddr.s_addr = laddr;
740                 if (in_pcbinshash(inp) != 0) {
741                         inp->inp_laddr.s_addr = INADDR_ANY;
742                         inp->inp_lport = 0;
743                         return (EAGAIN);
744                 }
745         }
746
747         /* Commit the remaining changes. */
748         inp->inp_lport = lport;
749         inp->inp_laddr.s_addr = laddr;
750         inp->inp_faddr.s_addr = faddr;
751         inp->inp_fport = fport;
752         in_pcbrehash_mbuf(inp, m);
753
754         if (anonport)
755                 inp->inp_flags |= INP_ANONPORT;
756         return (0);
757 }
758
759 int
760 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
761 {
762
763         return (in_pcbconnect_mbuf(inp, nam, cred, NULL));
764 }
765
766 /*
767  * Do proper source address selection on an unbound socket in case
768  * of connect. Take jails into account as well.
769  */
770 int
771 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
772     struct ucred *cred)
773 {
774         struct ifaddr *ifa;
775         struct sockaddr *sa;
776         struct sockaddr_in *sin;
777         struct route sro;
778         int error;
779
780         KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
781
782         /*
783          * Bypass source address selection and use the primary jail IP
784          * if requested.
785          */
786         if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
787                 return (0);
788
789         error = 0;
790         bzero(&sro, sizeof(sro));
791
792         sin = (struct sockaddr_in *)&sro.ro_dst;
793         sin->sin_family = AF_INET;
794         sin->sin_len = sizeof(struct sockaddr_in);
795         sin->sin_addr.s_addr = faddr->s_addr;
796
797         /*
798          * If route is known our src addr is taken from the i/f,
799          * else punt.
800          *
801          * Find out route to destination.
802          */
803         if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
804                 in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);
805
806         /*
807          * If we found a route, use the address corresponding to
808          * the outgoing interface.
809          * 
810          * Otherwise assume faddr is reachable on a directly connected
811          * network and try to find a corresponding interface to take
812          * the source address from.
813          */
814         if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
815                 struct in_ifaddr *ia;
816                 struct ifnet *ifp;
817
818                 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
819                                         inp->inp_socket->so_fibnum));
820                 if (ia == NULL)
821                         ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
822                                                 inp->inp_socket->so_fibnum));
823                 if (ia == NULL) {
824                         error = ENETUNREACH;
825                         goto done;
826                 }
827
828                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
829                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
830                         ifa_free(&ia->ia_ifa);
831                         goto done;
832                 }
833
834                 ifp = ia->ia_ifp;
835                 ifa_free(&ia->ia_ifa);
836                 ia = NULL;
837                 IF_ADDR_RLOCK(ifp);
838                 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
839
840                         sa = ifa->ifa_addr;
841                         if (sa->sa_family != AF_INET)
842                                 continue;
843                         sin = (struct sockaddr_in *)sa;
844                         if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
845                                 ia = (struct in_ifaddr *)ifa;
846                                 break;
847                         }
848                 }
849                 if (ia != NULL) {
850                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
851                         IF_ADDR_RUNLOCK(ifp);
852                         goto done;
853                 }
854                 IF_ADDR_RUNLOCK(ifp);
855
856                 /* 3. As a last resort return the 'default' jail address. */
857                 error = prison_get_ip4(cred, laddr);
858                 goto done;
859         }
860
861         /*
862          * If the outgoing interface on the route found is not
863          * a loopback interface, use the address from that interface.
864          * In case of jails do those three steps:
865          * 1. check if the interface address belongs to the jail. If so use it.
866          * 2. check if we have any address on the outgoing interface
867          *    belonging to this jail. If so use it.
868          * 3. as a last resort return the 'default' jail address.
869          */
870         if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
871                 struct in_ifaddr *ia;
872                 struct ifnet *ifp;
873
874                 /* If not jailed, use the default returned. */
875                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
876                         ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
877                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
878                         goto done;
879                 }
880
881                 /* Jailed. */
882                 /* 1. Check if the iface address belongs to the jail. */
883                 sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
884                 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
885                         ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
886                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
887                         goto done;
888                 }
889
890                 /*
891                  * 2. Check if we have any address on the outgoing interface
892                  *    belonging to this jail.
893                  */
894                 ia = NULL;
895                 ifp = sro.ro_rt->rt_ifp;
896                 IF_ADDR_RLOCK(ifp);
897                 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
898                         sa = ifa->ifa_addr;
899                         if (sa->sa_family != AF_INET)
900                                 continue;
901                         sin = (struct sockaddr_in *)sa;
902                         if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
903                                 ia = (struct in_ifaddr *)ifa;
904                                 break;
905                         }
906                 }
907                 if (ia != NULL) {
908                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
909                         IF_ADDR_RUNLOCK(ifp);
910                         goto done;
911                 }
912                 IF_ADDR_RUNLOCK(ifp);
913
914                 /* 3. As a last resort return the 'default' jail address. */
915                 error = prison_get_ip4(cred, laddr);
916                 goto done;
917         }
918
919         /*
920          * The outgoing interface is marked with 'loopback net', so a route
921          * to ourselves is here.
922          * Try to find the interface of the destination address and then
923          * take the address from there. That interface is not necessarily
924          * a loopback interface.
925          * In case of jails, check that it is an address of the jail
926          * and if we cannot find, fall back to the 'default' jail address.
927          */
928         if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
929                 struct sockaddr_in sain;
930                 struct in_ifaddr *ia;
931
932                 bzero(&sain, sizeof(struct sockaddr_in));
933                 sain.sin_family = AF_INET;
934                 sain.sin_len = sizeof(struct sockaddr_in);
935                 sain.sin_addr.s_addr = faddr->s_addr;
936
937                 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain),
938                                         inp->inp_socket->so_fibnum));
939                 if (ia == NULL)
940                         ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0,
941                                                 inp->inp_socket->so_fibnum));
942                 if (ia == NULL)
943                         ia = ifatoia(ifa_ifwithaddr(sintosa(&sain)));
944
945                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
946                         if (ia == NULL) {
947                                 error = ENETUNREACH;
948                                 goto done;
949                         }
950                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
951                         ifa_free(&ia->ia_ifa);
952                         goto done;
953                 }
954
955                 /* Jailed. */
956                 if (ia != NULL) {
957                         struct ifnet *ifp;
958
959                         ifp = ia->ia_ifp;
960                         ifa_free(&ia->ia_ifa);
961                         ia = NULL;
962                         IF_ADDR_RLOCK(ifp);
963                         TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
964
965                                 sa = ifa->ifa_addr;
966                                 if (sa->sa_family != AF_INET)
967                                         continue;
968                                 sin = (struct sockaddr_in *)sa;
969                                 if (prison_check_ip4(cred,
970                                     &sin->sin_addr) == 0) {
971                                         ia = (struct in_ifaddr *)ifa;
972                                         break;
973                                 }
974                         }
975                         if (ia != NULL) {
976                                 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
977                                 IF_ADDR_RUNLOCK(ifp);
978                                 goto done;
979                         }
980                         IF_ADDR_RUNLOCK(ifp);
981                 }
982
983                 /* 3. As a last resort return the 'default' jail address. */
984                 error = prison_get_ip4(cred, laddr);
985                 goto done;
986         }
987
988 done:
989         if (sro.ro_rt != NULL)
990                 RTFREE(sro.ro_rt);
991         return (error);
992 }
993
994 /*
995  * Set up for a connect from a socket to the specified address.
996  * On entry, *laddrp and *lportp should contain the current local
997  * address and port for the PCB; these are updated to the values
998  * that should be placed in inp_laddr and inp_lport to complete
999  * the connect.
1000  *
1001  * On success, *faddrp and *fportp will be set to the remote address
1002  * and port. These are not updated in the error case.
1003  *
1004  * If the operation fails because the connection already exists,
1005  * *oinpp will be set to the PCB of that connection so that the
1006  * caller can decide to override it. In all other cases, *oinpp
1007  * is set to NULL.
1008  */
1009 int
1010 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
1011     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
1012     struct inpcb **oinpp, struct ucred *cred)
1013 {
1014         struct rm_priotracker in_ifa_tracker;
1015         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
1016         struct in_ifaddr *ia;
1017         struct inpcb *oinp;
1018         struct in_addr laddr, faddr;
1019         u_short lport, fport;
1020         int error;
1021
1022         /*
1023          * Because a global state change doesn't actually occur here, a read
1024          * lock is sufficient.
1025          */
1026         INP_LOCK_ASSERT(inp);
1027         INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
1028
1029         if (oinpp != NULL)
1030                 *oinpp = NULL;
1031         if (nam->sa_len != sizeof (*sin))
1032                 return (EINVAL);
1033         if (sin->sin_family != AF_INET)
1034                 return (EAFNOSUPPORT);
1035         if (sin->sin_port == 0)
1036                 return (EADDRNOTAVAIL);
1037         laddr.s_addr = *laddrp;
1038         lport = *lportp;
1039         faddr = sin->sin_addr;
1040         fport = sin->sin_port;
1041
1042         if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
1043                 /*
1044                  * If the destination address is INADDR_ANY,
1045                  * use the primary local address.
1046                  * If the supplied address is INADDR_BROADCAST,
1047                  * and the primary interface supports broadcast,
1048                  * choose the broadcast address for that interface.
1049                  */
1050                 if (faddr.s_addr == INADDR_ANY) {
1051                         IN_IFADDR_RLOCK(&in_ifa_tracker);
1052                         faddr =
1053                             IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
1054                         IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1055                         if (cred != NULL &&
1056                             (error = prison_get_ip4(cred, &faddr)) != 0)
1057                                 return (error);
1058                 } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
1059                         IN_IFADDR_RLOCK(&in_ifa_tracker);
1060                         if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
1061                             IFF_BROADCAST)
1062                                 faddr = satosin(&TAILQ_FIRST(
1063                                     &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
1064                         IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1065                 }
1066         }
1067         if (laddr.s_addr == INADDR_ANY) {
1068                 error = in_pcbladdr(inp, &faddr, &laddr, cred);
1069                 /*
1070                  * If the destination address is multicast and an outgoing
1071                  * interface has been set as a multicast option, prefer the
1072                  * address of that interface as our source address.
1073                  */
1074                 if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
1075                     inp->inp_moptions != NULL) {
1076                         struct ip_moptions *imo;
1077                         struct ifnet *ifp;
1078
1079                         imo = inp->inp_moptions;
1080                         if (imo->imo_multicast_ifp != NULL) {
1081                                 ifp = imo->imo_multicast_ifp;
1082                                 IN_IFADDR_RLOCK(&in_ifa_tracker);
1083                                 TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
1084                                         if ((ia->ia_ifp == ifp) &&
1085                                             (cred == NULL ||
1086                                             prison_check_ip4(cred,
1087                                             &ia->ia_addr.sin_addr) == 0))
1088                                                 break;
1089                                 }
1090                                 if (ia == NULL)
1091                                         error = EADDRNOTAVAIL;
1092                                 else {
1093                                         laddr = ia->ia_addr.sin_addr;
1094                                         error = 0;
1095                                 }
1096                                 IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1097                         }
1098                 }
1099                 if (error)
1100                         return (error);
1101         }
1102         oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport,
1103             laddr, lport, 0, NULL);
1104         if (oinp != NULL) {
1105                 if (oinpp != NULL)
1106                         *oinpp = oinp;
1107                 return (EADDRINUSE);
1108         }
1109         if (lport == 0) {
1110                 error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
1111                     cred);
1112                 if (error)
1113                         return (error);
1114         }
1115         *laddrp = laddr.s_addr;
1116         *lportp = lport;
1117         *faddrp = faddr.s_addr;
1118         *fportp = fport;
1119         return (0);
1120 }
1121
1122 void
1123 in_pcbdisconnect(struct inpcb *inp)
1124 {
1125
1126         INP_WLOCK_ASSERT(inp);
1127         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
1128
1129         inp->inp_faddr.s_addr = INADDR_ANY;
1130         inp->inp_fport = 0;
1131         in_pcbrehash(inp);
1132 }
1133 #endif /* INET */
1134
1135 /*
1136  * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
1137  * For most protocols, this will be invoked immediately prior to calling
1138  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
1139  * socket, in which case in_pcbfree() is deferred.
1140  */
1141 void
1142 in_pcbdetach(struct inpcb *inp)
1143 {
1144
1145         KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
1146
1147 #ifdef RATELIMIT
1148         if (inp->inp_snd_tag != NULL)
1149                 in_pcbdetach_txrtlmt(inp);
1150 #endif
1151         inp->inp_socket->so_pcb = NULL;
1152         inp->inp_socket = NULL;
1153 }
1154
1155 /*
1156  * in_pcbref() bumps the reference count on an inpcb in order to maintain
1157  * stability of an inpcb pointer despite the inpcb lock being released.  This
1158  * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
1159  * but where the inpcb lock may already held, or when acquiring a reference
1160  * via a pcbgroup.
1161  *
1162  * in_pcbref() should be used only to provide brief memory stability, and
1163  * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
1164  * garbage collect the inpcb if it has been in_pcbfree()'d from another
1165  * context.  Until in_pcbrele() has returned that the inpcb is still valid,
1166  * lock and rele are the *only* safe operations that may be performed on the
1167  * inpcb.
1168  *
1169  * While the inpcb will not be freed, releasing the inpcb lock means that the
1170  * connection's state may change, so the caller should be careful to
1171  * revalidate any cached state on reacquiring the lock.  Drop the reference
1172  * using in_pcbrele().
1173  */
1174 void
1175 in_pcbref(struct inpcb *inp)
1176 {
1177
1178         KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1179
1180         refcount_acquire(&inp->inp_refcount);
1181 }
1182
1183 /*
1184  * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
1185  * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
1186  * return a flag indicating whether or not the inpcb remains valid.  If it is
1187  * valid, we return with the inpcb lock held.
1188  *
1189  * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
1190  * reference on an inpcb.  Historically more work was done here (actually, in
1191  * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
1192  * need for the pcbinfo lock in in_pcbrele().  Deferring the free is entirely
1193  * about memory stability (and continued use of the write lock).
1194  */
1195 int
1196 in_pcbrele_rlocked(struct inpcb *inp)
1197 {
1198         struct inpcbinfo *pcbinfo;
1199
1200         KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1201
1202         INP_RLOCK_ASSERT(inp);
1203
1204         if (refcount_release(&inp->inp_refcount) == 0) {
1205                 /*
1206                  * If the inpcb has been freed, let the caller know, even if
1207                  * this isn't the last reference.
1208                  */
1209                 if (inp->inp_flags2 & INP_FREED) {
1210                         INP_RUNLOCK(inp);
1211                         return (1);
1212                 }
1213                 return (0);
1214         }
1215
1216         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1217
1218         INP_RUNLOCK(inp);
1219         pcbinfo = inp->inp_pcbinfo;
1220         uma_zfree(pcbinfo->ipi_zone, inp);
1221         return (1);
1222 }
1223
1224 int
1225 in_pcbrele_wlocked(struct inpcb *inp)
1226 {
1227         struct inpcbinfo *pcbinfo;
1228
1229         KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1230
1231         INP_WLOCK_ASSERT(inp);
1232
1233         if (refcount_release(&inp->inp_refcount) == 0) {
1234                 /*
1235                  * If the inpcb has been freed, let the caller know, even if
1236                  * this isn't the last reference.
1237                  */
1238                 if (inp->inp_flags2 & INP_FREED) {
1239                         INP_WUNLOCK(inp);
1240                         return (1);
1241                 }
1242                 return (0);
1243         }
1244
1245         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1246
1247         INP_WUNLOCK(inp);
1248         pcbinfo = inp->inp_pcbinfo;
1249         uma_zfree(pcbinfo->ipi_zone, inp);
1250         return (1);
1251 }
1252
/*
 * Temporary wrapper: forwards to in_pcbrele_wlocked() and returns its
 * result unchanged.
 */
int
in_pcbrele(struct inpcb *inp)
{
	int freed;

	freed = in_pcbrele_wlocked(inp);
	return (freed);
}
1262
1263 /*
1264  * Unconditionally schedule an inpcb to be freed by decrementing its
1265  * reference count, which should occur only after the inpcb has been detached
1266  * from its socket.  If another thread holds a temporary reference (acquired
1267  * using in_pcbref()) then the free is deferred until that reference is
1268  * released using in_pcbrele(), but the inpcb is still unlocked.  Almost all
1269  * work, including removal from global lists, is done in this context, where
1270  * the pcbinfo lock is held.
1271  */
1272 void
1273 in_pcbfree(struct inpcb *inp)
1274 {
1275         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1276
1277         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1278
1279 #ifdef INVARIANTS
1280         if (pcbinfo == &V_tcbinfo) {
1281                 INP_INFO_LOCK_ASSERT(pcbinfo);
1282         } else {
1283                 INP_INFO_WLOCK_ASSERT(pcbinfo);
1284         }
1285 #endif
1286         INP_WLOCK_ASSERT(inp);
1287
1288         /* XXXRW: Do as much as possible here. */
1289 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1290         if (inp->inp_sp != NULL)
1291                 ipsec_delete_pcbpolicy(inp);
1292 #endif
1293         INP_LIST_WLOCK(pcbinfo);
1294         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
1295         in_pcbremlists(inp);
1296         INP_LIST_WUNLOCK(pcbinfo);
1297 #ifdef INET6
1298         if (inp->inp_vflag & INP_IPV6PROTO) {
1299                 ip6_freepcbopts(inp->in6p_outputopts);
1300                 if (inp->in6p_moptions != NULL)
1301                         ip6_freemoptions(inp->in6p_moptions);
1302         }
1303 #endif
1304         if (inp->inp_options)
1305                 (void)m_free(inp->inp_options);
1306 #ifdef INET
1307         if (inp->inp_moptions != NULL)
1308                 inp_freemoptions(inp->inp_moptions);
1309 #endif
1310         RO_RTFREE(&inp->inp_route);
1311         if (inp->inp_route.ro_lle)
1312                 LLE_FREE(inp->inp_route.ro_lle);        /* zeros ro_lle */
1313
1314         inp->inp_vflag = 0;
1315         inp->inp_flags2 |= INP_FREED;
1316         crfree(inp->inp_cred);
1317 #ifdef MAC
1318         mac_inpcb_destroy(inp);
1319 #endif
1320         if (!in_pcbrele_wlocked(inp))
1321                 INP_WUNLOCK(inp);
1322 }
1323
1324 /*
1325  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
1326  * port reservation, and preventing it from being returned by inpcb lookups.
1327  *
1328  * It is used by TCP to mark an inpcb as unused and avoid future packet
1329  * delivery or event notification when a socket remains open but TCP has
1330  * closed.  This might occur as a result of a shutdown()-initiated TCP close
1331  * or a RST on the wire, and allows the port binding to be reused while still
1332  * maintaining the invariant that so_pcb always points to a valid inpcb until
1333  * in_pcbdetach().
1334  *
1335  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
1336  * in_pcbnotifyall() and in_pcbpurgeif0()?
1337  */
1338 void
1339 in_pcbdrop(struct inpcb *inp)
1340 {
1341
1342         INP_WLOCK_ASSERT(inp);
1343
1344         /*
1345          * XXXRW: Possibly we should protect the setting of INP_DROPPED with
1346          * the hash lock...?
1347          */
1348         inp->inp_flags |= INP_DROPPED;
1349         if (inp->inp_flags & INP_INHASHLIST) {
1350                 struct inpcbport *phd = inp->inp_phd;
1351
1352                 INP_HASH_WLOCK(inp->inp_pcbinfo);
1353                 LIST_REMOVE(inp, inp_hash);
1354                 LIST_REMOVE(inp, inp_portlist);
1355                 if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1356                         LIST_REMOVE(phd, phd_hash);
1357                         free(phd, M_PCB);
1358                 }
1359                 INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1360                 inp->inp_flags &= ~INP_INHASHLIST;
1361 #ifdef PCBGROUP
1362                 in_pcbgroup_remove(inp);
1363 #endif
1364         }
1365 }
1366
1367 #ifdef INET
1368 /*
1369  * Common routines to return the socket addresses associated with inpcbs.
1370  */
1371 struct sockaddr *
1372 in_sockaddr(in_port_t port, struct in_addr *addr_p)
1373 {
1374         struct sockaddr_in *sin;
1375
1376         sin = malloc(sizeof *sin, M_SONAME,
1377                 M_WAITOK | M_ZERO);
1378         sin->sin_family = AF_INET;
1379         sin->sin_len = sizeof(*sin);
1380         sin->sin_addr = *addr_p;
1381         sin->sin_port = port;
1382
1383         return (struct sockaddr *)sin;
1384 }
1385
1386 int
1387 in_getsockaddr(struct socket *so, struct sockaddr **nam)
1388 {
1389         struct inpcb *inp;
1390         struct in_addr addr;
1391         in_port_t port;
1392
1393         inp = sotoinpcb(so);
1394         KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1395
1396         INP_RLOCK(inp);
1397         port = inp->inp_lport;
1398         addr = inp->inp_laddr;
1399         INP_RUNLOCK(inp);
1400
1401         *nam = in_sockaddr(port, &addr);
1402         return 0;
1403 }
1404
1405 int
1406 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
1407 {
1408         struct inpcb *inp;
1409         struct in_addr addr;
1410         in_port_t port;
1411
1412         inp = sotoinpcb(so);
1413         KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1414
1415         INP_RLOCK(inp);
1416         port = inp->inp_fport;
1417         addr = inp->inp_faddr;
1418         INP_RUNLOCK(inp);
1419
1420         *nam = in_sockaddr(port, &addr);
1421         return 0;
1422 }
1423
1424 void
1425 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
1426     struct inpcb *(*notify)(struct inpcb *, int))
1427 {
1428         struct inpcb *inp, *inp_temp;
1429
1430         INP_INFO_WLOCK(pcbinfo);
1431         LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
1432                 INP_WLOCK(inp);
1433 #ifdef INET6
1434                 if ((inp->inp_vflag & INP_IPV4) == 0) {
1435                         INP_WUNLOCK(inp);
1436                         continue;
1437                 }
1438 #endif
1439                 if (inp->inp_faddr.s_addr != faddr.s_addr ||
1440                     inp->inp_socket == NULL) {
1441                         INP_WUNLOCK(inp);
1442                         continue;
1443                 }
1444                 if ((*notify)(inp, errno))
1445                         INP_WUNLOCK(inp);
1446         }
1447         INP_INFO_WUNLOCK(pcbinfo);
1448 }
1449
1450 void
1451 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
1452 {
1453         struct inpcb *inp;
1454         struct ip_moptions *imo;
1455         int i, gap;
1456
1457         INP_INFO_WLOCK(pcbinfo);
1458         LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1459                 INP_WLOCK(inp);
1460                 imo = inp->inp_moptions;
1461                 if ((inp->inp_vflag & INP_IPV4) &&
1462                     imo != NULL) {
1463                         /*
1464                          * Unselect the outgoing interface if it is being
1465                          * detached.
1466                          */
1467                         if (imo->imo_multicast_ifp == ifp)
1468                                 imo->imo_multicast_ifp = NULL;
1469
1470                         /*
1471                          * Drop multicast group membership if we joined
1472                          * through the interface being detached.
1473                          */
1474                         for (i = 0, gap = 0; i < imo->imo_num_memberships;
1475                             i++) {
1476                                 if (imo->imo_membership[i]->inm_ifp == ifp) {
1477                                         in_delmulti(imo->imo_membership[i]);
1478                                         gap++;
1479                                 } else if (gap != 0)
1480                                         imo->imo_membership[i - gap] =
1481                                             imo->imo_membership[i];
1482                         }
1483                         imo->imo_num_memberships -= gap;
1484                 }
1485                 INP_WUNLOCK(inp);
1486         }
1487         INP_INFO_WUNLOCK(pcbinfo);
1488 }
1489
1490 /*
1491  * Lookup a PCB based on the local address and port.  Caller must hold the
1492  * hash lock.  No inpcb locks or references are acquired.
1493  */
1494 #define INP_LOOKUP_MAPPED_PCB_COST      3
1495 struct inpcb *
1496 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
1497     u_short lport, int lookupflags, struct ucred *cred)
1498 {
1499         struct inpcb *inp;
1500 #ifdef INET6
1501         int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
1502 #else
1503         int matchwild = 3;
1504 #endif
1505         int wildcard;
1506
1507         KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
1508             ("%s: invalid lookup flags %d", __func__, lookupflags));
1509
1510         INP_HASH_LOCK_ASSERT(pcbinfo);
1511
1512         if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
1513                 struct inpcbhead *head;
1514                 /*
1515                  * Look for an unconnected (wildcard foreign addr) PCB that
1516                  * matches the local address and port we're looking for.
1517                  */
1518                 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
1519                     0, pcbinfo->ipi_hashmask)];
1520                 LIST_FOREACH(inp, head, inp_hash) {
1521 #ifdef INET6
1522                         /* XXX inp locking */
1523                         if ((inp->inp_vflag & INP_IPV4) == 0)
1524                                 continue;
1525 #endif
1526                         if (inp->inp_faddr.s_addr == INADDR_ANY &&
1527                             inp->inp_laddr.s_addr == laddr.s_addr &&
1528                             inp->inp_lport == lport) {
1529                                 /*
1530                                  * Found?
1531                                  */
1532                                 if (cred == NULL ||
1533                                     prison_equal_ip4(cred->cr_prison,
1534                                         inp->inp_cred->cr_prison))
1535                                         return (inp);
1536                         }
1537                 }
1538                 /*
1539                  * Not found.
1540                  */
1541                 return (NULL);
1542         } else {
1543                 struct inpcbporthead *porthash;
1544                 struct inpcbport *phd;
1545                 struct inpcb *match = NULL;
1546                 /*
1547                  * Best fit PCB lookup.
1548                  *
1549                  * First see if this local port is in use by looking on the
1550                  * port hash list.
1551                  */
1552                 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
1553                     pcbinfo->ipi_porthashmask)];
1554                 LIST_FOREACH(phd, porthash, phd_hash) {
1555                         if (phd->phd_port == lport)
1556                                 break;
1557                 }
1558                 if (phd != NULL) {
1559                         /*
1560                          * Port is in use by one or more PCBs. Look for best
1561                          * fit.
1562                          */
1563                         LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
1564                                 wildcard = 0;
1565                                 if (cred != NULL &&
1566                                     !prison_equal_ip4(inp->inp_cred->cr_prison,
1567                                         cred->cr_prison))
1568                                         continue;
1569 #ifdef INET6
1570                                 /* XXX inp locking */
1571                                 if ((inp->inp_vflag & INP_IPV4) == 0)
1572                                         continue;
1573                                 /*
1574                                  * We never select the PCB that has
1575                                  * INP_IPV6 flag and is bound to :: if
1576                                  * we have another PCB which is bound
1577                                  * to 0.0.0.0.  If a PCB has the
1578                                  * INP_IPV6 flag, then we set its cost
1579                                  * higher than IPv4 only PCBs.
1580                                  *
1581                                  * Note that the case only happens
1582                                  * when a socket is bound to ::, under
1583                                  * the condition that the use of the
1584                                  * mapped address is allowed.
1585                                  */
1586                                 if ((inp->inp_vflag & INP_IPV6) != 0)
1587                                         wildcard += INP_LOOKUP_MAPPED_PCB_COST;
1588 #endif
1589                                 if (inp->inp_faddr.s_addr != INADDR_ANY)
1590                                         wildcard++;
1591                                 if (inp->inp_laddr.s_addr != INADDR_ANY) {
1592                                         if (laddr.s_addr == INADDR_ANY)
1593                                                 wildcard++;
1594                                         else if (inp->inp_laddr.s_addr != laddr.s_addr)
1595                                                 continue;
1596                                 } else {
1597                                         if (laddr.s_addr != INADDR_ANY)
1598                                                 wildcard++;
1599                                 }
1600                                 if (wildcard < matchwild) {
1601                                         match = inp;
1602                                         matchwild = wildcard;
1603                                         if (matchwild == 0)
1604                                                 break;
1605                                 }
1606                         }
1607                 }
1608                 return (match);
1609         }
1610 }
1611 #undef INP_LOOKUP_MAPPED_PCB_COST
1612
1613 #ifdef PCBGROUP
1614 /*
1615  * Lookup PCB in hash list, using pcbgroup tables.
1616  */
1617 static struct inpcb *
1618 in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
1619     struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
1620     u_int lport_arg, int lookupflags, struct ifnet *ifp)
1621 {
1622         struct inpcbhead *head;
1623         struct inpcb *inp, *tmpinp;
1624         u_short fport = fport_arg, lport = lport_arg;
1625
1626         /*
1627          * First look for an exact match.
1628          */
1629         tmpinp = NULL;
1630         INP_GROUP_LOCK(pcbgroup);
1631         head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
1632             pcbgroup->ipg_hashmask)];
1633         LIST_FOREACH(inp, head, inp_pcbgrouphash) {
1634 #ifdef INET6
1635                 /* XXX inp locking */
1636                 if ((inp->inp_vflag & INP_IPV4) == 0)
1637                         continue;
1638 #endif
1639                 if (inp->inp_faddr.s_addr == faddr.s_addr &&
1640                     inp->inp_laddr.s_addr == laddr.s_addr &&
1641                     inp->inp_fport == fport &&
1642                     inp->inp_lport == lport) {
1643                         /*
1644                          * XXX We should be able to directly return
1645                          * the inp here, without any checks.
1646                          * Well unless both bound with SO_REUSEPORT?
1647                          */
1648                         if (prison_flag(inp->inp_cred, PR_IP4))
1649                                 goto found;
1650                         if (tmpinp == NULL)
1651                                 tmpinp = inp;
1652                 }
1653         }
1654         if (tmpinp != NULL) {
1655                 inp = tmpinp;
1656                 goto found;
1657         }
1658
1659 #ifdef  RSS
1660         /*
1661          * For incoming connections, we may wish to do a wildcard
1662          * match for an RSS-local socket.
1663          */
1664         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
1665                 struct inpcb *local_wild = NULL, *local_exact = NULL;
1666 #ifdef INET6
1667                 struct inpcb *local_wild_mapped = NULL;
1668 #endif
1669                 struct inpcb *jail_wild = NULL;
1670                 struct inpcbhead *head;
1671                 int injail;
1672
1673                 /*
1674                  * Order of socket selection - we always prefer jails.
1675                  *      1. jailed, non-wild.
1676                  *      2. jailed, wild.
1677                  *      3. non-jailed, non-wild.
1678                  *      4. non-jailed, wild.
1679                  */
1680
1681                 head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
1682                     lport, 0, pcbgroup->ipg_hashmask)];
1683                 LIST_FOREACH(inp, head, inp_pcbgrouphash) {
1684 #ifdef INET6
1685                         /* XXX inp locking */
1686                         if ((inp->inp_vflag & INP_IPV4) == 0)
1687                                 continue;
1688 #endif
1689                         if (inp->inp_faddr.s_addr != INADDR_ANY ||
1690                             inp->inp_lport != lport)
1691                                 continue;
1692
1693                         injail = prison_flag(inp->inp_cred, PR_IP4);
1694                         if (injail) {
1695                                 if (prison_check_ip4(inp->inp_cred,
1696                                     &laddr) != 0)
1697                                         continue;
1698                         } else {
1699                                 if (local_exact != NULL)
1700                                         continue;
1701                         }
1702
1703                         if (inp->inp_laddr.s_addr == laddr.s_addr) {
1704                                 if (injail)
1705                                         goto found;
1706                                 else
1707                                         local_exact = inp;
1708                         } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
1709 #ifdef INET6
1710                                 /* XXX inp locking, NULL check */
1711                                 if (inp->inp_vflag & INP_IPV6PROTO)
1712                                         local_wild_mapped = inp;
1713                                 else
1714 #endif
1715                                         if (injail)
1716                                                 jail_wild = inp;
1717                                         else
1718                                                 local_wild = inp;
1719                         }
1720                 } /* LIST_FOREACH */
1721
1722                 inp = jail_wild;
1723                 if (inp == NULL)
1724                         inp = local_exact;
1725                 if (inp == NULL)
1726                         inp = local_wild;
1727 #ifdef INET6
1728                 if (inp == NULL)
1729                         inp = local_wild_mapped;
1730 #endif
1731                 if (inp != NULL)
1732                         goto found;
1733         }
1734 #endif
1735
1736         /*
1737          * Then look for a wildcard match, if requested.
1738          */
1739         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
1740                 struct inpcb *local_wild = NULL, *local_exact = NULL;
1741 #ifdef INET6
1742                 struct inpcb *local_wild_mapped = NULL;
1743 #endif
1744                 struct inpcb *jail_wild = NULL;
1745                 struct inpcbhead *head;
1746                 int injail;
1747
1748                 /*
1749                  * Order of socket selection - we always prefer jails.
1750                  *      1. jailed, non-wild.
1751                  *      2. jailed, wild.
1752                  *      3. non-jailed, non-wild.
1753                  *      4. non-jailed, wild.
1754                  */
1755                 head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
1756                     0, pcbinfo->ipi_wildmask)];
1757                 LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
1758 #ifdef INET6
1759                         /* XXX inp locking */
1760                         if ((inp->inp_vflag & INP_IPV4) == 0)
1761                                 continue;
1762 #endif
1763                         if (inp->inp_faddr.s_addr != INADDR_ANY ||
1764                             inp->inp_lport != lport)
1765                                 continue;
1766
1767                         injail = prison_flag(inp->inp_cred, PR_IP4);
1768                         if (injail) {
1769                                 if (prison_check_ip4(inp->inp_cred,
1770                                     &laddr) != 0)
1771                                         continue;
1772                         } else {
1773                                 if (local_exact != NULL)
1774                                         continue;
1775                         }
1776
1777                         if (inp->inp_laddr.s_addr == laddr.s_addr) {
1778                                 if (injail)
1779                                         goto found;
1780                                 else
1781                                         local_exact = inp;
1782                         } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
1783 #ifdef INET6
1784                                 /* XXX inp locking, NULL check */
1785                                 if (inp->inp_vflag & INP_IPV6PROTO)
1786                                         local_wild_mapped = inp;
1787                                 else
1788 #endif
1789                                         if (injail)
1790                                                 jail_wild = inp;
1791                                         else
1792                                                 local_wild = inp;
1793                         }
1794                 } /* LIST_FOREACH */
1795                 inp = jail_wild;
1796                 if (inp == NULL)
1797                         inp = local_exact;
1798                 if (inp == NULL)
1799                         inp = local_wild;
1800 #ifdef INET6
1801                 if (inp == NULL)
1802                         inp = local_wild_mapped;
1803 #endif
1804                 if (inp != NULL)
1805                         goto found;
1806         } /* if (lookupflags & INPLOOKUP_WILDCARD) */
1807         INP_GROUP_UNLOCK(pcbgroup);
1808         return (NULL);
1809
1810 found:
1811         in_pcbref(inp);
1812         INP_GROUP_UNLOCK(pcbgroup);
1813         if (lookupflags & INPLOOKUP_WLOCKPCB) {
1814                 INP_WLOCK(inp);
1815                 if (in_pcbrele_wlocked(inp))
1816                         return (NULL);
1817         } else if (lookupflags & INPLOOKUP_RLOCKPCB) {
1818                 INP_RLOCK(inp);
1819                 if (in_pcbrele_rlocked(inp))
1820                         return (NULL);
1821         } else
1822                 panic("%s: locking bug", __func__);
1823         return (inp);
1824 }
1825 #endif /* PCBGROUP */
1826
1827 /*
1828  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
1829  * that the caller has locked the hash list, and will not perform any further
1830  * locking or reference operations on either the hash list or the connection.
1831  */
1832 static struct inpcb *
1833 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1834     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
1835     struct ifnet *ifp)
1836 {
1837         struct inpcbhead *head;
1838         struct inpcb *inp, *tmpinp;
1839         u_short fport = fport_arg, lport = lport_arg;
1840
1841         KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
1842             ("%s: invalid lookup flags %d", __func__, lookupflags));
1843
1844         INP_HASH_LOCK_ASSERT(pcbinfo);
1845
1846         /*
1847          * First look for an exact match.
1848          */
1849         tmpinp = NULL;
1850         head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
1851             pcbinfo->ipi_hashmask)];
1852         LIST_FOREACH(inp, head, inp_hash) {
1853 #ifdef INET6
1854                 /* XXX inp locking */
1855                 if ((inp->inp_vflag & INP_IPV4) == 0)
1856                         continue;
1857 #endif
1858                 if (inp->inp_faddr.s_addr == faddr.s_addr &&
1859                     inp->inp_laddr.s_addr == laddr.s_addr &&
1860                     inp->inp_fport == fport &&
1861                     inp->inp_lport == lport) {
1862                         /*
1863                          * XXX We should be able to directly return
1864                          * the inp here, without any checks.
1865                          * Well unless both bound with SO_REUSEPORT?
1866                          */
1867                         if (prison_flag(inp->inp_cred, PR_IP4))
1868                                 return (inp);
1869                         if (tmpinp == NULL)
1870                                 tmpinp = inp;
1871                 }
1872         }
1873         if (tmpinp != NULL)
1874                 return (tmpinp);
1875
1876         /*
1877          * Then look for a wildcard match, if requested.
1878          */
1879         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
1880                 struct inpcb *local_wild = NULL, *local_exact = NULL;
1881 #ifdef INET6
1882                 struct inpcb *local_wild_mapped = NULL;
1883 #endif
1884                 struct inpcb *jail_wild = NULL;
1885                 int injail;
1886
1887                 /*
1888                  * Order of socket selection - we always prefer jails.
1889                  *      1. jailed, non-wild.
1890                  *      2. jailed, wild.
1891                  *      3. non-jailed, non-wild.
1892                  *      4. non-jailed, wild.
1893                  */
1894
1895                 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
1896                     0, pcbinfo->ipi_hashmask)];
1897                 LIST_FOREACH(inp, head, inp_hash) {
1898 #ifdef INET6
1899                         /* XXX inp locking */
1900                         if ((inp->inp_vflag & INP_IPV4) == 0)
1901                                 continue;
1902 #endif
1903                         if (inp->inp_faddr.s_addr != INADDR_ANY ||
1904                             inp->inp_lport != lport)
1905                                 continue;
1906
1907                         injail = prison_flag(inp->inp_cred, PR_IP4);
1908                         if (injail) {
1909                                 if (prison_check_ip4(inp->inp_cred,
1910                                     &laddr) != 0)
1911                                         continue;
1912                         } else {
1913                                 if (local_exact != NULL)
1914                                         continue;
1915                         }
1916
1917                         if (inp->inp_laddr.s_addr == laddr.s_addr) {
1918                                 if (injail)
1919                                         return (inp);
1920                                 else
1921                                         local_exact = inp;
1922                         } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
1923 #ifdef INET6
1924                                 /* XXX inp locking, NULL check */
1925                                 if (inp->inp_vflag & INP_IPV6PROTO)
1926                                         local_wild_mapped = inp;
1927                                 else
1928 #endif
1929                                         if (injail)
1930                                                 jail_wild = inp;
1931                                         else
1932                                                 local_wild = inp;
1933                         }
1934                 } /* LIST_FOREACH */
1935                 if (jail_wild != NULL)
1936                         return (jail_wild);
1937                 if (local_exact != NULL)
1938                         return (local_exact);
1939                 if (local_wild != NULL)
1940                         return (local_wild);
1941 #ifdef INET6
1942                 if (local_wild_mapped != NULL)
1943                         return (local_wild_mapped);
1944 #endif
1945         } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
1946
1947         return (NULL);
1948 }
1949
1950 /*
1951  * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
1952  * hash list lock, and will return the inpcb locked (i.e., requires
1953  * INPLOOKUP_LOCKPCB).
1954  */
1955 static struct inpcb *
1956 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1957     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
1958     struct ifnet *ifp)
1959 {
1960         struct inpcb *inp;
1961
1962         INP_HASH_RLOCK(pcbinfo);
1963         inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
1964             (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
1965         if (inp != NULL) {
1966                 in_pcbref(inp);
1967                 INP_HASH_RUNLOCK(pcbinfo);
1968                 if (lookupflags & INPLOOKUP_WLOCKPCB) {
1969                         INP_WLOCK(inp);
1970                         if (in_pcbrele_wlocked(inp))
1971                                 return (NULL);
1972                 } else if (lookupflags & INPLOOKUP_RLOCKPCB) {
1973                         INP_RLOCK(inp);
1974                         if (in_pcbrele_rlocked(inp))
1975                                 return (NULL);
1976                 } else
1977                         panic("%s: locking bug", __func__);
1978         } else
1979                 INP_HASH_RUNLOCK(pcbinfo);
1980         return (inp);
1981 }
1982
1983 /*
1984  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
1985  * from which a pre-calculated hash value may be extracted.
1986  *
1987  * Possibly more of this logic should be in in_pcbgroup.c.
1988  */
1989 struct inpcb *
1990 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
1991     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
1992 {
1993 #if defined(PCBGROUP) && !defined(RSS)
1994         struct inpcbgroup *pcbgroup;
1995 #endif
1996
1997         KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
1998             ("%s: invalid lookup flags %d", __func__, lookupflags));
1999         KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2000             ("%s: LOCKPCB not set", __func__));
2001
2002         /*
2003          * When not using RSS, use connection groups in preference to the
2004          * reservation table when looking up 4-tuples.  When using RSS, just
2005          * use the reservation table, due to the cost of the Toeplitz hash
2006          * in software.
2007          *
2008          * XXXRW: This policy belongs in the pcbgroup code, as in principle
2009          * we could be doing RSS with a non-Toeplitz hash that is affordable
2010          * in software.
2011          */
2012 #if defined(PCBGROUP) && !defined(RSS)
2013         if (in_pcbgroup_enabled(pcbinfo)) {
2014                 pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
2015                     fport);
2016                 return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
2017                     laddr, lport, lookupflags, ifp));
2018         }
2019 #endif
2020         return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2021             lookupflags, ifp));
2022 }
2023
2024 struct inpcb *
2025 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2026     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2027     struct ifnet *ifp, struct mbuf *m)
2028 {
2029 #ifdef PCBGROUP
2030         struct inpcbgroup *pcbgroup;
2031 #endif
2032
2033         KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2034             ("%s: invalid lookup flags %d", __func__, lookupflags));
2035         KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2036             ("%s: LOCKPCB not set", __func__));
2037
2038 #ifdef PCBGROUP
2039         /*
2040          * If we can use a hardware-generated hash to look up the connection
2041          * group, use that connection group to find the inpcb.  Otherwise
2042          * fall back on a software hash -- or the reservation table if we're
2043          * using RSS.
2044          *
2045          * XXXRW: As above, that policy belongs in the pcbgroup code.
2046          */
2047         if (in_pcbgroup_enabled(pcbinfo) &&
2048             !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
2049                 pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
2050                     m->m_pkthdr.flowid);
2051                 if (pcbgroup != NULL)
2052                         return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
2053                             fport, laddr, lport, lookupflags, ifp));
2054 #ifndef RSS
2055                 pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
2056                     fport);
2057                 return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
2058                     laddr, lport, lookupflags, ifp));
2059 #endif
2060         }
2061 #endif
2062         return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2063             lookupflags, ifp));
2064 }
2065 #endif /* INET */
2066
2067 /*
2068  * Insert PCB onto various hash lists.
2069  */
2070 static int
2071 in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update)
2072 {
2073         struct inpcbhead *pcbhash;
2074         struct inpcbporthead *pcbporthash;
2075         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2076         struct inpcbport *phd;
2077         u_int32_t hashkey_faddr;
2078
2079         INP_WLOCK_ASSERT(inp);
2080         INP_HASH_WLOCK_ASSERT(pcbinfo);
2081
2082         KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
2083             ("in_pcbinshash: INP_INHASHLIST"));
2084
2085 #ifdef INET6
2086         if (inp->inp_vflag & INP_IPV6)
2087                 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
2088         else
2089 #endif
2090         hashkey_faddr = inp->inp_faddr.s_addr;
2091
2092         pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
2093                  inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2094
2095         pcbporthash = &pcbinfo->ipi_porthashbase[
2096             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
2097
2098         /*
2099          * Go through port list and look for a head for this lport.
2100          */
2101         LIST_FOREACH(phd, pcbporthash, phd_hash) {
2102                 if (phd->phd_port == inp->inp_lport)
2103                         break;
2104         }
2105         /*
2106          * If none exists, malloc one and tack it on.
2107          */
2108         if (phd == NULL) {
2109                 phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
2110                 if (phd == NULL) {
2111                         return (ENOBUFS); /* XXX */
2112                 }
2113                 phd->phd_port = inp->inp_lport;
2114                 LIST_INIT(&phd->phd_pcblist);
2115                 LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
2116         }
2117         inp->inp_phd = phd;
2118         LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
2119         LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
2120         inp->inp_flags |= INP_INHASHLIST;
2121 #ifdef PCBGROUP
2122         if (do_pcbgroup_update)
2123                 in_pcbgroup_update(inp);
2124 #endif
2125         return (0);
2126 }
2127
/*
 * Insert an inpcb into the hash lists and update its pcbgroup membership.
 *
 * For now there are two public insertion interfaces: this one, which does
 * update pcbgroups, and in_pcbinshash_nopcbgroup(), which does not.  The
 * latter is used only in the TCP syncache, where insertion happens before
 * the full 4-tuple is set for the inpcb, and we don't want to install in
 * the pcbgroup until later.
 *
 * XXXRW: This seems like a misfeature.  in_pcbinshash should always update
 * connection groups, and partially initialised inpcbs should not be exposed
 * to either reservation hash tables or pcbgroups.
 *
 * Returns the result of in_pcbinshash_internal().
 */
int
in_pcbinshash(struct inpcb *inp)
{
	int error;

	error = in_pcbinshash_internal(inp, 1);
	return (error);
}
2145
/*
 * Insert an inpcb into the hash lists without requesting a pcbgroup
 * update (do_pcbgroup_update is passed as 0).  Returns the result of
 * in_pcbinshash_internal().
 */
int
in_pcbinshash_nopcbgroup(struct inpcb *inp)
{
	int error;

	error = in_pcbinshash_internal(inp, 0);
	return (error);
}
2152
2153 /*
2154  * Move PCB to the proper hash bucket when { faddr, fport } have  been
2155  * changed. NOTE: This does not handle the case of the lport changing (the
2156  * hashed port list would have to be updated as well), so the lport must
2157  * not change after in_pcbinshash() has been called.
2158  */
2159 void
2160 in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
2161 {
2162         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2163         struct inpcbhead *head;
2164         u_int32_t hashkey_faddr;
2165
2166         INP_WLOCK_ASSERT(inp);
2167         INP_HASH_WLOCK_ASSERT(pcbinfo);
2168
2169         KASSERT(inp->inp_flags & INP_INHASHLIST,
2170             ("in_pcbrehash: !INP_INHASHLIST"));
2171
2172 #ifdef INET6
2173         if (inp->inp_vflag & INP_IPV6)
2174                 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
2175         else
2176 #endif
2177         hashkey_faddr = inp->inp_faddr.s_addr;
2178
2179         head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
2180                 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2181
2182         LIST_REMOVE(inp, inp_hash);
2183         LIST_INSERT_HEAD(head, inp, inp_hash);
2184
2185 #ifdef PCBGROUP
2186         if (m != NULL)
2187                 in_pcbgroup_update_mbuf(inp, m);
2188         else
2189                 in_pcbgroup_update(inp);
2190 #endif
2191 }
2192
2193 void
2194 in_pcbrehash(struct inpcb *inp)
2195 {
2196
2197         in_pcbrehash_mbuf(inp, NULL);
2198 }
2199
2200 /*
2201  * Remove PCB from various lists.
2202  */
2203 static void
2204 in_pcbremlists(struct inpcb *inp)
2205 {
2206         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2207
2208 #ifdef INVARIANTS
2209         if (pcbinfo == &V_tcbinfo) {
2210                 INP_INFO_RLOCK_ASSERT(pcbinfo);
2211         } else {
2212                 INP_INFO_WLOCK_ASSERT(pcbinfo);
2213         }
2214 #endif
2215
2216         INP_WLOCK_ASSERT(inp);
2217         INP_LIST_WLOCK_ASSERT(pcbinfo);
2218
2219         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
2220         if (inp->inp_flags & INP_INHASHLIST) {
2221                 struct inpcbport *phd = inp->inp_phd;
2222
2223                 INP_HASH_WLOCK(pcbinfo);
2224                 LIST_REMOVE(inp, inp_hash);
2225                 LIST_REMOVE(inp, inp_portlist);
2226                 if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
2227                         LIST_REMOVE(phd, phd_hash);
2228                         free(phd, M_PCB);
2229                 }
2230                 INP_HASH_WUNLOCK(pcbinfo);
2231                 inp->inp_flags &= ~INP_INHASHLIST;
2232         }
2233         LIST_REMOVE(inp, inp_list);
2234         pcbinfo->ipi_count--;
2235 #ifdef PCBGROUP
2236         in_pcbgroup_remove(inp);
2237 #endif
2238 }
2239
2240 /*
2241  * Check for alternatives when higher level complains
2242  * about service problems.  For now, invalidate cached
2243  * routing information.  If the route was created dynamically
2244  * (by a redirect), time to try a default gateway again.
2245  */
2246 void
2247 in_losing(struct inpcb *inp)
2248 {
2249
2250         RO_RTFREE(&inp->inp_route);
2251         if (inp->inp_route.ro_lle)
2252                 LLE_FREE(inp->inp_route.ro_lle);        /* zeros ro_lle */
2253         return;
2254 }
2255
/*
 * A set label operation has occurred at the socket layer; propagate the
 * label change into the in_pcb for the socket.  Without MAC compiled in
 * this function does nothing.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	/* The inpcb lock is taken before, and released after, the socket lock. */
	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2276
2277 /*
2278  * ipport_tick runs once per second, determining if random port allocation
2279  * should be continued.  If more than ipport_randomcps ports have been
2280  * allocated in the last second, then we return to sequential port
2281  * allocation. We return to random allocation only once we drop below
2282  * ipport_randomcps for at least ipport_randomtime seconds.
2283  */
2284 static void
2285 ipport_tick(void *xtp)
2286 {
2287         VNET_ITERATOR_DECL(vnet_iter);
2288
2289         VNET_LIST_RLOCK_NOSLEEP();
2290         VNET_FOREACH(vnet_iter) {
2291                 CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */
2292                 if (V_ipport_tcpallocs <=
2293                     V_ipport_tcplastcount + V_ipport_randomcps) {
2294                         if (V_ipport_stoprandom > 0)
2295                                 V_ipport_stoprandom--;
2296                 } else
2297                         V_ipport_stoprandom = V_ipport_randomtime;
2298                 V_ipport_tcplastcount = V_ipport_tcpallocs;
2299                 CURVNET_RESTORE();
2300         }
2301         VNET_LIST_RUNLOCK_NOSLEEP();
2302         callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
2303 }
2304
2305 static void
2306 ip_fini(void *xtp)
2307 {
2308
2309         callout_stop(&ipport_tick_callout);
2310 }
2311
2312 /* 
2313  * The ipport_callout should start running at about the time we attach the
2314  * inet or inet6 domains.
2315  */
2316 static void
2317 ipport_tick_init(const void *unused __unused)
2318 {
2319
2320         /* Start ipport_tick. */
2321         callout_init(&ipport_tick_callout, 1);
2322         callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
2323         EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
2324                 SHUTDOWN_PRI_DEFAULT);
2325 }
2326 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, 
2327     ipport_tick_init, NULL);
2328
/*
 * Function form of INP_WLOCK(): acquire the write lock on inp.
 */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}
2335
/*
 * Function form of INP_WUNLOCK(): release the write lock on inp.
 */
void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}
2342
/*
 * Function form of INP_RLOCK(): acquire the read lock on inp.
 */
void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}
2349
/*
 * Function form of INP_RUNLOCK(): release the read lock on inp.
 */
void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}
2356
2357 #ifdef INVARIANT_SUPPORT
/*
 * Function form of INP_WLOCK_ASSERT(): assert that inp is write-locked.
 */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}
2364
/*
 * Function form of INP_UNLOCK_ASSERT(): assert that inp is not locked.
 */
void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
2371 #endif
2372
2373 void
2374 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
2375 {
2376         struct inpcb *inp;
2377
2378         INP_INFO_WLOCK(&V_tcbinfo);
2379         LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
2380                 INP_WLOCK(inp);
2381                 func(inp, arg);
2382                 INP_WUNLOCK(inp);
2383         }
2384         INP_INFO_WUNLOCK(&V_tcbinfo);
2385 }
2386
2387 struct socket *
2388 inp_inpcbtosocket(struct inpcb *inp)
2389 {
2390
2391         INP_WLOCK_ASSERT(inp);
2392         return (inp->inp_socket);
2393 }
2394
2395 struct tcpcb *
2396 inp_inpcbtotcpcb(struct inpcb *inp)
2397 {
2398
2399         INP_WLOCK_ASSERT(inp);
2400         return ((struct tcpcb *)inp->inp_ppcb);
2401 }
2402
2403 int
2404 inp_ip_tos_get(const struct inpcb *inp)
2405 {
2406
2407         return (inp->inp_ip_tos);
2408 }
2409
2410 void
2411 inp_ip_tos_set(struct inpcb *inp, int val)
2412 {
2413
2414         inp->inp_ip_tos = val;
2415 }
2416
2417 void
2418 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
2419     uint32_t *faddr, uint16_t *fp)
2420 {
2421
2422         INP_LOCK_ASSERT(inp);
2423         *laddr = inp->inp_laddr.s_addr;
2424         *faddr = inp->inp_faddr.s_addr;
2425         *lp = inp->inp_lport;
2426         *fp = inp->inp_fport;
2427 }
2428
/*
 * Function form of sotoinpcb(): return the inpcb associated with so.
 */
struct inpcb *
so_sotoinpcb(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	return (inp);
}
2435
/*
 * Function form of sototcpcb(): return the tcpcb associated with so.
 */
struct tcpcb *
so_sototcpcb(struct socket *so)
{
	struct tcpcb *tp;

	tp = sototcpcb(so);
	return (tp);
}
2442
2443 /*
2444  * Create an external-format (``xinpcb'') structure using the information in
2445  * the kernel-format in_pcb structure pointed to by inp.  This is done to
2446  * reduce the spew of irrelevant information over this interface, to isolate
2447  * user code from changes in the kernel structure, and potentially to provide
2448  * information-hiding if we decide that some of this information should be
2449  * hidden from users.
2450  */
2451 void
2452 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
2453 {
2454
2455         xi->xi_len = sizeof(struct xinpcb);
2456         if (inp->inp_socket)
2457                 sotoxsocket(inp->inp_socket, &xi->xi_socket);
2458         else
2459                 bzero(&xi->xi_socket, sizeof(struct xsocket));
2460         bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
2461         xi->inp_gencnt = inp->inp_gencnt;
2462         xi->inp_ppcb = inp->inp_ppcb;
2463         xi->inp_flow = inp->inp_flow;
2464         xi->inp_flowid = inp->inp_flowid;
2465         xi->inp_flowtype = inp->inp_flowtype;
2466         xi->inp_flags = inp->inp_flags;
2467         xi->inp_flags2 = inp->inp_flags2;
2468         xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket;
2469         xi->in6p_cksum = inp->in6p_cksum;
2470         xi->in6p_hops = inp->in6p_hops;
2471         xi->inp_ip_tos = inp->inp_ip_tos;
2472         xi->inp_vflag = inp->inp_vflag;
2473         xi->inp_ip_ttl = inp->inp_ip_ttl;
2474         xi->inp_ip_p = inp->inp_ip_p;
2475         xi->inp_ip_minttl = inp->inp_ip_minttl;
2476 }
2477
2478 #ifdef DDB
/*
 * Emit `indent' space characters through db_printf().  Nothing is printed
 * when indent is zero or negative.
 */
static void
db_print_indent(int indent)
{
	int remaining;

	for (remaining = indent; remaining > 0; remaining--)
		db_printf(" ");
}
2487
2488 static void
2489 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
2490 {
2491         char faddr_str[48], laddr_str[48];
2492
2493         db_print_indent(indent);
2494         db_printf("%s at %p\n", name, inc);
2495
2496         indent += 2;
2497
2498 #ifdef INET6
2499         if (inc->inc_flags & INC_ISIPV6) {
2500                 /* IPv6. */
2501                 ip6_sprintf(laddr_str, &inc->inc6_laddr);
2502                 ip6_sprintf(faddr_str, &inc->inc6_faddr);
2503         } else
2504 #endif
2505         {
2506                 /* IPv4. */
2507                 inet_ntoa_r(inc->inc_laddr, laddr_str);
2508                 inet_ntoa_r(inc->inc_faddr, faddr_str);
2509         }
2510         db_print_indent(indent);
2511         db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
2512             ntohs(inc->inc_lport));
2513         db_print_indent(indent);
2514         db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
2515             ntohs(inc->inc_fport));
2516 }
2517
2518 static void
2519 db_print_inpflags(int inp_flags)
2520 {
2521         int comma;
2522
2523         comma = 0;
2524         if (inp_flags & INP_RECVOPTS) {
2525                 db_printf("%sINP_RECVOPTS", comma ? ", " : "");
2526                 comma = 1;
2527         }
2528         if (inp_flags & INP_RECVRETOPTS) {
2529                 db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
2530                 comma = 1;
2531         }
2532         if (inp_flags & INP_RECVDSTADDR) {
2533                 db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
2534                 comma = 1;
2535         }
2536         if (inp_flags & INP_ORIGDSTADDR) {
2537                 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
2538                 comma = 1;
2539         }
2540         if (inp_flags & INP_HDRINCL) {
2541                 db_printf("%sINP_HDRINCL", comma ? ", " : "");
2542                 comma = 1;
2543         }
2544         if (inp_flags & INP_HIGHPORT) {
2545                 db_printf("%sINP_HIGHPORT", comma ? ", " : "");
2546                 comma = 1;
2547         }
2548         if (inp_flags & INP_LOWPORT) {
2549                 db_printf("%sINP_LOWPORT", comma ? ", " : "");
2550                 comma = 1;
2551         }
2552         if (inp_flags & INP_ANONPORT) {
2553                 db_printf("%sINP_ANONPORT", comma ? ", " : "");
2554                 comma = 1;
2555         }
2556         if (inp_flags & INP_RECVIF) {
2557                 db_printf("%sINP_RECVIF", comma ? ", " : "");
2558                 comma = 1;
2559         }
2560         if (inp_flags & INP_MTUDISC) {
2561                 db_printf("%sINP_MTUDISC", comma ? ", " : "");
2562                 comma = 1;
2563         }
2564         if (inp_flags & INP_RECVTTL) {
2565                 db_printf("%sINP_RECVTTL", comma ? ", " : "");
2566                 comma = 1;
2567         }
2568         if (inp_flags & INP_DONTFRAG) {
2569                 db_printf("%sINP_DONTFRAG", comma ? ", " : "");
2570                 comma = 1;
2571         }
2572         if (inp_flags & INP_RECVTOS) {
2573                 db_printf("%sINP_RECVTOS", comma ? ", " : "");
2574                 comma = 1;
2575         }
2576         if (inp_flags & IN6P_IPV6_V6ONLY) {
2577                 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
2578                 comma = 1;
2579         }
2580         if (inp_flags & IN6P_PKTINFO) {
2581                 db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
2582                 comma = 1;
2583         }
2584         if (inp_flags & IN6P_HOPLIMIT) {
2585                 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
2586                 comma = 1;
2587         }
2588         if (inp_flags & IN6P_HOPOPTS) {
2589                 db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
2590                 comma = 1;
2591         }
2592         if (inp_flags & IN6P_DSTOPTS) {
2593                 db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
2594                 comma = 1;
2595         }
2596         if (inp_flags & IN6P_RTHDR) {
2597                 db_printf("%sIN6P_RTHDR", comma ? ", " : "");
2598                 comma = 1;
2599         }
2600         if (inp_flags & IN6P_RTHDRDSTOPTS) {
2601                 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
2602                 comma = 1;
2603         }
2604         if (inp_flags & IN6P_TCLASS) {
2605                 db_printf("%sIN6P_TCLASS", comma ? ", " : "");
2606                 comma = 1;
2607         }
2608         if (inp_flags & IN6P_AUTOFLOWLABEL) {
2609                 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
2610                 comma = 1;
2611         }
2612         if (inp_flags & INP_TIMEWAIT) {
2613                 db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
2614                 comma  = 1;
2615         }
2616         if (inp_flags & INP_ONESBCAST) {
2617                 db_printf("%sINP_ONESBCAST", comma ? ", " : "");
2618                 comma  = 1;
2619         }
2620         if (inp_flags & INP_DROPPED) {
2621                 db_printf("%sINP_DROPPED", comma ? ", " : "");
2622                 comma  = 1;
2623         }
2624         if (inp_flags & INP_SOCKREF) {
2625                 db_printf("%sINP_SOCKREF", comma ? ", " : "");
2626                 comma  = 1;
2627         }
2628         if (inp_flags & IN6P_RFC2292) {
2629                 db_printf("%sIN6P_RFC2292", comma ? ", " : "");
2630                 comma = 1;
2631         }
2632         if (inp_flags & IN6P_MTU) {
2633                 db_printf("IN6P_MTU%s", comma ? ", " : "");
2634                 comma = 1;
2635         }
2636 }
2637
2638 static void
2639 db_print_inpvflag(u_char inp_vflag)
2640 {
2641         int comma;
2642
2643         comma = 0;
2644         if (inp_vflag & INP_IPV4) {
2645                 db_printf("%sINP_IPV4", comma ? ", " : "");
2646                 comma  = 1;
2647         }
2648         if (inp_vflag & INP_IPV6) {
2649                 db_printf("%sINP_IPV6", comma ? ", " : "");
2650                 comma  = 1;
2651         }
2652         if (inp_vflag & INP_IPV6PROTO) {
2653                 db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
2654                 comma  = 1;
2655         }
2656 }
2657
2658 static void
2659 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
2660 {
2661
2662         db_print_indent(indent);
2663         db_printf("%s at %p\n", name, inp);
2664
2665         indent += 2;
2666
2667         db_print_indent(indent);
2668         db_printf("inp_flow: 0x%x\n", inp->inp_flow);
2669
2670         db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
2671
2672         db_print_indent(indent);
2673         db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
2674             inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
2675
2676         db_print_indent(indent);
2677         db_printf("inp_label: %p   inp_flags: 0x%x (",
2678            inp->inp_label, inp->inp_flags);
2679         db_print_inpflags(inp->inp_flags);
2680         db_printf(")\n");
2681
2682         db_print_indent(indent);
2683         db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
2684             inp->inp_vflag);
2685         db_print_inpvflag(inp->inp_vflag);
2686         db_printf(")\n");
2687
2688         db_print_indent(indent);
2689         db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
2690             inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
2691
2692         db_print_indent(indent);
2693 #ifdef INET6
2694         if (inp->inp_vflag & INP_IPV6) {
2695                 db_printf("in6p_options: %p   in6p_outputopts: %p   "
2696                     "in6p_moptions: %p\n", inp->in6p_options,
2697                     inp->in6p_outputopts, inp->in6p_moptions);
2698                 db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
2699                     "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
2700                     inp->in6p_hops);
2701         } else
2702 #endif
2703         {
2704                 db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
2705                     "inp_ip_moptions: %p\n", inp->inp_ip_tos,
2706                     inp->inp_options, inp->inp_moptions);
2707         }
2708
2709         db_print_indent(indent);
2710         db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
2711             (uintmax_t)inp->inp_gencnt);
2712 }
2713
2714 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
2715 {
2716         struct inpcb *inp;
2717
2718         if (!have_addr) {
2719                 db_printf("usage: show inpcb <addr>\n");
2720                 return;
2721         }
2722         inp = (struct inpcb *)addr;
2723
2724         db_print_inpcb(inp, "inpcb", 0);
2725 }
2726 #endif /* DDB */
2727
2728 #ifdef RATELIMIT
2729 /*
2730  * Modify TX rate limit based on the existing "inp->inp_snd_tag",
2731  * if any.
2732  */
2733 int
2734 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
2735 {
2736         union if_snd_tag_modify_params params = {
2737                 .rate_limit.max_rate = max_pacing_rate,
2738         };
2739         struct m_snd_tag *mst;
2740         struct ifnet *ifp;
2741         int error;
2742
2743         mst = inp->inp_snd_tag;
2744         if (mst == NULL)
2745                 return (EINVAL);
2746
2747         ifp = mst->ifp;
2748         if (ifp == NULL)
2749                 return (EINVAL);
2750
2751         if (ifp->if_snd_tag_modify == NULL) {
2752                 error = EOPNOTSUPP;
2753         } else {
2754                 error = ifp->if_snd_tag_modify(mst, &params);
2755         }
2756         return (error);
2757 }
2758
2759 /*
2760  * Query existing TX rate limit based on the existing
2761  * "inp->inp_snd_tag", if any.
2762  */
2763 int
2764 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
2765 {
2766         union if_snd_tag_query_params params = { };
2767         struct m_snd_tag *mst;
2768         struct ifnet *ifp;
2769         int error;
2770
2771         mst = inp->inp_snd_tag;
2772         if (mst == NULL)
2773                 return (EINVAL);
2774
2775         ifp = mst->ifp;
2776         if (ifp == NULL)
2777                 return (EINVAL);
2778
2779         if (ifp->if_snd_tag_query == NULL) {
2780                 error = EOPNOTSUPP;
2781         } else {
2782                 error = ifp->if_snd_tag_query(mst, &params);
2783                 if (error == 0 &&  p_max_pacing_rate != NULL)
2784                         *p_max_pacing_rate = params.rate_limit.max_rate;
2785         }
2786         return (error);
2787 }
2788
2789 /*
2790  * Allocate a new TX rate limit send tag from the network interface
2791  * given by the "ifp" argument and save it in "inp->inp_snd_tag":
2792  */
2793 int
2794 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
2795     uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
2796 {
2797         union if_snd_tag_alloc_params params = {
2798                 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
2799                 .rate_limit.hdr.flowid = flowid,
2800                 .rate_limit.hdr.flowtype = flowtype,
2801                 .rate_limit.max_rate = max_pacing_rate,
2802         };
2803         int error;
2804
2805         INP_WLOCK_ASSERT(inp);
2806
2807         if (inp->inp_snd_tag != NULL)
2808                 return (EINVAL);
2809
2810         if (ifp->if_snd_tag_alloc == NULL) {
2811                 error = EOPNOTSUPP;
2812         } else {
2813                 error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
2814
2815                 /*
2816                  * At success increment the refcount on
2817                  * the send tag's network interface:
2818                  */
2819                 if (error == 0)
2820                         if_ref(inp->inp_snd_tag->ifp);
2821         }
2822         return (error);
2823 }
2824
2825 /*
2826  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
2827  * if any:
2828  */
2829 void
2830 in_pcbdetach_txrtlmt(struct inpcb *inp)
2831 {
2832         struct m_snd_tag *mst;
2833         struct ifnet *ifp;
2834
2835         INP_WLOCK_ASSERT(inp);
2836
2837         mst = inp->inp_snd_tag;
2838         inp->inp_snd_tag = NULL;
2839
2840         if (mst == NULL)
2841                 return;
2842
2843         ifp = mst->ifp;
2844         if (ifp == NULL)
2845                 return;
2846
2847         /*
2848          * If the device was detached while we still had reference(s)
2849          * on the ifp, we assume if_snd_tag_free() was replaced with
2850          * stubs.
2851          */
2852         ifp->if_snd_tag_free(mst);
2853
2854         /* release reference count on network interface */
2855         if_rele(ifp);
2856 }
2857
2858 /*
2859  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
2860  * is set in the fast path and will attach/detach/modify the TX rate
2861  * limit send tag based on the socket's so_max_pacing_rate value.
2862  */
2863 void
2864 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
2865 {
2866         struct socket *socket;
2867         uint32_t max_pacing_rate;
2868         bool did_upgrade;
2869         int error;
2870
2871         if (inp == NULL)
2872                 return;
2873
2874         socket = inp->inp_socket;
2875         if (socket == NULL)
2876                 return;
2877
2878         if (!INP_WLOCKED(inp)) {
2879                 /*
2880                  * NOTE: If the write locking fails, we need to bail
2881                  * out and use the non-ratelimited ring for the
2882                  * transmit until there is a new chance to get the
2883                  * write lock.
2884                  */
2885                 if (!INP_TRY_UPGRADE(inp))
2886                         return;
2887                 did_upgrade = 1;
2888         } else {
2889                 did_upgrade = 0;
2890         }
2891
2892         /*
2893          * NOTE: The so_max_pacing_rate value is read unlocked,
2894          * because atomic updates are not required since the variable
2895          * is checked at every mbuf we send. It is assumed that the
2896          * variable read itself will be atomic.
2897          */
2898         max_pacing_rate = socket->so_max_pacing_rate;
2899
2900         /*
2901          * NOTE: When attaching to a network interface a reference is
2902          * made to ensure the network interface doesn't go away until
2903          * all ratelimit connections are gone. The network interface
2904          * pointers compared below represent valid network interfaces,
2905          * except when comparing towards NULL.
2906          */
2907         if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
2908                 error = 0;
2909         } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
2910                 if (inp->inp_snd_tag != NULL)
2911                         in_pcbdetach_txrtlmt(inp);
2912                 error = 0;
2913         } else if (inp->inp_snd_tag == NULL) {
2914                 /*
2915                  * In order to utilize packet pacing with RSS, we need
2916                  * to wait until there is a valid RSS hash before we
2917                  * can proceed:
2918                  */
2919                 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
2920                         error = EAGAIN;
2921                 } else {
2922                         error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
2923                             mb->m_pkthdr.flowid, max_pacing_rate);
2924                 }
2925         } else {
2926                 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
2927         }
2928         if (error == 0 || error == EOPNOTSUPP)
2929                 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
2930         if (did_upgrade)
2931                 INP_DOWNGRADE(inp);
2932 }
2933
2934 /*
2935  * Track route changes for TX rate limiting.
2936  */
2937 void
2938 in_pcboutput_eagain(struct inpcb *inp)
2939 {
2940         struct socket *socket;
2941         bool did_upgrade;
2942
2943         if (inp == NULL)
2944                 return;
2945
2946         socket = inp->inp_socket;
2947         if (socket == NULL)
2948                 return;
2949
2950         if (inp->inp_snd_tag == NULL)
2951                 return;
2952
2953         if (!INP_WLOCKED(inp)) {
2954                 /*
2955                  * NOTE: If the write locking fails, we need to bail
2956                  * out and use the non-ratelimited ring for the
2957                  * transmit until there is a new chance to get the
2958                  * write lock.
2959                  */
2960                 if (!INP_TRY_UPGRADE(inp))
2961                         return;
2962                 did_upgrade = 1;
2963         } else {
2964                 did_upgrade = 0;
2965         }
2966
2967         /* detach rate limiting */
2968         in_pcbdetach_txrtlmt(inp);
2969
2970         /* make sure new mbuf send tag allocation is made */
2971         inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
2972
2973         if (did_upgrade)
2974                 INP_DOWNGRADE(inp);
2975 }
2976 #endif /* RATELIMIT */