2 * Copyright (c) 1982, 1986, 1991, 1993, 1995
3 * The Regents of the University of California.
4 * Copyright (c) 2007 Robert N. M. Watson
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 4. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
38 #include "opt_ipsec.h"
39 #include "opt_inet6.h"
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/malloc.h>
46 #include <sys/domain.h>
47 #include <sys/protosw.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
53 #include <sys/kernel.h>
54 #include <sys/sysctl.h>
55 #include <sys/vimage.h>
64 #include <net/if_types.h>
65 #include <net/route.h>
67 #include <netinet/in.h>
68 #include <netinet/in_pcb.h>
69 #include <netinet/in_var.h>
70 #include <netinet/ip_var.h>
71 #include <netinet/tcp_var.h>
72 #include <netinet/udp.h>
73 #include <netinet/udp_var.h>
75 #include <netinet/ip6.h>
76 #include <netinet6/ip6_var.h>
81 #include <netipsec/ipsec.h>
82 #include <netipsec/key.h>
85 #include <security/mac/mac_framework.h>
88 * These configure the range of local port addresses assigned to
89 * "unspecified" outgoing connections/packets/whatever.
/*
 * Default bounds of the three automatic local-port ranges: low, default and
 * high.  Elsewhere in this file they are read through V_-prefixed names
 * (presumably aliases of these globals -- confirm): in_pcbbind_setup()
 * selects one pair according to INP_LOWPORT / INP_HIGHPORT, and
 * sysctl_net_ipport_check() clamps all six.  Per the inline values the low
 * range defaults to first (1023) greater than last (600).
 */
91 int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */
92 int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */
93 int ipport_firstauto = IPPORT_EPHEMERALFIRST; /* 10000 */
94 int ipport_lastauto = IPPORT_EPHEMERALLAST; /* 65535 */
95 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */
96 int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */
99 * Reserved ports accessible only to root. There are significant
100 * security considerations that must be accounted for when changing these,
101 * but the security benefits can be great. Please be careful.
/*
 * Inclusive bounds of the reserved port range.  in_pcbbind_setup() runs a
 * PRIV_NETINET_RESERVEDPORT privilege check for an explicitly requested port
 * that lies between reservedlow and reservedhigh.  Both are exported as
 * CTLFLAG_SECURE sysctls further down.
 */
103 int ipport_reservedhigh = IPPORT_RESERVED - 1; /* 1023 */
104 int ipport_reservedlow = 0;
106 /* Variables dealing with random ephemeral port allocation. */
/*
 * ipport_tcpallocs is bumped by in_pcbbind_setup() for allocations whose
 * pcbinfo is not the UDP one; ipport_tick() compares it against
 * ipport_tcplastcount + ipport_randomcps and either counts ipport_stoprandom
 * down or reloads it from ipport_randomtime.
 */
107 int ipport_randomized = 1; /* user controlled via sysctl */
108 int ipport_randomcps = 10; /* user controlled via sysctl */
109 int ipport_randomtime = 45; /* user controlled via sysctl */
110 int ipport_stoprandom = 0; /* toggled by ipport_tick */
111 int ipport_tcpallocs;
112 int ipport_tcplastcount;
/*
 * Clamp the lvalue `var` into the inclusive range [min, max].
 *
 * `var` is evaluated more than once, so it must be free of side effects.
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and stays correct when used as the body of an unbraced
 * if/else (the bare if/else-if form would capture or orphan the caller's
 * else).
 */
#define	RANGECHK(var, min, max) do {					\
	if ((var) < (min)) {						\
		(var) = (min);						\
	} else if ((var) > (max)) {					\
		(var) = (max);						\
	}								\
} while (0)
/*
 * Sysctl handler registered below for the lowfirst, lowlast, first, last,
 * hifirst and hilast port-range knobs.  The request is forwarded to
 * sysctl_handle_int(); the RANGECHK() calls then clamp all six range
 * variables, not just the one the request named.  (Any condition guarding
 * the clamping is not visible in this listing.)
 */
119 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
123 error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
/* Low-range bounds are held within 1 .. IPPORT_RESERVED - 1. */
125 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
126 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
/* Default and high range bounds are held within IPPORT_RESERVED .. IPPORT_MAX. */
127 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
128 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
129 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
130 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
/*
 * net.inet.ip.portrange sysctl tree.  The six range bounds go through
 * sysctl_net_ipport_check() so that written values are clamped; the
 * reserved-range bounds are CTLFLAG_SECURE; the remaining knobs tune random
 * port allocation.  The descriptions are user-visible text.
 */
137 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
139 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW,
140 &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
141 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW,
142 &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
143 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW,
144 &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
145 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW,
146 &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
147 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW,
148 &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
149 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW,
150 &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
151 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
152 CTLFLAG_RW|CTLFLAG_SECURE, &ipport_reservedhigh, 0, "");
153 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
154 CTLFLAG_RW|CTLFLAG_SECURE, &ipport_reservedlow, 0, "");
155 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW,
156 &ipport_randomized, 0, "Enable random port allocation");
157 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW,
158 &ipport_randomcps, 0, "Maximum number of random port "
159 "allocations before switching to a sequential one");
160 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW,
161 &ipport_randomtime, 0, "Minimum time to keep sequential port "
162 "allocation before switching to a random one");
165 * in_pcb.c: manage the Protocol Control Blocks.
167 * NOTE: It is assumed that most of these functions will be called with
168 * the pcbinfo lock held, and often, the inpcb lock held, as these utility
169 * functions often modify hash chains or addresses in pcbs.
173 * Allocate a PCB and associate it with the socket.
174 * On success return with the PCB locked.
/*
 * Allocate an inpcb from pcbinfo->ipi_zone, tie it to socket `so` and put it
 * on pcbinfo's PCB list.  The pcbinfo write lock must be held (asserted
 * below).  Both the zone allocation and the MAC initialisation use M_NOWAIT.
 */
177 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
182 INP_INFO_WLOCK_ASSERT(pcbinfo);
184 inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
/* Only the first inp_zero_size bytes of the new PCB are cleared. */
187 bzero(inp, inp_zero_size);
188 inp->inp_pcbinfo = pcbinfo;
189 inp->inp_socket = so;
/* The PCB inherits the socket's FIB number. */
190 inp->inp_inc.inc_fibnum = so->so_fibnum;
192 error = mac_inpcb_init(inp, M_NOWAIT);
196 mac_inpcb_create(so, inp);
201 error = ipsec_init_policy(so, &inp->inp_sp);
204 mac_inpcb_destroy(inp);
/* PCBs of AF_INET6 sockets are flagged INP_IPV6PROTO. */
210 if (INP_SOCKAF(so) == AF_INET6) {
211 inp->inp_vflag |= INP_IPV6PROTO;
213 inp->inp_flags |= IN6P_IPV6_V6ONLY;
/* Publish the PCB: global list, PCB count and the socket's back pointer. */
216 LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
217 pcbinfo->ipi_count++;
218 so->so_pcb = (caddr_t)inp;
220 if (V_ip6_auto_flowlabel)
221 inp->inp_flags |= IN6P_AUTOFLOWLABEL;
224 inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
226 #if defined(IPSEC) || defined(MAC)
/* Presumably the error exit: the PCB goes back to the zone it came from. */
229 uma_zfree(pcbinfo->ipi_zone, inp);
/*
 * Bind the PCB to the local address/port described by `nam`.  Validation and
 * port selection are done by in_pcbbind_setup(), which writes straight into
 * inp_laddr/inp_lport; in_pcbinshash() then enters the PCB into the hash
 * tables.  Both the pcbinfo write lock and the inpcb write lock are
 * asserted.
 */
235 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
239 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
240 INP_WLOCK_ASSERT(inp);
/* Tests whether the PCB already has a local port or local address. */
242 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
/* anonport: no port set yet and none requested (nam NULL or sin_port 0). */
244 anonport = inp->inp_lport == 0 && (nam == NULL ||
245 ((struct sockaddr_in *)nam)->sin_port == 0);
246 error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
247 &inp->inp_lport, cred);
/* When hashing fails the local address is put back to INADDR_ANY. */
250 if (in_pcbinshash(inp) != 0) {
251 inp->inp_laddr.s_addr = INADDR_ANY;
256 inp->inp_flags |= INP_ANONPORT;
261 * Set up a bind operation on a PCB, performing port allocation
262 * as required, but do not actually modify the PCB. Callers can
263 * either complete the bind by setting inp_laddr/inp_lport and
264 * calling in_pcbinshash(), or they can just use the resulting
265 * port and address to authorise the sending of a once-off packet.
267 * On error, the values of *laddrp and *lportp are not changed.
/*
 * Parameters as used by the visible code:
 *   inp     PCB being bound; its socket options and flags steer the checks.
 *   nam     requested address (struct sockaddr_in); compared against NULL.
 *   laddrp  in/out local address; read at entry, written near the end.
 *   lportp  current local port; compared with the requested sin_port.
 *   cred    credential used for the jail (prison_ip) and privilege checks.
 * Several guards and early returns are missing from this listing, so the
 * notes below describe only what the surviving lines show.
 */
270 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
271 u_short *lportp, struct ucred *cred)
273 struct socket *so = inp->inp_socket;
274 unsigned short *lastport;
275 struct sockaddr_in *sin;
276 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
277 struct in_addr laddr;
279 int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
280 int error, prison = 0;
284 * Because no actual state changes occur here, a global write lock on
285 * the pcbinfo isn't required.
287 INP_INFO_LOCK_ASSERT(pcbinfo);
288 INP_LOCK_ASSERT(inp);
290 if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
291 return (EADDRNOTAVAIL);
292 laddr.s_addr = *laddrp;
293 if (nam != NULL && laddr.s_addr != INADDR_ANY)
/* Wildcard lookups are used only when neither reuse option is set. */
295 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
296 wild = INPLOOKUP_WILDCARD;
298 sin = (struct sockaddr_in *)nam;
299 if (nam->sa_len != sizeof (*sin))
/*
 * NOTE(review): the comment below says the family is not checked, yet a
 * family test follows; any preprocessor guard around it is not visible in
 * this listing -- confirm.
 */
303 * We should check the family, but old programs
304 * incorrectly fail to initialize it.
306 if (sin->sin_family != AF_INET)
307 return (EAFNOSUPPORT);
309 if (sin->sin_addr.s_addr != INADDR_ANY)
310 if (prison_ip(cred, 0, &sin->sin_addr.s_addr))
312 if (sin->sin_port != *lportp) {
313 /* Don't allow the port to change. */
316 lport = sin->sin_port;
318 /* NB: lport is left as 0 if the port isn't being changed. */
319 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
321 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
322 * allow complete duplication of binding if
323 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
324 * and a multicast address is bound on both
325 * new and duplicated sockets.
327 if (so->so_options & SO_REUSEADDR)
328 reuseport = SO_REUSEADDR|SO_REUSEPORT;
329 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
/* The caller's sockaddr is modified in place before the interface lookup. */
330 sin->sin_port = 0; /* yech... */
331 bzero(&sin->sin_zero, sizeof(sin->sin_zero));
332 if (ifa_ifwithaddr((struct sockaddr *)sin) == 0)
333 return (EADDRNOTAVAIL);
335 laddr = sin->sin_addr;
/* A port inside [reservedlow, reservedhigh] is subject to a privilege check. */
341 if (ntohs(lport) <= V_ipport_reservedhigh &&
342 ntohs(lport) >= V_ipport_reservedlow &&
343 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
/* Non-multicast binds without PRIV_NETINET_REUSEPORT look up existing users. */
348 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
349 priv_check_cred(so->so_cred,
350 PRIV_NETINET_REUSEPORT, 0) != 0) {
351 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
352 lport, prison ? 0 : INPLOOKUP_WILDCARD,
356 * This entire block sorely needs a rewrite.
359 ((t->inp_vflag & INP_TIMEWAIT) == 0) &&
360 (so->so_type != SOCK_STREAM ||
361 ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
362 (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
363 ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
364 (t->inp_socket->so_options &
365 SO_REUSEPORT) == 0) &&
366 (so->so_cred->cr_uid !=
367 t->inp_socket->so_cred->cr_uid))
370 if (prison && prison_ip(cred, 0, &sin->sin_addr.s_addr))
371 return (EADDRNOTAVAIL);
372 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
373 lport, prison ? 0 : wild, cred);
374 if (t && (t->inp_vflag & INP_TIMEWAIT)) {
376 * XXXRW: If an incpb has had its timewait
377 * state recycled, we treat the address as
378 * being in use (for now). This is better
379 * than a panic, but not desirable.
383 (reuseport & tw->tw_so_options) == 0)
386 (reuseport & t->inp_socket->so_options) == 0) {
388 if (ntohl(sin->sin_addr.s_addr) !=
390 ntohl(t->inp_laddr.s_addr) !=
393 INP_SOCKAF(t->inp_socket))
/* From here on: automatic port selection state (the guard is not visible). */
402 u_short first, last, aux;
405 if (laddr.s_addr != INADDR_ANY)
406 if (prison_ip(cred, 0, &laddr.s_addr))
/* Pick the range and the per-pcbinfo "last port used" cursor for that range. */
409 if (inp->inp_flags & INP_HIGHPORT) {
410 first = V_ipport_hifirstauto; /* sysctl */
411 last = V_ipport_hilastauto;
412 lastport = &pcbinfo->ipi_lasthi;
413 } else if (inp->inp_flags & INP_LOWPORT) {
/* The low range is only reached through a PRIV_NETINET_RESERVEDPORT check. */
414 error = priv_check_cred(cred,
415 PRIV_NETINET_RESERVEDPORT, 0);
418 first = V_ipport_lowfirstauto; /* 1023 */
419 last = V_ipport_lowlastauto; /* 600 */
420 lastport = &pcbinfo->ipi_lastlow;
422 first = V_ipport_firstauto; /* sysctl */
423 last = V_ipport_lastauto;
424 lastport = &pcbinfo->ipi_lastport;
427 * For UDP, use random port allocation as long as the user
428 * allows it. For TCP (and as of yet unknown) connections,
429 * use random port allocation only if the user allows it AND
430 * ipport_tick() allows it.
432 if (V_ipport_randomized &&
433 (!V_ipport_stoprandom || pcbinfo == &V_udbinfo))
438 * It makes no sense to do random port allocation if
439 * we have the only port available.
443 /* Make sure to not include UDP packets in the count. */
444 if (pcbinfo != &V_udbinfo)
445 V_ipport_tcpallocs++;
447 * Simple check to ensure all ports are not used up causing
/* The random offset is taken modulo the width of the range (last - first). */
458 (arc4random() % (last - first));
460 count = last - first;
/* count bounds the number of candidate ports tried before giving up. */
463 if (count-- < 0) /* completely used? */
464 return (EADDRNOTAVAIL);
466 if (*lastport < first || *lastport > last)
468 lport = htons(*lastport);
469 } while (in_pcblookup_local(pcbinfo, laddr,
472 if (prison_ip(cred, 0, &laddr.s_addr))
/* Outputs are written only here, at the end of the function. */
474 *laddrp = laddr.s_addr;
480 * Connect from a socket to a specified address.
481 * Both address and port must be specified in argument sin.
482 * If we don't have a local address for this socket yet,
/*
 * Connect the PCB to the foreign address in `nam`.  in_pcbconnect_setup()
 * works on local copies (laddr, lport, faddr, fport); the PCB fields are
 * assigned from them afterwards.  Both the pcbinfo write lock and the inpcb
 * write lock are asserted.
 */
486 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
488 u_short lport, fport;
489 in_addr_t laddr, faddr;
492 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
493 INP_WLOCK_ASSERT(inp);
495 lport = inp->inp_lport;
496 laddr = inp->inp_laddr.s_addr;
/* anonport records that the PCB had no local port when this call began. */
497 anonport = (lport == 0);
498 error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
503 /* Do the initial binding of the local address if required. */
504 if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
505 inp->inp_lport = lport;
506 inp->inp_laddr.s_addr = laddr;
/* When hashing fails the local address is put back to INADDR_ANY. */
507 if (in_pcbinshash(inp) != 0) {
508 inp->inp_laddr.s_addr = INADDR_ANY;
514 /* Commit the remaining changes. */
515 inp->inp_lport = lport;
516 inp->inp_laddr.s_addr = laddr;
517 inp->inp_faddr.s_addr = faddr;
518 inp->inp_fport = fport;
522 inp->inp_flags |= INP_ANONPORT;
527 * Set up for a connect from a socket to the specified address.
528 * On entry, *laddrp and *lportp should contain the current local
529 * address and port for the PCB; these are updated to the values
530 * that should be placed in inp_laddr and inp_lport to complete
533 * On success, *faddrp and *fportp will be set to the remote address
534 * and port. These are not updated in the error case.
536 * If the operation fails because the connection already exists,
537 * *oinpp will be set to the PCB of that connection so that the
538 * caller can decide to override it. In all other cases, *oinpp
/*
 * Parameters as used by the visible code:
 *   nam            destination, read as a struct sockaddr_in.
 *   laddrp/lportp  in/out local address and port.
 *   faddrp/fportp  outputs for the foreign address and port.
 *   oinpp          see the comment preceding this function.
 *   cred           passed on to in_pcbbind_setup().
 * Several guards are missing from this listing; the notes below describe
 * only what the surviving lines show.
 */
542 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
543 in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
544 struct inpcb **oinpp, struct ucred *cred)
546 struct sockaddr_in *sin = (struct sockaddr_in *)nam;
547 struct in_ifaddr *ia;
548 struct sockaddr_in sa;
549 struct ucred *socred;
551 struct in_addr laddr, faddr;
552 u_short lport, fport;
556 * Because a global state change doesn't actually occur here, a read
557 * lock is sufficient.
559 INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo);
560 INP_LOCK_ASSERT(inp);
/* Destination validation: length, family, and a non-zero port. */
564 if (nam->sa_len != sizeof (*sin))
566 if (sin->sin_family != AF_INET)
567 return (EAFNOSUPPORT);
568 if (sin->sin_port == 0)
569 return (EADDRNOTAVAIL);
570 laddr.s_addr = *laddrp;
572 faddr = sin->sin_addr;
573 fport = sin->sin_port;
574 socred = inp->inp_socket->so_cred;
/* Jailed socket with no local address yet: bind to the prison's address. */
575 if (laddr.s_addr == INADDR_ANY && jailed(socred)) {
576 bzero(&sa, sizeof(sa));
577 sa.sin_addr.s_addr = htonl(prison_getip(socred));
578 sa.sin_len = sizeof(sa);
579 sa.sin_family = AF_INET;
580 error = in_pcbbind_setup(inp, (struct sockaddr *)&sa,
581 &laddr.s_addr, &lport, cred);
585 if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
587 * If the destination address is INADDR_ANY,
588 * use the primary local address.
589 * If the supplied address is INADDR_BROADCAST,
590 * and the primary interface supports broadcast,
591 * choose the broadcast address for that interface.
/* "Primary" here is the first entry of V_in_ifaddrhead. */
593 if (faddr.s_addr == INADDR_ANY)
594 faddr = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
595 else if (faddr.s_addr == (u_long)INADDR_BROADCAST &&
596 (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
598 faddr = satosin(&TAILQ_FIRST(
599 &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
/* No local address yet: derive one from the route or an attached network. */
601 if (laddr.s_addr == INADDR_ANY) {
604 * If route is known our src addr is taken from the i/f,
607 * Find out route to destination
/* The route lookup is skipped for SO_DONTROUTE sockets. */
609 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
610 ia = ip_rtaddr(faddr, inp->inp_inc.inc_fibnum);
612 * If we found a route, use the address corresponding to
613 * the outgoing interface.
615 * Otherwise assume faddr is reachable on a directly connected
616 * network and try to find a corresponding interface to take
617 * the source address from.
620 bzero(&sa, sizeof(sa));
622 sa.sin_len = sizeof(sa);
623 sa.sin_family = AF_INET;
625 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sa)));
627 ia = ifatoia(ifa_ifwithnet(sintosa(&sa)));
629 return (ENETUNREACH);
632 * If the destination address is multicast and an outgoing
633 * interface has been set as a multicast option, use the
634 * address of that interface as our source address.
636 if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
637 inp->inp_moptions != NULL) {
638 struct ip_moptions *imo;
641 imo = inp->inp_moptions;
642 if (imo->imo_multicast_ifp != NULL) {
643 ifp = imo->imo_multicast_ifp;
/* Scan the address list for an entry on the chosen multicast interface. */
644 TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link)
645 if (ia->ia_ifp == ifp)
648 return (EADDRNOTAVAIL);
651 laddr = ia->ia_addr.sin_addr;
/* Look for an existing PCB with the same 4-tuple. */
654 oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport,
/* nam == NULL: asks in_pcbbind_setup() for a port without a requested address. */
662 error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
667 *laddrp = laddr.s_addr;
669 *faddrp = faddr.s_addr;
/*
 * Clear the PCB's foreign address.  Both the pcbinfo write lock and the
 * inpcb write lock are asserted.  Further steps, if any, are not visible in
 * this listing.
 */
675 in_pcbdisconnect(struct inpcb *inp)
678 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
679 INP_WLOCK_ASSERT(inp);
681 inp->inp_faddr.s_addr = INADDR_ANY;
687 * In the old world order, in_pcbdetach() served two functions: to detach the
688 * pcb from the socket/potentially free the socket, and to free the pcb
689 * itself. In the new world order, the protocol code is responsible for
690 * managing the relationship with the socket, and this code simply frees the
/*
 * Sever the link between a PCB and its socket in both directions.  The PCB
 * must still have a socket (KASSERT); the PCB itself is not freed by the
 * visible lines.
 */
694 in_pcbdetach(struct inpcb *inp)
697 KASSERT(inp->inp_socket != NULL, ("in_pcbdetach: inp_socket == NULL"));
/* Clear the socket's pointer first, while inp_socket is still valid. */
698 inp->inp_socket->so_pcb = NULL;
699 inp->inp_socket = NULL;
/*
 * Release a PCB that has already been detached from its socket (KASSERT on
 * inp_socket == NULL).  Both the pcbinfo write lock and the inpcb write lock
 * are asserted.  Frees the IPsec policy, IP options mbuf, multicast options
 * and MAC label (some of these calls are presumably conditionally compiled;
 * the guards are not visible), then returns the PCB to its zone.
 */
703 in_pcbfree(struct inpcb *inp)
705 struct inpcbinfo *ipi = inp->inp_pcbinfo;
707 KASSERT(inp->inp_socket == NULL, ("in_pcbfree: inp_socket != NULL"));
709 INP_INFO_WLOCK_ASSERT(ipi);
710 INP_WLOCK_ASSERT(inp);
713 ipsec4_delete_pcbpolicy(inp);
/* Bump the generation count so the change is visible to gencnt readers. */
715 inp->inp_gencnt = ++ipi->ipi_gencnt;
717 if (inp->inp_options)
718 (void)m_free(inp->inp_options);
719 if (inp->inp_moptions != NULL)
720 inp_freemoptions(inp->inp_moptions);
724 mac_inpcb_destroy(inp);
727 uma_zfree(ipi->ipi_zone, inp);
731 * TCP needs to maintain its inpcb structure after the TCP connection has
732 * been torn down. However, it must be disconnected from the inpcb hashes as
733 * it must not prevent binding of future connections to the same port/ip
734 * combination by other inpcbs.
/*
 * Mark the PCB INP_DROPPED and, if it has a local port, take it off the
 * address hash and the per-port list.  The port head is unlinked from the
 * port hash once its PCB list becomes empty.  Unlike in_pcbremlists(), the
 * visible lines do not remove the PCB from the global inp_list.
 */
737 in_pcbdrop(struct inpcb *inp)
740 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
741 INP_WLOCK_ASSERT(inp);
743 inp->inp_vflag |= INP_DROPPED;
744 if (inp->inp_lport) {
745 struct inpcbport *phd = inp->inp_phd;
747 LIST_REMOVE(inp, inp_hash);
748 LIST_REMOVE(inp, inp_portlist);
/* Last PCB on this port: the port head leaves the port hash too. */
749 if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
750 LIST_REMOVE(phd, phd_hash);
758 * Common routines to return the socket addresses associated with inpcbs.
/*
 * Build a newly allocated (M_SONAME) struct sockaddr_in for `port` and
 * `*addr_p` and return it as a struct sockaddr pointer.  `port` is stored
 * as given, without byte-order conversion.  The allocation flags are not
 * visible in this listing.
 */
761 in_sockaddr(in_port_t port, struct in_addr *addr_p)
763 struct sockaddr_in *sin;
765 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
767 sin->sin_family = AF_INET;
768 sin->sin_len = sizeof(*sin);
769 sin->sin_addr = *addr_p;
770 sin->sin_port = port;
772 return (struct sockaddr *)sin;
/*
 * Return the socket's local address: the PCB's local port and address are
 * copied into locals and then passed to in_sockaddr(), whose result is
 * stored in *nam.  The socket must have a PCB (KASSERT).
 */
776 in_getsockaddr(struct socket *so, struct sockaddr **nam)
783 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
786 port = inp->inp_lport;
787 addr = inp->inp_laddr;
790 *nam = in_sockaddr(port, &addr);
/*
 * Return the socket's peer address: the PCB's foreign port and address are
 * copied into locals and then passed to in_sockaddr(), whose result is
 * stored in *nam.  The socket must have a PCB (KASSERT).
 */
795 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
802 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
805 port = inp->inp_fport;
806 addr = inp->inp_faddr;
809 *nam = in_sockaddr(port, &addr);
/*
 * Call `notify(inp, errno)` on PCBs in pcbinfo, with the pcbinfo write lock
 * held for the whole walk.  The loop tests for PCBs lacking INP_IPV4, for a
 * foreign address different from `faddr`, and for a NULL socket; the action
 * taken on those tests is not visible in this listing (presumably skip).
 * The SAFE iterator tolerates removal of the current PCB.
 */
814 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
815 struct inpcb *(*notify)(struct inpcb *, int))
817 struct inpcb *inp, *inp_temp;
819 INP_INFO_WLOCK(pcbinfo);
820 LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
823 if ((inp->inp_vflag & INP_IPV4) == 0) {
828 if (inp->inp_faddr.s_addr != faddr.s_addr ||
829 inp->inp_socket == NULL) {
/* The callback's return value is tested; what happens next is not shown. */
833 if ((*notify)(inp, errno))
836 INP_INFO_WUNLOCK(pcbinfo);
/*
 * Remove references to interface `ifp` from the multicast options of the
 * IPv4 PCBs in pcbinfo: the outgoing multicast interface is cleared and
 * memberships joined through `ifp` are dropped.  The pcbinfo read lock is
 * held for the walk; any per-PCB locking is not visible in this listing.
 */
840 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
843 struct ip_moptions *imo;
846 INP_INFO_RLOCK(pcbinfo);
847 LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
849 imo = inp->inp_moptions;
850 if ((inp->inp_vflag & INP_IPV4) &&
853 * Unselect the outgoing interface if it is being
856 if (imo->imo_multicast_ifp == ifp)
857 imo->imo_multicast_ifp = NULL;
860 * Drop multicast group membership if we joined
861 * through the interface being detached.
/*
 * Compaction: surviving entries are copied down by `gap` slots and the
 * membership count is reduced by `gap` afterwards.
 */
863 for (i = 0, gap = 0; i < imo->imo_num_memberships;
865 if (imo->imo_membership[i]->inm_ifp == ifp) {
866 in_delmulti(imo->imo_membership[i]);
869 imo->imo_membership[i - gap] =
870 imo->imo_membership[i];
872 imo->imo_num_memberships -= gap;
876 INP_INFO_RUNLOCK(pcbinfo);
880 * Lookup a PCB based on the local address and port.
/*
 * Extra match cost added for a PCB that carries INP_IPV6 (see the comment
 * inside the loop below); undefined again after the function.
 */
882 #define INP_LOOKUP_MAPPED_PCB_COST 3
/*
 * Two lookup strategies are visible: a scan of the address hash bucket for
 * (INADDR_ANY, lport, 0) looking for an unconnected PCB with exactly this
 * laddr/lport, and a "best fit" walk of the per-port PCB list that scores
 * each candidate in `wildcard` and keeps the lowest score.  Which one runs
 * is presumably selected by `wild_okay`; the branch is not visible in this
 * listing.  The pcbinfo lock is asserted.
 */
884 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
885 u_short lport, int wild_okay, struct ucred *cred)
/* Starts above the largest score a candidate can reach in the visible code. */
889 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
895 INP_INFO_LOCK_ASSERT(pcbinfo);
898 struct inpcbhead *head;
900 * Look for an unconnected (wildcard foreign addr) PCB that
901 * matches the local address and port we're looking for.
903 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
904 0, pcbinfo->ipi_hashmask)];
905 LIST_FOREACH(inp, head, inp_hash) {
907 if ((inp->inp_vflag & INP_IPV4) == 0)
910 if (inp->inp_faddr.s_addr == INADDR_ANY &&
911 inp->inp_laddr.s_addr == laddr.s_addr &&
912 inp->inp_lport == lport) {
924 struct inpcbporthead *porthash;
925 struct inpcbport *phd;
926 struct inpcb *match = NULL;
928 * Best fit PCB lookup.
930 * First see if this local port is in use by looking on the
933 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
934 pcbinfo->ipi_porthashmask)];
935 LIST_FOREACH(phd, porthash, phd_hash) {
936 if (phd->phd_port == lport)
941 * Port is in use by one or more PCBs. Look for best
944 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
947 if ((inp->inp_vflag & INP_IPV4) == 0)
950 * We never select the PCB that has
951 * INP_IPV6 flag and is bound to :: if
952 * we have another PCB which is bound
953 * to 0.0.0.0. If a PCB has the
954 * INP_IPV6 flag, then we set its cost
955 * higher than IPv4 only PCBs.
957 * Note that the case only happens
958 * when a socket is bound to ::, under
959 * the condition that the use of the
960 * mapped address is allowed.
962 if ((inp->inp_vflag & INP_IPV6) != 0)
963 wildcard += INP_LOOKUP_MAPPED_PCB_COST;
/* The following tests compare foreign/local addresses for scoring. */
965 if (inp->inp_faddr.s_addr != INADDR_ANY)
967 if (inp->inp_laddr.s_addr != INADDR_ANY) {
968 if (laddr.s_addr == INADDR_ANY)
970 else if (inp->inp_laddr.s_addr != laddr.s_addr)
973 if (laddr.s_addr != INADDR_ANY)
/* Keep the best candidate so far; a score of 0 is tested for specially. */
976 if (wildcard < matchwild) {
978 matchwild = wildcard;
979 if (matchwild == 0) {
988 #undef INP_LOOKUP_MAPPED_PCB_COST
991 * Lookup PCB in hash list.
/*
 * Look a PCB up by 4-tuple.  First the bucket for (faddr, lport, fport) is
 * scanned for an exact match on all four fields.  Then, if wildcard matching
 * is requested, the bucket for (INADDR_ANY, lport, 0) is scanned for
 * unconnected PCBs on `lport`: a PCB bound to exactly `laddr` is tested for
 * first, and PCBs bound to INADDR_ANY are remembered in local_wild /
 * local_wild_mapped as fallbacks.  The pcbinfo lock is asserted.
 */
994 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
995 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
998 struct inpcbhead *head;
/* The u_int port arguments are narrowed to u_short here. */
1000 u_short fport = fport_arg, lport = lport_arg;
1002 INP_INFO_LOCK_ASSERT(pcbinfo);
1005 * First look for an exact match.
1007 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
1008 pcbinfo->ipi_hashmask)];
1009 LIST_FOREACH(inp, head, inp_hash) {
1011 if ((inp->inp_vflag & INP_IPV4) == 0)
1014 if (inp->inp_faddr.s_addr == faddr.s_addr &&
1015 inp->inp_laddr.s_addr == laddr.s_addr &&
1016 inp->inp_fport == fport &&
1017 inp->inp_lport == lport)
1022 * Then look for a wildcard match, if requested.
1025 struct inpcb *local_wild = NULL;
1027 struct inpcb *local_wild_mapped = NULL;
1030 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
1031 0, pcbinfo->ipi_hashmask)];
1032 LIST_FOREACH(inp, head, inp_hash) {
1034 if ((inp->inp_vflag & INP_IPV4) == 0)
1037 if (inp->inp_faddr.s_addr == INADDR_ANY &&
1038 inp->inp_lport == lport) {
/* Packets from an IFT_FAITH interface are tested against INP_FAITH. */
1039 if (ifp && ifp->if_type == IFT_FAITH &&
1040 (inp->inp_flags & INP_FAITH) == 0)
1042 if (inp->inp_laddr.s_addr == laddr.s_addr)
1044 else if (inp->inp_laddr.s_addr == INADDR_ANY) {
1046 if (INP_CHECK_SOCKAF(inp->inp_socket,
1048 local_wild_mapped = inp;
/* local_wild is preferred; local_wild_mapped is the fallback when it is NULL. */
1056 if (local_wild == NULL)
1057 return (local_wild_mapped);
1059 return (local_wild);
1065 * Insert PCB onto various hash lists.
/*
 * Enter the PCB into the address hash and the per-port list for its local
 * port.  A port head (struct inpcbport) is looked up in the port hash and
 * one is allocated with M_NOWAIT when needed; ENOBUFS is the visible error
 * return.  Both the pcbinfo write lock and the inpcb write lock are
 * asserted.
 */
1068 in_pcbinshash(struct inpcb *inp)
1070 struct inpcbhead *pcbhash;
1071 struct inpcbporthead *pcbporthash;
1072 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1073 struct inpcbport *phd;
1074 u_int32_t hashkey_faddr;
1076 INP_INFO_WLOCK_ASSERT(pcbinfo);
1077 INP_WLOCK_ASSERT(inp);
/* IPv6 PCBs hash on the last 32 bits of the foreign address. */
1080 if (inp->inp_vflag & INP_IPV6)
1081 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
1084 hashkey_faddr = inp->inp_faddr.s_addr;
1086 pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
1087 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
1089 pcbporthash = &pcbinfo->ipi_porthashbase[
1090 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
1093 * Go through port list and look for a head for this lport.
1095 LIST_FOREACH(phd, pcbporthash, phd_hash) {
1096 if (phd->phd_port == inp->inp_lport)
1100 * If none exists, malloc one and tack it on.
1103 MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_NOWAIT);
1105 return (ENOBUFS); /* XXX */
1107 phd->phd_port = inp->inp_lport;
1108 LIST_INIT(&phd->phd_pcblist);
1109 LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
/* Link the PCB onto both the port head's list and the address hash bucket. */
1112 LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
1113 LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
1118 * Move PCB to the proper hash bucket when { faddr, fport } have been
1119 * changed. NOTE: This does not handle the case of the lport changing (the
1120 * hashed port list would have to be updated as well), so the lport must
1121 * not change after in_pcbinshash() has been called.
/*
 * Recompute the PCB's address-hash bucket from its current foreign address,
 * local port and foreign port, then move it there.  Only the inp_hash
 * linkage is touched; the per-port list is left alone.  Both the pcbinfo
 * write lock and the inpcb write lock are asserted.
 */
1124 in_pcbrehash(struct inpcb *inp)
1126 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1127 struct inpcbhead *head;
1128 u_int32_t hashkey_faddr;
1130 INP_INFO_WLOCK_ASSERT(pcbinfo);
1131 INP_WLOCK_ASSERT(inp);
/* Same hash-key selection as in_pcbinshash(). */
1134 if (inp->inp_vflag & INP_IPV6)
1135 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
1138 hashkey_faddr = inp->inp_faddr.s_addr;
1140 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
1141 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
1143 LIST_REMOVE(inp, inp_hash);
1144 LIST_INSERT_HEAD(head, inp, inp_hash);
1148 * Remove PCB from various lists.
/*
 * Unlink the PCB from every list it is on: the address hash and per-port
 * list (only when it has a local port) and the global PCB list.  Also bumps
 * the generation count and decrements the PCB count.  Both the pcbinfo write
 * lock and the inpcb write lock are asserted.
 */
1151 in_pcbremlists(struct inpcb *inp)
1153 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1155 INP_INFO_WLOCK_ASSERT(pcbinfo);
1156 INP_WLOCK_ASSERT(inp);
1158 inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
1159 if (inp->inp_lport) {
1160 struct inpcbport *phd = inp->inp_phd;
1162 LIST_REMOVE(inp, inp_hash);
1163 LIST_REMOVE(inp, inp_portlist);
/* Last PCB on this port: the port head leaves the port hash too. */
1164 if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1165 LIST_REMOVE(phd, phd_hash);
1169 LIST_REMOVE(inp, inp_list);
1170 pcbinfo->ipi_count--;
1174 * A set label operation has occurred at the socket layer, propagate the
1175 * label change into the in_pcb for the socket.
/*
 * Fetch the socket's inpcb (which must exist, per the KASSERT) and hand both
 * to mac_inpcb_sosetlabel().  Any locking around the call is not visible in
 * this listing.
 */
1178 in_pcbsosetlabel(struct socket *so)
1183 inp = sotoinpcb(so);
1184 KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
1188 mac_inpcb_sosetlabel(so, inp);
1195 * ipport_tick runs once per second, determining if random port allocation
1196 * should be continued. If more than ipport_randomcps ports have been
1197 * allocated in the last second, then we return to sequential port
1198 * allocation. We return to random allocation only once we drop below
1199 * ipport_randomcps for at least ipport_randomtime seconds.
/*
 * Callout handler that re-arms itself every `hz` ticks.  `xtp` is not used
 * by the visible lines.
 */
1202 ipport_tick(void *xtp)
/* At most randomcps new allocations since the last tick: count down. */
1205 if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) {
1206 if (V_ipport_stoprandom > 0)
1207 V_ipport_stoprandom--;
/* Presumably the else branch: reload the hold-off timer. */
1209 V_ipport_stoprandom = V_ipport_randomtime;
/* Remember the current total as the baseline for the next tick. */
1210 V_ipport_tcplastcount = V_ipport_tcpallocs;
1211 callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
/*
 * Lock wrappers taking an inpcb.  Only the signature lines survive in this
 * listing; going by the names they presumably wrap the inpcb write/read
 * lock and unlock operations -- confirm against the full source.
 */
1215 inp_wlock(struct inpcb *inp)
1222 inp_wunlock(struct inpcb *inp)
1229 inp_rlock(struct inpcb *inp)
1236 inp_runlock(struct inpcb *inp)
/* Function form of INP_WLOCK_ASSERT(): asserts the write lock specifically. */
1244 inp_lock_assert(struct inpcb *inp)
1247 INP_WLOCK_ASSERT(inp);
/* Function form of INP_UNLOCK_ASSERT(). */
1251 inp_unlock_assert(struct inpcb *inp)
1254 INP_UNLOCK_ASSERT(inp);
/*
 * Walk the TCP PCB list with the tcbinfo read lock held; `func` and `arg`
 * are presumably applied to each PCB in the loop body, which is not visible
 * in this listing.  The list head is reached through V_tcbinfo, the same
 * name used for the lock and unlock calls, so that the lock taken and the
 * list walked always belong to the same pcbinfo instance.
 */
1259 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
1263 INP_INFO_RLOCK(&V_tcbinfo);
1264 LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
1269 INP_INFO_RUNLOCK(&V_tcbinfo);
/* Return the PCB's socket pointer; the inpcb write lock is asserted. */
1273 inp_inpcbtosocket(struct inpcb *inp)
1276 INP_WLOCK_ASSERT(inp);
1277 return (inp->inp_socket);
/*
 * Return inp_ppcb cast to struct tcpcb *; the inpcb write lock is asserted.
 * No check that the PCB actually belongs to TCP is visible.
 */
1281 inp_inpcbtotcpcb(struct inpcb *inp)
1284 INP_WLOCK_ASSERT(inp);
1285 return ((struct tcpcb *)inp->inp_ppcb);
/* Accessor: return the PCB's inp_ip_tos field. */
1289 inp_ip_tos_get(const struct inpcb *inp)
1292 return (inp->inp_ip_tos);
/* Accessor: store `val` in the PCB's inp_ip_tos field. */
1296 inp_ip_tos_set(struct inpcb *inp, int val)
1299 inp->inp_ip_tos = val;
/*
 * Copy the PCB's IPv4 4-tuple into the caller's variables.  Values are
 * copied as stored in the PCB, with no byte-order conversion.  The inpcb
 * lock is asserted.
 */
1303 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
1304 uint32_t *faddr, uint16_t *fp)
1307 INP_LOCK_ASSERT(inp);
1308 *laddr = inp->inp_laddr.s_addr;
1309 *faddr = inp->inp_faddr.s_addr;
1310 *lp = inp->inp_lport;
1311 *fp = inp->inp_fport;
/* Function form of sotoinpcb(). */
1315 so_sotoinpcb(struct socket *so)
1318 return (sotoinpcb(so));
/* Function form of sototcpcb(). */
1322 so_sototcpcb(struct socket *so)
1325 return (sototcpcb(so));
/*
 * Debugger output helper: loops `indent` times (the per-iteration output
 * call is not visible in this listing).
 */
1330 db_print_indent(int indent)
1334 for (i = 0; i < indent; i++)
/*
 * Debugger dump of a struct in_conninfo: a header line with `name` and the
 * pointer, then the local and foreign address/port pairs.  Addresses are
 * formatted into 48-byte buffers; ports are converted with ntohs() for
 * display.
 */
1339 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
1341 char faddr_str[48], laddr_str[48];
1343 db_print_indent(indent);
1344 db_printf("%s at %p\n", name, inc);
/*
 * NOTE(review): selects IPv6 formatting by comparing inc_flags for equality
 * with the literal 1 rather than testing a flag bit -- confirm this is
 * intended if inc_flags can carry other bits.
 */
1349 if (inc->inc_flags == 1) {
1351 ip6_sprintf(laddr_str, &inc->inc6_laddr);
1352 ip6_sprintf(faddr_str, &inc->inc6_faddr);
1356 inet_ntoa_r(inc->inc_laddr, laddr_str);
1357 inet_ntoa_r(inc->inc_faddr, faddr_str);
1361 db_print_indent(indent);
1362 db_printf("inc_laddr %s inc_lport %u\n", laddr_str,
1363 ntohs(inc->inc_lport));
1364 db_print_indent(indent);
1365 db_printf("inc_faddr %s inc_fport %u\n", faddr_str,
1366 ntohs(inc->inc_fport));
/*
 * Debugger helper: print the symbolic name of every bit set in `inp_flags`.
 * Each entry is emitted as "<separator><NAME>", where the separator is ", "
 * once `comma` is non-zero and empty otherwise (the updates of `comma` are
 * not visible in this listing).  Every entry, including the final IN6P_MTU
 * one, puts the separator before the name so the list never ends with a
 * dangling ", " or runs two names together.
 */
1370 db_print_inpflags(int inp_flags)
1375 if (inp_flags & INP_RECVOPTS) {
1376 db_printf("%sINP_RECVOPTS", comma ? ", " : "");
1379 if (inp_flags & INP_RECVRETOPTS) {
1380 db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
1383 if (inp_flags & INP_RECVDSTADDR) {
1384 db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
1387 if (inp_flags & INP_HDRINCL) {
1388 db_printf("%sINP_HDRINCL", comma ? ", " : "");
1391 if (inp_flags & INP_HIGHPORT) {
1392 db_printf("%sINP_HIGHPORT", comma ? ", " : "");
1395 if (inp_flags & INP_LOWPORT) {
1396 db_printf("%sINP_LOWPORT", comma ? ", " : "");
1399 if (inp_flags & INP_ANONPORT) {
1400 db_printf("%sINP_ANONPORT", comma ? ", " : "");
1403 if (inp_flags & INP_RECVIF) {
1404 db_printf("%sINP_RECVIF", comma ? ", " : "");
1407 if (inp_flags & INP_MTUDISC) {
1408 db_printf("%sINP_MTUDISC", comma ? ", " : "");
1411 if (inp_flags & INP_FAITH) {
1412 db_printf("%sINP_FAITH", comma ? ", " : "");
1415 if (inp_flags & INP_RECVTTL) {
1416 db_printf("%sINP_RECVTTL", comma ? ", " : "");
1419 if (inp_flags & INP_DONTFRAG) {
1420 db_printf("%sINP_DONTFRAG", comma ? ", " : "");
1423 if (inp_flags & IN6P_IPV6_V6ONLY) {
1424 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
1427 if (inp_flags & IN6P_PKTINFO) {
1428 db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
1431 if (inp_flags & IN6P_HOPLIMIT) {
1432 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
1435 if (inp_flags & IN6P_HOPOPTS) {
1436 db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
1439 if (inp_flags & IN6P_DSTOPTS) {
1440 db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
1443 if (inp_flags & IN6P_RTHDR) {
1444 db_printf("%sIN6P_RTHDR", comma ? ", " : "");
1447 if (inp_flags & IN6P_RTHDRDSTOPTS) {
1448 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
1451 if (inp_flags & IN6P_TCLASS) {
1452 db_printf("%sIN6P_TCLASS", comma ? ", " : "");
1455 if (inp_flags & IN6P_AUTOFLOWLABEL) {
1456 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
1459 if (inp_flags & IN6P_RFC2292) {
1460 db_printf("%sIN6P_RFC2292", comma ? ", " : "");
1463 if (inp_flags & IN6P_MTU) {
1464 db_printf("%sIN6P_MTU", comma ? ", " : "");
/*
 * Debugger helper: print the symbolic name of every bit set in `inp_vflag`,
 * each as "<separator><NAME>" with the separator chosen by `comma` (whose
 * updates are not visible in this listing).
 */
1470 db_print_inpvflag(u_char inp_vflag)
1475 if (inp_vflag & INP_IPV4) {
1476 db_printf("%sINP_IPV4", comma ? ", " : "");
1479 if (inp_vflag & INP_IPV6) {
1480 db_printf("%sINP_IPV6", comma ? ", " : "");
1483 if (inp_vflag & INP_IPV6PROTO) {
1484 db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
1487 if (inp_vflag & INP_TIMEWAIT) {
1488 db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
1491 if (inp_vflag & INP_ONESBCAST) {
1492 db_printf("%sINP_ONESBCAST", comma ? ", " : "");
1495 if (inp_vflag & INP_DROPPED) {
1496 db_printf("%sINP_DROPPED", comma ? ", " : "");
1499 if (inp_vflag & INP_SOCKREF) {
1500 db_printf("%sINP_SOCKREF", comma ? ", " : "");
/*
 * Debugger dump of one inpcb: header line, flow, connection info (via
 * db_print_inconninfo), pointers, decoded inp_flags and inp_vflag, TTL and
 * protocol fields, then either the IPv6 or the IPv4 option pointers, and
 * finally the port head and generation count.
 */
1506 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
1509 db_print_indent(indent);
1510 db_printf("%s at %p\n", name, inp);
1514 db_print_indent(indent);
1515 db_printf("inp_flow: 0x%x\n", inp->inp_flow);
1517 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
1519 db_print_indent(indent);
1520 db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n",
1521 inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
1523 db_print_indent(indent);
/* The opening "(" is followed by the decoded flag names. */
1524 db_printf("inp_label: %p inp_flags: 0x%x (",
1525 inp->inp_label, inp->inp_flags);
1526 db_print_inpflags(inp->inp_flags);
1529 db_print_indent(indent);
1530 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp,
1532 db_print_inpvflag(inp->inp_vflag);
1535 db_print_indent(indent);
1536 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n",
1537 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
1539 db_print_indent(indent);
/* PCBs flagged INP_IPV6 print the in6p_* fields; the inp_ip_* fields follow. */
1541 if (inp->inp_vflag & INP_IPV6) {
1542 db_printf("in6p_options: %p in6p_outputopts: %p "
1543 "in6p_moptions: %p\n", inp->in6p_options,
1544 inp->in6p_outputopts, inp->in6p_moptions);
1545 db_printf("in6p_icmp6filt: %p in6p_cksum %d "
1546 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
1551 db_printf("inp_ip_tos: %d inp_ip_options: %p "
1552 "inp_ip_moptions: %p\n", inp->inp_ip_tos,
1553 inp->inp_options, inp->inp_moptions);
1556 db_print_indent(indent);
/* inp_gencnt is cast to uintmax_t to match the %ju conversion. */
1557 db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd,
1558 (uintmax_t)inp->inp_gencnt);
/*
 * Debugger command "show inpcb <addr>": treats `addr` as a pointer to a
 * struct inpcb and dumps it with db_print_inpcb() at indent 0.  The usage
 * string is printed on a condition that is not visible in this listing
 * (presumably when no address was supplied).
 */
1561 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
1566 db_printf("usage: show inpcb <addr>\n");
/* The address is used as given; no validation of it is visible. */
1569 inp = (struct inpcb *)addr;
1571 db_print_inpcb(inp, "inpcb", 0);