]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/udp_usrreq.c
[udp] fix possible mbuf and lock leak in udp_input().
[FreeBSD/FreeBSD.git] / sys / netinet / udp_usrreq.c
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
5  *      The Regents of the University of California.
6  * Copyright (c) 2008 Robert N. M. Watson
7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
8  * Copyright (c) 2014 Kevin Lo
9  * All rights reserved.
10  *
11  * Portions of this software were developed by Robert N. M. Watson under
12  * contract to Juniper Networks, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *      @(#)udp_usrreq.c        8.6 (Berkeley) 5/23/95
39  */
40
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43
44 #include "opt_inet.h"
45 #include "opt_inet6.h"
46 #include "opt_ipsec.h"
47 #include "opt_route.h"
48 #include "opt_rss.h"
49
50 #include <sys/param.h>
51 #include <sys/domain.h>
52 #include <sys/eventhandler.h>
53 #include <sys/jail.h>
54 #include <sys/kernel.h>
55 #include <sys/lock.h>
56 #include <sys/malloc.h>
57 #include <sys/mbuf.h>
58 #include <sys/priv.h>
59 #include <sys/proc.h>
60 #include <sys/protosw.h>
61 #include <sys/sdt.h>
62 #include <sys/signalvar.h>
63 #include <sys/socket.h>
64 #include <sys/socketvar.h>
65 #include <sys/sx.h>
66 #include <sys/sysctl.h>
67 #include <sys/syslog.h>
68 #include <sys/systm.h>
69
70 #include <vm/uma.h>
71
72 #include <net/if.h>
73 #include <net/if_var.h>
74 #include <net/route.h>
75 #include <net/route/nhop.h>
76 #include <net/rss_config.h>
77
78 #include <netinet/in.h>
79 #include <netinet/in_kdtrace.h>
80 #include <netinet/in_fib.h>
81 #include <netinet/in_pcb.h>
82 #include <netinet/in_systm.h>
83 #include <netinet/in_var.h>
84 #include <netinet/ip.h>
85 #ifdef INET6
86 #include <netinet/ip6.h>
87 #endif
88 #include <netinet/ip_icmp.h>
89 #include <netinet/icmp_var.h>
90 #include <netinet/ip_var.h>
91 #include <netinet/ip_options.h>
92 #ifdef INET6
93 #include <netinet6/ip6_var.h>
94 #endif
95 #include <netinet/udp.h>
96 #include <netinet/udp_var.h>
97 #include <netinet/udplite.h>
98 #include <netinet/in_rss.h>
99
100 #include <netipsec/ipsec_support.h>
101
102 #include <machine/in_cksum.h>
103
104 #include <security/mac/mac_framework.h>
105
106 /*
107  * UDP and UDP-Lite protocols implementation.
108  * Per RFC 768, August, 1980.
109  * Per RFC 3828, July, 2004.
110  */
111
112 /*
113  * BSD 4.2 defaulted the udp checksum to be off.  Turning off udp checksums
114  * removes the only data integrity mechanism for packets and malformed
115  * packets that would otherwise be discarded due to bad checksums, and may
116  * cause problems (especially for NFS data blocks).
117  */
/* Per-vnet switch, on by default, exported as the "checksum" sysctl. */
VNET_DEFINE(int, udp_cksum) = 1;
SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_cksum), 0, "compute udp checksum");

/*
 * Per-vnet switch, off by default.  When set, udp_input() logs datagrams
 * for which no matching pcb was found.
 */
VNET_DEFINE(int, udp_log_in_vain) = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_log_in_vain), 0, "Log all incoming UDP packets");

/*
 * Per-vnet switch, off by default.  When set, udp_input() drops datagrams
 * with no matching pcb instead of answering with an ICMP port unreachable.
 */
VNET_DEFINE(int, udp_blackhole) = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_blackhole), 0,
    "Do not send port unreachables for refused connects");

/* Global (not per-vnet) tunable exported as the "maxdgram" sysctl. */
u_long  udp_sendspace = 9216;           /* really max datagram size */
SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
    &udp_sendspace, 0, "Maximum outgoing UDP datagram size");

/*
 * Global tunable exported as the "recvspace" sysctl.  Sized for 40
 * datagrams of 1K each plus one source sockaddr per datagram; the
 * sockaddr is the IPv6 one when INET6 is compiled in.
 */
u_long  udp_recvspace = 40 * (1024 +
#ifdef INET6
                                      sizeof(struct sockaddr_in6)
#else
                                      sizeof(struct sockaddr_in)
#endif
                                      );        /* 40 1K datagrams */

SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
    &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");

/* Per-vnet pcb lists and pcbinfo structures for UDP and UDP-Lite. */
VNET_DEFINE(struct inpcbhead, udb);             /* from udp_var.h */
VNET_DEFINE(struct inpcbinfo, udbinfo);
VNET_DEFINE(struct inpcbhead, ulitecb);
VNET_DEFINE(struct inpcbinfo, ulitecbinfo);
/* Per-vnet UMA zone from which struct udpcb is allocated (see udp_init()). */
VNET_DEFINE_STATIC(uma_zone_t, udpcb_zone);
#define V_udpcb_zone                    VNET(udpcb_zone)

/* Hash size handed to in_pcbinfo_init(); may be overridden at build time. */
#ifndef UDBHASHSIZE
#define UDBHASHSIZE     128
#endif

/* Per-vnet, per-CPU UDP statistics, exported as the "stats" sysctl. */
VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat);          /* from udp_var.h */
VNET_PCPUSTAT_SYSINIT(udpstat);
SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat,
    udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");

#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(udpstat);
#endif /* VIMAGE */
/* Forward declarations for IPv4-only routines defined later in the file. */
#ifdef INET
static void     udp_detach(struct socket *so);
static int      udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
                    struct mbuf *, struct thread *, int);
#endif
170
171 static void
172 udp_zone_change(void *tag)
173 {
174
175         uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
176         uma_zone_set_max(V_udpcb_zone, maxsockets);
177 }
178
/*
 * inpcb init callback handed to in_pcbinfo_init() for UDP: initializes the
 * lock of the inpcb located at mem.  size and flags are ignored.
 * Always returns 0.
 */
static int
udp_inpcb_init(void *mem, int size, int flags)
{
        struct inpcb *pcb = mem;

        INP_LOCK_INIT(pcb, "inp", "udpinp");
        return (0);
}
188
/*
 * inpcb init callback handed to in_pcbinfo_init() for UDP-Lite: initializes
 * the lock of the inpcb located at mem.  size and flags are ignored.
 * Always returns 0.
 */
static int
udplite_inpcb_init(void *mem, int size, int flags)
{
        struct inpcb *pcb = mem;

        INP_LOCK_INIT(pcb, "inp", "udpliteinp");
        return (0);
}
198
199 void
200 udp_init(void)
201 {
202
203         /*
204          * For now default to 2-tuple UDP hashing - until the fragment
205          * reassembly code can also update the flowid.
206          *
207          * Once we can calculate the flowid that way and re-establish
208          * a 4-tuple, flip this to 4-tuple.
209          */
210         in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
211             "udp_inpcb", udp_inpcb_init, IPI_HASHFIELDS_2TUPLE);
212         V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
213             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
214         uma_zone_set_max(V_udpcb_zone, maxsockets);
215         uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
216         EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
217             EVENTHANDLER_PRI_ANY);
218 }
219
220 void
221 udplite_init(void)
222 {
223
224         in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE,
225             UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init,
226             IPI_HASHFIELDS_2TUPLE);
227 }
228
229 /*
230  * Kernel module interface for updating udpstat.  The argument is an index
231  * into udpstat treated as an array of u_long.  While this encodes the
232  * general layout of udpstat into the caller, it doesn't encode its location,
233  * so that future changes to add, for example, per-CPU stats support won't
234  * cause binary compatibility problems for kernel modules.
235  */
236 void
237 kmod_udpstat_inc(int statnum)
238 {
239
240         counter_u64_add(VNET(udpstat)[statnum], 1);
241 }
242
243 int
244 udp_newudpcb(struct inpcb *inp)
245 {
246         struct udpcb *up;
247
248         up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO);
249         if (up == NULL)
250                 return (ENOBUFS);
251         inp->inp_ppcb = up;
252         return (0);
253 }
254
255 void
256 udp_discardcb(struct udpcb *up)
257 {
258
259         uma_zfree(V_udpcb_zone, up);
260 }
261
262 #ifdef VIMAGE
263 static void
264 udp_destroy(void *unused __unused)
265 {
266
267         in_pcbinfo_destroy(&V_udbinfo);
268         uma_zdestroy(V_udpcb_zone);
269 }
270 VNET_SYSUNINIT(udp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_destroy, NULL);
271
272 static void
273 udplite_destroy(void *unused __unused)
274 {
275
276         in_pcbinfo_destroy(&V_ulitecbinfo);
277 }
278 VNET_SYSUNINIT(udplite, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udplite_destroy,
279     NULL);
280 #endif
281
282 #ifdef INET
283 /*
284  * Subroutine of udp_input(), which appends the provided mbuf chain to the
285  * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
286  * contains the source address.  If the socket ends up being an IPv6 socket,
287  * udp_append() will convert to a sockaddr_in6 before passing the address
288  * into the socket code.
289  *
290  * In the normal case udp_append() will return 0, indicating that you
291  * must unlock the inp. However if a tunneling protocol is in place we increment
292  * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we
293  * then decrement the reference count. If the inp_rele returns 1, indicating the
294  * inp is gone, we return that to the caller to tell them *not* to unlock
295  * the inp. In the case of multi-cast this will cause the distribution
296  * to stop (though most tunneling protocols known currently do *not* use
297  * multicast).
298  */
299 static int
300 udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
301     struct sockaddr_in *udp_in)
302 {
303         struct sockaddr *append_sa;
304         struct socket *so;
305         struct mbuf *tmpopts, *opts = NULL;
306 #ifdef INET6
307         struct sockaddr_in6 udp_in6;
308 #endif
309         struct udpcb *up;
310
311         INP_LOCK_ASSERT(inp);
312
313         /*
314          * Engage the tunneling protocol.
315          */
316         up = intoudpcb(inp);
317         if (up->u_tun_func != NULL) {
318                 in_pcbref(inp);
319                 INP_RUNLOCK(inp);
320                 (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&udp_in[0],
321                     up->u_tun_ctx);
322                 INP_RLOCK(inp);
323                 return (in_pcbrele_rlocked(inp));
324         }
325
326         off += sizeof(struct udphdr);
327
328 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
329         /* Check AH/ESP integrity. */
330         if (IPSEC_ENABLED(ipv4) &&
331             IPSEC_CHECK_POLICY(ipv4, n, inp) != 0) {
332                 m_freem(n);
333                 return (0);
334         }
335         if (up->u_flags & UF_ESPINUDP) {/* IPSec UDP encaps. */
336                 if (IPSEC_ENABLED(ipv4) &&
337                     UDPENCAP_INPUT(n, off, AF_INET) != 0)
338                         return (0);     /* Consumed. */
339         }
340 #endif /* IPSEC */
341 #ifdef MAC
342         if (mac_inpcb_check_deliver(inp, n) != 0) {
343                 m_freem(n);
344                 return (0);
345         }
346 #endif /* MAC */
347         if (inp->inp_flags & INP_CONTROLOPTS ||
348             inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
349 #ifdef INET6
350                 if (inp->inp_vflag & INP_IPV6)
351                         (void)ip6_savecontrol_v4(inp, n, &opts, NULL);
352                 else
353 #endif /* INET6 */
354                         ip_savecontrol(inp, &opts, ip, n);
355         }
356         if ((inp->inp_vflag & INP_IPV4) && (inp->inp_flags2 & INP_ORIGDSTADDR)) {
357                 tmpopts = sbcreatecontrol((caddr_t)&udp_in[1],
358                         sizeof(struct sockaddr_in), IP_ORIGDSTADDR, IPPROTO_IP);
359                 if (tmpopts) {
360                         if (opts) {
361                                 tmpopts->m_next = opts;
362                                 opts = tmpopts;
363                         } else
364                                 opts = tmpopts;
365                 }
366         }
367 #ifdef INET6
368         if (inp->inp_vflag & INP_IPV6) {
369                 bzero(&udp_in6, sizeof(udp_in6));
370                 udp_in6.sin6_len = sizeof(udp_in6);
371                 udp_in6.sin6_family = AF_INET6;
372                 in6_sin_2_v4mapsin6(&udp_in[0], &udp_in6);
373                 append_sa = (struct sockaddr *)&udp_in6;
374         } else
375 #endif /* INET6 */
376                 append_sa = (struct sockaddr *)&udp_in[0];
377         m_adj(n, off);
378
379         so = inp->inp_socket;
380         SOCKBUF_LOCK(&so->so_rcv);
381         if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
382                 SOCKBUF_UNLOCK(&so->so_rcv);
383                 m_freem(n);
384                 if (opts)
385                         m_freem(opts);
386                 UDPSTAT_INC(udps_fullsock);
387         } else
388                 sorwakeup_locked(so);
389         return (0);
390 }
391
392 int
393 udp_input(struct mbuf **mp, int *offp, int proto)
394 {
395         struct ip *ip;
396         struct udphdr *uh;
397         struct ifnet *ifp;
398         struct inpcb *inp;
399         uint16_t len, ip_len;
400         struct inpcbinfo *pcbinfo;
401         struct ip save_ip;
402         struct sockaddr_in udp_in[2];
403         struct mbuf *m;
404         struct m_tag *fwd_tag;
405         int cscov_partial, iphlen;
406
407         m = *mp;
408         iphlen = *offp;
409         ifp = m->m_pkthdr.rcvif;
410         *mp = NULL;
411         UDPSTAT_INC(udps_ipackets);
412
413         /*
414          * Strip IP options, if any; should skip this, make available to
415          * user, and use on returned packets, but we don't yet have a way to
416          * check the checksum with options still present.
417          */
418         if (iphlen > sizeof (struct ip)) {
419                 ip_stripoptions(m);
420                 iphlen = sizeof(struct ip);
421         }
422
423         /*
424          * Get IP and UDP header together in first mbuf.
425          */
426         if (m->m_len < iphlen + sizeof(struct udphdr)) {
427                 if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) {
428                         UDPSTAT_INC(udps_hdrops);
429                         return (IPPROTO_DONE);
430                 }
431         }
432         ip = mtod(m, struct ip *);
433         uh = (struct udphdr *)((caddr_t)ip + iphlen);
434         cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0;
435
436         /*
437          * Destination port of 0 is illegal, based on RFC768.
438          */
439         if (uh->uh_dport == 0)
440                 goto badunlocked;
441
442         /*
443          * Construct sockaddr format source address.  Stuff source address
444          * and datagram in user buffer.
445          */
446         bzero(&udp_in[0], sizeof(struct sockaddr_in) * 2);
447         udp_in[0].sin_len = sizeof(struct sockaddr_in);
448         udp_in[0].sin_family = AF_INET;
449         udp_in[0].sin_port = uh->uh_sport;
450         udp_in[0].sin_addr = ip->ip_src;
451         udp_in[1].sin_len = sizeof(struct sockaddr_in);
452         udp_in[1].sin_family = AF_INET;
453         udp_in[1].sin_port = uh->uh_dport;
454         udp_in[1].sin_addr = ip->ip_dst;
455
456         /*
457          * Make mbuf data length reflect UDP length.  If not enough data to
458          * reflect UDP length, drop.
459          */
460         len = ntohs((u_short)uh->uh_ulen);
461         ip_len = ntohs(ip->ip_len) - iphlen;
462         if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) {
463                 /* Zero means checksum over the complete packet. */
464                 if (len == 0)
465                         len = ip_len;
466                 cscov_partial = 0;
467         }
468         if (ip_len != len) {
469                 if (len > ip_len || len < sizeof(struct udphdr)) {
470                         UDPSTAT_INC(udps_badlen);
471                         goto badunlocked;
472                 }
473                 if (proto == IPPROTO_UDP)
474                         m_adj(m, len - ip_len);
475         }
476
477         /*
478          * Save a copy of the IP header in case we want restore it for
479          * sending an ICMP error message in response.
480          */
481         if (!V_udp_blackhole)
482                 save_ip = *ip;
483         else
484                 memset(&save_ip, 0, sizeof(save_ip));
485
486         /*
487          * Checksum extended UDP header and data.
488          */
489         if (uh->uh_sum) {
490                 u_short uh_sum;
491
492                 if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
493                     !cscov_partial) {
494                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
495                                 uh_sum = m->m_pkthdr.csum_data;
496                         else
497                                 uh_sum = in_pseudo(ip->ip_src.s_addr,
498                                     ip->ip_dst.s_addr, htonl((u_short)len +
499                                     m->m_pkthdr.csum_data + proto));
500                         uh_sum ^= 0xffff;
501                 } else {
502                         char b[9];
503
504                         bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
505                         bzero(((struct ipovly *)ip)->ih_x1, 9);
506                         ((struct ipovly *)ip)->ih_len = (proto == IPPROTO_UDP) ?
507                             uh->uh_ulen : htons(ip_len);
508                         uh_sum = in_cksum(m, len + sizeof (struct ip));
509                         bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
510                 }
511                 if (uh_sum) {
512                         UDPSTAT_INC(udps_badsum);
513                         m_freem(m);
514                         return (IPPROTO_DONE);
515                 }
516         } else {
517                 if (proto == IPPROTO_UDP) {
518                         UDPSTAT_INC(udps_nosum);
519                 } else {
520                         /* UDPLite requires a checksum */
521                         /* XXX: What is the right UDPLite MIB counter here? */
522                         m_freem(m);
523                         return (IPPROTO_DONE);
524                 }
525         }
526
527         pcbinfo = udp_get_inpcbinfo(proto);
528         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
529             in_broadcast(ip->ip_dst, ifp)) {
530                 struct inpcb *last;
531                 struct inpcbhead *pcblist;
532
533                 NET_EPOCH_ASSERT();
534
535                 pcblist = udp_get_pcblist(proto);
536                 last = NULL;
537                 CK_LIST_FOREACH(inp, pcblist, inp_list) {
538                         if (inp->inp_lport != uh->uh_dport)
539                                 continue;
540 #ifdef INET6
541                         if ((inp->inp_vflag & INP_IPV4) == 0)
542                                 continue;
543 #endif
544                         if (inp->inp_laddr.s_addr != INADDR_ANY &&
545                             inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
546                                 continue;
547                         if (inp->inp_faddr.s_addr != INADDR_ANY &&
548                             inp->inp_faddr.s_addr != ip->ip_src.s_addr)
549                                 continue;
550                         if (inp->inp_fport != 0 &&
551                             inp->inp_fport != uh->uh_sport)
552                                 continue;
553
554                         INP_RLOCK(inp);
555
556                         if (__predict_false(inp->inp_flags2 & INP_FREED)) {
557                                 INP_RUNLOCK(inp);
558                                 continue;
559                         }
560
561                         /*
562                          * XXXRW: Because we weren't holding either the inpcb
563                          * or the hash lock when we checked for a match
564                          * before, we should probably recheck now that the
565                          * inpcb lock is held.
566                          */
567
568                         /*
569                          * Handle socket delivery policy for any-source
570                          * and source-specific multicast. [RFC3678]
571                          */
572                         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
573                                 struct ip_moptions      *imo;
574                                 struct sockaddr_in       group;
575                                 int                      blocked;
576
577                                 imo = inp->inp_moptions;
578                                 if (imo == NULL) {
579                                         INP_RUNLOCK(inp);
580                                         continue;
581                                 }
582                                 bzero(&group, sizeof(struct sockaddr_in));
583                                 group.sin_len = sizeof(struct sockaddr_in);
584                                 group.sin_family = AF_INET;
585                                 group.sin_addr = ip->ip_dst;
586
587                                 blocked = imo_multi_filter(imo, ifp,
588                                         (struct sockaddr *)&group,
589                                         (struct sockaddr *)&udp_in[0]);
590                                 if (blocked != MCAST_PASS) {
591                                         if (blocked == MCAST_NOTGMEMBER)
592                                                 IPSTAT_INC(ips_notmember);
593                                         if (blocked == MCAST_NOTSMEMBER ||
594                                             blocked == MCAST_MUTED)
595                                                 UDPSTAT_INC(udps_filtermcast);
596                                         INP_RUNLOCK(inp);
597                                         continue;
598                                 }
599                         }
600                         if (last != NULL) {
601                                 struct mbuf *n;
602
603                                 if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) !=
604                                     NULL) {
605                                         if (proto == IPPROTO_UDPLITE)
606                                                 UDPLITE_PROBE(receive, NULL, last, ip,
607                                                     last, uh);
608                                         else
609                                                 UDP_PROBE(receive, NULL, last, ip, last,
610                                                     uh);
611                                         if (udp_append(last, ip, n, iphlen,
612                                                 udp_in)) {
613                                                 INP_RUNLOCK(inp);
614                                                 goto badunlocked;
615                                         }
616                                 }
617                                 /* Release PCB lock taken on previous pass. */
618                                 INP_RUNLOCK(last);
619                         }
620                         last = inp;
621                         /*
622                          * Don't look for additional matches if this one does
623                          * not have either the SO_REUSEPORT or SO_REUSEADDR
624                          * socket options set.  This heuristic avoids
625                          * searching through all pcbs in the common case of a
626                          * non-shared port.  It assumes that an application
627                          * will never clear these options after setting them.
628                          */
629                         if ((last->inp_socket->so_options &
630                             (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0)
631                                 break;
632                 }
633
634                 if (last == NULL) {
635                         /*
636                          * No matching pcb found; discard datagram.  (No need
637                          * to send an ICMP Port Unreachable for a broadcast
638                          * or multicast datgram.)
639                          */
640                         UDPSTAT_INC(udps_noport);
641                         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)))
642                                 UDPSTAT_INC(udps_noportmcast);
643                         else
644                                 UDPSTAT_INC(udps_noportbcast);
645                         goto badunlocked;
646                 }
647                 if (proto == IPPROTO_UDPLITE)
648                         UDPLITE_PROBE(receive, NULL, last, ip, last, uh);
649                 else
650                         UDP_PROBE(receive, NULL, last, ip, last, uh);
651                 if (udp_append(last, ip, m, iphlen, udp_in) == 0)
652                         INP_RUNLOCK(last);
653                 return (IPPROTO_DONE);
654         }
655
656         /*
657          * Locate pcb for datagram.
658          */
659
660         /*
661          * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
662          */
663         if ((m->m_flags & M_IP_NEXTHOP) &&
664             (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
665                 struct sockaddr_in *next_hop;
666
667                 next_hop = (struct sockaddr_in *)(fwd_tag + 1);
668
669                 /*
670                  * Transparently forwarded. Pretend to be the destination.
671                  * Already got one like this?
672                  */
673                 inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
674                     ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
675                 if (!inp) {
676                         /*
677                          * It's new.  Try to find the ambushing socket.
678                          * Because we've rewritten the destination address,
679                          * any hardware-generated hash is ignored.
680                          */
681                         inp = in_pcblookup(pcbinfo, ip->ip_src,
682                             uh->uh_sport, next_hop->sin_addr,
683                             next_hop->sin_port ? htons(next_hop->sin_port) :
684                             uh->uh_dport, INPLOOKUP_WILDCARD |
685                             INPLOOKUP_RLOCKPCB, ifp);
686                 }
687                 /* Remove the tag from the packet. We don't need it anymore. */
688                 m_tag_delete(m, fwd_tag);
689                 m->m_flags &= ~M_IP_NEXTHOP;
690         } else
691                 inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
692                     ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
693                     INPLOOKUP_RLOCKPCB, ifp, m);
694         if (inp == NULL) {
695                 if (V_udp_log_in_vain) {
696                         char src[INET_ADDRSTRLEN];
697                         char dst[INET_ADDRSTRLEN];
698
699                         log(LOG_INFO,
700                             "Connection attempt to UDP %s:%d from %s:%d\n",
701                             inet_ntoa_r(ip->ip_dst, dst), ntohs(uh->uh_dport),
702                             inet_ntoa_r(ip->ip_src, src), ntohs(uh->uh_sport));
703                 }
704                 if (proto == IPPROTO_UDPLITE)
705                         UDPLITE_PROBE(receive, NULL, NULL, ip, NULL, uh);
706                 else
707                         UDP_PROBE(receive, NULL, NULL, ip, NULL, uh);
708                 UDPSTAT_INC(udps_noport);
709                 if (m->m_flags & (M_BCAST | M_MCAST)) {
710                         UDPSTAT_INC(udps_noportbcast);
711                         goto badunlocked;
712                 }
713                 if (V_udp_blackhole)
714                         goto badunlocked;
715                 if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
716                         goto badunlocked;
717                 *ip = save_ip;
718                 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
719                 return (IPPROTO_DONE);
720         }
721
722         /*
723          * Check the minimum TTL for socket.
724          */
725         INP_RLOCK_ASSERT(inp);
726         if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
727                 if (proto == IPPROTO_UDPLITE)
728                         UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
729                 else
730                         UDP_PROBE(receive, NULL, inp, ip, inp, uh);
731                 INP_RUNLOCK(inp);
732                 m_freem(m);
733                 return (IPPROTO_DONE);
734         }
735         if (cscov_partial) {
736                 struct udpcb *up;
737
738                 up = intoudpcb(inp);
739                 if (up->u_rxcslen == 0 || up->u_rxcslen > len) {
740                         INP_RUNLOCK(inp);
741                         m_freem(m);
742                         return (IPPROTO_DONE);
743                 }
744         }
745
746         if (proto == IPPROTO_UDPLITE)
747                 UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
748         else
749                 UDP_PROBE(receive, NULL, inp, ip, inp, uh);
750         if (udp_append(inp, ip, m, iphlen, udp_in) == 0)
751                 INP_RUNLOCK(inp);
752         return (IPPROTO_DONE);
753
754 badunlocked:
755         m_freem(m);
756         return (IPPROTO_DONE);
757 }
758 #endif /* INET */
759
760 /*
761  * Notify a udp user of an asynchronous error; just wake up so that they can
762  * collect error status.
763  */
764 struct inpcb *
765 udp_notify(struct inpcb *inp, int errno)
766 {
767
768         INP_WLOCK_ASSERT(inp);
769         if ((errno == EHOSTUNREACH || errno == ENETUNREACH ||
770              errno == EHOSTDOWN) && inp->inp_route.ro_nh) {
771                 NH_FREE(inp->inp_route.ro_nh);
772                 inp->inp_route.ro_nh = (struct nhop_object *)NULL;
773         }
774
775         inp->inp_socket->so_error = errno;
776         sorwakeup(inp->inp_socket);
777         sowwakeup(inp->inp_socket);
778         return (inp);
779 }
780
781 #ifdef INET
782 static void
783 udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip,
784     struct inpcbinfo *pcbinfo)
785 {
786         struct ip *ip = vip;
787         struct udphdr *uh;
788         struct in_addr faddr;
789         struct inpcb *inp;
790
791         faddr = ((struct sockaddr_in *)sa)->sin_addr;
792         if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
793                 return;
794
795         if (PRC_IS_REDIRECT(cmd)) {
796                 /* signal EHOSTDOWN, as it flushes the cached route */
797                 in_pcbnotifyall(&V_udbinfo, faddr, EHOSTDOWN, udp_notify);
798                 return;
799         }
800
801         /*
802          * Hostdead is ugly because it goes linearly through all PCBs.
803          *
804          * XXX: We never get this from ICMP, otherwise it makes an excellent
805          * DoS attack on machines with many connections.
806          */
807         if (cmd == PRC_HOSTDEAD)
808                 ip = NULL;
809         else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
810                 return;
811         if (ip != NULL) {
812                 uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
813                 inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
814                     ip->ip_src, uh->uh_sport, INPLOOKUP_WLOCKPCB, NULL);
815                 if (inp != NULL) {
816                         INP_WLOCK_ASSERT(inp);
817                         if (inp->inp_socket != NULL) {
818                                 udp_notify(inp, inetctlerrmap[cmd]);
819                         }
820                         INP_WUNLOCK(inp);
821                 } else {
822                         inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
823                                            ip->ip_src, uh->uh_sport,
824                                            INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
825                         if (inp != NULL) {
826                                 struct udpcb *up;
827                                 void *ctx;
828                                 udp_tun_icmp_t func;
829
830                                 up = intoudpcb(inp);
831                                 ctx = up->u_tun_ctx;
832                                 func = up->u_icmp_func;
833                                 INP_RUNLOCK(inp);
834                                 if (func != NULL)
835                                         (*func)(cmd, sa, vip, ctx);
836                         }
837                 }
838         } else
839                 in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd],
840                     udp_notify);
841 }
842 void
843 udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
844 {
845
846         return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo));
847 }
848
849 void
850 udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip)
851 {
852
853         return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo));
854 }
855 #endif /* INET */
856
857 static int
858 udp_pcblist(SYSCTL_HANDLER_ARGS)
859 {
860         struct xinpgen xig;
861         struct epoch_tracker et;
862         struct inpcb *inp;
863         int error;
864
865         if (req->newptr != 0)
866                 return (EPERM);
867
868         if (req->oldptr == 0) {
869                 int n;
870
871                 n = V_udbinfo.ipi_count;
872                 n += imax(n / 8, 10);
873                 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
874                 return (0);
875         }
876
877         if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
878                 return (error);
879
880         bzero(&xig, sizeof(xig));
881         xig.xig_len = sizeof xig;
882         xig.xig_count = V_udbinfo.ipi_count;
883         xig.xig_gen = V_udbinfo.ipi_gencnt;
884         xig.xig_sogen = so_gencnt;
885         error = SYSCTL_OUT(req, &xig, sizeof xig);
886         if (error)
887                 return (error);
888
889         NET_EPOCH_ENTER(et);
890         for (inp = CK_LIST_FIRST(V_udbinfo.ipi_listhead);
891             inp != NULL;
892             inp = CK_LIST_NEXT(inp, inp_list)) {
893                 INP_RLOCK(inp);
894                 if (inp->inp_gencnt <= xig.xig_gen &&
895                     cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
896                         struct xinpcb xi;
897
898                         in_pcbtoxinpcb(inp, &xi);
899                         INP_RUNLOCK(inp);
900                         error = SYSCTL_OUT(req, &xi, sizeof xi);
901                         if (error)
902                                 break;
903                 } else
904                         INP_RUNLOCK(inp);
905         }
906         NET_EPOCH_EXIT(et);
907
908         if (!error) {
909                 /*
910                  * Give the user an updated idea of our state.  If the
911                  * generation differs from what we told her before, she knows
912                  * that something happened while we were processing this
913                  * request, and it might be necessary to retry.
914                  */
915                 xig.xig_gen = V_udbinfo.ipi_gencnt;
916                 xig.xig_sogen = so_gencnt;
917                 xig.xig_count = V_udbinfo.ipi_count;
918                 error = SYSCTL_OUT(req, &xig, sizeof xig);
919         }
920
921         return (error);
922 }
923
924 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
925     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
926     udp_pcblist, "S,xinpcb",
927     "List of active UDP sockets");
928
929 #ifdef INET
930 static int
931 udp_getcred(SYSCTL_HANDLER_ARGS)
932 {
933         struct xucred xuc;
934         struct sockaddr_in addrs[2];
935         struct epoch_tracker et;
936         struct inpcb *inp;
937         int error;
938
939         error = priv_check(req->td, PRIV_NETINET_GETCRED);
940         if (error)
941                 return (error);
942         error = SYSCTL_IN(req, addrs, sizeof(addrs));
943         if (error)
944                 return (error);
945         NET_EPOCH_ENTER(et);
946         inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
947             addrs[0].sin_addr, addrs[0].sin_port,
948             INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
949         NET_EPOCH_EXIT(et);
950         if (inp != NULL) {
951                 INP_RLOCK_ASSERT(inp);
952                 if (inp->inp_socket == NULL)
953                         error = ENOENT;
954                 if (error == 0)
955                         error = cr_canseeinpcb(req->td->td_ucred, inp);
956                 if (error == 0)
957                         cru2x(inp->inp_cred, &xuc);
958                 INP_RUNLOCK(inp);
959         } else
960                 error = ENOENT;
961         if (error == 0)
962                 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
963         return (error);
964 }
965
966 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
967     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
968     0, 0, udp_getcred, "S,xucred",
969     "Get the xucred of a UDP connection");
970 #endif /* INET */
971
972 int
973 udp_ctloutput(struct socket *so, struct sockopt *sopt)
974 {
975         struct inpcb *inp;
976         struct udpcb *up;
977         int isudplite, error, optval;
978
979         error = 0;
980         isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0;
981         inp = sotoinpcb(so);
982         KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
983         INP_WLOCK(inp);
984         if (sopt->sopt_level != so->so_proto->pr_protocol) {
985 #ifdef INET6
986                 if (INP_CHECK_SOCKAF(so, AF_INET6)) {
987                         INP_WUNLOCK(inp);
988                         error = ip6_ctloutput(so, sopt);
989                 }
990 #endif
991 #if defined(INET) && defined(INET6)
992                 else
993 #endif
994 #ifdef INET
995                 {
996                         INP_WUNLOCK(inp);
997                         error = ip_ctloutput(so, sopt);
998                 }
999 #endif
1000                 return (error);
1001         }
1002
1003         switch (sopt->sopt_dir) {
1004         case SOPT_SET:
1005                 switch (sopt->sopt_name) {
1006 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1007 #ifdef INET
1008                 case UDP_ENCAP:
1009                         if (!IPSEC_ENABLED(ipv4)) {
1010                                 INP_WUNLOCK(inp);
1011                                 return (ENOPROTOOPT);
1012                         }
1013                         error = UDPENCAP_PCBCTL(inp, sopt);
1014                         break;
1015 #endif /* INET */
1016 #endif /* IPSEC */
1017                 case UDPLITE_SEND_CSCOV:
1018                 case UDPLITE_RECV_CSCOV:
1019                         if (!isudplite) {
1020                                 INP_WUNLOCK(inp);
1021                                 error = ENOPROTOOPT;
1022                                 break;
1023                         }
1024                         INP_WUNLOCK(inp);
1025                         error = sooptcopyin(sopt, &optval, sizeof(optval),
1026                             sizeof(optval));
1027                         if (error != 0)
1028                                 break;
1029                         inp = sotoinpcb(so);
1030                         KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
1031                         INP_WLOCK(inp);
1032                         up = intoudpcb(inp);
1033                         KASSERT(up != NULL, ("%s: up == NULL", __func__));
1034                         if ((optval != 0 && optval < 8) || (optval > 65535)) {
1035                                 INP_WUNLOCK(inp);
1036                                 error = EINVAL;
1037                                 break;
1038                         }
1039                         if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
1040                                 up->u_txcslen = optval;
1041                         else
1042                                 up->u_rxcslen = optval;
1043                         INP_WUNLOCK(inp);
1044                         break;
1045                 default:
1046                         INP_WUNLOCK(inp);
1047                         error = ENOPROTOOPT;
1048                         break;
1049                 }
1050                 break;
1051         case SOPT_GET:
1052                 switch (sopt->sopt_name) {
1053 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1054 #ifdef INET
1055                 case UDP_ENCAP:
1056                         if (!IPSEC_ENABLED(ipv4)) {
1057                                 INP_WUNLOCK(inp);
1058                                 return (ENOPROTOOPT);
1059                         }
1060                         error = UDPENCAP_PCBCTL(inp, sopt);
1061                         break;
1062 #endif /* INET */
1063 #endif /* IPSEC */
1064                 case UDPLITE_SEND_CSCOV:
1065                 case UDPLITE_RECV_CSCOV:
1066                         if (!isudplite) {
1067                                 INP_WUNLOCK(inp);
1068                                 error = ENOPROTOOPT;
1069                                 break;
1070                         }
1071                         up = intoudpcb(inp);
1072                         KASSERT(up != NULL, ("%s: up == NULL", __func__));
1073                         if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
1074                                 optval = up->u_txcslen;
1075                         else
1076                                 optval = up->u_rxcslen;
1077                         INP_WUNLOCK(inp);
1078                         error = sooptcopyout(sopt, &optval, sizeof(optval));
1079                         break;
1080                 default:
1081                         INP_WUNLOCK(inp);
1082                         error = ENOPROTOOPT;
1083                         break;
1084                 }
1085                 break;
1086         }
1087         return (error);
1088 }
1089
1090 #ifdef INET
1091 #ifdef INET6
1092 /* The logic here is derived from ip6_setpktopt(). See comments there. */
1093 static int
1094 udp_v4mapped_pktinfo(struct cmsghdr *cm, struct sockaddr_in * src,
1095     struct inpcb *inp, int flags)
1096 {
1097         struct ifnet *ifp;
1098         struct in6_pktinfo *pktinfo;
1099         struct in_addr ia;
1100
1101         if ((flags & PRUS_IPV6) == 0)
1102                 return (0);
1103
1104         if (cm->cmsg_level != IPPROTO_IPV6)
1105                 return (0);
1106
1107         if  (cm->cmsg_type != IPV6_2292PKTINFO &&
1108             cm->cmsg_type != IPV6_PKTINFO)
1109                 return (0);
1110
1111         if (cm->cmsg_len !=
1112             CMSG_LEN(sizeof(struct in6_pktinfo)))
1113                 return (EINVAL);
1114
1115         pktinfo = (struct in6_pktinfo *)CMSG_DATA(cm);
1116         if (!IN6_IS_ADDR_V4MAPPED(&pktinfo->ipi6_addr) &&
1117             !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr))
1118                 return (EINVAL);
1119
1120         /* Validate the interface index if specified. */
1121         if (pktinfo->ipi6_ifindex > V_if_index)
1122                 return (ENXIO);
1123
1124         ifp = NULL;
1125         if (pktinfo->ipi6_ifindex) {
1126                 ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
1127                 if (ifp == NULL)
1128                         return (ENXIO);
1129         }
1130         if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
1131                 ia.s_addr = pktinfo->ipi6_addr.s6_addr32[3];
1132                 if (in_ifhasaddr(ifp, ia) == 0)
1133                         return (EADDRNOTAVAIL);
1134         }
1135
1136         bzero(src, sizeof(*src));
1137         src->sin_family = AF_INET;
1138         src->sin_len = sizeof(*src);
1139         src->sin_port = inp->inp_lport;
1140         src->sin_addr.s_addr = pktinfo->ipi6_addr.s6_addr32[3];
1141
1142         return (0);
1143 }
1144 #endif
1145
/*
 * Build one UDP or UDP-Lite datagram over IPv4 and hand it to ip_output().
 *
 * inp      PCB of the sending socket.  It is locked here (write or read,
 *          see below) and unlocked again before returning.
 * m        payload chain.  It is either passed to ip_output() or freed at
 *          the "release" label.
 * addr     optional destination (a struct sockaddr_in).  NULL means the
 *          socket's connected foreign address is used.
 * control  optional ancillary data; it is freed in here on every path.
 * td       calling thread; its credential is used for the jail and
 *          bind/connect checks.
 * flags    only consulted by udp_v4mapped_pktinfo() (INET6 kernels).
 *
 * Returns 0 or an errno value: EMSGSIZE for an oversized payload, EINVAL
 * or ENOPROTOOPT for bad control data, EISCONN / ENOTCONN for a
 * destination that conflicts with the connection state, EAGAIN when the
 * freshly chosen port cannot be hashed, ENOBUFS when no header mbuf is
 * available, or whatever the helpers and ip_output() return.
 */
static int
udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
    struct mbuf *control, struct thread *td, int flags)
{
	struct udpiphdr *ui;
	int len = m->m_pkthdr.len;
	struct in_addr faddr, laddr;
	struct cmsghdr *cm;
	struct inpcbinfo *pcbinfo;
	struct sockaddr_in *sin, src;
	struct epoch_tracker et;
	int cscov_partial = 0;
	int error = 0;
	int ipflags = 0;
	u_short fport, lport;
	u_char tos;
	uint8_t pr;
	uint16_t cscov = 0;
	uint32_t flowid = 0;
	uint8_t flowtype = M_HASHTYPE_NONE;

	/* No lock is held yet, so this path frees both chains itself. */
	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
		if (control)
			m_freem(control);
		m_freem(m);
		return (EMSGSIZE);
	}

	/* sin_family == 0 marks "no source override requested". */
	src.sin_family = 0;
	sin = (struct sockaddr_in *)addr;

	/*
	 * udp_output() may need to temporarily bind or connect the current
	 * inpcb.  As such, we don't know up front whether we will need the
	 * pcbinfo lock or not.  Do any work to decide what is needed up
	 * front before acquiring any locks.
	 *
	 * We will need network epoch in either case, to safely lookup into
	 * pcb hash.
	 */
	if (sin == NULL ||
	    (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0))
		INP_WLOCK(inp);
	else
		INP_RLOCK(inp);
	NET_EPOCH_ENTER(et);
	tos = inp->inp_ip_tos;
	if (control != NULL) {
		/*
		 * XXX: Currently, we assume all the optional information is
		 * stored in a single mbuf.
		 */
		if (control->m_next) {
			m_freem(control);
			error = EINVAL;
			goto release;
		}
		/*
		 * Walk the control messages by advancing m_data / m_len of
		 * the control mbuf itself past each aligned message.
		 */
		for (; control->m_len > 0;
		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
			cm = mtod(control, struct cmsghdr *);
			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
			    || cm->cmsg_len > control->m_len) {
				error = EINVAL;
				break;
			}
#ifdef INET6
			error = udp_v4mapped_pktinfo(cm, &src, inp, flags);
			if (error != 0)
				break;
#endif
			/* Messages for other levels are silently skipped. */
			if (cm->cmsg_level != IPPROTO_IP)
				continue;

			/*
			 * Each "break" below leaves only the switch; the
			 * error check after it is what leaves the loop.
			 */
			switch (cm->cmsg_type) {
			case IP_SENDSRCADDR:
				if (cm->cmsg_len !=
				    CMSG_LEN(sizeof(struct in_addr))) {
					error = EINVAL;
					break;
				}
				bzero(&src, sizeof(src));
				src.sin_family = AF_INET;
				src.sin_len = sizeof(src);
				src.sin_port = inp->inp_lport;
				src.sin_addr =
				    *(struct in_addr *)CMSG_DATA(cm);
				break;

			case IP_TOS:
				if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) {
					error = EINVAL;
					break;
				}
				tos = *(u_char *)CMSG_DATA(cm);
				break;

			case IP_FLOWID:
				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
					error = EINVAL;
					break;
				}
				flowid = *(uint32_t *) CMSG_DATA(cm);
				break;

			case IP_FLOWTYPE:
				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
					error = EINVAL;
					break;
				}
				/*
				 * NOTE(review): a 32-bit value is stored into
				 * the uint8_t flowtype, so it is narrowed.
				 */
				flowtype = *(uint32_t *) CMSG_DATA(cm);
				break;

#ifdef	RSS
			case IP_RSSBUCKETID:
				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
					error = EINVAL;
					break;
				}
				/* This is just a placeholder for now */
				break;
#endif	/* RSS */
			default:
				error = ENOPROTOOPT;
				break;
			}
			if (error)
				break;
		}
		m_freem(control);
	}
	if (error)
		goto release;

	pr = inp->inp_socket->so_proto->pr_protocol;
	pcbinfo = udp_get_inpcbinfo(pr);

	/*
	 * If the IP_SENDSRCADDR control message was specified, override the
	 * source address for this datagram.  Its use is invalidated if the
	 * address thus specified is incomplete or clobbers other inpcbs.
	 */
	laddr = inp->inp_laddr;
	lport = inp->inp_lport;
	if (src.sin_family == AF_INET) {
		INP_HASH_LOCK_ASSERT(pcbinfo);
		if ((lport == 0) ||
		    (laddr.s_addr == INADDR_ANY &&
		     src.sin_addr.s_addr == INADDR_ANY)) {
			error = EINVAL;
			goto release;
		}
		/* Writes the result into the locals only, not into inp. */
		error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
		    &laddr.s_addr, &lport, td->td_ucred);
		if (error)
			goto release;
	}

	/*
	 * If a UDP socket has been connected, then a local address/port will
	 * have been selected and bound.
	 *
	 * If a UDP socket has not been connected to, then an explicit
	 * destination address must be used, in which case a local
	 * address/port may not have been selected and bound.
	 */
	if (sin != NULL) {
		INP_LOCK_ASSERT(inp);
		if (inp->inp_faddr.s_addr != INADDR_ANY) {
			error = EISCONN;
			goto release;
		}

		/*
		 * Jail may rewrite the destination address, so let it do
		 * that before we use it.
		 */
		error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
		if (error)
			goto release;

		/*
		 * If a local address or port hasn't yet been selected, or if
		 * the destination address needs to be rewritten due to using
		 * a special INADDR_ constant, invoke in_pcbconnect_setup()
		 * to do the heavy lifting.  Once a port is selected, we
		 * commit the binding back to the socket; we also commit the
		 * binding of the address if in jail.
		 *
		 * If we already have a valid binding and we're not
		 * requesting a destination address rewrite, use a fast path.
		 */
		if (inp->inp_laddr.s_addr == INADDR_ANY ||
		    inp->inp_lport == 0 ||
		    sin->sin_addr.s_addr == INADDR_ANY ||
		    sin->sin_addr.s_addr == INADDR_BROADCAST) {
			INP_HASH_LOCK_ASSERT(pcbinfo);
			error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
			    &lport, &faddr.s_addr, &fport, NULL,
			    td->td_ucred);
			if (error)
				goto release;

			/*
			 * XXXRW: Why not commit the port if the address is
			 * !INADDR_ANY?
			 */
			/* Commit the local port if newly assigned. */
			if (inp->inp_laddr.s_addr == INADDR_ANY &&
			    inp->inp_lport == 0) {
				/*
				 * This is the same condition that selected
				 * the write lock at the top of the function.
				 */
				INP_WLOCK_ASSERT(inp);
				/*
				 * Remember addr if jailed, to prevent
				 * rebinding.
				 */
				if (prison_flag(td->td_ucred, PR_IP4))
					inp->inp_laddr = laddr;
				inp->inp_lport = lport;
				INP_HASH_WLOCK(pcbinfo);
				error = in_pcbinshash(inp);
				INP_HASH_WUNLOCK(pcbinfo);
				if (error != 0) {
					/*
					 * Only the port is rolled back here;
					 * inp_laddr keeps the jailed address
					 * if it was set above.
					 */
					inp->inp_lport = 0;
					error = EAGAIN;
					goto release;
				}
				inp->inp_flags |= INP_ANONPORT;
			}
		} else {
			faddr = sin->sin_addr;
			fport = sin->sin_port;
		}
	} else {
		INP_LOCK_ASSERT(inp);
		faddr = inp->inp_faddr;
		fport = inp->inp_fport;
		if (faddr.s_addr == INADDR_ANY) {
			error = ENOTCONN;
			goto release;
		}
	}

	/*
	 * Calculate data length and get a mbuf for UDP, IP, and possible
	 * link-layer headers.  Immediate slide the data pointer back forward
	 * since we won't use that space at this layer.
	 */
	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT);
	if (m == NULL) {
		/*
		 * NOTE(review): "release" calls m_freem(m) with m == NULL on
		 * this path; presumably m_freem() accepts NULL - confirm.
		 */
		error = ENOBUFS;
		goto release;
	}
	m->m_data += max_linkhdr;
	m->m_len -= max_linkhdr;
	m->m_pkthdr.len -= max_linkhdr;

	/*
	 * Fill in mbuf with extended UDP header and addresses and length put
	 * into network format.
	 */
	ui = mtod(m, struct udpiphdr *);
	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
	ui->ui_v = IPVERSION << 4;
	ui->ui_pr = pr;
	ui->ui_src = laddr;
	ui->ui_dst = faddr;
	ui->ui_sport = lport;
	ui->ui_dport = fport;
	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
	if (pr == IPPROTO_UDPLITE) {
		struct udpcb *up;
		uint16_t plen;

		up = intoudpcb(inp);
		cscov = up->u_txcslen;
		plen = (u_short)len + sizeof(struct udphdr);
		/* Coverage reaching the whole packet is encoded as 0. */
		if (cscov >= plen)
			cscov = 0;
		/* For UDP-Lite the ui_ulen slot carries the coverage. */
		ui->ui_len = htons(plen);
		ui->ui_ulen = htons(cscov);
		/*
		 * For UDP-Lite, checksum coverage length of zero means
		 * the entire UDPLite packet is covered by the checksum.
		 */
		cscov_partial = (cscov == 0) ? 0 : 1;
	}

	/*
	 * Set the Don't Fragment bit in the IP header.
	 */
	if (inp->inp_flags & INP_DONTFRAG) {
		struct ip *ip;

		ip = (struct ip *)&ui->ui_i;
		ip->ip_off |= htons(IP_DF);
	}

	if (inp->inp_socket->so_options & SO_DONTROUTE)
		ipflags |= IP_ROUTETOIF;
	if (inp->inp_socket->so_options & SO_BROADCAST)
		ipflags |= IP_ALLOWBROADCAST;
	if (inp->inp_flags & INP_ONESBCAST)
		ipflags |= IP_SENDONES;

#ifdef MAC
	mac_inpcb_create_mbuf(inp, m);
#endif

	/*
	 * Set up checksum and output datagram.
	 */
	ui->ui_sum = 0;
	if (pr == IPPROTO_UDPLITE) {
		if (inp->inp_flags & INP_ONESBCAST)
			faddr.s_addr = INADDR_BROADCAST;
		/* UDP-Lite checksums are computed here; 0 is sent as 0xffff. */
		if (cscov_partial) {
			if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0)
				ui->ui_sum = 0xffff;
		} else {
			if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0)
				ui->ui_sum = 0xffff;
		}
	} else if (V_udp_cksum) {
		if (inp->inp_flags & INP_ONESBCAST)
			faddr.s_addr = INADDR_BROADCAST;
		/*
		 * Plain UDP: store only the pseudo-header sum and request
		 * completion through CSUM_UDP in the packet header.
		 */
		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
		    htons((u_short)len + sizeof(struct udphdr) + pr));
		m->m_pkthdr.csum_flags = CSUM_UDP;
		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
	}
	((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len);
	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
	((struct ip *)ui)->ip_tos = tos;		/* XXX */
	UDPSTAT_INC(udps_opackets);

	/*
	 * Setup flowid / RSS information for outbound socket.
	 *
	 * Once the UDP code decides to set a flowid some other way,
	 * this allows the flowid to be overridden by userland.
	 */
	if (flowtype != M_HASHTYPE_NONE) {
		m->m_pkthdr.flowid = flowid;
		M_HASHTYPE_SET(m, flowtype);
	}
#if defined(ROUTE_MPATH) || defined(RSS)
	else if (CALC_FLOWID_OUTBOUND_SENDTO) {
		uint32_t hash_val, hash_type;

		hash_val = fib4_calc_packet_hash(laddr, faddr,
		    lport, fport, pr, &hash_type);
		m->m_pkthdr.flowid = hash_val;
		M_HASHTYPE_SET(m, hash_type);
	}

	/*
	 * Don't override with the inp cached flowid value.
	 *
	 * Depending upon the kind of send being done, the inp
	 * flowid/flowtype values may actually not be appropriate
	 * for this particular socket send.
	 *
	 * We should either leave the flowid at zero (which is what is
	 * currently done) or set it to some software generated
	 * hash value based on the packet contents.
	 */
	ipflags |= IP_NODEFAULTFLOWID;
#endif	/* RSS */

	if (pr == IPPROTO_UDPLITE)
		UDPLITE_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
	else
		UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
	/* The cached route is only offered when inp is write-locked. */
	error = ip_output(m, inp->inp_options,
	    INP_WLOCKED(inp) ? &inp->inp_route : NULL, ipflags,
	    inp->inp_moptions, inp);
	INP_UNLOCK(inp);
	NET_EPOCH_EXIT(et);
	return (error);

	/* Common error exit: drop the lock, leave the epoch, free payload. */
release:
	INP_UNLOCK(inp);
	NET_EPOCH_EXIT(et);
	m_freem(m);
	return (error);
}
1532
1533 static void
1534 udp_abort(struct socket *so)
1535 {
1536         struct inpcb *inp;
1537         struct inpcbinfo *pcbinfo;
1538
1539         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1540         inp = sotoinpcb(so);
1541         KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
1542         INP_WLOCK(inp);
1543         if (inp->inp_faddr.s_addr != INADDR_ANY) {
1544                 INP_HASH_WLOCK(pcbinfo);
1545                 in_pcbdisconnect(inp);
1546                 inp->inp_laddr.s_addr = INADDR_ANY;
1547                 INP_HASH_WUNLOCK(pcbinfo);
1548                 soisdisconnected(so);
1549         }
1550         INP_WUNLOCK(inp);
1551 }
1552
1553 static int
1554 udp_attach(struct socket *so, int proto, struct thread *td)
1555 {
1556         static uint32_t udp_flowid;
1557         struct inpcb *inp;
1558         struct inpcbinfo *pcbinfo;
1559         int error;
1560
1561         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1562         inp = sotoinpcb(so);
1563         KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
1564         error = soreserve(so, udp_sendspace, udp_recvspace);
1565         if (error)
1566                 return (error);
1567         INP_INFO_WLOCK(pcbinfo);
1568         error = in_pcballoc(so, pcbinfo);
1569         if (error) {
1570                 INP_INFO_WUNLOCK(pcbinfo);
1571                 return (error);
1572         }
1573
1574         inp = sotoinpcb(so);
1575         inp->inp_vflag |= INP_IPV4;
1576         inp->inp_ip_ttl = V_ip_defttl;
1577         inp->inp_flowid = atomic_fetchadd_int(&udp_flowid, 1);
1578         inp->inp_flowtype = M_HASHTYPE_OPAQUE;
1579
1580         error = udp_newudpcb(inp);
1581         if (error) {
1582                 in_pcbdetach(inp);
1583                 in_pcbfree(inp);
1584                 INP_INFO_WUNLOCK(pcbinfo);
1585                 return (error);
1586         }
1587
1588         INP_WUNLOCK(inp);
1589         INP_INFO_WUNLOCK(pcbinfo);
1590         return (0);
1591 }
1592 #endif /* INET */
1593
1594 int
1595 udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx)
1596 {
1597         struct inpcb *inp;
1598         struct udpcb *up;
1599
1600         KASSERT(so->so_type == SOCK_DGRAM,
1601             ("udp_set_kernel_tunneling: !dgram"));
1602         inp = sotoinpcb(so);
1603         KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
1604         INP_WLOCK(inp);
1605         up = intoudpcb(inp);
1606         if ((up->u_tun_func != NULL) ||
1607             (up->u_icmp_func != NULL)) {
1608                 INP_WUNLOCK(inp);
1609                 return (EBUSY);
1610         }
1611         up->u_tun_func = f;
1612         up->u_icmp_func = i;
1613         up->u_tun_ctx = ctx;
1614         INP_WUNLOCK(inp);
1615         return (0);
1616 }
1617
1618 #ifdef INET
1619 static int
1620 udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
1621 {
1622         struct inpcb *inp;
1623         struct inpcbinfo *pcbinfo;
1624         int error;
1625
1626         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1627         inp = sotoinpcb(so);
1628         KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
1629         INP_WLOCK(inp);
1630         INP_HASH_WLOCK(pcbinfo);
1631         error = in_pcbbind(inp, nam, td->td_ucred);
1632         INP_HASH_WUNLOCK(pcbinfo);
1633         INP_WUNLOCK(inp);
1634         return (error);
1635 }
1636
1637 static void
1638 udp_close(struct socket *so)
1639 {
1640         struct inpcb *inp;
1641         struct inpcbinfo *pcbinfo;
1642
1643         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1644         inp = sotoinpcb(so);
1645         KASSERT(inp != NULL, ("udp_close: inp == NULL"));
1646         INP_WLOCK(inp);
1647         if (inp->inp_faddr.s_addr != INADDR_ANY) {
1648                 INP_HASH_WLOCK(pcbinfo);
1649                 in_pcbdisconnect(inp);
1650                 inp->inp_laddr.s_addr = INADDR_ANY;
1651                 INP_HASH_WUNLOCK(pcbinfo);
1652                 soisdisconnected(so);
1653         }
1654         INP_WUNLOCK(inp);
1655 }
1656
1657 static int
1658 udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1659 {
1660         struct epoch_tracker et;
1661         struct inpcb *inp;
1662         struct inpcbinfo *pcbinfo;
1663         struct sockaddr_in *sin;
1664         int error;
1665
1666         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1667         inp = sotoinpcb(so);
1668         KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
1669         INP_WLOCK(inp);
1670         if (inp->inp_faddr.s_addr != INADDR_ANY) {
1671                 INP_WUNLOCK(inp);
1672                 return (EISCONN);
1673         }
1674         sin = (struct sockaddr_in *)nam;
1675         error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1676         if (error != 0) {
1677                 INP_WUNLOCK(inp);
1678                 return (error);
1679         }
1680         NET_EPOCH_ENTER(et);
1681         INP_HASH_WLOCK(pcbinfo);
1682         error = in_pcbconnect(inp, nam, td->td_ucred);
1683         INP_HASH_WUNLOCK(pcbinfo);
1684         NET_EPOCH_EXIT(et);
1685         if (error == 0)
1686                 soisconnected(so);
1687         INP_WUNLOCK(inp);
1688         return (error);
1689 }
1690
1691 static void
1692 udp_detach(struct socket *so)
1693 {
1694         struct inpcb *inp;
1695         struct inpcbinfo *pcbinfo;
1696         struct udpcb *up;
1697
1698         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1699         inp = sotoinpcb(so);
1700         KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
1701         KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
1702             ("udp_detach: not disconnected"));
1703         INP_INFO_WLOCK(pcbinfo);
1704         INP_WLOCK(inp);
1705         up = intoudpcb(inp);
1706         KASSERT(up != NULL, ("%s: up == NULL", __func__));
1707         inp->inp_ppcb = NULL;
1708         in_pcbdetach(inp);
1709         in_pcbfree(inp);
1710         INP_INFO_WUNLOCK(pcbinfo);
1711         udp_discardcb(up);
1712 }
1713
1714 static int
1715 udp_disconnect(struct socket *so)
1716 {
1717         struct inpcb *inp;
1718         struct inpcbinfo *pcbinfo;
1719
1720         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1721         inp = sotoinpcb(so);
1722         KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
1723         INP_WLOCK(inp);
1724         if (inp->inp_faddr.s_addr == INADDR_ANY) {
1725                 INP_WUNLOCK(inp);
1726                 return (ENOTCONN);
1727         }
1728         INP_HASH_WLOCK(pcbinfo);
1729         in_pcbdisconnect(inp);
1730         inp->inp_laddr.s_addr = INADDR_ANY;
1731         INP_HASH_WUNLOCK(pcbinfo);
1732         SOCK_LOCK(so);
1733         so->so_state &= ~SS_ISCONNECTED;                /* XXX */
1734         SOCK_UNLOCK(so);
1735         INP_WUNLOCK(inp);
1736         return (0);
1737 }
1738
1739 static int
1740 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
1741     struct mbuf *control, struct thread *td)
1742 {
1743         struct inpcb *inp;
1744
1745         inp = sotoinpcb(so);
1746         KASSERT(inp != NULL, ("udp_send: inp == NULL"));
1747         return (udp_output(inp, m, addr, control, td, flags));
1748 }
1749 #endif /* INET */
1750
1751 int
1752 udp_shutdown(struct socket *so)
1753 {
1754         struct inpcb *inp;
1755
1756         inp = sotoinpcb(so);
1757         KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
1758         INP_WLOCK(inp);
1759         socantsendmore(so);
1760         INP_WUNLOCK(inp);
1761         return (0);
1762 }
1763
1764 #ifdef INET
1765 struct pr_usrreqs udp_usrreqs = {
1766         .pru_abort =            udp_abort,
1767         .pru_attach =           udp_attach,
1768         .pru_bind =             udp_bind,
1769         .pru_connect =          udp_connect,
1770         .pru_control =          in_control,
1771         .pru_detach =           udp_detach,
1772         .pru_disconnect =       udp_disconnect,
1773         .pru_peeraddr =         in_getpeeraddr,
1774         .pru_send =             udp_send,
1775         .pru_soreceive =        soreceive_dgram,
1776         .pru_sosend =           sosend_dgram,
1777         .pru_shutdown =         udp_shutdown,
1778         .pru_sockaddr =         in_getsockaddr,
1779         .pru_sosetlabel =       in_pcbsosetlabel,
1780         .pru_close =            udp_close,
1781 };
1782 #endif /* INET */