]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/udp_usrreq.c
Update to bmake-20220726
[FreeBSD/FreeBSD.git] / sys / netinet / udp_usrreq.c
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
5  *      The Regents of the University of California.
6  * Copyright (c) 2008 Robert N. M. Watson
7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
8  * Copyright (c) 2014 Kevin Lo
9  * All rights reserved.
10  *
11  * Portions of this software were developed by Robert N. M. Watson under
12  * contract to Juniper Networks, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *      @(#)udp_usrreq.c        8.6 (Berkeley) 5/23/95
39  */
40
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43
44 #include "opt_inet.h"
45 #include "opt_inet6.h"
46 #include "opt_ipsec.h"
47 #include "opt_route.h"
48 #include "opt_rss.h"
49
50 #include <sys/param.h>
51 #include <sys/domain.h>
52 #include <sys/eventhandler.h>
53 #include <sys/jail.h>
54 #include <sys/kernel.h>
55 #include <sys/lock.h>
56 #include <sys/malloc.h>
57 #include <sys/mbuf.h>
58 #include <sys/priv.h>
59 #include <sys/proc.h>
60 #include <sys/protosw.h>
61 #include <sys/sdt.h>
62 #include <sys/signalvar.h>
63 #include <sys/socket.h>
64 #include <sys/socketvar.h>
65 #include <sys/sx.h>
66 #include <sys/sysctl.h>
67 #include <sys/syslog.h>
68 #include <sys/systm.h>
69
70 #include <vm/uma.h>
71
72 #include <net/if.h>
73 #include <net/if_var.h>
74 #include <net/route.h>
75 #include <net/route/nhop.h>
76 #include <net/rss_config.h>
77
78 #include <netinet/in.h>
79 #include <netinet/in_kdtrace.h>
80 #include <netinet/in_fib.h>
81 #include <netinet/in_pcb.h>
82 #include <netinet/in_systm.h>
83 #include <netinet/in_var.h>
84 #include <netinet/ip.h>
85 #ifdef INET6
86 #include <netinet/ip6.h>
87 #endif
88 #include <netinet/ip_icmp.h>
89 #include <netinet/icmp_var.h>
90 #include <netinet/ip_var.h>
91 #include <netinet/ip_options.h>
92 #ifdef INET6
93 #include <netinet6/ip6_var.h>
94 #endif
95 #include <netinet/udp.h>
96 #include <netinet/udp_var.h>
97 #include <netinet/udplite.h>
98 #include <netinet/in_rss.h>
99
100 #include <netipsec/ipsec_support.h>
101
102 #include <machine/in_cksum.h>
103
104 #include <security/mac/mac_framework.h>
105
106 /*
107  * UDP and UDP-Lite protocols implementation.
108  * Per RFC 768, August, 1980.
109  * Per RFC 3828, July, 2004.
110  */
111
112 /*
113  * BSD 4.2 defaulted the udp checksum to be off.  Turning off udp checksums
114  * removes the only data integrity mechanism for packets and malformed
115  * packets that would otherwise be discarded due to bad checksums, and may
116  * cause problems (especially for NFS data blocks).
117  */
/*
 * Per-VNET flag exported read/write as the "checksum" node under
 * _net_inet_udp; defaults to enabled (1).
 */
118 VNET_DEFINE(int, udp_cksum) = 1;
119 SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW,
120     &VNET_NAME(udp_cksum), 0, "compute udp checksum");
121
/*
 * Per-VNET flag, off by default.  When set, udp_input() logs a
 * "Connection attempt to UDP ..." message for each unicast datagram
 * that matches no pcb.
 */
122 VNET_DEFINE(int, udp_log_in_vain) = 0;
123 SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW,
124     &VNET_NAME(udp_log_in_vain), 0, "Log all incoming UDP packets");
125
/*
 * Per-VNET blackhole controls, both off by default.  udp_input() skips
 * the ICMP port-unreachable reply when udp_blackhole is set and either
 * udp_blackhole_local is set or the source address is not a local one.
 */
126 VNET_DEFINE(int, udp_blackhole) = 0;
127 SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
128     &VNET_NAME(udp_blackhole), 0,
129     "Do not send port unreachables for refused connects");
130 VNET_DEFINE(bool, udp_blackhole_local) = false;
131 SYSCTL_BOOL(_net_inet_udp, OID_AUTO, blackhole_local, CTLFLAG_VNET |
132     CTLFLAG_RW, &VNET_NAME(udp_blackhole_local), false,
133     "Enforce net.inet.udp.blackhole for locally originated packets");
134
/*
 * Global (not per-VNET) send-side limit, exported read/write as
 * "maxdgram".
 */
135 u_long  udp_sendspace = 9216;           /* really max datagram size */
136 SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
137     &udp_sendspace, 0, "Maximum outgoing UDP datagram size");
138
/*
 * Global receive-space default: room for 40 entries of 1024 bytes plus
 * one socket address each.  The address size used is sockaddr_in6 when
 * INET6 is configured and sockaddr_in otherwise.
 */
139 u_long  udp_recvspace = 40 * (1024 +
140 #ifdef INET6
141                                       sizeof(struct sockaddr_in6)
142 #else
143                                       sizeof(struct sockaddr_in)
144 #endif
145                                       );        /* 40 1K datagrams */
146
147 SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
148     &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
149
/*
 * Per-VNET protocol control block tables for UDP and UDP-Lite, plus the
 * UMA zone that udp_newudpcb()/udp_discardcb() use for struct udpcb.
 * All three are set up in udp_init().
 */
150 VNET_DEFINE(struct inpcbinfo, udbinfo);
151 VNET_DEFINE(struct inpcbinfo, ulitecbinfo);
152 VNET_DEFINE_STATIC(uma_zone_t, udpcb_zone);
153 #define V_udpcb_zone                    VNET(udpcb_zone)
154
/*
 * Default hash size; may be overridden at build time.  udp_init() passes
 * it as both size arguments of in_pcbinfo_init().
 */
155 #ifndef UDBHASHSIZE
156 #define UDBHASHSIZE     128
157 #endif
158
/* Per-VNET UDP statistics counters, exported through the "stats" sysctl. */
159 VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat);          /* from udp_var.h */
160 VNET_PCPUSTAT_SYSINIT(udpstat);
161 SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat,
162     udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");
163
164 #ifdef VIMAGE
165 VNET_PCPUSTAT_SYSUNINIT(udpstat);
166 #endif /* VIMAGE */
/*
 * Forward declarations for IPv4-only routines; their definitions are not
 * in this part of the file.
 */
167 #ifdef INET
168 static void     udp_detach(struct socket *so);
169 static int      udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
170                     struct mbuf *, struct thread *, int);
171 #endif
172
/*
 * Storage descriptors handed to in_pcbinfo_init() for the UDP and
 * UDP-Lite pcbinfo tables.
 */
173 INPCBSTORAGE_DEFINE(udpcbstor, "udpinp", "udp_inpcb", "udp", "udphash");
174 INPCBSTORAGE_DEFINE(udplitecbstor, "udpliteinp", "udplite_inpcb", "udplite",
175     "udplitehash");
176
177 static void
178 udp_init(void *arg __unused)
179 {
180
181         /*
182          * For now default to 2-tuple UDP hashing - until the fragment
183          * reassembly code can also update the flowid.
184          *
185          * Once we can calculate the flowid that way and re-establish
186          * a 4-tuple, flip this to 4-tuple.
187          */
188         in_pcbinfo_init(&V_udbinfo, &udpcbstor, UDBHASHSIZE, UDBHASHSIZE);
189         V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
190             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
191         uma_zone_set_max(V_udpcb_zone, maxsockets);
192         uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
193
194         /* Additional pcbinfo for UDP-Lite */
195         in_pcbinfo_init(&V_ulitecbinfo, &udplitecbstor, UDBHASHSIZE,
196             UDBHASHSIZE);
197 }
198 VNET_SYSINIT(udp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, udp_init, NULL);
199
200 /*
201  * Kernel module interface for updating udpstat.  The argument is an index
202  * into udpstat treated as an array of u_long.  While this encodes the
203  * general layout of udpstat into the caller, it doesn't encode its location,
204  * so that future changes to add, for example, per-CPU stats support won't
205  * cause binary compatibility problems for kernel modules.
206  */
207 void
208 kmod_udpstat_inc(int statnum)
209 {
210
211         counter_u64_add(VNET(udpstat)[statnum], 1);
212 }
213
214 int
215 udp_newudpcb(struct inpcb *inp)
216 {
217         struct udpcb *up;
218
219         up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO);
220         if (up == NULL)
221                 return (ENOBUFS);
222         inp->inp_ppcb = up;
223         return (0);
224 }
225
226 void
227 udp_discardcb(struct udpcb *up)
228 {
229
230         uma_zfree(V_udpcb_zone, up);
231 }
232
233 #ifdef VIMAGE
234 static void
235 udp_destroy(void *unused __unused)
236 {
237
238         in_pcbinfo_destroy(&V_udbinfo);
239         uma_zdestroy(V_udpcb_zone);
240 }
241 VNET_SYSUNINIT(udp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_destroy, NULL);
242
243 static void
244 udplite_destroy(void *unused __unused)
245 {
246
247         in_pcbinfo_destroy(&V_ulitecbinfo);
248 }
249 VNET_SYSUNINIT(udplite, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udplite_destroy,
250     NULL);
251 #endif
252
253 #ifdef INET
254 /*
255  * Subroutine of udp_input(), which appends the provided mbuf chain to the
256  * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
257  * contains the source address.  If the socket ends up being an IPv6 socket,
258  * udp_append() will convert to a sockaddr_in6 before passing the address
259  * into the socket code.
260  *
261  * In the normal case udp_append() will return 0, indicating that you
262  * must unlock the inp. However if a tunneling protocol is in place we increment
263  * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we
264  * then decrement the reference count. If the inp_rele returns 1, indicating the
265  * inp is gone, we return that to the caller to tell them *not* to unlock
266  * the inp. In the case of multi-cast this will cause the distribution
267  * to stop (though most tunneling protocols known currently do *not* use
268  * multicast).
269  */
270 static int
271 udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
272     struct sockaddr_in *udp_in)
273 {
274         struct sockaddr *append_sa;
275         struct socket *so;
276         struct mbuf *tmpopts, *opts = NULL;
277 #ifdef INET6
278         struct sockaddr_in6 udp_in6;
279 #endif
280         struct udpcb *up;
281         bool filtered;
282
283         INP_LOCK_ASSERT(inp);
284
285         /*
286          * Engage the tunneling protocol.
287          */
288         up = intoudpcb(inp);
289         if (up->u_tun_func != NULL) {
290                 in_pcbref(inp);
291                 INP_RUNLOCK(inp);
292                 filtered = (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&udp_in[0],
293                     up->u_tun_ctx);
294                 INP_RLOCK(inp);
295                 if (filtered)
296                         return (in_pcbrele_rlocked(inp));
297         }
298
299         off += sizeof(struct udphdr);
300
301 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
302         /* Check AH/ESP integrity. */
303         if (IPSEC_ENABLED(ipv4) &&
304             IPSEC_CHECK_POLICY(ipv4, n, inp) != 0) {
305                 m_freem(n);
306                 return (0);
307         }
308         if (up->u_flags & UF_ESPINUDP) {/* IPSec UDP encaps. */
309                 if (IPSEC_ENABLED(ipv4) &&
310                     UDPENCAP_INPUT(n, off, AF_INET) != 0)
311                         return (0);     /* Consumed. */
312         }
313 #endif /* IPSEC */
314 #ifdef MAC
315         if (mac_inpcb_check_deliver(inp, n) != 0) {
316                 m_freem(n);
317                 return (0);
318         }
319 #endif /* MAC */
320         if (inp->inp_flags & INP_CONTROLOPTS ||
321             inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
322 #ifdef INET6
323                 if (inp->inp_vflag & INP_IPV6)
324                         (void)ip6_savecontrol_v4(inp, n, &opts, NULL);
325                 else
326 #endif /* INET6 */
327                         ip_savecontrol(inp, &opts, ip, n);
328         }
329         if ((inp->inp_vflag & INP_IPV4) && (inp->inp_flags2 & INP_ORIGDSTADDR)) {
330                 tmpopts = sbcreatecontrol(&udp_in[1],
331                     sizeof(struct sockaddr_in), IP_ORIGDSTADDR, IPPROTO_IP,
332                     M_NOWAIT);
333                 if (tmpopts) {
334                         if (opts) {
335                                 tmpopts->m_next = opts;
336                                 opts = tmpopts;
337                         } else
338                                 opts = tmpopts;
339                 }
340         }
341 #ifdef INET6
342         if (inp->inp_vflag & INP_IPV6) {
343                 bzero(&udp_in6, sizeof(udp_in6));
344                 udp_in6.sin6_len = sizeof(udp_in6);
345                 udp_in6.sin6_family = AF_INET6;
346                 in6_sin_2_v4mapsin6(&udp_in[0], &udp_in6);
347                 append_sa = (struct sockaddr *)&udp_in6;
348         } else
349 #endif /* INET6 */
350                 append_sa = (struct sockaddr *)&udp_in[0];
351         m_adj(n, off);
352
353         so = inp->inp_socket;
354         SOCKBUF_LOCK(&so->so_rcv);
355         if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
356                 soroverflow_locked(so);
357                 m_freem(n);
358                 if (opts)
359                         m_freem(opts);
360                 UDPSTAT_INC(udps_fullsock);
361         } else
362                 sorwakeup_locked(so);
363         return (0);
364 }
365
366 static bool
367 udp_multi_match(const struct inpcb *inp, void *v)
368 {
369         struct ip *ip = v;
370         struct udphdr *uh = (struct udphdr *)(ip + 1);
371
372         if (inp->inp_lport != uh->uh_dport)
373                 return (false);
374 #ifdef INET6
375         if ((inp->inp_vflag & INP_IPV4) == 0)
376                 return (false);
377 #endif
378         if (inp->inp_laddr.s_addr != INADDR_ANY &&
379             inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
380                 return (false);
381         if (inp->inp_faddr.s_addr != INADDR_ANY &&
382             inp->inp_faddr.s_addr != ip->ip_src.s_addr)
383                 return (false);
384         if (inp->inp_fport != 0 &&
385             inp->inp_fport != uh->uh_sport)
386                 return (false);
387
388         return (true);
389 }
390
391 static int
392 udp_multi_input(struct mbuf *m, int proto, struct sockaddr_in *udp_in)
393 {
394         struct ip *ip = mtod(m, struct ip *);
395         struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto),
396             INPLOOKUP_RLOCKPCB, udp_multi_match, ip);
397 #ifdef KDTRACE_HOOKS
398         struct udphdr *uh = (struct udphdr *)(ip + 1);
399 #endif
400         struct inpcb *inp;
401         struct mbuf *n;
402         int appends = 0;
403
404         MPASS(ip->ip_hl == sizeof(struct ip) >> 2);
405
406         while ((inp = inp_next(&inpi)) != NULL) {
407                 /*
408                  * XXXRW: Because we weren't holding either the inpcb
409                  * or the hash lock when we checked for a match
410                  * before, we should probably recheck now that the
411                  * inpcb lock is held.
412                  */
413                 /*
414                  * Handle socket delivery policy for any-source
415                  * and source-specific multicast. [RFC3678]
416                  */
417                 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
418                         struct ip_moptions      *imo;
419                         struct sockaddr_in       group;
420                         int                      blocked;
421
422                         imo = inp->inp_moptions;
423                         if (imo == NULL)
424                                 continue;
425                         bzero(&group, sizeof(struct sockaddr_in));
426                         group.sin_len = sizeof(struct sockaddr_in);
427                         group.sin_family = AF_INET;
428                         group.sin_addr = ip->ip_dst;
429
430                         blocked = imo_multi_filter(imo, m->m_pkthdr.rcvif,
431                                 (struct sockaddr *)&group,
432                                 (struct sockaddr *)&udp_in[0]);
433                         if (blocked != MCAST_PASS) {
434                                 if (blocked == MCAST_NOTGMEMBER)
435                                         IPSTAT_INC(ips_notmember);
436                                 if (blocked == MCAST_NOTSMEMBER ||
437                                     blocked == MCAST_MUTED)
438                                         UDPSTAT_INC(udps_filtermcast);
439                                 continue;
440                         }
441                 }
442                 if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
443                         if (proto == IPPROTO_UDPLITE)
444                                 UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
445                         else
446                                 UDP_PROBE(receive, NULL, inp, ip, inp, uh);
447                         if (udp_append(inp, ip, n, sizeof(struct ip), udp_in)) {
448                                 INP_RUNLOCK(inp);
449                                 break;
450                         } else
451                                 appends++;
452                 }
453                 /*
454                  * Don't look for additional matches if this one does
455                  * not have either the SO_REUSEPORT or SO_REUSEADDR
456                  * socket options set.  This heuristic avoids
457                  * searching through all pcbs in the common case of a
458                  * non-shared port.  It assumes that an application
459                  * will never clear these options after setting them.
460                  */
461                 if ((inp->inp_socket->so_options &
462                     (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) {
463                         INP_RUNLOCK(inp);
464                         break;
465                 }
466         }
467
468         if (appends == 0) {
469                 /*
470                  * No matching pcb found; discard datagram.  (No need
471                  * to send an ICMP Port Unreachable for a broadcast
472                  * or multicast datgram.)
473                  */
474                 UDPSTAT_INC(udps_noport);
475                 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)))
476                         UDPSTAT_INC(udps_noportmcast);
477                 else
478                         UDPSTAT_INC(udps_noportbcast);
479         }
480         m_freem(m);
481
482         return (IPPROTO_DONE);
483 }
484
/*
 * IPv4 input routine for UDP and UDP-Lite datagrams.
 *
 *	mp	in: the received mbuf chain; *mp is set to NULL on entry
 *		because the chain is consumed here
 *	offp	in: length of the IP header, including options
 *	proto	IPPROTO_UDP or IPPROTO_UDPLITE
 *
 * The routine strips IP options, validates the length and checksum, and
 * then either hands the datagram to udp_multi_input() (broadcast and
 * multicast destinations) or looks up a single pcb and delivers through
 * udp_append().  When no pcb matches, an ICMP port unreachable is sent
 * unless the blackhole settings, the rate limit or the link-level
 * broadcast/multicast flags suppress it.
 *
 * Every path returns IPPROTO_DONE.
 */
485 int
486 udp_input(struct mbuf **mp, int *offp, int proto)
487 {
488         struct ip *ip;
489         struct udphdr *uh;
490         struct ifnet *ifp;
491         struct inpcb *inp;
492         uint16_t len, ip_len;
493         struct inpcbinfo *pcbinfo;
            /* [0] = source address, [1] = destination address. */
494         struct sockaddr_in udp_in[2];
495         struct mbuf *m;
496         struct m_tag *fwd_tag;
497         int cscov_partial, iphlen;
498
499         m = *mp;
500         iphlen = *offp;
501         ifp = m->m_pkthdr.rcvif;
502         *mp = NULL;
503         UDPSTAT_INC(udps_ipackets);
504
505         /*
506          * Strip IP options, if any; should skip this, make available to
507          * user, and use on returned packets, but we don't yet have a way to
508          * check the checksum with options still present.
509          */
510         if (iphlen > sizeof (struct ip)) {
511                 ip_stripoptions(m);
512                 iphlen = sizeof(struct ip);
513         }
514
515         /*
516          * Get IP and UDP header together in first mbuf.
517          */
518         if (m->m_len < iphlen + sizeof(struct udphdr)) {
519                 if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) {
520                         UDPSTAT_INC(udps_hdrops);
521                         return (IPPROTO_DONE);
522                 }
523         }
524         ip = mtod(m, struct ip *);
525         uh = (struct udphdr *)((caddr_t)ip + iphlen);
            /*
             * UDP-Lite starts out as "partial checksum coverage"; the
             * length check below clears the flag when the coverage field
             * is 0 or equals the full length.
             */
526         cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0;
527
528         /*
529          * Destination port of 0 is illegal, based on RFC768.
530          */
531         if (uh->uh_dport == 0)
532                 goto badunlocked;
533
534         /*
535          * Construct sockaddr format source address.  Stuff source address
536          * and datagram in user buffer.
537          */
538         bzero(&udp_in[0], sizeof(struct sockaddr_in) * 2);
539         udp_in[0].sin_len = sizeof(struct sockaddr_in);
540         udp_in[0].sin_family = AF_INET;
541         udp_in[0].sin_port = uh->uh_sport;
542         udp_in[0].sin_addr = ip->ip_src;
543         udp_in[1].sin_len = sizeof(struct sockaddr_in);
544         udp_in[1].sin_family = AF_INET;
545         udp_in[1].sin_port = uh->uh_dport;
546         udp_in[1].sin_addr = ip->ip_dst;
547
548         /*
549          * Make mbuf data length reflect UDP length.  If not enough data to
550          * reflect UDP length, drop.
551          */
552         len = ntohs((u_short)uh->uh_ulen);
553         ip_len = ntohs(ip->ip_len) - iphlen;
554         if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) {
555                 /* Zero means checksum over the complete packet. */
556                 if (len == 0)
557                         len = ip_len;
558                 cscov_partial = 0;
559         }
560         if (ip_len != len) {
561                 if (len > ip_len || len < sizeof(struct udphdr)) {
562                         UDPSTAT_INC(udps_badlen);
563                         goto badunlocked;
564                 }
                    /*
                     * Only plain UDP is trimmed; for UDP-Lite the chain is
                     * left untouched.  len - ip_len is negative here, which
                     * presumably trims from the tail (see m_adj in
                     * mbuf(9)) -- TODO confirm.
                     */
565                 if (proto == IPPROTO_UDP)
566                         m_adj(m, len - ip_len);
567         }
568
569         /*
570          * Checksum extended UDP header and data.
571          */
572         if (uh->uh_sum) {
573                 u_short uh_sum;
574
575                 if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
576                     !cscov_partial) {
                            /*
                             * Reuse the checksum data recorded in the
                             * packet header, adding the pseudo header
                             * unless it is already included.
                             */
577                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
578                                 uh_sum = m->m_pkthdr.csum_data;
579                         else
580                                 uh_sum = in_pseudo(ip->ip_src.s_addr,
581                                     ip->ip_dst.s_addr, htonl((u_short)len +
582                                     m->m_pkthdr.csum_data + proto));
583                         uh_sum ^= 0xffff;
584                 } else {
                            /*
                             * Software checksum: temporarily overlay the
                             * start of the IP header with a pseudo header,
                             * then restore the saved bytes afterwards.
                             */
585                         char b[offsetof(struct ipovly, ih_src)];
586                         struct ipovly *ipov = (struct ipovly *)ip;
587
588                         bcopy(ipov, b, sizeof(b));
589                         bzero(ipov, sizeof(ipov->ih_x1));
590                         ipov->ih_len = (proto == IPPROTO_UDP) ?
591                             uh->uh_ulen : htons(ip_len);
592                         uh_sum = in_cksum(m, len + sizeof (struct ip));
593                         bcopy(b, ipov, sizeof(b));
594                 }
                    /* A non-zero result is treated as a checksum failure. */
595                 if (uh_sum) {
596                         UDPSTAT_INC(udps_badsum);
597                         m_freem(m);
598                         return (IPPROTO_DONE);
599                 }
600         } else {
601                 if (proto == IPPROTO_UDP) {
602                         UDPSTAT_INC(udps_nosum);
603                 } else {
604                         /* UDPLite requires a checksum */
605                         /* XXX: What is the right UDPLite MIB counter here? */
606                         m_freem(m);
607                         return (IPPROTO_DONE);
608                 }
609         }
610
            /*
             * Broadcast and multicast destinations go to
             * udp_multi_input(), which takes over the mbuf chain.
             */
611         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
612             in_broadcast(ip->ip_dst, ifp))
613                 return (udp_multi_input(m, proto, udp_in));
614
615         pcbinfo = udp_get_inpcbinfo(proto);
616
617         /*
618          * Locate pcb for datagram.
619          *
620          * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
621          */
622         if ((m->m_flags & M_IP_NEXTHOP) &&
623             (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
624                 struct sockaddr_in *next_hop;
625
626                 next_hop = (struct sockaddr_in *)(fwd_tag + 1);
627
628                 /*
629                  * Transparently forwarded. Pretend to be the destination.
630                  * Already got one like this?
631                  */
632                 inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
633                     ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
634                 if (!inp) {
635                         /*
636                          * It's new.  Try to find the ambushing socket.
637                          * Because we've rewritten the destination address,
638                          * any hardware-generated hash is ignored.
639                          */
640                         inp = in_pcblookup(pcbinfo, ip->ip_src,
641                             uh->uh_sport, next_hop->sin_addr,
642                             next_hop->sin_port ? htons(next_hop->sin_port) :
643                             uh->uh_dport, INPLOOKUP_WILDCARD |
644                             INPLOOKUP_RLOCKPCB, ifp);
645                 }
646                 /* Remove the tag from the packet. We don't need it anymore. */
647                 m_tag_delete(m, fwd_tag);
648                 m->m_flags &= ~M_IP_NEXTHOP;
649         } else
650                 inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
651                     ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
652                     INPLOOKUP_RLOCKPCB, ifp, m);
            /* No receiver: optionally log, count, and maybe answer with ICMP. */
653         if (inp == NULL) {
654                 if (V_udp_log_in_vain) {
655                         char src[INET_ADDRSTRLEN];
656                         char dst[INET_ADDRSTRLEN];
657
658                         log(LOG_INFO,
659                             "Connection attempt to UDP %s:%d from %s:%d\n",
660                             inet_ntoa_r(ip->ip_dst, dst), ntohs(uh->uh_dport),
661                             inet_ntoa_r(ip->ip_src, src), ntohs(uh->uh_sport));
662                 }
663                 if (proto == IPPROTO_UDPLITE)
664                         UDPLITE_PROBE(receive, NULL, NULL, ip, NULL, uh);
665                 else
666                         UDP_PROBE(receive, NULL, NULL, ip, NULL, uh);
667                 UDPSTAT_INC(udps_noport);
668                 if (m->m_flags & (M_BCAST | M_MCAST)) {
669                         UDPSTAT_INC(udps_noportbcast);
670                         goto badunlocked;
671                 }
                    /*
                     * Blackhole mode: stay silent, except towards local
                     * senders unless udp_blackhole_local is also set.
                     */
672                 if (V_udp_blackhole && (V_udp_blackhole_local ||
673                     !in_localip(ip->ip_src)))
674                         goto badunlocked;
675                 if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
676                         goto badunlocked;
                    /*
                     * NOTE(review): m is not freed on this path, so
                     * icmp_error() presumably takes over the chain --
                     * confirm against ip_icmp.c.
                     */
677                 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
678                 return (IPPROTO_DONE);
679         }
680
681         /*
682          * Check the minimum TTL for socket.
683          */
684         INP_RLOCK_ASSERT(inp);
685         if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
686                 if (proto == IPPROTO_UDPLITE)
687                         UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
688                 else
689                         UDP_PROBE(receive, NULL, inp, ip, inp, uh);
690                 INP_RUNLOCK(inp);
691                 m_freem(m);
692                 return (IPPROTO_DONE);
693         }
            /*
             * Partially covered UDP-Lite datagram: drop it when the
             * socket's u_rxcslen is 0 or larger than the coverage length
             * taken from the header.
             */
694         if (cscov_partial) {
695                 struct udpcb *up;
696
697                 up = intoudpcb(inp);
698                 if (up->u_rxcslen == 0 || up->u_rxcslen > len) {
699                         INP_RUNLOCK(inp);
700                         m_freem(m);
701                         return (IPPROTO_DONE);
702                 }
703         }
704
705         if (proto == IPPROTO_UDPLITE)
706                 UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
707         else
708                 UDP_PROBE(receive, NULL, inp, ip, inp, uh);
            /*
             * udp_append() returns 0 when the pcb is still locked and ours
             * to unlock; otherwise the pcb must not be touched.
             */
709         if (udp_append(inp, ip, m, iphlen, udp_in) == 0)
710                 INP_RUNLOCK(inp);
711         return (IPPROTO_DONE);
712
            /* Common drop path for packets rejected with no pcb lock held. */
713 badunlocked:
714         m_freem(m);
715         return (IPPROTO_DONE);
716 }
717 #endif /* INET */
718
719 /*
720  * Notify a udp user of an asynchronous error; just wake up so that they can
721  * collect error status.
722  */
723 struct inpcb *
724 udp_notify(struct inpcb *inp, int errno)
725 {
726
727         INP_WLOCK_ASSERT(inp);
728         if ((errno == EHOSTUNREACH || errno == ENETUNREACH ||
729              errno == EHOSTDOWN) && inp->inp_route.ro_nh) {
730                 NH_FREE(inp->inp_route.ro_nh);
731                 inp->inp_route.ro_nh = (struct nhop_object *)NULL;
732         }
733
734         inp->inp_socket->so_error = errno;
735         sorwakeup(inp->inp_socket);
736         sowwakeup(inp->inp_socket);
737         return (inp);
738 }
739
740 #ifdef INET
741 static void
742 udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip,
743     struct inpcbinfo *pcbinfo)
744 {
745         struct ip *ip = vip;
746         struct udphdr *uh;
747         struct in_addr faddr;
748         struct inpcb *inp;
749
750         faddr = ((struct sockaddr_in *)sa)->sin_addr;
751         if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
752                 return;
753
754         if (PRC_IS_REDIRECT(cmd)) {
755                 /* signal EHOSTDOWN, as it flushes the cached route */
756                 in_pcbnotifyall(pcbinfo, faddr, EHOSTDOWN, udp_notify);
757                 return;
758         }
759
760         /*
761          * Hostdead is ugly because it goes linearly through all PCBs.
762          *
763          * XXX: We never get this from ICMP, otherwise it makes an excellent
764          * DoS attack on machines with many connections.
765          */
766         if (cmd == PRC_HOSTDEAD)
767                 ip = NULL;
768         else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
769                 return;
770         if (ip != NULL) {
771                 uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
772                 inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
773                     ip->ip_src, uh->uh_sport, INPLOOKUP_WLOCKPCB, NULL);
774                 if (inp != NULL) {
775                         INP_WLOCK_ASSERT(inp);
776                         if (inp->inp_socket != NULL) {
777                                 udp_notify(inp, inetctlerrmap[cmd]);
778                         }
779                         INP_WUNLOCK(inp);
780                 } else {
781                         inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
782                                            ip->ip_src, uh->uh_sport,
783                                            INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
784                         if (inp != NULL) {
785                                 struct udpcb *up;
786                                 void *ctx;
787                                 udp_tun_icmp_t func;
788
789                                 up = intoudpcb(inp);
790                                 ctx = up->u_tun_ctx;
791                                 func = up->u_icmp_func;
792                                 INP_RUNLOCK(inp);
793                                 if (func != NULL)
794                                         (*func)(cmd, sa, vip, ctx);
795                         }
796                 }
797         } else
798                 in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd],
799                     udp_notify);
800 }
801 void
802 udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
803 {
804
805         return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo));
806 }
807
808 void
809 udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip)
810 {
811
812         return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo));
813 }
814 #endif /* INET */
815
816 static int
817 udp_pcblist(SYSCTL_HANDLER_ARGS)
818 {
819         struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_udbinfo,
820             INPLOOKUP_RLOCKPCB);
821         struct xinpgen xig;
822         struct inpcb *inp;
823         int error;
824
825         if (req->newptr != 0)
826                 return (EPERM);
827
828         if (req->oldptr == 0) {
829                 int n;
830
831                 n = V_udbinfo.ipi_count;
832                 n += imax(n / 8, 10);
833                 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
834                 return (0);
835         }
836
837         if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
838                 return (error);
839
840         bzero(&xig, sizeof(xig));
841         xig.xig_len = sizeof xig;
842         xig.xig_count = V_udbinfo.ipi_count;
843         xig.xig_gen = V_udbinfo.ipi_gencnt;
844         xig.xig_sogen = so_gencnt;
845         error = SYSCTL_OUT(req, &xig, sizeof xig);
846         if (error)
847                 return (error);
848
849         while ((inp = inp_next(&inpi)) != NULL) {
850                 if (inp->inp_gencnt <= xig.xig_gen &&
851                     cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
852                         struct xinpcb xi;
853
854                         in_pcbtoxinpcb(inp, &xi);
855                         error = SYSCTL_OUT(req, &xi, sizeof xi);
856                         if (error) {
857                                 INP_RUNLOCK(inp);
858                                 break;
859                         }
860                 }
861         }
862
863         if (!error) {
864                 /*
865                  * Give the user an updated idea of our state.  If the
866                  * generation differs from what we told her before, she knows
867                  * that something happened while we were processing this
868                  * request, and it might be necessary to retry.
869                  */
870                 xig.xig_gen = V_udbinfo.ipi_gencnt;
871                 xig.xig_sogen = so_gencnt;
872                 xig.xig_count = V_udbinfo.ipi_count;
873                 error = SYSCTL_OUT(req, &xig, sizeof xig);
874         }
875
876         return (error);
877 }
878
/*
 * Register udp_pcblist() as the handler of the read-only (CTLFLAG_RD),
 * opaque "pcblist" node under _net_inet_udp; its data is described as
 * "S,xinpcb".
 */
879 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
880     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
881     udp_pcblist, "S,xinpcb",
882     "List of active UDP sockets");
883
884 #ifdef INET
885 static int
886 udp_getcred(SYSCTL_HANDLER_ARGS)
887 {
888         struct xucred xuc;
889         struct sockaddr_in addrs[2];
890         struct epoch_tracker et;
891         struct inpcb *inp;
892         int error;
893
894         error = priv_check(req->td, PRIV_NETINET_GETCRED);
895         if (error)
896                 return (error);
897         error = SYSCTL_IN(req, addrs, sizeof(addrs));
898         if (error)
899                 return (error);
900         NET_EPOCH_ENTER(et);
901         inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
902             addrs[0].sin_addr, addrs[0].sin_port,
903             INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
904         NET_EPOCH_EXIT(et);
905         if (inp != NULL) {
906                 INP_RLOCK_ASSERT(inp);
907                 if (inp->inp_socket == NULL)
908                         error = ENOENT;
909                 if (error == 0)
910                         error = cr_canseeinpcb(req->td->td_ucred, inp);
911                 if (error == 0)
912                         cru2x(inp->inp_cred, &xuc);
913                 INP_RUNLOCK(inp);
914         } else
915                 error = ENOENT;
916         if (error == 0)
917                 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
918         return (error);
919 }
920
/*
 * Register udp_getcred() as the handler of the read-write (CTLFLAG_RW),
 * opaque "getcred" node under _net_inet_udp; its data is described as
 * "S,xucred".
 */
921 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
922     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
923     0, 0, udp_getcred, "S,xucred",
924     "Get the xucred of a UDP connection");
925 #endif /* INET */
926
927 int
928 udp_ctloutput(struct socket *so, struct sockopt *sopt)
929 {
930         struct inpcb *inp;
931         struct udpcb *up;
932         int isudplite, error, optval;
933
934         error = 0;
935         isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0;
936         inp = sotoinpcb(so);
937         KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
938         INP_WLOCK(inp);
939         if (sopt->sopt_level != so->so_proto->pr_protocol) {
940 #ifdef INET6
941                 if (INP_CHECK_SOCKAF(so, AF_INET6)) {
942                         INP_WUNLOCK(inp);
943                         error = ip6_ctloutput(so, sopt);
944                 }
945 #endif
946 #if defined(INET) && defined(INET6)
947                 else
948 #endif
949 #ifdef INET
950                 {
951                         INP_WUNLOCK(inp);
952                         error = ip_ctloutput(so, sopt);
953                 }
954 #endif
955                 return (error);
956         }
957
958         switch (sopt->sopt_dir) {
959         case SOPT_SET:
960                 switch (sopt->sopt_name) {
961 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
962 #ifdef INET
963                 case UDP_ENCAP:
964                         if (!IPSEC_ENABLED(ipv4)) {
965                                 INP_WUNLOCK(inp);
966                                 return (ENOPROTOOPT);
967                         }
968                         error = UDPENCAP_PCBCTL(inp, sopt);
969                         break;
970 #endif /* INET */
971 #endif /* IPSEC */
972                 case UDPLITE_SEND_CSCOV:
973                 case UDPLITE_RECV_CSCOV:
974                         if (!isudplite) {
975                                 INP_WUNLOCK(inp);
976                                 error = ENOPROTOOPT;
977                                 break;
978                         }
979                         INP_WUNLOCK(inp);
980                         error = sooptcopyin(sopt, &optval, sizeof(optval),
981                             sizeof(optval));
982                         if (error != 0)
983                                 break;
984                         inp = sotoinpcb(so);
985                         KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
986                         INP_WLOCK(inp);
987                         up = intoudpcb(inp);
988                         KASSERT(up != NULL, ("%s: up == NULL", __func__));
989                         if ((optval != 0 && optval < 8) || (optval > 65535)) {
990                                 INP_WUNLOCK(inp);
991                                 error = EINVAL;
992                                 break;
993                         }
994                         if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
995                                 up->u_txcslen = optval;
996                         else
997                                 up->u_rxcslen = optval;
998                         INP_WUNLOCK(inp);
999                         break;
1000                 default:
1001                         INP_WUNLOCK(inp);
1002                         error = ENOPROTOOPT;
1003                         break;
1004                 }
1005                 break;
1006         case SOPT_GET:
1007                 switch (sopt->sopt_name) {
1008 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1009 #ifdef INET
1010                 case UDP_ENCAP:
1011                         if (!IPSEC_ENABLED(ipv4)) {
1012                                 INP_WUNLOCK(inp);
1013                                 return (ENOPROTOOPT);
1014                         }
1015                         error = UDPENCAP_PCBCTL(inp, sopt);
1016                         break;
1017 #endif /* INET */
1018 #endif /* IPSEC */
1019                 case UDPLITE_SEND_CSCOV:
1020                 case UDPLITE_RECV_CSCOV:
1021                         if (!isudplite) {
1022                                 INP_WUNLOCK(inp);
1023                                 error = ENOPROTOOPT;
1024                                 break;
1025                         }
1026                         up = intoudpcb(inp);
1027                         KASSERT(up != NULL, ("%s: up == NULL", __func__));
1028                         if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
1029                                 optval = up->u_txcslen;
1030                         else
1031                                 optval = up->u_rxcslen;
1032                         INP_WUNLOCK(inp);
1033                         error = sooptcopyout(sopt, &optval, sizeof(optval));
1034                         break;
1035                 default:
1036                         INP_WUNLOCK(inp);
1037                         error = ENOPROTOOPT;
1038                         break;
1039                 }
1040                 break;
1041         }
1042         return (error);
1043 }
1044
1045 #ifdef INET
1046 #ifdef INET6
1047 /* The logic here is derived from ip6_setpktopt(). See comments there. */
1048 static int
1049 udp_v4mapped_pktinfo(struct cmsghdr *cm, struct sockaddr_in * src,
1050     struct inpcb *inp, int flags)
1051 {
1052         struct ifnet *ifp;
1053         struct in6_pktinfo *pktinfo;
1054         struct in_addr ia;
1055
1056         if ((flags & PRUS_IPV6) == 0)
1057                 return (0);
1058
1059         if (cm->cmsg_level != IPPROTO_IPV6)
1060                 return (0);
1061
1062         if  (cm->cmsg_type != IPV6_2292PKTINFO &&
1063             cm->cmsg_type != IPV6_PKTINFO)
1064                 return (0);
1065
1066         if (cm->cmsg_len !=
1067             CMSG_LEN(sizeof(struct in6_pktinfo)))
1068                 return (EINVAL);
1069
1070         pktinfo = (struct in6_pktinfo *)CMSG_DATA(cm);
1071         if (!IN6_IS_ADDR_V4MAPPED(&pktinfo->ipi6_addr) &&
1072             !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr))
1073                 return (EINVAL);
1074
1075         /* Validate the interface index if specified. */
1076         if (pktinfo->ipi6_ifindex) {
1077                 struct epoch_tracker et;
1078
1079                 NET_EPOCH_ENTER(et);
1080                 ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
1081                 NET_EPOCH_EXIT(et);     /* XXXGL: unsafe ifp */
1082                 if (ifp == NULL)
1083                         return (ENXIO);
1084         } else
1085                 ifp = NULL;
1086         if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
1087                 ia.s_addr = pktinfo->ipi6_addr.s6_addr32[3];
1088                 if (in_ifhasaddr(ifp, ia) == 0)
1089                         return (EADDRNOTAVAIL);
1090         }
1091
1092         bzero(src, sizeof(*src));
1093         src->sin_family = AF_INET;
1094         src->sin_len = sizeof(*src);
1095         src->sin_port = inp->inp_lport;
1096         src->sin_addr.s_addr = pktinfo->ipi6_addr.s6_addr32[3];
1097
1098         return (0);
1099 }
1100 #endif
1101
/*
 * Build and transmit one UDP or UDP-Lite datagram over IPv4.
 *
 * inp     - sending inpcb; locked here (write or read, see below) and
 *           unlocked again before returning.
 * m       - payload mbuf chain; handed to ip_output() on success and freed
 *           on every error path.
 * addr    - optional destination (struct sockaddr_in); when NULL the
 *           inpcb's connected foreign address/port is used.
 * control - optional control-message mbuf (IP_SENDSRCADDR, IP_TOS,
 *           IP_FLOWID, IP_FLOWTYPE, IP_RSSBUCKETID under RSS, and IPv6
 *           pktinfo via udp_v4mapped_pktinfo() under INET6); always freed
 *           here.
 * td      - calling thread; its credentials are used for bind/connect
 *           setup and jail checks.
 * flags   - send flags, forwarded to udp_v4mapped_pktinfo().
 *
 * Returns 0 or an errno value.  Errors raised directly in this function:
 * EMSGSIZE (datagram larger than IP_MAXPACKET), EINVAL / ENOPROTOOPT
 * (control-message problems), EISCONN (addr given on a connected socket),
 * ENOTCONN (no addr and not connected), ENOBUFS (header prepend failed),
 * EAGAIN (hash insertion of a newly chosen port failed); other values are
 * propagated from the helpers called.
 */
1102 static int
1103 udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
1104     struct mbuf *control, struct thread *td, int flags)
1105 {
1106         struct udpiphdr *ui;
1107         int len = m->m_pkthdr.len;
1108         struct in_addr faddr, laddr;
1109         struct cmsghdr *cm;
1110         struct inpcbinfo *pcbinfo;
1111         struct sockaddr_in *sin, src;
1112         struct epoch_tracker et;
1113         int cscov_partial = 0;
1114         int error = 0;
1115         int ipflags = 0;
1116         u_short fport, lport;
1117         u_char tos;
1118         uint8_t pr;
1119         uint16_t cscov = 0;
1120         uint32_t flowid = 0;
1121         uint8_t flowtype = M_HASHTYPE_NONE;
1122
1123         if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
1124                 if (control)
1125                         m_freem(control);
1126                 m_freem(m);
1127                 return (EMSGSIZE);
1128         }
1129
             /*
              * src.sin_family stays 0 unless a control message supplies a
              * source address; it is compared against AF_INET further down.
              */
1130         src.sin_family = 0;
1131         sin = (struct sockaddr_in *)addr;
1132
1133         /*
1134          * udp_output() may need to temporarily bind or connect the current
1135          * inpcb.  As such, we don't know up front whether we will need the
1136          * pcbinfo lock or not.  Do any work to decide what is needed up
1137          * front before acquiring any locks.
1138          *
1139          * We will need network epoch in either case, to safely lookup into
1140          * pcb hash.
1141          */
1142         if (sin == NULL ||
1143             (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0))
1144                 INP_WLOCK(inp);
1145         else
1146                 INP_RLOCK(inp);
1147         NET_EPOCH_ENTER(et);
1148         tos = inp->inp_ip_tos;
1149         if (control != NULL) {
1150                 /*
1151                  * XXX: Currently, we assume all the optional information is
1152                  * stored in a single mbuf.
1153                  */
1154                 if (control->m_next) {
1155                         m_freem(control);
1156                         error = EINVAL;
1157                         goto release;
1158                 }
                     /*
                      * Walk the control messages in place: each iteration
                      * advances m_data/m_len past one aligned cmsg.
                      */
1159                 for (; control->m_len > 0;
1160                     control->m_data += CMSG_ALIGN(cm->cmsg_len),
1161                     control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
1162                         cm = mtod(control, struct cmsghdr *);
1163                         if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
1164                             || cm->cmsg_len > control->m_len) {
1165                                 error = EINVAL;
1166                                 break;
1167                         }
1168 #ifdef INET6
1169                         error = udp_v4mapped_pktinfo(cm, &src, inp, flags);
1170                         if (error != 0)
1171                                 break;
1172 #endif
1173                         if (cm->cmsg_level != IPPROTO_IP)
1174                                 continue;
1175
1176                         switch (cm->cmsg_type) {
1177                         case IP_SENDSRCADDR:
1178                                 if (cm->cmsg_len !=
1179                                     CMSG_LEN(sizeof(struct in_addr))) {
1180                                         error = EINVAL;
1181                                         break;
1182                                 }
1183                                 bzero(&src, sizeof(src));
1184                                 src.sin_family = AF_INET;
1185                                 src.sin_len = sizeof(src);
1186                                 src.sin_port = inp->inp_lport;
1187                                 src.sin_addr =
1188                                     *(struct in_addr *)CMSG_DATA(cm);
1189                                 break;
1190
1191                         case IP_TOS:
1192                                 if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) {
1193                                         error = EINVAL;
1194                                         break;
1195                                 }
1196                                 tos = *(u_char *)CMSG_DATA(cm);
1197                                 break;
1198
1199                         case IP_FLOWID:
1200                                 if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
1201                                         error = EINVAL;
1202                                         break;
1203                                 }
1204                                 flowid = *(uint32_t *) CMSG_DATA(cm);
1205                                 break;
1206
1207                         case IP_FLOWTYPE:
1208                                 if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
1209                                         error = EINVAL;
1210                                         break;
1211                                 }
                                     /* NOTE(review): 32-bit value stored into the uint8_t flowtype. */
1212                                 flowtype = *(uint32_t *) CMSG_DATA(cm);
1213                                 break;
1214
1215 #ifdef  RSS
1216                         case IP_RSSBUCKETID:
1217                                 if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
1218                                         error = EINVAL;
1219                                         break;
1220                                 }
1221                                 /* This is just a placeholder for now */
1222                                 break;
1223 #endif  /* RSS */
1224                         default:
1225                                 error = ENOPROTOOPT;
1226                                 break;
1227                         }
                             /* The switch's break only left the switch; leave the loop too. */
1228                         if (error)
1229                                 break;
1230                 }
                     /* The control mbuf is consumed here whether or not parsing succeeded. */
1231                 m_freem(control);
1232                 control = NULL;
1233         }
1234         if (error)
1235                 goto release;
1236
1237         pr = inp->inp_socket->so_proto->pr_protocol;
1238         pcbinfo = udp_get_inpcbinfo(pr);
1239
1240         /*
1241          * If the IP_SENDSRCADDR control message was specified, override the
1242          * source address for this datagram.  Its use is invalidated if the
1243          * address thus specified is incomplete or clobbers other inpcbs.
1244          */
1245         laddr = inp->inp_laddr;
1246         lport = inp->inp_lport;
1247         if (src.sin_family == AF_INET) {
1248                 if ((lport == 0) ||
1249                     (laddr.s_addr == INADDR_ANY &&
1250                      src.sin_addr.s_addr == INADDR_ANY)) {
1251                         error = EINVAL;
1252                         goto release;
1253                 }
1254                 INP_HASH_WLOCK(pcbinfo);
1255                 error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
1256                     &laddr.s_addr, &lport, td->td_ucred);
1257                 INP_HASH_WUNLOCK(pcbinfo);
1258                 if (error)
1259                         goto release;
1260         }
1261
1262         /*
1263          * If a UDP socket has been connected, then a local address/port will
1264          * have been selected and bound.
1265          *
1266          * If a UDP socket has not been connected to, then an explicit
1267          * destination address must be used, in which case a local
1268          * address/port may not have been selected and bound.
1269          */
1270         if (sin != NULL) {
1271                 INP_LOCK_ASSERT(inp);
1272                 if (inp->inp_faddr.s_addr != INADDR_ANY) {
1273                         error = EISCONN;
1274                         goto release;
1275                 }
1276
1277                 /*
1278                  * Jail may rewrite the destination address, so let it do
1279                  * that before we use it.
1280                  */
1281                 error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1282                 if (error)
1283                         goto release;
1284
1285                 /*
1286                  * If a local address or port hasn't yet been selected, or if
1287                  * the destination address needs to be rewritten due to using
1288                  * a special INADDR_ constant, invoke in_pcbconnect_setup()
1289                  * to do the heavy lifting.  Once a port is selected, we
1290                  * commit the binding back to the socket; we also commit the
1291                  * binding of the address if in jail.
1292                  *
1293                  * If we already have a valid binding and we're not
1294                  * requesting a destination address rewrite, use a fast path.
1295                  */
1296                 if (inp->inp_laddr.s_addr == INADDR_ANY ||
1297                     inp->inp_lport == 0 ||
1298                     sin->sin_addr.s_addr == INADDR_ANY ||
1299                     sin->sin_addr.s_addr == INADDR_BROADCAST) {
1300                         INP_HASH_WLOCK(pcbinfo);
1301                         error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
1302                             &lport, &faddr.s_addr, &fport, NULL,
1303                             td->td_ucred);
1304                         if (error) {
1305                                 INP_HASH_WUNLOCK(pcbinfo);
1306                                 goto release;
1307                         }
1308
1309                         /*
1310                          * XXXRW: Why not commit the port if the address is
1311                          * !INADDR_ANY?
1312                          */
1313                         /* Commit the local port if newly assigned. */
1314                         if (inp->inp_laddr.s_addr == INADDR_ANY &&
1315                             inp->inp_lport == 0) {
1316                                 INP_WLOCK_ASSERT(inp);
1317                                 /*
1318                                  * Remember addr if jailed, to prevent
1319                                  * rebinding.
1320                                  */
1321                                 if (prison_flag(td->td_ucred, PR_IP4))
1322                                         inp->inp_laddr = laddr;
1323                                 inp->inp_lport = lport;
1324                                 error = in_pcbinshash(inp);
1325                                 INP_HASH_WUNLOCK(pcbinfo);
1326                                 if (error != 0) {
                                             /* Undo the port commit; the caller sees EAGAIN. */
1327                                         inp->inp_lport = 0;
1328                                         error = EAGAIN;
1329                                         goto release;
1330                                 }
1331                                 inp->inp_flags |= INP_ANONPORT;
1332                         } else
1333                                 INP_HASH_WUNLOCK(pcbinfo);
1334                 } else {
1335                         faddr = sin->sin_addr;
1336                         fport = sin->sin_port;
1337                 }
1338         } else {
1339                 INP_LOCK_ASSERT(inp);
1340                 faddr = inp->inp_faddr;
1341                 fport = inp->inp_fport;
1342                 if (faddr.s_addr == INADDR_ANY) {
1343                         error = ENOTCONN;
1344                         goto release;
1345                 }
1346         }
1347
1348         /*
1349          * Calculate data length and get a mbuf for UDP, IP, and possible
1350          * link-layer headers.  Immediate slide the data pointer back forward
1351          * since we won't use that space at this layer.
1352          */
1353         M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT);
1354         if (m == NULL) {
1355                 error = ENOBUFS;
1356                 goto release;
1357         }
1358         m->m_data += max_linkhdr;
1359         m->m_len -= max_linkhdr;
1360         m->m_pkthdr.len -= max_linkhdr;
1361
1362         /*
1363          * Fill in mbuf with extended UDP header and addresses and length put
1364          * into network format.
1365          */
1366         ui = mtod(m, struct udpiphdr *);
1367         bzero(ui->ui_x1, sizeof(ui->ui_x1));    /* XXX still needed? */
1368         ui->ui_v = IPVERSION << 4;
1369         ui->ui_pr = pr;
1370         ui->ui_src = laddr;
1371         ui->ui_dst = faddr;
1372         ui->ui_sport = lport;
1373         ui->ui_dport = fport;
1374         ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
1375         if (pr == IPPROTO_UDPLITE) {
1376                 struct udpcb *up;
1377                 uint16_t plen;
1378
1379                 up = intoudpcb(inp);
1380                 cscov = up->u_txcslen;
1381                 plen = (u_short)len + sizeof(struct udphdr);
                     /* Coverage at or beyond the packet length is treated as full coverage. */
1382                 if (cscov >= plen)
1383                         cscov = 0;
1384                 ui->ui_len = htons(plen);
1385                 ui->ui_ulen = htons(cscov);
1386                 /*
1387                  * For UDP-Lite, checksum coverage length of zero means
1388                  * the entire UDPLite packet is covered by the checksum.
1389                  */
1390                 cscov_partial = (cscov == 0) ? 0 : 1;
1391         }
1392
1393         /*
1394          * Set the Don't Fragment bit in the IP header.
1395          */
1396         if (inp->inp_flags & INP_DONTFRAG) {
1397                 struct ip *ip;
1398
1399                 ip = (struct ip *)&ui->ui_i;
1400                 ip->ip_off |= htons(IP_DF);
1401         }
1402
1403         if (inp->inp_socket->so_options & SO_DONTROUTE)
1404                 ipflags |= IP_ROUTETOIF;
1405         if (inp->inp_socket->so_options & SO_BROADCAST)
1406                 ipflags |= IP_ALLOWBROADCAST;
1407         if (inp->inp_flags & INP_ONESBCAST)
1408                 ipflags |= IP_SENDONES;
1409
1410 #ifdef MAC
1411         mac_inpcb_create_mbuf(inp, m);
1412 #endif
1413
1414         /*
1415          * Set up checksum and output datagram.
1416          */
1417         ui->ui_sum = 0;
1418         if (pr == IPPROTO_UDPLITE) {
1419                 if (inp->inp_flags & INP_ONESBCAST)
1420                         faddr.s_addr = INADDR_BROADCAST;
                     /* A computed checksum of 0 is sent as 0xffff. */
1421                 if (cscov_partial) {
1422                         if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0)
1423                                 ui->ui_sum = 0xffff;
1424                 } else {
1425                         if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0)
1426                                 ui->ui_sum = 0xffff;
1427                 }
1428         } else if (V_udp_cksum) {
1429                 if (inp->inp_flags & INP_ONESBCAST)
1430                         faddr.s_addr = INADDR_BROADCAST;
                     /*
                      * Only the pseudo-header sum is stored here; CSUM_UDP and
                      * csum_data are set on the mbuf for the checksum proper.
                      */
1431                 ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
1432                     htons((u_short)len + sizeof(struct udphdr) + pr));
1433                 m->m_pkthdr.csum_flags = CSUM_UDP;
1434                 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
1435         }
1436         ((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len);
1437         ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;    /* XXX */
1438         ((struct ip *)ui)->ip_tos = tos;                /* XXX */
1439         UDPSTAT_INC(udps_opackets);
1440
1441         /*
1442          * Setup flowid / RSS information for outbound socket.
1443          *
1444          * Once the UDP code decides to set a flowid some other way,
1445          * this allows the flowid to be overridden by userland.
1446          */
1447         if (flowtype != M_HASHTYPE_NONE) {
1448                 m->m_pkthdr.flowid = flowid;
1449                 M_HASHTYPE_SET(m, flowtype);
1450         }
1451 #if defined(ROUTE_MPATH) || defined(RSS)
1452         else if (CALC_FLOWID_OUTBOUND_SENDTO) {
1453                 uint32_t hash_val, hash_type;
1454
1455                 hash_val = fib4_calc_packet_hash(laddr, faddr,
1456                     lport, fport, pr, &hash_type);
1457                 m->m_pkthdr.flowid = hash_val;
1458                 M_HASHTYPE_SET(m, hash_type);
1459         }
1460
1461         /*
1462          * Don't override with the inp cached flowid value.
1463          *
1464          * Depending upon the kind of send being done, the inp
1465          * flowid/flowtype values may actually not be appropriate
1466          * for this particular socket send.
1467          *
1468          * We should either leave the flowid at zero (which is what is
1469          * currently done) or set it to some software generated
1470          * hash value based on the packet contents.
1471          */
1472         ipflags |= IP_NODEFAULTFLOWID;
1473 #endif  /* RSS */
1474
1475         if (pr == IPPROTO_UDPLITE)
1476                 UDPLITE_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
1477         else
1478                 UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
             /*
              * The cached inp_route is only passed to ip_output() when the
              * inpcb is write-locked; otherwise NULL is passed instead.
              */
1479         error = ip_output(m, inp->inp_options,
1480             INP_WLOCKED(inp) ? &inp->inp_route : NULL, ipflags,
1481             inp->inp_moptions, inp);
1482         INP_UNLOCK(inp);
1483         NET_EPOCH_EXIT(et);
1484         return (error);
1485
             /* Error exit: drop the inpcb lock and the epoch, then free the data mbuf. */
1486 release:
1487         INP_UNLOCK(inp);
1488         NET_EPOCH_EXIT(et);
1489         m_freem(m);
1490         return (error);
1491 }
1492
1493 static void
1494 udp_abort(struct socket *so)
1495 {
1496         struct inpcb *inp;
1497         struct inpcbinfo *pcbinfo;
1498
1499         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1500         inp = sotoinpcb(so);
1501         KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
1502         INP_WLOCK(inp);
1503         if (inp->inp_faddr.s_addr != INADDR_ANY) {
1504                 INP_HASH_WLOCK(pcbinfo);
1505                 in_pcbdisconnect(inp);
1506                 inp->inp_laddr.s_addr = INADDR_ANY;
1507                 INP_HASH_WUNLOCK(pcbinfo);
1508                 soisdisconnected(so);
1509         }
1510         INP_WUNLOCK(inp);
1511 }
1512
1513 static int
1514 udp_attach(struct socket *so, int proto, struct thread *td)
1515 {
1516         static uint32_t udp_flowid;
1517         struct inpcb *inp;
1518         struct inpcbinfo *pcbinfo;
1519         int error;
1520
1521         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1522         inp = sotoinpcb(so);
1523         KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
1524         error = soreserve(so, udp_sendspace, udp_recvspace);
1525         if (error)
1526                 return (error);
1527         error = in_pcballoc(so, pcbinfo);
1528         if (error)
1529                 return (error);
1530
1531         inp = sotoinpcb(so);
1532         inp->inp_vflag |= INP_IPV4;
1533         inp->inp_ip_ttl = V_ip_defttl;
1534         inp->inp_flowid = atomic_fetchadd_int(&udp_flowid, 1);
1535         inp->inp_flowtype = M_HASHTYPE_OPAQUE;
1536
1537         error = udp_newudpcb(inp);
1538         if (error) {
1539                 in_pcbdetach(inp);
1540                 in_pcbfree(inp);
1541                 return (error);
1542         }
1543         INP_WUNLOCK(inp);
1544
1545         return (0);
1546 }
1547 #endif /* INET */
1548
1549 int
1550 udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx)
1551 {
1552         struct inpcb *inp;
1553         struct udpcb *up;
1554
1555         KASSERT(so->so_type == SOCK_DGRAM,
1556             ("udp_set_kernel_tunneling: !dgram"));
1557         inp = sotoinpcb(so);
1558         KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
1559         INP_WLOCK(inp);
1560         up = intoudpcb(inp);
1561         if ((f != NULL || i != NULL) && ((up->u_tun_func != NULL) ||
1562             (up->u_icmp_func != NULL))) {
1563                 INP_WUNLOCK(inp);
1564                 return (EBUSY);
1565         }
1566         up->u_tun_func = f;
1567         up->u_icmp_func = i;
1568         up->u_tun_ctx = ctx;
1569         INP_WUNLOCK(inp);
1570         return (0);
1571 }
1572
1573 #ifdef INET
1574 static int
1575 udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
1576 {
1577         struct inpcb *inp;
1578         struct inpcbinfo *pcbinfo;
1579         struct sockaddr_in *sinp;
1580         int error;
1581
1582         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1583         inp = sotoinpcb(so);
1584         KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
1585
1586         sinp = (struct sockaddr_in *)nam;
1587         if (nam->sa_family != AF_INET) {
1588                 /*
1589                  * Preserve compatibility with old programs.
1590                  */
1591                 if (nam->sa_family != AF_UNSPEC ||
1592                     nam->sa_len < offsetof(struct sockaddr_in, sin_zero) ||
1593                     sinp->sin_addr.s_addr != INADDR_ANY)
1594                         return (EAFNOSUPPORT);
1595                 nam->sa_family = AF_INET;
1596         }
1597         if (nam->sa_len != sizeof(struct sockaddr_in))
1598                 return (EINVAL);
1599
1600         INP_WLOCK(inp);
1601         INP_HASH_WLOCK(pcbinfo);
1602         error = in_pcbbind(inp, nam, td->td_ucred);
1603         INP_HASH_WUNLOCK(pcbinfo);
1604         INP_WUNLOCK(inp);
1605         return (error);
1606 }
1607
1608 static void
1609 udp_close(struct socket *so)
1610 {
1611         struct inpcb *inp;
1612         struct inpcbinfo *pcbinfo;
1613
1614         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1615         inp = sotoinpcb(so);
1616         KASSERT(inp != NULL, ("udp_close: inp == NULL"));
1617         INP_WLOCK(inp);
1618         if (inp->inp_faddr.s_addr != INADDR_ANY) {
1619                 INP_HASH_WLOCK(pcbinfo);
1620                 in_pcbdisconnect(inp);
1621                 inp->inp_laddr.s_addr = INADDR_ANY;
1622                 INP_HASH_WUNLOCK(pcbinfo);
1623                 soisdisconnected(so);
1624         }
1625         INP_WUNLOCK(inp);
1626 }
1627
1628 static int
1629 udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1630 {
1631         struct epoch_tracker et;
1632         struct inpcb *inp;
1633         struct inpcbinfo *pcbinfo;
1634         struct sockaddr_in *sin;
1635         int error;
1636
1637         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1638         inp = sotoinpcb(so);
1639         KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
1640
1641         sin = (struct sockaddr_in *)nam;
1642         if (sin->sin_family != AF_INET)
1643                 return (EAFNOSUPPORT);
1644         if (sin->sin_len != sizeof(*sin))
1645                 return (EINVAL);
1646
1647         INP_WLOCK(inp);
1648         if (inp->inp_faddr.s_addr != INADDR_ANY) {
1649                 INP_WUNLOCK(inp);
1650                 return (EISCONN);
1651         }
1652         error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1653         if (error != 0) {
1654                 INP_WUNLOCK(inp);
1655                 return (error);
1656         }
1657         NET_EPOCH_ENTER(et);
1658         INP_HASH_WLOCK(pcbinfo);
1659         error = in_pcbconnect(inp, nam, td->td_ucred, true);
1660         INP_HASH_WUNLOCK(pcbinfo);
1661         NET_EPOCH_EXIT(et);
1662         if (error == 0)
1663                 soisconnected(so);
1664         INP_WUNLOCK(inp);
1665         return (error);
1666 }
1667
1668 static void
1669 udp_detach(struct socket *so)
1670 {
1671         struct inpcb *inp;
1672         struct udpcb *up;
1673
1674         inp = sotoinpcb(so);
1675         KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
1676         KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
1677             ("udp_detach: not disconnected"));
1678         INP_WLOCK(inp);
1679         up = intoudpcb(inp);
1680         KASSERT(up != NULL, ("%s: up == NULL", __func__));
1681         inp->inp_ppcb = NULL;
1682         in_pcbdetach(inp);
1683         in_pcbfree(inp);
1684         udp_discardcb(up);
1685 }
1686
1687 static int
1688 udp_disconnect(struct socket *so)
1689 {
1690         struct inpcb *inp;
1691         struct inpcbinfo *pcbinfo;
1692
1693         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1694         inp = sotoinpcb(so);
1695         KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
1696         INP_WLOCK(inp);
1697         if (inp->inp_faddr.s_addr == INADDR_ANY) {
1698                 INP_WUNLOCK(inp);
1699                 return (ENOTCONN);
1700         }
1701         INP_HASH_WLOCK(pcbinfo);
1702         in_pcbdisconnect(inp);
1703         inp->inp_laddr.s_addr = INADDR_ANY;
1704         INP_HASH_WUNLOCK(pcbinfo);
1705         SOCK_LOCK(so);
1706         so->so_state &= ~SS_ISCONNECTED;                /* XXX */
1707         SOCK_UNLOCK(so);
1708         INP_WUNLOCK(inp);
1709         return (0);
1710 }
1711
1712 static int
1713 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
1714     struct mbuf *control, struct thread *td)
1715 {
1716         struct inpcb *inp;
1717         int error;
1718
1719         inp = sotoinpcb(so);
1720         KASSERT(inp != NULL, ("udp_send: inp == NULL"));
1721
1722         if (addr != NULL) {
1723                 error = 0;
1724                 if (addr->sa_family != AF_INET)
1725                         error = EAFNOSUPPORT;
1726                 else if (addr->sa_len != sizeof(struct sockaddr_in))
1727                         error = EINVAL;
1728                 if (__predict_false(error != 0)) {
1729                         m_freem(control);
1730                         m_freem(m);
1731                         return (error);
1732                 }
1733         }
1734         return (udp_output(inp, m, addr, control, td, flags));
1735 }
1736 #endif /* INET */
1737
1738 int
1739 udp_shutdown(struct socket *so)
1740 {
1741         struct inpcb *inp;
1742
1743         inp = sotoinpcb(so);
1744         KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
1745         INP_WLOCK(inp);
1746         socantsendmore(so);
1747         INP_WUNLOCK(inp);
1748         return (0);
1749 }
1750
1751 #ifdef INET
1752 struct pr_usrreqs udp_usrreqs = {
1753         .pru_abort =            udp_abort,
1754         .pru_attach =           udp_attach,
1755         .pru_bind =             udp_bind,
1756         .pru_connect =          udp_connect,
1757         .pru_control =          in_control,
1758         .pru_detach =           udp_detach,
1759         .pru_disconnect =       udp_disconnect,
1760         .pru_peeraddr =         in_getpeeraddr,
1761         .pru_send =             udp_send,
1762         .pru_soreceive =        soreceive_dgram,
1763         .pru_sosend =           sosend_dgram,
1764         .pru_shutdown =         udp_shutdown,
1765         .pru_sockaddr =         in_getsockaddr,
1766         .pru_sosetlabel =       in_pcbsosetlabel,
1767         .pru_close =            udp_close,
1768 };
1769 #endif /* INET */