]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/udp_usrreq.c
amd64: use register macros for gdb_cpu_getreg()
[FreeBSD/FreeBSD.git] / sys / netinet / udp_usrreq.c
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
5  *      The Regents of the University of California.
6  * Copyright (c) 2008 Robert N. M. Watson
7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
8  * Copyright (c) 2014 Kevin Lo
9  * All rights reserved.
10  *
11  * Portions of this software were developed by Robert N. M. Watson under
12  * contract to Juniper Networks, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *      @(#)udp_usrreq.c        8.6 (Berkeley) 5/23/95
39  */
40
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43
44 #include "opt_inet.h"
45 #include "opt_inet6.h"
46 #include "opt_ipsec.h"
47 #include "opt_route.h"
48 #include "opt_rss.h"
49
50 #include <sys/param.h>
51 #include <sys/domain.h>
52 #include <sys/eventhandler.h>
53 #include <sys/jail.h>
54 #include <sys/kernel.h>
55 #include <sys/lock.h>
56 #include <sys/malloc.h>
57 #include <sys/mbuf.h>
58 #include <sys/priv.h>
59 #include <sys/proc.h>
60 #include <sys/protosw.h>
61 #include <sys/sdt.h>
62 #include <sys/signalvar.h>
63 #include <sys/socket.h>
64 #include <sys/socketvar.h>
65 #include <sys/sx.h>
66 #include <sys/sysctl.h>
67 #include <sys/syslog.h>
68 #include <sys/systm.h>
69
70 #include <vm/uma.h>
71
72 #include <net/if.h>
73 #include <net/if_var.h>
74 #include <net/route.h>
75 #include <net/route/nhop.h>
76 #include <net/rss_config.h>
77
78 #include <netinet/in.h>
79 #include <netinet/in_kdtrace.h>
80 #include <netinet/in_fib.h>
81 #include <netinet/in_pcb.h>
82 #include <netinet/in_systm.h>
83 #include <netinet/in_var.h>
84 #include <netinet/ip.h>
85 #ifdef INET6
86 #include <netinet/ip6.h>
87 #endif
88 #include <netinet/ip_icmp.h>
89 #include <netinet/icmp_var.h>
90 #include <netinet/ip_var.h>
91 #include <netinet/ip_options.h>
92 #ifdef INET6
93 #include <netinet6/ip6_var.h>
94 #endif
95 #include <netinet/udp.h>
96 #include <netinet/udp_var.h>
97 #include <netinet/udplite.h>
98 #include <netinet/in_rss.h>
99
100 #include <netipsec/ipsec_support.h>
101
102 #include <machine/in_cksum.h>
103
104 #include <security/mac/mac_framework.h>
105
106 /*
107  * UDP and UDP-Lite protocols implementation.
108  * Per RFC 768, August, 1980.
109  * Per RFC 3828, July, 2004.
110  */
111
112 /*
113  * BSD 4.2 defaulted the udp checksum to be off.  Turning off udp checksums
114  * removes the only data integrity mechanism for packets and malformed
115  * packets that would otherwise be discarded due to bad checksums, and may
116  * cause problems (especially for NFS data blocks).
117  */
/* Per-VNET switch exported as the net.inet.udp "checksum" sysctl; on by default. */
VNET_DEFINE(int, udp_cksum) = 1;
SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_cksum), 0, "compute udp checksum");

/*
 * When non-zero, udp_input() logs datagrams for which no matching pcb was
 * found.
 */
VNET_DEFINE(int, udp_log_in_vain) = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_log_in_vain), 0, "Log all incoming UDP packets");

/*
 * When non-zero, udp_input() silently drops datagrams that match no pcb
 * instead of answering with an ICMP port unreachable.
 */
VNET_DEFINE(int, udp_blackhole) = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_blackhole), 0,
    "Do not send port unreachables for refused connects");

/* Global (not per-VNET) send-side limit, tunable through sysctl. */
u_long  udp_sendspace = 9216;           /* really max datagram size */
SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
    &udp_sendspace, 0, "Maximum outgoing UDP datagram size");

/*
 * Global receive-side limit: room for 40 datagrams of 1K each plus one
 * source address (the larger sockaddr_in6 when INET6 is configured).
 */
u_long  udp_recvspace = 40 * (1024 +
#ifdef INET6
                                      sizeof(struct sockaddr_in6)
#else
                                      sizeof(struct sockaddr_in)
#endif
                                      );        /* 40 1K datagrams */

SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
    &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");

/*
 * Per-VNET pcb lists and pcbinfo structures for UDP (udb/udbinfo) and
 * UDP-Lite (ulitecb/ulitecbinfo), plus the UMA zone that udp_init() creates
 * for struct udpcb allocations.
 */
VNET_DEFINE(struct inpcbhead, udb);             /* from udp_var.h */
VNET_DEFINE(struct inpcbinfo, udbinfo);
VNET_DEFINE(struct inpcbhead, ulitecb);
VNET_DEFINE(struct inpcbinfo, ulitecbinfo);
VNET_DEFINE_STATIC(uma_zone_t, udpcb_zone);
#define V_udpcb_zone                    VNET(udpcb_zone)

/*
 * Hash size handed to in_pcbinfo_init() for both UDP and UDP-Lite; may be
 * overridden at build time.
 */
#ifndef UDBHASHSIZE
#define UDBHASHSIZE     128
#endif

/* Per-VNET, per-CPU UDP statistics and their sysctl export. */
VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat);          /* from udp_var.h */
VNET_PCPUSTAT_SYSINIT(udpstat);
SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat,
    udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");

#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(udpstat);
#endif /* VIMAGE */
#ifdef INET
/* Forward declarations for IPv4-only routines defined later in this file. */
static void     udp_detach(struct socket *so);
static int      udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
                    struct mbuf *, struct thread *, int);
#endif
170
171 static void
172 udp_zone_change(void *tag)
173 {
174
175         uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
176         uma_zone_set_max(V_udpcb_zone, maxsockets);
177 }
178
/*
 * Item initializer handed to in_pcbinfo_init() by udp_init().  Sets up the
 * lock of the inpcb located at mem; size and flags are ignored.  Always
 * returns 0.
 */
static int
udp_inpcb_init(void *mem, int size, int flags)
{
        struct inpcb *pcb = mem;

        INP_LOCK_INIT(pcb, "inp", "udpinp");

        return (0);
}
188
/*
 * Item initializer handed to in_pcbinfo_init() by udplite_init().  Sets up
 * the lock of the inpcb located at mem; size and flags are ignored.  Always
 * returns 0.
 */
static int
udplite_inpcb_init(void *mem, int size, int flags)
{
        struct inpcb *pcb = mem;

        INP_LOCK_INIT(pcb, "inp", "udpliteinp");

        return (0);
}
198
199 void
200 udp_init(void)
201 {
202
203         /*
204          * For now default to 2-tuple UDP hashing - until the fragment
205          * reassembly code can also update the flowid.
206          *
207          * Once we can calculate the flowid that way and re-establish
208          * a 4-tuple, flip this to 4-tuple.
209          */
210         in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
211             "udp_inpcb", udp_inpcb_init, IPI_HASHFIELDS_2TUPLE);
212         V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
213             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
214         uma_zone_set_max(V_udpcb_zone, maxsockets);
215         uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
216         EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
217             EVENTHANDLER_PRI_ANY);
218 }
219
/*
 * Protocol initialization for UDP-Lite: set up its own pcbinfo and pcb list
 * with the same hash sizes and 2-tuple hashing that udp_init() uses for UDP.
 * No separate udpcb zone is created here.
 */
void
udplite_init(void)
{

        in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE,
            UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init,
            IPI_HASHFIELDS_2TUPLE);
}
228
/*
 * Kernel module interface for updating udpstat.  The argument is an index
 * into the current VNET's udpstat counter array; the selected counter is
 * incremented by one.  While this encodes the general layout of udpstat
 * into the caller, it doesn't encode its location, so that changes to how
 * the statistics are stored won't cause binary compatibility problems for
 * kernel modules.
 *
 * NOTE(review): statnum is not range-checked here; callers are presumably
 * trusted to pass a valid index -- confirm.
 */
void
kmod_udpstat_inc(int statnum)
{

        counter_u64_add(VNET(udpstat)[statnum], 1);
}
242
243 int
244 udp_newudpcb(struct inpcb *inp)
245 {
246         struct udpcb *up;
247
248         up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO);
249         if (up == NULL)
250                 return (ENOBUFS);
251         inp->inp_ppcb = up;
252         return (0);
253 }
254
/*
 * Return a udpcb obtained from udp_newudpcb() to the udpcb zone.  The
 * owning inpcb's inp_ppcb pointer is not touched here.
 */
void
udp_discardcb(struct udpcb *up)
{

        uma_zfree(V_udpcb_zone, up);
}
261
262 #ifdef VIMAGE
/*
 * VNET teardown for UDP (VIMAGE kernels only): undo udp_init() by
 * destroying the pcbinfo and the udpcb zone.  Registered below through
 * VNET_SYSUNINIT; the argument is unused.
 */
static void
udp_destroy(void *unused __unused)
{

        in_pcbinfo_destroy(&V_udbinfo);
        uma_zdestroy(V_udpcb_zone);
}
VNET_SYSUNINIT(udp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_destroy, NULL);
271
/*
 * VNET teardown for UDP-Lite (VIMAGE kernels only): undo udplite_init() by
 * destroying the UDP-Lite pcbinfo.  Registered below through
 * VNET_SYSUNINIT; the argument is unused.
 */
static void
udplite_destroy(void *unused __unused)
{

        in_pcbinfo_destroy(&V_ulitecbinfo);
}
VNET_SYSUNINIT(udplite, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udplite_destroy,
    NULL);
280 #endif
281
282 #ifdef INET
283 /*
284  * Subroutine of udp_input(), which appends the provided mbuf chain to the
285  * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
286  * contains the source address.  If the socket ends up being an IPv6 socket,
287  * udp_append() will convert to a sockaddr_in6 before passing the address
288  * into the socket code.
289  *
290  * In the normal case udp_append() will return 0, indicating that you
291  * must unlock the inp. However if a tunneling protocol is in place we increment
292  * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we
293  * then decrement the reference count. If the inp_rele returns 1, indicating the
294  * inp is gone, we return that to the caller to tell them *not* to unlock
295  * the inp. In the case of multi-cast this will cause the distribution
296  * to stop (though most tunneling protocols known currently do *not* use
297  * multicast).
298  */
299 static int
300 udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
301     struct sockaddr_in *udp_in)
302 {
303         struct sockaddr *append_sa;
304         struct socket *so;
305         struct mbuf *tmpopts, *opts = NULL;
306 #ifdef INET6
307         struct sockaddr_in6 udp_in6;
308 #endif
309         struct udpcb *up;
310
311         INP_LOCK_ASSERT(inp);
312
313         /*
314          * Engage the tunneling protocol.
315          */
316         up = intoudpcb(inp);
317         if (up->u_tun_func != NULL) {
318                 in_pcbref(inp);
319                 INP_RUNLOCK(inp);
320                 (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&udp_in[0],
321                     up->u_tun_ctx);
322                 INP_RLOCK(inp);
323                 return (in_pcbrele_rlocked(inp));
324         }
325
326         off += sizeof(struct udphdr);
327
328 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
329         /* Check AH/ESP integrity. */
330         if (IPSEC_ENABLED(ipv4) &&
331             IPSEC_CHECK_POLICY(ipv4, n, inp) != 0) {
332                 m_freem(n);
333                 return (0);
334         }
335         if (up->u_flags & UF_ESPINUDP) {/* IPSec UDP encaps. */
336                 if (IPSEC_ENABLED(ipv4) &&
337                     UDPENCAP_INPUT(n, off, AF_INET) != 0)
338                         return (0);     /* Consumed. */
339         }
340 #endif /* IPSEC */
341 #ifdef MAC
342         if (mac_inpcb_check_deliver(inp, n) != 0) {
343                 m_freem(n);
344                 return (0);
345         }
346 #endif /* MAC */
347         if (inp->inp_flags & INP_CONTROLOPTS ||
348             inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
349 #ifdef INET6
350                 if (inp->inp_vflag & INP_IPV6)
351                         (void)ip6_savecontrol_v4(inp, n, &opts, NULL);
352                 else
353 #endif /* INET6 */
354                         ip_savecontrol(inp, &opts, ip, n);
355         }
356         if ((inp->inp_vflag & INP_IPV4) && (inp->inp_flags2 & INP_ORIGDSTADDR)) {
357                 tmpopts = sbcreatecontrol((caddr_t)&udp_in[1],
358                         sizeof(struct sockaddr_in), IP_ORIGDSTADDR, IPPROTO_IP);
359                 if (tmpopts) {
360                         if (opts) {
361                                 tmpopts->m_next = opts;
362                                 opts = tmpopts;
363                         } else
364                                 opts = tmpopts;
365                 }
366         }
367 #ifdef INET6
368         if (inp->inp_vflag & INP_IPV6) {
369                 bzero(&udp_in6, sizeof(udp_in6));
370                 udp_in6.sin6_len = sizeof(udp_in6);
371                 udp_in6.sin6_family = AF_INET6;
372                 in6_sin_2_v4mapsin6(&udp_in[0], &udp_in6);
373                 append_sa = (struct sockaddr *)&udp_in6;
374         } else
375 #endif /* INET6 */
376                 append_sa = (struct sockaddr *)&udp_in[0];
377         m_adj(n, off);
378
379         so = inp->inp_socket;
380         SOCKBUF_LOCK(&so->so_rcv);
381         if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
382                 SOCKBUF_UNLOCK(&so->so_rcv);
383                 m_freem(n);
384                 if (opts)
385                         m_freem(opts);
386                 UDPSTAT_INC(udps_fullsock);
387         } else
388                 sorwakeup_locked(so);
389         return (0);
390 }
391
392 int
393 udp_input(struct mbuf **mp, int *offp, int proto)
394 {
395         struct ip *ip;
396         struct udphdr *uh;
397         struct ifnet *ifp;
398         struct inpcb *inp;
399         uint16_t len, ip_len;
400         struct inpcbinfo *pcbinfo;
401         struct ip save_ip;
402         struct sockaddr_in udp_in[2];
403         struct mbuf *m;
404         struct m_tag *fwd_tag;
405         int cscov_partial, iphlen;
406
407         m = *mp;
408         iphlen = *offp;
409         ifp = m->m_pkthdr.rcvif;
410         *mp = NULL;
411         UDPSTAT_INC(udps_ipackets);
412
413         /*
414          * Strip IP options, if any; should skip this, make available to
415          * user, and use on returned packets, but we don't yet have a way to
416          * check the checksum with options still present.
417          */
418         if (iphlen > sizeof (struct ip)) {
419                 ip_stripoptions(m);
420                 iphlen = sizeof(struct ip);
421         }
422
423         /*
424          * Get IP and UDP header together in first mbuf.
425          */
426         if (m->m_len < iphlen + sizeof(struct udphdr)) {
427                 if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) {
428                         UDPSTAT_INC(udps_hdrops);
429                         return (IPPROTO_DONE);
430                 }
431         }
432         ip = mtod(m, struct ip *);
433         uh = (struct udphdr *)((caddr_t)ip + iphlen);
434         cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0;
435
436         /*
437          * Destination port of 0 is illegal, based on RFC768.
438          */
439         if (uh->uh_dport == 0)
440                 goto badunlocked;
441
442         /*
443          * Construct sockaddr format source address.  Stuff source address
444          * and datagram in user buffer.
445          */
446         bzero(&udp_in[0], sizeof(struct sockaddr_in) * 2);
447         udp_in[0].sin_len = sizeof(struct sockaddr_in);
448         udp_in[0].sin_family = AF_INET;
449         udp_in[0].sin_port = uh->uh_sport;
450         udp_in[0].sin_addr = ip->ip_src;
451         udp_in[1].sin_len = sizeof(struct sockaddr_in);
452         udp_in[1].sin_family = AF_INET;
453         udp_in[1].sin_port = uh->uh_dport;
454         udp_in[1].sin_addr = ip->ip_dst;
455
456         /*
457          * Make mbuf data length reflect UDP length.  If not enough data to
458          * reflect UDP length, drop.
459          */
460         len = ntohs((u_short)uh->uh_ulen);
461         ip_len = ntohs(ip->ip_len) - iphlen;
462         if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) {
463                 /* Zero means checksum over the complete packet. */
464                 if (len == 0)
465                         len = ip_len;
466                 cscov_partial = 0;
467         }
468         if (ip_len != len) {
469                 if (len > ip_len || len < sizeof(struct udphdr)) {
470                         UDPSTAT_INC(udps_badlen);
471                         goto badunlocked;
472                 }
473                 if (proto == IPPROTO_UDP)
474                         m_adj(m, len - ip_len);
475         }
476
477         /*
478          * Save a copy of the IP header in case we want restore it for
479          * sending an ICMP error message in response.
480          */
481         if (!V_udp_blackhole)
482                 save_ip = *ip;
483         else
484                 memset(&save_ip, 0, sizeof(save_ip));
485
486         /*
487          * Checksum extended UDP header and data.
488          */
489         if (uh->uh_sum) {
490                 u_short uh_sum;
491
492                 if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
493                     !cscov_partial) {
494                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
495                                 uh_sum = m->m_pkthdr.csum_data;
496                         else
497                                 uh_sum = in_pseudo(ip->ip_src.s_addr,
498                                     ip->ip_dst.s_addr, htonl((u_short)len +
499                                     m->m_pkthdr.csum_data + proto));
500                         uh_sum ^= 0xffff;
501                 } else {
502                         char b[9];
503
504                         bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
505                         bzero(((struct ipovly *)ip)->ih_x1, 9);
506                         ((struct ipovly *)ip)->ih_len = (proto == IPPROTO_UDP) ?
507                             uh->uh_ulen : htons(ip_len);
508                         uh_sum = in_cksum(m, len + sizeof (struct ip));
509                         bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
510                 }
511                 if (uh_sum) {
512                         UDPSTAT_INC(udps_badsum);
513                         m_freem(m);
514                         return (IPPROTO_DONE);
515                 }
516         } else {
517                 if (proto == IPPROTO_UDP) {
518                         UDPSTAT_INC(udps_nosum);
519                 } else {
520                         /* UDPLite requires a checksum */
521                         /* XXX: What is the right UDPLite MIB counter here? */
522                         m_freem(m);
523                         return (IPPROTO_DONE);
524                 }
525         }
526
527         pcbinfo = udp_get_inpcbinfo(proto);
528         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
529             in_broadcast(ip->ip_dst, ifp)) {
530                 struct inpcb *last;
531                 struct inpcbhead *pcblist;
532
533                 NET_EPOCH_ASSERT();
534
535                 pcblist = udp_get_pcblist(proto);
536                 last = NULL;
537                 CK_LIST_FOREACH(inp, pcblist, inp_list) {
538                         if (inp->inp_lport != uh->uh_dport)
539                                 continue;
540 #ifdef INET6
541                         if ((inp->inp_vflag & INP_IPV4) == 0)
542                                 continue;
543 #endif
544                         if (inp->inp_laddr.s_addr != INADDR_ANY &&
545                             inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
546                                 continue;
547                         if (inp->inp_faddr.s_addr != INADDR_ANY &&
548                             inp->inp_faddr.s_addr != ip->ip_src.s_addr)
549                                 continue;
550                         if (inp->inp_fport != 0 &&
551                             inp->inp_fport != uh->uh_sport)
552                                 continue;
553
554                         INP_RLOCK(inp);
555
556                         if (__predict_false(inp->inp_flags2 & INP_FREED)) {
557                                 INP_RUNLOCK(inp);
558                                 continue;
559                         }
560
561                         /*
562                          * XXXRW: Because we weren't holding either the inpcb
563                          * or the hash lock when we checked for a match
564                          * before, we should probably recheck now that the
565                          * inpcb lock is held.
566                          */
567
568                         /*
569                          * Handle socket delivery policy for any-source
570                          * and source-specific multicast. [RFC3678]
571                          */
572                         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
573                                 struct ip_moptions      *imo;
574                                 struct sockaddr_in       group;
575                                 int                      blocked;
576
577                                 imo = inp->inp_moptions;
578                                 if (imo == NULL) {
579                                         INP_RUNLOCK(inp);
580                                         continue;
581                                 }
582                                 bzero(&group, sizeof(struct sockaddr_in));
583                                 group.sin_len = sizeof(struct sockaddr_in);
584                                 group.sin_family = AF_INET;
585                                 group.sin_addr = ip->ip_dst;
586
587                                 blocked = imo_multi_filter(imo, ifp,
588                                         (struct sockaddr *)&group,
589                                         (struct sockaddr *)&udp_in[0]);
590                                 if (blocked != MCAST_PASS) {
591                                         if (blocked == MCAST_NOTGMEMBER)
592                                                 IPSTAT_INC(ips_notmember);
593                                         if (blocked == MCAST_NOTSMEMBER ||
594                                             blocked == MCAST_MUTED)
595                                                 UDPSTAT_INC(udps_filtermcast);
596                                         INP_RUNLOCK(inp);
597                                         continue;
598                                 }
599                         }
600                         if (last != NULL) {
601                                 struct mbuf *n;
602
603                                 if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) !=
604                                     NULL) {
605                                         if (proto == IPPROTO_UDPLITE)
606                                                 UDPLITE_PROBE(receive, NULL, last, ip,
607                                                     last, uh);
608                                         else
609                                                 UDP_PROBE(receive, NULL, last, ip, last,
610                                                     uh);
611                                         if (udp_append(last, ip, n, iphlen,
612                                                 udp_in)) {
613                                                 goto inp_lost;
614                                         }
615                                 }
616                                 INP_RUNLOCK(last);
617                         }
618                         last = inp;
619                         /*
620                          * Don't look for additional matches if this one does
621                          * not have either the SO_REUSEPORT or SO_REUSEADDR
622                          * socket options set.  This heuristic avoids
623                          * searching through all pcbs in the common case of a
624                          * non-shared port.  It assumes that an application
625                          * will never clear these options after setting them.
626                          */
627                         if ((last->inp_socket->so_options &
628                             (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0)
629                                 break;
630                 }
631
632                 if (last == NULL) {
633                         /*
634                          * No matching pcb found; discard datagram.  (No need
635                          * to send an ICMP Port Unreachable for a broadcast
636                          * or multicast datgram.)
637                          */
638                         UDPSTAT_INC(udps_noportbcast);
639                         if (inp)
640                                 INP_RUNLOCK(inp);
641                         goto badunlocked;
642                 }
643                 if (proto == IPPROTO_UDPLITE)
644                         UDPLITE_PROBE(receive, NULL, last, ip, last, uh);
645                 else
646                         UDP_PROBE(receive, NULL, last, ip, last, uh);
647                 if (udp_append(last, ip, m, iphlen, udp_in) == 0)
648                         INP_RUNLOCK(last);
649         inp_lost:
650                 return (IPPROTO_DONE);
651         }
652
653         /*
654          * Locate pcb for datagram.
655          */
656
657         /*
658          * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
659          */
660         if ((m->m_flags & M_IP_NEXTHOP) &&
661             (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
662                 struct sockaddr_in *next_hop;
663
664                 next_hop = (struct sockaddr_in *)(fwd_tag + 1);
665
666                 /*
667                  * Transparently forwarded. Pretend to be the destination.
668                  * Already got one like this?
669                  */
670                 inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
671                     ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
672                 if (!inp) {
673                         /*
674                          * It's new.  Try to find the ambushing socket.
675                          * Because we've rewritten the destination address,
676                          * any hardware-generated hash is ignored.
677                          */
678                         inp = in_pcblookup(pcbinfo, ip->ip_src,
679                             uh->uh_sport, next_hop->sin_addr,
680                             next_hop->sin_port ? htons(next_hop->sin_port) :
681                             uh->uh_dport, INPLOOKUP_WILDCARD |
682                             INPLOOKUP_RLOCKPCB, ifp);
683                 }
684                 /* Remove the tag from the packet. We don't need it anymore. */
685                 m_tag_delete(m, fwd_tag);
686                 m->m_flags &= ~M_IP_NEXTHOP;
687         } else
688                 inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
689                     ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
690                     INPLOOKUP_RLOCKPCB, ifp, m);
691         if (inp == NULL) {
692                 if (V_udp_log_in_vain) {
693                         char src[INET_ADDRSTRLEN];
694                         char dst[INET_ADDRSTRLEN];
695
696                         log(LOG_INFO,
697                             "Connection attempt to UDP %s:%d from %s:%d\n",
698                             inet_ntoa_r(ip->ip_dst, dst), ntohs(uh->uh_dport),
699                             inet_ntoa_r(ip->ip_src, src), ntohs(uh->uh_sport));
700                 }
701                 if (proto == IPPROTO_UDPLITE)
702                         UDPLITE_PROBE(receive, NULL, NULL, ip, NULL, uh);
703                 else
704                         UDP_PROBE(receive, NULL, NULL, ip, NULL, uh);
705                 UDPSTAT_INC(udps_noport);
706                 if (m->m_flags & (M_BCAST | M_MCAST)) {
707                         UDPSTAT_INC(udps_noportbcast);
708                         goto badunlocked;
709                 }
710                 if (V_udp_blackhole)
711                         goto badunlocked;
712                 if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
713                         goto badunlocked;
714                 *ip = save_ip;
715                 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
716                 return (IPPROTO_DONE);
717         }
718
719         /*
720          * Check the minimum TTL for socket.
721          */
722         INP_RLOCK_ASSERT(inp);
723         if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
724                 if (proto == IPPROTO_UDPLITE)
725                         UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
726                 else
727                         UDP_PROBE(receive, NULL, inp, ip, inp, uh);
728                 INP_RUNLOCK(inp);
729                 m_freem(m);
730                 return (IPPROTO_DONE);
731         }
732         if (cscov_partial) {
733                 struct udpcb *up;
734
735                 up = intoudpcb(inp);
736                 if (up->u_rxcslen == 0 || up->u_rxcslen > len) {
737                         INP_RUNLOCK(inp);
738                         m_freem(m);
739                         return (IPPROTO_DONE);
740                 }
741         }
742
743         if (proto == IPPROTO_UDPLITE)
744                 UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
745         else
746                 UDP_PROBE(receive, NULL, inp, ip, inp, uh);
747         if (udp_append(inp, ip, m, iphlen, udp_in) == 0)
748                 INP_RUNLOCK(inp);
749         return (IPPROTO_DONE);
750
751 badunlocked:
752         m_freem(m);
753         return (IPPROTO_DONE);
754 }
755 #endif /* INET */
756
757 /*
758  * Notify a udp user of an asynchronous error; just wake up so that they can
759  * collect error status.
760  */
761 struct inpcb *
762 udp_notify(struct inpcb *inp, int errno)
763 {
764
765         INP_WLOCK_ASSERT(inp);
766         if ((errno == EHOSTUNREACH || errno == ENETUNREACH ||
767              errno == EHOSTDOWN) && inp->inp_route.ro_nh) {
768                 NH_FREE(inp->inp_route.ro_nh);
769                 inp->inp_route.ro_nh = (struct nhop_object *)NULL;
770         }
771
772         inp->inp_socket->so_error = errno;
773         sorwakeup(inp->inp_socket);
774         sowwakeup(inp->inp_socket);
775         return (inp);
776 }
777
778 #ifdef INET
779 static void
780 udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip,
781     struct inpcbinfo *pcbinfo)
782 {
783         struct ip *ip = vip;
784         struct udphdr *uh;
785         struct in_addr faddr;
786         struct inpcb *inp;
787
788         faddr = ((struct sockaddr_in *)sa)->sin_addr;
789         if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
790                 return;
791
792         if (PRC_IS_REDIRECT(cmd)) {
793                 /* signal EHOSTDOWN, as it flushes the cached route */
794                 in_pcbnotifyall(&V_udbinfo, faddr, EHOSTDOWN, udp_notify);
795                 return;
796         }
797
798         /*
799          * Hostdead is ugly because it goes linearly through all PCBs.
800          *
801          * XXX: We never get this from ICMP, otherwise it makes an excellent
802          * DoS attack on machines with many connections.
803          */
804         if (cmd == PRC_HOSTDEAD)
805                 ip = NULL;
806         else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
807                 return;
808         if (ip != NULL) {
809                 uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
810                 inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
811                     ip->ip_src, uh->uh_sport, INPLOOKUP_WLOCKPCB, NULL);
812                 if (inp != NULL) {
813                         INP_WLOCK_ASSERT(inp);
814                         if (inp->inp_socket != NULL) {
815                                 udp_notify(inp, inetctlerrmap[cmd]);
816                         }
817                         INP_WUNLOCK(inp);
818                 } else {
819                         inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
820                                            ip->ip_src, uh->uh_sport,
821                                            INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
822                         if (inp != NULL) {
823                                 struct udpcb *up;
824                                 void *ctx;
825                                 udp_tun_icmp_t func;
826
827                                 up = intoudpcb(inp);
828                                 ctx = up->u_tun_ctx;
829                                 func = up->u_icmp_func;
830                                 INP_RUNLOCK(inp);
831                                 if (func != NULL)
832                                         (*func)(cmd, sa, vip, ctx);
833                         }
834                 }
835         } else
836                 in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd],
837                     udp_notify);
838 }
839 void
840 udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
841 {
842
843         return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo));
844 }
845
846 void
847 udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip)
848 {
849
850         return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo));
851 }
852 #endif /* INET */
853
854 static int
855 udp_pcblist(SYSCTL_HANDLER_ARGS)
856 {
857         struct xinpgen xig;
858         struct epoch_tracker et;
859         struct inpcb *inp;
860         int error;
861
862         if (req->newptr != 0)
863                 return (EPERM);
864
865         if (req->oldptr == 0) {
866                 int n;
867
868                 n = V_udbinfo.ipi_count;
869                 n += imax(n / 8, 10);
870                 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
871                 return (0);
872         }
873
874         if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
875                 return (error);
876
877         bzero(&xig, sizeof(xig));
878         xig.xig_len = sizeof xig;
879         xig.xig_count = V_udbinfo.ipi_count;
880         xig.xig_gen = V_udbinfo.ipi_gencnt;
881         xig.xig_sogen = so_gencnt;
882         error = SYSCTL_OUT(req, &xig, sizeof xig);
883         if (error)
884                 return (error);
885
886         NET_EPOCH_ENTER(et);
887         for (inp = CK_LIST_FIRST(V_udbinfo.ipi_listhead);
888             inp != NULL;
889             inp = CK_LIST_NEXT(inp, inp_list)) {
890                 INP_RLOCK(inp);
891                 if (inp->inp_gencnt <= xig.xig_gen &&
892                     cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
893                         struct xinpcb xi;
894
895                         in_pcbtoxinpcb(inp, &xi);
896                         INP_RUNLOCK(inp);
897                         error = SYSCTL_OUT(req, &xi, sizeof xi);
898                         if (error)
899                                 break;
900                 } else
901                         INP_RUNLOCK(inp);
902         }
903         NET_EPOCH_EXIT(et);
904
905         if (!error) {
906                 /*
907                  * Give the user an updated idea of our state.  If the
908                  * generation differs from what we told her before, she knows
909                  * that something happened while we were processing this
910                  * request, and it might be necessary to retry.
911                  */
912                 xig.xig_gen = V_udbinfo.ipi_gencnt;
913                 xig.xig_sogen = so_gencnt;
914                 xig.xig_count = V_udbinfo.ipi_count;
915                 error = SYSCTL_OUT(req, &xig, sizeof xig);
916         }
917
918         return (error);
919 }
920
921 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
922     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
923     udp_pcblist, "S,xinpcb",
924     "List of active UDP sockets");
925
926 #ifdef INET
927 static int
928 udp_getcred(SYSCTL_HANDLER_ARGS)
929 {
930         struct xucred xuc;
931         struct sockaddr_in addrs[2];
932         struct epoch_tracker et;
933         struct inpcb *inp;
934         int error;
935
936         error = priv_check(req->td, PRIV_NETINET_GETCRED);
937         if (error)
938                 return (error);
939         error = SYSCTL_IN(req, addrs, sizeof(addrs));
940         if (error)
941                 return (error);
942         NET_EPOCH_ENTER(et);
943         inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
944             addrs[0].sin_addr, addrs[0].sin_port,
945             INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
946         NET_EPOCH_EXIT(et);
947         if (inp != NULL) {
948                 INP_RLOCK_ASSERT(inp);
949                 if (inp->inp_socket == NULL)
950                         error = ENOENT;
951                 if (error == 0)
952                         error = cr_canseeinpcb(req->td->td_ucred, inp);
953                 if (error == 0)
954                         cru2x(inp->inp_cred, &xuc);
955                 INP_RUNLOCK(inp);
956         } else
957                 error = ENOENT;
958         if (error == 0)
959                 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
960         return (error);
961 }
962
963 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
964     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
965     0, 0, udp_getcred, "S,xucred",
966     "Get the xucred of a UDP connection");
967 #endif /* INET */
968
/*
 * Socket option handler for UDP and UDP-Lite sockets.
 *
 * Options whose level is not the socket's own protocol are passed on to
 * ip6_ctloutput() (AF_INET6 sockets) or ip_ctloutput().  At the protocol
 * level the following options are handled, for both SOPT_SET and SOPT_GET:
 *
 *   UDP_ENCAP (only when built with IPSEC/IPSEC_SUPPORT and INET)
 *       handed to UDPENCAP_PCBCTL(); ENOPROTOOPT if IPsec for IPv4 is not
 *       enabled.
 *   UDPLITE_SEND_CSCOV, UDPLITE_RECV_CSCOV
 *       checksum coverage stored in the udpcb (u_txcslen / u_rxcslen);
 *       ENOPROTOOPT on a socket that is not UDP-Lite.  A value being set
 *       must be 0 or lie in 8..65535, otherwise EINVAL.
 *
 * Any other option name yields ENOPROTOOPT.
 *
 * The inpcb write lock is taken on entry and every path below is
 * responsible for releasing it before returning.
 *
 * Returns 0 on success or an errno value.
 */
969 int
970 udp_ctloutput(struct socket *so, struct sockopt *sopt)
971 {
972         struct inpcb *inp;
973         struct udpcb *up;
974         int isudplite, error, optval;
975
976         error = 0;
977         isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0;
978         inp = sotoinpcb(so);
979         KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
980         INP_WLOCK(inp);
            /* Not a UDP/UDP-Lite level option: let the IP layer handle it. */
981         if (sopt->sopt_level != so->so_proto->pr_protocol) {
982 #ifdef INET6
983                 if (INP_CHECK_SOCKAF(so, AF_INET6)) {
984                         INP_WUNLOCK(inp);
985                         error = ip6_ctloutput(so, sopt);
986                 }
987 #endif
988 #if defined(INET) && defined(INET6)
989                 else
990 #endif
991 #ifdef INET
992                 {
993                         INP_WUNLOCK(inp);
994                         error = ip_ctloutput(so, sopt);
995                 }
996 #endif
997                 return (error);
998         }
999
1000         switch (sopt->sopt_dir) {
1001         case SOPT_SET:
1002                 switch (sopt->sopt_name) {
1003 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1004 #ifdef INET
1005                 case UDP_ENCAP:
1006                         if (!IPSEC_ENABLED(ipv4)) {
1007                                 INP_WUNLOCK(inp);
1008                                 return (ENOPROTOOPT);
1009                         }
                             /*
                              * NOTE(review): no unlock on this path --
                              * presumably UDPENCAP_PCBCTL() releases the
                              * inpcb lock itself; confirm against its
                              * implementation.
                              */
1010                         error = UDPENCAP_PCBCTL(inp, sopt);
1011                         break;
1012 #endif /* INET */
1013 #endif /* IPSEC */
1014                 case UDPLITE_SEND_CSCOV:
1015                 case UDPLITE_RECV_CSCOV:
1016                         if (!isudplite) {
1017                                 INP_WUNLOCK(inp);
1018                                 error = ENOPROTOOPT;
1019                                 break;
1020                         }
                             /*
                              * The lock is dropped around the copy-in and
                              * re-taken afterwards (presumably because the
                              * copy may sleep -- TODO confirm), so the inpcb
                              * is looked up again from the socket.
                              */
1021                         INP_WUNLOCK(inp);
1022                         error = sooptcopyin(sopt, &optval, sizeof(optval),
1023                             sizeof(optval));
1024                         if (error != 0)
1025                                 break;
1026                         inp = sotoinpcb(so);
1027                         KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
1028                         INP_WLOCK(inp);
1029                         up = intoudpcb(inp);
1030                         KASSERT(up != NULL, ("%s: up == NULL", __func__));
                             /* Accept 0, or anything from 8 up to 65535. */
1031                         if ((optval != 0 && optval < 8) || (optval > 65535)) {
1032                                 INP_WUNLOCK(inp);
1033                                 error = EINVAL;
1034                                 break;
1035                         }
1036                         if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
1037                                 up->u_txcslen = optval;
1038                         else
1039                                 up->u_rxcslen = optval;
1040                         INP_WUNLOCK(inp);
1041                         break;
1042                 default:
1043                         INP_WUNLOCK(inp);
1044                         error = ENOPROTOOPT;
1045                         break;
1046                 }
1047                 break;
1048         case SOPT_GET:
1049                 switch (sopt->sopt_name) {
1050 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1051 #ifdef INET
1052                 case UDP_ENCAP:
1053                         if (!IPSEC_ENABLED(ipv4)) {
1054                                 INP_WUNLOCK(inp);
1055                                 return (ENOPROTOOPT);
1056                         }
                             /* Same lock hand-off as in the SOPT_SET case. */
1057                         error = UDPENCAP_PCBCTL(inp, sopt);
1058                         break;
1059 #endif /* INET */
1060 #endif /* IPSEC */
1061                 case UDPLITE_SEND_CSCOV:
1062                 case UDPLITE_RECV_CSCOV:
1063                         if (!isudplite) {
1064                                 INP_WUNLOCK(inp);
1065                                 error = ENOPROTOOPT;
1066                                 break;
1067                         }
1068                         up = intoudpcb(inp);
1069                         KASSERT(up != NULL, ("%s: up == NULL", __func__));
                             /* Read the value under the lock, copy out after. */
1070                         if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
1071                                 optval = up->u_txcslen;
1072                         else
1073                                 optval = up->u_rxcslen;
1074                         INP_WUNLOCK(inp);
1075                         error = sooptcopyout(sopt, &optval, sizeof(optval));
1076                         break;
1077                 default:
1078                         INP_WUNLOCK(inp);
1079                         error = ENOPROTOOPT;
1080                         break;
1081                 }
1082                 break;
1083         }
1084         return (error);
1085 }
1086
1087 #ifdef INET
1088 #ifdef INET6
1089 /* The logic here is derived from ip6_setpktopt(). See comments there. */
1090 static int
1091 udp_v4mapped_pktinfo(struct cmsghdr *cm, struct sockaddr_in * src,
1092     struct inpcb *inp, int flags)
1093 {
1094         struct ifnet *ifp;
1095         struct in6_pktinfo *pktinfo;
1096         struct in_addr ia;
1097
1098         if ((flags & PRUS_IPV6) == 0)
1099                 return (0);
1100
1101         if (cm->cmsg_level != IPPROTO_IPV6)
1102                 return (0);
1103
1104         if  (cm->cmsg_type != IPV6_2292PKTINFO &&
1105             cm->cmsg_type != IPV6_PKTINFO)
1106                 return (0);
1107
1108         if (cm->cmsg_len !=
1109             CMSG_LEN(sizeof(struct in6_pktinfo)))
1110                 return (EINVAL);
1111
1112         pktinfo = (struct in6_pktinfo *)CMSG_DATA(cm);
1113         if (!IN6_IS_ADDR_V4MAPPED(&pktinfo->ipi6_addr) &&
1114             !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr))
1115                 return (EINVAL);
1116
1117         /* Validate the interface index if specified. */
1118         if (pktinfo->ipi6_ifindex > V_if_index)
1119                 return (ENXIO);
1120
1121         ifp = NULL;
1122         if (pktinfo->ipi6_ifindex) {
1123                 ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
1124                 if (ifp == NULL)
1125                         return (ENXIO);
1126         }
1127         if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
1128                 ia.s_addr = pktinfo->ipi6_addr.s6_addr32[3];
1129                 if (in_ifhasaddr(ifp, ia) == 0)
1130                         return (EADDRNOTAVAIL);
1131         }
1132
1133         bzero(src, sizeof(*src));
1134         src->sin_family = AF_INET;
1135         src->sin_len = sizeof(*src);
1136         src->sin_port = inp->inp_lport;
1137         src->sin_addr.s_addr = pktinfo->ipi6_addr.s6_addr32[3];
1138
1139         return (0);
1140 }
1141 #endif
1142
/*
 * Build one UDP or UDP-Lite datagram for the given inpcb and hand it to
 * ip_output().
 *
 *   inp     - pcb of the sending socket; locked and unlocked in here.
 *   m       - payload; passed on to ip_output() on success and freed on
 *             every error path.
 *   addr    - destination (struct sockaddr_in), or NULL to use the address
 *             the pcb is connected to.
 *   control - optional control messages; must be a single mbuf and is
 *             always freed here.  Recognised IPPROTO_IP messages are
 *             IP_SENDSRCADDR, IP_TOS, IP_FLOWID, IP_FLOWTYPE and (with RSS)
 *             IP_RSSBUCKETID; with INET6 each message is first offered to
 *             udp_v4mapped_pktinfo().
 *   td      - thread whose credentials are used for the bind/connect and
 *             jail checks.
 *   flags   - PRUS_* flags, only forwarded to udp_v4mapped_pktinfo().
 *
 * Returns 0 or an errno value: EMSGSIZE (payload too large), EINVAL /
 * ENOPROTOOPT (bad control data), EISCONN (addr given on a connected pcb),
 * ENOTCONN (no addr and not connected), ENOBUFS (no room for headers),
 * EAGAIN (hash insertion of a newly picked port failed), or whatever the
 * bind/connect helpers or ip_output() report.
 */
1143 static int
1144 udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
1145     struct mbuf *control, struct thread *td, int flags)
1146 {
1147         struct udpiphdr *ui;
1148         int len = m->m_pkthdr.len;
1149         struct in_addr faddr, laddr;
1150         struct cmsghdr *cm;
1151         struct inpcbinfo *pcbinfo;
1152         struct sockaddr_in *sin, src;
1153         struct epoch_tracker et;
1154         int cscov_partial = 0;
1155         int error = 0;
1156         int ipflags = 0;
1157         u_short fport, lport;
1158         u_char tos;
1159         uint8_t pr;
1160         uint16_t cscov = 0;
1161         uint32_t flowid = 0;
1162         uint8_t flowtype = M_HASHTYPE_NONE;
1163
             /* No lock is held yet, so both mbufs are freed by hand here. */
1164         if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
1165                 if (control)
1166                         m_freem(control);
1167                 m_freem(m);
1168                 return (EMSGSIZE);
1169         }
1170
             /* sin_family == 0 means "no source override requested". */
1171         src.sin_family = 0;
1172         sin = (struct sockaddr_in *)addr;
1173
1174         /*
1175          * udp_output() may need to temporarily bind or connect the current
1176          * inpcb.  As such, we don't know up front whether we will need the
1177          * pcbinfo lock or not.  Do any work to decide what is needed up
1178          * front before acquiring any locks.
1179          *
1180          * We will need network epoch in either case, to safely lookup into
1181          * pcb hash.
1182          */
1183         if (sin == NULL ||
1184             (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0))
1185                 INP_WLOCK(inp);
1186         else
1187                 INP_RLOCK(inp);
1188         NET_EPOCH_ENTER(et);
1189         tos = inp->inp_ip_tos;
1190         if (control != NULL) {
1191                 /*
1192                  * XXX: Currently, we assume all the optional information is
1193                  * stored in a single mbuf.
1194                  */
1195                 if (control->m_next) {
1196                         m_freem(control);
1197                         error = EINVAL;
1198                         goto release;
1199                 }
                     /*
                      * Walk the messages in place: each iteration consumes
                      * one aligned cmsghdr by advancing m_data and shrinking
                      * m_len.
                      */
1200                 for (; control->m_len > 0;
1201                     control->m_data += CMSG_ALIGN(cm->cmsg_len),
1202                     control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
1203                         cm = mtod(control, struct cmsghdr *);
1204                         if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
1205                             || cm->cmsg_len > control->m_len) {
1206                                 error = EINVAL;
1207                                 break;
1208                         }
1209 #ifdef INET6
1210                         error = udp_v4mapped_pktinfo(cm, &src, inp, flags);
1211                         if (error != 0)
1212                                 break;
1213 #endif
1214                         if (cm->cmsg_level != IPPROTO_IP)
1215                                 continue;
1216
                             /*
                              * A "break" inside this switch only leaves the
                              * switch; the error check after it leaves the
                              * loop.
                              */
1217                         switch (cm->cmsg_type) {
1218                         case IP_SENDSRCADDR:
1219                                 if (cm->cmsg_len !=
1220                                     CMSG_LEN(sizeof(struct in_addr))) {
1221                                         error = EINVAL;
1222                                         break;
1223                                 }
1224                                 bzero(&src, sizeof(src));
1225                                 src.sin_family = AF_INET;
1226                                 src.sin_len = sizeof(src);
1227                                 src.sin_port = inp->inp_lport;
1228                                 src.sin_addr =
1229                                     *(struct in_addr *)CMSG_DATA(cm);
1230                                 break;
1231
1232                         case IP_TOS:
1233                                 if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) {
1234                                         error = EINVAL;
1235                                         break;
1236                                 }
1237                                 tos = *(u_char *)CMSG_DATA(cm);
1238                                 break;
1239
1240                         case IP_FLOWID:
1241                                 if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
1242                                         error = EINVAL;
1243                                         break;
1244                                 }
1245                                 flowid = *(uint32_t *) CMSG_DATA(cm);
1246                                 break;
1247
1248                         case IP_FLOWTYPE:
1249                                 if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
1250                                         error = EINVAL;
1251                                         break;
1252                                 }
                                     /*
                                      * NOTE(review): the 32-bit value is
                                      * narrowed to the uint8_t flowtype.
                                      */
1253                                 flowtype = *(uint32_t *) CMSG_DATA(cm);
1254                                 break;
1255
1256 #ifdef  RSS
1257                         case IP_RSSBUCKETID:
1258                                 if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
1259                                         error = EINVAL;
1260                                         break;
1261                                 }
1262                                 /* This is just a placeholder for now */
1263                                 break;
1264 #endif  /* RSS */
1265                         default:
1266                                 error = ENOPROTOOPT;
1267                                 break;
1268                         }
1269                         if (error)
1270                                 break;
1271                 }
1272                 m_freem(control);
1273         }
1274         if (error)
1275                 goto release;
1276
1277         pr = inp->inp_socket->so_proto->pr_protocol;
1278         pcbinfo = udp_get_inpcbinfo(pr);
1279
1280         /*
1281          * If the IP_SENDSRCADDR control message was specified, override the
1282          * source address for this datagram.  Its use is invalidated if the
1283          * address thus specified is incomplete or clobbers other inpcbs.
1284          */
1285         laddr = inp->inp_laddr;
1286         lport = inp->inp_lport;
1287         if (src.sin_family == AF_INET) {
1288                 INP_HASH_LOCK_ASSERT(pcbinfo);
1289                 if ((lport == 0) ||
1290                     (laddr.s_addr == INADDR_ANY &&
1291                      src.sin_addr.s_addr == INADDR_ANY)) {
1292                         error = EINVAL;
1293                         goto release;
1294                 }
1295                 error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
1296                     &laddr.s_addr, &lport, td->td_ucred);
1297                 if (error)
1298                         goto release;
1299         }
1300
1301         /*
1302          * If a UDP socket has been connected, then a local address/port will
1303          * have been selected and bound.
1304          *
1305          * If a UDP socket has not been connected to, then an explicit
1306          * destination address must be used, in which case a local
1307          * address/port may not have been selected and bound.
1308          */
1309         if (sin != NULL) {
1310                 INP_LOCK_ASSERT(inp);
1311                 if (inp->inp_faddr.s_addr != INADDR_ANY) {
1312                         error = EISCONN;
1313                         goto release;
1314                 }
1315
1316                 /*
1317                  * Jail may rewrite the destination address, so let it do
1318                  * that before we use it.
1319                  */
1320                 error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1321                 if (error)
1322                         goto release;
1323
1324                 /*
1325                  * If a local address or port hasn't yet been selected, or if
1326                  * the destination address needs to be rewritten due to using
1327                  * a special INADDR_ constant, invoke in_pcbconnect_setup()
1328                  * to do the heavy lifting.  Once a port is selected, we
1329                  * commit the binding back to the socket; we also commit the
1330                  * binding of the address if in jail.
1331                  *
1332                  * If we already have a valid binding and we're not
1333                  * requesting a destination address rewrite, use a fast path.
1334                  */
1335                 if (inp->inp_laddr.s_addr == INADDR_ANY ||
1336                     inp->inp_lport == 0 ||
1337                     sin->sin_addr.s_addr == INADDR_ANY ||
1338                     sin->sin_addr.s_addr == INADDR_BROADCAST) {
1339                         INP_HASH_LOCK_ASSERT(pcbinfo);
1340                         error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
1341                             &lport, &faddr.s_addr, &fport, NULL,
1342                             td->td_ucred);
1343                         if (error)
1344                                 goto release;
1345
1346                         /*
1347                          * XXXRW: Why not commit the port if the address is
1348                          * !INADDR_ANY?
1349                          */
1350                         /* Commit the local port if newly assigned. */
1351                         if (inp->inp_laddr.s_addr == INADDR_ANY &&
1352                             inp->inp_lport == 0) {
                                     /*
                                      * This is the same condition that made
                                      * the entry code take the write lock.
                                      */
1353                                 INP_WLOCK_ASSERT(inp);
1354                                 /*
1355                                  * Remember addr if jailed, to prevent
1356                                  * rebinding.
1357                                  */
1358                                 if (prison_flag(td->td_ucred, PR_IP4))
1359                                         inp->inp_laddr = laddr;
1360                                 inp->inp_lport = lport;
1361                                 INP_HASH_WLOCK(pcbinfo);
1362                                 error = in_pcbinshash(inp);
1363                                 INP_HASH_WUNLOCK(pcbinfo);
1364                                 if (error != 0) {
                                             /* Undo the port commit. */
1365                                         inp->inp_lport = 0;
1366                                         error = EAGAIN;
1367                                         goto release;
1368                                 }
1369                                 inp->inp_flags |= INP_ANONPORT;
1370                         }
1371                 } else {
1372                         faddr = sin->sin_addr;
1373                         fport = sin->sin_port;
1374                 }
1375         } else {
1376                 INP_LOCK_ASSERT(inp);
1377                 faddr = inp->inp_faddr;
1378                 fport = inp->inp_fport;
1379                 if (faddr.s_addr == INADDR_ANY) {
1380                         error = ENOTCONN;
1381                         goto release;
1382                 }
1383         }
1384
1385         /*
1386          * Calculate data length and get a mbuf for UDP, IP, and possible
1387          * link-layer headers.  Immediately slide the data pointer forward
1388          * again since we won't use that space at this layer.
1389          */
1390         M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT);
1391         if (m == NULL) {
1392                 error = ENOBUFS;
1393                 goto release;
1394         }
1395         m->m_data += max_linkhdr;
1396         m->m_len -= max_linkhdr;
1397         m->m_pkthdr.len -= max_linkhdr;
1398
1399         /*
1400          * Fill in mbuf with extended UDP header and addresses and length put
1401          * into network format.
1402          */
1403         ui = mtod(m, struct udpiphdr *);
1404         bzero(ui->ui_x1, sizeof(ui->ui_x1));    /* XXX still needed? */
1405         ui->ui_v = IPVERSION << 4;
1406         ui->ui_pr = pr;
1407         ui->ui_src = laddr;
1408         ui->ui_dst = faddr;
1409         ui->ui_sport = lport;
1410         ui->ui_dport = fport;
1411         ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
1412         if (pr == IPPROTO_UDPLITE) {
1413                 struct udpcb *up;
1414                 uint16_t plen;
1415
                     /*
                      * For UDP-Lite the ui_ulen field is overwritten with the
                      * checksum coverage; a configured coverage that is not
                      * smaller than the packet is reset to 0.
                      */
1416                 up = intoudpcb(inp);
1417                 cscov = up->u_txcslen;
1418                 plen = (u_short)len + sizeof(struct udphdr);
1419                 if (cscov >= plen)
1420                         cscov = 0;
1421                 ui->ui_len = htons(plen);
1422                 ui->ui_ulen = htons(cscov);
1423                 /*
1424                  * For UDP-Lite, checksum coverage length of zero means
1425                  * the entire UDPLite packet is covered by the checksum.
1426                  */
1427                 cscov_partial = (cscov == 0) ? 0 : 1;
1428         }
1429
1430         /*
1431          * Set the Don't Fragment bit in the IP header.
1432          */
1433         if (inp->inp_flags & INP_DONTFRAG) {
1434                 struct ip *ip;
1435
1436                 ip = (struct ip *)&ui->ui_i;
1437                 ip->ip_off |= htons(IP_DF);
1438         }
1439
             /* Translate socket/pcb options into ip_output() flags. */
1440         if (inp->inp_socket->so_options & SO_DONTROUTE)
1441                 ipflags |= IP_ROUTETOIF;
1442         if (inp->inp_socket->so_options & SO_BROADCAST)
1443                 ipflags |= IP_ALLOWBROADCAST;
1444         if (inp->inp_flags & INP_ONESBCAST)
1445                 ipflags |= IP_SENDONES;
1446
1447 #ifdef MAC
1448         mac_inpcb_create_mbuf(inp, m);
1449 #endif
1450
1451         /*
1452          * Set up checksum and output datagram.
1453          */
1454         ui->ui_sum = 0;
1455         if (pr == IPPROTO_UDPLITE) {
1456                 if (inp->inp_flags & INP_ONESBCAST)
1457                         faddr.s_addr = INADDR_BROADCAST;
                     /* A computed checksum of 0 is sent as 0xffff. */
1458                 if (cscov_partial) {
1459                         if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0)
1460                                 ui->ui_sum = 0xffff;
1461                 } else {
1462                         if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0)
1463                                 ui->ui_sum = 0xffff;
1464                 }
1465         } else if (V_udp_cksum) {
1466                 if (inp->inp_flags & INP_ONESBCAST)
1467                         faddr.s_addr = INADDR_BROADCAST;
                     /*
                      * Only the pseudo-header sum is stored here; CSUM_UDP is
                      * set in the packet header with the checksum field
                      * offset in csum_data.
                      */
1468                 ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
1469                     htons((u_short)len + sizeof(struct udphdr) + pr));
1470                 m->m_pkthdr.csum_flags = CSUM_UDP;
1471                 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
1472         }
1473         ((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len);
1474         ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;    /* XXX */
1475         ((struct ip *)ui)->ip_tos = tos;                /* XXX */
1476         UDPSTAT_INC(udps_opackets);
1477
1478         /*
1479          * Setup flowid / RSS information for outbound socket.
1480          *
1481          * Once the UDP code decides to set a flowid some other way,
1482          * this allows the flowid to be overridden by userland.
1483          */
1484         if (flowtype != M_HASHTYPE_NONE) {
1485                 m->m_pkthdr.flowid = flowid;
1486                 M_HASHTYPE_SET(m, flowtype);
1487         }
1488 #if defined(ROUTE_MPATH) || defined(RSS)
1489         else if (CALC_FLOWID_OUTBOUND_SENDTO) {
1490                 uint32_t hash_val, hash_type;
1491
1492                 hash_val = fib4_calc_packet_hash(laddr, faddr,
1493                     lport, fport, pr, &hash_type);
1494                 m->m_pkthdr.flowid = hash_val;
1495                 M_HASHTYPE_SET(m, hash_type);
1496         }
1497
1498         /*
1499          * Don't override with the inp cached flowid value.
1500          *
1501          * Depending upon the kind of send being done, the inp
1502          * flowid/flowtype values may actually not be appropriate
1503          * for this particular socket send.
1504          *
1505          * We should either leave the flowid at zero (which is what is
1506          * currently done) or set it to some software generated
1507          * hash value based on the packet contents.
1508          */
1509         ipflags |= IP_NODEFAULTFLOWID;
1510 #endif  /* ROUTE_MPATH || RSS */
1511
1512         if (pr == IPPROTO_UDPLITE)
1513                 UDPLITE_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
1514         else
1515                 UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
             /* The pcb's cached route is only offered when it is write-locked. */
1516         error = ip_output(m, inp->inp_options,
1517             INP_WLOCKED(inp) ? &inp->inp_route : NULL, ipflags,
1518             inp->inp_moptions, inp);
1519         INP_UNLOCK(inp);
1520         NET_EPOCH_EXIT(et);
1521         return (error);
1522
             /* Error exit: drop the pcb lock and the epoch, free the payload. */
1523 release:
1524         INP_UNLOCK(inp);
1525         NET_EPOCH_EXIT(et);
1526         m_freem(m);
1527         return (error);
1528 }
1529
1530 static void
1531 udp_abort(struct socket *so)
1532 {
1533         struct inpcb *inp;
1534         struct inpcbinfo *pcbinfo;
1535
1536         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1537         inp = sotoinpcb(so);
1538         KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
1539         INP_WLOCK(inp);
1540         if (inp->inp_faddr.s_addr != INADDR_ANY) {
1541                 INP_HASH_WLOCK(pcbinfo);
1542                 in_pcbdisconnect(inp);
1543                 inp->inp_laddr.s_addr = INADDR_ANY;
1544                 INP_HASH_WUNLOCK(pcbinfo);
1545                 soisdisconnected(so);
1546         }
1547         INP_WUNLOCK(inp);
1548 }
1549
1550 static int
1551 udp_attach(struct socket *so, int proto, struct thread *td)
1552 {
1553         static uint32_t udp_flowid;
1554         struct inpcb *inp;
1555         struct inpcbinfo *pcbinfo;
1556         int error;
1557
1558         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1559         inp = sotoinpcb(so);
1560         KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
1561         error = soreserve(so, udp_sendspace, udp_recvspace);
1562         if (error)
1563                 return (error);
1564         INP_INFO_WLOCK(pcbinfo);
1565         error = in_pcballoc(so, pcbinfo);
1566         if (error) {
1567                 INP_INFO_WUNLOCK(pcbinfo);
1568                 return (error);
1569         }
1570
1571         inp = sotoinpcb(so);
1572         inp->inp_vflag |= INP_IPV4;
1573         inp->inp_ip_ttl = V_ip_defttl;
1574         inp->inp_flowid = atomic_fetchadd_int(&udp_flowid, 1);
1575         inp->inp_flowtype = M_HASHTYPE_OPAQUE;
1576
1577         error = udp_newudpcb(inp);
1578         if (error) {
1579                 in_pcbdetach(inp);
1580                 in_pcbfree(inp);
1581                 INP_INFO_WUNLOCK(pcbinfo);
1582                 return (error);
1583         }
1584
1585         INP_WUNLOCK(inp);
1586         INP_INFO_WUNLOCK(pcbinfo);
1587         return (0);
1588 }
1589 #endif /* INET */
1590
1591 int
1592 udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx)
1593 {
1594         struct inpcb *inp;
1595         struct udpcb *up;
1596
1597         KASSERT(so->so_type == SOCK_DGRAM,
1598             ("udp_set_kernel_tunneling: !dgram"));
1599         inp = sotoinpcb(so);
1600         KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
1601         INP_WLOCK(inp);
1602         up = intoudpcb(inp);
1603         if ((up->u_tun_func != NULL) ||
1604             (up->u_icmp_func != NULL)) {
1605                 INP_WUNLOCK(inp);
1606                 return (EBUSY);
1607         }
1608         up->u_tun_func = f;
1609         up->u_icmp_func = i;
1610         up->u_tun_ctx = ctx;
1611         INP_WUNLOCK(inp);
1612         return (0);
1613 }
1614
1615 #ifdef INET
1616 static int
1617 udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
1618 {
1619         struct inpcb *inp;
1620         struct inpcbinfo *pcbinfo;
1621         int error;
1622
1623         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1624         inp = sotoinpcb(so);
1625         KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
1626         INP_WLOCK(inp);
1627         INP_HASH_WLOCK(pcbinfo);
1628         error = in_pcbbind(inp, nam, td->td_ucred);
1629         INP_HASH_WUNLOCK(pcbinfo);
1630         INP_WUNLOCK(inp);
1631         return (error);
1632 }
1633
1634 static void
1635 udp_close(struct socket *so)
1636 {
1637         struct inpcb *inp;
1638         struct inpcbinfo *pcbinfo;
1639
1640         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1641         inp = sotoinpcb(so);
1642         KASSERT(inp != NULL, ("udp_close: inp == NULL"));
1643         INP_WLOCK(inp);
1644         if (inp->inp_faddr.s_addr != INADDR_ANY) {
1645                 INP_HASH_WLOCK(pcbinfo);
1646                 in_pcbdisconnect(inp);
1647                 inp->inp_laddr.s_addr = INADDR_ANY;
1648                 INP_HASH_WUNLOCK(pcbinfo);
1649                 soisdisconnected(so);
1650         }
1651         INP_WUNLOCK(inp);
1652 }
1653
1654 static int
1655 udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1656 {
1657         struct epoch_tracker et;
1658         struct inpcb *inp;
1659         struct inpcbinfo *pcbinfo;
1660         struct sockaddr_in *sin;
1661         int error;
1662
1663         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1664         inp = sotoinpcb(so);
1665         KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
1666         INP_WLOCK(inp);
1667         if (inp->inp_faddr.s_addr != INADDR_ANY) {
1668                 INP_WUNLOCK(inp);
1669                 return (EISCONN);
1670         }
1671         sin = (struct sockaddr_in *)nam;
1672         error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1673         if (error != 0) {
1674                 INP_WUNLOCK(inp);
1675                 return (error);
1676         }
1677         NET_EPOCH_ENTER(et);
1678         INP_HASH_WLOCK(pcbinfo);
1679         error = in_pcbconnect(inp, nam, td->td_ucred);
1680         INP_HASH_WUNLOCK(pcbinfo);
1681         NET_EPOCH_EXIT(et);
1682         if (error == 0)
1683                 soisconnected(so);
1684         INP_WUNLOCK(inp);
1685         return (error);
1686 }
1687
1688 static void
1689 udp_detach(struct socket *so)
1690 {
1691         struct inpcb *inp;
1692         struct inpcbinfo *pcbinfo;
1693         struct udpcb *up;
1694
1695         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1696         inp = sotoinpcb(so);
1697         KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
1698         KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
1699             ("udp_detach: not disconnected"));
1700         INP_INFO_WLOCK(pcbinfo);
1701         INP_WLOCK(inp);
1702         up = intoudpcb(inp);
1703         KASSERT(up != NULL, ("%s: up == NULL", __func__));
1704         inp->inp_ppcb = NULL;
1705         in_pcbdetach(inp);
1706         in_pcbfree(inp);
1707         INP_INFO_WUNLOCK(pcbinfo);
1708         udp_discardcb(up);
1709 }
1710
1711 static int
1712 udp_disconnect(struct socket *so)
1713 {
1714         struct inpcb *inp;
1715         struct inpcbinfo *pcbinfo;
1716
1717         pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1718         inp = sotoinpcb(so);
1719         KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
1720         INP_WLOCK(inp);
1721         if (inp->inp_faddr.s_addr == INADDR_ANY) {
1722                 INP_WUNLOCK(inp);
1723                 return (ENOTCONN);
1724         }
1725         INP_HASH_WLOCK(pcbinfo);
1726         in_pcbdisconnect(inp);
1727         inp->inp_laddr.s_addr = INADDR_ANY;
1728         INP_HASH_WUNLOCK(pcbinfo);
1729         SOCK_LOCK(so);
1730         so->so_state &= ~SS_ISCONNECTED;                /* XXX */
1731         SOCK_UNLOCK(so);
1732         INP_WUNLOCK(inp);
1733         return (0);
1734 }
1735
1736 static int
1737 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
1738     struct mbuf *control, struct thread *td)
1739 {
1740         struct inpcb *inp;
1741
1742         inp = sotoinpcb(so);
1743         KASSERT(inp != NULL, ("udp_send: inp == NULL"));
1744         return (udp_output(inp, m, addr, control, td, flags));
1745 }
1746 #endif /* INET */
1747
1748 int
1749 udp_shutdown(struct socket *so)
1750 {
1751         struct inpcb *inp;
1752
1753         inp = sotoinpcb(so);
1754         KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
1755         INP_WLOCK(inp);
1756         socantsendmore(so);
1757         INP_WUNLOCK(inp);
1758         return (0);
1759 }
1760
1761 #ifdef INET
1762 struct pr_usrreqs udp_usrreqs = {
1763         .pru_abort =            udp_abort,
1764         .pru_attach =           udp_attach,
1765         .pru_bind =             udp_bind,
1766         .pru_connect =          udp_connect,
1767         .pru_control =          in_control,
1768         .pru_detach =           udp_detach,
1769         .pru_disconnect =       udp_disconnect,
1770         .pru_peeraddr =         in_getpeeraddr,
1771         .pru_send =             udp_send,
1772         .pru_soreceive =        soreceive_dgram,
1773         .pru_sosend =           sosend_dgram,
1774         .pru_shutdown =         udp_shutdown,
1775         .pru_sockaddr =         in_getsockaddr,
1776         .pru_sosetlabel =       in_pcbsosetlabel,
1777         .pru_close =            udp_close,
1778 };
1779 #endif /* INET */