]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/udp_usrreq.c
Merge from head@222434.
[FreeBSD/FreeBSD.git] / sys / netinet / udp_usrreq.c
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3  *      The Regents of the University of California.
4  * Copyright (c) 2008 Robert N. M. Watson
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 4. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  *      @(#)udp_usrreq.c        8.6 (Berkeley) 5/23/95
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include "opt_ipfw.h"
38 #include "opt_inet.h"
39 #include "opt_inet6.h"
40 #include "opt_ipsec.h"
41
42 #include <sys/param.h>
43 #include <sys/domain.h>
44 #include <sys/eventhandler.h>
45 #include <sys/jail.h>
46 #include <sys/kernel.h>
47 #include <sys/lock.h>
48 #include <sys/malloc.h>
49 #include <sys/mbuf.h>
50 #include <sys/priv.h>
51 #include <sys/proc.h>
52 #include <sys/protosw.h>
53 #include <sys/signalvar.h>
54 #include <sys/socket.h>
55 #include <sys/socketvar.h>
56 #include <sys/sx.h>
57 #include <sys/sysctl.h>
58 #include <sys/syslog.h>
59 #include <sys/systm.h>
60
61 #include <vm/uma.h>
62
63 #include <net/if.h>
64 #include <net/route.h>
65
66 #include <netinet/in.h>
67 #include <netinet/in_pcb.h>
68 #include <netinet/in_systm.h>
69 #include <netinet/in_var.h>
70 #include <netinet/ip.h>
71 #ifdef INET6
72 #include <netinet/ip6.h>
73 #endif
74 #include <netinet/ip_icmp.h>
75 #include <netinet/icmp_var.h>
76 #include <netinet/ip_var.h>
77 #include <netinet/ip_options.h>
78 #ifdef INET6
79 #include <netinet6/ip6_var.h>
80 #endif
81 #include <netinet/udp.h>
82 #include <netinet/udp_var.h>
83
84 #ifdef IPSEC
85 #include <netipsec/ipsec.h>
86 #include <netipsec/esp.h>
87 #endif
88
89 #include <machine/in_cksum.h>
90
91 #include <security/mac/mac_framework.h>
92
93 /*
94  * UDP protocol implementation.
95  * Per RFC 768, August, 1980.
96  */
97
98 /*
99  * BSD 4.2 defaulted the udp checksum to be off.  Turning off udp checksums
100  * removes the only data integrity mechanism for packets and malformed
101  * packets that would otherwise be discarded due to bad checksums, and may
102  * cause problems (especially for NFS data blocks).
103  */
104 static int      udp_cksum = 1;
105 SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, &udp_cksum,
106     0, "compute udp checksum");
107
108 int     udp_log_in_vain = 0;    /* nonzero: udp_input() logs datagrams that match no PCB */
109 SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
110     &udp_log_in_vain, 0, "Log all incoming UDP packets");
111
112 VNET_DEFINE(int, udp_blackhole) = 0;    /* nonzero: udp_input() drops unmatched datagrams without ICMP */
113 SYSCTL_VNET_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW,
114     &VNET_NAME(udp_blackhole), 0,
115     "Do not send port unreachables for refused connects");
116
117 u_long  udp_sendspace = 9216;           /* really max datagram size */
118                                         /* (default: 9216 bytes) */
119 SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
120     &udp_sendspace, 0, "Maximum outgoing UDP datagram size");
121
122 u_long  udp_recvspace = 40 * (1024 +    /* 40 1K datagrams, plus one sockaddr each */
123 #ifdef INET6
124                                       sizeof(struct sockaddr_in6)
125 #else
126                                       sizeof(struct sockaddr_in)
127 #endif
128                                       );
129
130 SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
131     &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
132
133 VNET_DEFINE(struct inpcbhead, udb);             /* from udp_var.h */
134 VNET_DEFINE(struct inpcbinfo, udbinfo);
135 static VNET_DEFINE(uma_zone_t, udpcb_zone);     /* zone backing struct udpcb; created in udp_init() */
136 #define V_udpcb_zone                    VNET(udpcb_zone)
137
138 #ifndef UDBHASHSIZE
139 #define UDBHASHSIZE     128     /* passed twice to in_pcbinfo_init() by udp_init() */
140 #endif
141
142 VNET_DEFINE(struct udpstat, udpstat);           /* from udp_var.h */
143 SYSCTL_VNET_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW,
144     &VNET_NAME(udpstat), udpstat,
145     "UDP statistics (struct udpstat, netinet/udp_var.h)");
146
147 #ifdef INET
148 static void     udp_detach(struct socket *so);
149 static int      udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
150                     struct mbuf *, struct thread *);
151 #endif
152
153 #ifdef IPSEC
154 #ifdef IPSEC_NAT_T
155 #define UF_ESPINUDP_ALL (UF_ESPINUDP_NON_IKE|UF_ESPINUDP)
156 #ifdef INET
157 static struct mbuf *udp4_espdecap(struct inpcb *, struct mbuf *, int);
158 #endif
159 #endif /* IPSEC_NAT_T */
160 #endif /* IPSEC */
161
162 static void
163 udp_zone_change(void *tag)
164 {
165
166         uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
167         uma_zone_set_max(V_udpcb_zone, maxsockets);
168 }
169
/*
 * Per-item initializer handed to in_pcbinfo_init() by udp_init(): sets up
 * the lock embedded in a freshly allocated inpcb.  'size' and 'flags' are
 * not consulted.  Always returns 0.
 */
static int
udp_inpcb_init(void *mem, int size, int flags)
{
        struct inpcb *pcb = (struct inpcb *)mem;

        INP_LOCK_INIT(pcb, "inp", "udpinp");
        return (0);
}
179
180 void
181 udp_init(void)
182 {
183
184         in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
185             "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE);
186         V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
187             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
188         uma_zone_set_max(V_udpcb_zone, maxsockets);
189         EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
190             EVENTHANDLER_PRI_ANY);
191 }
192
193 /*
194  * Kernel module interface for updating udpstat.  The argument is an index
195  * into udpstat treated as an array of u_long.  While this encodes the
196  * general layout of udpstat into the caller, it doesn't encode its location,
197  * so that future changes to add, for example, per-CPU stats support won't
198  * cause binary compatibility problems for kernel modules.
199  */
200 void
201 kmod_udpstat_inc(int statnum)
202 {
203
204         (*((u_long *)&V_udpstat + statnum))++;
205 }
206
207 int
208 udp_newudpcb(struct inpcb *inp)
209 {
210         struct udpcb *up;
211
212         up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO);
213         if (up == NULL)
214                 return (ENOBUFS);
215         inp->inp_ppcb = up;
216         return (0);
217 }
218
219 void
220 udp_discardcb(struct udpcb *up)
221 {
222
223         uma_zfree(V_udpcb_zone, up);
224 }
225
#ifdef VIMAGE
/*
 * Undo udp_init() for the current network stack instance: release the
 * inpcb bookkeeping first, then destroy the udpcb zone.
 */
void
udp_destroy(void)
{

        in_pcbinfo_destroy(&V_udbinfo);
        uma_zdestroy(V_udpcb_zone);
}
#endif /* VIMAGE */
235
236 #ifdef INET
237 /*
238  * Subroutine of udp_input(), which appends the provided mbuf chain to the
239  * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
240  * contains the source address.  If the socket ends up being an IPv6 socket,
241  * udp_append() will convert to a sockaddr_in6 before passing the address
242  * into the socket code.
243  */
244 static void
245 udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
246     struct sockaddr_in *udp_in)
247 {
248         struct sockaddr *append_sa;
249         struct socket *so;
250         struct mbuf *opts = 0;
251 #ifdef INET6
252         struct sockaddr_in6 udp_in6;
253 #endif
254         struct udpcb *up;
255
256         INP_RLOCK_ASSERT(inp);
257
258         /*
259          * Engage the tunneling protocol.
260          */
261         up = intoudpcb(inp);
262         if (up->u_tun_func != NULL) {
263                 (*up->u_tun_func)(n, off, inp);
264                 return;
265         }
266
267         if (n == NULL)
268                 return;
269
270         off += sizeof(struct udphdr);
271
272 #ifdef IPSEC
273         /* Check AH/ESP integrity. */
274         if (ipsec4_in_reject(n, inp)) {
275                 m_freem(n);
276                 V_ipsec4stat.in_polvio++;
277                 return;
278         }
279 #ifdef IPSEC_NAT_T
280         up = intoudpcb(inp);
281         KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
282         if (up->u_flags & UF_ESPINUDP_ALL) {    /* IPSec UDP encaps. */
283                 n = udp4_espdecap(inp, n, off);
284                 if (n == NULL)                          /* Consumed. */
285                         return;
286         }
287 #endif /* IPSEC_NAT_T */
288 #endif /* IPSEC */
289 #ifdef MAC
290         if (mac_inpcb_check_deliver(inp, n) != 0) {
291                 m_freem(n);
292                 return;
293         }
294 #endif /* MAC */
295         if (inp->inp_flags & INP_CONTROLOPTS ||
296             inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
297 #ifdef INET6
298                 if (inp->inp_vflag & INP_IPV6)
299                         (void)ip6_savecontrol_v4(inp, n, &opts, NULL);
300                 else
301 #endif /* INET6 */
302                         ip_savecontrol(inp, &opts, ip, n);
303         }
304 #ifdef INET6
305         if (inp->inp_vflag & INP_IPV6) {
306                 bzero(&udp_in6, sizeof(udp_in6));
307                 udp_in6.sin6_len = sizeof(udp_in6);
308                 udp_in6.sin6_family = AF_INET6;
309                 in6_sin_2_v4mapsin6(udp_in, &udp_in6);
310                 append_sa = (struct sockaddr *)&udp_in6;
311         } else
312 #endif /* INET6 */
313                 append_sa = (struct sockaddr *)udp_in;
314         m_adj(n, off);
315
316         so = inp->inp_socket;
317         SOCKBUF_LOCK(&so->so_rcv);
318         if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
319                 SOCKBUF_UNLOCK(&so->so_rcv);
320                 m_freem(n);
321                 if (opts)
322                         m_freem(opts);
323                 UDPSTAT_INC(udps_fullsock);
324         } else
325                 sorwakeup_locked(so);
326 }
327
328 void
329 udp_input(struct mbuf *m, int off)
330 {
331         int iphlen = off;
332         struct ip *ip;
333         struct udphdr *uh;
334         struct ifnet *ifp;
335         struct inpcb *inp;
336         int len;
337         struct ip save_ip;
338         struct sockaddr_in udp_in;
339 #ifdef IPFIREWALL_FORWARD
340         struct m_tag *fwd_tag;
341 #endif
342
343         ifp = m->m_pkthdr.rcvif;
344         UDPSTAT_INC(udps_ipackets);
345
346         /*
347          * Strip IP options, if any; should skip this, make available to
348          * user, and use on returned packets, but we don't yet have a way to
349          * check the checksum with options still present.
350          */
351         if (iphlen > sizeof (struct ip)) {
352                 ip_stripoptions(m, (struct mbuf *)0);
353                 iphlen = sizeof(struct ip);
354         }
355
356         /*
357          * Get IP and UDP header together in first mbuf.
358          */
359         ip = mtod(m, struct ip *);
360         if (m->m_len < iphlen + sizeof(struct udphdr)) {
361                 if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) {
362                         UDPSTAT_INC(udps_hdrops);
363                         return;
364                 }
365                 ip = mtod(m, struct ip *);
366         }
367         uh = (struct udphdr *)((caddr_t)ip + iphlen);
368
369         /*
370          * Destination port of 0 is illegal, based on RFC768.
371          */
372         if (uh->uh_dport == 0)
373                 goto badunlocked;
374
375         /*
376          * Construct sockaddr format source address.  Stuff source address
377          * and datagram in user buffer.
378          */
379         bzero(&udp_in, sizeof(udp_in));
380         udp_in.sin_len = sizeof(udp_in);
381         udp_in.sin_family = AF_INET;
382         udp_in.sin_port = uh->uh_sport;
383         udp_in.sin_addr = ip->ip_src;
384
385         /*
386          * Make mbuf data length reflect UDP length.  If not enough data to
387          * reflect UDP length, drop.
388          */
389         len = ntohs((u_short)uh->uh_ulen);
390         if (ip->ip_len != len) {
391                 if (len > ip->ip_len || len < sizeof(struct udphdr)) {
392                         UDPSTAT_INC(udps_badlen);
393                         goto badunlocked;
394                 }
395                 m_adj(m, len - ip->ip_len);
396                 /* ip->ip_len = len; */
397         }
398
399         /*
400          * Save a copy of the IP header in case we want restore it for
401          * sending an ICMP error message in response.
402          */
403         if (!V_udp_blackhole)
404                 save_ip = *ip;
405         else
406                 memset(&save_ip, 0, sizeof(save_ip));
407
408         /*
409          * Checksum extended UDP header and data.
410          */
411         if (uh->uh_sum) {
412                 u_short uh_sum;
413
414                 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
415                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
416                                 uh_sum = m->m_pkthdr.csum_data;
417                         else
418                                 uh_sum = in_pseudo(ip->ip_src.s_addr,
419                                     ip->ip_dst.s_addr, htonl((u_short)len +
420                                     m->m_pkthdr.csum_data + IPPROTO_UDP));
421                         uh_sum ^= 0xffff;
422                 } else {
423                         char b[9];
424
425                         bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
426                         bzero(((struct ipovly *)ip)->ih_x1, 9);
427                         ((struct ipovly *)ip)->ih_len = uh->uh_ulen;
428                         uh_sum = in_cksum(m, len + sizeof (struct ip));
429                         bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
430                 }
431                 if (uh_sum) {
432                         UDPSTAT_INC(udps_badsum);
433                         m_freem(m);
434                         return;
435                 }
436         } else
437                 UDPSTAT_INC(udps_nosum);
438
439 #ifdef IPFIREWALL_FORWARD
440         /*
441          * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
442          */
443         fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
444         if (fwd_tag != NULL) {
445                 struct sockaddr_in *next_hop;
446
447                 /*
448                  * Do the hack.
449                  */
450                 next_hop = (struct sockaddr_in *)(fwd_tag + 1);
451                 ip->ip_dst = next_hop->sin_addr;
452                 uh->uh_dport = ntohs(next_hop->sin_port);
453
454                 /*
455                  * Remove the tag from the packet.  We don't need it anymore.
456                  */
457                 m_tag_delete(m, fwd_tag);
458         }
459 #endif
460
461         INP_INFO_RLOCK(&V_udbinfo);
462         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
463             in_broadcast(ip->ip_dst, ifp)) {
464                 struct inpcb *last;
465                 struct ip_moptions *imo;
466
467                 last = NULL;
468                 LIST_FOREACH(inp, &V_udb, inp_list) {
469                         if (inp->inp_lport != uh->uh_dport)
470                                 continue;
471 #ifdef INET6
472                         if ((inp->inp_vflag & INP_IPV4) == 0)
473                                 continue;
474 #endif
475                         if (inp->inp_laddr.s_addr != INADDR_ANY &&
476                             inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
477                                 continue;
478                         if (inp->inp_faddr.s_addr != INADDR_ANY &&
479                             inp->inp_faddr.s_addr != ip->ip_src.s_addr)
480                                 continue;
481                         if (inp->inp_fport != 0 &&
482                             inp->inp_fport != uh->uh_sport)
483                                 continue;
484
485                         INP_RLOCK(inp);
486
487                         /*
488                          * Handle socket delivery policy for any-source
489                          * and source-specific multicast. [RFC3678]
490                          */
491                         imo = inp->inp_moptions;
492                         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
493                                 struct sockaddr_in       group;
494                                 int                      blocked;
495                                 if (imo == NULL) {
496                                         INP_RUNLOCK(inp);
497                                         continue;
498                                 }
499                                 bzero(&group, sizeof(struct sockaddr_in));
500                                 group.sin_len = sizeof(struct sockaddr_in);
501                                 group.sin_family = AF_INET;
502                                 group.sin_addr = ip->ip_dst;
503
504                                 blocked = imo_multi_filter(imo, ifp,
505                                         (struct sockaddr *)&group,
506                                         (struct sockaddr *)&udp_in);
507                                 if (blocked != MCAST_PASS) {
508                                         if (blocked == MCAST_NOTGMEMBER)
509                                                 IPSTAT_INC(ips_notmember);
510                                         if (blocked == MCAST_NOTSMEMBER ||
511                                             blocked == MCAST_MUTED)
512                                                 UDPSTAT_INC(udps_filtermcast);
513                                         INP_RUNLOCK(inp);
514                                         continue;
515                                 }
516                         }
517                         if (last != NULL) {
518                                 struct mbuf *n;
519
520                                 n = m_copy(m, 0, M_COPYALL);
521                                 udp_append(last, ip, n, iphlen, &udp_in);
522                                 INP_RUNLOCK(last);
523                         }
524                         last = inp;
525                         /*
526                          * Don't look for additional matches if this one does
527                          * not have either the SO_REUSEPORT or SO_REUSEADDR
528                          * socket options set.  This heuristic avoids
529                          * searching through all pcbs in the common case of a
530                          * non-shared port.  It assumes that an application
531                          * will never clear these options after setting them.
532                          */
533                         if ((last->inp_socket->so_options &
534                             (SO_REUSEPORT|SO_REUSEADDR)) == 0)
535                                 break;
536                 }
537
538                 if (last == NULL) {
539                         /*
540                          * No matching pcb found; discard datagram.  (No need
541                          * to send an ICMP Port Unreachable for a broadcast
542                          * or multicast datgram.)
543                          */
544                         UDPSTAT_INC(udps_noportbcast);
545                         goto badheadlocked;
546                 }
547                 udp_append(last, ip, m, iphlen, &udp_in);
548                 INP_RUNLOCK(last);
549                 INP_INFO_RUNLOCK(&V_udbinfo);
550                 return;
551         }
552
553         /*
554          * Locate pcb for datagram.
555          */
556         inp = in_pcblookup_hash(&V_udbinfo, ip->ip_src, uh->uh_sport,
557             ip->ip_dst, uh->uh_dport, 1, ifp);
558         if (inp == NULL) {
559                 if (udp_log_in_vain) {
560                         char buf[4*sizeof "123"];
561
562                         strcpy(buf, inet_ntoa(ip->ip_dst));
563                         log(LOG_INFO,
564                             "Connection attempt to UDP %s:%d from %s:%d\n",
565                             buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src),
566                             ntohs(uh->uh_sport));
567                 }
568                 UDPSTAT_INC(udps_noport);
569                 if (m->m_flags & (M_BCAST | M_MCAST)) {
570                         UDPSTAT_INC(udps_noportbcast);
571                         goto badheadlocked;
572                 }
573                 if (V_udp_blackhole)
574                         goto badheadlocked;
575                 if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
576                         goto badheadlocked;
577                 *ip = save_ip;
578                 ip->ip_len += iphlen;
579                 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
580                 INP_INFO_RUNLOCK(&V_udbinfo);
581                 return;
582         }
583
584         /*
585          * Check the minimum TTL for socket.
586          */
587         INP_RLOCK(inp);
588         INP_INFO_RUNLOCK(&V_udbinfo);
589         if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
590                 INP_RUNLOCK(inp);
591                 goto badunlocked;
592         }
593         udp_append(inp, ip, m, iphlen, &udp_in);
594         INP_RUNLOCK(inp);
595         return;
596
597 badheadlocked:
598         if (inp)
599                 INP_RUNLOCK(inp);
600         INP_INFO_RUNLOCK(&V_udbinfo);
601 badunlocked:
602         m_freem(m);
603 }
604 #endif /* INET */
605
606 /*
607  * Notify a udp user of an asynchronous error; just wake up so that they can
608  * collect error status.
609  */
610 struct inpcb *
611 udp_notify(struct inpcb *inp, int errno)
612 {
613
614         /*
615          * While udp_ctlinput() always calls udp_notify() with a read lock
616          * when invoking it directly, in_pcbnotifyall() currently uses write
617          * locks due to sharing code with TCP.  For now, accept either a read
618          * or a write lock, but a read lock is sufficient.
619          */
620         INP_LOCK_ASSERT(inp);
621
622         inp->inp_socket->so_error = errno;
623         sorwakeup(inp->inp_socket);
624         sowwakeup(inp->inp_socket);
625         return (inp);
626 }
627
628 #ifdef INET
629 void
630 udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
631 {
632         struct ip *ip = vip;
633         struct udphdr *uh;
634         struct in_addr faddr;
635         struct inpcb *inp;
636
637         faddr = ((struct sockaddr_in *)sa)->sin_addr;
638         if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
639                 return;
640
641         /*
642          * Redirects don't need to be handled up here.
643          */
644         if (PRC_IS_REDIRECT(cmd))
645                 return;
646
647         /*
648          * Hostdead is ugly because it goes linearly through all PCBs.
649          *
650          * XXX: We never get this from ICMP, otherwise it makes an excellent
651          * DoS attack on machines with many connections.
652          */
653         if (cmd == PRC_HOSTDEAD)
654                 ip = NULL;
655         else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
656                 return;
657         if (ip != NULL) {
658                 uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
659                 INP_INFO_RLOCK(&V_udbinfo);
660                 inp = in_pcblookup_hash(&V_udbinfo, faddr, uh->uh_dport,
661                     ip->ip_src, uh->uh_sport, 0, NULL);
662                 if (inp != NULL) {
663                         INP_RLOCK(inp);
664                         if (inp->inp_socket != NULL) {
665                                 udp_notify(inp, inetctlerrmap[cmd]);
666                         }
667                         INP_RUNLOCK(inp);
668                 }
669                 INP_INFO_RUNLOCK(&V_udbinfo);
670         } else
671                 in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd],
672                     udp_notify);
673 }
674 #endif /* INET */
675
676 static int
677 udp_pcblist(SYSCTL_HANDLER_ARGS)
678 {
679         int error, i, n;
680         struct inpcb *inp, **inp_list;
681         inp_gen_t gencnt;
682         struct xinpgen xig;
683
684         /*
685          * The process of preparing the PCB list is too time-consuming and
686          * resource-intensive to repeat twice on every request.
687          */
688         if (req->oldptr == 0) {
689                 n = V_udbinfo.ipi_count;
690                 n += imax(n / 8, 10);
691                 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
692                 return (0);
693         }
694
695         if (req->newptr != 0)
696                 return (EPERM);
697
698         /*
699          * OK, now we're committed to doing something.
700          */
701         INP_INFO_RLOCK(&V_udbinfo);
702         gencnt = V_udbinfo.ipi_gencnt;
703         n = V_udbinfo.ipi_count;
704         INP_INFO_RUNLOCK(&V_udbinfo);
705
706         error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
707                 + n * sizeof(struct xinpcb));
708         if (error != 0)
709                 return (error);
710
711         xig.xig_len = sizeof xig;
712         xig.xig_count = n;
713         xig.xig_gen = gencnt;
714         xig.xig_sogen = so_gencnt;
715         error = SYSCTL_OUT(req, &xig, sizeof xig);
716         if (error)
717                 return (error);
718
719         inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
720         if (inp_list == 0)
721                 return (ENOMEM);
722
723         INP_INFO_RLOCK(&V_udbinfo);
724         for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n;
725              inp = LIST_NEXT(inp, inp_list)) {
726                 INP_WLOCK(inp);
727                 if (inp->inp_gencnt <= gencnt &&
728                     cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
729                         in_pcbref(inp);
730                         inp_list[i++] = inp;
731                 }
732                 INP_WUNLOCK(inp);
733         }
734         INP_INFO_RUNLOCK(&V_udbinfo);
735         n = i;
736
737         error = 0;
738         for (i = 0; i < n; i++) {
739                 inp = inp_list[i];
740                 INP_RLOCK(inp);
741                 if (inp->inp_gencnt <= gencnt) {
742                         struct xinpcb xi;
743
744                         bzero(&xi, sizeof(xi));
745                         xi.xi_len = sizeof xi;
746                         /* XXX should avoid extra copy */
747                         bcopy(inp, &xi.xi_inp, sizeof *inp);
748                         if (inp->inp_socket)
749                                 sotoxsocket(inp->inp_socket, &xi.xi_socket);
750                         xi.xi_inp.inp_gencnt = inp->inp_gencnt;
751                         INP_RUNLOCK(inp);
752                         error = SYSCTL_OUT(req, &xi, sizeof xi);
753                 } else
754                         INP_RUNLOCK(inp);
755         }
756         INP_INFO_WLOCK(&V_udbinfo);
757         for (i = 0; i < n; i++) {
758                 inp = inp_list[i];
759                 INP_WLOCK(inp);
760                 if (!in_pcbrele(inp))
761                         INP_WUNLOCK(inp);
762         }
763         INP_INFO_WUNLOCK(&V_udbinfo);
764
765         if (!error) {
766                 /*
767                  * Give the user an updated idea of our state.  If the
768                  * generation differs from what we told her before, she knows
769                  * that something happened while we were processing this
770                  * request, and it might be necessary to retry.
771                  */
772                 INP_INFO_RLOCK(&V_udbinfo);
773                 xig.xig_gen = V_udbinfo.ipi_gencnt;
774                 xig.xig_sogen = so_gencnt;
775                 xig.xig_count = V_udbinfo.ipi_count;
776                 INP_INFO_RUNLOCK(&V_udbinfo);
777                 error = SYSCTL_OUT(req, &xig, sizeof xig);
778         }
779         free(inp_list, M_TEMP);
780         return (error);
781 }
782
783 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
784     CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
785     udp_pcblist, "S,xinpcb", "List of active UDP sockets");
786
787 #ifdef INET
788 static int
789 udp_getcred(SYSCTL_HANDLER_ARGS)
790 {
791         struct xucred xuc;
792         struct sockaddr_in addrs[2];
793         struct inpcb *inp;
794         int error;
795
796         error = priv_check(req->td, PRIV_NETINET_GETCRED);
797         if (error)
798                 return (error);
799         error = SYSCTL_IN(req, addrs, sizeof(addrs));
800         if (error)
801                 return (error);
802         INP_INFO_RLOCK(&V_udbinfo);
803         inp = in_pcblookup_hash(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
804                                 addrs[0].sin_addr, addrs[0].sin_port, 1, NULL);
805         if (inp != NULL) {
806                 INP_RLOCK(inp);
807                 INP_INFO_RUNLOCK(&V_udbinfo);
808                 if (inp->inp_socket == NULL)
809                         error = ENOENT;
810                 if (error == 0)
811                         error = cr_canseeinpcb(req->td->td_ucred, inp);
812                 if (error == 0)
813                         cru2x(inp->inp_cred, &xuc);
814                 INP_RUNLOCK(inp);
815         } else {
816                 INP_INFO_RUNLOCK(&V_udbinfo);
817                 error = ENOENT;
818         }
819         if (error == 0)
820                 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
821         return (error);
822 }
823
824 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
825     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
826     udp_getcred, "S,xucred", "Get the xucred of a UDP connection");
827 #endif /* INET */
828
829 int
830 udp_ctloutput(struct socket *so, struct sockopt *sopt)
831 {
832         int error = 0, optval;
833         struct inpcb *inp;
834 #ifdef IPSEC_NAT_T
835         struct udpcb *up;
836 #endif
837
838         inp = sotoinpcb(so);
839         KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
840         INP_WLOCK(inp);
841         if (sopt->sopt_level != IPPROTO_UDP) {
842 #ifdef INET6
843                 if (INP_CHECK_SOCKAF(so, AF_INET6)) {
844                         INP_WUNLOCK(inp);
845                         error = ip6_ctloutput(so, sopt);
846                 }
847 #endif
848 #if defined(INET) && defined(INET6)
849                 else
850 #endif
851 #ifdef INET
852                 {
853                         INP_WUNLOCK(inp);
854                         error = ip_ctloutput(so, sopt);
855                 }
856 #endif
857                 return (error);
858         }
859
860         switch (sopt->sopt_dir) {
861         case SOPT_SET:
862                 switch (sopt->sopt_name) {
863                 case UDP_ENCAP:
864                         INP_WUNLOCK(inp);
865                         error = sooptcopyin(sopt, &optval, sizeof optval,
866                                             sizeof optval);
867                         if (error)
868                                 break;
869                         inp = sotoinpcb(so);
870                         KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
871                         INP_WLOCK(inp);
872 #ifdef IPSEC_NAT_T
873                         up = intoudpcb(inp);
874                         KASSERT(up != NULL, ("%s: up == NULL", __func__));
875 #endif
876                         switch (optval) {
877                         case 0:
878                                 /* Clear all UDP encap. */
879 #ifdef IPSEC_NAT_T
880                                 up->u_flags &= ~UF_ESPINUDP_ALL;
881 #endif
882                                 break;
883 #ifdef IPSEC_NAT_T
884                         case UDP_ENCAP_ESPINUDP:
885                         case UDP_ENCAP_ESPINUDP_NON_IKE:
886                                 up->u_flags &= ~UF_ESPINUDP_ALL;
887                                 if (optval == UDP_ENCAP_ESPINUDP)
888                                         up->u_flags |= UF_ESPINUDP;
889                                 else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE)
890                                         up->u_flags |= UF_ESPINUDP_NON_IKE;
891                                 break;
892 #endif
893                         default:
894                                 error = EINVAL;
895                                 break;
896                         }
897                         INP_WUNLOCK(inp);
898                         break;
899                 default:
900                         INP_WUNLOCK(inp);
901                         error = ENOPROTOOPT;
902                         break;
903                 }
904                 break;
905         case SOPT_GET:
906                 switch (sopt->sopt_name) {
907 #ifdef IPSEC_NAT_T
908                 case UDP_ENCAP:
909                         up = intoudpcb(inp);
910                         KASSERT(up != NULL, ("%s: up == NULL", __func__));
911                         optval = up->u_flags & UF_ESPINUDP_ALL;
912                         INP_WUNLOCK(inp);
913                         error = sooptcopyout(sopt, &optval, sizeof optval);
914                         break;
915 #endif
916                 default:
917                         INP_WUNLOCK(inp);
918                         error = ENOPROTOOPT;
919                         break;
920                 }
921                 break;
922         }       
923         return (error);
924 }
925
926 #ifdef INET
/*
 * Build one UDP/IP datagram from the payload in m and hand it to
 * ip_output().
 *
 *   inp      protocol control block of the sending socket.
 *   m        payload mbuf chain; on every path it is either freed here or
 *            passed on to ip_output().
 *   addr     optional destination, treated as a struct sockaddr_in; when
 *            NULL the pcb's own foreign address and port are used.
 *   control  optional control mbuf, always freed here.  At the IPPROTO_IP
 *            level only IP_SENDSRCADDR is accepted (any other type gives
 *            ENOPROTOOPT); messages at other levels are skipped.
 *   td       calling thread; td->td_ucred is passed to the bind/connect
 *            setup helpers and to the jail checks.
 *
 * Returns 0 or an errno value.  The errors produced directly here are
 * EMSGSIZE, EINVAL, ENOPROTOOPT, EISCONN, ENOTCONN, EAGAIN and ENOBUFS;
 * errors from in_pcbbind_setup(), prison_remote_ip4(),
 * in_pcbconnect_setup() and ip_output() are passed through.
 */
927 static int
928 udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
929     struct mbuf *control, struct thread *td)
930 {
931         struct udpiphdr *ui;
932         int len = m->m_pkthdr.len;
933         struct in_addr faddr, laddr;
934         struct cmsghdr *cm;
935         struct sockaddr_in *sin, src;
936         int error = 0;
937         int ipflags;
938         u_short fport, lport;
        /*
         * Records which locks are held so they can be released to match:
         *   0 - inpcb read lock only
         *   1 - pcbinfo read lock and inpcb read lock
         *   2 - pcbinfo write lock and inpcb write lock
         */
939         int unlock_udbinfo;
940
941         /*
942          * udp_output() may need to temporarily bind or connect the current
943          * inpcb.  As such, we don't know up front whether we will need the
944          * pcbinfo lock or not.  Do any work to decide what is needed up
945          * front before acquiring any locks.
946          */
947         if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
948                 if (control)
949                         m_freem(control);
950                 m_freem(m);
951                 return (EMSGSIZE);
952         }
953
        /* sin_family stays 0 unless an IP_SENDSRCADDR message is parsed. */
954         src.sin_family = 0;
955         if (control != NULL) {
956                 /*
957                  * XXX: Currently, we assume all the optional information is
958                  * stored in a single mbuf.
959                  */
960                 if (control->m_next) {
961                         m_freem(control);
962                         m_freem(m);
963                         return (EINVAL);
964                 }
                /*
                 * Walk the control messages in place by advancing m_data
                 * and shrinking m_len.  The increment expression uses the
                 * cm assigned in the iteration just finished, so it also
                 * runs correctly after the "continue" below.
                 */
965                 for (; control->m_len > 0;
966                     control->m_data += CMSG_ALIGN(cm->cmsg_len),
967                     control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
968                         cm = mtod(control, struct cmsghdr *);
969                         if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
970                             || cm->cmsg_len > control->m_len) {
971                                 error = EINVAL;
972                                 break;
973                         }
974                         if (cm->cmsg_level != IPPROTO_IP)
975                                 continue;
976
977                         switch (cm->cmsg_type) {
978                         case IP_SENDSRCADDR:
979                                 if (cm->cmsg_len !=
980                                     CMSG_LEN(sizeof(struct in_addr))) {
981                                         error = EINVAL;
982                                         break;
983                                 }
984                                 bzero(&src, sizeof(src));
985                                 src.sin_family = AF_INET;
986                                 src.sin_len = sizeof(src);
987                                 src.sin_port = inp->inp_lport;
988                                 src.sin_addr =
989                                     *(struct in_addr *)CMSG_DATA(cm);
990                                 break;
991
992                         default:
993                                 error = ENOPROTOOPT;
994                                 break;
995                         }
                        /*
                         * The breaks above only leave the switch; this one
                         * ends the walk on the first error.
                         */
996                         if (error)
997                                 break;
998                 }
999                 m_freem(control);
1000         }
1001         if (error) {
1002                 m_freem(m);
1003                 return (error);
1004         }
1005
1006         /*
1007          * Depending on whether or not the application has bound or connected
1008          * the socket, we may have to do varying levels of work.  The optimal
1009          * case is for a connected UDP socket, as a global lock isn't
1010          * required at all.
1011          *
1012          * In order to decide which we need, we require stability of the
1013          * inpcb binding, which we ensure by acquiring a read lock on the
1014          * inpcb.  This doesn't strictly follow the lock order, so we play
1015          * the trylock and retry game; note that we may end up with more
1016          * conservative locks than required the second time around, so later
1017          * assertions have to accept that.  Further analysis of the number of
1018          * misses under contention is required.
1019          */
1020         sin = (struct sockaddr_in *)addr;
1021         INP_RLOCK(inp);
1022         if (sin != NULL &&
1023             (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
                /*
                 * Explicit destination on a completely unbound pcb: swap
                 * the read lock for pcbinfo + inpcb write locks, since a
                 * port binding may be committed further down.
                 */
1024                 INP_RUNLOCK(inp);
1025                 INP_INFO_WLOCK(&V_udbinfo);
1026                 INP_WLOCK(inp);
1027                 unlock_udbinfo = 2;
1028         } else if ((sin != NULL && (
1029             (sin->sin_addr.s_addr == INADDR_ANY) ||
1030             (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
1031             (inp->inp_laddr.s_addr == INADDR_ANY) ||
1032             (inp->inp_lport == 0))) ||
1033             (src.sin_family == AF_INET)) {
                /*
                 * The pcbinfo read lock is needed as well.  Try to take it
                 * while holding the inpcb lock; if that fails, drop the
                 * inpcb lock and take both in pcbinfo-then-inpcb order.
                 */
1034                 if (!INP_INFO_TRY_RLOCK(&V_udbinfo)) {
1035                         INP_RUNLOCK(inp);
1036                         INP_INFO_RLOCK(&V_udbinfo);
1037                         INP_RLOCK(inp);
1038                 }
1039                 unlock_udbinfo = 1;
1040         } else
1041                 unlock_udbinfo = 0;
1042
1043         /*
1044          * If the IP_SENDSRCADDR control message was specified, override the
1045          * source address for this datagram.  Its use is invalidated if the
1046          * address thus specified is incomplete or clobbers other inpcbs.
1047          */
1048         laddr = inp->inp_laddr;
1049         lport = inp->inp_lport;
1050         if (src.sin_family == AF_INET) {
1051                 INP_INFO_LOCK_ASSERT(&V_udbinfo);
1052                 if ((lport == 0) ||
1053                     (laddr.s_addr == INADDR_ANY &&
1054                      src.sin_addr.s_addr == INADDR_ANY)) {
1055                         error = EINVAL;
1056                         goto release;
1057                 }
                /* Results land in the local laddr/lport copies only. */
1058                 error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
1059                     &laddr.s_addr, &lport, td->td_ucred);
1060                 if (error)
1061                         goto release;
1062         }
1063
1064         /*
1065          * If a UDP socket has been connected, then a local address/port will
1066          * have been selected and bound.
1067          *
1068          * If a UDP socket has not been connected to, then an explicit
1069          * destination address must be used, in which case a local
1070          * address/port may not have been selected and bound.
1071          */
1072         if (sin != NULL) {
1073                 INP_LOCK_ASSERT(inp);
1074                 if (inp->inp_faddr.s_addr != INADDR_ANY) {
1075                         error = EISCONN;
1076                         goto release;
1077                 }
1078
1079                 /*
1080                  * Jail may rewrite the destination address, so let it do
1081                  * that before we use it.
1082                  */
1083                 error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1084                 if (error)
1085                         goto release;
1086
1087                 /*
1088                  * If a local address or port hasn't yet been selected, or if
1089                  * the destination address needs to be rewritten due to using
1090                  * a special INADDR_ constant, invoke in_pcbconnect_setup()
1091                  * to do the heavy lifting.  Once a port is selected, we
1092                  * commit the binding back to the socket; we also commit the
1093                  * binding of the address if in jail.
1094                  *
1095                  * If we already have a valid binding and we're not
1096                  * requesting a destination address rewrite, use a fast path.
1097                  */
1098                 if (inp->inp_laddr.s_addr == INADDR_ANY ||
1099                     inp->inp_lport == 0 ||
1100                     sin->sin_addr.s_addr == INADDR_ANY ||
1101                     sin->sin_addr.s_addr == INADDR_BROADCAST) {
1102                         INP_INFO_LOCK_ASSERT(&V_udbinfo);
1103                         error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
1104                             &lport, &faddr.s_addr, &fport, NULL,
1105                             td->td_ucred);
1106                         if (error)
1107                                 goto release;
1108
1109                         /*
1110                          * XXXRW: Why not commit the port if the address is
1111                          * !INADDR_ANY?
1112                          */
1113                         /* Commit the local port if newly assigned. */
1114                         if (inp->inp_laddr.s_addr == INADDR_ANY &&
1115                             inp->inp_lport == 0) {
1116                                 INP_INFO_WLOCK_ASSERT(&V_udbinfo);
1117                                 INP_WLOCK_ASSERT(inp);
1118                                 /*
1119                                  * Remember addr if jailed, to prevent
1120                                  * rebinding.
1121                                  */
1122                                 if (prison_flag(td->td_ucred, PR_IP4))
1123                                         inp->inp_laddr = laddr;
1124                                 inp->inp_lport = lport;
                                /* Undo the port assignment if hashing fails. */
1125                                 if (in_pcbinshash(inp) != 0) {
1126                                         inp->inp_lport = 0;
1127                                         error = EAGAIN;
1128                                         goto release;
1129                                 }
1130                                 inp->inp_flags |= INP_ANONPORT;
1131                         }
1132                 } else {
1133                         faddr = sin->sin_addr;
1134                         fport = sin->sin_port;
1135                 }
1136         } else {
1137                 INP_LOCK_ASSERT(inp);
1138                 faddr = inp->inp_faddr;
1139                 fport = inp->inp_fport;
1140                 if (faddr.s_addr == INADDR_ANY) {
1141                         error = ENOTCONN;
1142                         goto release;
1143                 }
1144         }
1145
1146         /*
1147          * Calculate data length and get a mbuf for UDP, IP, and possible
1148          * link-layer headers.  Immediately slide the data pointer back
1149          * forward since we won't use that space at this layer.
1150          */
1151         M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_DONTWAIT);
1152         if (m == NULL) {
1153                 error = ENOBUFS;
1154                 goto release;
1155         }
1156         m->m_data += max_linkhdr;
1157         m->m_len -= max_linkhdr;
1158         m->m_pkthdr.len -= max_linkhdr;
1159
1160         /*
1161          * Fill in mbuf with extended UDP header and addresses and length put
1162          * into network format.
1163          */
1164         ui = mtod(m, struct udpiphdr *);
1165         bzero(ui->ui_x1, sizeof(ui->ui_x1));    /* XXX still needed? */
1166         ui->ui_pr = IPPROTO_UDP;
1167         ui->ui_src = laddr;
1168         ui->ui_dst = faddr;
1169         ui->ui_sport = lport;
1170         ui->ui_dport = fport;
1171         ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
1172
1173         /*
1174          * Set the Don't Fragment bit in the IP header.
1175          */
1176         if (inp->inp_flags & INP_DONTFRAG) {
1177                 struct ip *ip;
1178
1179                 ip = (struct ip *)&ui->ui_i;
1180                 ip->ip_off |= IP_DF;
1181         }
1182
        /* Translate socket options and pcb flags into ip_output() flags. */
1183         ipflags = 0;
1184         if (inp->inp_socket->so_options & SO_DONTROUTE)
1185                 ipflags |= IP_ROUTETOIF;
1186         if (inp->inp_socket->so_options & SO_BROADCAST)
1187                 ipflags |= IP_ALLOWBROADCAST;
1188         if (inp->inp_flags & INP_ONESBCAST)
1189                 ipflags |= IP_SENDONES;
1190
1191 #ifdef MAC
1192         mac_inpcb_create_mbuf(inp, m);
1193 #endif
1194
1195         /*
1196          * Set up checksum and output datagram.
1197          */
1198         if (udp_cksum) {
                /*
                 * With INP_ONESBCAST the pseudo-header sum is computed over
                 * INADDR_BROADCAST; ui_dst, set above, is left unchanged.
                 */
1199                 if (inp->inp_flags & INP_ONESBCAST)
1200                         faddr.s_addr = INADDR_BROADCAST;
1201                 ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
1202                     htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP));
1203                 m->m_pkthdr.csum_flags = CSUM_UDP;
1204                 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
1205         } else
1206                 ui->ui_sum = 0;
1207         ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len;
1208         ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;    /* XXX */
1209         ((struct ip *)ui)->ip_tos = inp->inp_ip_tos;    /* XXX */
1210         UDPSTAT_INC(udps_opackets);
1211
        /*
         * Success path: the pcbinfo lock is dropped before calling
         * ip_output(), while the inpcb lock stays held across the call
         * and is released afterwards.
         */
1212         if (unlock_udbinfo == 2)
1213                 INP_INFO_WUNLOCK(&V_udbinfo);
1214         else if (unlock_udbinfo == 1)
1215                 INP_INFO_RUNLOCK(&V_udbinfo);
1216         error = ip_output(m, inp->inp_options, NULL, ipflags,
1217             inp->inp_moptions, inp);
1218         if (unlock_udbinfo == 2)
1219                 INP_WUNLOCK(inp);
1220         else
1221                 INP_RUNLOCK(inp);
1222         return (error);
1223
        /*
         * Error path: release whichever locks unlock_udbinfo says are held
         * and free the payload.
         * NOTE(review): m is NULL here when M_PREPEND failed, so this relies
         * on m_freem() accepting NULL -- confirm.
         */
1224 release:
1225         if (unlock_udbinfo == 2) {
1226                 INP_WUNLOCK(inp);
1227                 INP_INFO_WUNLOCK(&V_udbinfo);
1228         } else if (unlock_udbinfo == 1) {
1229                 INP_RUNLOCK(inp);
1230                 INP_INFO_RUNLOCK(&V_udbinfo);
1231         } else
1232                 INP_RUNLOCK(inp);
1233         m_freem(m);
1234         return (error);
1235 }
1236
1237
1238 #if defined(IPSEC) && defined(IPSEC_NAT_T)
1239 /*
1240  * Potentially decap ESP in UDP frame.  Check for an ESP header
1241  * and optional marker; if present, strip the UDP header and
1242  * push the result through IPSec.
1243  *
 *   inp  pcb that matched the datagram; asserted to be read-locked, and
 *        its udpcb must have one of the UF_ESPINUDP_ALL flags set.
 *   m    the received packet, starting at the IP header.
 *   off  offset from the start of the packet to the UDP payload; the IP
 *        header length is taken to be off - sizeof(struct udphdr).
 *
1244  * Returns mbuf to be processed (potentially re-allocated) or
1245  * NULL if consumed and/or processed.
1246  */
1247 static struct mbuf *
1248 udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
1249 {
1250         size_t minlen, payload, skip, iphlen;
1251         caddr_t data;
1252         struct udpcb *up;
1253         struct m_tag *tag;
1254         struct udphdr *udphdr;
1255         struct ip *ip;
1256
1257         INP_RLOCK_ASSERT(inp);
1258
1259         /* 
1260          * Pull up data so the longest case is contiguous:
1261          *    IP/UDP hdr + non ESP marker + ESP hdr.
          * The request is clamped to the packet length for short packets.
1262          */
1263         minlen = off + sizeof(uint64_t) + sizeof(struct esp);
1264         if (minlen > m->m_pkthdr.len)
1265                 minlen = m->m_pkthdr.len;
1266         if ((m = m_pullup(m, minlen)) == NULL) {
1267                 V_ipsec4stat.in_inval++;
1268                 return (NULL);          /* Bypass caller processing. */
1269         }
1270         data = mtod(m, caddr_t);        /* Points to ip header. */
1271         payload = m->m_len - off;       /* Size of payload. */
1272
1273         if (payload == 1 && data[off] == '\xff')
1274                 return (m);             /* NB: keepalive packet, no decap. */
1275
1276         up = intoudpcb(inp);
1277         KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
1278         KASSERT((up->u_flags & UF_ESPINUDP_ALL) != 0,
1279             ("u_flags 0x%x", up->u_flags));
1280
1281         /* 
1282          * Check that the payload is large enough to hold an
1283          * ESP header and compute the amount of data to remove.
1284          *
1285          * NB: the m_pullup() above has made this region contiguous.
1286          * XXX can we assume alignment and eliminate bcopys?
1287          */
1288         if (up->u_flags & UF_ESPINUDP_NON_IKE) {
1289                 /*
1290                  * draft-ietf-ipsec-nat-t-ike-0[01].txt and
1291                  * draft-ietf-ipsec-udp-encaps-(00/)01.txt, ignoring
1292                  * possible AH mode non-IKE marker+non-ESP marker
1293                  * from draft-ietf-ipsec-udp-encaps-00.txt.
1294                  */
1295                 uint64_t marker;
1296
1297                 if (payload <= sizeof(uint64_t) + sizeof(struct esp))
1298                         return (m);     /* NB: no decap. */
1299                 bcopy(data + off, &marker, sizeof(uint64_t));
1300                 if (marker != 0)        /* Non-IKE marker. */
1301                         return (m);     /* NB: no decap. */
                /* Strip the 8-byte marker along with the UDP header. */
1302                 skip = sizeof(uint64_t) + sizeof(struct udphdr);
1303         } else {
1304                 uint32_t spi;
1305
1306                 if (payload <= sizeof(struct esp)) {
1307                         V_ipsec4stat.in_inval++;
1308                         m_freem(m);
1309                         return (NULL);  /* Discard. */
1310                 }
1311                 bcopy(data + off, &spi, sizeof(uint32_t));
1312                 if (spi == 0)           /* Non-ESP marker. */
1313                         return (m);     /* NB: no decap. */
1314                 skip = sizeof(struct udphdr);
1315         }
1316
1317         /*
1318          * Setup a PACKET_TAG_IPSEC_NAT_T_PORTS tag to remember
1319          * the UDP ports. This is required if we want to select
1320          * the right SPD for multiple hosts behind same NAT.
1321          *
1322          * NB: ports are maintained in network byte order everywhere
1323          *     in the NAT-T code.
1324          */
1325         tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS,
1326                 2 * sizeof(uint16_t), M_NOWAIT);
1327         if (tag == NULL) {
1328                 V_ipsec4stat.in_nomem++;
1329                 m_freem(m);
1330                 return (NULL);          /* Discard. */
1331         }
1332         iphlen = off - sizeof(struct udphdr);
1333         udphdr = (struct udphdr *)(data + iphlen);
        /* Tag payload: [0] = source port, [1] = destination port. */
1334         ((uint16_t *)(tag + 1))[0] = udphdr->uh_sport;
1335         ((uint16_t *)(tag + 1))[1] = udphdr->uh_dport;
1336         m_tag_prepend(m, tag);
1337
1338         /*
1339          * Remove the UDP header (and possibly the non ESP marker)
1340          * IP header length is iphlen
1341          * Before:
1342          *   <--- off --->
1343          *   +----+------+-----+
1344          *   | IP |  UDP | ESP |
1345          *   +----+------+-----+
1346          *        <-skip->
1347          * After:
1348          *          +----+-----+
1349          *          | IP | ESP |
1350          *          +----+-----+
1351          *   <-skip->
          *
          * The IP header is slid forward by skip bytes (an overlapping
          * copy), then the first skip bytes are trimmed from the mbuf.
1352          */
1353         ovbcopy(data, data + skip, iphlen);
1354         m_adj(m, skip);
1355
1356         ip = mtod(m, struct ip *);
1357         ip->ip_len -= skip;
1358         ip->ip_p = IPPROTO_ESP;
1359
1360         /*
1361          * We cannot yet update the cksums so clear any
1362          * h/w cksum flags as they are no longer valid.
1363          */
1364         if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)
1365                 m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
1366
        /* The return value of the IPsec input routine is deliberately ignored. */
1367         (void) ipsec4_common_input(m, iphlen, ip->ip_p);
1368         return (NULL);                  /* NB: consumed, bypass processing. */
1369 }
1370 #endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */
1371
1372 static void
1373 udp_abort(struct socket *so)
1374 {
1375         struct inpcb *inp;
1376
1377         inp = sotoinpcb(so);
1378         KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
1379         INP_INFO_WLOCK(&V_udbinfo);
1380         INP_WLOCK(inp);
1381         if (inp->inp_faddr.s_addr != INADDR_ANY) {
1382                 in_pcbdisconnect(inp);
1383                 inp->inp_laddr.s_addr = INADDR_ANY;
1384                 soisdisconnected(so);
1385         }
1386         INP_WUNLOCK(inp);
1387         INP_INFO_WUNLOCK(&V_udbinfo);
1388 }
1389
1390 static int
1391 udp_attach(struct socket *so, int proto, struct thread *td)
1392 {
1393         struct inpcb *inp;
1394         int error;
1395
1396         inp = sotoinpcb(so);
1397         KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
1398         error = soreserve(so, udp_sendspace, udp_recvspace);
1399         if (error)
1400                 return (error);
1401         INP_INFO_WLOCK(&V_udbinfo);
1402         error = in_pcballoc(so, &V_udbinfo);
1403         if (error) {
1404                 INP_INFO_WUNLOCK(&V_udbinfo);
1405                 return (error);
1406         }
1407
1408         inp = sotoinpcb(so);
1409         inp->inp_vflag |= INP_IPV4;
1410         inp->inp_ip_ttl = V_ip_defttl;
1411
1412         error = udp_newudpcb(inp);
1413         if (error) {
1414                 in_pcbdetach(inp);
1415                 in_pcbfree(inp);
1416                 INP_INFO_WUNLOCK(&V_udbinfo);
1417                 return (error);
1418         }
1419
1420         INP_WUNLOCK(inp);
1421         INP_INFO_WUNLOCK(&V_udbinfo);
1422         return (0);
1423 }
1424 #endif /* INET */
1425
1426 int
1427 udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f)
1428 {
1429         struct inpcb *inp;
1430         struct udpcb *up;
1431
1432         KASSERT(so->so_type == SOCK_DGRAM,
1433             ("udp_set_kernel_tunneling: !dgram"));
1434         inp = sotoinpcb(so);
1435         KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
1436         INP_WLOCK(inp);
1437         up = intoudpcb(inp);
1438         if (up->u_tun_func != NULL) {
1439                 INP_WUNLOCK(inp);
1440                 return (EBUSY);
1441         }
1442         up->u_tun_func = f;
1443         INP_WUNLOCK(inp);
1444         return (0);
1445 }
1446
1447 #ifdef INET
1448 static int
1449 udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
1450 {
1451         struct inpcb *inp;
1452         int error;
1453
1454         inp = sotoinpcb(so);
1455         KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
1456         INP_INFO_WLOCK(&V_udbinfo);
1457         INP_WLOCK(inp);
1458         error = in_pcbbind(inp, nam, td->td_ucred);
1459         INP_WUNLOCK(inp);
1460         INP_INFO_WUNLOCK(&V_udbinfo);
1461         return (error);
1462 }
1463
1464 static void
1465 udp_close(struct socket *so)
1466 {
1467         struct inpcb *inp;
1468
1469         inp = sotoinpcb(so);
1470         KASSERT(inp != NULL, ("udp_close: inp == NULL"));
1471         INP_INFO_WLOCK(&V_udbinfo);
1472         INP_WLOCK(inp);
1473         if (inp->inp_faddr.s_addr != INADDR_ANY) {
1474                 in_pcbdisconnect(inp);
1475                 inp->inp_laddr.s_addr = INADDR_ANY;
1476                 soisdisconnected(so);
1477         }
1478         INP_WUNLOCK(inp);
1479         INP_INFO_WUNLOCK(&V_udbinfo);
1480 }
1481
1482 static int
1483 udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1484 {
1485         struct inpcb *inp;
1486         int error;
1487         struct sockaddr_in *sin;
1488
1489         inp = sotoinpcb(so);
1490         KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
1491         INP_INFO_WLOCK(&V_udbinfo);
1492         INP_WLOCK(inp);
1493         if (inp->inp_faddr.s_addr != INADDR_ANY) {
1494                 INP_WUNLOCK(inp);
1495                 INP_INFO_WUNLOCK(&V_udbinfo);
1496                 return (EISCONN);
1497         }
1498         sin = (struct sockaddr_in *)nam;
1499         error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1500         if (error != 0) {
1501                 INP_WUNLOCK(inp);
1502                 INP_INFO_WUNLOCK(&V_udbinfo);
1503                 return (error);
1504         }
1505         error = in_pcbconnect(inp, nam, td->td_ucred);
1506         if (error == 0)
1507                 soisconnected(so);
1508         INP_WUNLOCK(inp);
1509         INP_INFO_WUNLOCK(&V_udbinfo);
1510         return (error);
1511 }
1512
1513 static void
1514 udp_detach(struct socket *so)
1515 {
1516         struct inpcb *inp;
1517         struct udpcb *up;
1518
1519         inp = sotoinpcb(so);
1520         KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
1521         KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
1522             ("udp_detach: not disconnected"));
1523         INP_INFO_WLOCK(&V_udbinfo);
1524         INP_WLOCK(inp);
1525         up = intoudpcb(inp);
1526         KASSERT(up != NULL, ("%s: up == NULL", __func__));
1527         inp->inp_ppcb = NULL;
1528         in_pcbdetach(inp);
1529         in_pcbfree(inp);
1530         INP_INFO_WUNLOCK(&V_udbinfo);
1531         udp_discardcb(up);
1532 }
1533
1534 static int
1535 udp_disconnect(struct socket *so)
1536 {
1537         struct inpcb *inp;
1538
1539         inp = sotoinpcb(so);
1540         KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
1541         INP_INFO_WLOCK(&V_udbinfo);
1542         INP_WLOCK(inp);
1543         if (inp->inp_faddr.s_addr == INADDR_ANY) {
1544                 INP_WUNLOCK(inp);
1545                 INP_INFO_WUNLOCK(&V_udbinfo);
1546                 return (ENOTCONN);
1547         }
1548
1549         in_pcbdisconnect(inp);
1550         inp->inp_laddr.s_addr = INADDR_ANY;
1551         SOCK_LOCK(so);
1552         so->so_state &= ~SS_ISCONNECTED;                /* XXX */
1553         SOCK_UNLOCK(so);
1554         INP_WUNLOCK(inp);
1555         INP_INFO_WUNLOCK(&V_udbinfo);
1556         return (0);
1557 }
1558
1559 static int
1560 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
1561     struct mbuf *control, struct thread *td)
1562 {
1563         struct inpcb *inp;
1564
1565         inp = sotoinpcb(so);
1566         KASSERT(inp != NULL, ("udp_send: inp == NULL"));
1567         return (udp_output(inp, m, addr, control, td));
1568 }
1569 #endif /* INET */
1570
1571 int
1572 udp_shutdown(struct socket *so)
1573 {
1574         struct inpcb *inp;
1575
1576         inp = sotoinpcb(so);
1577         KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
1578         INP_WLOCK(inp);
1579         socantsendmore(so);
1580         INP_WUNLOCK(inp);
1581         return (0);
1582 }
1583
1584 #ifdef INET
1585 struct pr_usrreqs udp_usrreqs = {
1586         .pru_abort =            udp_abort,
1587         .pru_attach =           udp_attach,
1588         .pru_bind =             udp_bind,
1589         .pru_connect =          udp_connect,
1590         .pru_control =          in_control,
1591         .pru_detach =           udp_detach,
1592         .pru_disconnect =       udp_disconnect,
1593         .pru_peeraddr =         in_getpeeraddr,
1594         .pru_send =             udp_send,
1595         .pru_soreceive =        soreceive_dgram,
1596         .pru_sosend =           sosend_dgram,
1597         .pru_shutdown =         udp_shutdown,
1598         .pru_sockaddr =         in_getsockaddr,
1599         .pru_sosetlabel =       in_pcbsosetlabel,
1600         .pru_close =            udp_close,
1601 };
1602 #endif /* INET */