]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/ip_output.c
LinuxKPI: interrupt.h: add disable_irq_nosync(), irq_set_status_flags()
[FreeBSD/FreeBSD.git] / sys / netinet / ip_output.c
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1990, 1993
5  *      The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  *      @(#)ip_output.c 8.3 (Berkeley) 1/21/94
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include "opt_inet.h"
38 #include "opt_ipsec.h"
39 #include "opt_kern_tls.h"
40 #include "opt_mbuf_stress_test.h"
41 #include "opt_ratelimit.h"
42 #include "opt_route.h"
43 #include "opt_rss.h"
44 #include "opt_sctp.h"
45
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/kernel.h>
49 #include <sys/ktls.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/mbuf.h>
53 #include <sys/priv.h>
54 #include <sys/proc.h>
55 #include <sys/protosw.h>
56 #include <sys/sdt.h>
57 #include <sys/socket.h>
58 #include <sys/socketvar.h>
59 #include <sys/sysctl.h>
60 #include <sys/ucred.h>
61
62 #include <net/if.h>
63 #include <net/if_var.h>
64 #include <net/if_private.h>
65 #include <net/if_vlan_var.h>
66 #include <net/if_llatbl.h>
67 #include <net/ethernet.h>
68 #include <net/netisr.h>
69 #include <net/pfil.h>
70 #include <net/route.h>
71 #include <net/route/nhop.h>
72 #include <net/rss_config.h>
73 #include <net/vnet.h>
74
75 #include <netinet/in.h>
76 #include <netinet/in_fib.h>
77 #include <netinet/in_kdtrace.h>
78 #include <netinet/in_systm.h>
79 #include <netinet/ip.h>
80 #include <netinet/in_fib.h>
81 #include <netinet/in_pcb.h>
82 #include <netinet/in_rss.h>
83 #include <netinet/in_var.h>
84 #include <netinet/ip_var.h>
85 #include <netinet/ip_options.h>
86
87 #include <netinet/udp.h>
88 #include <netinet/udp_var.h>
89
90 #if defined(SCTP) || defined(SCTP_SUPPORT)
91 #include <netinet/sctp.h>
92 #include <netinet/sctp_crc32.h>
93 #endif
94
95 #include <netipsec/ipsec_support.h>
96
97 #include <machine/in_cksum.h>
98
99 #include <security/mac/mac_framework.h>
100
101 #ifdef MBUF_STRESS_TEST
102 static int mbuf_frag_size = 0;
103 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
104         &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
105 #endif
106
107 static void     ip_mloopback(struct ifnet *, const struct mbuf *, int);
108
109 extern int in_mcast_loop;
110
111 static inline int
112 ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, int flags,
113     struct inpcb *inp, struct sockaddr_in *dst, int *fibnum, int *error)
114 {
115         struct m_tag *fwd_tag = NULL;
116         struct mbuf *m;
117         struct in_addr odst;
118         struct ip *ip;
119         int pflags = PFIL_OUT;
120
121         m = *mp;
122         ip = mtod(m, struct ip *);
123
124         /* Run through list of hooks for output packets. */
125         odst.s_addr = ip->ip_dst.s_addr;
126         switch (pfil_run_hooks(V_inet_pfil_head, mp, ifp, pflags, inp)) {
127         case PFIL_DROPPED:
128                 *error = EACCES;
129                 /* FALLTHROUGH */
130         case PFIL_CONSUMED:
131                 return 1; /* Finished */
132         case PFIL_PASS:
133                 *error = 0;
134         }
135         m = *mp;
136         ip = mtod(m, struct ip *);
137
138         /* See if destination IP address was changed by packet filter. */
139         if (odst.s_addr != ip->ip_dst.s_addr) {
140                 m->m_flags |= M_SKIP_FIREWALL;
141                 /* If destination is now ourself drop to ip_input(). */
142                 if (in_localip(ip->ip_dst)) {
143                         m->m_flags |= M_FASTFWD_OURS;
144                         if (m->m_pkthdr.rcvif == NULL)
145                                 m->m_pkthdr.rcvif = V_loif;
146                         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
147                                 m->m_pkthdr.csum_flags |=
148                                         CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
149                                 m->m_pkthdr.csum_data = 0xffff;
150                         }
151                         m->m_pkthdr.csum_flags |=
152                                 CSUM_IP_CHECKED | CSUM_IP_VALID;
153 #if defined(SCTP) || defined(SCTP_SUPPORT)
154                         if (m->m_pkthdr.csum_flags & CSUM_SCTP)
155                                 m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
156 #endif
157                         *error = netisr_queue(NETISR_IP, m);
158                         return 1; /* Finished */
159                 }
160
161                 bzero(dst, sizeof(*dst));
162                 dst->sin_family = AF_INET;
163                 dst->sin_len = sizeof(*dst);
164                 dst->sin_addr = ip->ip_dst;
165
166                 return -1; /* Reloop */
167         }
168         /* See if fib was changed by packet filter. */
169         if ((*fibnum) != M_GETFIB(m)) {
170                 m->m_flags |= M_SKIP_FIREWALL;
171                 *fibnum = M_GETFIB(m);
172                 return -1; /* Reloop for FIB change */
173         }
174
175         /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
176         if (m->m_flags & M_FASTFWD_OURS) {
177                 if (m->m_pkthdr.rcvif == NULL)
178                         m->m_pkthdr.rcvif = V_loif;
179                 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
180                         m->m_pkthdr.csum_flags |=
181                                 CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
182                         m->m_pkthdr.csum_data = 0xffff;
183                 }
184 #if defined(SCTP) || defined(SCTP_SUPPORT)
185                 if (m->m_pkthdr.csum_flags & CSUM_SCTP)
186                         m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
187 #endif
188                 m->m_pkthdr.csum_flags |=
189                         CSUM_IP_CHECKED | CSUM_IP_VALID;
190
191                 *error = netisr_queue(NETISR_IP, m);
192                 return 1; /* Finished */
193         }
194         /* Or forward to some other address? */
195         if ((m->m_flags & M_IP_NEXTHOP) &&
196             ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) {
197                 bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
198                 m->m_flags |= M_SKIP_FIREWALL;
199                 m->m_flags &= ~M_IP_NEXTHOP;
200                 m_tag_delete(m, fwd_tag);
201
202                 return -1; /* Reloop for CHANGE of dst */
203         }
204
205         return 0;
206 }
207
208 static int
209 ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m,
210     const struct sockaddr *gw, struct route *ro, bool stamp_tag)
211 {
212 #ifdef KERN_TLS
213         struct ktls_session *tls = NULL;
214 #endif
215         struct m_snd_tag *mst;
216         int error;
217
218         MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
219         mst = NULL;
220
221 #ifdef KERN_TLS
222         /*
223          * If this is an unencrypted TLS record, save a reference to
224          * the record.  This local reference is used to call
225          * ktls_output_eagain after the mbuf has been freed (thus
226          * dropping the mbuf's reference) in if_output.
227          */
228         if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) {
229                 tls = ktls_hold(m->m_next->m_epg_tls);
230                 mst = tls->snd_tag;
231
232                 /*
233                  * If a TLS session doesn't have a valid tag, it must
234                  * have had an earlier ifp mismatch, so drop this
235                  * packet.
236                  */
237                 if (mst == NULL) {
238                         m_freem(m);
239                         error = EAGAIN;
240                         goto done;
241                 }
242                 /*
243                  * Always stamp tags that include NIC ktls.
244                  */
245                 stamp_tag = true;
246         }
247 #endif
248 #ifdef RATELIMIT
249         if (inp != NULL && mst == NULL) {
250                 if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 ||
251                     (inp->inp_snd_tag != NULL &&
252                     inp->inp_snd_tag->ifp != ifp))
253                         in_pcboutput_txrtlmt(inp, ifp, m);
254
255                 if (inp->inp_snd_tag != NULL)
256                         mst = inp->inp_snd_tag;
257         }
258 #endif
259         if (stamp_tag && mst != NULL) {
260                 KASSERT(m->m_pkthdr.rcvif == NULL,
261                     ("trying to add a send tag to a forwarded packet"));
262                 if (mst->ifp != ifp) {
263                         m_freem(m);
264                         error = EAGAIN;
265                         goto done;
266                 }
267
268                 /* stamp send tag on mbuf */
269                 m->m_pkthdr.snd_tag = m_snd_tag_ref(mst);
270                 m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
271         }
272
273         error = (*ifp->if_output)(ifp, m, gw, ro);
274
275 done:
276         /* Check for route change invalidating send tags. */
277 #ifdef KERN_TLS
278         if (tls != NULL) {
279                 if (error == EAGAIN)
280                         error = ktls_output_eagain(inp, tls);
281                 ktls_free(tls);
282         }
283 #endif
284 #ifdef RATELIMIT
285         if (error == EAGAIN)
286                 in_pcboutput_eagain(inp);
287 #endif
288         return (error);
289 }
290
291 /* rte<>ro_flags translation */
292 static inline void
293 rt_update_ro_flags(struct route *ro, const struct nhop_object *nh)
294 {
295         int nh_flags = nh->nh_flags;
296
297         ro->ro_flags &= ~ (RT_REJECT|RT_BLACKHOLE|RT_HAS_GW);
298
299         ro->ro_flags |= (nh_flags & NHF_REJECT) ? RT_REJECT : 0;
300         ro->ro_flags |= (nh_flags & NHF_BLACKHOLE) ? RT_BLACKHOLE : 0;
301         ro->ro_flags |= (nh_flags & NHF_GATEWAY) ? RT_HAS_GW : 0;
302 }
303
304 /*
305  * IP output.  The packet in mbuf chain m contains a skeletal IP
306  * header (with len, off, ttl, proto, tos, src, dst).
307  * The mbuf chain containing the packet will be freed.
308  * The mbuf opt, if present, will not be freed.
309  * If route ro is present and has ro_nh initialized, route lookup would be
310  * skipped and ro->ro_nh would be used. If ro is present but ro->ro_nh is NULL,
311  * then result of route lookup is stored in ro->ro_nh.
312  *
313  * In the IP forwarding case, the packet will arrive with options already
314  * inserted, so must have a NULL opt pointer.
315  */
316 int
317 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
318     struct ip_moptions *imo, struct inpcb *inp)
319 {
320         struct ip *ip;
321         struct ifnet *ifp = NULL;       /* keep compiler happy */
322         struct mbuf *m0;
323         int hlen = sizeof (struct ip);
324         int mtu = 0;
325         int error = 0;
326         int vlan_pcp = -1;
327         struct sockaddr_in *dst;
328         const struct sockaddr *gw;
329         struct in_ifaddr *ia = NULL;
330         struct in_addr src;
331         int isbroadcast;
332         uint16_t ip_len, ip_off;
333         struct route iproute;
334         uint32_t fibnum;
335 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
336         int no_route_but_check_spd = 0;
337 #endif
338
339         M_ASSERTPKTHDR(m);
340         NET_EPOCH_ASSERT();
341
342         if (inp != NULL) {
343                 INP_LOCK_ASSERT(inp);
344                 M_SETFIB(m, inp->inp_inc.inc_fibnum);
345                 if ((flags & IP_NODEFAULTFLOWID) == 0) {
346                         m->m_pkthdr.flowid = inp->inp_flowid;
347                         M_HASHTYPE_SET(m, inp->inp_flowtype);
348                 }
349                 if ((inp->inp_flags2 & INP_2PCP_SET) != 0)
350                         vlan_pcp = (inp->inp_flags2 & INP_2PCP_MASK) >>
351                             INP_2PCP_SHIFT;
352 #ifdef NUMA
353                 m->m_pkthdr.numa_domain = inp->inp_numa_domain;
354 #endif
355         }
356
357         if (opt) {
358                 int len = 0;
359                 m = ip_insertoptions(m, opt, &len);
360                 if (len != 0)
361                         hlen = len; /* ip->ip_hl is updated above */
362         }
363         ip = mtod(m, struct ip *);
364         ip_len = ntohs(ip->ip_len);
365         ip_off = ntohs(ip->ip_off);
366
367         if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
368                 ip->ip_v = IPVERSION;
369                 ip->ip_hl = hlen >> 2;
370                 ip_fillid(ip);
371         } else {
372                 /* Header already set, fetch hlen from there */
373                 hlen = ip->ip_hl << 2;
374         }
375         if ((flags & IP_FORWARDING) == 0)
376                 IPSTAT_INC(ips_localout);
377
378         /*
379          * dst/gw handling:
380          *
381          * gw is readonly but can point either to dst OR rt_gateway,
382          * therefore we need restore gw if we're redoing lookup.
383          */
384         fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
385         if (ro == NULL) {
386                 ro = &iproute;
387                 bzero(ro, sizeof (*ro));
388         }
389         dst = (struct sockaddr_in *)&ro->ro_dst;
390         if (ro->ro_nh == NULL) {
391                 dst->sin_family = AF_INET;
392                 dst->sin_len = sizeof(*dst);
393                 dst->sin_addr = ip->ip_dst;
394         }
395         gw = (const struct sockaddr *)dst;
396 again:
397         /*
398          * Validate route against routing table additions;
399          * a better/more specific route might have been added.
400          */
401         if (inp != NULL && ro->ro_nh != NULL)
402                 NH_VALIDATE(ro, &inp->inp_rt_cookie, fibnum);
403         /*
404          * If there is a cached route,
405          * check that it is to the same destination
406          * and is still up.  If not, free it and try again.
407          * The address family should also be checked in case of sharing the
408          * cache with IPv6.
409          * Also check whether routing cache needs invalidation.
410          */
411         if (ro->ro_nh != NULL &&
412             ((!NH_IS_VALID(ro->ro_nh)) || dst->sin_family != AF_INET ||
413             dst->sin_addr.s_addr != ip->ip_dst.s_addr))
414                 RO_INVALIDATE_CACHE(ro);
415         ia = NULL;
416         /*
417          * If routing to interface only, short circuit routing lookup.
418          * The use of an all-ones broadcast address implies this; an
419          * interface is specified by the broadcast address of an interface,
420          * or the destination address of a ptp interface.
421          */
422         if (flags & IP_SENDONES) {
423                 if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst),
424                                                       M_GETFIB(m)))) == NULL &&
425                     (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
426                                                     M_GETFIB(m)))) == NULL) {
427                         IPSTAT_INC(ips_noroute);
428                         error = ENETUNREACH;
429                         goto bad;
430                 }
431                 ip->ip_dst.s_addr = INADDR_BROADCAST;
432                 dst->sin_addr = ip->ip_dst;
433                 ifp = ia->ia_ifp;
434                 mtu = ifp->if_mtu;
435                 ip->ip_ttl = 1;
436                 isbroadcast = 1;
437                 src = IA_SIN(ia)->sin_addr;
438         } else if (flags & IP_ROUTETOIF) {
439                 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
440                                                     M_GETFIB(m)))) == NULL &&
441                     (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0,
442                                                 M_GETFIB(m)))) == NULL) {
443                         IPSTAT_INC(ips_noroute);
444                         error = ENETUNREACH;
445                         goto bad;
446                 }
447                 ifp = ia->ia_ifp;
448                 mtu = ifp->if_mtu;
449                 ip->ip_ttl = 1;
450                 isbroadcast = ifp->if_flags & IFF_BROADCAST ?
451                     in_ifaddr_broadcast(dst->sin_addr, ia) : 0;
452                 src = IA_SIN(ia)->sin_addr;
453         } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
454             imo != NULL && imo->imo_multicast_ifp != NULL) {
455                 /*
456                  * Bypass the normal routing lookup for multicast
457                  * packets if the interface is specified.
458                  */
459                 ifp = imo->imo_multicast_ifp;
460                 mtu = ifp->if_mtu;
461                 IFP_TO_IA(ifp, ia);
462                 isbroadcast = 0;        /* fool gcc */
463                 /* Interface may have no addresses. */
464                 if (ia != NULL)
465                         src = IA_SIN(ia)->sin_addr;
466                 else
467                         src.s_addr = INADDR_ANY;
468         } else if (ro != &iproute) {
469                 if (ro->ro_nh == NULL) {
470                         /*
471                          * We want to do any cloning requested by the link
472                          * layer, as this is probably required in all cases
473                          * for correct operation (as it is for ARP).
474                          */
475                         uint32_t flowid;
476                         flowid = m->m_pkthdr.flowid;
477                         ro->ro_nh = fib4_lookup(fibnum, dst->sin_addr, 0,
478                             NHR_REF, flowid);
479
480                         if (ro->ro_nh == NULL || (!NH_IS_VALID(ro->ro_nh))) {
481 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
482                                 /*
483                                  * There is no route for this packet, but it is
484                                  * possible that a matching SPD entry exists.
485                                  */
486                                 no_route_but_check_spd = 1;
487                                 goto sendit;
488 #endif
489                                 IPSTAT_INC(ips_noroute);
490                                 error = EHOSTUNREACH;
491                                 goto bad;
492                         }
493                 }
494                 struct nhop_object *nh = ro->ro_nh;
495
496                 ia = ifatoia(nh->nh_ifa);
497                 ifp = nh->nh_ifp;
498                 counter_u64_add(nh->nh_pksent, 1);
499                 rt_update_ro_flags(ro, nh);
500                 if (nh->nh_flags & NHF_GATEWAY)
501                         gw = &nh->gw_sa;
502                 if (nh->nh_flags & NHF_HOST)
503                         isbroadcast = (nh->nh_flags & NHF_BROADCAST);
504                 else if ((ifp->if_flags & IFF_BROADCAST) && (gw->sa_family == AF_INET))
505                         isbroadcast = in_ifaddr_broadcast(((const struct sockaddr_in *)gw)->sin_addr, ia);
506                 else
507                         isbroadcast = 0;
508                 mtu = nh->nh_mtu;
509                 src = IA_SIN(ia)->sin_addr;
510         } else {
511                 struct nhop_object *nh;
512
513                 nh = fib4_lookup(M_GETFIB(m), dst->sin_addr, 0, NHR_NONE,
514                     m->m_pkthdr.flowid);
515                 if (nh == NULL) {
516 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
517                         /*
518                          * There is no route for this packet, but it is
519                          * possible that a matching SPD entry exists.
520                          */
521                         no_route_but_check_spd = 1;
522                         goto sendit;
523 #endif
524                         IPSTAT_INC(ips_noroute);
525                         error = EHOSTUNREACH;
526                         goto bad;
527                 }
528                 ifp = nh->nh_ifp;
529                 mtu = nh->nh_mtu;
530                 rt_update_ro_flags(ro, nh);
531                 if (nh->nh_flags & NHF_GATEWAY)
532                         gw = &nh->gw_sa;
533                 ia = ifatoia(nh->nh_ifa);
534                 src = IA_SIN(ia)->sin_addr;
535                 isbroadcast = (((nh->nh_flags & (NHF_HOST | NHF_BROADCAST)) ==
536                     (NHF_HOST | NHF_BROADCAST)) ||
537                     ((ifp->if_flags & IFF_BROADCAST) &&
538                     (gw->sa_family == AF_INET) &&
539                     in_ifaddr_broadcast(((const struct sockaddr_in *)gw)->sin_addr, ia)));
540         }
541
542         /* Catch a possible divide by zero later. */
543         KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (nh_flags=0x%08x) ifp=%p",
544             __func__, mtu, ro,
545             (ro != NULL && ro->ro_nh != NULL) ? ro->ro_nh->nh_flags : 0, ifp));
546
547         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
548                 m->m_flags |= M_MCAST;
549                 /*
550                  * IP destination address is multicast.  Make sure "gw"
551                  * still points to the address in "ro".  (It may have been
552                  * changed to point to a gateway address, above.)
553                  */
554                 gw = (const struct sockaddr *)dst;
555                 /*
556                  * See if the caller provided any multicast options
557                  */
558                 if (imo != NULL) {
559                         ip->ip_ttl = imo->imo_multicast_ttl;
560                         if (imo->imo_multicast_vif != -1)
561                                 ip->ip_src.s_addr =
562                                     ip_mcast_src ?
563                                     ip_mcast_src(imo->imo_multicast_vif) :
564                                     INADDR_ANY;
565                 } else
566                         ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
567                 /*
568                  * Confirm that the outgoing interface supports multicast.
569                  */
570                 if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
571                         if ((ifp->if_flags & IFF_MULTICAST) == 0) {
572                                 IPSTAT_INC(ips_noroute);
573                                 error = ENETUNREACH;
574                                 goto bad;
575                         }
576                 }
577                 /*
578                  * If source address not specified yet, use address
579                  * of outgoing interface.
580                  */
581                 if (ip->ip_src.s_addr == INADDR_ANY)
582                         ip->ip_src = src;
583
584                 if ((imo == NULL && in_mcast_loop) ||
585                     (imo && imo->imo_multicast_loop)) {
586                         /*
587                          * Loop back multicast datagram if not expressly
588                          * forbidden to do so, even if we are not a member
589                          * of the group; ip_input() will filter it later,
590                          * thus deferring a hash lookup and mutex acquisition
591                          * at the expense of a cheap copy using m_copym().
592                          */
593                         ip_mloopback(ifp, m, hlen);
594                 } else {
595                         /*
596                          * If we are acting as a multicast router, perform
597                          * multicast forwarding as if the packet had just
598                          * arrived on the interface to which we are about
599                          * to send.  The multicast forwarding function
600                          * recursively calls this function, using the
601                          * IP_FORWARDING flag to prevent infinite recursion.
602                          *
603                          * Multicasts that are looped back by ip_mloopback(),
604                          * above, will be forwarded by the ip_input() routine,
605                          * if necessary.
606                          */
607                         if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
608                                 /*
609                                  * If rsvp daemon is not running, do not
610                                  * set ip_moptions. This ensures that the packet
611                                  * is multicast and not just sent down one link
612                                  * as prescribed by rsvpd.
613                                  */
614                                 if (!V_rsvp_on)
615                                         imo = NULL;
616                                 if (ip_mforward &&
617                                     ip_mforward(ip, ifp, m, imo) != 0) {
618                                         m_freem(m);
619                                         goto done;
620                                 }
621                         }
622                 }
623
624                 /*
625                  * Multicasts with a time-to-live of zero may be looped-
626                  * back, above, but must not be transmitted on a network.
627                  * Also, multicasts addressed to the loopback interface
628                  * are not sent -- the above call to ip_mloopback() will
629                  * loop back a copy. ip_input() will drop the copy if
630                  * this host does not belong to the destination group on
631                  * the loopback interface.
632                  */
633                 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
634                         m_freem(m);
635                         goto done;
636                 }
637
638                 goto sendit;
639         }
640
641         /*
642          * If the source address is not specified yet, use the address
643          * of the outgoing interface.
644          */
645         if (ip->ip_src.s_addr == INADDR_ANY)
646                 ip->ip_src = src;
647
648         /*
649          * Look for broadcast address and
650          * verify user is allowed to send
651          * such a packet.
652          */
653         if (isbroadcast) {
654                 if ((ifp->if_flags & IFF_BROADCAST) == 0) {
655                         error = EADDRNOTAVAIL;
656                         goto bad;
657                 }
658                 if ((flags & IP_ALLOWBROADCAST) == 0) {
659                         error = EACCES;
660                         goto bad;
661                 }
662                 /* don't allow broadcast messages to be fragmented */
663                 if (ip_len > mtu) {
664                         error = EMSGSIZE;
665                         goto bad;
666                 }
667                 m->m_flags |= M_BCAST;
668         } else {
669                 m->m_flags &= ~M_BCAST;
670         }
671
672 sendit:
673 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
674         if (IPSEC_ENABLED(ipv4)) {
675                 if ((error = IPSEC_OUTPUT(ipv4, m, inp)) != 0) {
676                         if (error == EINPROGRESS)
677                                 error = 0;
678                         goto done;
679                 }
680         }
681         /*
682          * Check if there was a route for this packet; return error if not.
683          */
684         if (no_route_but_check_spd) {
685                 IPSTAT_INC(ips_noroute);
686                 error = EHOSTUNREACH;
687                 goto bad;
688         }
689         /* Update variables that are affected by ipsec4_output(). */
690         ip = mtod(m, struct ip *);
691         hlen = ip->ip_hl << 2;
692 #endif /* IPSEC */
693
694         /* Jump over all PFIL processing if hooks are not active. */
695         if (PFIL_HOOKED_OUT(V_inet_pfil_head)) {
696                 switch (ip_output_pfil(&m, ifp, flags, inp, dst, &fibnum,
697                     &error)) {
698                 case 1: /* Finished */
699                         goto done;
700
701                 case 0: /* Continue normally */
702                         ip = mtod(m, struct ip *);
703                         break;
704
705                 case -1: /* Need to try again */
706                         /* Reset everything for a new round */
707                         if (ro != NULL) {
708                                 RO_NHFREE(ro);
709                                 ro->ro_prepend = NULL;
710                         }
711                         gw = (const struct sockaddr *)dst;
712                         ip = mtod(m, struct ip *);
713                         goto again;
714                 }
715         }
716
717         if (vlan_pcp > -1)
718                 EVL_APPLY_PRI(m, vlan_pcp);
719
720         /* IN_LOOPBACK must not appear on the wire - RFC1122. */
721         if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) ||
722             IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) {
723                 if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
724                         IPSTAT_INC(ips_badaddr);
725                         error = EADDRNOTAVAIL;
726                         goto bad;
727                 }
728         }
729
730         /* Ensure the packet data is mapped if the interface requires it. */
731         if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) {
732                 m = mb_unmapped_to_ext(m);
733                 if (m == NULL) {
734                         IPSTAT_INC(ips_odropped);
735                         error = ENOBUFS;
736                         goto bad;
737                 }
738         }
739
740         m->m_pkthdr.csum_flags |= CSUM_IP;
741         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
742                 in_delayed_cksum(m);
743                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
744         }
745 #if defined(SCTP) || defined(SCTP_SUPPORT)
746         if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
747                 sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
748                 m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
749         }
750 #endif
751
752         /*
753          * If small enough for interface, or the interface will take
754          * care of the fragmentation for us, we can just send directly.
755          * Note that if_vxlan could have requested TSO even though the outer
756          * frame is UDP.  It is correct to not fragment such datagrams and
757          * instead just pass them on to the driver.
758          */
759         if (ip_len <= mtu ||
760             (m->m_pkthdr.csum_flags & ifp->if_hwassist &
761             (CSUM_TSO | CSUM_INNER_TSO)) != 0) {
762                 ip->ip_sum = 0;
763                 if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
764                         ip->ip_sum = in_cksum(m, hlen);
765                         m->m_pkthdr.csum_flags &= ~CSUM_IP;
766                 }
767
768                 /*
769                  * Record statistics for this interface address.
770                  * With CSUM_TSO the byte/packet count will be slightly
771                  * incorrect because we count the IP+TCP headers only
772                  * once instead of for every generated packet.
773                  */
774                 if (!(flags & IP_FORWARDING) && ia) {
775                         if (m->m_pkthdr.csum_flags &
776                             (CSUM_TSO | CSUM_INNER_TSO))
777                                 counter_u64_add(ia->ia_ifa.ifa_opackets,
778                                     m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
779                         else
780                                 counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
781
782                         counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len);
783                 }
784 #ifdef MBUF_STRESS_TEST
785                 if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
786                         m = m_fragment(m, M_NOWAIT, mbuf_frag_size);
787 #endif
788                 /*
789                  * Reset layer specific mbuf flags
790                  * to avoid confusing lower layers.
791                  */
792                 m_clrprotoflags(m);
793                 IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
794                 error = ip_output_send(inp, ifp, m, gw, ro,
795                     (flags & IP_NO_SND_TAG_RL) ? false : true);
796                 goto done;
797         }
798
799         /* Balk when DF bit is set or the interface didn't support TSO. */
800         if ((ip_off & IP_DF) ||
801             (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_INNER_TSO))) {
802                 error = EMSGSIZE;
803                 IPSTAT_INC(ips_cantfrag);
804                 goto bad;
805         }
806
807         /*
808          * Too large for interface; fragment if possible. If successful,
809          * on return, m will point to a list of packets to be sent.
810          */
811         error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
812         if (error)
813                 goto bad;
814         for (; m; m = m0) {
815                 m0 = m->m_nextpkt;
816                 m->m_nextpkt = 0;
817                 if (error == 0) {
818                         /* Record statistics for this interface address. */
819                         if (ia != NULL) {
820                                 counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
821                                 counter_u64_add(ia->ia_ifa.ifa_obytes,
822                                     m->m_pkthdr.len);
823                         }
824                         /*
825                          * Reset layer specific mbuf flags
826                          * to avoid confusing upper layers.
827                          */
828                         m_clrprotoflags(m);
829
830                         IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
831                             mtod(m, struct ip *), NULL);
832                         error = ip_output_send(inp, ifp, m, gw, ro, true);
833                 } else
834                         m_freem(m);
835         }
836
837         if (error == 0)
838                 IPSTAT_INC(ips_fragmented);
839
840 done:
841         return (error);
842  bad:
843         m_freem(m);
844         goto done;
845 }
846
847 /*
848  * Create a chain of fragments which fit the given mtu. m_frag points to the
849  * mbuf to be fragmented; on return it points to the chain with the fragments.
850  * Return 0 if no error. If error, m_frag may contain a partially built
851  * chain of fragments that should be freed by the caller.
852  *
853  * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
854  */
855 int
856 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
857     u_long if_hwassist_flags)
858 {
859         int error = 0;
860         int hlen = ip->ip_hl << 2;
861         int len = (mtu - hlen) & ~7;    /* size of payload in each fragment */
862         int off;
863         struct mbuf *m0 = *m_frag;      /* the original packet          */
864         int firstlen;
865         struct mbuf **mnext;
866         int nfrags;
867         uint16_t ip_len, ip_off;
868
869         ip_len = ntohs(ip->ip_len);
870         ip_off = ntohs(ip->ip_off);
871
872         /*
873          * Packet shall not have "Don't Fragment" flag and have at least 8
874          * bytes of payload.
875          */
876         if (__predict_false((ip_off & IP_DF) || len < 8)) {
877                 IPSTAT_INC(ips_cantfrag);
878                 return (EMSGSIZE);
879         }
880
881         /*
882          * If the interface will not calculate checksums on
883          * fragmented packets, then do it here.
884          */
885         if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
886                 in_delayed_cksum(m0);
887                 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
888         }
889 #if defined(SCTP) || defined(SCTP_SUPPORT)
890         if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
891                 sctp_delayed_cksum(m0, hlen);
892                 m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
893         }
894 #endif
895         if (len > PAGE_SIZE) {
896                 /*
897                  * Fragment large datagrams such that each segment
898                  * contains a multiple of PAGE_SIZE amount of data,
899                  * plus headers. This enables a receiver to perform
900                  * page-flipping zero-copy optimizations.
901                  *
902                  * XXX When does this help given that sender and receiver
903                  * could have different page sizes, and also mtu could
904                  * be less than the receiver's page size ?
905                  */
906                 int newlen;
907
908                 off = MIN(mtu, m0->m_pkthdr.len);
909
910                 /*
911                  * firstlen (off - hlen) must be aligned on an
912                  * 8-byte boundary
913                  */
914                 if (off < hlen)
915                         goto smart_frag_failure;
916                 off = ((off - hlen) & ~7) + hlen;
917                 newlen = (~PAGE_MASK) & mtu;
918                 if ((newlen + sizeof (struct ip)) > mtu) {
919                         /* we failed, go back the default */
920 smart_frag_failure:
921                         newlen = len;
922                         off = hlen + len;
923                 }
924                 len = newlen;
925
926         } else {
927                 off = hlen + len;
928         }
929
930         firstlen = off - hlen;
931         mnext = &m0->m_nextpkt;         /* pointer to next packet */
932
933         /*
934          * Loop through length of segment after first fragment,
935          * make new header and copy data of each part and link onto chain.
936          * Here, m0 is the original packet, m is the fragment being created.
937          * The fragments are linked off the m_nextpkt of the original
938          * packet, which after processing serves as the first fragment.
939          */
940         for (nfrags = 1; off < ip_len; off += len, nfrags++) {
941                 struct ip *mhip;        /* ip header on the fragment */
942                 struct mbuf *m;
943                 int mhlen = sizeof (struct ip);
944
945                 m = m_gethdr(M_NOWAIT, MT_DATA);
946                 if (m == NULL) {
947                         error = ENOBUFS;
948                         IPSTAT_INC(ips_odropped);
949                         goto done;
950                 }
951                 /*
952                  * Make sure the complete packet header gets copied
953                  * from the originating mbuf to the newly created
954                  * mbuf. This also ensures that existing firewall
955                  * classification(s), VLAN tags and so on get copied
956                  * to the resulting fragmented packet(s):
957                  */
958                 if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
959                         m_free(m);
960                         error = ENOBUFS;
961                         IPSTAT_INC(ips_odropped);
962                         goto done;
963                 }
964                 /*
965                  * In the first mbuf, leave room for the link header, then
966                  * copy the original IP header including options. The payload
967                  * goes into an additional mbuf chain returned by m_copym().
968                  */
969                 m->m_data += max_linkhdr;
970                 mhip = mtod(m, struct ip *);
971                 *mhip = *ip;
972                 if (hlen > sizeof (struct ip)) {
973                         mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
974                         mhip->ip_v = IPVERSION;
975                         mhip->ip_hl = mhlen >> 2;
976                 }
977                 m->m_len = mhlen;
978                 /* XXX do we need to add ip_off below ? */
979                 mhip->ip_off = ((off - hlen) >> 3) + ip_off;
980                 if (off + len >= ip_len)
981                         len = ip_len - off;
982                 else
983                         mhip->ip_off |= IP_MF;
984                 mhip->ip_len = htons((u_short)(len + mhlen));
985                 m->m_next = m_copym(m0, off, len, M_NOWAIT);
986                 if (m->m_next == NULL) {        /* copy failed */
987                         m_free(m);
988                         error = ENOBUFS;        /* ??? */
989                         IPSTAT_INC(ips_odropped);
990                         goto done;
991                 }
992                 m->m_pkthdr.len = mhlen + len;
993 #ifdef MAC
994                 mac_netinet_fragment(m0, m);
995 #endif
996                 mhip->ip_off = htons(mhip->ip_off);
997                 mhip->ip_sum = 0;
998                 if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
999                         mhip->ip_sum = in_cksum(m, mhlen);
1000                         m->m_pkthdr.csum_flags &= ~CSUM_IP;
1001                 }
1002                 *mnext = m;
1003                 mnext = &m->m_nextpkt;
1004         }
1005         IPSTAT_ADD(ips_ofragments, nfrags);
1006
1007         /*
1008          * Update first fragment by trimming what's been copied out
1009          * and updating header.
1010          */
1011         m_adj(m0, hlen + firstlen - ip_len);
1012         m0->m_pkthdr.len = hlen + firstlen;
1013         ip->ip_len = htons((u_short)m0->m_pkthdr.len);
1014         ip->ip_off = htons(ip_off | IP_MF);
1015         ip->ip_sum = 0;
1016         if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
1017                 ip->ip_sum = in_cksum(m0, hlen);
1018                 m0->m_pkthdr.csum_flags &= ~CSUM_IP;
1019         }
1020
1021 done:
1022         *m_frag = m0;
1023         return error;
1024 }
1025
1026 void
1027 in_delayed_cksum(struct mbuf *m)
1028 {
1029         struct ip *ip;
1030         struct udphdr *uh;
1031         uint16_t cklen, csum, offset;
1032
1033         ip = mtod(m, struct ip *);
1034         offset = ip->ip_hl << 2 ;
1035
1036         if (m->m_pkthdr.csum_flags & CSUM_UDP) {
1037                 /* if udp header is not in the first mbuf copy udplen */
1038                 if (offset + sizeof(struct udphdr) > m->m_len) {
1039                         m_copydata(m, offset + offsetof(struct udphdr,
1040                             uh_ulen), sizeof(cklen), (caddr_t)&cklen);
1041                         cklen = ntohs(cklen);
1042                 } else {
1043                         uh = (struct udphdr *)mtodo(m, offset);
1044                         cklen = ntohs(uh->uh_ulen);
1045                 }
1046                 csum = in_cksum_skip(m, cklen + offset, offset);
1047                 if (csum == 0)
1048                         csum = 0xffff;
1049         } else {
1050                 cklen = ntohs(ip->ip_len);
1051                 csum = in_cksum_skip(m, cklen, offset);
1052         }
1053         offset += m->m_pkthdr.csum_data;        /* checksum offset */
1054
1055         if (offset + sizeof(csum) > m->m_len)
1056                 m_copyback(m, offset, sizeof(csum), (caddr_t)&csum);
1057         else
1058                 *(u_short *)mtodo(m, offset) = csum;
1059 }
1060
1061 /*
1062  * IP socket option processing.
1063  */
1064 int
1065 ip_ctloutput(struct socket *so, struct sockopt *sopt)
1066 {
1067         struct inpcb *inp = sotoinpcb(so);
1068         int     error, optval;
1069 #ifdef  RSS
1070         uint32_t rss_bucket;
1071         int retval;
1072 #endif
1073
1074         error = optval = 0;
1075         if (sopt->sopt_level != IPPROTO_IP) {
1076                 error = EINVAL;
1077
1078                 if (sopt->sopt_level == SOL_SOCKET &&
1079                     sopt->sopt_dir == SOPT_SET) {
1080                         switch (sopt->sopt_name) {
1081                         case SO_REUSEADDR:
1082                                 INP_WLOCK(inp);
1083                                 if ((so->so_options & SO_REUSEADDR) != 0)
1084                                         inp->inp_flags2 |= INP_REUSEADDR;
1085                                 else
1086                                         inp->inp_flags2 &= ~INP_REUSEADDR;
1087                                 INP_WUNLOCK(inp);
1088                                 error = 0;
1089                                 break;
1090                         case SO_REUSEPORT:
1091                                 INP_WLOCK(inp);
1092                                 if ((so->so_options & SO_REUSEPORT) != 0)
1093                                         inp->inp_flags2 |= INP_REUSEPORT;
1094                                 else
1095                                         inp->inp_flags2 &= ~INP_REUSEPORT;
1096                                 INP_WUNLOCK(inp);
1097                                 error = 0;
1098                                 break;
1099                         case SO_REUSEPORT_LB:
1100                                 INP_WLOCK(inp);
1101                                 if ((so->so_options & SO_REUSEPORT_LB) != 0)
1102                                         inp->inp_flags2 |= INP_REUSEPORT_LB;
1103                                 else
1104                                         inp->inp_flags2 &= ~INP_REUSEPORT_LB;
1105                                 INP_WUNLOCK(inp);
1106                                 error = 0;
1107                                 break;
1108                         case SO_SETFIB:
1109                                 INP_WLOCK(inp);
1110                                 inp->inp_inc.inc_fibnum = so->so_fibnum;
1111                                 INP_WUNLOCK(inp);
1112                                 error = 0;
1113                                 break;
1114                         case SO_MAX_PACING_RATE:
1115 #ifdef RATELIMIT
1116                                 INP_WLOCK(inp);
1117                                 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
1118                                 INP_WUNLOCK(inp);
1119                                 error = 0;
1120 #else
1121                                 error = EOPNOTSUPP;
1122 #endif
1123                                 break;
1124                         default:
1125                                 break;
1126                         }
1127                 }
1128                 return (error);
1129         }
1130
1131         switch (sopt->sopt_dir) {
1132         case SOPT_SET:
1133                 switch (sopt->sopt_name) {
1134                 case IP_OPTIONS:
1135 #ifdef notyet
1136                 case IP_RETOPTS:
1137 #endif
1138                 {
1139                         struct mbuf *m;
1140                         if (sopt->sopt_valsize > MLEN) {
1141                                 error = EMSGSIZE;
1142                                 break;
1143                         }
1144                         m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
1145                         if (m == NULL) {
1146                                 error = ENOBUFS;
1147                                 break;
1148                         }
1149                         m->m_len = sopt->sopt_valsize;
1150                         error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
1151                                             m->m_len);
1152                         if (error) {
1153                                 m_free(m);
1154                                 break;
1155                         }
1156                         INP_WLOCK(inp);
1157                         error = ip_pcbopts(inp, sopt->sopt_name, m);
1158                         INP_WUNLOCK(inp);
1159                         return (error);
1160                 }
1161
1162                 case IP_BINDANY:
1163                         if (sopt->sopt_td != NULL) {
1164                                 error = priv_check(sopt->sopt_td,
1165                                     PRIV_NETINET_BINDANY);
1166                                 if (error)
1167                                         break;
1168                         }
1169                         /* FALLTHROUGH */
1170                 case IP_BINDMULTI:
1171 #ifdef  RSS
1172                 case IP_RSS_LISTEN_BUCKET:
1173 #endif
1174                 case IP_TOS:
1175                 case IP_TTL:
1176                 case IP_MINTTL:
1177                 case IP_RECVOPTS:
1178                 case IP_RECVRETOPTS:
1179                 case IP_ORIGDSTADDR:
1180                 case IP_RECVDSTADDR:
1181                 case IP_RECVTTL:
1182                 case IP_RECVIF:
1183                 case IP_ONESBCAST:
1184                 case IP_DONTFRAG:
1185                 case IP_RECVTOS:
1186                 case IP_RECVFLOWID:
1187 #ifdef  RSS
1188                 case IP_RECVRSSBUCKETID:
1189 #endif
1190                 case IP_VLAN_PCP:
1191                         error = sooptcopyin(sopt, &optval, sizeof optval,
1192                                             sizeof optval);
1193                         if (error)
1194                                 break;
1195
1196                         switch (sopt->sopt_name) {
1197                         case IP_TOS:
1198                                 inp->inp_ip_tos = optval;
1199                                 break;
1200
1201                         case IP_TTL:
1202                                 inp->inp_ip_ttl = optval;
1203                                 break;
1204
1205                         case IP_MINTTL:
1206                                 if (optval >= 0 && optval <= MAXTTL)
1207                                         inp->inp_ip_minttl = optval;
1208                                 else
1209                                         error = EINVAL;
1210                                 break;
1211
1212 #define OPTSET(bit) do {                                                \
1213         INP_WLOCK(inp);                                                 \
1214         if (optval)                                                     \
1215                 inp->inp_flags |= bit;                                  \
1216         else                                                            \
1217                 inp->inp_flags &= ~bit;                                 \
1218         INP_WUNLOCK(inp);                                               \
1219 } while (0)
1220
1221 #define OPTSET2(bit, val) do {                                          \
1222         INP_WLOCK(inp);                                                 \
1223         if (val)                                                        \
1224                 inp->inp_flags2 |= bit;                                 \
1225         else                                                            \
1226                 inp->inp_flags2 &= ~bit;                                \
1227         INP_WUNLOCK(inp);                                               \
1228 } while (0)
1229
1230                         case IP_RECVOPTS:
1231                                 OPTSET(INP_RECVOPTS);
1232                                 break;
1233
1234                         case IP_RECVRETOPTS:
1235                                 OPTSET(INP_RECVRETOPTS);
1236                                 break;
1237
1238                         case IP_RECVDSTADDR:
1239                                 OPTSET(INP_RECVDSTADDR);
1240                                 break;
1241
1242                         case IP_ORIGDSTADDR:
1243                                 OPTSET2(INP_ORIGDSTADDR, optval);
1244                                 break;
1245
1246                         case IP_RECVTTL:
1247                                 OPTSET(INP_RECVTTL);
1248                                 break;
1249
1250                         case IP_RECVIF:
1251                                 OPTSET(INP_RECVIF);
1252                                 break;
1253
1254                         case IP_ONESBCAST:
1255                                 OPTSET(INP_ONESBCAST);
1256                                 break;
1257                         case IP_DONTFRAG:
1258                                 OPTSET(INP_DONTFRAG);
1259                                 break;
1260                         case IP_BINDANY:
1261                                 OPTSET(INP_BINDANY);
1262                                 break;
1263                         case IP_RECVTOS:
1264                                 OPTSET(INP_RECVTOS);
1265                                 break;
1266                         case IP_BINDMULTI:
1267                                 OPTSET2(INP_BINDMULTI, optval);
1268                                 break;
1269                         case IP_RECVFLOWID:
1270                                 OPTSET2(INP_RECVFLOWID, optval);
1271                                 break;
1272 #ifdef  RSS
1273                         case IP_RSS_LISTEN_BUCKET:
1274                                 if ((optval >= 0) &&
1275                                     (optval < rss_getnumbuckets())) {
1276                                         inp->inp_rss_listen_bucket = optval;
1277                                         OPTSET2(INP_RSS_BUCKET_SET, 1);
1278                                 } else {
1279                                         error = EINVAL;
1280                                 }
1281                                 break;
1282                         case IP_RECVRSSBUCKETID:
1283                                 OPTSET2(INP_RECVRSSBUCKETID, optval);
1284                                 break;
1285 #endif
1286                         case IP_VLAN_PCP:
1287                                 if ((optval >= -1) && (optval <=
1288                                     (INP_2PCP_MASK >> INP_2PCP_SHIFT))) {
1289                                         if (optval == -1) {
1290                                                 INP_WLOCK(inp);
1291                                                 inp->inp_flags2 &=
1292                                                     ~(INP_2PCP_SET |
1293                                                       INP_2PCP_MASK);
1294                                                 INP_WUNLOCK(inp);
1295                                         } else {
1296                                                 INP_WLOCK(inp);
1297                                                 inp->inp_flags2 |=
1298                                                     INP_2PCP_SET;
1299                                                 inp->inp_flags2 &=
1300                                                     ~INP_2PCP_MASK;
1301                                                 inp->inp_flags2 |=
1302                                                     optval << INP_2PCP_SHIFT;
1303                                                 INP_WUNLOCK(inp);
1304                                         }
1305                                 } else
1306                                         error = EINVAL;
1307                                 break;
1308                         }
1309                         break;
1310 #undef OPTSET
1311 #undef OPTSET2
1312
1313                 /*
1314                  * Multicast socket options are processed by the in_mcast
1315                  * module.
1316                  */
1317                 case IP_MULTICAST_IF:
1318                 case IP_MULTICAST_VIF:
1319                 case IP_MULTICAST_TTL:
1320                 case IP_MULTICAST_LOOP:
1321                 case IP_ADD_MEMBERSHIP:
1322                 case IP_DROP_MEMBERSHIP:
1323                 case IP_ADD_SOURCE_MEMBERSHIP:
1324                 case IP_DROP_SOURCE_MEMBERSHIP:
1325                 case IP_BLOCK_SOURCE:
1326                 case IP_UNBLOCK_SOURCE:
1327                 case IP_MSFILTER:
1328                 case MCAST_JOIN_GROUP:
1329                 case MCAST_LEAVE_GROUP:
1330                 case MCAST_JOIN_SOURCE_GROUP:
1331                 case MCAST_LEAVE_SOURCE_GROUP:
1332                 case MCAST_BLOCK_SOURCE:
1333                 case MCAST_UNBLOCK_SOURCE:
1334                         error = inp_setmoptions(inp, sopt);
1335                         break;
1336
1337                 case IP_PORTRANGE:
1338                         error = sooptcopyin(sopt, &optval, sizeof optval,
1339                                             sizeof optval);
1340                         if (error)
1341                                 break;
1342
1343                         INP_WLOCK(inp);
1344                         switch (optval) {
1345                         case IP_PORTRANGE_DEFAULT:
1346                                 inp->inp_flags &= ~(INP_LOWPORT);
1347                                 inp->inp_flags &= ~(INP_HIGHPORT);
1348                                 break;
1349
1350                         case IP_PORTRANGE_HIGH:
1351                                 inp->inp_flags &= ~(INP_LOWPORT);
1352                                 inp->inp_flags |= INP_HIGHPORT;
1353                                 break;
1354
1355                         case IP_PORTRANGE_LOW:
1356                                 inp->inp_flags &= ~(INP_HIGHPORT);
1357                                 inp->inp_flags |= INP_LOWPORT;
1358                                 break;
1359
1360                         default:
1361                                 error = EINVAL;
1362                                 break;
1363                         }
1364                         INP_WUNLOCK(inp);
1365                         break;
1366
1367 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1368                 case IP_IPSEC_POLICY:
1369                         if (IPSEC_ENABLED(ipv4)) {
1370                                 error = IPSEC_PCBCTL(ipv4, inp, sopt);
1371                                 break;
1372                         }
1373                         /* FALLTHROUGH */
1374 #endif /* IPSEC */
1375
1376                 default:
1377                         error = ENOPROTOOPT;
1378                         break;
1379                 }
1380                 break;
1381
1382         case SOPT_GET:
1383                 switch (sopt->sopt_name) {
1384                 case IP_OPTIONS:
1385                 case IP_RETOPTS:
1386                         INP_RLOCK(inp);
1387                         if (inp->inp_options) {
1388                                 struct mbuf *options;
1389
1390                                 options = m_copym(inp->inp_options, 0,
1391                                     M_COPYALL, M_NOWAIT);
1392                                 INP_RUNLOCK(inp);
1393                                 if (options != NULL) {
1394                                         error = sooptcopyout(sopt,
1395                                                              mtod(options, char *),
1396                                                              options->m_len);
1397                                         m_freem(options);
1398                                 } else
1399                                         error = ENOMEM;
1400                         } else {
1401                                 INP_RUNLOCK(inp);
1402                                 sopt->sopt_valsize = 0;
1403                         }
1404                         break;
1405
1406                 case IP_TOS:
1407                 case IP_TTL:
1408                 case IP_MINTTL:
1409                 case IP_RECVOPTS:
1410                 case IP_RECVRETOPTS:
1411                 case IP_ORIGDSTADDR:
1412                 case IP_RECVDSTADDR:
1413                 case IP_RECVTTL:
1414                 case IP_RECVIF:
1415                 case IP_PORTRANGE:
1416                 case IP_ONESBCAST:
1417                 case IP_DONTFRAG:
1418                 case IP_BINDANY:
1419                 case IP_RECVTOS:
1420                 case IP_BINDMULTI:
1421                 case IP_FLOWID:
1422                 case IP_FLOWTYPE:
1423                 case IP_RECVFLOWID:
1424 #ifdef  RSS
1425                 case IP_RSSBUCKETID:
1426                 case IP_RECVRSSBUCKETID:
1427 #endif
1428                 case IP_VLAN_PCP:
1429                         switch (sopt->sopt_name) {
1430                         case IP_TOS:
1431                                 optval = inp->inp_ip_tos;
1432                                 break;
1433
1434                         case IP_TTL:
1435                                 optval = inp->inp_ip_ttl;
1436                                 break;
1437
1438                         case IP_MINTTL:
1439                                 optval = inp->inp_ip_minttl;
1440                                 break;
1441
1442 #define OPTBIT(bit)     (inp->inp_flags & bit ? 1 : 0)
1443 #define OPTBIT2(bit)    (inp->inp_flags2 & bit ? 1 : 0)
1444
1445                         case IP_RECVOPTS:
1446                                 optval = OPTBIT(INP_RECVOPTS);
1447                                 break;
1448
1449                         case IP_RECVRETOPTS:
1450                                 optval = OPTBIT(INP_RECVRETOPTS);
1451                                 break;
1452
1453                         case IP_RECVDSTADDR:
1454                                 optval = OPTBIT(INP_RECVDSTADDR);
1455                                 break;
1456
1457                         case IP_ORIGDSTADDR:
1458                                 optval = OPTBIT2(INP_ORIGDSTADDR);
1459                                 break;
1460
1461                         case IP_RECVTTL:
1462                                 optval = OPTBIT(INP_RECVTTL);
1463                                 break;
1464
1465                         case IP_RECVIF:
1466                                 optval = OPTBIT(INP_RECVIF);
1467                                 break;
1468
1469                         case IP_PORTRANGE:
1470                                 if (inp->inp_flags & INP_HIGHPORT)
1471                                         optval = IP_PORTRANGE_HIGH;
1472                                 else if (inp->inp_flags & INP_LOWPORT)
1473                                         optval = IP_PORTRANGE_LOW;
1474                                 else
1475                                         optval = 0;
1476                                 break;
1477
1478                         case IP_ONESBCAST:
1479                                 optval = OPTBIT(INP_ONESBCAST);
1480                                 break;
1481                         case IP_DONTFRAG:
1482                                 optval = OPTBIT(INP_DONTFRAG);
1483                                 break;
1484                         case IP_BINDANY:
1485                                 optval = OPTBIT(INP_BINDANY);
1486                                 break;
1487                         case IP_RECVTOS:
1488                                 optval = OPTBIT(INP_RECVTOS);
1489                                 break;
1490                         case IP_FLOWID:
1491                                 optval = inp->inp_flowid;
1492                                 break;
1493                         case IP_FLOWTYPE:
1494                                 optval = inp->inp_flowtype;
1495                                 break;
1496                         case IP_RECVFLOWID:
1497                                 optval = OPTBIT2(INP_RECVFLOWID);
1498                                 break;
1499 #ifdef  RSS
1500                         case IP_RSSBUCKETID:
1501                                 retval = rss_hash2bucket(inp->inp_flowid,
1502                                     inp->inp_flowtype,
1503                                     &rss_bucket);
1504                                 if (retval == 0)
1505                                         optval = rss_bucket;
1506                                 else
1507                                         error = EINVAL;
1508                                 break;
1509                         case IP_RECVRSSBUCKETID:
1510                                 optval = OPTBIT2(INP_RECVRSSBUCKETID);
1511                                 break;
1512 #endif
1513                         case IP_BINDMULTI:
1514                                 optval = OPTBIT2(INP_BINDMULTI);
1515                                 break;
1516                         case IP_VLAN_PCP:
1517                                 if (OPTBIT2(INP_2PCP_SET)) {
1518                                         optval = (inp->inp_flags2 &
1519                                             INP_2PCP_MASK) >> INP_2PCP_SHIFT;
1520                                 } else {
1521                                         optval = -1;
1522                                 }
1523                                 break;
1524                         }
1525                         error = sooptcopyout(sopt, &optval, sizeof optval);
1526                         break;
1527
1528                 /*
1529                  * Multicast socket options are processed by the in_mcast
1530                  * module.
1531                  */
1532                 case IP_MULTICAST_IF:
1533                 case IP_MULTICAST_VIF:
1534                 case IP_MULTICAST_TTL:
1535                 case IP_MULTICAST_LOOP:
1536                 case IP_MSFILTER:
1537                         error = inp_getmoptions(inp, sopt);
1538                         break;
1539
1540 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1541                 case IP_IPSEC_POLICY:
1542                         if (IPSEC_ENABLED(ipv4)) {
1543                                 error = IPSEC_PCBCTL(ipv4, inp, sopt);
1544                                 break;
1545                         }
1546                         /* FALLTHROUGH */
1547 #endif /* IPSEC */
1548
1549                 default:
1550                         error = ENOPROTOOPT;
1551                         break;
1552                 }
1553                 break;
1554         }
1555         return (error);
1556 }
1557
1558 /*
1559  * Routine called from ip_output() to loop back a copy of an IP multicast
1560  * packet to the input queue of a specified interface.  Note that this
1561  * calls the output routine of the loopback "driver", but with an interface
1562  * pointer that might NOT be a loopback interface -- evil, but easier than
1563  * replicating that code here.
1564  */
1565 static void
1566 ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen)
1567 {
1568         struct ip *ip;
1569         struct mbuf *copym;
1570
1571         /*
1572          * Make a deep copy of the packet because we're going to
1573          * modify the pack in order to generate checksums.
1574          */
1575         copym = m_dup(m, M_NOWAIT);
1576         if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen))
1577                 copym = m_pullup(copym, hlen);
1578         if (copym != NULL) {
1579                 /* If needed, compute the checksum and mark it as valid. */
1580                 if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1581                         in_delayed_cksum(copym);
1582                         copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1583                         copym->m_pkthdr.csum_flags |=
1584                             CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1585                         copym->m_pkthdr.csum_data = 0xffff;
1586                 }
1587                 /*
1588                  * We don't bother to fragment if the IP length is greater
1589                  * than the interface's MTU.  Can this possibly matter?
1590                  */
1591                 ip = mtod(copym, struct ip *);
1592                 ip->ip_sum = 0;
1593                 ip->ip_sum = in_cksum(copym, hlen);
1594                 if_simloop(ifp, copym, AF_INET, 0);
1595         }
1596 }