]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/ip_output.c
ssh: Update to OpenSSH 9.3p2
[FreeBSD/FreeBSD.git] / sys / netinet / ip_output.c
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1990, 1993
5  *      The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  *      @(#)ip_output.c 8.3 (Berkeley) 1/21/94
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include "opt_inet.h"
38 #include "opt_ipsec.h"
39 #include "opt_kern_tls.h"
40 #include "opt_mbuf_stress_test.h"
41 #include "opt_ratelimit.h"
42 #include "opt_route.h"
43 #include "opt_rss.h"
44 #include "opt_sctp.h"
45
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/kernel.h>
49 #include <sys/ktls.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/mbuf.h>
53 #include <sys/priv.h>
54 #include <sys/proc.h>
55 #include <sys/protosw.h>
56 #include <sys/sdt.h>
57 #include <sys/socket.h>
58 #include <sys/socketvar.h>
59 #include <sys/sysctl.h>
60 #include <sys/ucred.h>
61
62 #include <net/if.h>
63 #include <net/if_var.h>
64 #include <net/if_private.h>
65 #include <net/if_vlan_var.h>
66 #include <net/if_llatbl.h>
67 #include <net/ethernet.h>
68 #include <net/netisr.h>
69 #include <net/pfil.h>
70 #include <net/route.h>
71 #include <net/route/nhop.h>
72 #include <net/rss_config.h>
73 #include <net/vnet.h>
74
75 #include <netinet/in.h>
76 #include <netinet/in_fib.h>
77 #include <netinet/in_kdtrace.h>
78 #include <netinet/in_systm.h>
79 #include <netinet/ip.h>
80 #include <netinet/in_fib.h>
81 #include <netinet/in_pcb.h>
82 #include <netinet/in_rss.h>
83 #include <netinet/in_var.h>
84 #include <netinet/ip_var.h>
85 #include <netinet/ip_options.h>
86
87 #include <netinet/udp.h>
88 #include <netinet/udp_var.h>
89
90 #if defined(SCTP) || defined(SCTP_SUPPORT)
91 #include <netinet/sctp.h>
92 #include <netinet/sctp_crc32.h>
93 #endif
94
95 #include <netipsec/ipsec_support.h>
96
97 #include <machine/in_cksum.h>
98
99 #include <security/mac/mac_framework.h>
100
/*
 * Stress-test knob, compiled in only when MBUF_STRESS_TEST is defined.
 * When non-zero, ip_output() passes an outgoing packet whose m_pkthdr.len
 * exceeds this value through m_fragment() before handing it to the
 * interface.  Exposed read-write as sysctl net.inet.ip.mbuf_frag_size.
 */
101 #ifdef MBUF_STRESS_TEST
102 static int mbuf_frag_size = 0;
103 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
104         &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
105 #endif
106
/*
 * Forward declaration.  ip_output() calls this with the outgoing interface,
 * the packet and the IP header length to loop a multicast datagram back.
 */
107 static void     ip_mloopback(struct ifnet *, const struct mbuf *, int);
108
/*
 * Declared here, defined outside this view.  ip_output() consults it to
 * decide whether to loop back a multicast packet when the caller supplied
 * no ip_moptions.
 */
109 extern int in_mcast_loop;
110
111 static inline int
112 ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, int flags,
113     struct inpcb *inp, struct sockaddr_in *dst, int *fibnum, int *error)
114 {
115         struct m_tag *fwd_tag = NULL;
116         struct mbuf *m;
117         struct in_addr odst;
118         struct ip *ip;
119
120         m = *mp;
121         ip = mtod(m, struct ip *);
122
123         /* Run through list of hooks for output packets. */
124         odst.s_addr = ip->ip_dst.s_addr;
125         switch (pfil_mbuf_out(V_inet_pfil_head, mp, ifp, inp)) {
126         case PFIL_DROPPED:
127                 *error = EACCES;
128                 /* FALLTHROUGH */
129         case PFIL_CONSUMED:
130                 return 1; /* Finished */
131         case PFIL_PASS:
132                 *error = 0;
133         }
134         m = *mp;
135         ip = mtod(m, struct ip *);
136
137         /* See if destination IP address was changed by packet filter. */
138         if (odst.s_addr != ip->ip_dst.s_addr) {
139                 m->m_flags |= M_SKIP_FIREWALL;
140                 /* If destination is now ourself drop to ip_input(). */
141                 if (in_localip(ip->ip_dst)) {
142                         m->m_flags |= M_FASTFWD_OURS;
143                         if (m->m_pkthdr.rcvif == NULL)
144                                 m->m_pkthdr.rcvif = V_loif;
145                         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
146                                 m->m_pkthdr.csum_flags |=
147                                         CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
148                                 m->m_pkthdr.csum_data = 0xffff;
149                         }
150                         m->m_pkthdr.csum_flags |=
151                                 CSUM_IP_CHECKED | CSUM_IP_VALID;
152 #if defined(SCTP) || defined(SCTP_SUPPORT)
153                         if (m->m_pkthdr.csum_flags & CSUM_SCTP)
154                                 m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
155 #endif
156                         *error = netisr_queue(NETISR_IP, m);
157                         return 1; /* Finished */
158                 }
159
160                 bzero(dst, sizeof(*dst));
161                 dst->sin_family = AF_INET;
162                 dst->sin_len = sizeof(*dst);
163                 dst->sin_addr = ip->ip_dst;
164
165                 return -1; /* Reloop */
166         }
167         /* See if fib was changed by packet filter. */
168         if ((*fibnum) != M_GETFIB(m)) {
169                 m->m_flags |= M_SKIP_FIREWALL;
170                 *fibnum = M_GETFIB(m);
171                 return -1; /* Reloop for FIB change */
172         }
173
174         /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
175         if (m->m_flags & M_FASTFWD_OURS) {
176                 if (m->m_pkthdr.rcvif == NULL)
177                         m->m_pkthdr.rcvif = V_loif;
178                 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
179                         m->m_pkthdr.csum_flags |=
180                                 CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
181                         m->m_pkthdr.csum_data = 0xffff;
182                 }
183 #if defined(SCTP) || defined(SCTP_SUPPORT)
184                 if (m->m_pkthdr.csum_flags & CSUM_SCTP)
185                         m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
186 #endif
187                 m->m_pkthdr.csum_flags |=
188                         CSUM_IP_CHECKED | CSUM_IP_VALID;
189
190                 *error = netisr_queue(NETISR_IP, m);
191                 return 1; /* Finished */
192         }
193         /* Or forward to some other address? */
194         if ((m->m_flags & M_IP_NEXTHOP) &&
195             ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) {
196                 bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
197                 m->m_flags |= M_SKIP_FIREWALL;
198                 m->m_flags &= ~M_IP_NEXTHOP;
199                 m_tag_delete(m, fwd_tag);
200
201                 return -1; /* Reloop for CHANGE of dst */
202         }
203
204         return 0;
205 }
206
207 static int
208 ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m,
209     const struct sockaddr *gw, struct route *ro, bool stamp_tag)
210 {
211 #ifdef KERN_TLS
212         struct ktls_session *tls = NULL;
213 #endif
214         struct m_snd_tag *mst;
215         int error;
216
217         MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
218         mst = NULL;
219
220 #ifdef KERN_TLS
221         /*
222          * If this is an unencrypted TLS record, save a reference to
223          * the record.  This local reference is used to call
224          * ktls_output_eagain after the mbuf has been freed (thus
225          * dropping the mbuf's reference) in if_output.
226          */
227         if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) {
228                 tls = ktls_hold(m->m_next->m_epg_tls);
229                 mst = tls->snd_tag;
230
231                 /*
232                  * If a TLS session doesn't have a valid tag, it must
233                  * have had an earlier ifp mismatch, so drop this
234                  * packet.
235                  */
236                 if (mst == NULL) {
237                         m_freem(m);
238                         error = EAGAIN;
239                         goto done;
240                 }
241                 /*
242                  * Always stamp tags that include NIC ktls.
243                  */
244                 stamp_tag = true;
245         }
246 #endif
247 #ifdef RATELIMIT
248         if (inp != NULL && mst == NULL) {
249                 if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 ||
250                     (inp->inp_snd_tag != NULL &&
251                     inp->inp_snd_tag->ifp != ifp))
252                         in_pcboutput_txrtlmt(inp, ifp, m);
253
254                 if (inp->inp_snd_tag != NULL)
255                         mst = inp->inp_snd_tag;
256         }
257 #endif
258         if (stamp_tag && mst != NULL) {
259                 KASSERT(m->m_pkthdr.rcvif == NULL,
260                     ("trying to add a send tag to a forwarded packet"));
261                 if (mst->ifp != ifp) {
262                         m_freem(m);
263                         error = EAGAIN;
264                         goto done;
265                 }
266
267                 /* stamp send tag on mbuf */
268                 m->m_pkthdr.snd_tag = m_snd_tag_ref(mst);
269                 m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
270         }
271
272         error = (*ifp->if_output)(ifp, m, gw, ro);
273
274 done:
275         /* Check for route change invalidating send tags. */
276 #ifdef KERN_TLS
277         if (tls != NULL) {
278                 if (error == EAGAIN)
279                         error = ktls_output_eagain(inp, tls);
280                 ktls_free(tls);
281         }
282 #endif
283 #ifdef RATELIMIT
284         if (error == EAGAIN)
285                 in_pcboutput_eagain(inp);
286 #endif
287         return (error);
288 }
289
290 /* rte<>ro_flags translation */
291 static inline void
292 rt_update_ro_flags(struct route *ro, const struct nhop_object *nh)
293 {
294         int nh_flags = nh->nh_flags;
295
296         ro->ro_flags &= ~ (RT_REJECT|RT_BLACKHOLE|RT_HAS_GW);
297
298         ro->ro_flags |= (nh_flags & NHF_REJECT) ? RT_REJECT : 0;
299         ro->ro_flags |= (nh_flags & NHF_BLACKHOLE) ? RT_BLACKHOLE : 0;
300         ro->ro_flags |= (nh_flags & NHF_GATEWAY) ? RT_HAS_GW : 0;
301 }
302
303 /*
304  * IP output.  The packet in mbuf chain m contains a skeletal IP
305  * header (with len, off, ttl, proto, tos, src, dst).
306  * The mbuf chain containing the packet will be freed.
307  * The mbuf opt, if present, will not be freed.
308  * If route ro is present and has ro_nh initialized, the route lookup is
309  * skipped and ro->ro_nh is used. If ro is present but ro->ro_nh is NULL,
310  * then the result of the route lookup is stored in ro->ro_nh.
311  *
312  * In the IP forwarding case, the packet will arrive with options already
313  * inserted, so must have a NULL opt pointer.
314  */
315 int
316 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
317     struct ip_moptions *imo, struct inpcb *inp)
318 {
319         struct ip *ip;
320         struct ifnet *ifp = NULL;       /* keep compiler happy */
321         struct mbuf *m0;
322         int hlen = sizeof (struct ip);
323         int mtu = 0;
324         int error = 0;
325         int vlan_pcp = -1;
326         struct sockaddr_in *dst;
327         const struct sockaddr *gw;
328         struct in_ifaddr *ia = NULL;
329         struct in_addr src;
330         int isbroadcast;
331         uint16_t ip_len, ip_off;
332         struct route iproute;
333         uint32_t fibnum;
334 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
335         int no_route_but_check_spd = 0;
336 #endif
337
338         M_ASSERTPKTHDR(m);
339         NET_EPOCH_ASSERT();
340
341         if (inp != NULL) {
342                 INP_LOCK_ASSERT(inp);
343                 M_SETFIB(m, inp->inp_inc.inc_fibnum);
344                 if ((flags & IP_NODEFAULTFLOWID) == 0) {
345                         m->m_pkthdr.flowid = inp->inp_flowid;
346                         M_HASHTYPE_SET(m, inp->inp_flowtype);
347                 }
348                 if ((inp->inp_flags2 & INP_2PCP_SET) != 0)
349                         vlan_pcp = (inp->inp_flags2 & INP_2PCP_MASK) >>
350                             INP_2PCP_SHIFT;
351 #ifdef NUMA
352                 m->m_pkthdr.numa_domain = inp->inp_numa_domain;
353 #endif
354         }
355
356         if (opt) {
357                 int len = 0;
358                 m = ip_insertoptions(m, opt, &len);
359                 if (len != 0)
360                         hlen = len; /* ip->ip_hl is updated above */
361         }
362         ip = mtod(m, struct ip *);
363         ip_len = ntohs(ip->ip_len);
364         ip_off = ntohs(ip->ip_off);
365
366         if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
367                 ip->ip_v = IPVERSION;
368                 ip->ip_hl = hlen >> 2;
369                 ip_fillid(ip);
370         } else {
371                 /* Header already set, fetch hlen from there */
372                 hlen = ip->ip_hl << 2;
373         }
374         if ((flags & IP_FORWARDING) == 0)
375                 IPSTAT_INC(ips_localout);
376
377         /*
378          * dst/gw handling:
379          *
380          * gw is readonly but can point either to dst OR rt_gateway,
381          * therefore we need restore gw if we're redoing lookup.
382          */
383         fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
384         if (ro == NULL) {
385                 ro = &iproute;
386                 bzero(ro, sizeof (*ro));
387         }
388         dst = (struct sockaddr_in *)&ro->ro_dst;
389         if (ro->ro_nh == NULL) {
390                 dst->sin_family = AF_INET;
391                 dst->sin_len = sizeof(*dst);
392                 dst->sin_addr = ip->ip_dst;
393         }
394         gw = (const struct sockaddr *)dst;
395 again:
396         /*
397          * Validate route against routing table additions;
398          * a better/more specific route might have been added.
399          */
400         if (inp != NULL && ro->ro_nh != NULL)
401                 NH_VALIDATE(ro, &inp->inp_rt_cookie, fibnum);
402         /*
403          * If there is a cached route,
404          * check that it is to the same destination
405          * and is still up.  If not, free it and try again.
406          * The address family should also be checked in case of sharing the
407          * cache with IPv6.
408          * Also check whether routing cache needs invalidation.
409          */
410         if (ro->ro_nh != NULL &&
411             ((!NH_IS_VALID(ro->ro_nh)) || dst->sin_family != AF_INET ||
412             dst->sin_addr.s_addr != ip->ip_dst.s_addr))
413                 RO_INVALIDATE_CACHE(ro);
414         ia = NULL;
415         /*
416          * If routing to interface only, short circuit routing lookup.
417          * The use of an all-ones broadcast address implies this; an
418          * interface is specified by the broadcast address of an interface,
419          * or the destination address of a ptp interface.
420          */
421         if (flags & IP_SENDONES) {
422                 if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst),
423                                                       M_GETFIB(m)))) == NULL &&
424                     (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
425                                                     M_GETFIB(m)))) == NULL) {
426                         IPSTAT_INC(ips_noroute);
427                         error = ENETUNREACH;
428                         goto bad;
429                 }
430                 ip->ip_dst.s_addr = INADDR_BROADCAST;
431                 dst->sin_addr = ip->ip_dst;
432                 ifp = ia->ia_ifp;
433                 mtu = ifp->if_mtu;
434                 ip->ip_ttl = 1;
435                 isbroadcast = 1;
436                 src = IA_SIN(ia)->sin_addr;
437         } else if (flags & IP_ROUTETOIF) {
438                 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
439                                                     M_GETFIB(m)))) == NULL &&
440                     (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0,
441                                                 M_GETFIB(m)))) == NULL) {
442                         IPSTAT_INC(ips_noroute);
443                         error = ENETUNREACH;
444                         goto bad;
445                 }
446                 ifp = ia->ia_ifp;
447                 mtu = ifp->if_mtu;
448                 ip->ip_ttl = 1;
449                 isbroadcast = ifp->if_flags & IFF_BROADCAST ?
450                     in_ifaddr_broadcast(dst->sin_addr, ia) : 0;
451                 src = IA_SIN(ia)->sin_addr;
452         } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
453             imo != NULL && imo->imo_multicast_ifp != NULL) {
454                 /*
455                  * Bypass the normal routing lookup for multicast
456                  * packets if the interface is specified.
457                  */
458                 ifp = imo->imo_multicast_ifp;
459                 mtu = ifp->if_mtu;
460                 IFP_TO_IA(ifp, ia);
461                 isbroadcast = 0;        /* fool gcc */
462                 /* Interface may have no addresses. */
463                 if (ia != NULL)
464                         src = IA_SIN(ia)->sin_addr;
465                 else
466                         src.s_addr = INADDR_ANY;
467         } else if (ro != &iproute) {
468                 if (ro->ro_nh == NULL) {
469                         /*
470                          * We want to do any cloning requested by the link
471                          * layer, as this is probably required in all cases
472                          * for correct operation (as it is for ARP).
473                          */
474                         uint32_t flowid;
475                         flowid = m->m_pkthdr.flowid;
476                         ro->ro_nh = fib4_lookup(fibnum, dst->sin_addr, 0,
477                             NHR_REF, flowid);
478
479                         if (ro->ro_nh == NULL || (!NH_IS_VALID(ro->ro_nh))) {
480 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
481                                 /*
482                                  * There is no route for this packet, but it is
483                                  * possible that a matching SPD entry exists.
484                                  */
485                                 no_route_but_check_spd = 1;
486                                 goto sendit;
487 #endif
488                                 IPSTAT_INC(ips_noroute);
489                                 error = EHOSTUNREACH;
490                                 goto bad;
491                         }
492                 }
493                 struct nhop_object *nh = ro->ro_nh;
494
495                 ia = ifatoia(nh->nh_ifa);
496                 ifp = nh->nh_ifp;
497                 counter_u64_add(nh->nh_pksent, 1);
498                 rt_update_ro_flags(ro, nh);
499                 if (nh->nh_flags & NHF_GATEWAY)
500                         gw = &nh->gw_sa;
501                 if (nh->nh_flags & NHF_HOST)
502                         isbroadcast = (nh->nh_flags & NHF_BROADCAST);
503                 else if ((ifp->if_flags & IFF_BROADCAST) && (gw->sa_family == AF_INET))
504                         isbroadcast = in_ifaddr_broadcast(((const struct sockaddr_in *)gw)->sin_addr, ia);
505                 else
506                         isbroadcast = 0;
507                 mtu = nh->nh_mtu;
508                 src = IA_SIN(ia)->sin_addr;
509         } else {
510                 struct nhop_object *nh;
511
512                 nh = fib4_lookup(M_GETFIB(m), dst->sin_addr, 0, NHR_NONE,
513                     m->m_pkthdr.flowid);
514                 if (nh == NULL) {
515 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
516                         /*
517                          * There is no route for this packet, but it is
518                          * possible that a matching SPD entry exists.
519                          */
520                         no_route_but_check_spd = 1;
521                         goto sendit;
522 #endif
523                         IPSTAT_INC(ips_noroute);
524                         error = EHOSTUNREACH;
525                         goto bad;
526                 }
527                 ifp = nh->nh_ifp;
528                 mtu = nh->nh_mtu;
529                 rt_update_ro_flags(ro, nh);
530                 if (nh->nh_flags & NHF_GATEWAY)
531                         gw = &nh->gw_sa;
532                 ia = ifatoia(nh->nh_ifa);
533                 src = IA_SIN(ia)->sin_addr;
534                 isbroadcast = (((nh->nh_flags & (NHF_HOST | NHF_BROADCAST)) ==
535                     (NHF_HOST | NHF_BROADCAST)) ||
536                     ((ifp->if_flags & IFF_BROADCAST) &&
537                     (gw->sa_family == AF_INET) &&
538                     in_ifaddr_broadcast(((const struct sockaddr_in *)gw)->sin_addr, ia)));
539         }
540
541         /* Catch a possible divide by zero later. */
542         KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (nh_flags=0x%08x) ifp=%p",
543             __func__, mtu, ro,
544             (ro != NULL && ro->ro_nh != NULL) ? ro->ro_nh->nh_flags : 0, ifp));
545
546         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
547                 m->m_flags |= M_MCAST;
548                 /*
549                  * IP destination address is multicast.  Make sure "gw"
550                  * still points to the address in "ro".  (It may have been
551                  * changed to point to a gateway address, above.)
552                  */
553                 gw = (const struct sockaddr *)dst;
554                 /*
555                  * See if the caller provided any multicast options
556                  */
557                 if (imo != NULL) {
558                         ip->ip_ttl = imo->imo_multicast_ttl;
559                         if (imo->imo_multicast_vif != -1)
560                                 ip->ip_src.s_addr =
561                                     ip_mcast_src ?
562                                     ip_mcast_src(imo->imo_multicast_vif) :
563                                     INADDR_ANY;
564                 } else
565                         ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
566                 /*
567                  * Confirm that the outgoing interface supports multicast.
568                  */
569                 if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
570                         if ((ifp->if_flags & IFF_MULTICAST) == 0) {
571                                 IPSTAT_INC(ips_noroute);
572                                 error = ENETUNREACH;
573                                 goto bad;
574                         }
575                 }
576                 /*
577                  * If source address not specified yet, use address
578                  * of outgoing interface.
579                  */
580                 if (ip->ip_src.s_addr == INADDR_ANY)
581                         ip->ip_src = src;
582
583                 if ((imo == NULL && in_mcast_loop) ||
584                     (imo && imo->imo_multicast_loop)) {
585                         /*
586                          * Loop back multicast datagram if not expressly
587                          * forbidden to do so, even if we are not a member
588                          * of the group; ip_input() will filter it later,
589                          * thus deferring a hash lookup and mutex acquisition
590                          * at the expense of a cheap copy using m_copym().
591                          */
592                         ip_mloopback(ifp, m, hlen);
593                 } else {
594                         /*
595                          * If we are acting as a multicast router, perform
596                          * multicast forwarding as if the packet had just
597                          * arrived on the interface to which we are about
598                          * to send.  The multicast forwarding function
599                          * recursively calls this function, using the
600                          * IP_FORWARDING flag to prevent infinite recursion.
601                          *
602                          * Multicasts that are looped back by ip_mloopback(),
603                          * above, will be forwarded by the ip_input() routine,
604                          * if necessary.
605                          */
606                         if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
607                                 /*
608                                  * If rsvp daemon is not running, do not
609                                  * set ip_moptions. This ensures that the packet
610                                  * is multicast and not just sent down one link
611                                  * as prescribed by rsvpd.
612                                  */
613                                 if (!V_rsvp_on)
614                                         imo = NULL;
615                                 if (ip_mforward &&
616                                     ip_mforward(ip, ifp, m, imo) != 0) {
617                                         m_freem(m);
618                                         goto done;
619                                 }
620                         }
621                 }
622
623                 /*
624                  * Multicasts with a time-to-live of zero may be looped-
625                  * back, above, but must not be transmitted on a network.
626                  * Also, multicasts addressed to the loopback interface
627                  * are not sent -- the above call to ip_mloopback() will
628                  * loop back a copy. ip_input() will drop the copy if
629                  * this host does not belong to the destination group on
630                  * the loopback interface.
631                  */
632                 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
633                         m_freem(m);
634                         goto done;
635                 }
636
637                 goto sendit;
638         }
639
640         /*
641          * If the source address is not specified yet, use the address
642          * of the outgoing interface.
643          */
644         if (ip->ip_src.s_addr == INADDR_ANY)
645                 ip->ip_src = src;
646
647         /*
648          * Look for broadcast address and
649          * verify user is allowed to send
650          * such a packet.
651          */
652         if (isbroadcast) {
653                 if ((ifp->if_flags & IFF_BROADCAST) == 0) {
654                         error = EADDRNOTAVAIL;
655                         goto bad;
656                 }
657                 if ((flags & IP_ALLOWBROADCAST) == 0) {
658                         error = EACCES;
659                         goto bad;
660                 }
661                 /* don't allow broadcast messages to be fragmented */
662                 if (ip_len > mtu) {
663                         error = EMSGSIZE;
664                         goto bad;
665                 }
666                 m->m_flags |= M_BCAST;
667         } else {
668                 m->m_flags &= ~M_BCAST;
669         }
670
671 sendit:
672 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
673         if (IPSEC_ENABLED(ipv4)) {
674                 if ((error = IPSEC_OUTPUT(ipv4, m, inp)) != 0) {
675                         if (error == EINPROGRESS)
676                                 error = 0;
677                         goto done;
678                 }
679         }
680         /*
681          * Check if there was a route for this packet; return error if not.
682          */
683         if (no_route_but_check_spd) {
684                 IPSTAT_INC(ips_noroute);
685                 error = EHOSTUNREACH;
686                 goto bad;
687         }
688         /* Update variables that are affected by ipsec4_output(). */
689         ip = mtod(m, struct ip *);
690         hlen = ip->ip_hl << 2;
691 #endif /* IPSEC */
692
693         /* Jump over all PFIL processing if hooks are not active. */
694         if (PFIL_HOOKED_OUT(V_inet_pfil_head)) {
695                 switch (ip_output_pfil(&m, ifp, flags, inp, dst, &fibnum,
696                     &error)) {
697                 case 1: /* Finished */
698                         goto done;
699
700                 case 0: /* Continue normally */
701                         ip = mtod(m, struct ip *);
702                         ip_len = ntohs(ip->ip_len);
703                         break;
704
705                 case -1: /* Need to try again */
706                         /* Reset everything for a new round */
707                         if (ro != NULL) {
708                                 RO_NHFREE(ro);
709                                 ro->ro_prepend = NULL;
710                         }
711                         gw = (const struct sockaddr *)dst;
712                         ip = mtod(m, struct ip *);
713                         goto again;
714                 }
715         }
716
717         if (vlan_pcp > -1)
718                 EVL_APPLY_PRI(m, vlan_pcp);
719
720         /* IN_LOOPBACK must not appear on the wire - RFC1122. */
721         if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) ||
722             IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) {
723                 if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
724                         IPSTAT_INC(ips_badaddr);
725                         error = EADDRNOTAVAIL;
726                         goto bad;
727                 }
728         }
729
730         /* Ensure the packet data is mapped if the interface requires it. */
731         if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) {
732                 m = mb_unmapped_to_ext(m);
733                 if (m == NULL) {
734                         IPSTAT_INC(ips_odropped);
735                         error = ENOBUFS;
736                         goto bad;
737                 }
738         }
739
740         m->m_pkthdr.csum_flags |= CSUM_IP;
741         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
742                 in_delayed_cksum(m);
743                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
744         }
745 #if defined(SCTP) || defined(SCTP_SUPPORT)
746         if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
747                 sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
748                 m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
749         }
750 #endif
751
752         /*
753          * If small enough for interface, or the interface will take
754          * care of the fragmentation for us, we can just send directly.
755          * Note that if_vxlan could have requested TSO even though the outer
756          * frame is UDP.  It is correct to not fragment such datagrams and
757          * instead just pass them on to the driver.
758          */
759         if (ip_len <= mtu ||
760             (m->m_pkthdr.csum_flags & ifp->if_hwassist &
761             (CSUM_TSO | CSUM_INNER_TSO)) != 0) {
762                 ip->ip_sum = 0;
763                 if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
764                         ip->ip_sum = in_cksum(m, hlen);
765                         m->m_pkthdr.csum_flags &= ~CSUM_IP;
766                 }
767
768                 /*
769                  * Record statistics for this interface address.
770                  * With CSUM_TSO the byte/packet count will be slightly
771                  * incorrect because we count the IP+TCP headers only
772                  * once instead of for every generated packet.
773                  */
774                 if (!(flags & IP_FORWARDING) && ia) {
775                         if (m->m_pkthdr.csum_flags &
776                             (CSUM_TSO | CSUM_INNER_TSO))
777                                 counter_u64_add(ia->ia_ifa.ifa_opackets,
778                                     m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
779                         else
780                                 counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
781
782                         counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len);
783                 }
784 #ifdef MBUF_STRESS_TEST
785                 if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
786                         m = m_fragment(m, M_NOWAIT, mbuf_frag_size);
787 #endif
788                 /*
789                  * Reset layer specific mbuf flags
790                  * to avoid confusing lower layers.
791                  */
792                 m_clrprotoflags(m);
793                 IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
794                 error = ip_output_send(inp, ifp, m, gw, ro,
795                     (flags & IP_NO_SND_TAG_RL) ? false : true);
796                 goto done;
797         }
798
799         /* Balk when DF bit is set or the interface didn't support TSO. */
800         if ((ip_off & IP_DF) ||
801             (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_INNER_TSO))) {
802                 error = EMSGSIZE;
803                 IPSTAT_INC(ips_cantfrag);
804                 goto bad;
805         }
806
807         /*
808          * Too large for interface; fragment if possible. If successful,
809          * on return, m will point to a list of packets to be sent.
810          */
811         error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
812         if (error)
813                 goto bad;
814         for (; m; m = m0) {
815                 m0 = m->m_nextpkt;
816                 m->m_nextpkt = 0;
817                 if (error == 0) {
818                         /* Record statistics for this interface address. */
819                         if (ia != NULL) {
820                                 counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
821                                 counter_u64_add(ia->ia_ifa.ifa_obytes,
822                                     m->m_pkthdr.len);
823                         }
824                         /*
825                          * Reset layer specific mbuf flags
826                          * to avoid confusing upper layers.
827                          */
828                         m_clrprotoflags(m);
829
830                         IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
831                             mtod(m, struct ip *), NULL);
832                         error = ip_output_send(inp, ifp, m, gw, ro, true);
833                 } else
834                         m_freem(m);
835         }
836
837         if (error == 0)
838                 IPSTAT_INC(ips_fragmented);
839
840 done:
841         return (error);
842  bad:
843         m_freem(m);
844         goto done;
845 }
846
847 /*
848  * Create a chain of fragments which fit the given mtu. m_frag points to the
849  * mbuf to be fragmented; on return it points to the chain with the fragments.
850  * Return 0 if no error. If error, m_frag may contain a partially built
851  * chain of fragments that should be freed by the caller.
852  *
853  * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
854  */
855 int
856 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
857     u_long if_hwassist_flags)
858 {
859         int error = 0;
860         int hlen = ip->ip_hl << 2;
861         int len = (mtu - hlen) & ~7;    /* size of payload in each fragment */
862         int off;
863         struct mbuf *m0 = *m_frag;      /* the original packet          */
864         int firstlen;
865         struct mbuf **mnext;
866         int nfrags;
867         uint16_t ip_len, ip_off;
868
869         ip_len = ntohs(ip->ip_len);
870         ip_off = ntohs(ip->ip_off);
871
872         /*
873          * Packet shall not have "Don't Fragment" flag and have at least 8
874          * bytes of payload.
875          */
876         if (__predict_false((ip_off & IP_DF) || len < 8)) {
877                 IPSTAT_INC(ips_cantfrag);
878                 return (EMSGSIZE);
879         }
880
881         /*
882          * If the interface will not calculate checksums on
883          * fragmented packets, then do it here.
884          */
885         if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
886                 in_delayed_cksum(m0);
887                 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
888         }
889 #if defined(SCTP) || defined(SCTP_SUPPORT)
890         if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
891                 sctp_delayed_cksum(m0, hlen);
892                 m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
893         }
894 #endif
895         if (len > PAGE_SIZE) {
896                 /*
897                  * Fragment large datagrams such that each segment
898                  * contains a multiple of PAGE_SIZE amount of data,
899                  * plus headers. This enables a receiver to perform
900                  * page-flipping zero-copy optimizations.
901                  *
902                  * XXX When does this help given that sender and receiver
903                  * could have different page sizes, and also mtu could
904                  * be less than the receiver's page size ?
905                  */
906                 int newlen;
907
908                 off = MIN(mtu, m0->m_pkthdr.len);
909
910                 /*
911                  * firstlen (off - hlen) must be aligned on an
912                  * 8-byte boundary
913                  */
914                 if (off < hlen)
915                         goto smart_frag_failure;
916                 off = ((off - hlen) & ~7) + hlen;
917                 newlen = (~PAGE_MASK) & mtu;
918                 if ((newlen + sizeof (struct ip)) > mtu) {
919                         /* we failed, go back the default */
920 smart_frag_failure:
921                         newlen = len;
922                         off = hlen + len;
923                 }
924                 len = newlen;
925
926         } else {
927                 off = hlen + len;
928         }
929
930         firstlen = off - hlen;
931         mnext = &m0->m_nextpkt;         /* pointer to next packet */
932
933         /*
934          * Loop through length of segment after first fragment,
935          * make new header and copy data of each part and link onto chain.
936          * Here, m0 is the original packet, m is the fragment being created.
937          * The fragments are linked off the m_nextpkt of the original
938          * packet, which after processing serves as the first fragment.
939          */
940         for (nfrags = 1; off < ip_len; off += len, nfrags++) {
941                 struct ip *mhip;        /* ip header on the fragment */
942                 struct mbuf *m;
943                 int mhlen = sizeof (struct ip);
944
945                 m = m_gethdr(M_NOWAIT, MT_DATA);
946                 if (m == NULL) {
947                         error = ENOBUFS;
948                         IPSTAT_INC(ips_odropped);
949                         goto done;
950                 }
951                 /*
952                  * Make sure the complete packet header gets copied
953                  * from the originating mbuf to the newly created
954                  * mbuf. This also ensures that existing firewall
955                  * classification(s), VLAN tags and so on get copied
956                  * to the resulting fragmented packet(s):
957                  */
958                 if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
959                         m_free(m);
960                         error = ENOBUFS;
961                         IPSTAT_INC(ips_odropped);
962                         goto done;
963                 }
964                 /*
965                  * In the first mbuf, leave room for the link header, then
966                  * copy the original IP header including options. The payload
967                  * goes into an additional mbuf chain returned by m_copym().
968                  */
969                 m->m_data += max_linkhdr;
970                 mhip = mtod(m, struct ip *);
971                 *mhip = *ip;
972                 if (hlen > sizeof (struct ip)) {
973                         mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
974                         mhip->ip_v = IPVERSION;
975                         mhip->ip_hl = mhlen >> 2;
976                 }
977                 m->m_len = mhlen;
978                 /* XXX do we need to add ip_off below ? */
979                 mhip->ip_off = ((off - hlen) >> 3) + ip_off;
980                 if (off + len >= ip_len)
981                         len = ip_len - off;
982                 else
983                         mhip->ip_off |= IP_MF;
984                 mhip->ip_len = htons((u_short)(len + mhlen));
985                 m->m_next = m_copym(m0, off, len, M_NOWAIT);
986                 if (m->m_next == NULL) {        /* copy failed */
987                         m_free(m);
988                         error = ENOBUFS;        /* ??? */
989                         IPSTAT_INC(ips_odropped);
990                         goto done;
991                 }
992                 m->m_pkthdr.len = mhlen + len;
993 #ifdef MAC
994                 mac_netinet_fragment(m0, m);
995 #endif
996                 mhip->ip_off = htons(mhip->ip_off);
997                 mhip->ip_sum = 0;
998                 if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
999                         mhip->ip_sum = in_cksum(m, mhlen);
1000                         m->m_pkthdr.csum_flags &= ~CSUM_IP;
1001                 }
1002                 *mnext = m;
1003                 mnext = &m->m_nextpkt;
1004         }
1005         IPSTAT_ADD(ips_ofragments, nfrags);
1006
1007         /*
1008          * Update first fragment by trimming what's been copied out
1009          * and updating header.
1010          */
1011         m_adj(m0, hlen + firstlen - ip_len);
1012         m0->m_pkthdr.len = hlen + firstlen;
1013         ip->ip_len = htons((u_short)m0->m_pkthdr.len);
1014         ip->ip_off = htons(ip_off | IP_MF);
1015         ip->ip_sum = 0;
1016         if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
1017                 ip->ip_sum = in_cksum(m0, hlen);
1018                 m0->m_pkthdr.csum_flags &= ~CSUM_IP;
1019         }
1020
1021 done:
1022         *m_frag = m0;
1023         return error;
1024 }
1025
1026 void
1027 in_delayed_cksum(struct mbuf *m)
1028 {
1029         struct ip *ip;
1030         struct udphdr *uh;
1031         uint16_t cklen, csum, offset;
1032
1033         ip = mtod(m, struct ip *);
1034         offset = ip->ip_hl << 2 ;
1035
1036         if (m->m_pkthdr.csum_flags & CSUM_UDP) {
1037                 /* if udp header is not in the first mbuf copy udplen */
1038                 if (offset + sizeof(struct udphdr) > m->m_len) {
1039                         m_copydata(m, offset + offsetof(struct udphdr,
1040                             uh_ulen), sizeof(cklen), (caddr_t)&cklen);
1041                         cklen = ntohs(cklen);
1042                 } else {
1043                         uh = (struct udphdr *)mtodo(m, offset);
1044                         cklen = ntohs(uh->uh_ulen);
1045                 }
1046                 csum = in_cksum_skip(m, cklen + offset, offset);
1047                 if (csum == 0)
1048                         csum = 0xffff;
1049         } else {
1050                 cklen = ntohs(ip->ip_len);
1051                 csum = in_cksum_skip(m, cklen, offset);
1052         }
1053         offset += m->m_pkthdr.csum_data;        /* checksum offset */
1054
1055         if (offset + sizeof(csum) > m->m_len)
1056                 m_copyback(m, offset, sizeof(csum), (caddr_t)&csum);
1057         else
1058                 *(u_short *)mtodo(m, offset) = csum;
1059 }
1060
1061 /*
1062  * IP socket option processing.
1063  */
1064 int
1065 ip_ctloutput(struct socket *so, struct sockopt *sopt)
1066 {
1067         struct inpcb *inp = sotoinpcb(so);
1068         int     error, optval;
1069 #ifdef  RSS
1070         uint32_t rss_bucket;
1071         int retval;
1072 #endif
1073
1074         error = optval = 0;
1075         if (sopt->sopt_level != IPPROTO_IP) {
1076                 error = EINVAL;
1077
1078                 if (sopt->sopt_level == SOL_SOCKET &&
1079                     sopt->sopt_dir == SOPT_SET) {
1080                         switch (sopt->sopt_name) {
1081                         case SO_REUSEADDR:
1082                                 INP_WLOCK(inp);
1083                                 if ((so->so_options & SO_REUSEADDR) != 0)
1084                                         inp->inp_flags2 |= INP_REUSEADDR;
1085                                 else
1086                                         inp->inp_flags2 &= ~INP_REUSEADDR;
1087                                 INP_WUNLOCK(inp);
1088                                 error = 0;
1089                                 break;
1090                         case SO_REUSEPORT:
1091                                 INP_WLOCK(inp);
1092                                 if ((so->so_options & SO_REUSEPORT) != 0)
1093                                         inp->inp_flags2 |= INP_REUSEPORT;
1094                                 else
1095                                         inp->inp_flags2 &= ~INP_REUSEPORT;
1096                                 INP_WUNLOCK(inp);
1097                                 error = 0;
1098                                 break;
1099                         case SO_REUSEPORT_LB:
1100                                 INP_WLOCK(inp);
1101                                 if ((so->so_options & SO_REUSEPORT_LB) != 0)
1102                                         inp->inp_flags2 |= INP_REUSEPORT_LB;
1103                                 else
1104                                         inp->inp_flags2 &= ~INP_REUSEPORT_LB;
1105                                 INP_WUNLOCK(inp);
1106                                 error = 0;
1107                                 break;
1108                         case SO_SETFIB:
1109                                 INP_WLOCK(inp);
1110                                 inp->inp_inc.inc_fibnum = so->so_fibnum;
1111                                 INP_WUNLOCK(inp);
1112                                 error = 0;
1113                                 break;
1114                         case SO_MAX_PACING_RATE:
1115 #ifdef RATELIMIT
1116                                 INP_WLOCK(inp);
1117                                 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
1118                                 INP_WUNLOCK(inp);
1119                                 error = 0;
1120 #else
1121                                 error = EOPNOTSUPP;
1122 #endif
1123                                 break;
1124                         default:
1125                                 break;
1126                         }
1127                 }
1128                 return (error);
1129         }
1130
1131         switch (sopt->sopt_dir) {
1132         case SOPT_SET:
1133                 switch (sopt->sopt_name) {
1134                 case IP_OPTIONS:
1135 #ifdef notyet
1136                 case IP_RETOPTS:
1137 #endif
1138                 {
1139                         struct mbuf *m;
1140                         if (sopt->sopt_valsize > MLEN) {
1141                                 error = EMSGSIZE;
1142                                 break;
1143                         }
1144                         m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
1145                         if (m == NULL) {
1146                                 error = ENOBUFS;
1147                                 break;
1148                         }
1149                         m->m_len = sopt->sopt_valsize;
1150                         error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
1151                                             m->m_len);
1152                         if (error) {
1153                                 m_free(m);
1154                                 break;
1155                         }
1156                         INP_WLOCK(inp);
1157                         error = ip_pcbopts(inp, sopt->sopt_name, m);
1158                         INP_WUNLOCK(inp);
1159                         return (error);
1160                 }
1161
1162                 case IP_BINDANY:
1163                         if (sopt->sopt_td != NULL) {
1164                                 error = priv_check(sopt->sopt_td,
1165                                     PRIV_NETINET_BINDANY);
1166                                 if (error)
1167                                         break;
1168                         }
1169                         /* FALLTHROUGH */
1170                 case IP_TOS:
1171                 case IP_TTL:
1172                 case IP_MINTTL:
1173                 case IP_RECVOPTS:
1174                 case IP_RECVRETOPTS:
1175                 case IP_ORIGDSTADDR:
1176                 case IP_RECVDSTADDR:
1177                 case IP_RECVTTL:
1178                 case IP_RECVIF:
1179                 case IP_ONESBCAST:
1180                 case IP_DONTFRAG:
1181                 case IP_RECVTOS:
1182                 case IP_RECVFLOWID:
1183 #ifdef  RSS
1184                 case IP_RECVRSSBUCKETID:
1185 #endif
1186                 case IP_VLAN_PCP:
1187                         error = sooptcopyin(sopt, &optval, sizeof optval,
1188                                             sizeof optval);
1189                         if (error)
1190                                 break;
1191
1192                         switch (sopt->sopt_name) {
1193                         case IP_TOS:
1194                                 inp->inp_ip_tos = optval;
1195                                 break;
1196
1197                         case IP_TTL:
1198                                 inp->inp_ip_ttl = optval;
1199                                 break;
1200
1201                         case IP_MINTTL:
1202                                 if (optval >= 0 && optval <= MAXTTL)
1203                                         inp->inp_ip_minttl = optval;
1204                                 else
1205                                         error = EINVAL;
1206                                 break;
1207
1208 #define OPTSET(bit) do {                                                \
1209         INP_WLOCK(inp);                                                 \
1210         if (optval)                                                     \
1211                 inp->inp_flags |= bit;                                  \
1212         else                                                            \
1213                 inp->inp_flags &= ~bit;                                 \
1214         INP_WUNLOCK(inp);                                               \
1215 } while (0)
1216
1217 #define OPTSET2(bit, val) do {                                          \
1218         INP_WLOCK(inp);                                                 \
1219         if (val)                                                        \
1220                 inp->inp_flags2 |= bit;                                 \
1221         else                                                            \
1222                 inp->inp_flags2 &= ~bit;                                \
1223         INP_WUNLOCK(inp);                                               \
1224 } while (0)
1225
1226                         case IP_RECVOPTS:
1227                                 OPTSET(INP_RECVOPTS);
1228                                 break;
1229
1230                         case IP_RECVRETOPTS:
1231                                 OPTSET(INP_RECVRETOPTS);
1232                                 break;
1233
1234                         case IP_RECVDSTADDR:
1235                                 OPTSET(INP_RECVDSTADDR);
1236                                 break;
1237
1238                         case IP_ORIGDSTADDR:
1239                                 OPTSET2(INP_ORIGDSTADDR, optval);
1240                                 break;
1241
1242                         case IP_RECVTTL:
1243                                 OPTSET(INP_RECVTTL);
1244                                 break;
1245
1246                         case IP_RECVIF:
1247                                 OPTSET(INP_RECVIF);
1248                                 break;
1249
1250                         case IP_ONESBCAST:
1251                                 OPTSET(INP_ONESBCAST);
1252                                 break;
1253                         case IP_DONTFRAG:
1254                                 OPTSET(INP_DONTFRAG);
1255                                 break;
1256                         case IP_BINDANY:
1257                                 OPTSET(INP_BINDANY);
1258                                 break;
1259                         case IP_RECVTOS:
1260                                 OPTSET(INP_RECVTOS);
1261                                 break;
1262                         case IP_RECVFLOWID:
1263                                 OPTSET2(INP_RECVFLOWID, optval);
1264                                 break;
1265 #ifdef RSS
1266                         case IP_RECVRSSBUCKETID:
1267                                 OPTSET2(INP_RECVRSSBUCKETID, optval);
1268                                 break;
1269 #endif
1270                         case IP_VLAN_PCP:
1271                                 if ((optval >= -1) && (optval <=
1272                                     (INP_2PCP_MASK >> INP_2PCP_SHIFT))) {
1273                                         if (optval == -1) {
1274                                                 INP_WLOCK(inp);
1275                                                 inp->inp_flags2 &=
1276                                                     ~(INP_2PCP_SET |
1277                                                       INP_2PCP_MASK);
1278                                                 INP_WUNLOCK(inp);
1279                                         } else {
1280                                                 INP_WLOCK(inp);
1281                                                 inp->inp_flags2 |=
1282                                                     INP_2PCP_SET;
1283                                                 inp->inp_flags2 &=
1284                                                     ~INP_2PCP_MASK;
1285                                                 inp->inp_flags2 |=
1286                                                     optval << INP_2PCP_SHIFT;
1287                                                 INP_WUNLOCK(inp);
1288                                         }
1289                                 } else
1290                                         error = EINVAL;
1291                                 break;
1292                         }
1293                         break;
1294 #undef OPTSET
1295 #undef OPTSET2
1296
1297                 /*
1298                  * Multicast socket options are processed by the in_mcast
1299                  * module.
1300                  */
1301                 case IP_MULTICAST_IF:
1302                 case IP_MULTICAST_VIF:
1303                 case IP_MULTICAST_TTL:
1304                 case IP_MULTICAST_LOOP:
1305                 case IP_ADD_MEMBERSHIP:
1306                 case IP_DROP_MEMBERSHIP:
1307                 case IP_ADD_SOURCE_MEMBERSHIP:
1308                 case IP_DROP_SOURCE_MEMBERSHIP:
1309                 case IP_BLOCK_SOURCE:
1310                 case IP_UNBLOCK_SOURCE:
1311                 case IP_MSFILTER:
1312                 case MCAST_JOIN_GROUP:
1313                 case MCAST_LEAVE_GROUP:
1314                 case MCAST_JOIN_SOURCE_GROUP:
1315                 case MCAST_LEAVE_SOURCE_GROUP:
1316                 case MCAST_BLOCK_SOURCE:
1317                 case MCAST_UNBLOCK_SOURCE:
1318                         error = inp_setmoptions(inp, sopt);
1319                         break;
1320
1321                 case IP_PORTRANGE:
1322                         error = sooptcopyin(sopt, &optval, sizeof optval,
1323                                             sizeof optval);
1324                         if (error)
1325                                 break;
1326
1327                         INP_WLOCK(inp);
1328                         switch (optval) {
1329                         case IP_PORTRANGE_DEFAULT:
1330                                 inp->inp_flags &= ~(INP_LOWPORT);
1331                                 inp->inp_flags &= ~(INP_HIGHPORT);
1332                                 break;
1333
1334                         case IP_PORTRANGE_HIGH:
1335                                 inp->inp_flags &= ~(INP_LOWPORT);
1336                                 inp->inp_flags |= INP_HIGHPORT;
1337                                 break;
1338
1339                         case IP_PORTRANGE_LOW:
1340                                 inp->inp_flags &= ~(INP_HIGHPORT);
1341                                 inp->inp_flags |= INP_LOWPORT;
1342                                 break;
1343
1344                         default:
1345                                 error = EINVAL;
1346                                 break;
1347                         }
1348                         INP_WUNLOCK(inp);
1349                         break;
1350
1351 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1352                 case IP_IPSEC_POLICY:
1353                         if (IPSEC_ENABLED(ipv4)) {
1354                                 error = IPSEC_PCBCTL(ipv4, inp, sopt);
1355                                 break;
1356                         }
1357                         /* FALLTHROUGH */
1358 #endif /* IPSEC */
1359
1360                 default:
1361                         error = ENOPROTOOPT;
1362                         break;
1363                 }
1364                 break;
1365
1366         case SOPT_GET:
1367                 switch (sopt->sopt_name) {
1368                 case IP_OPTIONS:
1369                 case IP_RETOPTS:
1370                         INP_RLOCK(inp);
1371                         if (inp->inp_options) {
1372                                 struct mbuf *options;
1373
1374                                 options = m_copym(inp->inp_options, 0,
1375                                     M_COPYALL, M_NOWAIT);
1376                                 INP_RUNLOCK(inp);
1377                                 if (options != NULL) {
1378                                         error = sooptcopyout(sopt,
1379                                                              mtod(options, char *),
1380                                                              options->m_len);
1381                                         m_freem(options);
1382                                 } else
1383                                         error = ENOMEM;
1384                         } else {
1385                                 INP_RUNLOCK(inp);
1386                                 sopt->sopt_valsize = 0;
1387                         }
1388                         break;
1389
1390                 case IP_TOS:
1391                 case IP_TTL:
1392                 case IP_MINTTL:
1393                 case IP_RECVOPTS:
1394                 case IP_RECVRETOPTS:
1395                 case IP_ORIGDSTADDR:
1396                 case IP_RECVDSTADDR:
1397                 case IP_RECVTTL:
1398                 case IP_RECVIF:
1399                 case IP_PORTRANGE:
1400                 case IP_ONESBCAST:
1401                 case IP_DONTFRAG:
1402                 case IP_BINDANY:
1403                 case IP_RECVTOS:
1404                 case IP_FLOWID:
1405                 case IP_FLOWTYPE:
1406                 case IP_RECVFLOWID:
1407 #ifdef  RSS
1408                 case IP_RSSBUCKETID:
1409                 case IP_RECVRSSBUCKETID:
1410 #endif
1411                 case IP_VLAN_PCP:
1412                         switch (sopt->sopt_name) {
1413                         case IP_TOS:
1414                                 optval = inp->inp_ip_tos;
1415                                 break;
1416
1417                         case IP_TTL:
1418                                 optval = inp->inp_ip_ttl;
1419                                 break;
1420
1421                         case IP_MINTTL:
1422                                 optval = inp->inp_ip_minttl;
1423                                 break;
1424
1425 #define OPTBIT(bit)     (inp->inp_flags & bit ? 1 : 0)
1426 #define OPTBIT2(bit)    (inp->inp_flags2 & bit ? 1 : 0)
1427
1428                         case IP_RECVOPTS:
1429                                 optval = OPTBIT(INP_RECVOPTS);
1430                                 break;
1431
1432                         case IP_RECVRETOPTS:
1433                                 optval = OPTBIT(INP_RECVRETOPTS);
1434                                 break;
1435
1436                         case IP_RECVDSTADDR:
1437                                 optval = OPTBIT(INP_RECVDSTADDR);
1438                                 break;
1439
1440                         case IP_ORIGDSTADDR:
1441                                 optval = OPTBIT2(INP_ORIGDSTADDR);
1442                                 break;
1443
1444                         case IP_RECVTTL:
1445                                 optval = OPTBIT(INP_RECVTTL);
1446                                 break;
1447
1448                         case IP_RECVIF:
1449                                 optval = OPTBIT(INP_RECVIF);
1450                                 break;
1451
1452                         case IP_PORTRANGE:
1453                                 if (inp->inp_flags & INP_HIGHPORT)
1454                                         optval = IP_PORTRANGE_HIGH;
1455                                 else if (inp->inp_flags & INP_LOWPORT)
1456                                         optval = IP_PORTRANGE_LOW;
1457                                 else
1458                                         optval = 0;
1459                                 break;
1460
1461                         case IP_ONESBCAST:
1462                                 optval = OPTBIT(INP_ONESBCAST);
1463                                 break;
1464                         case IP_DONTFRAG:
1465                                 optval = OPTBIT(INP_DONTFRAG);
1466                                 break;
1467                         case IP_BINDANY:
1468                                 optval = OPTBIT(INP_BINDANY);
1469                                 break;
1470                         case IP_RECVTOS:
1471                                 optval = OPTBIT(INP_RECVTOS);
1472                                 break;
1473                         case IP_FLOWID:
1474                                 optval = inp->inp_flowid;
1475                                 break;
1476                         case IP_FLOWTYPE:
1477                                 optval = inp->inp_flowtype;
1478                                 break;
1479                         case IP_RECVFLOWID:
1480                                 optval = OPTBIT2(INP_RECVFLOWID);
1481                                 break;
1482 #ifdef  RSS
1483                         case IP_RSSBUCKETID:
1484                                 retval = rss_hash2bucket(inp->inp_flowid,
1485                                     inp->inp_flowtype,
1486                                     &rss_bucket);
1487                                 if (retval == 0)
1488                                         optval = rss_bucket;
1489                                 else
1490                                         error = EINVAL;
1491                                 break;
1492                         case IP_RECVRSSBUCKETID:
1493                                 optval = OPTBIT2(INP_RECVRSSBUCKETID);
1494                                 break;
1495 #endif
1496                         case IP_VLAN_PCP:
1497                                 if (OPTBIT2(INP_2PCP_SET)) {
1498                                         optval = (inp->inp_flags2 &
1499                                             INP_2PCP_MASK) >> INP_2PCP_SHIFT;
1500                                 } else {
1501                                         optval = -1;
1502                                 }
1503                                 break;
1504                         }
1505                         error = sooptcopyout(sopt, &optval, sizeof optval);
1506                         break;
1507
1508                 /*
1509                  * Multicast socket options are processed by the in_mcast
1510                  * module.
1511                  */
1512                 case IP_MULTICAST_IF:
1513                 case IP_MULTICAST_VIF:
1514                 case IP_MULTICAST_TTL:
1515                 case IP_MULTICAST_LOOP:
1516                 case IP_MSFILTER:
1517                         error = inp_getmoptions(inp, sopt);
1518                         break;
1519
1520 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1521                 case IP_IPSEC_POLICY:
1522                         if (IPSEC_ENABLED(ipv4)) {
1523                                 error = IPSEC_PCBCTL(ipv4, inp, sopt);
1524                                 break;
1525                         }
1526                         /* FALLTHROUGH */
1527 #endif /* IPSEC */
1528
1529                 default:
1530                         error = ENOPROTOOPT;
1531                         break;
1532                 }
1533                 break;
1534         }
1535         return (error);
1536 }
1537
1538 /*
1539  * Routine called from ip_output() to loop back a copy of an IP multicast
1540  * packet to the input queue of a specified interface.  Note that this
1541  * calls the output routine of the loopback "driver", but with an interface
1542  * pointer that might NOT be a loopback interface -- evil, but easier than
1543  * replicating that code here.
1544  */
1545 static void
1546 ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen)
1547 {
1548         struct ip *ip;
1549         struct mbuf *copym;
1550
1551         /*
1552          * Make a deep copy of the packet because we're going to
1553          * modify the pack in order to generate checksums.
1554          */
1555         copym = m_dup(m, M_NOWAIT);
1556         if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen))
1557                 copym = m_pullup(copym, hlen);
1558         if (copym != NULL) {
1559                 /* If needed, compute the checksum and mark it as valid. */
1560                 if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1561                         in_delayed_cksum(copym);
1562                         copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1563                         copym->m_pkthdr.csum_flags |=
1564                             CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1565                         copym->m_pkthdr.csum_data = 0xffff;
1566                 }
1567                 /*
1568                  * We don't bother to fragment if the IP length is greater
1569                  * than the interface's MTU.  Can this possibly matter?
1570                  */
1571                 ip = mtod(copym, struct ip *);
1572                 ip->ip_sum = 0;
1573                 ip->ip_sum = in_cksum(copym, hlen);
1574                 if_simloop(ifp, copym, AF_INET, 0);
1575         }
1576 }