]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/ip_output.c
This commit was generated by cvs2svn to compensate for changes in r157184,
[FreeBSD/FreeBSD.git] / sys / netinet / ip_output.c
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *      @(#)ip_output.c 8.3 (Berkeley) 1/21/94
30  * $FreeBSD$
31  */
32
33 #include "opt_ipfw.h"
34 #include "opt_ipsec.h"
35 #include "opt_mac.h"
36 #include "opt_mbuf_stress_test.h"
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/mac.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/protosw.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48
49 #include <net/if.h>
50 #include <net/netisr.h>
51 #include <net/pfil.h>
52 #include <net/route.h>
53
54 #include <netinet/in.h>
55 #include <netinet/in_systm.h>
56 #include <netinet/ip.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/in_var.h>
59 #include <netinet/ip_var.h>
60 #include <netinet/ip_options.h>
61
62 #if defined(IPSEC) || defined(FAST_IPSEC)
63 #include <netinet/ip_ipsec.h>
64 #ifdef IPSEC
65 #include <netinet6/ipsec.h>
66 #endif
67 #ifdef FAST_IPSEC
68 #include <netipsec/ipsec.h>
69 #endif
70 #endif /*IPSEC*/
71
72 #include <machine/in_cksum.h>
73
74 static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
75
76 #define print_ip(x, a, y)        printf("%s %d.%d.%d.%d%s",\
77                                 x, (ntohl(a.s_addr)>>24)&0xFF,\
78                                   (ntohl(a.s_addr)>>16)&0xFF,\
79                                   (ntohl(a.s_addr)>>8)&0xFF,\
80                                   (ntohl(a.s_addr))&0xFF, y);
81
82 u_short ip_id;
83
84 #ifdef MBUF_STRESS_TEST
85 int mbuf_frag_size = 0;
86 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
87         &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
88 #endif
89
90 static struct ifnet *ip_multicast_if(struct in_addr *, int *);
91 static void     ip_mloopback
92         (struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
93 static int      ip_getmoptions(struct inpcb *, struct sockopt *);
94 static int      ip_setmoptions(struct inpcb *, struct sockopt *);
95
96
97 extern  struct protosw inetsw[];
98
99 /*
100  * IP output.  The packet in mbuf chain m contains a skeletal IP
101  * header (with len, off, ttl, proto, tos, src, dst).
102  * The mbuf chain containing the packet will be freed.
103  * The mbuf opt, if present, will not be freed.
104  * In the IP forwarding case, the packet will arrive with options already
105  * inserted, so must have a NULL opt pointer.
106  */
107 int
108 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro,
109         int flags, struct ip_moptions *imo, struct inpcb *inp)
110 {
111         struct ip *ip;
112         struct ifnet *ifp = NULL;       /* keep compiler happy */
113         struct mbuf *m0;
114         int hlen = sizeof (struct ip);
115         int len, error = 0;
116         struct sockaddr_in *dst = NULL; /* keep compiler happy */
117         struct in_ifaddr *ia = NULL;
118         int isbroadcast, sw_csum;
119         struct route iproute;
120         struct in_addr odst;
121 #ifdef IPFIREWALL_FORWARD
122         struct m_tag *fwd_tag = NULL;
123 #endif
124         M_ASSERTPKTHDR(m);
125
126         if (ro == NULL) {
127                 ro = &iproute;
128                 bzero(ro, sizeof (*ro));
129         }
130
131         if (inp != NULL)
132                 INP_LOCK_ASSERT(inp);
133
134         if (opt) {
135                 len = 0;
136                 m = ip_insertoptions(m, opt, &len);
137                 if (len != 0)
138                         hlen = len;
139         }
140         ip = mtod(m, struct ip *);
141
142         /*
143          * Fill in IP header.  If we are not allowing fragmentation,
144          * then the ip_id field is meaningless, but we don't set it
145          * to zero.  Doing so causes various problems when devices along
146          * the path (routers, load balancers, firewalls, etc.) illegally
147          * disable DF on our packet.  Note that a 16-bit counter
148          * will wrap around in less than 10 seconds at 100 Mbit/s on a
149          * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
150          * for Counting NATted Hosts", Proc. IMW'02, available at
151          * <http://www.research.att.com/~smb/papers/fnat.pdf>.
152          */
153         if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
154                 ip->ip_v = IPVERSION;
155                 ip->ip_hl = hlen >> 2;
156                 ip->ip_id = ip_newid();
157                 ipstat.ips_localout++;
158         } else {
159                 hlen = ip->ip_hl << 2;
160         }
161
162         dst = (struct sockaddr_in *)&ro->ro_dst;
163 again:
164         /*
165          * If there is a cached route,
166          * check that it is to the same destination
167          * and is still up.  If not, free it and try again.
168          * The address family should also be checked in case of sharing the
169          * cache with IPv6.
170          */
171         if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
172                           dst->sin_family != AF_INET ||
173                           dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
174                 RTFREE(ro->ro_rt);
175                 ro->ro_rt = (struct rtentry *)0;
176         }
177 #ifdef IPFIREWALL_FORWARD
178         if (ro->ro_rt == NULL && fwd_tag == NULL) {
179 #else
180         if (ro->ro_rt == NULL) {
181 #endif
182                 bzero(dst, sizeof(*dst));
183                 dst->sin_family = AF_INET;
184                 dst->sin_len = sizeof(*dst);
185                 dst->sin_addr = ip->ip_dst;
186         }
187         /*
188          * If routing to interface only,
189          * short circuit routing lookup.
190          */
191         if (flags & IP_ROUTETOIF) {
192                 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
193                     (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) {
194                         ipstat.ips_noroute++;
195                         error = ENETUNREACH;
196                         goto bad;
197                 }
198                 ifp = ia->ia_ifp;
199                 ip->ip_ttl = 1;
200                 isbroadcast = in_broadcast(dst->sin_addr, ifp);
201         } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
202             imo != NULL && imo->imo_multicast_ifp != NULL) {
203                 /*
204                  * Bypass the normal routing lookup for multicast
205                  * packets if the interface is specified.
206                  */
207                 ifp = imo->imo_multicast_ifp;
208                 IFP_TO_IA(ifp, ia);
209                 isbroadcast = 0;        /* fool gcc */
210         } else {
211                 /*
212                  * We want to do any cloning requested by the link layer,
213                  * as this is probably required in all cases for correct
214                  * operation (as it is for ARP).
215                  */
216                 if (ro->ro_rt == NULL)
217                         rtalloc_ign(ro, 0);
218                 if (ro->ro_rt == NULL) {
219                         ipstat.ips_noroute++;
220                         error = EHOSTUNREACH;
221                         goto bad;
222                 }
223                 ia = ifatoia(ro->ro_rt->rt_ifa);
224                 ifp = ro->ro_rt->rt_ifp;
225                 ro->ro_rt->rt_rmx.rmx_pksent++;
226                 if (ro->ro_rt->rt_flags & RTF_GATEWAY)
227                         dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
228                 if (ro->ro_rt->rt_flags & RTF_HOST)
229                         isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
230                 else
231                         isbroadcast = in_broadcast(dst->sin_addr, ifp);
232         }
233         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
234                 struct in_multi *inm;
235
236                 m->m_flags |= M_MCAST;
237                 /*
238                  * IP destination address is multicast.  Make sure "dst"
239                  * still points to the address in "ro".  (It may have been
240                  * changed to point to a gateway address, above.)
241                  */
242                 dst = (struct sockaddr_in *)&ro->ro_dst;
243                 /*
244                  * See if the caller provided any multicast options
245                  */
246                 if (imo != NULL) {
247                         ip->ip_ttl = imo->imo_multicast_ttl;
248                         if (imo->imo_multicast_vif != -1)
249                                 ip->ip_src.s_addr =
250                                     ip_mcast_src ?
251                                     ip_mcast_src(imo->imo_multicast_vif) :
252                                     INADDR_ANY;
253                 } else
254                         ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
255                 /*
256                  * Confirm that the outgoing interface supports multicast.
257                  */
258                 if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
259                         if ((ifp->if_flags & IFF_MULTICAST) == 0) {
260                                 ipstat.ips_noroute++;
261                                 error = ENETUNREACH;
262                                 goto bad;
263                         }
264                 }
265                 /*
266                  * If source address not specified yet, use address
267                  * of outgoing interface.
268                  */
269                 if (ip->ip_src.s_addr == INADDR_ANY) {
270                         /* Interface may have no addresses. */
271                         if (ia != NULL)
272                                 ip->ip_src = IA_SIN(ia)->sin_addr;
273                 }
274
275                 IN_MULTI_LOCK();
276                 IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
277                 if (inm != NULL &&
278                    (imo == NULL || imo->imo_multicast_loop)) {
279                         IN_MULTI_UNLOCK();
280                         /*
281                          * If we belong to the destination multicast group
282                          * on the outgoing interface, and the caller did not
283                          * forbid loopback, loop back a copy.
284                          */
285                         ip_mloopback(ifp, m, dst, hlen);
286                 }
287                 else {
288                         IN_MULTI_UNLOCK();
289                         /*
290                          * If we are acting as a multicast router, perform
291                          * multicast forwarding as if the packet had just
292                          * arrived on the interface to which we are about
293                          * to send.  The multicast forwarding function
294                          * recursively calls this function, using the
295                          * IP_FORWARDING flag to prevent infinite recursion.
296                          *
297                          * Multicasts that are looped back by ip_mloopback(),
298                          * above, will be forwarded by the ip_input() routine,
299                          * if necessary.
300                          */
301                         if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
302                                 /*
303                                  * If rsvp daemon is not running, do not
304                                  * set ip_moptions. This ensures that the packet
305                                  * is multicast and not just sent down one link
306                                  * as prescribed by rsvpd.
307                                  */
308                                 if (!rsvp_on)
309                                         imo = NULL;
310                                 if (ip_mforward &&
311                                     ip_mforward(ip, ifp, m, imo) != 0) {
312                                         m_freem(m);
313                                         goto done;
314                                 }
315                         }
316                 }
317
318                 /*
319                  * Multicasts with a time-to-live of zero may be looped-
320                  * back, above, but must not be transmitted on a network.
321                  * Also, multicasts addressed to the loopback interface
322                  * are not sent -- the above call to ip_mloopback() will
323                  * loop back a copy if this host actually belongs to the
324                  * destination group on the loopback interface.
325                  */
326                 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
327                         m_freem(m);
328                         goto done;
329                 }
330
331                 goto sendit;
332         }
333 #ifndef notdef
334         /*
335          * If the source address is not specified yet, use the address
336          * of the outoing interface.
337          */
338         if (ip->ip_src.s_addr == INADDR_ANY) {
339                 /* Interface may have no addresses. */
340                 if (ia != NULL) {
341                         ip->ip_src = IA_SIN(ia)->sin_addr;
342                 }
343         }
344 #endif /* notdef */
345         /*
346          * Verify that we have any chance at all of being able to queue the
347          * packet or packet fragments, unless ALTQ is enabled on the given
348          * interface in which case packetdrop should be done by queueing.
349          */
350 #ifdef ALTQ
351         if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
352             ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
353             ifp->if_snd.ifq_maxlen))
354 #else
355         if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
356             ifp->if_snd.ifq_maxlen)
357 #endif /* ALTQ */
358         {
359                 error = ENOBUFS;
360                 ipstat.ips_odropped++;
361                 ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
362                 goto bad;
363         }
364
365         /*
366          * Look for broadcast address and
367          * verify user is allowed to send
368          * such a packet.
369          */
370         if (isbroadcast) {
371                 if ((ifp->if_flags & IFF_BROADCAST) == 0) {
372                         error = EADDRNOTAVAIL;
373                         goto bad;
374                 }
375                 if ((flags & IP_ALLOWBROADCAST) == 0) {
376                         error = EACCES;
377                         goto bad;
378                 }
379                 /* don't allow broadcast messages to be fragmented */
380                 if (ip->ip_len > ifp->if_mtu) {
381                         error = EMSGSIZE;
382                         goto bad;
383                 }
384                 if (flags & IP_SENDONES)
385                         ip->ip_dst.s_addr = INADDR_BROADCAST;
386                 m->m_flags |= M_BCAST;
387         } else {
388                 m->m_flags &= ~M_BCAST;
389         }
390
391 sendit:
392 #if defined(IPSEC) || defined(FAST_IPSEC)
393         switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) {
394         case 1:
395                 goto bad;
396         case -1:
397                 goto done;
398         case 0:
399         default:
400                 break;  /* Continue with packet processing. */
401         }
402         /* Update variables that are affected by ipsec4_output(). */
403         ip = mtod(m, struct ip *);
404         hlen = ip->ip_hl << 2;
405 #endif /* IPSEC */
406
407         /* Jump over all PFIL processing if hooks are not active. */
408         if (!PFIL_HOOKED(&inet_pfil_hook))
409                 goto passout;
410
411         /* Run through list of hooks for output packets. */
412         odst.s_addr = ip->ip_dst.s_addr;
413         error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
414         if (error != 0 || m == NULL)
415                 goto done;
416
417         ip = mtod(m, struct ip *);
418
419         /* See if destination IP address was changed by packet filter. */
420         if (odst.s_addr != ip->ip_dst.s_addr) {
421                 m->m_flags |= M_SKIP_FIREWALL;
422                 /* If destination is now ourself drop to ip_input(). */
423                 if (in_localip(ip->ip_dst)) {
424                         m->m_flags |= M_FASTFWD_OURS;
425                         if (m->m_pkthdr.rcvif == NULL)
426                                 m->m_pkthdr.rcvif = loif;
427                         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
428                                 m->m_pkthdr.csum_flags |=
429                                     CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
430                                 m->m_pkthdr.csum_data = 0xffff;
431                         }
432                         m->m_pkthdr.csum_flags |=
433                             CSUM_IP_CHECKED | CSUM_IP_VALID;
434
435                         error = netisr_queue(NETISR_IP, m);
436                         goto done;
437                 } else
438                         goto again;     /* Redo the routing table lookup. */
439         }
440
441 #ifdef IPFIREWALL_FORWARD
442         /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
443         if (m->m_flags & M_FASTFWD_OURS) {
444                 if (m->m_pkthdr.rcvif == NULL)
445                         m->m_pkthdr.rcvif = loif;
446                 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
447                         m->m_pkthdr.csum_flags |=
448                             CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
449                         m->m_pkthdr.csum_data = 0xffff;
450                 }
451                 m->m_pkthdr.csum_flags |=
452                             CSUM_IP_CHECKED | CSUM_IP_VALID;
453
454                 error = netisr_queue(NETISR_IP, m);
455                 goto done;
456         }
457         /* Or forward to some other address? */
458         fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
459         if (fwd_tag) {
460 #ifndef IPFIREWALL_FORWARD_EXTENDED
461                 if (!in_localip(ip->ip_src) && !in_localaddr(ip->ip_dst)) {
462 #endif
463                         dst = (struct sockaddr_in *)&ro->ro_dst;
464                         bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
465                         m->m_flags |= M_SKIP_FIREWALL;
466                         m_tag_delete(m, fwd_tag);
467                         goto again;
468 #ifndef IPFIREWALL_FORWARD_EXTENDED
469                 } else {
470                         m_tag_delete(m, fwd_tag);
471                         /* Continue. */
472                 }
473 #endif
474         }
475 #endif /* IPFIREWALL_FORWARD */
476
477 passout:
478         /* 127/8 must not appear on wire - RFC1122. */
479         if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
480             (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
481                 if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
482                         ipstat.ips_badaddr++;
483                         error = EADDRNOTAVAIL;
484                         goto bad;
485                 }
486         }
487
488         m->m_pkthdr.csum_flags |= CSUM_IP;
489         sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
490         if (sw_csum & CSUM_DELAY_DATA) {
491                 in_delayed_cksum(m);
492                 sw_csum &= ~CSUM_DELAY_DATA;
493         }
494         m->m_pkthdr.csum_flags &= ifp->if_hwassist;
495
496         /*
497          * If small enough for interface, or the interface will take
498          * care of the fragmentation for us, can just send directly.
499          */
500         if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT &&
501             ((ip->ip_off & IP_DF) == 0))) {
502                 ip->ip_len = htons(ip->ip_len);
503                 ip->ip_off = htons(ip->ip_off);
504                 ip->ip_sum = 0;
505                 if (sw_csum & CSUM_DELAY_IP)
506                         ip->ip_sum = in_cksum(m, hlen);
507
508                 /* Record statistics for this interface address. */
509                 if (!(flags & IP_FORWARDING) && ia) {
510                         ia->ia_ifa.if_opackets++;
511                         ia->ia_ifa.if_obytes += m->m_pkthdr.len;
512                 }
513 #ifdef IPSEC
514                 /* clean ipsec history once it goes out of the node */
515                 ipsec_delaux(m);
516 #endif
517 #ifdef MBUF_STRESS_TEST
518                 if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
519                         m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
520 #endif
521                 /*
522                  * Reset layer specific mbuf flags
523                  * to avoid confusing lower layers.
524                  */
525                 m->m_flags &= ~(M_PROTOFLAGS);
526
527                 error = (*ifp->if_output)(ifp, m,
528                                 (struct sockaddr *)dst, ro->ro_rt);
529                 goto done;
530         }
531
532         if (ip->ip_off & IP_DF) {
533                 error = EMSGSIZE;
534                 /*
535                  * This case can happen if the user changed the MTU
536                  * of an interface after enabling IP on it.  Because
537                  * most netifs don't keep track of routes pointing to
538                  * them, there is no way for one to update all its
539                  * routes when the MTU is changed.
540                  */
541                 if (ro != NULL &&
542                     (ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
543                     (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
544                         ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
545                 }
546                 ipstat.ips_cantfrag++;
547                 goto bad;
548         }
549
550         /*
551          * Too large for interface; fragment if possible. If successful,
552          * on return, m will point to a list of packets to be sent.
553          */
554         error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum);
555         if (error)
556                 goto bad;
557         for (; m; m = m0) {
558                 m0 = m->m_nextpkt;
559                 m->m_nextpkt = 0;
560 #ifdef IPSEC
561                 /* clean ipsec history once it goes out of the node */
562                 ipsec_delaux(m);
563 #endif
564                 if (error == 0) {
565                         /* Record statistics for this interface address. */
566                         if (ia != NULL) {
567                                 ia->ia_ifa.if_opackets++;
568                                 ia->ia_ifa.if_obytes += m->m_pkthdr.len;
569                         }
570                         /*
571                          * Reset layer specific mbuf flags
572                          * to avoid confusing upper layers.
573                          */
574                         m->m_flags &= ~(M_PROTOFLAGS);
575
576                         error = (*ifp->if_output)(ifp, m,
577                             (struct sockaddr *)dst, ro->ro_rt);
578                 } else
579                         m_freem(m);
580         }
581
582         if (error == 0)
583                 ipstat.ips_fragmented++;
584
585 done:
586         if (ro == &iproute && ro->ro_rt) {
587                 RTFREE(ro->ro_rt);
588         }
589         return (error);
590 bad:
591         m_freem(m);
592         goto done;
593 }
594
595 /*
596  * Create a chain of fragments which fit the given mtu. m_frag points to the
597  * mbuf to be fragmented; on return it points to the chain with the fragments.
598  * Return 0 if no error. If error, m_frag may contain a partially built
599  * chain of fragments that should be freed by the caller.
600  *
601  * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
602  * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
603  */
604 int
605 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
606             u_long if_hwassist_flags, int sw_csum)
607 {
608         int error = 0;
609         int hlen = ip->ip_hl << 2;
610         int len = (mtu - hlen) & ~7;    /* size of payload in each fragment */
611         int off;
612         struct mbuf *m0 = *m_frag;      /* the original packet          */
613         int firstlen;
614         struct mbuf **mnext;
615         int nfrags;
616
617         if (ip->ip_off & IP_DF) {       /* Fragmentation not allowed */
618                 ipstat.ips_cantfrag++;
619                 return EMSGSIZE;
620         }
621
622         /*
623          * Must be able to put at least 8 bytes per fragment.
624          */
625         if (len < 8)
626                 return EMSGSIZE;
627
628         /*
629          * If the interface will not calculate checksums on
630          * fragmented packets, then do it here.
631          */
632         if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
633             (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
634                 in_delayed_cksum(m0);
635                 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
636         }
637
638         if (len > PAGE_SIZE) {
639                 /* 
640                  * Fragment large datagrams such that each segment 
641                  * contains a multiple of PAGE_SIZE amount of data, 
642                  * plus headers. This enables a receiver to perform 
643                  * page-flipping zero-copy optimizations.
644                  *
645                  * XXX When does this help given that sender and receiver
646                  * could have different page sizes, and also mtu could
647                  * be less than the receiver's page size ?
648                  */
649                 int newlen;
650                 struct mbuf *m;
651
652                 for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
653                         off += m->m_len;
654
655                 /*
656                  * firstlen (off - hlen) must be aligned on an 
657                  * 8-byte boundary
658                  */
659                 if (off < hlen)
660                         goto smart_frag_failure;
661                 off = ((off - hlen) & ~7) + hlen;
662                 newlen = (~PAGE_MASK) & mtu;
663                 if ((newlen + sizeof (struct ip)) > mtu) {
664                         /* we failed, go back the default */
665 smart_frag_failure:
666                         newlen = len;
667                         off = hlen + len;
668                 }
669                 len = newlen;
670
671         } else {
672                 off = hlen + len;
673         }
674
675         firstlen = off - hlen;
676         mnext = &m0->m_nextpkt;         /* pointer to next packet */
677
678         /*
679          * Loop through length of segment after first fragment,
680          * make new header and copy data of each part and link onto chain.
681          * Here, m0 is the original packet, m is the fragment being created.
682          * The fragments are linked off the m_nextpkt of the original
683          * packet, which after processing serves as the first fragment.
684          */
685         for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
686                 struct ip *mhip;        /* ip header on the fragment */
687                 struct mbuf *m;
688                 int mhlen = sizeof (struct ip);
689
690                 MGETHDR(m, M_DONTWAIT, MT_DATA);
691                 if (m == NULL) {
692                         error = ENOBUFS;
693                         ipstat.ips_odropped++;
694                         goto done;
695                 }
696                 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
697                 /*
698                  * In the first mbuf, leave room for the link header, then
699                  * copy the original IP header including options. The payload
700                  * goes into an additional mbuf chain returned by m_copy().
701                  */
702                 m->m_data += max_linkhdr;
703                 mhip = mtod(m, struct ip *);
704                 *mhip = *ip;
705                 if (hlen > sizeof (struct ip)) {
706                         mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
707                         mhip->ip_v = IPVERSION;
708                         mhip->ip_hl = mhlen >> 2;
709                 }
710                 m->m_len = mhlen;
711                 /* XXX do we need to add ip->ip_off below ? */
712                 mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
713                 if (off + len >= ip->ip_len) {  /* last fragment */
714                         len = ip->ip_len - off;
715                         m->m_flags |= M_LASTFRAG;
716                 } else
717                         mhip->ip_off |= IP_MF;
718                 mhip->ip_len = htons((u_short)(len + mhlen));
719                 m->m_next = m_copy(m0, off, len);
720                 if (m->m_next == NULL) {        /* copy failed */
721                         m_free(m);
722                         error = ENOBUFS;        /* ??? */
723                         ipstat.ips_odropped++;
724                         goto done;
725                 }
726                 m->m_pkthdr.len = mhlen + len;
727                 m->m_pkthdr.rcvif = NULL;
728 #ifdef MAC
729                 mac_create_fragment(m0, m);
730 #endif
731                 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
732                 mhip->ip_off = htons(mhip->ip_off);
733                 mhip->ip_sum = 0;
734                 if (sw_csum & CSUM_DELAY_IP)
735                         mhip->ip_sum = in_cksum(m, mhlen);
736                 *mnext = m;
737                 mnext = &m->m_nextpkt;
738         }
739         ipstat.ips_ofragments += nfrags;
740
741         /* set first marker for fragment chain */
742         m0->m_flags |= M_FIRSTFRAG | M_FRAG;
743         m0->m_pkthdr.csum_data = nfrags;
744
745         /*
746          * Update first fragment by trimming what's been copied out
747          * and updating header.
748          */
749         m_adj(m0, hlen + firstlen - ip->ip_len);
750         m0->m_pkthdr.len = hlen + firstlen;
751         ip->ip_len = htons((u_short)m0->m_pkthdr.len);
752         ip->ip_off |= IP_MF;
753         ip->ip_off = htons(ip->ip_off);
754         ip->ip_sum = 0;
755         if (sw_csum & CSUM_DELAY_IP)
756                 ip->ip_sum = in_cksum(m0, hlen);
757
758 done:
759         *m_frag = m0;
760         return error;
761 }
762
763 void
764 in_delayed_cksum(struct mbuf *m)
765 {
766         struct ip *ip;
767         u_short csum, offset;
768
769         ip = mtod(m, struct ip *);
770         offset = ip->ip_hl << 2 ;
771         csum = in_cksum_skip(m, ip->ip_len, offset);
772         if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
773                 csum = 0xffff;
774         offset += m->m_pkthdr.csum_data;        /* checksum offset */
775
776         if (offset + sizeof(u_short) > m->m_len) {
777                 printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
778                     m->m_len, offset, ip->ip_p);
779                 /*
780                  * XXX
781                  * this shouldn't happen, but if it does, the
782                  * correct behavior may be to insert the checksum
783                  * in the appropriate next mbuf in the chain.
784                  */
785                 return;
786         }
787         *(u_short *)(m->m_data + offset) = csum;
788 }
789
790 /*
791  * IP socket option processing.
792  */
793 int
794 ip_ctloutput(so, sopt)
795         struct socket *so;
796         struct sockopt *sopt;
797 {
798         struct  inpcb *inp = sotoinpcb(so);
799         int     error, optval;
800
801         error = optval = 0;
802         if (sopt->sopt_level != IPPROTO_IP) {
803                 return (EINVAL);
804         }
805
806         switch (sopt->sopt_dir) {
807         case SOPT_SET:
808                 switch (sopt->sopt_name) {
809                 case IP_OPTIONS:
810 #ifdef notyet
811                 case IP_RETOPTS:
812 #endif
813                 {
814                         struct mbuf *m;
815                         if (sopt->sopt_valsize > MLEN) {
816                                 error = EMSGSIZE;
817                                 break;
818                         }
819                         MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
820                         if (m == NULL) {
821                                 error = ENOBUFS;
822                                 break;
823                         }
824                         m->m_len = sopt->sopt_valsize;
825                         error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
826                                             m->m_len);
827                         INP_LOCK(inp);
828                         error = ip_pcbopts(inp, sopt->sopt_name, m);
829                         INP_UNLOCK(inp);
830                         return (error);
831                 }
832
833                 case IP_TOS:
834                 case IP_TTL:
835                 case IP_MINTTL:
836                 case IP_RECVOPTS:
837                 case IP_RECVRETOPTS:
838                 case IP_RECVDSTADDR:
839                 case IP_RECVTTL:
840                 case IP_RECVIF:
841                 case IP_FAITH:
842                 case IP_ONESBCAST:
843                 case IP_DONTFRAG:
844                         error = sooptcopyin(sopt, &optval, sizeof optval,
845                                             sizeof optval);
846                         if (error)
847                                 break;
848
849                         switch (sopt->sopt_name) {
850                         case IP_TOS:
851                                 inp->inp_ip_tos = optval;
852                                 break;
853
854                         case IP_TTL:
855                                 inp->inp_ip_ttl = optval;
856                                 break;
857
858                         case IP_MINTTL:
859                                 if (optval > 0 && optval <= MAXTTL)
860                                         inp->inp_ip_minttl = optval;
861                                 else
862                                         error = EINVAL;
863                                 break;
864
865 #define OPTSET(bit) do {                                                \
866         INP_LOCK(inp);                                                  \
867         if (optval)                                                     \
868                 inp->inp_flags |= bit;                                  \
869         else                                                            \
870                 inp->inp_flags &= ~bit;                                 \
871         INP_UNLOCK(inp);                                                \
872 } while (0)
873
874                         case IP_RECVOPTS:
875                                 OPTSET(INP_RECVOPTS);
876                                 break;
877
878                         case IP_RECVRETOPTS:
879                                 OPTSET(INP_RECVRETOPTS);
880                                 break;
881
882                         case IP_RECVDSTADDR:
883                                 OPTSET(INP_RECVDSTADDR);
884                                 break;
885
886                         case IP_RECVTTL:
887                                 OPTSET(INP_RECVTTL);
888                                 break;
889
890                         case IP_RECVIF:
891                                 OPTSET(INP_RECVIF);
892                                 break;
893
894                         case IP_FAITH:
895                                 OPTSET(INP_FAITH);
896                                 break;
897
898                         case IP_ONESBCAST:
899                                 OPTSET(INP_ONESBCAST);
900                                 break;
901                         case IP_DONTFRAG:
902                                 OPTSET(INP_DONTFRAG);
903                                 break;
904                         }
905                         break;
906 #undef OPTSET
907
908                 case IP_MULTICAST_IF:
909                 case IP_MULTICAST_VIF:
910                 case IP_MULTICAST_TTL:
911                 case IP_MULTICAST_LOOP:
912                 case IP_ADD_MEMBERSHIP:
913                 case IP_DROP_MEMBERSHIP:
914                         error = ip_setmoptions(inp, sopt);
915                         break;
916
917                 case IP_PORTRANGE:
918                         error = sooptcopyin(sopt, &optval, sizeof optval,
919                                             sizeof optval);
920                         if (error)
921                                 break;
922
923                         INP_LOCK(inp);
924                         switch (optval) {
925                         case IP_PORTRANGE_DEFAULT:
926                                 inp->inp_flags &= ~(INP_LOWPORT);
927                                 inp->inp_flags &= ~(INP_HIGHPORT);
928                                 break;
929
930                         case IP_PORTRANGE_HIGH:
931                                 inp->inp_flags &= ~(INP_LOWPORT);
932                                 inp->inp_flags |= INP_HIGHPORT;
933                                 break;
934
935                         case IP_PORTRANGE_LOW:
936                                 inp->inp_flags &= ~(INP_HIGHPORT);
937                                 inp->inp_flags |= INP_LOWPORT;
938                                 break;
939
940                         default:
941                                 error = EINVAL;
942                                 break;
943                         }
944                         INP_UNLOCK(inp);
945                         break;
946
947 #if defined(IPSEC) || defined(FAST_IPSEC)
948                 case IP_IPSEC_POLICY:
949                 {
950                         caddr_t req;
951                         size_t len = 0;
952                         int priv;
953                         struct mbuf *m;
954                         int optname;
955
956                         if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
957                                 break;
958                         if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
959                                 break;
960                         priv = (sopt->sopt_td != NULL &&
961                                 suser(sopt->sopt_td) != 0) ? 0 : 1;
962                         req = mtod(m, caddr_t);
963                         len = m->m_len;
964                         optname = sopt->sopt_name;
965                         error = ipsec4_set_policy(inp, optname, req, len, priv);
966                         m_freem(m);
967                         break;
968                 }
969 #endif /*IPSEC*/
970
971                 default:
972                         error = ENOPROTOOPT;
973                         break;
974                 }
975                 break;
976
977         case SOPT_GET:
978                 switch (sopt->sopt_name) {
979                 case IP_OPTIONS:
980                 case IP_RETOPTS:
981                         if (inp->inp_options)
982                                 error = sooptcopyout(sopt, 
983                                                      mtod(inp->inp_options,
984                                                           char *),
985                                                      inp->inp_options->m_len);
986                         else
987                                 sopt->sopt_valsize = 0;
988                         break;
989
990                 case IP_TOS:
991                 case IP_TTL:
992                 case IP_MINTTL:
993                 case IP_RECVOPTS:
994                 case IP_RECVRETOPTS:
995                 case IP_RECVDSTADDR:
996                 case IP_RECVTTL:
997                 case IP_RECVIF:
998                 case IP_PORTRANGE:
999                 case IP_FAITH:
1000                 case IP_ONESBCAST:
1001                 case IP_DONTFRAG:
1002                         switch (sopt->sopt_name) {
1003
1004                         case IP_TOS:
1005                                 optval = inp->inp_ip_tos;
1006                                 break;
1007
1008                         case IP_TTL:
1009                                 optval = inp->inp_ip_ttl;
1010                                 break;
1011
1012                         case IP_MINTTL:
1013                                 optval = inp->inp_ip_minttl;
1014                                 break;
1015
1016 #define OPTBIT(bit)     (inp->inp_flags & bit ? 1 : 0)
1017
1018                         case IP_RECVOPTS:
1019                                 optval = OPTBIT(INP_RECVOPTS);
1020                                 break;
1021
1022                         case IP_RECVRETOPTS:
1023                                 optval = OPTBIT(INP_RECVRETOPTS);
1024                                 break;
1025
1026                         case IP_RECVDSTADDR:
1027                                 optval = OPTBIT(INP_RECVDSTADDR);
1028                                 break;
1029
1030                         case IP_RECVTTL:
1031                                 optval = OPTBIT(INP_RECVTTL);
1032                                 break;
1033
1034                         case IP_RECVIF:
1035                                 optval = OPTBIT(INP_RECVIF);
1036                                 break;
1037
1038                         case IP_PORTRANGE:
1039                                 if (inp->inp_flags & INP_HIGHPORT)
1040                                         optval = IP_PORTRANGE_HIGH;
1041                                 else if (inp->inp_flags & INP_LOWPORT)
1042                                         optval = IP_PORTRANGE_LOW;
1043                                 else
1044                                         optval = 0;
1045                                 break;
1046
1047                         case IP_FAITH:
1048                                 optval = OPTBIT(INP_FAITH);
1049                                 break;
1050
1051                         case IP_ONESBCAST:
1052                                 optval = OPTBIT(INP_ONESBCAST);
1053                                 break;
1054                         case IP_DONTFRAG:
1055                                 optval = OPTBIT(INP_DONTFRAG);
1056                                 break;
1057                         }
1058                         error = sooptcopyout(sopt, &optval, sizeof optval);
1059                         break;
1060
1061                 case IP_MULTICAST_IF:
1062                 case IP_MULTICAST_VIF:
1063                 case IP_MULTICAST_TTL:
1064                 case IP_MULTICAST_LOOP:
1065                 case IP_ADD_MEMBERSHIP:
1066                 case IP_DROP_MEMBERSHIP:
1067                         error = ip_getmoptions(inp, sopt);
1068                         break;
1069
1070 #if defined(IPSEC) || defined(FAST_IPSEC)
1071                 case IP_IPSEC_POLICY:
1072                 {
1073                         struct mbuf *m = NULL;
1074                         caddr_t req = NULL;
1075                         size_t len = 0;
1076
1077                         if (m != 0) {
1078                                 req = mtod(m, caddr_t);
1079                                 len = m->m_len;
1080                         }
1081                         error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
1082                         if (error == 0)
1083                                 error = soopt_mcopyout(sopt, m); /* XXX */
1084                         if (error == 0)
1085                                 m_freem(m);
1086                         break;
1087                 }
1088 #endif /*IPSEC*/
1089
1090                 default:
1091                         error = ENOPROTOOPT;
1092                         break;
1093                 }
1094                 break;
1095         }
1096         return (error);
1097 }
1098
1099 /*
1100  * XXX
1101  * The whole multicast option thing needs to be re-thought.
1102  * Several of these options are equally applicable to non-multicast
1103  * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
1104  * standard option (IP_TTL).
1105  */
1106
1107 /*
1108  * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1109  */
1110 static struct ifnet *
1111 ip_multicast_if(a, ifindexp)
1112         struct in_addr *a;
1113         int *ifindexp;
1114 {
1115         int ifindex;
1116         struct ifnet *ifp;
1117
1118         if (ifindexp)
1119                 *ifindexp = 0;
1120         if (ntohl(a->s_addr) >> 24 == 0) {
1121                 ifindex = ntohl(a->s_addr) & 0xffffff;
1122                 if (ifindex < 0 || if_index < ifindex)
1123                         return NULL;
1124                 ifp = ifnet_byindex(ifindex);
1125                 if (ifindexp)
1126                         *ifindexp = ifindex;
1127         } else {
1128                 INADDR_TO_IFP(*a, ifp);
1129         }
1130         return ifp;
1131 }
1132
1133 /*
1134  * Given an inpcb, return its multicast options structure pointer.  Accepts
1135  * an unlocked inpcb pointer, but will return it locked.  May sleep.
1136  */
1137 static struct ip_moptions *
1138 ip_findmoptions(struct inpcb *inp)
1139 {
1140         struct ip_moptions *imo;
1141
1142         INP_LOCK(inp);
1143         if (inp->inp_moptions != NULL)
1144                 return (inp->inp_moptions);
1145
1146         INP_UNLOCK(inp);
1147
1148         imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK);
1149
1150         imo->imo_multicast_ifp = NULL;
1151         imo->imo_multicast_addr.s_addr = INADDR_ANY;
1152         imo->imo_multicast_vif = -1;
1153         imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1154         imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1155         imo->imo_num_memberships = 0;
1156
1157         INP_LOCK(inp);
1158         if (inp->inp_moptions != NULL) {
1159                 free(imo, M_IPMOPTS);
1160                 return (inp->inp_moptions);
1161         }
1162         inp->inp_moptions = imo;
1163         return (imo);
1164 }
1165
1166 /*
1167  * Set the IP multicast options in response to user setsockopt().
1168  */
1169 static int
1170 ip_setmoptions(struct inpcb *inp, struct sockopt *sopt)
1171 {
1172         int error = 0;
1173         int i;
1174         struct in_addr addr;
1175         struct ip_mreq mreq;
1176         struct ifnet *ifp;
1177         struct ip_moptions *imo;
1178         struct route ro;
1179         struct sockaddr_in *dst;
1180         int ifindex;
1181         int s;
1182
1183         switch (sopt->sopt_name) {
1184         /* store an index number for the vif you wanna use in the send */
1185         case IP_MULTICAST_VIF:
1186                 if (legal_vif_num == 0) {
1187                         error = EOPNOTSUPP;
1188                         break;
1189                 }
1190                 error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
1191                 if (error)
1192                         break;
1193                 if (!legal_vif_num(i) && (i != -1)) {
1194                         error = EINVAL;
1195                         break;
1196                 }
1197                 imo = ip_findmoptions(inp);
1198                 imo->imo_multicast_vif = i;
1199                 INP_UNLOCK(inp);
1200                 break;
1201
1202         case IP_MULTICAST_IF:
1203                 /*
1204                  * Select the interface for outgoing multicast packets.
1205                  */
1206                 error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr);
1207                 if (error)
1208                         break;
1209                 /*
1210                  * INADDR_ANY is used to remove a previous selection.
1211                  * When no interface is selected, a default one is
1212                  * chosen every time a multicast packet is sent.
1213                  */
1214                 imo = ip_findmoptions(inp);
1215                 if (addr.s_addr == INADDR_ANY) {
1216                         imo->imo_multicast_ifp = NULL;
1217                         INP_UNLOCK(inp);
1218                         break;
1219                 }
1220                 /*
1221                  * The selected interface is identified by its local
1222                  * IP address.  Find the interface and confirm that
1223                  * it supports multicasting.
1224                  */
1225                 s = splimp();
1226                 ifp = ip_multicast_if(&addr, &ifindex);
1227                 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1228                         INP_UNLOCK(inp);
1229                         splx(s);
1230                         error = EADDRNOTAVAIL;
1231                         break;
1232                 }
1233                 imo->imo_multicast_ifp = ifp;
1234                 if (ifindex)
1235                         imo->imo_multicast_addr = addr;
1236                 else
1237                         imo->imo_multicast_addr.s_addr = INADDR_ANY;
1238                 INP_UNLOCK(inp);
1239                 splx(s);
1240                 break;
1241
1242         case IP_MULTICAST_TTL:
1243                 /*
1244                  * Set the IP time-to-live for outgoing multicast packets.
1245                  * The original multicast API required a char argument,
1246                  * which is inconsistent with the rest of the socket API.
1247                  * We allow either a char or an int.
1248                  */
1249                 if (sopt->sopt_valsize == 1) {
1250                         u_char ttl;
1251                         error = sooptcopyin(sopt, &ttl, 1, 1);
1252                         if (error)
1253                                 break;
1254                         imo = ip_findmoptions(inp);
1255                         imo->imo_multicast_ttl = ttl;
1256                         INP_UNLOCK(inp);
1257                 } else {
1258                         u_int ttl;
1259                         error = sooptcopyin(sopt, &ttl, sizeof ttl, 
1260                                             sizeof ttl);
1261                         if (error)
1262                                 break;
1263                         if (ttl > 255)
1264                                 error = EINVAL;
1265                         else {
1266                                 imo = ip_findmoptions(inp);
1267                                 imo->imo_multicast_ttl = ttl;
1268                                 INP_UNLOCK(inp);
1269                         }
1270                 }
1271                 break;
1272
1273         case IP_MULTICAST_LOOP:
1274                 /*
1275                  * Set the loopback flag for outgoing multicast packets.
1276                  * Must be zero or one.  The original multicast API required a
1277                  * char argument, which is inconsistent with the rest
1278                  * of the socket API.  We allow either a char or an int.
1279                  */
1280                 if (sopt->sopt_valsize == 1) {
1281                         u_char loop;
1282                         error = sooptcopyin(sopt, &loop, 1, 1);
1283                         if (error)
1284                                 break;
1285                         imo = ip_findmoptions(inp);
1286                         imo->imo_multicast_loop = !!loop;
1287                         INP_UNLOCK(inp);
1288                 } else {
1289                         u_int loop;
1290                         error = sooptcopyin(sopt, &loop, sizeof loop,
1291                                             sizeof loop);
1292                         if (error)
1293                                 break;
1294                         imo = ip_findmoptions(inp);
1295                         imo->imo_multicast_loop = !!loop;
1296                         INP_UNLOCK(inp);
1297                 }
1298                 break;
1299
1300         case IP_ADD_MEMBERSHIP:
1301                 /*
1302                  * Add a multicast group membership.
1303                  * Group must be a valid IP multicast address.
1304                  */
1305                 error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1306                 if (error)
1307                         break;
1308
1309                 if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1310                         error = EINVAL;
1311                         break;
1312                 }
1313                 s = splimp();
1314                 /*
1315                  * If no interface address was provided, use the interface of
1316                  * the route to the given multicast address.
1317                  */
1318                 if (mreq.imr_interface.s_addr == INADDR_ANY) {
1319                         bzero((caddr_t)&ro, sizeof(ro));
1320                         dst = (struct sockaddr_in *)&ro.ro_dst;
1321                         dst->sin_len = sizeof(*dst);
1322                         dst->sin_family = AF_INET;
1323                         dst->sin_addr = mreq.imr_multiaddr;
1324                         rtalloc_ign(&ro, RTF_CLONING);
1325                         if (ro.ro_rt == NULL) {
1326                                 error = EADDRNOTAVAIL;
1327                                 splx(s);
1328                                 break;
1329                         }
1330                         ifp = ro.ro_rt->rt_ifp;
1331                         RTFREE(ro.ro_rt);
1332                 }
1333                 else {
1334                         ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1335                 }
1336
1337                 /*
1338                  * See if we found an interface, and confirm that it
1339                  * supports multicast.
1340                  */
1341                 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1342                         error = EADDRNOTAVAIL;
1343                         splx(s);
1344                         break;
1345                 }
1346                 /*
1347                  * See if the membership already exists or if all the
1348                  * membership slots are full.
1349                  */
1350                 imo = ip_findmoptions(inp);
1351                 for (i = 0; i < imo->imo_num_memberships; ++i) {
1352                         if (imo->imo_membership[i]->inm_ifp == ifp &&
1353                             imo->imo_membership[i]->inm_addr.s_addr
1354                                                 == mreq.imr_multiaddr.s_addr)
1355                                 break;
1356                 }
1357                 if (i < imo->imo_num_memberships) {
1358                         INP_UNLOCK(inp);
1359                         error = EADDRINUSE;
1360                         splx(s);
1361                         break;
1362                 }
1363                 if (i == IP_MAX_MEMBERSHIPS) {
1364                         INP_UNLOCK(inp);
1365                         error = ETOOMANYREFS;
1366                         splx(s);
1367                         break;
1368                 }
1369                 /*
1370                  * Everything looks good; add a new record to the multicast
1371                  * address list for the given interface.
1372                  */
1373                 if ((imo->imo_membership[i] =
1374                     in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) {
1375                         INP_UNLOCK(inp);
1376                         error = ENOBUFS;
1377                         splx(s);
1378                         break;
1379                 }
1380                 ++imo->imo_num_memberships;
1381                 INP_UNLOCK(inp);
1382                 splx(s);
1383                 break;
1384
1385         case IP_DROP_MEMBERSHIP:
1386                 /*
1387                  * Drop a multicast group membership.
1388                  * Group must be a valid IP multicast address.
1389                  */
1390                 error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1391                 if (error)
1392                         break;
1393
1394                 if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1395                         error = EINVAL;
1396                         break;
1397                 }
1398
1399                 s = splimp();
1400                 /*
1401                  * If an interface address was specified, get a pointer
1402                  * to its ifnet structure.
1403                  */
1404                 if (mreq.imr_interface.s_addr == INADDR_ANY)
1405                         ifp = NULL;
1406                 else {
1407                         ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1408                         if (ifp == NULL) {
1409                                 error = EADDRNOTAVAIL;
1410                                 splx(s);
1411                                 break;
1412                         }
1413                 }
1414                 /*
1415                  * Find the membership in the membership array.
1416                  */
1417                 imo = ip_findmoptions(inp);
1418                 for (i = 0; i < imo->imo_num_memberships; ++i) {
1419                         if ((ifp == NULL ||
1420                              imo->imo_membership[i]->inm_ifp == ifp) &&
1421                              imo->imo_membership[i]->inm_addr.s_addr ==
1422                              mreq.imr_multiaddr.s_addr)
1423                                 break;
1424                 }
1425                 if (i == imo->imo_num_memberships) {
1426                         INP_UNLOCK(inp);
1427                         error = EADDRNOTAVAIL;
1428                         splx(s);
1429                         break;
1430                 }
1431                 /*
1432                  * Give up the multicast address record to which the
1433                  * membership points.
1434                  */
1435                 in_delmulti(imo->imo_membership[i]);
1436                 /*
1437                  * Remove the gap in the membership array.
1438                  */
1439                 for (++i; i < imo->imo_num_memberships; ++i)
1440                         imo->imo_membership[i-1] = imo->imo_membership[i];
1441                 --imo->imo_num_memberships;
1442                 INP_UNLOCK(inp);
1443                 splx(s);
1444                 break;
1445
1446         default:
1447                 error = EOPNOTSUPP;
1448                 break;
1449         }
1450
1451         return (error);
1452 }
1453
1454 /*
1455  * Return the IP multicast options in response to user getsockopt().
1456  */
1457 static int
1458 ip_getmoptions(struct inpcb *inp, struct sockopt *sopt)
1459 {
1460         struct ip_moptions *imo;
1461         struct in_addr addr;
1462         struct in_ifaddr *ia;
1463         int error, optval;
1464         u_char coptval;
1465
1466         INP_LOCK(inp);
1467         imo = inp->inp_moptions;
1468
1469         error = 0;
1470         switch (sopt->sopt_name) {
1471         case IP_MULTICAST_VIF: 
1472                 if (imo != NULL)
1473                         optval = imo->imo_multicast_vif;
1474                 else
1475                         optval = -1;
1476                 INP_UNLOCK(inp);
1477                 error = sooptcopyout(sopt, &optval, sizeof optval);
1478                 break;
1479
1480         case IP_MULTICAST_IF:
1481                 if (imo == NULL || imo->imo_multicast_ifp == NULL)
1482                         addr.s_addr = INADDR_ANY;
1483                 else if (imo->imo_multicast_addr.s_addr) {
1484                         /* return the value user has set */
1485                         addr = imo->imo_multicast_addr;
1486                 } else {
1487                         IFP_TO_IA(imo->imo_multicast_ifp, ia);
1488                         addr.s_addr = (ia == NULL) ? INADDR_ANY
1489                                 : IA_SIN(ia)->sin_addr.s_addr;
1490                 }
1491                 INP_UNLOCK(inp);
1492                 error = sooptcopyout(sopt, &addr, sizeof addr);
1493                 break;
1494
1495         case IP_MULTICAST_TTL:
1496                 if (imo == 0)
1497                         optval = coptval = IP_DEFAULT_MULTICAST_TTL;
1498                 else
1499                         optval = coptval = imo->imo_multicast_ttl;
1500                 INP_UNLOCK(inp);
1501                 if (sopt->sopt_valsize == 1)
1502                         error = sooptcopyout(sopt, &coptval, 1);
1503                 else
1504                         error = sooptcopyout(sopt, &optval, sizeof optval);
1505                 break;
1506
1507         case IP_MULTICAST_LOOP:
1508                 if (imo == 0)
1509                         optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
1510                 else
1511                         optval = coptval = imo->imo_multicast_loop;
1512                 INP_UNLOCK(inp);
1513                 if (sopt->sopt_valsize == 1)
1514                         error = sooptcopyout(sopt, &coptval, 1);
1515                 else
1516                         error = sooptcopyout(sopt, &optval, sizeof optval);
1517                 break;
1518
1519         default:
1520                 INP_UNLOCK(inp);
1521                 error = ENOPROTOOPT;
1522                 break;
1523         }
1524         INP_UNLOCK_ASSERT(inp);
1525
1526         return (error);
1527 }
1528
1529 /*
1530  * Discard the IP multicast options.
1531  */
1532 void
1533 ip_freemoptions(imo)
1534         register struct ip_moptions *imo;
1535 {
1536         register int i;
1537
1538         if (imo != NULL) {
1539                 for (i = 0; i < imo->imo_num_memberships; ++i)
1540                         in_delmulti(imo->imo_membership[i]);
1541                 free(imo, M_IPMOPTS);
1542         }
1543 }
1544
1545 /*
1546  * Routine called from ip_output() to loop back a copy of an IP multicast
1547  * packet to the input queue of a specified interface.  Note that this
1548  * calls the output routine of the loopback "driver", but with an interface
1549  * pointer that might NOT be a loopback interface -- evil, but easier than
1550  * replicating that code here.
1551  */
1552 static void
1553 ip_mloopback(ifp, m, dst, hlen)
1554         struct ifnet *ifp;
1555         register struct mbuf *m;
1556         register struct sockaddr_in *dst;
1557         int hlen;
1558 {
1559         register struct ip *ip;
1560         struct mbuf *copym;
1561
1562         copym = m_copy(m, 0, M_COPYALL);
1563         if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
1564                 copym = m_pullup(copym, hlen);
1565         if (copym != NULL) {
1566                 /* If needed, compute the checksum and mark it as valid. */
1567                 if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1568                         in_delayed_cksum(copym);
1569                         copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1570                         copym->m_pkthdr.csum_flags |=
1571                             CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1572                         copym->m_pkthdr.csum_data = 0xffff;
1573                 }
1574                 /*
1575                  * We don't bother to fragment if the IP length is greater
1576                  * than the interface's MTU.  Can this possibly matter?
1577                  */
1578                 ip = mtod(copym, struct ip *);
1579                 ip->ip_len = htons(ip->ip_len);
1580                 ip->ip_off = htons(ip->ip_off);
1581                 ip->ip_sum = 0;
1582                 ip->ip_sum = in_cksum(copym, hlen);
1583                 /*
1584                  * NB:
1585                  * It's not clear whether there are any lingering
1586                  * reentrancy problems in other areas which might
1587                  * be exposed by using ip_input directly (in
1588                  * particular, everything which modifies the packet
1589                  * in-place).  Yet another option is using the
1590                  * protosw directly to deliver the looped back
1591                  * packet.  For the moment, we'll err on the side
1592                  * of safety by using if_simloop().
1593                  */
1594 #if 1 /* XXX */
1595                 if (dst->sin_family != AF_INET) {
1596                         printf("ip_mloopback: bad address family %d\n",
1597                                                 dst->sin_family);
1598                         dst->sin_family = AF_INET;
1599                 }
1600 #endif
1601
1602 #ifdef notdef
1603                 copym->m_pkthdr.rcvif = ifp;
1604                 ip_input(copym);
1605 #else
1606                 if_simloop(ifp, copym, dst->sin_family, 0);
1607 #endif
1608         }
1609 }