]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/ip_output.c
This commit was generated by cvs2svn to compensate for changes in r159985,
[FreeBSD/FreeBSD.git] / sys / netinet / ip_output.c
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *      @(#)ip_output.c 8.3 (Berkeley) 1/21/94
30  * $FreeBSD$
31  */
32
33 #include "opt_ipfw.h"
34 #include "opt_ipsec.h"
35 #include "opt_mac.h"
36 #include "opt_mbuf_stress_test.h"
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/mac.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/protosw.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48
49 #include <net/if.h>
50 #include <net/netisr.h>
51 #include <net/pfil.h>
52 #include <net/route.h>
53
54 #include <netinet/in.h>
55 #include <netinet/in_systm.h>
56 #include <netinet/ip.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/in_var.h>
59 #include <netinet/ip_var.h>
60 #include <netinet/ip_options.h>
61
62 #if defined(IPSEC) || defined(FAST_IPSEC)
63 #include <netinet/ip_ipsec.h>
64 #ifdef IPSEC
65 #include <netinet6/ipsec.h>
66 #endif
67 #ifdef FAST_IPSEC
68 #include <netipsec/ipsec.h>
69 #endif
70 #endif /*IPSEC*/
71
72 #include <machine/in_cksum.h>
73
74 static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
75
/*
 * Print an IPv4 address in dotted-quad form with printf().
 *	x	string printed before the address, followed by one space
 *	a	struct in_addr expression; s_addr is converted with ntohl()
 *	y	string printed immediately after the address
 *
 * Every argument is parenthesized so that expressions such as "*p" can be
 * passed for "a".  Note that "a" is evaluated four times, and that the
 * expansion still ends in a semicolon (kept for existing callers), so the
 * macro must not be used as the unbraced body of an if/else.
 */
#define print_ip(x, a, y)	 printf("%s %d.%d.%d.%d%s",\
				(x), (int)((ntohl((a).s_addr)>>24)&0xFF),\
				  (int)((ntohl((a).s_addr)>>16)&0xFF),\
				  (int)((ntohl((a).s_addr)>>8)&0xFF),\
				  (int)((ntohl((a).s_addr))&0xFF), (y));
81
82 u_short ip_id;
83
84 #ifdef MBUF_STRESS_TEST
85 int mbuf_frag_size = 0;
86 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
87         &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
88 #endif
89
90 static struct ifnet *ip_multicast_if(struct in_addr *, int *);
91 static void     ip_mloopback
92         (struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
93 static int      ip_getmoptions(struct inpcb *, struct sockopt *);
94 static int      ip_setmoptions(struct inpcb *, struct sockopt *);
95
96
97 extern  struct protosw inetsw[];
98
99 /*
100  * IP output.  The packet in mbuf chain m contains a skeletal IP
101  * header (with len, off, ttl, proto, tos, src, dst).
102  * The mbuf chain containing the packet will be freed.
103  * The mbuf opt, if present, will not be freed.
104  * In the IP forwarding case, the packet will arrive with options already
105  * inserted, so must have a NULL opt pointer.
106  */
107 int
108 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro,
109         int flags, struct ip_moptions *imo, struct inpcb *inp)
110 {
111         struct ip *ip;
112         struct ifnet *ifp = NULL;       /* keep compiler happy */
113         struct mbuf *m0;
114         int hlen = sizeof (struct ip);
115         int len, error = 0;
116         struct sockaddr_in *dst = NULL; /* keep compiler happy */
117         struct in_ifaddr *ia = NULL;
118         int isbroadcast, sw_csum;
119         struct route iproute;
120         struct in_addr odst;
121 #ifdef IPFIREWALL_FORWARD
122         struct m_tag *fwd_tag = NULL;
123 #endif
124         M_ASSERTPKTHDR(m);
125
126         if (ro == NULL) {
127                 ro = &iproute;
128                 bzero(ro, sizeof (*ro));
129         }
130
131         if (inp != NULL)
132                 INP_LOCK_ASSERT(inp);
133
134         if (opt) {
135                 len = 0;
136                 m = ip_insertoptions(m, opt, &len);
137                 if (len != 0)
138                         hlen = len;
139         }
140         ip = mtod(m, struct ip *);
141
142         /*
143          * Fill in IP header.  If we are not allowing fragmentation,
144          * then the ip_id field is meaningless, but we don't set it
145          * to zero.  Doing so causes various problems when devices along
146          * the path (routers, load balancers, firewalls, etc.) illegally
147          * disable DF on our packet.  Note that a 16-bit counter
148          * will wrap around in less than 10 seconds at 100 Mbit/s on a
149          * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
150          * for Counting NATted Hosts", Proc. IMW'02, available at
151          * <http://www.research.att.com/~smb/papers/fnat.pdf>.
152          */
153         if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
154                 ip->ip_v = IPVERSION;
155                 ip->ip_hl = hlen >> 2;
156                 ip->ip_id = ip_newid();
157                 ipstat.ips_localout++;
158         } else {
159                 hlen = ip->ip_hl << 2;
160         }
161
162         dst = (struct sockaddr_in *)&ro->ro_dst;
163 again:
164         /*
165          * If there is a cached route,
166          * check that it is to the same destination
167          * and is still up.  If not, free it and try again.
168          * The address family should also be checked in case of sharing the
169          * cache with IPv6.
170          */
171         if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
172                           dst->sin_family != AF_INET ||
173                           dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
174                 RTFREE(ro->ro_rt);
175                 ro->ro_rt = (struct rtentry *)0;
176         }
177 #ifdef IPFIREWALL_FORWARD
178         if (ro->ro_rt == NULL && fwd_tag == NULL) {
179 #else
180         if (ro->ro_rt == NULL) {
181 #endif
182                 bzero(dst, sizeof(*dst));
183                 dst->sin_family = AF_INET;
184                 dst->sin_len = sizeof(*dst);
185                 dst->sin_addr = ip->ip_dst;
186         }
187         /*
188          * If routing to interface only,
189          * short circuit routing lookup.
190          */
191         if (flags & IP_ROUTETOIF) {
192                 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
193                     (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) {
194                         ipstat.ips_noroute++;
195                         error = ENETUNREACH;
196                         goto bad;
197                 }
198                 ifp = ia->ia_ifp;
199                 ip->ip_ttl = 1;
200                 isbroadcast = in_broadcast(dst->sin_addr, ifp);
201         } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
202             imo != NULL && imo->imo_multicast_ifp != NULL) {
203                 /*
204                  * Bypass the normal routing lookup for multicast
205                  * packets if the interface is specified.
206                  */
207                 ifp = imo->imo_multicast_ifp;
208                 IFP_TO_IA(ifp, ia);
209                 isbroadcast = 0;        /* fool gcc */
210         } else {
211                 /*
212                  * We want to do any cloning requested by the link layer,
213                  * as this is probably required in all cases for correct
214                  * operation (as it is for ARP).
215                  */
216                 if (ro->ro_rt == NULL)
217                         rtalloc_ign(ro, 0);
218                 if (ro->ro_rt == NULL) {
219                         ipstat.ips_noroute++;
220                         error = EHOSTUNREACH;
221                         goto bad;
222                 }
223                 ia = ifatoia(ro->ro_rt->rt_ifa);
224                 ifp = ro->ro_rt->rt_ifp;
225                 ro->ro_rt->rt_rmx.rmx_pksent++;
226                 if (ro->ro_rt->rt_flags & RTF_GATEWAY)
227                         dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
228                 if (ro->ro_rt->rt_flags & RTF_HOST)
229                         isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
230                 else
231                         isbroadcast = in_broadcast(dst->sin_addr, ifp);
232         }
233         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
234                 struct in_multi *inm;
235
236                 m->m_flags |= M_MCAST;
237                 /*
238                  * IP destination address is multicast.  Make sure "dst"
239                  * still points to the address in "ro".  (It may have been
240                  * changed to point to a gateway address, above.)
241                  */
242                 dst = (struct sockaddr_in *)&ro->ro_dst;
243                 /*
244                  * See if the caller provided any multicast options
245                  */
246                 if (imo != NULL) {
247                         ip->ip_ttl = imo->imo_multicast_ttl;
248                         if (imo->imo_multicast_vif != -1)
249                                 ip->ip_src.s_addr =
250                                     ip_mcast_src ?
251                                     ip_mcast_src(imo->imo_multicast_vif) :
252                                     INADDR_ANY;
253                 } else
254                         ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
255                 /*
256                  * Confirm that the outgoing interface supports multicast.
257                  */
258                 if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
259                         if ((ifp->if_flags & IFF_MULTICAST) == 0) {
260                                 ipstat.ips_noroute++;
261                                 error = ENETUNREACH;
262                                 goto bad;
263                         }
264                 }
265                 /*
266                  * If source address not specified yet, use address
267                  * of outgoing interface.
268                  */
269                 if (ip->ip_src.s_addr == INADDR_ANY) {
270                         /* Interface may have no addresses. */
271                         if (ia != NULL)
272                                 ip->ip_src = IA_SIN(ia)->sin_addr;
273                 }
274
275                 IN_MULTI_LOCK();
276                 IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
277                 if (inm != NULL &&
278                    (imo == NULL || imo->imo_multicast_loop)) {
279                         IN_MULTI_UNLOCK();
280                         /*
281                          * If we belong to the destination multicast group
282                          * on the outgoing interface, and the caller did not
283                          * forbid loopback, loop back a copy.
284                          */
285                         ip_mloopback(ifp, m, dst, hlen);
286                 }
287                 else {
288                         IN_MULTI_UNLOCK();
289                         /*
290                          * If we are acting as a multicast router, perform
291                          * multicast forwarding as if the packet had just
292                          * arrived on the interface to which we are about
293                          * to send.  The multicast forwarding function
294                          * recursively calls this function, using the
295                          * IP_FORWARDING flag to prevent infinite recursion.
296                          *
297                          * Multicasts that are looped back by ip_mloopback(),
298                          * above, will be forwarded by the ip_input() routine,
299                          * if necessary.
300                          */
301                         if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
302                                 /*
303                                  * If rsvp daemon is not running, do not
304                                  * set ip_moptions. This ensures that the packet
305                                  * is multicast and not just sent down one link
306                                  * as prescribed by rsvpd.
307                                  */
308                                 if (!rsvp_on)
309                                         imo = NULL;
310                                 if (ip_mforward &&
311                                     ip_mforward(ip, ifp, m, imo) != 0) {
312                                         m_freem(m);
313                                         goto done;
314                                 }
315                         }
316                 }
317
318                 /*
319                  * Multicasts with a time-to-live of zero may be looped-
320                  * back, above, but must not be transmitted on a network.
321                  * Also, multicasts addressed to the loopback interface
322                  * are not sent -- the above call to ip_mloopback() will
323                  * loop back a copy if this host actually belongs to the
324                  * destination group on the loopback interface.
325                  */
326                 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
327                         m_freem(m);
328                         goto done;
329                 }
330
331                 goto sendit;
332         }
333 #ifndef notdef
334         /*
335          * If the source address is not specified yet, use the address
336          * of the outoing interface.
337          */
338         if (ip->ip_src.s_addr == INADDR_ANY) {
339                 /* Interface may have no addresses. */
340                 if (ia != NULL) {
341                         ip->ip_src = IA_SIN(ia)->sin_addr;
342                 }
343         }
344 #endif /* notdef */
345         /*
346          * Verify that we have any chance at all of being able to queue the
347          * packet or packet fragments, unless ALTQ is enabled on the given
348          * interface in which case packetdrop should be done by queueing.
349          */
350 #ifdef ALTQ
351         if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
352             ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
353             ifp->if_snd.ifq_maxlen))
354 #else
355         if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
356             ifp->if_snd.ifq_maxlen)
357 #endif /* ALTQ */
358         {
359                 error = ENOBUFS;
360                 ipstat.ips_odropped++;
361                 ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
362                 goto bad;
363         }
364
365         /*
366          * Look for broadcast address and
367          * verify user is allowed to send
368          * such a packet.
369          */
370         if (isbroadcast) {
371                 if ((ifp->if_flags & IFF_BROADCAST) == 0) {
372                         error = EADDRNOTAVAIL;
373                         goto bad;
374                 }
375                 if ((flags & IP_ALLOWBROADCAST) == 0) {
376                         error = EACCES;
377                         goto bad;
378                 }
379                 /* don't allow broadcast messages to be fragmented */
380                 if (ip->ip_len > ifp->if_mtu) {
381                         error = EMSGSIZE;
382                         goto bad;
383                 }
384                 if (flags & IP_SENDONES)
385                         ip->ip_dst.s_addr = INADDR_BROADCAST;
386                 m->m_flags |= M_BCAST;
387         } else {
388                 m->m_flags &= ~M_BCAST;
389         }
390
391 sendit:
392 #if defined(IPSEC) || defined(FAST_IPSEC)
393         switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) {
394         case 1:
395                 goto bad;
396         case -1:
397                 goto done;
398         case 0:
399         default:
400                 break;  /* Continue with packet processing. */
401         }
402         /* Update variables that are affected by ipsec4_output(). */
403         ip = mtod(m, struct ip *);
404         hlen = ip->ip_hl << 2;
405 #endif /* IPSEC */
406
407         /* Jump over all PFIL processing if hooks are not active. */
408         if (!PFIL_HOOKED(&inet_pfil_hook))
409                 goto passout;
410
411         /* Run through list of hooks for output packets. */
412         odst.s_addr = ip->ip_dst.s_addr;
413         error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
414         if (error != 0 || m == NULL)
415                 goto done;
416
417         ip = mtod(m, struct ip *);
418
419         /* See if destination IP address was changed by packet filter. */
420         if (odst.s_addr != ip->ip_dst.s_addr) {
421                 m->m_flags |= M_SKIP_FIREWALL;
422                 /* If destination is now ourself drop to ip_input(). */
423                 if (in_localip(ip->ip_dst)) {
424                         m->m_flags |= M_FASTFWD_OURS;
425                         if (m->m_pkthdr.rcvif == NULL)
426                                 m->m_pkthdr.rcvif = loif;
427                         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
428                                 m->m_pkthdr.csum_flags |=
429                                     CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
430                                 m->m_pkthdr.csum_data = 0xffff;
431                         }
432                         m->m_pkthdr.csum_flags |=
433                             CSUM_IP_CHECKED | CSUM_IP_VALID;
434
435                         error = netisr_queue(NETISR_IP, m);
436                         goto done;
437                 } else
438                         goto again;     /* Redo the routing table lookup. */
439         }
440
441 #ifdef IPFIREWALL_FORWARD
442         /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
443         if (m->m_flags & M_FASTFWD_OURS) {
444                 if (m->m_pkthdr.rcvif == NULL)
445                         m->m_pkthdr.rcvif = loif;
446                 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
447                         m->m_pkthdr.csum_flags |=
448                             CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
449                         m->m_pkthdr.csum_data = 0xffff;
450                 }
451                 m->m_pkthdr.csum_flags |=
452                             CSUM_IP_CHECKED | CSUM_IP_VALID;
453
454                 error = netisr_queue(NETISR_IP, m);
455                 goto done;
456         }
457         /* Or forward to some other address? */
458         fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
459         if (fwd_tag) {
460 #ifndef IPFIREWALL_FORWARD_EXTENDED
461                 if (!in_localip(ip->ip_src) && !in_localaddr(ip->ip_dst)) {
462 #endif
463                         dst = (struct sockaddr_in *)&ro->ro_dst;
464                         bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
465                         m->m_flags |= M_SKIP_FIREWALL;
466                         m_tag_delete(m, fwd_tag);
467                         goto again;
468 #ifndef IPFIREWALL_FORWARD_EXTENDED
469                 } else {
470                         m_tag_delete(m, fwd_tag);
471                         /* Continue. */
472                 }
473 #endif
474         }
475 #endif /* IPFIREWALL_FORWARD */
476
477 passout:
478         /* 127/8 must not appear on wire - RFC1122. */
479         if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
480             (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
481                 if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
482                         ipstat.ips_badaddr++;
483                         error = EADDRNOTAVAIL;
484                         goto bad;
485                 }
486         }
487
488         m->m_pkthdr.csum_flags |= CSUM_IP;
489         sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
490         if (sw_csum & CSUM_DELAY_DATA) {
491                 in_delayed_cksum(m);
492                 sw_csum &= ~CSUM_DELAY_DATA;
493         }
494         m->m_pkthdr.csum_flags &= ifp->if_hwassist;
495
496         /*
497          * If small enough for interface, or the interface will take
498          * care of the fragmentation for us, can just send directly.
499          */
500         if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT &&
501             ((ip->ip_off & IP_DF) == 0))) {
502                 ip->ip_len = htons(ip->ip_len);
503                 ip->ip_off = htons(ip->ip_off);
504                 ip->ip_sum = 0;
505                 if (sw_csum & CSUM_DELAY_IP)
506                         ip->ip_sum = in_cksum(m, hlen);
507
508                 /* Record statistics for this interface address. */
509                 if (!(flags & IP_FORWARDING) && ia) {
510                         ia->ia_ifa.if_opackets++;
511                         ia->ia_ifa.if_obytes += m->m_pkthdr.len;
512                 }
513 #ifdef IPSEC
514                 /* clean ipsec history once it goes out of the node */
515                 ipsec_delaux(m);
516 #endif
517 #ifdef MBUF_STRESS_TEST
518                 if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
519                         m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
520 #endif
521                 /*
522                  * Reset layer specific mbuf flags
523                  * to avoid confusing lower layers.
524                  */
525                 m->m_flags &= ~(M_PROTOFLAGS);
526
527                 error = (*ifp->if_output)(ifp, m,
528                                 (struct sockaddr *)dst, ro->ro_rt);
529                 goto done;
530         }
531
532         if (ip->ip_off & IP_DF) {
533                 error = EMSGSIZE;
534                 /*
535                  * This case can happen if the user changed the MTU
536                  * of an interface after enabling IP on it.  Because
537                  * most netifs don't keep track of routes pointing to
538                  * them, there is no way for one to update all its
539                  * routes when the MTU is changed.
540                  */
541                 if (ro != NULL &&
542                     (ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
543                     (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
544                         ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
545                 }
546                 ipstat.ips_cantfrag++;
547                 goto bad;
548         }
549
550         /*
551          * Too large for interface; fragment if possible. If successful,
552          * on return, m will point to a list of packets to be sent.
553          */
554         error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum);
555         if (error)
556                 goto bad;
557         for (; m; m = m0) {
558                 m0 = m->m_nextpkt;
559                 m->m_nextpkt = 0;
560 #ifdef IPSEC
561                 /* clean ipsec history once it goes out of the node */
562                 ipsec_delaux(m);
563 #endif
564                 if (error == 0) {
565                         /* Record statistics for this interface address. */
566                         if (ia != NULL) {
567                                 ia->ia_ifa.if_opackets++;
568                                 ia->ia_ifa.if_obytes += m->m_pkthdr.len;
569                         }
570                         /*
571                          * Reset layer specific mbuf flags
572                          * to avoid confusing upper layers.
573                          */
574                         m->m_flags &= ~(M_PROTOFLAGS);
575
576                         error = (*ifp->if_output)(ifp, m,
577                             (struct sockaddr *)dst, ro->ro_rt);
578                 } else
579                         m_freem(m);
580         }
581
582         if (error == 0)
583                 ipstat.ips_fragmented++;
584
585 done:
586         if (ro == &iproute && ro->ro_rt) {
587                 RTFREE(ro->ro_rt);
588         }
589         return (error);
590 bad:
591         m_freem(m);
592         goto done;
593 }
594
595 /*
596  * Create a chain of fragments which fit the given mtu. m_frag points to the
597  * mbuf to be fragmented; on return it points to the chain with the fragments.
598  * Return 0 if no error. If error, m_frag may contain a partially built
599  * chain of fragments that should be freed by the caller.
600  *
601  * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
602  * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
603  */
604 int
605 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
606             u_long if_hwassist_flags, int sw_csum)
607 {
608         int error = 0;
609         int hlen = ip->ip_hl << 2;
610         int len = (mtu - hlen) & ~7;    /* size of payload in each fragment */
611         int off;
612         struct mbuf *m0 = *m_frag;      /* the original packet          */
613         int firstlen;
614         struct mbuf **mnext;
615         int nfrags;
616
617         if (ip->ip_off & IP_DF) {       /* Fragmentation not allowed */
618                 ipstat.ips_cantfrag++;
619                 return EMSGSIZE;
620         }
621
622         /*
623          * Must be able to put at least 8 bytes per fragment.
624          */
625         if (len < 8)
626                 return EMSGSIZE;
627
628         /*
629          * If the interface will not calculate checksums on
630          * fragmented packets, then do it here.
631          */
632         if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
633             (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
634                 in_delayed_cksum(m0);
635                 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
636         }
637
638         if (len > PAGE_SIZE) {
639                 /* 
640                  * Fragment large datagrams such that each segment 
641                  * contains a multiple of PAGE_SIZE amount of data, 
642                  * plus headers. This enables a receiver to perform 
643                  * page-flipping zero-copy optimizations.
644                  *
645                  * XXX When does this help given that sender and receiver
646                  * could have different page sizes, and also mtu could
647                  * be less than the receiver's page size ?
648                  */
649                 int newlen;
650                 struct mbuf *m;
651
652                 for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
653                         off += m->m_len;
654
655                 /*
656                  * firstlen (off - hlen) must be aligned on an 
657                  * 8-byte boundary
658                  */
659                 if (off < hlen)
660                         goto smart_frag_failure;
661                 off = ((off - hlen) & ~7) + hlen;
662                 newlen = (~PAGE_MASK) & mtu;
663                 if ((newlen + sizeof (struct ip)) > mtu) {
664                         /* we failed, go back the default */
665 smart_frag_failure:
666                         newlen = len;
667                         off = hlen + len;
668                 }
669                 len = newlen;
670
671         } else {
672                 off = hlen + len;
673         }
674
675         firstlen = off - hlen;
676         mnext = &m0->m_nextpkt;         /* pointer to next packet */
677
678         /*
679          * Loop through length of segment after first fragment,
680          * make new header and copy data of each part and link onto chain.
681          * Here, m0 is the original packet, m is the fragment being created.
682          * The fragments are linked off the m_nextpkt of the original
683          * packet, which after processing serves as the first fragment.
684          */
685         for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
686                 struct ip *mhip;        /* ip header on the fragment */
687                 struct mbuf *m;
688                 int mhlen = sizeof (struct ip);
689
690                 MGETHDR(m, M_DONTWAIT, MT_DATA);
691                 if (m == NULL) {
692                         error = ENOBUFS;
693                         ipstat.ips_odropped++;
694                         goto done;
695                 }
696                 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
697                 /*
698                  * In the first mbuf, leave room for the link header, then
699                  * copy the original IP header including options. The payload
700                  * goes into an additional mbuf chain returned by m_copy().
701                  */
702                 m->m_data += max_linkhdr;
703                 mhip = mtod(m, struct ip *);
704                 *mhip = *ip;
705                 if (hlen > sizeof (struct ip)) {
706                         mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
707                         mhip->ip_v = IPVERSION;
708                         mhip->ip_hl = mhlen >> 2;
709                 }
710                 m->m_len = mhlen;
711                 /* XXX do we need to add ip->ip_off below ? */
712                 mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
713                 if (off + len >= ip->ip_len) {  /* last fragment */
714                         len = ip->ip_len - off;
715                         m->m_flags |= M_LASTFRAG;
716                 } else
717                         mhip->ip_off |= IP_MF;
718                 mhip->ip_len = htons((u_short)(len + mhlen));
719                 m->m_next = m_copy(m0, off, len);
720                 if (m->m_next == NULL) {        /* copy failed */
721                         m_free(m);
722                         error = ENOBUFS;        /* ??? */
723                         ipstat.ips_odropped++;
724                         goto done;
725                 }
726                 m->m_pkthdr.len = mhlen + len;
727                 m->m_pkthdr.rcvif = NULL;
728 #ifdef MAC
729                 mac_create_fragment(m0, m);
730 #endif
731                 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
732                 mhip->ip_off = htons(mhip->ip_off);
733                 mhip->ip_sum = 0;
734                 if (sw_csum & CSUM_DELAY_IP)
735                         mhip->ip_sum = in_cksum(m, mhlen);
736                 *mnext = m;
737                 mnext = &m->m_nextpkt;
738         }
739         ipstat.ips_ofragments += nfrags;
740
741         /* set first marker for fragment chain */
742         m0->m_flags |= M_FIRSTFRAG | M_FRAG;
743         m0->m_pkthdr.csum_data = nfrags;
744
745         /*
746          * Update first fragment by trimming what's been copied out
747          * and updating header.
748          */
749         m_adj(m0, hlen + firstlen - ip->ip_len);
750         m0->m_pkthdr.len = hlen + firstlen;
751         ip->ip_len = htons((u_short)m0->m_pkthdr.len);
752         ip->ip_off |= IP_MF;
753         ip->ip_off = htons(ip->ip_off);
754         ip->ip_sum = 0;
755         if (sw_csum & CSUM_DELAY_IP)
756                 ip->ip_sum = in_cksum(m0, hlen);
757
758 done:
759         *m_frag = m0;
760         return error;
761 }
762
763 void
764 in_delayed_cksum(struct mbuf *m)
765 {
766         struct ip *ip;
767         u_short csum, offset;
768
769         ip = mtod(m, struct ip *);
770         offset = ip->ip_hl << 2 ;
771         csum = in_cksum_skip(m, ip->ip_len, offset);
772         if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
773                 csum = 0xffff;
774         offset += m->m_pkthdr.csum_data;        /* checksum offset */
775
776         if (offset + sizeof(u_short) > m->m_len) {
777                 printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
778                     m->m_len, offset, ip->ip_p);
779                 /*
780                  * XXX
781                  * this shouldn't happen, but if it does, the
782                  * correct behavior may be to insert the checksum
783                  * in the appropriate next mbuf in the chain.
784                  */
785                 return;
786         }
787         *(u_short *)(m->m_data + offset) = csum;
788 }
789
790 /*
791  * IP socket option processing.
792  */
793 int
794 ip_ctloutput(so, sopt)
795         struct socket *so;
796         struct sockopt *sopt;
797 {
798         struct  inpcb *inp = sotoinpcb(so);
799         int     error, optval;
800
801         error = optval = 0;
802         if (sopt->sopt_level != IPPROTO_IP) {
803                 return (EINVAL);
804         }
805
806         switch (sopt->sopt_dir) {
807         case SOPT_SET:
808                 switch (sopt->sopt_name) {
809                 case IP_OPTIONS:
810 #ifdef notyet
811                 case IP_RETOPTS:
812 #endif
813                 {
814                         struct mbuf *m;
815                         if (sopt->sopt_valsize > MLEN) {
816                                 error = EMSGSIZE;
817                                 break;
818                         }
819                         MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
820                         if (m == NULL) {
821                                 error = ENOBUFS;
822                                 break;
823                         }
824                         m->m_len = sopt->sopt_valsize;
825                         error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
826                                             m->m_len);
827                         if (error) {
828                                 m_free(m);
829                                 break;
830                         }
831                         INP_LOCK(inp);
832                         error = ip_pcbopts(inp, sopt->sopt_name, m);
833                         INP_UNLOCK(inp);
834                         return (error);
835                 }
836
837                 case IP_TOS:
838                 case IP_TTL:
839                 case IP_MINTTL:
840                 case IP_RECVOPTS:
841                 case IP_RECVRETOPTS:
842                 case IP_RECVDSTADDR:
843                 case IP_RECVTTL:
844                 case IP_RECVIF:
845                 case IP_FAITH:
846                 case IP_ONESBCAST:
847                 case IP_DONTFRAG:
848                         error = sooptcopyin(sopt, &optval, sizeof optval,
849                                             sizeof optval);
850                         if (error)
851                                 break;
852
853                         switch (sopt->sopt_name) {
854                         case IP_TOS:
855                                 inp->inp_ip_tos = optval;
856                                 break;
857
858                         case IP_TTL:
859                                 inp->inp_ip_ttl = optval;
860                                 break;
861
862                         case IP_MINTTL:
863                                 if (optval > 0 && optval <= MAXTTL)
864                                         inp->inp_ip_minttl = optval;
865                                 else
866                                         error = EINVAL;
867                                 break;
868
869 #define OPTSET(bit) do {                                                \
870         INP_LOCK(inp);                                                  \
871         if (optval)                                                     \
872                 inp->inp_flags |= bit;                                  \
873         else                                                            \
874                 inp->inp_flags &= ~bit;                                 \
875         INP_UNLOCK(inp);                                                \
876 } while (0)
877
878                         case IP_RECVOPTS:
879                                 OPTSET(INP_RECVOPTS);
880                                 break;
881
882                         case IP_RECVRETOPTS:
883                                 OPTSET(INP_RECVRETOPTS);
884                                 break;
885
886                         case IP_RECVDSTADDR:
887                                 OPTSET(INP_RECVDSTADDR);
888                                 break;
889
890                         case IP_RECVTTL:
891                                 OPTSET(INP_RECVTTL);
892                                 break;
893
894                         case IP_RECVIF:
895                                 OPTSET(INP_RECVIF);
896                                 break;
897
898                         case IP_FAITH:
899                                 OPTSET(INP_FAITH);
900                                 break;
901
902                         case IP_ONESBCAST:
903                                 OPTSET(INP_ONESBCAST);
904                                 break;
905                         case IP_DONTFRAG:
906                                 OPTSET(INP_DONTFRAG);
907                                 break;
908                         }
909                         break;
910 #undef OPTSET
911
912                 case IP_MULTICAST_IF:
913                 case IP_MULTICAST_VIF:
914                 case IP_MULTICAST_TTL:
915                 case IP_MULTICAST_LOOP:
916                 case IP_ADD_MEMBERSHIP:
917                 case IP_DROP_MEMBERSHIP:
918                         error = ip_setmoptions(inp, sopt);
919                         break;
920
921                 case IP_PORTRANGE:
922                         error = sooptcopyin(sopt, &optval, sizeof optval,
923                                             sizeof optval);
924                         if (error)
925                                 break;
926
927                         INP_LOCK(inp);
928                         switch (optval) {
929                         case IP_PORTRANGE_DEFAULT:
930                                 inp->inp_flags &= ~(INP_LOWPORT);
931                                 inp->inp_flags &= ~(INP_HIGHPORT);
932                                 break;
933
934                         case IP_PORTRANGE_HIGH:
935                                 inp->inp_flags &= ~(INP_LOWPORT);
936                                 inp->inp_flags |= INP_HIGHPORT;
937                                 break;
938
939                         case IP_PORTRANGE_LOW:
940                                 inp->inp_flags &= ~(INP_HIGHPORT);
941                                 inp->inp_flags |= INP_LOWPORT;
942                                 break;
943
944                         default:
945                                 error = EINVAL;
946                                 break;
947                         }
948                         INP_UNLOCK(inp);
949                         break;
950
951 #if defined(IPSEC) || defined(FAST_IPSEC)
952                 case IP_IPSEC_POLICY:
953                 {
954                         caddr_t req;
955                         size_t len = 0;
956                         int priv;
957                         struct mbuf *m;
958                         int optname;
959
960                         if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
961                                 break;
962                         if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
963                                 break;
964                         priv = (sopt->sopt_td != NULL &&
965                                 suser(sopt->sopt_td) != 0) ? 0 : 1;
966                         req = mtod(m, caddr_t);
967                         len = m->m_len;
968                         optname = sopt->sopt_name;
969                         error = ipsec4_set_policy(inp, optname, req, len, priv);
970                         m_freem(m);
971                         break;
972                 }
973 #endif /*IPSEC*/
974
975                 default:
976                         error = ENOPROTOOPT;
977                         break;
978                 }
979                 break;
980
981         case SOPT_GET:
982                 switch (sopt->sopt_name) {
983                 case IP_OPTIONS:
984                 case IP_RETOPTS:
985                         if (inp->inp_options)
986                                 error = sooptcopyout(sopt, 
987                                                      mtod(inp->inp_options,
988                                                           char *),
989                                                      inp->inp_options->m_len);
990                         else
991                                 sopt->sopt_valsize = 0;
992                         break;
993
994                 case IP_TOS:
995                 case IP_TTL:
996                 case IP_MINTTL:
997                 case IP_RECVOPTS:
998                 case IP_RECVRETOPTS:
999                 case IP_RECVDSTADDR:
1000                 case IP_RECVTTL:
1001                 case IP_RECVIF:
1002                 case IP_PORTRANGE:
1003                 case IP_FAITH:
1004                 case IP_ONESBCAST:
1005                 case IP_DONTFRAG:
1006                         switch (sopt->sopt_name) {
1007
1008                         case IP_TOS:
1009                                 optval = inp->inp_ip_tos;
1010                                 break;
1011
1012                         case IP_TTL:
1013                                 optval = inp->inp_ip_ttl;
1014                                 break;
1015
1016                         case IP_MINTTL:
1017                                 optval = inp->inp_ip_minttl;
1018                                 break;
1019
1020 #define OPTBIT(bit)     (inp->inp_flags & bit ? 1 : 0)
1021
1022                         case IP_RECVOPTS:
1023                                 optval = OPTBIT(INP_RECVOPTS);
1024                                 break;
1025
1026                         case IP_RECVRETOPTS:
1027                                 optval = OPTBIT(INP_RECVRETOPTS);
1028                                 break;
1029
1030                         case IP_RECVDSTADDR:
1031                                 optval = OPTBIT(INP_RECVDSTADDR);
1032                                 break;
1033
1034                         case IP_RECVTTL:
1035                                 optval = OPTBIT(INP_RECVTTL);
1036                                 break;
1037
1038                         case IP_RECVIF:
1039                                 optval = OPTBIT(INP_RECVIF);
1040                                 break;
1041
1042                         case IP_PORTRANGE:
1043                                 if (inp->inp_flags & INP_HIGHPORT)
1044                                         optval = IP_PORTRANGE_HIGH;
1045                                 else if (inp->inp_flags & INP_LOWPORT)
1046                                         optval = IP_PORTRANGE_LOW;
1047                                 else
1048                                         optval = 0;
1049                                 break;
1050
1051                         case IP_FAITH:
1052                                 optval = OPTBIT(INP_FAITH);
1053                                 break;
1054
1055                         case IP_ONESBCAST:
1056                                 optval = OPTBIT(INP_ONESBCAST);
1057                                 break;
1058                         case IP_DONTFRAG:
1059                                 optval = OPTBIT(INP_DONTFRAG);
1060                                 break;
1061                         }
1062                         error = sooptcopyout(sopt, &optval, sizeof optval);
1063                         break;
1064
1065                 case IP_MULTICAST_IF:
1066                 case IP_MULTICAST_VIF:
1067                 case IP_MULTICAST_TTL:
1068                 case IP_MULTICAST_LOOP:
1069                 case IP_ADD_MEMBERSHIP:
1070                 case IP_DROP_MEMBERSHIP:
1071                         error = ip_getmoptions(inp, sopt);
1072                         break;
1073
1074 #if defined(IPSEC) || defined(FAST_IPSEC)
1075                 case IP_IPSEC_POLICY:
1076                 {
1077                         struct mbuf *m = NULL;
1078                         caddr_t req = NULL;
1079                         size_t len = 0;
1080
1081                         if (m != 0) {
1082                                 req = mtod(m, caddr_t);
1083                                 len = m->m_len;
1084                         }
1085                         error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
1086                         if (error == 0)
1087                                 error = soopt_mcopyout(sopt, m); /* XXX */
1088                         if (error == 0)
1089                                 m_freem(m);
1090                         break;
1091                 }
1092 #endif /*IPSEC*/
1093
1094                 default:
1095                         error = ENOPROTOOPT;
1096                         break;
1097                 }
1098                 break;
1099         }
1100         return (error);
1101 }
1102
1103 /*
1104  * XXX
1105  * The whole multicast option thing needs to be re-thought.
1106  * Several of these options are equally applicable to non-multicast
1107  * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
1108  * standard option (IP_TTL).
1109  */
1110
1111 /*
1112  * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1113  */
1114 static struct ifnet *
1115 ip_multicast_if(a, ifindexp)
1116         struct in_addr *a;
1117         int *ifindexp;
1118 {
1119         int ifindex;
1120         struct ifnet *ifp;
1121
1122         if (ifindexp)
1123                 *ifindexp = 0;
1124         if (ntohl(a->s_addr) >> 24 == 0) {
1125                 ifindex = ntohl(a->s_addr) & 0xffffff;
1126                 if (ifindex < 0 || if_index < ifindex)
1127                         return NULL;
1128                 ifp = ifnet_byindex(ifindex);
1129                 if (ifindexp)
1130                         *ifindexp = ifindex;
1131         } else {
1132                 INADDR_TO_IFP(*a, ifp);
1133         }
1134         return ifp;
1135 }
1136
1137 /*
1138  * Given an inpcb, return its multicast options structure pointer.  Accepts
1139  * an unlocked inpcb pointer, but will return it locked.  May sleep.
1140  */
1141 static struct ip_moptions *
1142 ip_findmoptions(struct inpcb *inp)
1143 {
1144         struct ip_moptions *imo;
1145         struct in_multi **immp;
1146
1147         INP_LOCK(inp);
1148         if (inp->inp_moptions != NULL)
1149                 return (inp->inp_moptions);
1150
1151         INP_UNLOCK(inp);
1152
1153         imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK);
1154         immp = (struct in_multi **)malloc((sizeof(*immp) * IP_MIN_MEMBERSHIPS),
1155                                           M_IPMOPTS, M_WAITOK);
1156
1157         imo->imo_multicast_ifp = NULL;
1158         imo->imo_multicast_addr.s_addr = INADDR_ANY;
1159         imo->imo_multicast_vif = -1;
1160         imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1161         imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1162         imo->imo_num_memberships = 0;
1163         imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
1164         imo->imo_membership = immp;
1165
1166         INP_LOCK(inp);
1167         if (inp->inp_moptions != NULL) {
1168                 free(immp, M_IPMOPTS);
1169                 free(imo, M_IPMOPTS);
1170                 return (inp->inp_moptions);
1171         }
1172         inp->inp_moptions = imo;
1173         return (imo);
1174 }
1175
1176 /*
1177  * Set the IP multicast options in response to user setsockopt().
1178  */
1179 static int
1180 ip_setmoptions(struct inpcb *inp, struct sockopt *sopt)
1181 {
1182         int error = 0;
1183         int i;
1184         struct in_addr addr;
1185         struct ip_mreq mreq;
1186         struct ifnet *ifp;
1187         struct ip_moptions *imo;
1188         struct route ro;
1189         struct sockaddr_in *dst;
1190         int ifindex;
1191         int s;
1192
1193         switch (sopt->sopt_name) {
1194         /* store an index number for the vif you wanna use in the send */
1195         case IP_MULTICAST_VIF:
1196                 if (legal_vif_num == 0) {
1197                         error = EOPNOTSUPP;
1198                         break;
1199                 }
1200                 error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
1201                 if (error)
1202                         break;
1203                 if (!legal_vif_num(i) && (i != -1)) {
1204                         error = EINVAL;
1205                         break;
1206                 }
1207                 imo = ip_findmoptions(inp);
1208                 imo->imo_multicast_vif = i;
1209                 INP_UNLOCK(inp);
1210                 break;
1211
1212         case IP_MULTICAST_IF:
1213                 /*
1214                  * Select the interface for outgoing multicast packets.
1215                  */
1216                 error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr);
1217                 if (error)
1218                         break;
1219                 /*
1220                  * INADDR_ANY is used to remove a previous selection.
1221                  * When no interface is selected, a default one is
1222                  * chosen every time a multicast packet is sent.
1223                  */
1224                 imo = ip_findmoptions(inp);
1225                 if (addr.s_addr == INADDR_ANY) {
1226                         imo->imo_multicast_ifp = NULL;
1227                         INP_UNLOCK(inp);
1228                         break;
1229                 }
1230                 /*
1231                  * The selected interface is identified by its local
1232                  * IP address.  Find the interface and confirm that
1233                  * it supports multicasting.
1234                  */
1235                 s = splimp();
1236                 ifp = ip_multicast_if(&addr, &ifindex);
1237                 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1238                         INP_UNLOCK(inp);
1239                         splx(s);
1240                         error = EADDRNOTAVAIL;
1241                         break;
1242                 }
1243                 imo->imo_multicast_ifp = ifp;
1244                 if (ifindex)
1245                         imo->imo_multicast_addr = addr;
1246                 else
1247                         imo->imo_multicast_addr.s_addr = INADDR_ANY;
1248                 INP_UNLOCK(inp);
1249                 splx(s);
1250                 break;
1251
1252         case IP_MULTICAST_TTL:
1253                 /*
1254                  * Set the IP time-to-live for outgoing multicast packets.
1255                  * The original multicast API required a char argument,
1256                  * which is inconsistent with the rest of the socket API.
1257                  * We allow either a char or an int.
1258                  */
1259                 if (sopt->sopt_valsize == 1) {
1260                         u_char ttl;
1261                         error = sooptcopyin(sopt, &ttl, 1, 1);
1262                         if (error)
1263                                 break;
1264                         imo = ip_findmoptions(inp);
1265                         imo->imo_multicast_ttl = ttl;
1266                         INP_UNLOCK(inp);
1267                 } else {
1268                         u_int ttl;
1269                         error = sooptcopyin(sopt, &ttl, sizeof ttl, 
1270                                             sizeof ttl);
1271                         if (error)
1272                                 break;
1273                         if (ttl > 255)
1274                                 error = EINVAL;
1275                         else {
1276                                 imo = ip_findmoptions(inp);
1277                                 imo->imo_multicast_ttl = ttl;
1278                                 INP_UNLOCK(inp);
1279                         }
1280                 }
1281                 break;
1282
1283         case IP_MULTICAST_LOOP:
1284                 /*
1285                  * Set the loopback flag for outgoing multicast packets.
1286                  * Must be zero or one.  The original multicast API required a
1287                  * char argument, which is inconsistent with the rest
1288                  * of the socket API.  We allow either a char or an int.
1289                  */
1290                 if (sopt->sopt_valsize == 1) {
1291                         u_char loop;
1292                         error = sooptcopyin(sopt, &loop, 1, 1);
1293                         if (error)
1294                                 break;
1295                         imo = ip_findmoptions(inp);
1296                         imo->imo_multicast_loop = !!loop;
1297                         INP_UNLOCK(inp);
1298                 } else {
1299                         u_int loop;
1300                         error = sooptcopyin(sopt, &loop, sizeof loop,
1301                                             sizeof loop);
1302                         if (error)
1303                                 break;
1304                         imo = ip_findmoptions(inp);
1305                         imo->imo_multicast_loop = !!loop;
1306                         INP_UNLOCK(inp);
1307                 }
1308                 break;
1309
1310         case IP_ADD_MEMBERSHIP:
1311                 /*
1312                  * Add a multicast group membership.
1313                  * Group must be a valid IP multicast address.
1314                  */
1315                 error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1316                 if (error)
1317                         break;
1318
1319                 if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1320                         error = EINVAL;
1321                         break;
1322                 }
1323                 s = splimp();
1324                 /*
1325                  * If no interface address was provided, use the interface of
1326                  * the route to the given multicast address.
1327                  */
1328                 if (mreq.imr_interface.s_addr == INADDR_ANY) {
1329                         bzero((caddr_t)&ro, sizeof(ro));
1330                         dst = (struct sockaddr_in *)&ro.ro_dst;
1331                         dst->sin_len = sizeof(*dst);
1332                         dst->sin_family = AF_INET;
1333                         dst->sin_addr = mreq.imr_multiaddr;
1334                         rtalloc_ign(&ro, RTF_CLONING);
1335                         if (ro.ro_rt == NULL) {
1336                                 error = EADDRNOTAVAIL;
1337                                 splx(s);
1338                                 break;
1339                         }
1340                         ifp = ro.ro_rt->rt_ifp;
1341                         RTFREE(ro.ro_rt);
1342                 }
1343                 else {
1344                         ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1345                 }
1346
1347                 /*
1348                  * See if we found an interface, and confirm that it
1349                  * supports multicast.
1350                  */
1351                 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1352                         error = EADDRNOTAVAIL;
1353                         splx(s);
1354                         break;
1355                 }
1356                 /*
1357                  * See if the membership already exists or if all the
1358                  * membership slots are full.
1359                  */
1360                 imo = ip_findmoptions(inp);
1361                 for (i = 0; i < imo->imo_num_memberships; ++i) {
1362                         if (imo->imo_membership[i]->inm_ifp == ifp &&
1363                             imo->imo_membership[i]->inm_addr.s_addr
1364                                                 == mreq.imr_multiaddr.s_addr)
1365                                 break;
1366                 }
1367                 if (i < imo->imo_num_memberships) {
1368                         INP_UNLOCK(inp);
1369                         error = EADDRINUSE;
1370                         splx(s);
1371                         break;
1372                 }
1373                 if (imo->imo_num_memberships == imo->imo_max_memberships) {
1374                     struct in_multi **nmships, **omships;
1375                     size_t newmax;
1376                     /*
1377                      * Resize the vector to next power-of-two minus 1. If the
1378                      * size would exceed the maximum then we know we've really
1379                      * run out of entries. Otherwise, we realloc() the vector
1380                      * with the INP lock held to avoid introducing a race.
1381                      */
1382                     nmships = NULL;
1383                     omships = imo->imo_membership;
1384                     newmax = ((imo->imo_max_memberships + 1) * 2) - 1;
1385                     if (newmax <= IP_MAX_MEMBERSHIPS) {
1386                         nmships = (struct in_multi **)realloc(omships,
1387 sizeof(*nmships) * newmax, M_IPMOPTS, M_NOWAIT);
1388                         if (nmships != NULL) {
1389                             imo->imo_membership = nmships;
1390                             imo->imo_max_memberships = newmax;
1391                         }
1392                     }
1393                     if (nmships == NULL) {
1394                         INP_UNLOCK(inp);
1395                         error = ETOOMANYREFS;
1396                         splx(s);
1397                         break;
1398                     }
1399                 }
1400                 /*
1401                  * Everything looks good; add a new record to the multicast
1402                  * address list for the given interface.
1403                  */
1404                 if ((imo->imo_membership[i] =
1405                     in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) {
1406                         INP_UNLOCK(inp);
1407                         error = ENOBUFS;
1408                         splx(s);
1409                         break;
1410                 }
1411                 ++imo->imo_num_memberships;
1412                 INP_UNLOCK(inp);
1413                 splx(s);
1414                 break;
1415
1416         case IP_DROP_MEMBERSHIP:
1417                 /*
1418                  * Drop a multicast group membership.
1419                  * Group must be a valid IP multicast address.
1420                  */
1421                 error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1422                 if (error)
1423                         break;
1424
1425                 if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1426                         error = EINVAL;
1427                         break;
1428                 }
1429
1430                 s = splimp();
1431                 /*
1432                  * If an interface address was specified, get a pointer
1433                  * to its ifnet structure.
1434                  */
1435                 if (mreq.imr_interface.s_addr == INADDR_ANY)
1436                         ifp = NULL;
1437                 else {
1438                         ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1439                         if (ifp == NULL) {
1440                                 error = EADDRNOTAVAIL;
1441                                 splx(s);
1442                                 break;
1443                         }
1444                 }
1445                 /*
1446                  * Find the membership in the membership array.
1447                  */
1448                 imo = ip_findmoptions(inp);
1449                 for (i = 0; i < imo->imo_num_memberships; ++i) {
1450                         if ((ifp == NULL ||
1451                              imo->imo_membership[i]->inm_ifp == ifp) &&
1452                              imo->imo_membership[i]->inm_addr.s_addr ==
1453                              mreq.imr_multiaddr.s_addr)
1454                                 break;
1455                 }
1456                 if (i == imo->imo_num_memberships) {
1457                         INP_UNLOCK(inp);
1458                         error = EADDRNOTAVAIL;
1459                         splx(s);
1460                         break;
1461                 }
1462                 /*
1463                  * Give up the multicast address record to which the
1464                  * membership points.
1465                  */
1466                 in_delmulti(imo->imo_membership[i]);
1467                 /*
1468                  * Remove the gap in the membership array.
1469                  */
1470                 for (++i; i < imo->imo_num_memberships; ++i)
1471                         imo->imo_membership[i-1] = imo->imo_membership[i];
1472                 --imo->imo_num_memberships;
1473                 INP_UNLOCK(inp);
1474                 splx(s);
1475                 break;
1476
1477         default:
1478                 error = EOPNOTSUPP;
1479                 break;
1480         }
1481
1482         return (error);
1483 }
1484
1485 /*
1486  * Return the IP multicast options in response to user getsockopt().
1487  */
1488 static int
1489 ip_getmoptions(struct inpcb *inp, struct sockopt *sopt)
1490 {
1491         struct ip_moptions *imo;
1492         struct in_addr addr;
1493         struct in_ifaddr *ia;
1494         int error, optval;
1495         u_char coptval;
1496
1497         INP_LOCK(inp);
1498         imo = inp->inp_moptions;
1499
1500         error = 0;
1501         switch (sopt->sopt_name) {
1502         case IP_MULTICAST_VIF: 
1503                 if (imo != NULL)
1504                         optval = imo->imo_multicast_vif;
1505                 else
1506                         optval = -1;
1507                 INP_UNLOCK(inp);
1508                 error = sooptcopyout(sopt, &optval, sizeof optval);
1509                 break;
1510
1511         case IP_MULTICAST_IF:
1512                 if (imo == NULL || imo->imo_multicast_ifp == NULL)
1513                         addr.s_addr = INADDR_ANY;
1514                 else if (imo->imo_multicast_addr.s_addr) {
1515                         /* return the value user has set */
1516                         addr = imo->imo_multicast_addr;
1517                 } else {
1518                         IFP_TO_IA(imo->imo_multicast_ifp, ia);
1519                         addr.s_addr = (ia == NULL) ? INADDR_ANY
1520                                 : IA_SIN(ia)->sin_addr.s_addr;
1521                 }
1522                 INP_UNLOCK(inp);
1523                 error = sooptcopyout(sopt, &addr, sizeof addr);
1524                 break;
1525
1526         case IP_MULTICAST_TTL:
1527                 if (imo == 0)
1528                         optval = coptval = IP_DEFAULT_MULTICAST_TTL;
1529                 else
1530                         optval = coptval = imo->imo_multicast_ttl;
1531                 INP_UNLOCK(inp);
1532                 if (sopt->sopt_valsize == 1)
1533                         error = sooptcopyout(sopt, &coptval, 1);
1534                 else
1535                         error = sooptcopyout(sopt, &optval, sizeof optval);
1536                 break;
1537
1538         case IP_MULTICAST_LOOP:
1539                 if (imo == 0)
1540                         optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
1541                 else
1542                         optval = coptval = imo->imo_multicast_loop;
1543                 INP_UNLOCK(inp);
1544                 if (sopt->sopt_valsize == 1)
1545                         error = sooptcopyout(sopt, &coptval, 1);
1546                 else
1547                         error = sooptcopyout(sopt, &optval, sizeof optval);
1548                 break;
1549
1550         default:
1551                 INP_UNLOCK(inp);
1552                 error = ENOPROTOOPT;
1553                 break;
1554         }
1555         INP_UNLOCK_ASSERT(inp);
1556
1557         return (error);
1558 }
1559
1560 /*
1561  * Discard the IP multicast options.
1562  */
1563 void
1564 ip_freemoptions(imo)
1565         register struct ip_moptions *imo;
1566 {
1567         register int i;
1568
1569         if (imo != NULL) {
1570                 for (i = 0; i < imo->imo_num_memberships; ++i)
1571                         in_delmulti(imo->imo_membership[i]);
1572                 free(imo->imo_membership, M_IPMOPTS);
1573                 free(imo, M_IPMOPTS);
1574         }
1575 }
1576
1577 /*
1578  * Routine called from ip_output() to loop back a copy of an IP multicast
1579  * packet to the input queue of a specified interface.  Note that this
1580  * calls the output routine of the loopback "driver", but with an interface
1581  * pointer that might NOT be a loopback interface -- evil, but easier than
1582  * replicating that code here.
1583  */
1584 static void
1585 ip_mloopback(ifp, m, dst, hlen)
1586         struct ifnet *ifp;
1587         register struct mbuf *m;
1588         register struct sockaddr_in *dst;
1589         int hlen;
1590 {
1591         register struct ip *ip;
1592         struct mbuf *copym;
1593
1594         copym = m_copy(m, 0, M_COPYALL);
1595         if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
1596                 copym = m_pullup(copym, hlen);
1597         if (copym != NULL) {
1598                 /* If needed, compute the checksum and mark it as valid. */
1599                 if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1600                         in_delayed_cksum(copym);
1601                         copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1602                         copym->m_pkthdr.csum_flags |=
1603                             CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1604                         copym->m_pkthdr.csum_data = 0xffff;
1605                 }
1606                 /*
1607                  * We don't bother to fragment if the IP length is greater
1608                  * than the interface's MTU.  Can this possibly matter?
1609                  */
1610                 ip = mtod(copym, struct ip *);
1611                 ip->ip_len = htons(ip->ip_len);
1612                 ip->ip_off = htons(ip->ip_off);
1613                 ip->ip_sum = 0;
1614                 ip->ip_sum = in_cksum(copym, hlen);
1615                 /*
1616                  * NB:
1617                  * It's not clear whether there are any lingering
1618                  * reentrancy problems in other areas which might
1619                  * be exposed by using ip_input directly (in
1620                  * particular, everything which modifies the packet
1621                  * in-place).  Yet another option is using the
1622                  * protosw directly to deliver the looped back
1623                  * packet.  For the moment, we'll err on the side
1624                  * of safety by using if_simloop().
1625                  */
1626 #if 1 /* XXX */
1627                 if (dst->sin_family != AF_INET) {
1628                         printf("ip_mloopback: bad address family %d\n",
1629                                                 dst->sin_family);
1630                         dst->sin_family = AF_INET;
1631                 }
1632 #endif
1633
1634 #ifdef notdef
1635                 copym->m_pkthdr.rcvif = ifp;
1636                 ip_input(copym);
1637 #else
1638                 if_simloop(ifp, copym, dst->sin_family, 0);
1639 #endif
1640         }
1641 }