sys/netinet/ip_divert.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1982, 1986, 1988, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. Neither the name of the University nor the names of its contributors
  16  *    may be used to endorse or promote products derived from this software
  17  *    without specific prior written permission.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  29  * SUCH DAMAGE.
  30  */
  31
  32 #include <sys/cdefs.h>
  33 __FBSDID("$FreeBSD$");
  34
  35 #include "opt_inet.h"
  36 #include "opt_inet6.h"
  37 #include "opt_sctp.h"
  38 #ifndef INET
  39 #error "IPDIVERT requires INET"
  40 #endif
  41
  42 #include <sys/param.h>
  43 #include <sys/eventhandler.h>
  44 #include <sys/kernel.h>
  45 #include <sys/lock.h>
  46 #include <sys/malloc.h>
  47 #include <sys/mbuf.h>
  48 #include <sys/module.h>
  49 #include <sys/kernel.h>
  50 #include <sys/priv.h>
  51 #include <sys/proc.h>
  52 #include <sys/protosw.h>
  53 #include <sys/socket.h>
  54 #include <sys/socketvar.h>
  55 #include <sys/sysctl.h>
  56 #include <net/vnet.h>
  57
  58 #include <net/if.h>
  59 #include <net/if_var.h>
  60 #include <net/netisr.h>
  61
  62 #include <netinet/in.h>
  63 #include <netinet/in_pcb.h>
  64 #include <netinet/in_systm.h>
  65 #include <netinet/in_var.h>
  66 #include <netinet/ip.h>
  67 #include <netinet/ip_var.h>
  68 #ifdef INET6
  69 #include <netinet/ip6.h>
  70 #include <netinet6/ip6_var.h>
  71 #endif
  72 #if defined(SCTP) || defined(SCTP_SUPPORT)
  73 #include <netinet/sctp_crc32.h>
  74 #endif
  75
  76 #include <security/mac/mac_framework.h>
  77 /*
  78  * Divert sockets
  79  */
  80
/*
 * Allocate enough space to hold a full IP packet: 65536 bytes of packet
 * plus 100 bytes of slack (presumably for the sockaddr queued alongside
 * each packet -- TODO confirm).  These are the initial values of
 * div_sendspace and div_recvspace.
 */
#define DIVSNDQ         (65536 + 100)
#define DIVRCVQ         (65536 + 100)
  86
  87 /*
  88  * Divert sockets work in conjunction with ipfw or other packet filters,
  89  * see the divert(4) manpage for features.
  90  * Packets are selected by the packet filter and tagged with an
  91  * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by
  92  * the packet filter) and information on the matching filter rule for
  93  * subsequent reinjection. The divert_port is used to put the packet
  94  * on the corresponding divert socket, while the rule number is passed
  95  * up (at least partially) as the sin_port in the struct sockaddr.
  96  *
  97  * Packets written to the divert socket carry in sin_addr a
  98  * destination address, and in sin_port the number of the filter rule
  99  * after which to continue processing.
 100  * If the destination address is INADDR_ANY, the packet is treated as
 101  * outgoing and sent to ip_output(); otherwise it is treated as
 102  * incoming and sent to ip_input().
 103  * Further, sin_zero carries some information on the interface,
 104  * which can be used in the reinject -- see comments in the code.
 105  *
 106  * On reinjection, processing in ip_input() and ip_output()
 107  * will be exactly the same as for the original packet, except that
 108  * packet filter processing will start at the rule number after the one
 109  * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0
 110  * will apply the entire ruleset to the packet).
 111  */
 112
/* Internal variables. */
/* Per-vnet inpcb bookkeeping for all divert sockets. */
VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo);
#define V_divcbinfo                     VNET(divcbinfo)

/* Send/receive buffer sizes passed to soreserve() when a socket attaches. */
static u_long   div_sendspace = DIVSNDQ;        /* XXX sysctl ? */
static u_long   div_recvspace = DIVRCVQ;        /* XXX sysctl ? */
 119
 120 static int div_output_inbound(int fmaily, struct socket *so, struct mbuf *m,
 121     struct sockaddr_in *sin);
 122 static int div_output_outbound(int family, struct socket *so, struct mbuf *m);
 123
 124 /*
 125  * Initialize divert connection block queue.
 126  */
 127 INPCBSTORAGE_DEFINE(divcbstor, "divinp", "divcb", "div", "divhash");
 128
 129 static void
 130 div_init(void *arg __unused)
 131 {
 132
 133         /*
 134          * XXX We don't use the hash list for divert IP, but it's easier to
 135          * allocate one-entry hash lists than it is to check all over the
 136          * place for hashbase == NULL.
 137          */
 138         in_pcbinfo_init(&V_divcbinfo, &divcbstor, 1, 1);
 139 }
 140 VNET_SYSINIT(div_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_init, NULL);
 141
 142 static void
 143 div_destroy(void *unused __unused)
 144 {
 145
 146         in_pcbinfo_destroy(&V_divcbinfo);
 147 }
 148 VNET_SYSUNINIT(divert, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_destroy, NULL);
 149
 150 static bool
 151 div_port_match(const struct inpcb *inp, void *v)
 152 {
 153         uint16_t nport = *(uint16_t *)v;
 154
 155         return (inp->inp_lport == nport);
 156 }
 157
 158 /*
 159  * Divert a packet by passing it up to the divert socket at port 'port'.
 160  */
 161 static void
 162 divert_packet(struct mbuf *m, bool incoming)
 163 {
 164 #if defined(SCTP) || defined(SCTP_SUPPORT)
 165         struct ip *ip;
 166 #endif
 167         struct inpcb *inp;
 168         struct socket *sa;
 169         u_int16_t nport;
 170         struct sockaddr_in divsrc;
 171         struct inpcb_iterator inpi = INP_ITERATOR(&V_divcbinfo,
 172             INPLOOKUP_RLOCKPCB, div_port_match, &nport);
 173         struct m_tag *mtag;
 174
 175         NET_EPOCH_ASSERT();
 176
 177         mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
 178         if (mtag == NULL) {
 179                 m_freem(m);
 180                 return;
 181         }
 182         /* Assure header */
 183         if (m->m_len < sizeof(struct ip) &&
 184             (m = m_pullup(m, sizeof(struct ip))) == NULL)
 185                 return;
 186
 187         /* Delayed checksums are currently not compatible with divert. */
 188         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 189                 in_delayed_cksum(m);
 190                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 191         }
 192 #if defined(SCTP) || defined(SCTP_SUPPORT)
 193         if (m->m_pkthdr.csum_flags & CSUM_SCTP) {
 194                 ip = mtod(m, struct ip *);
 195                 sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
 196                 m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 197         }
 198 #endif
 199 #ifdef INET6
 200         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 201                 in6_delayed_cksum(m, m->m_pkthdr.len -
 202                     sizeof(struct ip6_hdr), sizeof(struct ip6_hdr));
 203                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
 204         }
 205 #if defined(SCTP) || defined(SCTP_SUPPORT)
 206         if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) {
 207                 sctp_delayed_cksum(m, sizeof(struct ip6_hdr));
 208                 m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
 209         }
 210 #endif
 211 #endif /* INET6 */
 212         bzero(&divsrc, sizeof(divsrc));
 213         divsrc.sin_len = sizeof(divsrc);
 214         divsrc.sin_family = AF_INET;
 215         /* record matching rule, in host format */
 216         divsrc.sin_port = ((struct ipfw_rule_ref *)(mtag+1))->rulenum;
 217         /*
 218          * Record receive interface address, if any.
 219          * But only for incoming packets.
 220          */
 221         if (incoming) {
 222                 struct ifaddr *ifa;
 223                 struct ifnet *ifp;
 224
 225                 /* Sanity check */
 226                 M_ASSERTPKTHDR(m);
 227
 228                 /* Find IP address for receive interface */
 229                 ifp = m->m_pkthdr.rcvif;
 230                 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 231                         if (ifa->ifa_addr->sa_family != AF_INET)
 232                                 continue;
 233                         divsrc.sin_addr =
 234                             ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr;
 235                         break;
 236                 }
 237         }
 238         /*
 239          * Record the incoming interface name whenever we have one.
 240          */
 241         if (m->m_pkthdr.rcvif) {
 242                 /*
 243                  * Hide the actual interface name in there in the
 244                  * sin_zero array. XXX This needs to be moved to a
 245                  * different sockaddr type for divert, e.g.
 246                  * sockaddr_div with multiple fields like
 247                  * sockaddr_dl. Presently we have only 7 bytes
 248                  * but that will do for now as most interfaces
 249                  * are 4 or less + 2 or less bytes for unit.
 250                  * There is probably a faster way of doing this,
 251                  * possibly taking it from the sockaddr_dl on the iface.
 252                  * This solves the problem of a P2P link and a LAN interface
 253                  * having the same address, which can result in the wrong
 254                  * interface being assigned to the packet when fed back
 255                  * into the divert socket. Theoretically if the daemon saves
 256                  * and re-uses the sockaddr_in as suggested in the man pages,
 257                  * this iface name will come along for the ride.
 258                  * (see div_output for the other half of this.)
 259                  */
 260                 strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname,
 261                     sizeof(divsrc.sin_zero));
 262         }
 263
 264         /* Put packet on socket queue, if any */
 265         sa = NULL;
 266         /* nport is inp_next's context. */
 267         nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info));
 268         while ((inp = inp_next(&inpi)) != NULL) {
 269                 sa = inp->inp_socket;
 270                 SOCKBUF_LOCK(&sa->so_rcv);
 271                 if (sbappendaddr_locked(&sa->so_rcv,
 272                     (struct sockaddr *)&divsrc, m, NULL) == 0) {
 273                         soroverflow_locked(sa);
 274                         sa = NULL;      /* force mbuf reclaim below */
 275                 } else
 276                         sorwakeup_locked(sa);
 277                 /* XXX why does only one socket match? */
 278                 INP_RUNLOCK(inp);
 279                 break;
 280         }
 281         if (sa == NULL) {
 282                 m_freem(m);
 283                 KMOD_IPSTAT_INC(ips_noproto);
 284                 KMOD_IPSTAT_DEC(ips_delivered);
 285         }
 286 }
 287
 288 /*
 289  * Deliver packet back into the IP processing machinery.
 290  *
 291  * If no address specified, or address is 0.0.0.0, send to ip_output();
 292  * otherwise, send to ip_input() and mark as having been received on
 293  * the interface with that address.
 294  */
 295 static int
 296 div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
 297     struct mbuf *control)
 298 {
 299         struct epoch_tracker et;
 300         const struct ip *ip;
 301         struct m_tag *mtag;
 302         struct ipfw_rule_ref *dt;
 303         int error, family;
 304
 305         if (control) {
 306                 m_freem(control);               /* XXX */
 307                 control = NULL;
 308         }
 309
 310         if (sin != NULL) {
 311                 if (sin->sin_family != AF_INET) {
 312                         m_freem(m);
 313                         return (EAFNOSUPPORT);
 314                 }
 315                 if (sin->sin_len != sizeof(*sin)) {
 316                         m_freem(m);
 317                         return (EINVAL);
 318                 }
 319         }
 320
 321         /*
 322          * An mbuf may hasn't come from userland, but we pretend
 323          * that it has.
 324          */
 325         m->m_pkthdr.rcvif = NULL;
 326         m->m_nextpkt = NULL;
 327         M_SETFIB(m, so->so_fibnum);
 328
 329         mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
 330         if (mtag == NULL) {
 331                 /* this should be normal */
 332                 mtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
 333                     sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
 334                 if (mtag == NULL) {
 335                         m_freem(m);
 336                         return (ENOBUFS);
 337                 }
 338                 m_tag_prepend(m, mtag);
 339         }
 340         dt = (struct ipfw_rule_ref *)(mtag+1);
 341
 342         /* Loopback avoidance and state recovery */
 343         if (sin) {
 344                 int i;
 345
 346                 /* set the starting point. We provide a non-zero slot,
 347                  * but a non_matching chain_id to skip that info and use
 348                  * the rulenum/rule_id.
 349                  */
 350                 dt->slot = 1; /* dummy, chain_id is invalid */
 351                 dt->chain_id = 0;
 352                 dt->rulenum = sin->sin_port+1; /* host format ? */
 353                 dt->rule_id = 0;
 354                 /* XXX: broken for IPv6 */
 355                 /*
 356                  * Find receive interface with the given name, stuffed
 357                  * (if it exists) in the sin_zero[] field.
 358                  * The name is user supplied data so don't trust its size
 359                  * or that it is zero terminated.
 360                  */
 361                 for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++)
 362                         ;
 363                 if ( i > 0 && i < sizeof(sin->sin_zero))
 364                         m->m_pkthdr.rcvif = ifunit(sin->sin_zero);
 365         }
 366
 367         ip = mtod(m, struct ip *);
 368         switch (ip->ip_v) {
 369         case IPVERSION:
 370                 family = AF_INET;
 371                 break;
 372 #ifdef INET6
 373         case IPV6_VERSION >> 4:
 374                 family = AF_INET6;
 375                 break;
 376 #endif
 377         default:
 378                 m_freem(m);
 379                 return (EAFNOSUPPORT);
 380         }
 381
 382         /* Reinject packet into the system as incoming or outgoing */
 383         NET_EPOCH_ENTER(et);
 384         if (!sin || sin->sin_addr.s_addr == 0) {
 385                 dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT;
 386                 error = div_output_outbound(family, so, m);
 387         } else {
 388                 dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN;
 389                 error = div_output_inbound(family, so, m, sin);
 390         }
 391         NET_EPOCH_EXIT(et);
 392
 393         return (error);
 394 }
 395
 396 /*
 397  * Sends mbuf @m to the wire via ip[6]_output().
 398  *
 399  * Returns 0 on success or an errno value on failure.  @m is always consumed.
 400  */
 401 static int
 402 div_output_outbound(int family, struct socket *so, struct mbuf *m)
 403 {
 404         struct ip *const ip = mtod(m, struct ip *);
 405         struct mbuf *options;
 406         struct inpcb *inp;
 407         int error;
 408
 409         inp = sotoinpcb(so);
 410         INP_RLOCK(inp);
 411         switch (family) {
 412         case AF_INET:
 413                 /*
 414                  * Don't allow both user specified and setsockopt
 415                  * options, and don't allow packet length sizes that
 416                  * will crash.
 417                  */
 418                 if ((((ip->ip_hl << 2) != sizeof(struct ip)) &&
 419                     inp->inp_options != NULL) ||
 420                     ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
 421                         INP_RUNLOCK(inp);
 422                         m_freem(m);
 423                         return (EINVAL);
 424                 }
 425                 break;
 426 #ifdef INET6
 427         case AF_INET6:
 428             {
 429                 struct ip6_hdr *const ip6 = mtod(m, struct ip6_hdr *);
 430
 431                 /* Don't allow packet length sizes that will crash */
 432                 if (((u_short)ntohs(ip6->ip6_plen) > m->m_pkthdr.len)) {
 433                         INP_RUNLOCK(inp);
 434                         m_freem(m);
 435                         return (EINVAL);
 436                 }
 437                 break;
 438             }
 439 #endif
 440         }
 441
 442         /* Send packet to output processing */
 443         KMOD_IPSTAT_INC(ips_rawout);            /* XXX */
 444
 445 #ifdef MAC
 446         mac_inpcb_create_mbuf(inp, m);
 447 #endif
 448         /*
 449          * Get ready to inject the packet into ip_output().
 450          * Just in case socket options were specified on the
 451          * divert socket, we duplicate them.  This is done
 452          * to avoid having to hold the PCB locks over the call
 453          * to ip_output(), as doing this results in a number of
 454          * lock ordering complexities.
 455          *
 456          * Note that we set the multicast options argument for
 457          * ip_output() to NULL since it should be invariant that
 458          * they are not present.
 459          */
 460         KASSERT(inp->inp_moptions == NULL,
 461             ("multicast options set on a divert socket"));
 462         /*
 463          * XXXCSJP: It is unclear to me whether or not it makes
 464          * sense for divert sockets to have options.  However,
 465          * for now we will duplicate them with the INP locks
 466          * held so we can use them in ip_output() without
 467          * requring a reference to the pcb.
 468          */
 469         options = NULL;
 470         if (inp->inp_options != NULL) {
 471                 options = m_dup(inp->inp_options, M_NOWAIT);
 472                 if (options == NULL) {
 473                         INP_RUNLOCK(inp);
 474                         m_freem(m);
 475                         return (ENOBUFS);
 476                 }
 477         }
 478         INP_RUNLOCK(inp);
 479
 480         error = 0;
 481         switch (family) {
 482         case AF_INET:
 483                 error = ip_output(m, options, NULL,
 484                     ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0)
 485                     | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL);
 486                 break;
 487 #ifdef INET6
 488         case AF_INET6:
 489                 error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
 490                 break;
 491 #endif
 492         }
 493         if (options != NULL)
 494                 m_freem(options);
 495
 496         return (error);
 497 }
 498
 499 /*
 500  * Schedules mbuf @m for local processing via IPv4/IPv6 netisr queue.
 501  *
 502  * Returns 0 on success or an errno value on failure.  @m is always consumed.
 503  */
 504 static int
 505 div_output_inbound(int family, struct socket *so, struct mbuf *m,
 506     struct sockaddr_in *sin)
 507 {
 508         const struct ip *ip;
 509         struct ifaddr *ifa;
 510
 511         if (m->m_pkthdr.rcvif == NULL) {
 512                 /*
 513                  * No luck with the name, check by IP address.
 514                  * Clear the port and the ifname to make sure
 515                  * there are no distractions for ifa_ifwithaddr.
 516                  */
 517
 518                 /* XXX: broken for IPv6 */
 519                 bzero(sin->sin_zero, sizeof(sin->sin_zero));
 520                 sin->sin_port = 0;
 521                 ifa = ifa_ifwithaddr((struct sockaddr *) sin);
 522                 if (ifa == NULL) {
 523                         m_freem(m);
 524                         return (EADDRNOTAVAIL);
 525                 }
 526                 m->m_pkthdr.rcvif = ifa->ifa_ifp;
 527         }
 528 #ifdef MAC
 529         mac_socket_create_mbuf(so, m);
 530 #endif
 531         /* Send packet to input processing via netisr */
 532         switch (family) {
 533         case AF_INET:
 534                 ip = mtod(m, struct ip *);
 535                 /*
 536                  * Restore M_BCAST flag when destination address is
 537                  * broadcast. It is expected by ip_tryforward().
 538                  */
 539                 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)))
 540                         m->m_flags |= M_MCAST;
 541                 else if (in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
 542                         m->m_flags |= M_BCAST;
 543                 netisr_queue_src(NETISR_IP, (uintptr_t)so, m);
 544                 break;
 545 #ifdef INET6
 546         case AF_INET6:
 547                 netisr_queue_src(NETISR_IPV6, (uintptr_t)so, m);
 548                 break;
 549 #endif
 550         default:
 551                 m_freem(m);
 552                 return (EINVAL);
 553         }
 554
 555         return (0);
 556 }
 557
 558 static int
 559 div_attach(struct socket *so, int proto, struct thread *td)
 560 {
 561         struct inpcb *inp;
 562         int error;
 563
 564         inp  = sotoinpcb(so);
 565         KASSERT(inp == NULL, ("div_attach: inp != NULL"));
 566         if (td != NULL) {
 567                 error = priv_check(td, PRIV_NETINET_DIVERT);
 568                 if (error)
 569                         return (error);
 570         }
 571         error = soreserve(so, div_sendspace, div_recvspace);
 572         if (error)
 573                 return error;
 574         error = in_pcballoc(so, &V_divcbinfo);
 575         if (error)
 576                 return error;
 577         inp = (struct inpcb *)so->so_pcb;
 578         inp->inp_ip_p = proto;
 579         inp->inp_flags |= INP_HDRINCL;
 580         INP_WUNLOCK(inp);
 581         return 0;
 582 }
 583
 584 static void
 585 div_detach(struct socket *so)
 586 {
 587         struct inpcb *inp;
 588
 589         inp = sotoinpcb(so);
 590         KASSERT(inp != NULL, ("div_detach: inp == NULL"));
 591         INP_WLOCK(inp);
 592         in_pcbdetach(inp);
 593         in_pcbfree(inp);
 594 }
 595
 596 static int
 597 div_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 598 {
 599         struct inpcb *inp;
 600         int error;
 601
 602         inp = sotoinpcb(so);
 603         KASSERT(inp != NULL, ("div_bind: inp == NULL"));
 604         /* in_pcbbind assumes that nam is a sockaddr_in
 605          * and in_pcbbind requires a valid address. Since divert
 606          * sockets don't we need to make sure the address is
 607          * filled in properly.
 608          * XXX -- divert should not be abusing in_pcbind
 609          * and should probably have its own family.
 610          */
 611         if (nam->sa_family != AF_INET)
 612                 return EAFNOSUPPORT;
 613         if (nam->sa_len != sizeof(struct sockaddr_in))
 614                 return EINVAL;
 615         ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY;
 616         INP_WLOCK(inp);
 617         INP_HASH_WLOCK(&V_divcbinfo);
 618         error = in_pcbbind(inp, nam, td->td_ucred);
 619         INP_HASH_WUNLOCK(&V_divcbinfo);
 620         INP_WUNLOCK(inp);
 621         return error;
 622 }
 623
 624 static int
 625 div_shutdown(struct socket *so)
 626 {
 627         struct inpcb *inp;
 628
 629         inp = sotoinpcb(so);
 630         KASSERT(inp != NULL, ("div_shutdown: inp == NULL"));
 631         INP_WLOCK(inp);
 632         socantsendmore(so);
 633         INP_WUNLOCK(inp);
 634         return 0;
 635 }
 636
 637 static int
 638 div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
 639     struct mbuf *control, struct thread *td)
 640 {
 641
 642         /* Packet must have a header (but that's about it) */
 643         if (m->m_len < sizeof (struct ip) &&
 644             (m = m_pullup(m, sizeof (struct ip))) == NULL) {
 645                 KMOD_IPSTAT_INC(ips_toosmall);
 646                 if (control != NULL)
 647                         m_freem(control);
 648                 m_freem(m);
 649                 return EINVAL;
 650         }
 651
 652         /* Send packet */
 653         return div_output(so, m, (struct sockaddr_in *)nam, control);
 654 }
 655
 656 static int
 657 div_pcblist(SYSCTL_HANDLER_ARGS)
 658 {
 659         struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_divcbinfo,
 660             INPLOOKUP_RLOCKPCB);
 661         struct xinpgen xig;
 662         struct inpcb *inp;
 663         int error;
 664
 665         if (req->newptr != 0)
 666                 return EPERM;
 667
 668         if (req->oldptr == 0) {
 669                 int n;
 670
 671                 n = V_divcbinfo.ipi_count;
 672                 n += imax(n / 8, 10);
 673                 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
 674                 return 0;
 675         }
 676
 677         if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
 678                 return (error);
 679
 680         bzero(&xig, sizeof(xig));
 681         xig.xig_len = sizeof xig;
 682         xig.xig_count = V_divcbinfo.ipi_count;
 683         xig.xig_gen = V_divcbinfo.ipi_gencnt;
 684         xig.xig_sogen = so_gencnt;
 685         error = SYSCTL_OUT(req, &xig, sizeof xig);
 686         if (error)
 687                 return error;
 688
 689         while ((inp = inp_next(&inpi)) != NULL) {
 690                 if (inp->inp_gencnt <= xig.xig_gen) {
 691                         struct xinpcb xi;
 692
 693                         in_pcbtoxinpcb(inp, &xi);
 694                         error = SYSCTL_OUT(req, &xi, sizeof xi);
 695                         if (error) {
 696                                 INP_RUNLOCK(inp);
 697                                 break;
 698                         }
 699                 }
 700         }
 701
 702         if (!error) {
 703                 /*
 704                  * Give the user an updated idea of our state.
 705                  * If the generation differs from what we told
 706                  * her before, she knows that something happened
 707                  * while we were processing this request, and it
 708                  * might be necessary to retry.
 709                  */
 710                 xig.xig_gen = V_divcbinfo.ipi_gencnt;
 711                 xig.xig_sogen = so_gencnt;
 712                 xig.xig_count = V_divcbinfo.ipi_count;
 713                 error = SYSCTL_OUT(req, &xig, sizeof xig);
 714         }
 715
 716         return (error);
 717 }
 718
#ifdef SYSCTL_NODE
/* The net.inet.divert sysctl subtree. */
static SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPDIVERT");
/* Read-only opaque node net.inet.divert.pcblist, served by div_pcblist(). */
SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist,
   CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, 0, div_pcblist, "S,xinpcb",
    "List of active divert sockets");
#endif
 728
 729 static struct protosw div_protosw = {
 730         .pr_type =              SOCK_RAW,
 731         .pr_protocol =          IPPROTO_DIVERT,
 732         .pr_flags =             PR_ATOMIC|PR_ADDR,
 733         .pr_attach =            div_attach,
 734         .pr_bind =              div_bind,
 735         .pr_control =           in_control,
 736         .pr_detach =            div_detach,
 737         .pr_peeraddr =          in_getpeeraddr,
 738         .pr_send =              div_send,
 739         .pr_shutdown =          div_shutdown,
 740         .pr_sockaddr =          in_getsockaddr,
 741         .pr_sosetlabel =        in_pcbsosetlabel
 742 };
 743
 744 static int
 745 div_modevent(module_t mod, int type, void *unused)
 746 {
 747         int err = 0;
 748
 749         switch (type) {
 750         case MOD_LOAD:
 751                 /*
 752                  * Protocol will be initialized by pf_proto_register().
 753                  */
 754                 err = protosw_register(&inetdomain, &div_protosw);
 755                 if (err != 0)
 756                         return (err);
 757                 ip_divert_ptr = divert_packet;
 758                 break;
 759         case MOD_QUIESCE:
 760                 /*
 761                  * IPDIVERT may normally not be unloaded because of the
 762                  * potential race conditions.  Tell kldunload we can't be
 763                  * unloaded unless the unload is forced.
 764                  */
 765                 err = EPERM;
 766                 break;
 767         case MOD_UNLOAD:
 768                 /*
 769                  * Forced unload.
 770                  *
 771                  * Module ipdivert can only be unloaded if no sockets are
 772                  * connected.  Maybe this can be changed later to forcefully
 773                  * disconnect any open sockets.
 774                  *
 775                  * XXXRW: Note that there is a slight race here, as a new
 776                  * socket open request could be spinning on the lock and then
 777                  * we destroy the lock.
 778                  */
 779                 INP_INFO_WLOCK(&V_divcbinfo);
 780                 if (V_divcbinfo.ipi_count != 0) {
 781                         err = EBUSY;
 782                         INP_INFO_WUNLOCK(&V_divcbinfo);
 783                         break;
 784                 }
 785                 ip_divert_ptr = NULL;
 786                 err = protosw_unregister(&div_protosw);
 787                 INP_INFO_WUNLOCK(&V_divcbinfo);
 788 #ifndef VIMAGE
 789                 div_destroy(NULL);
 790 #endif
 791                 break;
 792         default:
 793                 err = EOPNOTSUPP;
 794                 break;
 795         }
 796         return err;
 797 }
 798
/* Module description: name, event handler, and no extra argument. */
static moduledata_t ipdivertmod = {
        "ipdivert",
        div_modevent,
        0
};

/* Register the module, its dependency on ipfw (version 3), and its version. */
DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY);
MODULE_DEPEND(ipdivert, ipfw, 3, 3, 3);
MODULE_VERSION(ipdivert, 1);