2 * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 #include "opt_inet6.h"
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/devctl.h>
36 #include <sys/eventhandler.h>
37 #include <sys/kernel.h>
39 #include <sys/module.h>
40 #include <sys/socket.h>
41 #include <sys/sysctl.h>
47 #include <net/ethernet.h>
48 #include <net/infiniband.h>
50 #include <net/if_var.h>
51 #include <net/if_private.h>
52 #include <net/if_dl.h>
53 #include <net/if_media.h>
54 #include <net/if_lagg.h>
55 #include <net/if_llatbl.h>
56 #include <net/if_types.h>
57 #include <net/netisr.h>
58 #include <net/route.h>
59 #include <netinet/if_ether.h>
60 #include <netinet/in.h>
61 #include <netinet/ip6.h>
62 #include <netinet6/in6_var.h>
63 #include <netinet6/nd6.h>
65 #include <security/mac/mac_framework.h>
67 /* if_lagg(4) support */
/*
 * Input hook for lagg ports.  infiniband_input() calls through this pointer
 * for frames arriving on an interface of type IFT_INFINIBANDLAG and carries
 * on with the mbuf it returns; a NULL return is special-cased there.
 * NOTE(review): presumably assigned by if_lagg when it loads -- the input
 * path asserts that the pointer is non-NULL before using it.
 */
68 struct mbuf *(*lagg_input_infiniband_p)(struct ifnet *, struct mbuf *);
/*
 * Build the link-layer multicast address for an IPv4 group.
 *
 * addr      - IPv4 group address as a 32-bit value
 * broadcast - interface broadcast link-layer address; bytes 5, 8 and 9
 *             are read
 * buf       - output buffer; indices up to 19 are written
 *
 * NOTE(review): only part of the body is visible in this view; the bytes
 * of buf not assigned below are presumably filled in by lines not shown.
 */
72 infiniband_ipv4_multicast_map(uint32_t addr,
73 const uint8_t *broadcast, uint8_t *buf)
/* Keep only the low nibble of broadcast[5]. */
78 scope = broadcast[5] & 0xF;
/* High nibble is fixed at 1; low nibble is taken from the broadcast address. */
85 buf[5] = 0x10 | scope;
/* Bytes 8-9 are inherited unchanged from the broadcast address. */
88 buf[8] = broadcast[8];
89 buf[9] = broadcast[9];
/* The last four bytes carry addr, most significant byte first. */
96 buf[16] = (addr >> 24) & 0xff;
97 buf[17] = (addr >> 16) & 0xff;
98 buf[18] = (addr >> 8) & 0xff;
99 buf[19] = addr & 0xff;
/*
 * Build the link-layer multicast address for an IPv6 group.
 *
 * addr      - IPv6 group address; bytes 6..15 of s6_addr are used
 * broadcast - interface broadcast link-layer address; bytes 5, 8 and 9
 *             are read
 * buf       - output buffer; indices up to 19 are written
 *
 * NOTE(review): only part of the body is visible in this view; the bytes
 * of buf not assigned below are presumably filled in by lines not shown.
 */
105 infiniband_ipv6_multicast_map(const struct in6_addr *addr,
106 const uint8_t *broadcast, uint8_t *buf)
/* Keep only the low nibble of broadcast[5]. */
110 scope = broadcast[5] & 0xF;
/* High nibble is fixed at 1; low nibble is taken from the broadcast address. */
117 buf[5] = 0x10 | scope;
/* Bytes 8-9 are inherited unchanged from the broadcast address. */
120 buf[8] = broadcast[8];
121 buf[9] = broadcast[9];
/* buf[10..19] receive the trailing ten bytes of the IPv6 address. */
122 memcpy(&buf[10], &addr->s6_addr[6], 10);
127 * This is for clients that have an infiniband_header in the mbuf.
/*
 * Tap a frame that begins with a struct infiniband_header to BPF,
 * presenting it behind a synthesized Ethernet header instead.
 *
 * ifp - interface whose if_bpf descriptor is tapped
 * mb  - mbuf positioned at the Infiniband header; its data pointer and
 *       lengths are adjusted for the tap and then put back
 *
 * NOTE(review): the statements controlled by the two guards below are not
 * visible in this view; presumably both are early returns.
 */
130 infiniband_bpf_mtap(struct ifnet *ifp, struct mbuf *mb)
132 struct infiniband_header *ibh;
133 struct ether_header eh;
/* Guard: is any BPF listener attached? */
135 if (!bpf_peers_present(ifp->if_bpf))
/* Guard: the whole header must lie in the first mbuf for mtod() below. */
139 if (mb->m_len < sizeof(*ibh))
142 ibh = mtod(mb, struct infiniband_header *);
/*
 * Fake Ethernet header: same protocol type, all-zero source, and
 * ETHER_ADDR_LEN bytes of the Infiniband address starting at offset 4
 * as the destination.
 */
143 eh.ether_type = ibh->ib_protocol;
144 memset(eh.ether_shost, 0, ETHER_ADDR_LEN);
145 memcpy(eh.ether_dhost, ibh->ib_hwaddr + 4, ETHER_ADDR_LEN);
/* Step over the Infiniband header so the tap sees eh followed by payload. */
146 mb->m_data += sizeof(*ibh);
147 mb->m_len -= sizeof(*ibh);
148 mb->m_pkthdr.len -= sizeof(*ibh);
149 bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
/* Reverse the adjustment so the caller's mbuf is as it was. */
150 mb->m_data -= sizeof(*ibh);
151 mb->m_len += sizeof(*ibh);
152 mb->m_pkthdr.len += sizeof(*ibh);
/*
 * Translate the checksum request flags found on src into the matching
 * "already verified" flags and OR them into dst.
 *
 * src - mbuf whose m_pkthdr.csum_flags are inspected
 * dst - mbuf whose m_pkthdr.csum_flags (and possibly csum_data) are updated
 *
 * The output path in this file calls it with src == dst right before
 * looping a packet back via if_simloop().
 * NOTE(review): the declaration of the local csum_flags is not visible in
 * this view; it presumably starts at zero.
 */
156 update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst)
/* IP header checksum requested -> report it as checked and valid. */
160 if (src->m_pkthdr.csum_flags & CSUM_IP)
161 csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID);
/* Delayed payload checksum requested -> report data and pseudo header valid. */
162 if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
163 csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
/* SCTP checksum requested -> report it as valid. */
164 if (src->m_pkthdr.csum_flags & CSUM_SCTP)
165 csum_flags |= CSUM_SCTP_VALID;
166 dst->m_pkthdr.csum_flags |= csum_flags;
/* When the data checksum is marked valid, csum_data is set to 0xffff. */
167 if (csum_flags & CSUM_DATA_VALID)
168 dst->m_pkthdr.csum_data = 0xffff;
172 * Handle link-layer encapsulation requests.
/*
 * Fill req->buf with a struct infiniband_header for the requested family.
 *
 * ifp - interface; its broadcast address is used for broadcast requests
 * req - encapsulation request; rtype, bufsize, family, flags, lladdr and
 *       hdata are read, buf and bufsize are written
 *
 * The header gets the chosen protocol type and INFINIBAND_ADDR_LEN bytes of
 * link-layer address, and req->bufsize is set to the header size.
 * Families without a handler yield EAFNOSUPPORT.
 * NOTE(review): the case labels and the outcomes of the two initial checks
 * are not visible in this view.
 */
175 infiniband_requestencap(struct ifnet *ifp, struct if_encap_req *req)
177 struct infiniband_header *ih;
180 const uint8_t *lladdr;
/* Check: only link-layer encapsulation requests are expected here. */
182 if (req->rtype != IFENCAP_LL)
/* Check: the caller's buffer must hold a full Infiniband header. */
185 if (req->bufsize < INFINIBAND_HDR_LEN)
188 ih = (struct infiniband_header *)req->buf;
189 lladdr = req->lladdr;
192 switch (req->family) {
194 etype = htons(ETHERTYPE_IP);
197 etype = htons(ETHERTYPE_IPV6);
/* ARP payload: stamp the hardware type as Infiniband in the ARP header. */
200 ah = (struct arphdr *)req->hdata;
201 ah->ar_hrd = htons(ARPHRD_INFINIBAND);
/* The ARP opcode decides between the REVARP and ARP protocol types. */
203 switch (ntohs(ah->ar_op)) {
204 case ARPOP_REVREQUEST:
206 etype = htons(ETHERTYPE_REVARP);
211 etype = htons(ETHERTYPE_ARP);
/* Broadcast requests use the interface broadcast address, not req->lladdr. */
215 if (req->flags & IFENCAP_FLAG_BROADCAST)
216 lladdr = ifp->if_broadcastaddr;
219 return (EAFNOSUPPORT);
/* Emit the header and report how many bytes of req->buf were used. */
222 ih->ib_protocol = etype;
224 memcpy(ih->ib_hwaddr, lladdr, INFINIBAND_ADDR_LEN);
225 req->bufsize = sizeof(struct infiniband_header);
/*
 * Work out the link-layer header for a packet headed to dst and store it in
 * phdr, which is treated as a struct infiniband_header.
 *
 * ifp    - outgoing interface
 * m      - packet; its M_BCAST/M_MCAST flags choose between neighbour
 *          resolution and a computed broadcast/multicast address
 * dst    - destination; sa_family selects the handling
 * ro     - optional route, consulted for the address family and RT_HAS_GW
 * phdr   - output buffer for the header
 * pflags - output; assigned RT_MAY_LOOP near the end of the function
 * plle   - handed through to arpresolve()
 *
 * Returns EAFNOSUPPORT for families it cannot handle.
 * NOTE(review): case labels, else branches and the final return are not
 * visible in this view; the comments below cover only the visible lines.
 */
231 infiniband_resolve_addr(struct ifnet *ifp, struct mbuf *m,
232 const struct sockaddr *dst, struct route *ro, uint8_t *phdr,
233 uint32_t *pflags, struct llentry **plle)
235 #if defined(INET) || defined(INET6)
236 struct infiniband_header *ih = (struct infiniband_header *)phdr;
238 uint32_t lleflags = 0;
244 switch (dst->sa_family) {
/* Neither broadcast nor multicast: let ARP resolve the header. */
247 if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) {
248 error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle);
/* Broadcast: the destination is the interface broadcast address. */
250 if (m->m_flags & M_BCAST) {
251 memcpy(ih->ib_hwaddr, ifp->if_broadcastaddr,
252 INFINIBAND_ADDR_LEN);
/* Derive the link-layer group address from the IPv4 destination. */
254 infiniband_ipv4_multicast_map(
255 ((const struct sockaddr_in *)dst)->sin_addr.s_addr,
256 ifp->if_broadcastaddr, ih->ib_hwaddr);
258 ih->ib_protocol = htons(ETHERTYPE_IP);
/* Not multicast: let neighbour discovery resolve the header. */
265 if ((m->m_flags & M_MCAST) == 0) {
266 int af = RO_GET_FAMILY(ro, dst);
267 error = nd6_resolve(ifp, LLE_SF(af, 0), m, dst, phdr,
/* Derive the link-layer group address from the IPv6 destination. */
270 infiniband_ipv6_multicast_map(
271 &((const struct sockaddr_in6 *)dst)->sin6_addr,
272 ifp->if_broadcastaddr, ih->ib_hwaddr);
273 ih->ib_protocol = htons(ETHERTYPE_IPV6);
279 if_printf(ifp, "can't handle af%d\n", dst->sa_family);
282 return (EAFNOSUPPORT);
/* A down next hop reached through a gateway is reported as unreachable. */
285 if (error == EHOSTDOWN) {
286 if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0)
287 error = EHOSTUNREACH;
293 *pflags = RT_MAY_LOOP;
/* NOTE(review): the statement guarded by this test is not visible here. */
294 if (lleflags & LLE_IFADDR)
301 * Infiniband output routine.
/*
 * Transmit path: obtain a link-layer header for the packet (from the
 * route's prepend data, from a cached link-layer entry, or by resolving
 * dst), prepend it and hand the frame to ifp->if_transmit().
 *
 * ifp - outgoing interface
 * m   - packet to send
 * dst - destination address
 * ro  - route; ro_prepend, ro_flags and the address family are consulted
 *
 * Returns the result of if_transmit() or if_simloop() on the visible
 * success paths, or an error from resolution.
 * NOTE(review): many lines of this function (local declarations, error
 * exits, the lle lookup itself) are not visible in this view; the comments
 * below cover only what can be seen.
 */
304 infiniband_output(struct ifnet *ifp, struct mbuf *m,
305 const struct sockaddr *dst, struct route *ro)
/* Scratch space for a header computed by infiniband_resolve_addr(). */
307 uint8_t linkhdr[INFINIBAND_HDR_LEN];
309 struct llentry *lle = NULL;
310 struct infiniband_header *ih;
312 int hlen; /* link layer header length */
322 /* XXX BPF uses ro_prepend */
/* Caller-supplied prepend data takes precedence. */
323 if (ro->ro_prepend != NULL) {
324 phdr = ro->ro_prepend;
/* Unicast only: consider a cached link-layer entry. */
326 } else if (!(m->m_flags & (M_BCAST | M_MCAST))) {
327 if ((ro->ro_flags & RT_LLE_CACHE) != 0) {
330 (lle->la_flags & LLE_VALID) == 0) {
332 lle = NULL; /* redundant */
336 /* if we lookup, keep cache */
340 * Notify LLE code that
344 llentry_provide_feedback(lle);
/* Take header bytes, length and flags from the link-layer entry. */
347 phdr = lle->r_linkdata;
348 hlen = lle->r_hdrlen;
349 pflags = lle->r_flags;
355 error = mac_ifnet_check_transmit(ifp, m);
361 if (ifp->if_flags & IFF_MONITOR) {
/* The interface must be both administratively up and running. */
365 if (!((ifp->if_flags & IFF_UP) &&
366 (ifp->if_drv_flags & IFF_DRV_RUNNING))) {
372 /* No prepend data supplied. Try to calculate ourselves. */
374 hlen = INFINIBAND_HDR_LEN;
375 error = infiniband_resolve_addr(ifp, m, dst, ro, phdr, &pflags,
376 addref ? &lle : NULL);
377 if (addref && lle != NULL)
/*
 * EWOULDBLOCK is reported to the caller as success.
 * NOTE(review): presumably the packet is held pending resolution.
 */
380 return (error == EWOULDBLOCK ? 0 : error);
/* Destination is this host: mark checksums verified and loop it back. */
383 if ((pflags & RT_L2_ME) != 0) {
384 update_mbuf_csumflags(m, m);
385 return (if_simloop(ifp, m, RO_GET_FAMILY(ro, dst), 0));
389 * Add local infiniband header. If no space in first mbuf,
392 M_PREPEND(m, INFINIBAND_HDR_LEN, M_NOWAIT);
/* Copy the header in unless the flags say the packet already carries one. */
397 if ((pflags & RT_HAS_HEADER) == 0) {
398 ih = mtod(m, struct infiniband_header *);
399 memcpy(ih, phdr, hlen);
403 * Queue message on interface, update output statistics if
404 * successful, and start output if interface not yet active.
406 return (ifp->if_transmit(ifp, m));
414 * Process a received Infiniband packet.
/*
 * Receive path for a frame that starts with a struct infiniband_header:
 * classify broadcast/multicast, tap BPF, pass lagg ports through the lagg
 * hook, pick a handler from the protocol field, strip the header and
 * dispatch the payload through netisr.
 *
 * ifp - receiving interface (replaced by m->m_pkthdr.rcvif after the lagg
 *       hook has run)
 * m   - received frame
 *
 * NOTE(review): error exits, the epoch enter/exit calls, most case bodies
 * and the assignments to isr are not visible in this view; the comments
 * below cover only what can be seen.
 */
417 infiniband_input(struct ifnet *ifp, struct mbuf *m)
419 struct infiniband_header *ibh;
420 struct epoch_tracker et;
/* Remember whether the driver is flagged as needing the epoch handled here. */
424 needs_epoch = (ifp->if_flags & IFF_NEEDSEPOCH);
427 * This temporary code is here to prevent epoch unaware and unmarked
428 * drivers to panic the system. Once all drivers are taken care of,
429 * the whole INVARIANTS block should go away.
/* Unflagged driver calling outside the network epoch: complain. */
431 if (!needs_epoch && !in_epoch(net_epoch_preempt)) {
432 static bool printedonce;
437 if_printf(ifp, "called %s w/o net epoch! "
438 "PLEASE file a bug report.", __func__);
446 CURVNET_SET_QUIET(ifp->if_vnet);
447 if (__predict_false(needs_epoch))
/* Frames on an interface that is not up count as input errors. */
450 if ((ifp->if_flags & IFF_UP) == 0) {
451 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
456 ibh = mtod(m, struct infiniband_header *);
459 * Reset layer specific mbuf flags to avoid confusing upper
462 m->m_flags &= ~M_VLANTAG;
/*
 * A multicast destination equal to the interface broadcast address is
 * flagged M_BCAST; M_MCAST and the multicast counter are handled on the
 * following lines.
 */
465 if (INFINIBAND_IS_MULTICAST(ibh->ib_hwaddr)) {
466 if (memcmp(ibh->ib_hwaddr, ifp->if_broadcastaddr,
467 ifp->if_addrlen) == 0)
468 m->m_flags |= M_BCAST;
470 m->m_flags |= M_MCAST;
471 if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
474 /* Let BPF have it before we strip the header. */
475 infiniband_bpf_mtap(ifp, m);
477 /* Allow monitor mode to claim this frame, after stats are updated. */
478 if (ifp->if_flags & IFF_MONITOR) {
483 /* Direct packet to correct FIB based on interface config. */
484 M_SETFIB(m, ifp->if_fib);
486 /* Handle input from a lagg<N> port */
487 if (ifp->if_type == IFT_INFINIBANDLAG) {
488 KASSERT(lagg_input_infiniband_p != NULL,
489 ("%s: if_lagg not loaded!", __func__));
490 m = (*lagg_input_infiniband_p)(ifp, m);
491 if (__predict_false(m == NULL))
/* Continue on whichever interface the hook recorded in the mbuf. */
493 ifp = m->m_pkthdr.rcvif;
497 * Dispatch frame to upper layer.
/* ib_protocol is compared in network byte order against htons() constants. */
499 switch (ibh->ib_protocol) {
501 case htons(ETHERTYPE_IP):
505 case htons(ETHERTYPE_ARP):
506 if (ifp->if_flags & IFF_NOARP) {
507 /* Discard packet if ARP is disabled on interface */
515 case htons(ETHERTYPE_IPV6):
520 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
525 /* Strip off the Infiniband header. */
526 m_adj(m, INFINIBAND_HDR_LEN);
530 * Tag the mbuf with an appropriate MAC label before any other
531 * consumers can get to it.
533 mac_ifnet_create_mbuf(ifp, m);
535 /* Hand the stripped frame to netisr under the chosen isr number. */
536 netisr_dispatch(isr, m);
538 if (__predict_false(needs_epoch))
/*
 * Translate a multicast address into the link-layer address to listen on.
 *
 * ifp  - interface; its broadcast address seeds the mapping
 * llsa - in/out; for IPv4/IPv6 input, *llsa is passed to link_init_sdl()
 *        and then set to the resulting sockaddr_dl
 * sa   - address to translate; sa_family selects the handling
 *
 * Returns EADDRNOTAVAIL when the address is not an acceptable multicast
 * address and EAFNOSUPPORT for families without a handler.
 * NOTE(review): the rest of the parameter list, the case labels and the
 * success returns are not visible in this view.
 */
544 infiniband_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
547 struct sockaddr_dl *sdl;
549 struct sockaddr_in *sin;
552 struct sockaddr_in6 *sin6;
556 switch (sa->sa_family) {
559 * No mapping needed. Just check that it's a valid MC address.
561 sdl = (struct sockaddr_dl *)sa;
562 e_addr = LLADDR(sdl);
563 if (!INFINIBAND_IS_MULTICAST(e_addr))
564 return (EADDRNOTAVAIL);
/* IPv4 input: must be a multicast group, then map it to a link address. */
570 sin = (struct sockaddr_in *)sa;
571 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
572 return (EADDRNOTAVAIL);
573 sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
574 sdl->sdl_alen = INFINIBAND_ADDR_LEN;
575 e_addr = LLADDR(sdl);
576 infiniband_ipv4_multicast_map(
577 sin->sin_addr.s_addr, ifp->if_broadcastaddr, e_addr);
578 *llsa = (struct sockaddr *)sdl;
/* IPv6 input: reject the unspecified address and non-multicast addresses. */
583 sin6 = (struct sockaddr_in6 *)sa;
585 * An IP6 address of 0 means listen to all of the
586 * multicast address used for IP6. This has no meaning
589 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
590 return (EADDRNOTAVAIL);
591 if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
592 return (EADDRNOTAVAIL);
593 sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
594 sdl->sdl_alen = INFINIBAND_ADDR_LEN;
595 e_addr = LLADDR(sdl);
596 infiniband_ipv6_multicast_map(
597 &sin6->sin6_addr, ifp->if_broadcastaddr, e_addr);
598 *llsa = (struct sockaddr *)sdl;
602 return (EAFNOSUPPORT);
/*
 * Set up an ifnet as an Infiniband interface: install the link-layer
 * parameters and handlers from this file, record the addresses, attach
 * BPF and announce the new interface.
 *
 * ifp - interface being attached
 * lla - link-layer address; if_addrlen bytes are copied from it
 * llb - broadcast link-layer address; the pointer itself is stored in
 *       ifp->if_broadcastaddr, no copy is made
 *
 * NOTE(review): several lines (local declarations, the assignment of ifa,
 * the body of the address-scan loop) are not visible in this view.
 */
607 infiniband_ifattach(struct ifnet *ifp, const uint8_t *lla, const uint8_t *llb)
609 struct sockaddr_dl *sdl;
613 ifp->if_addrlen = INFINIBAND_ADDR_LEN;
614 ifp->if_hdrlen = INFINIBAND_HDR_LEN;
615 ifp->if_mtu = INFINIBAND_MTU;
/* Route the generic ifnet entry points to the handlers in this file. */
617 ifp->if_output = infiniband_output;
618 ifp->if_input = infiniband_input;
619 ifp->if_resolvemulti = infiniband_resolvemulti;
620 ifp->if_requestencap = infiniband_requestencap;
/* A baudrate already set (non-zero) is left alone. */
622 if (ifp->if_baudrate == 0)
623 ifp->if_baudrate = IF_Gbps(10); /* default value */
625 ifp->if_broadcastaddr = llb;
/* Record type, length and bytes of the address in the link-level sockaddr. */
628 KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
629 sdl = (struct sockaddr_dl *)ifa->ifa_addr;
630 sdl->sdl_type = IFT_INFINIBAND;
631 sdl->sdl_alen = ifp->if_addrlen;
634 memcpy(LLADDR(sdl), lla, ifp->if_addrlen);
/* Also copy the address into if_hw_addr when that buffer exists. */
636 if (ifp->if_hw_addr != NULL)
637 memcpy(ifp->if_hw_addr, lla, ifp->if_addrlen);
642 /* Attach ethernet compatible network device */
643 bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
645 /* Announce Infiniband MAC address if non-zero. */
646 for (i = 0; i < ifp->if_addrlen; i++)
/* i stops short of if_addrlen only if the loop exited early. */
649 if (i != ifp->if_addrlen)
650 if_printf(ifp, "Infiniband address: %20D\n", lla, ":");
652 /* All necessary bits are set up; announce it now. */
653 EVENTHANDLER_INVOKE(infiniband_ifattach_event, ifp);
/* devctl is only notified when running in the default vnet. */
655 if (IS_DEFAULT_VNET(curvnet))
656 devctl_notify("INFINIBAND", ifp->if_xname, "IFATTACH", NULL);
660 * Perform common duties while detaching an Infiniband interface
663 infiniband_ifdetach(struct ifnet *ifp)
670 infiniband_modevent(module_t mod, int type, void *data)
/*
 * Module description: names the module "if_infiniband" and registers
 * infiniband_modevent() as its event handler.
 */
681 static moduledata_t infiniband_mod = {
682 .name = "if_infiniband",
683 .evhand = &infiniband_modevent,
/* Register the module (SI_SUB_INIT_IF, SI_ORDER_ANY) and publish version 1. */
686 DECLARE_MODULE(if_infiniband, infiniband_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
687 MODULE_VERSION(if_infiniband, 1);