2 * Copyright (c) 2010-2012 Citrix Inc.
3 * Copyright (c) 2009-2012 Microsoft Corp.
4 * Copyright (c) 2012 NetApp Inc.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice unmodified, this list of conditions, and the following
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * Copyright (c) 2004-2006 Kip Macy
31 * All rights reserved.
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 #include <sys/param.h>
56 #include <sys/systm.h>
57 #include <sys/sockio.h>
59 #include <sys/malloc.h>
60 #include <sys/module.h>
61 #include <sys/kernel.h>
62 #include <sys/socket.h>
63 #include <sys/queue.h>
68 #include <net/if_arp.h>
69 #include <net/ethernet.h>
70 #include <net/if_dl.h>
71 #include <net/if_media.h>
75 #include <net/if_types.h>
76 #include <net/if_vlan_var.h>
79 #include <netinet/in_systm.h>
80 #include <netinet/in.h>
81 #include <netinet/ip.h>
82 #include <netinet/if_ether.h>
85 #include <vm/vm_param.h>
86 #include <vm/vm_kern.h>
89 #include <machine/bus.h>
90 #include <machine/resource.h>
91 #include <machine/frame.h>
92 #include <machine/vmparam.h>
96 #include <sys/mutex.h>
97 #include <sys/errno.h>
98 #include <sys/types.h>
99 #include <machine/atomic.h>
101 #include <machine/intr_machdep.h>
103 #include <dev/hyperv/include/hyperv.h>
104 #include "hv_net_vsc.h"
105 #include "hv_rndis.h"
106 #include "hv_rndis_filter.h"
109 /* Short for Hyper-V network interface */
110 #define NETVSC_DEVNAME "hn"
113 * It looks like offset 0 of buf is reserved to hold the softc pointer.
114 * The sc pointer is evidently not needed, and is not presently populated.
115 * The packet offset is where the netvsc_packet starts in the buffer.
117 #define HV_NV_SC_PTR_OFFSET_IN_BUF 0 /* byte offset in the TX buffer reserved for the softc pointer */
118 #define HV_NV_PACKET_OFFSET_IN_BUF 16 /* byte offset of the netvsc_packet; subtracted in netvsc_xmit_completion() to recover the buffer start */
125 struct hv_netvsc_driver_context {
130 * Be aware that this sleepable mutex will exhibit WITNESS errors when
131 * certain TCP and ARP code paths are taken. This appears to be a
132 * well-known condition, as all other drivers checked use a sleeping
133 * mutex to protect their transmit paths.
134 * Also be aware that mutexes do not play well with semaphores, and there
135 * is a conflicting semaphore in a certain channel code path.
137 #define NV_LOCK_INIT(_sc, _name) \
138 mtx_init(&(_sc)->hn_lock, _name, MTX_NETWORK_LOCK, MTX_DEF) /* set up the softc's hn_lock as an MTX_DEF mutex */
139 #define NV_LOCK(_sc) mtx_lock(&(_sc)->hn_lock) /* acquire hn_lock */
140 #define NV_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->hn_lock, MA_OWNED) /* assert hn_lock with MA_OWNED */
141 #define NV_UNLOCK(_sc) mtx_unlock(&(_sc)->hn_lock) /* release hn_lock */
142 #define NV_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->hn_lock) /* tear down hn_lock */
149 int hv_promisc_mode = 0; /* normal mode by default */
151 /* The one and only one */
152 static struct hv_netvsc_driver_context g_netvsc_drv;
156 * Forward declarations
158 static void hn_stop(hn_softc_t *sc); /* clears IFF_DRV_RUNNING/IFF_DRV_OACTIVE and calls hv_rf_on_close() */
159 static void hn_ifinit_locked(hn_softc_t *sc); /* calls hv_rf_on_open() and marks the interface running */
160 static void hn_ifinit(void *xsc); /* installed as ifp->if_init; calls hn_ifinit_locked() */
161 static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); /* installed as ifp->if_ioctl */
162 static int hn_start_locked(struct ifnet *ifp); /* dequeues from if_snd and sends via hv_rf_on_send() */
163 static void hn_start(struct ifnet *ifp); /* installed as ifp->if_start; calls hn_start_locked() */
167 * NetVsc driver initialization
168 * Note: Filter init is no longer required
171 netvsc_drv_init(void)
177 * NetVsc global initialization entry point
182 printf("Netvsc initializing... ");
185 * XXXKYS: cleanup initialization
187 if (!cold && !g_netvsc_drv.drv_inited) {
188 g_netvsc_drv.drv_inited = 1;
191 printf("Already initialized!\n");
195 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
196 static const hv_guid g_net_vsc_device_type = {
197 .data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
198 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
202 * Standard probe entry point.
206 netvsc_probe(device_t dev)
210 p = vmbus_get_type(dev);
211 if (!memcmp(p, &g_net_vsc_device_type.data, sizeof(hv_guid))) {
212 device_set_desc(dev, "Synthetic Network Interface");
213 printf("Netvsc probe... DONE \n");
222 * Standard attach entry point.
224 * Called when the driver is loaded. It allocates needed resources,
225 * and initializes the "hardware" and software.
228 netvsc_attach(device_t dev)
230 struct hv_device *device_ctx = vmbus_get_devctx(dev);
231 netvsc_device_info device_info;
233 int unit = device_get_unit(dev);
239 sc = device_get_softc(dev);
244 bzero(sc, sizeof(hn_softc_t));
248 NV_LOCK_INIT(sc, "NetVSCLock");
250 sc->hn_dev_obj = device_ctx;
252 ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
255 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
256 ifp->if_dunit = unit;
257 ifp->if_dname = NETVSC_DEVNAME;
259 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
260 ifp->if_ioctl = hn_ioctl;
261 ifp->if_start = hn_start;
262 ifp->if_init = hn_ifinit;
263 /* needed by hv_rf_on_device_add() code */
264 ifp->if_mtu = ETHERMTU;
265 IFQ_SET_MAXLEN(&ifp->if_snd, 512);
266 ifp->if_snd.ifq_drv_maxlen = 511;
267 IFQ_SET_READY(&ifp->if_snd);
270 * Tell upper layers that we support full VLAN capability.
272 ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header);
273 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
274 ifp->if_capenable |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
276 ret = hv_rf_on_device_add(device_ctx, &device_info);
282 if (device_info.link_state == 0) {
286 ether_ifattach(ifp, device_info.mac_addr);
292 * Standard detach entry point
295 netvsc_detach(device_t dev)
297 struct hv_device *hv_device = vmbus_get_devctx(dev);
299 printf("netvsc_detach\n");
302 * XXXKYS: Need to clean up all our
303 * driver state; this is the driver
308 * XXXKYS: Need to stop outgoing traffic and unregister
312 hv_rf_on_device_remove(hv_device, HV_RF_NV_DESTROY_CHANNEL);
318 * Standard shutdown entry point
321 netvsc_shutdown(device_t dev)
327 * Send completion processing
329 * Note: It looks like offset 0 of buf is reserved to hold the softc
330 * pointer. The sc pointer is not currently needed in this function, and
331 * it is not presently populated by the TX function.
334 netvsc_xmit_completion(void *context)
336 netvsc_packet *packet = (netvsc_packet *)context;
340 mb = (struct mbuf *)packet->compl.send.send_completion_tid;
341 buf = ((uint8_t *)packet) - HV_NV_PACKET_OFFSET_IN_BUF;
351 * Start a transmit of one or more packets
354 hn_start_locked(struct ifnet *ifp)
356 hn_softc_t *sc = ifp->if_softc;
357 struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
359 netvsc_packet *packet;
360 struct mbuf *m_head, *m;
361 struct mbuf *mc_head = NULL;
370 while (!IFQ_DRV_IS_EMPTY(&sc->hn_ifp->if_snd)) {
371 IFQ_DRV_DEQUEUE(&sc->hn_ifp->if_snd, m_head);
372 if (m_head == NULL) {
380 /* Walk the mbuf list computing total length and num frags */
381 for (m = m_head; m != NULL; m = m->m_next) {
389 * Reserve the number of pages requested. Currently,
390 * one page is reserved for the message in the RNDIS
393 num_frags += HV_RF_NUM_TX_RESERVED_PAGE_BUFS;
395 /* If exceeds # page_buffers in netvsc_packet */
396 if (num_frags > NETVSC_PACKET_MAXPAGE) {
403 if (m_head->m_flags & M_VLANTAG) {
404 rppi_size = sizeof(rndis_per_packet_info) +
405 sizeof(ndis_8021q_info);
409 * Allocate a buffer with space for a netvsc packet plus a
410 * number of reserved areas. First comes a (currently 16
411 * bytes, currently unused) reserved data area. Second is
412 * the netvsc_packet, which includes (currently 4) page
413 * buffers. Third (optional) is a rndis_per_packet_info
414 * struct, but only if a VLAN tag should be inserted into the
415 * Ethernet frame by the Hyper-V infrastructure. Fourth is
416 * an area reserved for an rndis_filter_packet struct.
417 * Changed malloc to M_NOWAIT to avoid sleep under spin lock.
418 * No longer reserving extra space for page buffers, as they
419 * are already part of the netvsc_packet.
421 buf = malloc(HV_NV_PACKET_OFFSET_IN_BUF +
422 sizeof(netvsc_packet) + rppi_size +
423 sizeof(rndis_filter_packet),
424 M_DEVBUF, M_ZERO | M_NOWAIT);
431 packet = (netvsc_packet *)(buf + HV_NV_PACKET_OFFSET_IN_BUF);
432 *(vm_offset_t *)buf = HV_NV_SC_PTR_OFFSET_IN_BUF;
435 * extension points to the area reserved for the
436 * rndis_filter_packet, which is placed just after
437 * the netvsc_packet (and rppi struct, if present;
438 * length is updated later).
440 packet->extension = packet + 1;
442 /* Set up the rndis header */
443 packet->page_buf_count = num_frags;
445 /* Initialize it from the mbuf */
446 packet->tot_data_buf_len = len;
449 * If the Hyper-V infrastructure needs to embed a VLAN tag,
450 * initialize netvsc_packet and rppi struct values as needed.
453 /* Lower layers need the VLAN TCI */
454 packet->vlan_tci = m_head->m_pkthdr.ether_vtag;
458 * Fill the page buffers with mbuf info starting at index
459 * HV_RF_NUM_TX_RESERVED_PAGE_BUFS.
461 i = HV_RF_NUM_TX_RESERVED_PAGE_BUFS;
462 for (m = m_head; m != NULL; m = m->m_next) {
465 vtophys(mtod(m, vm_offset_t));
466 packet->page_buffers[i].pfn =
468 packet->page_buffers[i].offset =
469 paddr & (PAGE_SIZE - 1);
470 packet->page_buffers[i].length = m->m_len;
476 * If bpf, copy the mbuf chain. This is less expensive than
477 * it appears; the mbuf clusters are not copied, only their
478 * reference counts are incremented.
479 * Needed to avoid a race condition where the completion
480 * callback is invoked, freeing the mbuf chain, before the
481 * bpf_mtap code has a chance to run.
484 mc_head = m_copypacket(m_head, M_DONTWAIT);
487 /* Set the completion routine */
488 packet->compl.send.on_send_completion = netvsc_xmit_completion;
489 packet->compl.send.send_completion_context = packet;
490 packet->compl.send.send_completion_tid = (uint64_t)m_head;
492 /* Removed critical_enter(), does not appear necessary */
493 ret = hv_rf_on_send(device_ctx, packet);
497 /* if bpf && mc_head, call bpf_mtap code */
499 ETHER_BPF_MTAP(ifp, mc_head);
507 IF_PREPEND(&ifp->if_snd, m_head);
508 ifp->if_drv_flags |= IFF_DRV_OACTIVE;
511 * Null the mbuf pointer so the completion function
512 * does not free the mbuf chain. We just pushed the
513 * mbuf chain back on the if_snd queue.
515 packet->compl.send.send_completion_tid = 0;
518 * Release the resources since we will not get any
521 netvsc_xmit_completion(packet);
524 /* if bpf && mc_head, free the mbuf chain copy */
534 * Link up/down notification
537 netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status)
539 hn_softc_t *sc = device_get_softc(device_obj->device);
553 * Append the specified data to the indicated mbuf chain.
554 * Extend the mbuf chain if the new data does not fit in
557 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
558 * There should be an equivalent in the kernel mbuf code,
559 * but there does not appear to be one yet.
561 * Differs from m_append() in that additional mbufs are
562 * allocated with cluster size MJUMPAGESIZE, and filled
565 * Return 1 if able to complete the job; otherwise 0.
568 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
571 int remainder, space;
573 for (m = m0; m->m_next != NULL; m = m->m_next)
576 space = M_TRAILINGSPACE(m);
579 * Copy into available space.
581 if (space > remainder)
583 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
588 while (remainder > 0) {
590 * Allocate a new mbuf; could check space
591 * and allocate a cluster instead.
593 n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
596 n->m_len = min(MJUMPAGESIZE, remainder);
597 bcopy(cp, mtod(n, caddr_t), n->m_len);
599 remainder -= n->m_len;
603 if (m0->m_flags & M_PKTHDR)
604 m0->m_pkthdr.len += len - remainder;
606 return (remainder == 0);
611 * Called when we receive a data packet from the "wire" on the
614 * Note: This is no longer used as a callback
617 netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet)
619 hn_softc_t *sc = (hn_softc_t *)device_get_softc(device_ctx->device);
621 struct ifnet *ifp = sc->hn_ifp;
626 return (0); /* TODO: KYS how can this be! */
629 ifp = sc->arpcom.ac_ifp;
631 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
636 * Bail out if packet contains more data than configured MTU.
638 if (packet->tot_data_buf_len > (ifp->if_mtu + ETHER_HDR_LEN)) {
643 * Get an mbuf with a cluster. For packets 2K or less,
644 * get a standard 2K cluster. For anything larger, get a
645 * 4K cluster. Any buffers larger than 4K can cause problems
646 * if looped around to the Hyper-V TX channel, so avoid them.
650 if (packet->tot_data_buf_len > MCLBYTES) {
655 m_new = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, size);
661 * Remove trailing junk from RX data buffer.
662 * Fixme: This will not work for multiple Hyper-V RX buffers.
663 * Fortunately, the channel gathers all RX data into one buffer.
665 * L2 frame length, with L2 header, not including CRC
667 packet->page_buffers[0].length = packet->tot_data_buf_len;
670 * Copy the received packet to one or more mbufs.
671 * The copy is required since the memory pointed to by netvsc_packet
672 * cannot be deallocated
674 for (i=0; i < packet->page_buf_count; i++) {
675 /* Shift virtual page number to form virtual page address */
676 uint8_t *vaddr = (uint8_t *)
677 (packet->page_buffers[i].pfn << PAGE_SHIFT);
679 hv_m_append(m_new, packet->page_buffers[i].length,
680 vaddr + packet->page_buffers[i].offset);
683 m_new->m_pkthdr.rcvif = ifp;
685 if ((packet->vlan_tci != 0) &&
686 (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0) {
687 m_new->m_pkthdr.ether_vtag = packet->vlan_tci;
688 m_new->m_flags |= M_VLANTAG;
692 * Note: Moved RX completion back to hv_nv_on_receive() so all
693 * messages (not just data messages) will trigger a response.
698 /* We're not holding the lock here, so don't release it */
699 (*ifp->if_input)(ifp, m_new);
705 * Standard ioctl entry point. Called when the user wants to configure
709 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
711 hn_softc_t *sc = ifp->if_softc;
712 struct ifreq *ifr = (struct ifreq *)data;
713 netvsc_device_info device_info;
714 struct hv_device *hn_dev;
721 error = ether_ioctl(ifp, cmd, data);
724 hn_dev = vmbus_get_devctx(sc->hn_dev);
728 if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) {
733 /* Obtain and record requested MTU */
734 ifp->if_mtu = ifr->ifr_mtu;
737 * We must remove and add back the device to cause the new
738 * MTU to take effect. This includes tearing down, but not
739 * deleting the channel, then bringing it back up.
741 error = hv_rf_on_device_remove(hn_dev, HV_RF_NV_RETAIN_CHANNEL);
746 error = hv_rf_on_device_add(hn_dev, &device_info);
752 hn_ifinit_locked(sc);
758 if (ifp->if_flags & IFF_UP) {
760 * If only the state of the PROMISC flag changed,
761 * then just use the 'set promisc mode' command
762 * instead of reinitializing the entire NIC. Doing
763 * a full re-init means reloading the firmware and
764 * waiting for it to start up, which may take a
768 /* Fixme: Promiscuous mode? */
769 /* No promiscuous mode with Xen */
770 if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
771 ifp->if_flags & IFF_PROMISC &&
772 !(sc->hn_if_flags & IFF_PROMISC)) {
773 /* do something here for Hyper-V */
775 /* XN_SETBIT(sc, XN_RX_MODE, */
776 /* XN_RXMODE_RX_PROMISC); */
777 } else if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
778 !(ifp->if_flags & IFF_PROMISC) &&
779 sc->hn_if_flags & IFF_PROMISC) {
780 /* do something here for Hyper-V */
782 /* XN_CLRBIT(sc, XN_RX_MODE, */
783 /* XN_RXMODE_RX_PROMISC); */
786 hn_ifinit_locked(sc);
788 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
792 sc->hn_if_flags = ifp->if_flags;
797 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
798 if (mask & IFCAP_HWCSUM) {
799 if (IFCAP_HWCSUM & ifp->if_capenable) {
800 ifp->if_capenable &= ~IFCAP_HWCSUM;
802 ifp->if_capenable |= IFCAP_HWCSUM;
810 /* Fixme: Multicast mode? */
811 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
824 error = ether_ioctl(ifp, cmd, data);
835 hn_stop(hn_softc_t *sc)
839 struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
844 printf(" Closing Device ...\n");
846 ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
849 ret = hv_rf_on_close(device_ctx);
853 * FreeBSD transmit entry point
856 hn_start(struct ifnet *ifp)
862 hn_start_locked(ifp);
870 hn_ifinit_locked(hn_softc_t *sc)
873 struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
880 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
886 ret = hv_rf_on_open(device_ctx);
892 ifp->if_drv_flags |= IFF_DRV_RUNNING;
893 ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
902 hn_softc_t *sc = xsc;
905 hn_ifinit_locked(sc);
914 hn_watchdog(struct ifnet *ifp)
919 printf("hn%d: watchdog timeout -- resetting\n", sc->hn_unit);
920 hn_ifinit(sc); /*???*/
925 static device_method_t netvsc_methods[] = {
926 /* Device interface */
927 DEVMETHOD(device_probe, netvsc_probe),
928 DEVMETHOD(device_attach, netvsc_attach),
929 DEVMETHOD(device_detach, netvsc_detach),
930 DEVMETHOD(device_shutdown, netvsc_shutdown),
935 static driver_t netvsc_driver = {
941 static devclass_t netvsc_devclass;
943 DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0); /* register the "hn" driver on the vmbus bus; no module event handler or argument */
944 MODULE_VERSION(hn, 1); /* this module is version 1 */
945 MODULE_DEPEND(hn, vmbus, 1, 1, 1); /* depends on vmbus (min, preferred, max versions all 1) */
946 SYSINIT(netvsc_initx, SI_SUB_KTHREAD_IDLE, SI_ORDER_MIDDLE + 1, netvsc_init,