/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"
#include "opt_inet.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/sysctl.h>
#include <sys/buf_ring.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_var.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/if_ether.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/ip6.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>

#include <machine/bus.h>
#include <machine/resource.h>
#include <machine/frame.h>

#include <sys/rman.h>
#include <sys/mutex.h>
#include <sys/errno.h>
#include <sys/types.h>
#include <machine/atomic.h>

#include <machine/intr_machdep.h>

#include <machine/in_cksum.h>

#include <dev/hyperv/include/hyperv.h>
#include "hv_net_vsc.h"
#include "hv_rndis.h"
#include "hv_rndis_filter.h"
/* Short for Hyper-V network interface */
#define NETVSC_DEVNAME	"hn"

/*
 * It looks like offset 0 of buf is reserved to hold the softc pointer.
 * The sc pointer is evidently not needed, and is not presently populated.
 * The packet offset is where the netvsc_packet starts in the buffer.
 */
#define HV_NV_SC_PTR_OFFSET_IN_BUF	0
#define HV_NV_PACKET_OFFSET_IN_BUF	16
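
/*
 * Illustrative sketch (editorial note, not part of the driver): with
 * the two offsets above, a buffer handed to this layer is laid out as
 *
 *	offset  0:  softc pointer slot (reserved, currently unused)
 *	offset 16:  netvsc_packet
 *
 * This merely restates the definitions above.
 */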
/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_LROENT_CNT_DEF		128

#define HN_RNDIS_MSG_LEN		\
    (sizeof(rndis_msg) +		\
     RNDIS_VLAN_PPI_SIZE +		\
     RNDIS_TSO_PPI_SIZE +		\
     RNDIS_CSUM_PPI_SIZE)
#define HN_RNDIS_MSG_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_MSG_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
#define HN_TX_DATA_SEGCNT_MAX		\
    (NETVSC_PACKET_MAXPAGE - HV_RF_NUM_TX_RESERVED_PAGE_BUFS)

#define HN_DIRECT_TX_SIZE_DEF		128
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
    SLIST_ENTRY(hn_txdesc) link;
#endif
    struct mbuf		*m;
    struct hn_tx_ring	*txr;
    int			refs;
    uint32_t		flags;		/* HN_TXD_FLAG_ */
    netvsc_packet	netvsc_pkt;	/* XXX to be removed */

    bus_dmamap_t	data_dmap;

    bus_addr_t		rndis_msg_paddr;
    rndis_msg		*rndis_msg;
    bus_dmamap_t	rndis_msg_dmap;
};

#define HN_TXD_FLAG_ONLIST	0x1
#define HN_TXD_FLAG_DMAMAP	0x2
/*
 * Only enable UDP checksum offloading when it is on 2012R2 or
 * later.  UDP checksum offloading doesn't work on earlier
 * Windows releases.
 */
#define HN_CSUM_ASSIST_WIN8	(CSUM_IP | CSUM_TCP)
#define HN_CSUM_ASSIST		(CSUM_IP | CSUM_UDP | CSUM_TCP)

#define HN_LRO_LENLIM_DEF	(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)	(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF	1
/*
 * Be aware that this sleepable mutex will exhibit WITNESS errors when
 * certain TCP and ARP code paths are taken.  This appears to be a
 * well-known condition, as all other drivers checked use a sleeping
 * mutex to protect their transmit paths.
 * Also be aware that mutexes do not play well with semaphores, and there
 * is a conflicting semaphore in a certain channel code path.
 */
#define NV_LOCK_INIT(_sc, _name) \
    mtx_init(&(_sc)->hn_lock, _name, MTX_NETWORK_LOCK, MTX_DEF)
#define NV_LOCK(_sc)		mtx_lock(&(_sc)->hn_lock)
#define NV_LOCK_ASSERT(_sc)	mtx_assert(&(_sc)->hn_lock, MA_OWNED)
#define NV_UNLOCK(_sc)		mtx_unlock(&(_sc)->hn_lock)
#define NV_LOCK_DESTROY(_sc)	mtx_destroy(&(_sc)->hn_lock)
int hv_promisc_mode = 0;    /* normal mode by default */

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD, NULL, "Hyper-V network interface");

/* Trust TCP segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust TCP segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust UDP datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust IP packet verification on host side, "
    "when csum info is missing (global setting)");
#if __FreeBSD_version >= 1100045
/* Limit TSO burst size */
static int hn_tso_maxlen = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");
#endif

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_share_tx_taskq = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN,
    &hn_share_tx_taskq, 0, "Enable shared TX taskqueue");

static struct taskqueue	*hn_tx_taskq;

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

static int hn_bind_tx_taskq = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
    &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");
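
/*
 * Example (illustrative): most of the knobs above are CTLFLAG_RDTUN
 * loader tunables under hw.hn, so they are set from /boot/loader.conf
 * rather than at runtime, e.g.:
 *
 *	hw.hn.trust_hosttcp=0
 *	hw.hn.tx_chimney_size=4096
 *	hw.hn.bind_tx_taskq=1
 *
 * The values shown are examples only, not recommendations;
 * use_txdesc_bufring is read-only and merely reports the compile-time
 * HN_USE_TXDESC_BUFRING choice.
 */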
/*
 * Forward declarations
 */
static void hn_stop(hn_softc_t *sc);
static void hn_ifinit_locked(hn_softc_t *sc);
static void hn_ifinit(void *xsc);
static int  hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
static int  hn_start_locked(struct hn_tx_ring *txr, int len);
static void hn_start(struct ifnet *ifp);
static void hn_start_txeof(struct hn_tx_ring *);
static int  hn_ifmedia_upd(struct ifnet *ifp);
static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr);
static int  hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int  hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
static int  hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int  hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS);
static int  hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int  hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
static int  hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int  hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int  hn_check_iplen(const struct mbuf *, int);
static int  hn_create_tx_ring(struct hn_softc *, int);
static void hn_destroy_tx_ring(struct hn_tx_ring *);
static int  hn_create_tx_data(struct hn_softc *);
static void hn_destroy_tx_data(struct hn_softc *);
static void hn_start_taskfunc(void *, int);
static void hn_start_txeof_taskfunc(void *, int);
static void hn_stop_tx_tasks(struct hn_softc *);
static int  hn_encap(struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **);
static void hn_create_rx_data(struct hn_softc *sc);
static void hn_destroy_rx_data(struct hn_softc *sc);
static void hn_set_tx_chimney_size(struct hn_softc *, int);
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

    return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
    struct hn_softc *sc = ifp->if_softc;

    ifmr->ifm_status = IFM_AVALID;
    ifmr->ifm_active = IFM_ETHER;

    if (!sc->hn_carrier) {
        ifmr->ifm_active |= IFM_NONE;
        return;
    }
    ifmr->ifm_status |= IFM_ACTIVE;
    ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}
/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const hv_guid g_net_vsc_device_type = {
    .data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
             0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};
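
/*
 * Byte-order note (illustrative): the first three GUID fields are
 * stored little-endian, so F8615163 appears as 63 51 61 F8, DF3E as
 * 3E DF, and 46c5 as c5 46 in the array above; the remaining eight
 * bytes follow in order.
 */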
/*
 * Standard probe entry point.
 */
static int
netvsc_probe(device_t dev)
{
    const char *p;

    p = vmbus_get_type(dev);
    if (!memcmp(p, &g_net_vsc_device_type.data, sizeof(hv_guid))) {
        device_set_desc(dev, "Synthetic Network Interface");
        if (bootverbose)
            printf("Netvsc probe... DONE\n");

        return (BUS_PROBE_DEFAULT);
    }

    return (ENXIO);
}
/*
 * Standard attach entry point.
 *
 * Called when the driver is loaded.  It allocates needed resources,
 * and initializes the "hardware" and software.
 */
static int
netvsc_attach(device_t dev)
{
    struct hv_device *device_ctx = vmbus_get_devctx(dev);
    netvsc_device_info device_info;
    hn_softc_t *sc;
    int unit = device_get_unit(dev);
    struct ifnet *ifp = NULL;
    int error;
#if __FreeBSD_version >= 1100045
    int tso_maxlen;
#endif

    sc = device_get_softc(dev);
    if (sc == NULL) {
        return (ENOMEM);
    }

    bzero(sc, sizeof(hn_softc_t));
    sc->hn_unit = unit;
    sc->hn_dev = dev;

    if (hn_tx_taskq == NULL) {
        sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
            taskqueue_thread_enqueue, &sc->hn_tx_taskq);
        if (hn_bind_tx_taskq >= 0) {
            int cpu = hn_bind_tx_taskq;
            cpuset_t cpu_set;

            if (cpu > mp_ncpus - 1)
                cpu = mp_ncpus - 1;
            CPU_SETOF(cpu, &cpu_set);
            taskqueue_start_threads_cpuset(&sc->hn_tx_taskq, 1,
                PI_NET, &cpu_set, "%s tx",
                device_get_nameunit(dev));
        } else {
            taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET,
                "%s tx", device_get_nameunit(dev));
        }
    } else {
        sc->hn_tx_taskq = hn_tx_taskq;
    }
    NV_LOCK_INIT(sc, "NetVSCLock");

    sc->hn_dev_obj = device_ctx;

    ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
    ifp->if_softc = sc;

    error = hn_create_tx_data(sc);
    if (error)
        goto failed;

    hn_create_rx_data(sc);

    if_initname(ifp, device_get_name(dev), device_get_unit(dev));
    ifp->if_dunit = unit;
    ifp->if_dname = NETVSC_DEVNAME;

    ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
    ifp->if_ioctl = hn_ioctl;
    ifp->if_start = hn_start;
    ifp->if_init = hn_ifinit;
    /* needed by hv_rf_on_device_add() code */
    ifp->if_mtu = ETHERMTU;
    IFQ_SET_MAXLEN(&ifp->if_snd, 512);
    ifp->if_snd.ifq_drv_maxlen = 511;
    IFQ_SET_READY(&ifp->if_snd);

    ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
    ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
    ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
    /* XXX ifmedia_set really should do this for us */
    sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

    /*
     * Tell upper layers that we support full VLAN capability.
     */
    ifp->if_hdrlen = sizeof(struct ether_vlan_header);
    ifp->if_capabilities |=
        IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO |
        IFCAP_LRO;
    ifp->if_capenable |=
        IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO |
        IFCAP_LRO;
    ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist | CSUM_TSO;

    error = hv_rf_on_device_add(device_ctx, &device_info);
    if (error)
        goto failed;

    if (device_info.link_state == 0) {
        sc->hn_carrier = 1;
    }

#if __FreeBSD_version >= 1100045
    tso_maxlen = hn_tso_maxlen;
    if (tso_maxlen <= 0 || tso_maxlen > IP_MAXPACKET)
        tso_maxlen = IP_MAXPACKET;

    ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
    ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
    ifp->if_hw_tsomax = tso_maxlen -
        (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
#endif

    ether_ifattach(ifp, device_info.mac_addr);

#if __FreeBSD_version >= 1100045
    if_printf(ifp, "TSO: %u/%u/%u\n", ifp->if_hw_tsomax,
        ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
#endif

    sc->hn_tx_chimney_max = sc->net_dev->send_section_size;
    hn_set_tx_chimney_size(sc, sc->hn_tx_chimney_max);
    if (hn_tx_chimney_size > 0 &&
        hn_tx_chimney_size < sc->hn_tx_chimney_max)
        hn_set_tx_chimney_size(sc, hn_tx_chimney_size);

    return (0);
failed:
    hn_destroy_tx_data(sc);
    return (error);
}
/*
 * Standard detach entry point
 */
static int
netvsc_detach(device_t dev)
{
    struct hn_softc *sc = device_get_softc(dev);
    struct hv_device *hv_device = vmbus_get_devctx(dev);

    if (bootverbose)
        printf("netvsc_detach\n");

    /*
     * XXXKYS:  Need to clean up all our
     * driver state; this is the driver
     * unloading.
     */

    /*
     * XXXKYS:  Need to stop outgoing traffic and unregister
     * the netdevice.
     */

    hv_rf_on_device_remove(hv_device, HV_RF_NV_DESTROY_CHANNEL);

    hn_stop_tx_tasks(sc);

    ifmedia_removeall(&sc->hn_media);
    hn_destroy_rx_data(sc);
    hn_destroy_tx_data(sc);

    if (sc->hn_tx_taskq != hn_tx_taskq)
        taskqueue_free(sc->hn_tx_taskq);

    return (0);
}
/*
 * Standard shutdown entry point
 */
static int
netvsc_shutdown(device_t dev)
{
    return (0);
}
static __inline int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
    struct mbuf *m = *m_head;
    int error;

    error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
        m, segs, nsegs, BUS_DMA_NOWAIT);
    if (error == EFBIG) {
        struct mbuf *m_new;

        m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
        if (m_new == NULL)
            return ENOBUFS;
        else
            *m_head = m = m_new;
        txr->hn_tx_collapsed++;

        error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
            txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
    }
    if (!error) {
        bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
            BUS_DMASYNC_PREWRITE);
        txd->flags |= HN_TXD_FLAG_DMAMAP;
    }
    return error;
}
static __inline void
hn_txdesc_dmamap_unload(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
    if (txd->flags & HN_TXD_FLAG_DMAMAP) {
        bus_dmamap_sync(txr->hn_tx_data_dtag,
            txd->data_dmap, BUS_DMASYNC_POSTWRITE);
        bus_dmamap_unload(txr->hn_tx_data_dtag,
            txd->data_dmap);
        txd->flags &= ~HN_TXD_FLAG_DMAMAP;
    }
}
static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
    KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
        ("put an onlist txd %#x", txd->flags));

    KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
    if (atomic_fetchadd_int(&txd->refs, -1) != 1)
        return 0;

    hn_txdesc_dmamap_unload(txr, txd);
    if (txd->m != NULL) {
        m_freem(txd->m);
        txd->m = NULL;
    }

    txd->flags |= HN_TXD_FLAG_ONLIST;

#ifndef HN_USE_TXDESC_BUFRING
    mtx_lock_spin(&txr->hn_txlist_spin);
    KASSERT(txr->hn_txdesc_avail >= 0 &&
        txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
        ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
    txr->hn_txdesc_avail++;
    SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
    mtx_unlock_spin(&txr->hn_txlist_spin);
#else
    atomic_add_int(&txr->hn_txdesc_avail, 1);
    buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif

    return 1;
}
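
/*
 * Reference-count sketch (illustrative): a descriptor leaves
 * hn_txdesc_get() below with refs == 1; hn_start_locked() takes an
 * extra reference via hn_txdesc_hold() across hv_nv_on_send(), so the
 * txd stays valid until both the sender's hn_txdesc_put() and the
 * completion path's hn_txdesc_put() (from hn_tx_done()) have run.
 * Only the final put, i.e. the 1 -> 0 transition above, recycles the
 * descriptor onto the free list or buf_ring.
 */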
static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
    struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
    mtx_lock_spin(&txr->hn_txlist_spin);
    txd = SLIST_FIRST(&txr->hn_txlist);
    if (txd != NULL) {
        KASSERT(txr->hn_txdesc_avail > 0,
            ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
        txr->hn_txdesc_avail--;
        SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
    }
    mtx_unlock_spin(&txr->hn_txlist_spin);
#else
    txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

    if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
        atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
        KASSERT(txd->m == NULL && txd->refs == 0 &&
            (txd->flags & HN_TXD_FLAG_ONLIST), ("invalid txd"));
        txd->flags &= ~HN_TXD_FLAG_ONLIST;
        txd->refs = 1;
    }
    return txd;
}
static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

    /* 0->1 transition will never work */
    KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
    atomic_add_int(&txd->refs, 1);
}
static void
hn_tx_done(void *xpkt)
{
    netvsc_packet *packet = xpkt;
    struct hn_txdesc *txd;
    struct hn_tx_ring *txr;

    txd = (struct hn_txdesc *)(uintptr_t)
        packet->compl.send.send_completion_tid;

    txr = txd->txr;
    txr->hn_has_txeof = 1;
    hn_txdesc_put(txr, txd);
}
void
netvsc_channel_rollup(struct hv_device *device_ctx)
{
    struct hn_softc *sc = device_get_softc(device_ctx->device);
    struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; /* TODO: vRSS */
#if defined(INET) || defined(INET6)
    struct hn_rx_ring *rxr = &sc->hn_rx_ring[0]; /* TODO: vRSS */
    struct lro_ctrl *lro = &rxr->hn_lro;
    struct lro_entry *queued;

    while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
        SLIST_REMOVE_HEAD(&lro->lro_active, next);
        tcp_lro_flush(lro, queued);
    }
#endif

    if (!txr->hn_has_txeof)
        return;

    txr->hn_has_txeof = 0;
    txr->hn_txeof(txr);
}
/*
 * If this function fails, then both txd and m_head0 will be freed.
 */
static int
hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0)
{
    bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
    int error, nsegs, i;
    struct mbuf *m_head = *m_head0;
    netvsc_packet *packet;
    rndis_msg *rndis_mesg;
    rndis_packet *rndis_pkt;
    rndis_per_packet_info *rppi;
    uint32_t rndis_msg_size;

    packet = &txd->netvsc_pkt;
    packet->is_data_pkt = TRUE;
    packet->tot_data_buf_len = m_head->m_pkthdr.len;

    /*
     * extension points to the area reserved for the
     * rndis_filter_packet, which is placed just after
     * the netvsc_packet (and rppi struct, if present;
     * length is updated later).
     */
    rndis_mesg = txd->rndis_msg;
    /* XXX not necessary */
    memset(rndis_mesg, 0, HN_RNDIS_MSG_LEN);
    rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG;

    rndis_pkt = &rndis_mesg->msg.packet;
    rndis_pkt->data_offset = sizeof(rndis_packet);
    rndis_pkt->data_length = packet->tot_data_buf_len;
    rndis_pkt->per_pkt_info_offset = sizeof(rndis_packet);

    rndis_msg_size = RNDIS_MESSAGE_SIZE(rndis_packet);

    if (m_head->m_flags & M_VLANTAG) {
        ndis_8021q_info *rppi_vlan_info;

        rndis_msg_size += RNDIS_VLAN_PPI_SIZE;
        rppi = hv_set_rppi_data(rndis_mesg, RNDIS_VLAN_PPI_SIZE,
            ieee_8021q_info);

        rppi_vlan_info = (ndis_8021q_info *)((uint8_t *)rppi +
            rppi->per_packet_info_offset);
        rppi_vlan_info->u1.s1.vlan_id =
            m_head->m_pkthdr.ether_vtag & 0xfff;
    }

    if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
        rndis_tcp_tso_info *tso_info;
        struct ether_vlan_header *eh;
        int ether_len;

        /*
         * XXX need m_pullup and use mtodo
         */
        eh = mtod(m_head, struct ether_vlan_header*);
        if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN))
            ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
        else
            ether_len = ETHER_HDR_LEN;

        rndis_msg_size += RNDIS_TSO_PPI_SIZE;
        rppi = hv_set_rppi_data(rndis_mesg, RNDIS_TSO_PPI_SIZE,
            tcp_large_send_info);

        tso_info = (rndis_tcp_tso_info *)((uint8_t *)rppi +
            rppi->per_packet_info_offset);
        tso_info->lso_v2_xmit.type =
            RNDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;

#ifdef INET
        if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
            struct ip *ip =
                (struct ip *)(m_head->m_data + ether_len);
            unsigned long iph_len = ip->ip_hl << 2;
            struct tcphdr *th =
                (struct tcphdr *)((caddr_t)ip + iph_len);

            tso_info->lso_v2_xmit.ip_version =
                RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV4;
            ip->ip_len = 0;
            ip->ip_sum = 0;

            th->th_sum = in_pseudo(ip->ip_src.s_addr,
                ip->ip_dst.s_addr, htons(IPPROTO_TCP));
        }
#endif
#if defined(INET6) && defined(INET)
        else
#endif
#ifdef INET6
        {
            struct ip6_hdr *ip6 = (struct ip6_hdr *)
                (m_head->m_data + ether_len);
            struct tcphdr *th = (struct tcphdr *)(ip6 + 1);

            tso_info->lso_v2_xmit.ip_version =
                RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV6;
            th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
        }
#endif
        tso_info->lso_v2_xmit.tcp_header_offset = 0;
        tso_info->lso_v2_xmit.mss = m_head->m_pkthdr.tso_segsz;
    } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
        rndis_tcp_ip_csum_info *csum_info;

        rndis_msg_size += RNDIS_CSUM_PPI_SIZE;
        rppi = hv_set_rppi_data(rndis_mesg, RNDIS_CSUM_PPI_SIZE,
            tcpip_chksum_info);
        csum_info = (rndis_tcp_ip_csum_info *)((uint8_t *)rppi +
            rppi->per_packet_info_offset);

        csum_info->xmit.is_ipv4 = 1;
        if (m_head->m_pkthdr.csum_flags & CSUM_IP)
            csum_info->xmit.ip_header_csum = 1;

        if (m_head->m_pkthdr.csum_flags & CSUM_TCP) {
            csum_info->xmit.tcp_csum = 1;
            csum_info->xmit.tcp_header_offset = 0;
        } else if (m_head->m_pkthdr.csum_flags & CSUM_UDP) {
            csum_info->xmit.udp_csum = 1;
        }
    }

    rndis_mesg->msg_len = packet->tot_data_buf_len + rndis_msg_size;
    packet->tot_data_buf_len = rndis_mesg->msg_len;

    /*
     * Chimney send, if the packet could fit into one chimney buffer.
     */
    if (packet->tot_data_buf_len < txr->hn_tx_chimney_size) {
        netvsc_dev *net_dev = txr->hn_sc->net_dev;
        uint32_t send_buf_section_idx;

        send_buf_section_idx =
            hv_nv_get_next_send_section(net_dev);
        if (send_buf_section_idx !=
            NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) {
            uint8_t *dest = ((uint8_t *)net_dev->send_buf +
                (send_buf_section_idx *
                 net_dev->send_section_size));

            memcpy(dest, rndis_mesg, rndis_msg_size);
            dest += rndis_msg_size;
            m_copydata(m_head, 0, m_head->m_pkthdr.len, dest);

            packet->send_buf_section_idx = send_buf_section_idx;
            packet->send_buf_section_size =
                packet->tot_data_buf_len;
            packet->page_buf_count = 0;
            txr->hn_tx_chimney++;
            goto done;
        }
    }

    error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
    if (error) {
        int freed;

        /*
         * This mbuf is not linked w/ the txd yet, so free it now.
         */
        m_freem(m_head);
        *m_head0 = NULL;

        freed = hn_txdesc_put(txr, txd);
        KASSERT(freed != 0,
            ("fail to free txd upon txdma error"));

        txr->hn_txdma_failed++;
        if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1);
        return error;
    }
    *m_head0 = m_head;

    packet->page_buf_count = nsegs + HV_RF_NUM_TX_RESERVED_PAGE_BUFS;

    /* send packet with page buffer */
    packet->page_buffers[0].pfn = atop(txd->rndis_msg_paddr);
    packet->page_buffers[0].offset = txd->rndis_msg_paddr & PAGE_MASK;
    packet->page_buffers[0].length = rndis_msg_size;

    /*
     * Fill the page buffers with mbuf info starting at index
     * HV_RF_NUM_TX_RESERVED_PAGE_BUFS.
     */
    for (i = 0; i < nsegs; ++i) {
        hv_vmbus_page_buffer *pb = &packet->page_buffers[
            i + HV_RF_NUM_TX_RESERVED_PAGE_BUFS];

        pb->pfn = atop(segs[i].ds_addr);
        pb->offset = segs[i].ds_addr & PAGE_MASK;
        pb->length = segs[i].ds_len;
    }

    packet->send_buf_section_idx =
        NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX;
    packet->send_buf_section_size = 0;

done:
    txd->m = m_head;

    /* Set the completion routine */
    packet->compl.send.on_send_completion = hn_tx_done;
    packet->compl.send.send_completion_context = packet;
    packet->compl.send.send_completion_tid = (uint64_t)(uintptr_t)txd;

    return 0;
}
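
/*
 * Completion round trip (illustrative): the txd pointer stashed in
 * send_completion_tid above is recovered unchanged by hn_tx_done():
 *
 *	txd = (struct hn_txdesc *)(uintptr_t)
 *	    packet->compl.send.send_completion_tid;
 *
 * so the host's send-complete message alone is enough to locate and
 * release the matching descriptor.
 */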
/*
 * Start a transmit of one or more packets
 */
static int
hn_start_locked(struct hn_tx_ring *txr, int len)
{
    struct hn_softc *sc = txr->hn_sc;
    struct ifnet *ifp = sc->hn_ifp;
    struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);

    KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
    mtx_assert(&txr->hn_tx_lock, MA_OWNED);

    if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
        IFF_DRV_RUNNING)
        return 0;

    while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
        int error, send_failed = 0;
        struct hn_txdesc *txd;
        struct mbuf *m_head;

        IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
        if (m_head == NULL)
            break;

        if (len > 0 && m_head->m_pkthdr.len > len) {
            /*
             * This sending could be time consuming; let callers
             * dispatch this packet sending (and sending of any
             * follow-up packets) to the TX taskqueue.
             */
            IF_PREPEND(&ifp->if_snd, m_head);
            return 1;
        }

        txd = hn_txdesc_get(txr);
        if (txd == NULL) {
            txr->hn_no_txdescs++;
            IF_PREPEND(&ifp->if_snd, m_head);
            atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
            break;
        }

        error = hn_encap(txr, txd, &m_head);
        if (error) {
            /* Both txd and m_head are freed */
            continue;
        }
again:
        /*
         * Make sure that txd is not freed before ETHER_BPF_MTAP.
         */
        hn_txdesc_hold(txd);
        error = hv_nv_on_send(device_ctx, &txd->netvsc_pkt);
        if (!error) {
            ETHER_BPF_MTAP(ifp, m_head);
            if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
        }
        hn_txdesc_put(txr, txd);

        if (__predict_false(error)) {
            int freed;

            /*
             * This should "really rarely" happen.
             *
             * XXX Too many RX to be acked or too many sideband
             * commands to run?  Ask netvsc_channel_rollup()
             * to kick start later.
             */
            txr->hn_has_txeof = 1;
            if (!send_failed) {
                txr->hn_send_failed++;
                send_failed = 1;
                /*
                 * Try sending again after setting
                 * hn_has_txeof, in case that we missed
                 * the last netvsc_channel_rollup().
                 */
                goto again;
            }
            if_printf(ifp, "send failed\n");

            /*
             * This mbuf will be prepended; don't free it
             * in hn_txdesc_put(); only unload it from the
             * DMA map in hn_txdesc_put(), if it was loaded.
             */
            txd->m = NULL;
            freed = hn_txdesc_put(txr, txd);
            KASSERT(freed != 0,
                ("fail to free txd upon send error"));

            txr->hn_send_failed++;
            IF_PREPEND(&ifp->if_snd, m_head);
            atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
            break;
        }
    }

    return 0;
}
/*
 * Link up/down notification
 */
void
netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status)
{
    hn_softc_t *sc = device_get_softc(device_obj->device);

    if (sc == NULL) {
        return;
    }

    if (status == 1) {
        sc->hn_carrier = 1;
    } else {
        sc->hn_carrier = 0;
    }
}
/*
 * Append the specified data to the indicated mbuf chain.
 * Extend the mbuf chain if the new data does not fit in
 * existing space.
 *
 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
 * There should be an equivalent in the kernel mbuf code,
 * but there does not appear to be one yet.
 *
 * Differs from m_append() in that additional mbufs are
 * allocated with cluster size MJUMPAGESIZE, and filled
 * accordingly.
 *
 * Return 1 if able to complete the job; otherwise 0.
 */
static int
hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
{
    struct mbuf *m, *n;
    int remainder, space;

    for (m = m0; m->m_next != NULL; m = m->m_next)
        ;
    remainder = len;
    space = M_TRAILINGSPACE(m);
    if (space > 0) {
        /*
         * Copy into available space.
         */
        if (space > remainder)
            space = remainder;
        bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
        m->m_len += space;
        cp += space;
        remainder -= space;
    }
    while (remainder > 0) {
        /*
         * Allocate a new mbuf; could check space
         * and allocate a cluster instead.
         */
        n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
        if (n == NULL)
            break;
        n->m_len = min(MJUMPAGESIZE, remainder);
        bcopy(cp, mtod(n, caddr_t), n->m_len);
        cp += n->m_len;
        remainder -= n->m_len;
        m->m_next = n;
        m = n;
    }
    if (m0->m_flags & M_PKTHDR)
        m0->m_pkthdr.len += len - remainder;

    return (remainder == 0);
}
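
/*
 * Usage sketch (illustrative): netvsc_recv() below copies a large
 * received packet into a fresh cluster mbuf with
 *
 *	m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
 *	hv_m_append(m_new, packet->tot_data_buf_len, packet->data);
 *
 * A return value of 0 would mean an mbuf allocation failed mid-copy
 * and the chain holds only part of the data.
 */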
/*
 * Called when we receive a data packet from the "wire" on the
 * specified device.
 *
 * Note:  This is no longer used as a callback.
 */
int
netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet,
    rndis_tcp_ip_csum_info *csum_info)
{
    struct hn_softc *sc = device_get_softc(device_ctx->device);
    struct hn_rx_ring *rxr = &sc->hn_rx_ring[0]; /* TODO: vRSS */
    struct mbuf *m_new;
    struct ifnet *ifp;
    int size, do_lro = 0, do_csum = 1;

    if (sc == NULL)
        return (0); /* TODO: KYS how can this be! */

    ifp = sc->hn_ifp;

    if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
        return (0);
    }

    /*
     * Bail out if packet contains more data than configured MTU.
     */
    if (packet->tot_data_buf_len > (ifp->if_mtu + ETHER_HDR_LEN)) {
        return (0);
    } else if (packet->tot_data_buf_len <= MHLEN) {
        m_new = m_gethdr(M_NOWAIT, MT_DATA);
        if (m_new == NULL)
            return (0);
        memcpy(mtod(m_new, void *), packet->data,
            packet->tot_data_buf_len);
        m_new->m_pkthdr.len = m_new->m_len = packet->tot_data_buf_len;
        rxr->hn_small_pkts++;
    } else {
        /*
         * Get an mbuf with a cluster.  For packets 2K or less,
         * get a standard 2K cluster.  For anything larger, get a
         * 4K cluster.  Any buffers larger than 4K can cause problems
         * if looped around to the Hyper-V TX channel, so avoid them.
         */
        size = MCLBYTES;
        if (packet->tot_data_buf_len > MCLBYTES) {
            /* 4096 */
            size = MJUMPAGESIZE;
        }

        m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
        if (m_new == NULL) {
            if_printf(ifp, "alloc mbuf failed.\n");
            return (0);
        }

        hv_m_append(m_new, packet->tot_data_buf_len, packet->data);
    }
    m_new->m_pkthdr.rcvif = ifp;

    if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
        do_csum = 0;

    /* receive side checksum offload */
    if (csum_info != NULL) {
        /* IP csum offload */
        if (csum_info->receive.ip_csum_succeeded && do_csum) {
            m_new->m_pkthdr.csum_flags |=
                (CSUM_IP_CHECKED | CSUM_IP_VALID);
            rxr->hn_csum_ip++;
        }

        /* TCP/UDP csum offload */
        if ((csum_info->receive.tcp_csum_succeeded ||
             csum_info->receive.udp_csum_succeeded) && do_csum) {
            m_new->m_pkthdr.csum_flags |=
                (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
            m_new->m_pkthdr.csum_data = 0xffff;
            if (csum_info->receive.tcp_csum_succeeded)
                rxr->hn_csum_tcp++;
            else
                rxr->hn_csum_udp++;
        }

        if (csum_info->receive.ip_csum_succeeded &&
            csum_info->receive.tcp_csum_succeeded)
            do_lro = 1;
    } else {
        const struct ether_header *eh;
        uint16_t etype;
        int hoff;

        hoff = sizeof(*eh);
        if (m_new->m_len < hoff)
            goto skip;
        eh = mtod(m_new, struct ether_header *);
        etype = ntohs(eh->ether_type);
        if (etype == ETHERTYPE_VLAN) {
            const struct ether_vlan_header *evl;

            hoff = sizeof(*evl);
            if (m_new->m_len < hoff)
                goto skip;
            evl = mtod(m_new, struct ether_vlan_header *);
            etype = ntohs(evl->evl_proto);
        }

        if (etype == ETHERTYPE_IP) {
            int pr;

            pr = hn_check_iplen(m_new, hoff);
            if (pr == IPPROTO_TCP) {
                if (do_csum &&
                    (rxr->hn_trust_hcsum &
                     HN_TRUST_HCSUM_TCP)) {
                    rxr->hn_csum_trusted++;
                    m_new->m_pkthdr.csum_flags |=
                       (CSUM_IP_CHECKED | CSUM_IP_VALID |
                        CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
                    m_new->m_pkthdr.csum_data = 0xffff;
                }
                /* Rely on SW csum verification though... */
                do_lro = 1;
            } else if (pr == IPPROTO_UDP) {
                if (do_csum &&
                    (rxr->hn_trust_hcsum &
                     HN_TRUST_HCSUM_UDP)) {
                    rxr->hn_csum_trusted++;
                    m_new->m_pkthdr.csum_flags |=
                       (CSUM_IP_CHECKED | CSUM_IP_VALID |
                        CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
                    m_new->m_pkthdr.csum_data = 0xffff;
                }
            } else if (pr != IPPROTO_DONE && do_csum &&
                (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
                rxr->hn_csum_trusted++;
                m_new->m_pkthdr.csum_flags |=
                    (CSUM_IP_CHECKED | CSUM_IP_VALID);
            }
        }
    }
skip:
    if ((packet->vlan_tci != 0) &&
        (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0) {
        m_new->m_pkthdr.ether_vtag = packet->vlan_tci;
        m_new->m_flags |= M_VLANTAG;
    }

    /*
     * Note:  Moved RX completion back to hv_nv_on_receive() so all
     * messages (not just data messages) will trigger a response.
     */

    if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);

    if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
#if defined(INET) || defined(INET6)
        struct lro_ctrl *lro = &rxr->hn_lro;

        if (lro->lro_cnt) {
            rxr->hn_lro_tried++;
            if (tcp_lro_rx(lro, m_new, 0) == 0) {
                /* DONE! */
                return 0;
            }
        }
#endif
    }

    /* We're not holding the lock here, so don't release it */
    (*ifp->if_input)(ifp, m_new);

    return (0);
}
void
netvsc_recv_rollup(struct hv_device *device_ctx __unused)
{
}
/*
 * Rules for using sc->temp_unusable:
 * 1.  sc->temp_unusable can only be read or written while holding NV_LOCK()
 * 2.  code reading sc->temp_unusable under NV_LOCK(), and finding
 *     sc->temp_unusable set, must release NV_LOCK() and exit
 * 3.  to retain exclusive control of the interface,
 *     sc->temp_unusable must be set by code before releasing NV_LOCK()
 * 4.  only code setting sc->temp_unusable can clear sc->temp_unusable
 * 5.  code setting sc->temp_unusable must eventually clear sc->temp_unusable
 */
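
/*
 * A minimal sketch of rules 1-5 in code form (illustrative only; see
 * hn_ifinit() and the SIOCSIFMTU path of hn_ioctl() for real uses):
 *
 *	NV_LOCK(sc);
 *	if (sc->temp_unusable) {	(rule 2: back off)
 *		NV_UNLOCK(sc);
 *		return;
 *	}
 *	sc->temp_unusable = TRUE;	(rule 3)
 *	NV_UNLOCK(sc);
 *	... exclusive reconfiguration work, mutex dropped ...
 *	NV_LOCK(sc);
 *	sc->temp_unusable = FALSE;	(rules 4 and 5)
 *	NV_UNLOCK(sc);
 */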
/*
 * Standard ioctl entry point.  Called when the user wants to configure
 * the interface.
 */
static int
hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
    hn_softc_t *sc = ifp->if_softc;
    struct ifreq *ifr = (struct ifreq *)data;
#ifdef INET
    struct ifaddr *ifa = (struct ifaddr *)data;
#endif
    netvsc_device_info device_info;
    struct hv_device *hn_dev;
    int mask, error = 0;
    int retry_cnt = 500;

    switch (cmd) {
    case SIOCSIFADDR:
#ifdef INET
        if (ifa->ifa_addr->sa_family == AF_INET) {
            ifp->if_flags |= IFF_UP;
            if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
                hn_ifinit(sc);
            arp_ifinit(ifp, ifa);
        } else
#endif
        error = ether_ioctl(ifp, cmd, data);
        break;
    case SIOCSIFMTU:
        hn_dev = vmbus_get_devctx(sc->hn_dev);

        /* Check MTU value change */
        if (ifp->if_mtu == ifr->ifr_mtu)
            break;

        if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) {
            error = EINVAL;
            break;
        }

        /* Obtain and record requested MTU */
        ifp->if_mtu = ifr->ifr_mtu;

#if defined(INET) || defined(INET6)
        /*
         * Make sure that LRO aggregation length limit is still
         * valid, after the MTU change.
         */
        if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
            HN_LRO_LENLIM_MIN(ifp)) {
            int i;

            for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
                sc->hn_rx_ring[i].hn_lro.lro_length_lim =
                    HN_LRO_LENLIM_MIN(ifp);
            }
        }
#endif

        do {
            NV_LOCK(sc);
            if (!sc->temp_unusable) {
                sc->temp_unusable = TRUE;
                NV_UNLOCK(sc);
                break;
            }
            NV_UNLOCK(sc);
            if (retry_cnt > 0) {
                retry_cnt--;
                DELAY(5 * 1000);
            }
        } while (retry_cnt > 0);

        if (retry_cnt == 0) {
            error = EBUSY;
            break;
        }

        /*
         * We must remove and add back the device to cause the new
         * MTU to take effect.  This includes tearing down, but not
         * deleting the channel, then bringing it back up.
         */
        error = hv_rf_on_device_remove(hn_dev, HV_RF_NV_RETAIN_CHANNEL);
        if (error) {
            NV_LOCK(sc);
            sc->temp_unusable = FALSE;
            NV_UNLOCK(sc);
            break;
        }
        error = hv_rf_on_device_add(hn_dev, &device_info);
        if (error) {
            NV_LOCK(sc);
            sc->temp_unusable = FALSE;
            NV_UNLOCK(sc);
            break;
        }

        sc->hn_tx_chimney_max = sc->net_dev->send_section_size;
        if (sc->hn_tx_ring[0].hn_tx_chimney_size >
            sc->hn_tx_chimney_max)
            hn_set_tx_chimney_size(sc, sc->hn_tx_chimney_max);

        hn_ifinit_locked(sc);

        NV_LOCK(sc);
        sc->temp_unusable = FALSE;
        NV_UNLOCK(sc);
        break;
    case SIOCSIFFLAGS:
        do {
            NV_LOCK(sc);
            if (!sc->temp_unusable) {
                sc->temp_unusable = TRUE;
                NV_UNLOCK(sc);
                break;
            }
            NV_UNLOCK(sc);
            if (retry_cnt > 0) {
                retry_cnt--;
                DELAY(5 * 1000);
            }
        } while (retry_cnt > 0);

        if (retry_cnt == 0) {
            error = EBUSY;
            break;
        }

        if (ifp->if_flags & IFF_UP) {
            /*
             * If only the state of the PROMISC flag changed,
             * then just use the 'set promisc mode' command
             * instead of reinitializing the entire NIC.  Doing
             * a full re-init means reloading the firmware and
             * waiting for it to start up, which may take a
             * second or two.
             */
            /* Fixme:  Promiscuous mode? */
            if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
                ifp->if_flags & IFF_PROMISC &&
                !(sc->hn_if_flags & IFF_PROMISC)) {
                /* do something here for Hyper-V */
            } else if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
                !(ifp->if_flags & IFF_PROMISC) &&
                sc->hn_if_flags & IFF_PROMISC) {
                /* do something here for Hyper-V */
            } else
                hn_ifinit_locked(sc);
        } else {
            if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
                hn_stop(sc);
            }
        }
        NV_LOCK(sc);
        sc->temp_unusable = FALSE;
        NV_UNLOCK(sc);
        sc->hn_if_flags = ifp->if_flags;
        error = 0;
        break;
    case SIOCSIFCAP:
        mask = ifr->ifr_reqcap ^ ifp->if_capenable;
        if (mask & IFCAP_TXCSUM) {
            ifp->if_capenable ^= IFCAP_TXCSUM;
            if (ifp->if_capenable & IFCAP_TXCSUM) {
                ifp->if_hwassist |=
                    sc->hn_tx_ring[0].hn_csum_assist;
            } else {
                ifp->if_hwassist &=
                    ~sc->hn_tx_ring[0].hn_csum_assist;
            }
        }

        if (mask & IFCAP_RXCSUM)
            ifp->if_capenable ^= IFCAP_RXCSUM;

        if (mask & IFCAP_LRO)
            ifp->if_capenable ^= IFCAP_LRO;

        if (mask & IFCAP_TSO4) {
            ifp->if_capenable ^= IFCAP_TSO4;
            if (ifp->if_capenable & IFCAP_TSO4)
                ifp->if_hwassist |= CSUM_IP_TSO;
            else
                ifp->if_hwassist &= ~CSUM_IP_TSO;
        }

        if (mask & IFCAP_TSO6) {
            ifp->if_capenable ^= IFCAP_TSO6;
            if (ifp->if_capenable & IFCAP_TSO6)
                ifp->if_hwassist |= CSUM_IP6_TSO;
            else
                ifp->if_hwassist &= ~CSUM_IP6_TSO;
        }

        error = 0;
        break;
    case SIOCADDMULTI:
    case SIOCDELMULTI:
        /* Fixme:  Multicast mode? */
        if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
            NV_LOCK(sc);
            netvsc_setmulti(sc);
            NV_UNLOCK(sc);
            error = 0;
        }
        break;
    case SIOCSIFMEDIA:
    case SIOCGIFMEDIA:
        error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
        break;
    default:
        error = ether_ioctl(ifp, cmd, data);
        break;
    }

    return (error);
}
static void
hn_stop(hn_softc_t *sc)
{
    struct ifnet *ifp;
    int ret;
    struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);

    ifp = sc->hn_ifp;

    if (bootverbose)
        printf(" Closing Device ...\n");

    atomic_clear_int(&ifp->if_drv_flags,
        (IFF_DRV_RUNNING | IFF_DRV_OACTIVE));
    if_link_state_change(ifp, LINK_STATE_DOWN);
    sc->hn_initdone = 0;

    ret = hv_rf_on_close(device_ctx);
}
/*
 * FreeBSD transmit entry point
 */
static void
hn_start(struct ifnet *ifp)
{
    struct hn_softc *sc = ifp->if_softc;
    struct hn_tx_ring *txr = &sc->hn_tx_ring[0];

    if (txr->hn_sched_tx)
        goto do_sched;

    if (mtx_trylock(&txr->hn_tx_lock)) {
        int sched;

        sched = hn_start_locked(txr, txr->hn_direct_tx_size);
        mtx_unlock(&txr->hn_tx_lock);
        if (!sched)
            return;
    }
do_sched:
    taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
}
static void
hn_start_txeof(struct hn_tx_ring *txr)
{
    struct hn_softc *sc = txr->hn_sc;
    struct ifnet *ifp = sc->hn_ifp;

    KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));

    if (txr->hn_sched_tx)
        goto do_sched;

    if (mtx_trylock(&txr->hn_tx_lock)) {
        int sched;

        atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
        sched = hn_start_locked(txr, txr->hn_direct_tx_size);
        mtx_unlock(&txr->hn_tx_lock);
        if (sched) {
            taskqueue_enqueue(txr->hn_tx_taskq,
                &txr->hn_tx_task);
        }
    } else {
do_sched:
        /*
         * Release OACTIVE earlier, in the hope that others
         * can catch up.  The task will clear the flag again
         * with the hn_tx_lock to avoid possible races.
         */
        atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
        taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
    }
}
static void
hn_ifinit_locked(hn_softc_t *sc)
{
    struct ifnet *ifp;
    struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
    int ret;

    ifp = sc->hn_ifp;

    if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
        return;
    }

    hv_promisc_mode = 1;

    ret = hv_rf_on_open(device_ctx);
    if (ret != 0) {
        return;
    } else {
        sc->hn_initdone = 1;
    }
    atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
    atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
    if_link_state_change(ifp, LINK_STATE_UP);
}
static void
hn_ifinit(void *xsc)
{
    hn_softc_t *sc = xsc;

    NV_LOCK(sc);
    if (sc->temp_unusable) {
        NV_UNLOCK(sc);
        return;
    }
    sc->temp_unusable = TRUE;
    NV_UNLOCK(sc);

    hn_ifinit_locked(sc);

    NV_LOCK(sc);
    sc->temp_unusable = FALSE;
    NV_UNLOCK(sc);
}
#ifdef LATER
static void
hn_watchdog(struct ifnet *ifp)
{
    hn_softc_t *sc;

    sc = ifp->if_softc;
    printf("hn%d: watchdog timeout -- resetting\n", sc->hn_unit);
    hn_ifinit(sc);    /*???*/
    if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
}
#endif
static int
hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    unsigned int lenlim;
    int error, i;

    lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
    error = sysctl_handle_int(oidp, &lenlim, 0, req);
    if (error || req->newptr == NULL)
        return error;

    if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
        lenlim > TCP_LRO_LENGTH_MAX)
        return EINVAL;

    for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
        sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
    return 0;
}
static int
hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ackcnt, error, i;

    /*
     * lro_ackcnt_lim is the append count limit;
     * +1 to turn it into an aggregation limit.
     */
    ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
    error = sysctl_handle_int(oidp, &ackcnt, 0, req);
    if (error || req->newptr == NULL)
        return error;

    if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
        return EINVAL;

    /*
     * Convert the aggregation limit back to an append
     * count limit.
     */
    --ackcnt;
    for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
        sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
    return 0;
}
static int
hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int hcsum = arg2;
    int on, error, i;

    on = 0;
    if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
        on = 1;

    error = sysctl_handle_int(oidp, &on, 0, req);
    if (error || req->newptr == NULL)
        return error;

    NV_LOCK(sc);
    for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
        struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];

        if (on)
            rxr->hn_trust_hcsum |= hcsum;
        else
            rxr->hn_trust_hcsum &= ~hcsum;
    }
    NV_UNLOCK(sc);
    return error;
}
static int
hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int chimney_size, error;

    chimney_size = sc->hn_tx_ring[0].hn_tx_chimney_size;
    error = sysctl_handle_int(oidp, &chimney_size, 0, req);
    if (error || req->newptr == NULL)
        return error;

    if (chimney_size > sc->hn_tx_chimney_max || chimney_size <= 0)
        return EINVAL;

    hn_set_tx_chimney_size(sc, chimney_size);
    return 0;
}
static int
hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ofs = arg2, i, error;
    struct hn_rx_ring *rxr;
    u_long stat;

    stat = 0;
    for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
        rxr = &sc->hn_rx_ring[i];
        stat += *((u_long *)((uint8_t *)rxr + ofs));
    }

    error = sysctl_handle_long(oidp, &stat, 0, req);
    if (error || req->newptr == NULL)
        return error;

    /* Zero out this stat. */
    for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
        rxr = &sc->hn_rx_ring[i];
        *((u_long *)((uint8_t *)rxr + ofs)) = 0;
    }
    return 0;
}
static int
hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ofs = arg2, i, error;
    struct hn_rx_ring *rxr;
    uint64_t stat;

    stat = 0;
    for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
        rxr = &sc->hn_rx_ring[i];
        stat += *((uint64_t *)((uint8_t *)rxr + ofs));
    }

    error = sysctl_handle_64(oidp, &stat, 0, req);
    if (error || req->newptr == NULL)
        return error;

    /* Zero out this stat. */
    for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
        rxr = &sc->hn_rx_ring[i];
        *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
    }
    return 0;
}
static int
hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ofs = arg2, i, error;
    struct hn_tx_ring *txr;
    u_long stat;

    stat = 0;
    for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
        txr = &sc->hn_tx_ring[i];
        stat += *((u_long *)((uint8_t *)txr + ofs));
    }

    error = sysctl_handle_long(oidp, &stat, 0, req);
    if (error || req->newptr == NULL)
        return error;

    /* Zero out this stat. */
    for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
        txr = &sc->hn_tx_ring[i];
        *((u_long *)((uint8_t *)txr + ofs)) = 0;
    }
    return 0;
}
static int
hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ofs = arg2, i, error, conf;
    struct hn_tx_ring *txr;

    txr = &sc->hn_tx_ring[0];
    conf = *((int *)((uint8_t *)txr + ofs));

    error = sysctl_handle_int(oidp, &conf, 0, req);
    if (error || req->newptr == NULL)
        return error;

    NV_LOCK(sc);
    for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
        txr = &sc->hn_tx_ring[i];
        *((int *)((uint8_t *)txr + ofs)) = conf;
    }
    NV_UNLOCK(sc);

    return 0;
}
static int
hn_check_iplen(const struct mbuf *m, int hoff)
{
    const struct ip *ip;
    int len, iphlen, iplen;
    const struct tcphdr *th;
    int thoff;				/* TCP data offset */

    len = hoff + sizeof(struct ip);

    /* The packet must be at least the size of an IP header. */
    if (m->m_pkthdr.len < len)
        return IPPROTO_DONE;

    /* The fixed IP header must reside completely in the first mbuf. */
    if (m->m_len < len)
        return IPPROTO_DONE;

    ip = mtodo(m, hoff);

    /* Bound check the packet's stated IP header length. */
    iphlen = ip->ip_hl << 2;
    if (iphlen < sizeof(struct ip))	/* minimum header length */
        return IPPROTO_DONE;

    /* The full IP header must reside completely in the one mbuf. */
    if (m->m_len < hoff + iphlen)
        return IPPROTO_DONE;

    iplen = ntohs(ip->ip_len);

    /*
     * Check that the amount of data in the buffers is at least as much
     * as the IP header would have us expect.
     */
    if (m->m_pkthdr.len < hoff + iplen)
        return IPPROTO_DONE;

    /*
     * Ignore IP fragments.
     */
    if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
        return IPPROTO_DONE;

    /*
     * The TCP/IP or UDP/IP header must be entirely contained within
     * the first fragment of a packet.
     */
    switch (ip->ip_p) {
    case IPPROTO_TCP:
        if (iplen < iphlen + sizeof(struct tcphdr))
            return IPPROTO_DONE;
        if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
            return IPPROTO_DONE;
        th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
        thoff = th->th_off << 2;
        if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
            return IPPROTO_DONE;
        if (m->m_len < hoff + iphlen + thoff)
            return IPPROTO_DONE;
        break;
    case IPPROTO_UDP:
        if (iplen < iphlen + sizeof(struct udphdr))
            return IPPROTO_DONE;
        if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
            return IPPROTO_DONE;
        break;
    default:
        if (iplen < iphlen)
            return IPPROTO_DONE;
        break;
    }

    return ip->ip_p;
}
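
/*
 * Worked example (illustrative): for an untagged frame, hoff is
 * sizeof(struct ether_header) == 14; a minimal IP header gives
 * iphlen = 5 << 2 = 20, so trusting the host's TCP checksum result
 * requires at least hoff + iphlen + sizeof(struct tcphdr) =
 * 14 + 20 + 20 = 54 bytes in the first mbuf.
 */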
static void
hn_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
    bus_addr_t *paddr = arg;

    if (error)
        return;

    KASSERT(nseg == 1, ("too many segments %d!", nseg));
    *paddr = segs->ds_addr;
}
static void
hn_create_rx_data(struct hn_softc *sc)
{
    struct sysctl_oid_list *child;
    struct sysctl_ctx_list *ctx;
    device_t dev = sc->hn_dev;
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
    int lroent_cnt;
#endif
#endif
    int i;

    sc->hn_rx_ring_cnt = 1; /* TODO: vRSS */
    sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
        M_NETVSC, M_WAITOK | M_ZERO);

#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
    lroent_cnt = hn_lro_entry_count;
    if (lroent_cnt < TCP_LRO_ENTRIES)
        lroent_cnt = TCP_LRO_ENTRIES;
    device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
#endif
#endif	/* INET || INET6 */

    for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
        struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];

        if (hn_trust_hosttcp)
            rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
        if (hn_trust_hostudp)
            rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
        if (hn_trust_hostip)
            rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;

#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
        tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 0);
#else
        tcp_lro_init(&rxr->hn_lro);
        rxr->hn_lro.ifp = sc->hn_ifp;
#endif
        rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
        rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
#endif	/* INET || INET6 */
    }

    ctx = device_get_sysctl_ctx(dev);
    child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));

    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
        CTLTYPE_U64 | CTLFLAG_RW, sc,
        __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
        hn_rx_stat_u64_sysctl, "LU", "LRO queued");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
        CTLTYPE_U64 | CTLFLAG_RW, sc,
        __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
        hn_rx_stat_u64_sysctl, "LU", "LRO flushed");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
        CTLTYPE_ULONG | CTLFLAG_RW, sc,
        __offsetof(struct hn_rx_ring, hn_lro_tried),
        hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
        CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_lro_lenlim_sysctl, "IU",
        "Max # of data bytes to be aggregated by LRO");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
        CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_ackcnt_sysctl, "I",
        "Max # of ACKs to be aggregated by LRO");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
        CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_TCP,
        hn_trust_hcsum_sysctl, "I",
        "Trust TCP segment verification on host side, "
        "when csum info is missing");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
        CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_UDP,
        hn_trust_hcsum_sysctl, "I",
        "Trust UDP datagram verification on host side, "
        "when csum info is missing");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
        CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_IP,
        hn_trust_hcsum_sysctl, "I",
        "Trust IP packet verification on host side, "
        "when csum info is missing");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
        CTLTYPE_ULONG | CTLFLAG_RW, sc,
        __offsetof(struct hn_rx_ring, hn_csum_ip),
        hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
        CTLTYPE_ULONG | CTLFLAG_RW, sc,
        __offsetof(struct hn_rx_ring, hn_csum_tcp),
        hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
        CTLTYPE_ULONG | CTLFLAG_RW, sc,
        __offsetof(struct hn_rx_ring, hn_csum_udp),
        hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
        CTLTYPE_ULONG | CTLFLAG_RW, sc,
        __offsetof(struct hn_rx_ring, hn_csum_trusted),
        hn_rx_stat_ulong_sysctl, "LU",
        "# of packets that we trust host's csum verification");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
        CTLTYPE_ULONG | CTLFLAG_RW, sc,
        __offsetof(struct hn_rx_ring, hn_small_pkts),
        hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
}
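
/*
 * Example (illustrative): the nodes created above appear under the
 * per-device tree, dev.hn.UNIT, and the CTLFLAG_RW ones can be read
 * or changed at runtime, e.g.:
 *
 *	sysctl dev.hn.0.lro_length_lim
 *	sysctl dev.hn.0.trust_hosttcp=1
 *
 * The TX counterparts (dev.hn.0.tx, dev.hn.0.no_txdescs, etc.) are
 * registered by hn_create_tx_data()/hn_create_tx_ring() below.
 */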
static void
hn_destroy_rx_data(struct hn_softc *sc)
{
#if defined(INET) || defined(INET6)
    int i;
#endif

    if (sc->hn_rx_ring_cnt == 0)
        return;

#if defined(INET) || defined(INET6)
    for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
        tcp_lro_free(&sc->hn_rx_ring[i].hn_lro);
#endif
    free(sc->hn_rx_ring, M_NETVSC);
    sc->hn_rx_ring = NULL;

    sc->hn_rx_ring_cnt = 0;
}
static int
hn_create_tx_ring(struct hn_softc *sc, int id)
{
    struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
    bus_dma_tag_t parent_dtag;
    int error, i;

    txr->hn_sc = sc;

#ifndef HN_USE_TXDESC_BUFRING
    mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
#endif
    mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);

    txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
    txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
        M_NETVSC, M_WAITOK | M_ZERO);
#ifndef HN_USE_TXDESC_BUFRING
    SLIST_INIT(&txr->hn_txlist);
#else
    txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_NETVSC,
        M_WAITOK, &txr->hn_tx_lock);
#endif

    txr->hn_tx_taskq = sc->hn_tx_taskq;
    TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
    TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);

    txr->hn_direct_tx_size = hn_direct_tx_size;
    if (hv_vmbus_protocal_version >= HV_VMBUS_VERSION_WIN8_1)
        txr->hn_csum_assist = HN_CSUM_ASSIST;
    else
        txr->hn_csum_assist = HN_CSUM_ASSIST_WIN8;

    /*
     * Always schedule transmission instead of trying to do direct
     * transmission.  This one gives the best performance so far.
     */
    txr->hn_sched_tx = 1;

    txr->hn_txeof = hn_start_txeof; /* TODO: if_transmit */

    parent_dtag = bus_get_dma_tag(sc->hn_dev);

    /* DMA tag for RNDIS messages. */
    error = bus_dma_tag_create(parent_dtag, /* parent */
        HN_RNDIS_MSG_ALIGN,	/* alignment */
        HN_RNDIS_MSG_BOUNDARY,	/* boundary */
        BUS_SPACE_MAXADDR,	/* lowaddr */
        BUS_SPACE_MAXADDR,	/* highaddr */
        NULL, NULL,		/* filter, filterarg */
        HN_RNDIS_MSG_LEN,	/* maxsize */
        1,			/* nsegments */
        HN_RNDIS_MSG_LEN,	/* maxsegsize */
        0,			/* flags */
        NULL,			/* lockfunc */
        NULL,			/* lockfuncarg */
        &txr->hn_tx_rndis_dtag);
    if (error) {
        device_printf(sc->hn_dev, "failed to create rndis dmatag\n");
        return error;
    }

    /* DMA tag for data. */
    error = bus_dma_tag_create(parent_dtag, /* parent */
        1,			/* alignment */
        HN_TX_DATA_BOUNDARY,	/* boundary */
        BUS_SPACE_MAXADDR,	/* lowaddr */
        BUS_SPACE_MAXADDR,	/* highaddr */
        NULL, NULL,		/* filter, filterarg */
        HN_TX_DATA_MAXSIZE,	/* maxsize */
        HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
        HN_TX_DATA_SEGSIZE,	/* maxsegsize */
        0,			/* flags */
        NULL,			/* lockfunc */
        NULL,			/* lockfuncarg */
        &txr->hn_tx_data_dtag);
    if (error) {
        device_printf(sc->hn_dev, "failed to create data dmatag\n");
        return error;
    }

    for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
        struct hn_txdesc *txd = &txr->hn_txdesc[i];

        txd->txr = txr;

        /*
         * Allocate and load RNDIS messages.
         */
        error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
            (void **)&txd->rndis_msg,
            BUS_DMA_WAITOK | BUS_DMA_COHERENT,
            &txd->rndis_msg_dmap);
        if (error) {
            device_printf(sc->hn_dev,
                "failed to allocate rndis_msg, %d\n", i);
            return error;
        }

        error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
            txd->rndis_msg_dmap,
            txd->rndis_msg, HN_RNDIS_MSG_LEN,
            hn_dma_map_paddr, &txd->rndis_msg_paddr,
            BUS_DMA_NOWAIT);
        if (error) {
            device_printf(sc->hn_dev,
                "failed to load rndis_msg, %d\n", i);
            bus_dmamem_free(txr->hn_tx_rndis_dtag,
                txd->rndis_msg, txd->rndis_msg_dmap);
            return error;
        }

        /* DMA map for TX data. */
        error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
            &txd->data_dmap);
        if (error) {
            device_printf(sc->hn_dev,
                "failed to allocate tx data dmamap\n");
            bus_dmamap_unload(txr->hn_tx_rndis_dtag,
                txd->rndis_msg_dmap);
            bus_dmamem_free(txr->hn_tx_rndis_dtag,
                txd->rndis_msg, txd->rndis_msg_dmap);
            return error;
        }

        /* All set, put it to list */
        txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
        SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
#else
        buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif
    }
    txr->hn_txdesc_avail = txr->hn_txdesc_cnt;

    if (sc->hn_tx_sysctl_tree != NULL) {
        struct sysctl_oid_list *child;
        struct sysctl_ctx_list *ctx;
        char name[16];

        /*
         * Create per TX ring sysctl tree:
         * dev.hn.UNIT.tx.RINGID
         */
        ctx = device_get_sysctl_ctx(sc->hn_dev);
        child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);

        snprintf(name, sizeof(name), "%d", id);
        txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
            name, CTLFLAG_RD, 0, "");

        if (txr->hn_tx_sysctl_tree != NULL) {
            child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);

            SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
                CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
                "# of available TX descs");
        }
    }

    return 0;
}
static void
hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
{
    struct hn_tx_ring *txr = txd->txr;

    KASSERT(txd->m == NULL, ("still has mbuf installed"));
    KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));

    bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_msg_dmap);
    bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_msg,
        txd->rndis_msg_dmap);
    bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
}
static void
hn_destroy_tx_ring(struct hn_tx_ring *txr)
{
    struct hn_txdesc *txd;

    if (txr->hn_txdesc == NULL)
        return;

#ifndef HN_USE_TXDESC_BUFRING
    while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) {
        SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
        hn_txdesc_dmamap_destroy(txd);
    }
#else
    while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL)
        hn_txdesc_dmamap_destroy(txd);
#endif

    if (txr->hn_tx_data_dtag != NULL)
        bus_dma_tag_destroy(txr->hn_tx_data_dtag);
    if (txr->hn_tx_rndis_dtag != NULL)
        bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);

#ifdef HN_USE_TXDESC_BUFRING
    buf_ring_free(txr->hn_txdesc_br, M_NETVSC);
#endif

    free(txr->hn_txdesc, M_NETVSC);
    txr->hn_txdesc = NULL;

#ifndef HN_USE_TXDESC_BUFRING
    mtx_destroy(&txr->hn_txlist_spin);
#endif
    mtx_destroy(&txr->hn_tx_lock);
}
static int
hn_create_tx_data(struct hn_softc *sc)
{
    struct sysctl_oid_list *child;
    struct sysctl_ctx_list *ctx;
    int i;

    sc->hn_tx_ring_cnt = 1; /* TODO: vRSS */
    sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
        M_NETVSC, M_WAITOK | M_ZERO);

    ctx = device_get_sysctl_ctx(sc->hn_dev);
    child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));

    /* Create dev.hn.UNIT.tx sysctl tree */
    sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
        CTLFLAG_RD, 0, "");

    for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
        int error;

        error = hn_create_tx_ring(sc, i);
        if (error)
            return error;
    }

    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
        CTLTYPE_ULONG | CTLFLAG_RW, sc,
        __offsetof(struct hn_tx_ring, hn_no_txdescs),
        hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
        CTLTYPE_ULONG | CTLFLAG_RW, sc,
        __offsetof(struct hn_tx_ring, hn_send_failed),
        hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
        CTLTYPE_ULONG | CTLFLAG_RW, sc,
        __offsetof(struct hn_tx_ring, hn_txdma_failed),
        hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
        CTLTYPE_ULONG | CTLFLAG_RW, sc,
        __offsetof(struct hn_tx_ring, hn_tx_collapsed),
        hn_tx_stat_ulong_sysctl, "LU", "# of TX mbufs collapsed");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
        CTLTYPE_ULONG | CTLFLAG_RW, sc,
        __offsetof(struct hn_tx_ring, hn_tx_chimney),
        hn_tx_stat_ulong_sysctl, "LU", "# of chimney sends");
    SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
        CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
        "# of total TX descs");
    SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
        CTLFLAG_RD, &sc->hn_tx_chimney_max, 0,
        "Chimney send packet size upper boundary");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
        CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_tx_chimney_size_sysctl,
        "I", "Chimney send packet size limit");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
        CTLTYPE_INT | CTLFLAG_RW, sc,
        __offsetof(struct hn_tx_ring, hn_direct_tx_size),
        hn_tx_conf_int_sysctl, "I",
        "Size of the packet for direct transmission");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
        CTLTYPE_INT | CTLFLAG_RW, sc,
        __offsetof(struct hn_tx_ring, hn_sched_tx),
        hn_tx_conf_int_sysctl, "I",
        "Always schedule transmission "
        "instead of doing direct transmission");

    return 0;
}
static void
hn_set_tx_chimney_size(struct hn_softc *sc, int chimney_size)
{
    int i;

    NV_LOCK(sc);
    for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
        sc->hn_tx_ring[i].hn_tx_chimney_size = chimney_size;
    NV_UNLOCK(sc);
}
static void
hn_destroy_tx_data(struct hn_softc *sc)
{
    int i;

    if (sc->hn_tx_ring_cnt == 0)
        return;

    for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
        hn_destroy_tx_ring(&sc->hn_tx_ring[i]);

    free(sc->hn_tx_ring, M_NETVSC);
    sc->hn_tx_ring = NULL;

    sc->hn_tx_ring_cnt = 0;
}
static void
hn_start_taskfunc(void *xtxr, int pending __unused)
{
    struct hn_tx_ring *txr = xtxr;

    mtx_lock(&txr->hn_tx_lock);
    hn_start_locked(txr, 0);
    mtx_unlock(&txr->hn_tx_lock);
}
static void
hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
{
    struct hn_tx_ring *txr = xtxr;

    mtx_lock(&txr->hn_tx_lock);
    atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
    hn_start_locked(txr, 0);
    mtx_unlock(&txr->hn_tx_lock);
}
static void
hn_stop_tx_tasks(struct hn_softc *sc)
{
    int i;

    for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
        struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

        taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
        taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
    }
}
static void
hn_tx_taskq_create(void *arg __unused)
{
    if (!hn_share_tx_taskq)
        return;

    hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
        taskqueue_thread_enqueue, &hn_tx_taskq);
    if (hn_bind_tx_taskq >= 0) {
        int cpu = hn_bind_tx_taskq;
        cpuset_t cpu_set;

        if (cpu > mp_ncpus - 1)
            cpu = mp_ncpus - 1;
        CPU_SETOF(cpu, &cpu_set);
        taskqueue_start_threads_cpuset(&hn_tx_taskq, 1, PI_NET,
            &cpu_set, "hn tx");
    } else {
        taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
    }
}
SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_FIRST,
    hn_tx_taskq_create, NULL);
static void
hn_tx_taskq_destroy(void *arg __unused)
{
    if (hn_tx_taskq != NULL)
        taskqueue_free(hn_tx_taskq);
}
SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_FIRST,
    hn_tx_taskq_destroy, NULL);
static device_method_t netvsc_methods[] = {
    /* Device interface */
    DEVMETHOD(device_probe,	netvsc_probe),
    DEVMETHOD(device_attach,	netvsc_attach),
    DEVMETHOD(device_detach,	netvsc_detach),
    DEVMETHOD(device_shutdown,	netvsc_shutdown),

    { 0, 0 }
};

static driver_t netvsc_driver = {
    NETVSC_DEVNAME,
    netvsc_methods,
    sizeof(hn_softc_t)
};

static devclass_t netvsc_devclass;

DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);