/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sockio.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <sys/buf_ring.h>

#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/rndis.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/if_ether.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/ip6.h>

#include <vm/vm_param.h>
#include <vm/vm_kern.h>

#include <machine/bus.h>
#include <machine/resource.h>
#include <machine/frame.h>
#include <machine/vmparam.h>

#include <sys/rman.h>
#include <sys/mutex.h>
#include <sys/errno.h>
#include <sys/types.h>
#include <machine/atomic.h>
#include <machine/intr_machdep.h>
#include <machine/in_cksum.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus_xact.h>
#include <dev/hyperv/netvsc/hv_net_vsc.h>
#include <dev/hyperv/netvsc/hv_rndis_filter.h>
#include <dev/hyperv/netvsc/ndis.h>

#include "vmbus_if.h"
/* Short for Hyper-V network interface */
#define NETVSC_DEVNAME			"hn"

/*
 * It looks like offset 0 of buf is reserved to hold the softc pointer.
 * The sc pointer is evidently not needed, and is not presently populated.
 * The packet offset is where the netvsc_packet starts in the buffer.
 */
#define HV_NV_SC_PTR_OFFSET_IN_BUF	0
#define HV_NV_PACKET_OFFSET_IN_BUF	16
/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_LROENT_CNT_DEF		128

#define HN_RING_CNT_DEF_MAX		8

#define HN_RNDIS_PKT_LEN					\
    (sizeof(struct rndis_packet_msg) +				\
     HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +		\
     HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
     HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
     HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
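/*
 * NOTE:
 * HN_RNDIS_PKT_LEN is a worst-case bound: it assumes the hash value,
 * VLAN, LSOv2 _and_ TX checksum per-packet-info are all attached to
 * one RNDIS packet message at once.  In practice the LSOv2 and TX
 * checksum info are mutually exclusive (see hn_encap() below), so the
 * composed message is usually smaller than this.
 */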
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(NETVSC_PACKET_MAXPAGE - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
    SLIST_ENTRY(hn_txdesc)	link;
#endif
    struct mbuf			*m;
    struct hn_tx_ring		*txr;
    int				refs;
    uint32_t			flags;		/* HN_TXD_FLAG_ */
    struct hn_send_ctx		send_ctx;

    bus_dmamap_t		data_dmap;

    bus_addr_t			rndis_pkt_paddr;
    struct rndis_packet_msg	*rndis_pkt;
    bus_dmamap_t		rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST	0x1
#define HN_TXD_FLAG_DMAMAP	0x2
#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1
#define HN_LOCK_INIT(sc)		\
    sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_ASSERT(sc)	sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK_DESTROY(sc)	sx_destroy(&(sc)->hn_lock)
#define HN_LOCK(sc)		sx_xlock(&(sc)->hn_lock)
#define HN_UNLOCK(sc)		sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK		(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK	(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
    ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
    ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust TCP segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust UDP datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust IP packet verification on host side, "
    "when csum info is missing (global setting)");
/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif
static int hn_share_tx_taskq = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN,
    &hn_share_tx_taskq, 0, "Enable shared TX taskqueue");

static struct taskqueue	*hn_tx_taskq;
#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
static int hn_bind_tx_taskq = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
    &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");

static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");

static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

static u_int hn_cpu_index;
/*
 * Forward declarations
 */
static void hn_stop(struct hn_softc *sc);
static void hn_init_locked(struct hn_softc *sc);
static void hn_init(void *xsc);
static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
static int hn_start_locked(struct hn_tx_ring *txr, int len);
static void hn_start(struct ifnet *ifp);
static void hn_start_txeof(struct hn_tx_ring *);
static int hn_ifmedia_upd(struct ifnet *ifp);
static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr);
#if __FreeBSD_version >= 1100099
static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_check_iplen(const struct mbuf *, int);
static int hn_create_tx_ring(struct hn_softc *, int);
static void hn_destroy_tx_ring(struct hn_tx_ring *);
static int hn_create_tx_data(struct hn_softc *, int);
static void hn_fixup_tx_data(struct hn_softc *);
static void hn_destroy_tx_data(struct hn_softc *);
static void hn_start_taskfunc(void *, int);
static void hn_start_txeof_taskfunc(void *, int);
static void hn_link_taskfunc(void *, int);
static void hn_suspend_mgmt_taskfunc(void *, int);
static int hn_encap(struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **);
static int hn_create_rx_data(struct hn_softc *sc, int);
static void hn_destroy_rx_data(struct hn_softc *sc);
static void hn_set_chim_size(struct hn_softc *, int);
static void hn_set_tso_maxsize(struct hn_softc *, int, int);
static int hn_chan_attach(struct hn_softc *, struct vmbus_channel *);
static void hn_chan_detach(struct hn_softc *, struct vmbus_channel *);
static int hn_attach_subchans(struct hn_softc *);
static void hn_detach_allchans(struct hn_softc *);
static void hn_chan_callback(struct vmbus_channel *chan, void *xrxr);
static void hn_set_ring_inuse(struct hn_softc *, int);
static int hn_synth_attach(struct hn_softc *, int);
static void hn_synth_detach(struct hn_softc *);
static bool hn_tx_ring_pending(struct hn_tx_ring *);
static void hn_suspend(struct hn_softc *);
static void hn_suspend_data(struct hn_softc *);
static void hn_suspend_mgmt(struct hn_softc *);
static void hn_resume(struct hn_softc *);
static void hn_resume_data(struct hn_softc *);
static void hn_resume_mgmt(struct hn_softc *);
static void hn_rx_drain(struct vmbus_channel *);
static void hn_tx_resume(struct hn_softc *, int);
static void hn_tx_ring_qflush(struct hn_tx_ring *);
static int netvsc_detach(device_t dev);

static void hn_nvs_handle_notify(struct hn_softc *sc,
    const struct vmbus_chanpkt_hdr *pkt);
static void hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt);
static void hn_nvs_handle_rxbuf(struct hn_softc *sc, struct hn_rx_ring *rxr,
    struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr);
static void hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid);

static int hn_transmit(struct ifnet *, struct mbuf *);
static void hn_xmit_qflush(struct ifnet *);
static int hn_xmit(struct hn_tx_ring *, int);
static void hn_xmit_txeof(struct hn_tx_ring *);
static void hn_xmit_taskfunc(void *, int);
static void hn_xmit_txeof_taskfunc(void *, int);
static const uint8_t hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
    0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
    0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
    0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
    0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
    0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
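/*
 * This appears to be the well-known 40-byte Toeplitz key that
 * Microsoft's RSS documentation uses as its example key; keeping it
 * as the default makes the hash results predictable and comparable
 * with other RSS-capable drivers.  It can be replaced at runtime
 * through the dev.hn.UNIT.rss_key sysctl created in netvsc_attach().
 */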
#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
    int i;

    for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
        sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

    KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
    if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
        return txr->hn_txdesc_cnt;
    return hn_tx_swq_depth;
}
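/*
 * I.e. the software TX queue is never configured shallower than the
 * TX descriptor count: with the default HN_TX_DESC_CNT (512) and
 * hw.hn.tx_swq_depth left at 0, the IFQ/buf_ring depth is also 512.
 */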
static int
hn_rss_reconfig(struct hn_softc *sc)
{
    int error;

    if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
        return (ENXIO);

    /*
     * Disable RSS first.
     *
     * NOTE:
     * Direct reconfiguration by setting the UNCHG flags does
     * _not_ work properly.
     */
    if (bootverbose)
        if_printf(sc->hn_ifp, "disable RSS\n");
    error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
    if (error) {
        if_printf(sc->hn_ifp, "RSS disable failed\n");
        return (error);
    }

    /*
     * Reenable the RSS w/ the updated RSS key or indirect
     * table.
     */
    if (bootverbose)
        if_printf(sc->hn_ifp, "reconfig RSS\n");
    error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
    if (error)
        if_printf(sc->hn_ifp, "RSS reconfig failed\n");
    return (error);
}
static void
hn_rss_ind_fixup(struct hn_softc *sc, int nchan)
{
    struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
    int i;

    KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

    /*
     * Check indirect table to make sure that all channels in it
     * can be used.
     */
    for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
        if (rss->rss_ind[i] >= nchan) {
            if_printf(sc->hn_ifp,
                "RSS indirect table %d fixup: %u -> %d\n",
                i, rss->rss_ind[i], nchan - 1);
            rss->rss_ind[i] = nchan - 1;
        }
    }
}
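/*
 * Example: if the indirect table was populated for 8 channels but
 * only 4 sub-channels could actually be allocated, an entry holding
 * 6 would reference a nonexistent channel; the loop above clamps
 * such entries to nchan - 1, i.e. 3.
 */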
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

    return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
    struct hn_softc *sc = ifp->if_softc;

    ifmr->ifm_status = IFM_AVALID;
    ifmr->ifm_active = IFM_ETHER;

    if (!sc->hn_carrier) {
        ifmr->ifm_active |= IFM_NONE;
        return;
    }
    ifmr->ifm_status |= IFM_ACTIVE;
    ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}
/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
    .hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
        0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};
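/*
 * The first three GUID fields are stored little-endian, which is why
 * the byte order above looks reversed w/ respect to the textual GUID
 * in the comment: 0x63,0x51,0x61,0xF8 encodes F8615163, and so on.
 */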
/*
 * Standard probe entry point.
 */
static int
netvsc_probe(device_t dev)
{

    if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
        &g_net_vsc_device_type) == 0) {
        device_set_desc(dev, "Hyper-V Network Interface");
        return BUS_PROBE_DEFAULT;
    }
    return ENXIO;
}
static void
hn_cpuset_setthread_task(void *xmask, int pending __unused)
{
    cpuset_t *mask = xmask;
    int error;

    error = cpuset_setthread(curthread->td_tid, mask);
    if (error) {
        panic("curthread=%ju: can't pin; error=%d",
            (uintmax_t)curthread->td_tid, error);
    }
}
/*
 * Standard attach entry point.
 *
 * Called when the driver is loaded.  It allocates needed resources,
 * and initializes the "hardware" and software.
 */
static int
netvsc_attach(device_t dev)
{
    struct hn_softc *sc = device_get_softc(dev);
    struct sysctl_oid_list *child;
    struct sysctl_ctx_list *ctx;
    uint8_t eaddr[ETHER_ADDR_LEN];
    struct ifnet *ifp = NULL;
    int error, ring_cnt, tx_ring_cnt;

    sc->hn_dev = dev;
    sc->hn_prichan = vmbus_get_channel(dev);
    HN_LOCK_INIT(sc);

    /*
     * Setup taskqueue for transmission.
     */
    if (hn_tx_taskq == NULL) {
        sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
            taskqueue_thread_enqueue, &sc->hn_tx_taskq);
        taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx",
            device_get_nameunit(dev));
        if (hn_bind_tx_taskq >= 0) {
            int cpu = hn_bind_tx_taskq;
            struct task cpuset_task;
            cpuset_t cpu_set;

            if (cpu > mp_ncpus - 1)
                cpu = mp_ncpus - 1;
            CPU_SETOF(cpu, &cpu_set);
            TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task,
                &cpu_set);
            taskqueue_enqueue(sc->hn_tx_taskq, &cpuset_task);
            taskqueue_drain(sc->hn_tx_taskq, &cpuset_task);
        }
    } else {
        sc->hn_tx_taskq = hn_tx_taskq;
    }
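    /*
     * The enqueue+drain pair above runs hn_cpuset_setthread_task()
     * synchronously on the freshly created taskqueue thread itself,
     * so the thread is pinned to the requested CPU before any real
     * TX task can be scheduled on it.
     */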
    /*
     * Setup taskqueue for management tasks, e.g. link status.
     */
    sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
        taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
    taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
        device_get_nameunit(dev));
    TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);

    /*
     * Allocate ifnet and setup its name earlier, so that if_printf
     * can be used by functions, which will be called after the
     * synthetic parts are attached.
     */
    ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
    ifp->if_softc = sc;
    if_initname(ifp, device_get_name(dev), device_get_unit(dev));

    /*
     * Initialize ifmedia earlier so that it can be unconditionally
     * destroyed, if error happened later on.
     */
    ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
    /*
     * Figure out the # of RX rings (ring_cnt) and the # of TX rings
     * to use (tx_ring_cnt).
     *
     * NOTE:
     * The # of RX rings to use is same as the # of channels to use.
     */
    ring_cnt = hn_chan_cnt;
    if (ring_cnt <= 0) {
        ring_cnt = mp_ncpus;
        if (ring_cnt > HN_RING_CNT_DEF_MAX)
            ring_cnt = HN_RING_CNT_DEF_MAX;
    } else if (ring_cnt > mp_ncpus) {
        ring_cnt = mp_ncpus;
    }

    tx_ring_cnt = hn_tx_ring_cnt;
    if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
        tx_ring_cnt = ring_cnt;
    if (hn_use_if_start) {
        /* ifnet.if_start only needs one TX ring. */
        tx_ring_cnt = 1;
    }

    /*
     * Set the leader CPU for channels.
     */
    sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
    /*
     * Create enough TX/RX rings, even if only limited number of
     * channels can be allocated.
     */
    error = hn_create_tx_data(sc, tx_ring_cnt);
    if (error)
        goto failed;
    error = hn_create_rx_data(sc, ring_cnt);
    if (error)
        goto failed;

    /*
     * Create transaction context for NVS and RNDIS transactions.
     */
    sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
        HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
    if (sc->hn_xact == NULL)
        goto failed;

    /*
     * Attach the synthetic parts, i.e. NVS and RNDIS.
     */
    error = hn_synth_attach(sc, ETHERMTU);
    if (error)
        goto failed;

    error = hn_rndis_get_eaddr(sc, eaddr);
    if (error)
        goto failed;
#if __FreeBSD_version >= 1100099
    if (sc->hn_rx_ring_inuse > 1) {
        /*
         * Reduce TCP segment aggregation limit for multiple
         * RX rings to increase ACK timeliness.
         */
        hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
    }
#endif

    /*
     * Fixup TX stuffs after synthetic parts are attached.
     */
    hn_fixup_tx_data(sc);
    ctx = device_get_sysctl_ctx(dev);
    child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
    SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
        &sc->hn_nvs_ver, 0, "NVS version");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
        CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
        hn_ndis_version_sysctl, "A", "NDIS version");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
        CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
        hn_caps_sysctl, "A", "capabilities");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
        CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
        hn_hwassist_sysctl, "A", "hwassist");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
        CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
        hn_rss_key_sysctl, "IU", "RSS key");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
        CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
        hn_rss_ind_sysctl, "IU", "RSS indirect table");
    /*
     * Setup the ifmedia, which has been initialized earlier.
     */
    ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
    ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
    /* XXX ifmedia_set really should do this for us */
    sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
    /*
     * Setup the ifnet for this interface.
     */
    ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
    ifp->if_ioctl = hn_ioctl;
    ifp->if_init = hn_init;
    if (hn_use_if_start) {
        int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

        ifp->if_start = hn_start;
        IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
        ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
        IFQ_SET_READY(&ifp->if_snd);
    } else {
        ifp->if_transmit = hn_transmit;
        ifp->if_qflush = hn_xmit_qflush;
    }
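    /*
     * To summarize the two TX methods: the legacy if_start path
     * drains the single ifnet send queue through TX ring0 only,
     * while the default if_transmit path (hn_transmit) can spread
     * packets across all TX rings, using the mbuf hash/flowid to
     * pick a ring.
     */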
    ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;

    /* We can't diff IPv6 packets from IPv4 packets on RX path. */
    ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;

    if (sc->hn_caps & HN_CAP_VLAN) {
        /* XXX not sure about VLAN_MTU. */
        ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
    }

    ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
    if (ifp->if_hwassist & HN_CSUM_IP_MASK)
        ifp->if_capabilities |= IFCAP_TXCSUM;
    if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
        ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
    if (sc->hn_caps & HN_CAP_TSO4) {
        ifp->if_capabilities |= IFCAP_TSO4;
        ifp->if_hwassist |= CSUM_IP_TSO;
    }
    if (sc->hn_caps & HN_CAP_TSO6) {
        ifp->if_capabilities |= IFCAP_TSO6;
        ifp->if_hwassist |= CSUM_IP6_TSO;
    }

    /* Enable all available capabilities by default. */
    ifp->if_capenable = ifp->if_capabilities;

    if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
        hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
        ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
        ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
    }
    ether_ifattach(ifp, eaddr);

    if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
        if_printf(ifp, "TSO segcnt %u segsz %u\n",
            ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
    }

    /* Inform the upper layer about the long frame support. */
    ifp->if_hdrlen = sizeof(struct ether_vlan_header);

    /*
     * Kick off link status check.
     */
    sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
    hn_link_status_update(sc);

    return (0);
failed:
    if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
        hn_synth_detach(sc);
    netvsc_detach(dev);
    return (error);
}
static int
netvsc_detach(device_t dev)
{
    struct hn_softc *sc = device_get_softc(dev);
    struct ifnet *ifp = sc->hn_ifp;

    if (device_is_attached(dev)) {
        HN_LOCK(sc);
        if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
            if (ifp->if_drv_flags & IFF_DRV_RUNNING)
                hn_stop(sc);
            /*
             * NOTE:
             * hn_stop() only suspends data, so management
             * stuffs have to be suspended manually here.
             */
            hn_suspend_mgmt(sc);
            hn_synth_detach(sc);
        }
        HN_UNLOCK(sc);
        ether_ifdetach(ifp);
    }

    ifmedia_removeall(&sc->hn_media);
    hn_destroy_rx_data(sc);
    hn_destroy_tx_data(sc);

    if (sc->hn_tx_taskq != hn_tx_taskq)
        taskqueue_free(sc->hn_tx_taskq);
    taskqueue_free(sc->hn_mgmt_taskq0);

    if (sc->hn_xact != NULL)
        vmbus_xact_ctx_destroy(sc->hn_xact);

    if_free(ifp);
    HN_LOCK_DESTROY(sc);
    return (0);
}
/*
 * Standard shutdown entry point
 */
static int
netvsc_shutdown(device_t dev)
{

    return (0);
}
static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
    struct hn_softc *sc = xsc;
    struct ifnet *ifp = sc->hn_ifp;
    uint32_t link_status;
    int error;

    error = hn_rndis_get_linkstatus(sc, &link_status);
    if (error) {
        /* XXX what to do? */
        return;
    }

    if (link_status == NDIS_MEDIA_STATE_CONNECTED)
        sc->hn_carrier = 1;
    else
        sc->hn_carrier = 0;
    if_link_state_change(ifp,
        sc->hn_carrier ? LINK_STATE_UP : LINK_STATE_DOWN);
}

void
hn_link_status_update(struct hn_softc *sc)
{

    if (sc->hn_mgmt_taskq != NULL)
        taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}
static int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
    struct mbuf *m = *m_head;
    int error;

    error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
        m, segs, nsegs, BUS_DMA_NOWAIT);
    if (error == EFBIG) {
        struct mbuf *m_new;

        m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
        if (m_new == NULL)
            return ENOBUFS;
        m = *m_head = m_new;
        txr->hn_tx_collapsed++;

        error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
            txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
    }
    if (!error) {
        bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
            BUS_DMASYNC_PREWRITE);
        txd->flags |= HN_TXD_FLAG_DMAMAP;
    }
    return error;
}
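/*
 * NOTE:
 * In hn_txdesc_dmamap_load() above, EFBIG means the mbuf chain has
 * more than HN_TX_DATA_SEGCNT_MAX segments; m_collapse() copies it
 * into fewer, larger mbufs and the load is retried once.  A second
 * failure is passed back to the caller, which drops the packet.
 */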
static __inline void
hn_txdesc_dmamap_unload(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

    if (txd->flags & HN_TXD_FLAG_DMAMAP) {
        bus_dmamap_sync(txr->hn_tx_data_dtag,
            txd->data_dmap, BUS_DMASYNC_POSTWRITE);
        bus_dmamap_unload(txr->hn_tx_data_dtag,
            txd->data_dmap);
        txd->flags &= ~HN_TXD_FLAG_DMAMAP;
    }
}
static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

    KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
        ("put an onlist txd %#x", txd->flags));

    KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
    if (atomic_fetchadd_int(&txd->refs, -1) != 1)
        return 0;

    hn_txdesc_dmamap_unload(txr, txd);
    if (txd->m != NULL) {
        m_freem(txd->m);
        txd->m = NULL;
    }

    txd->flags |= HN_TXD_FLAG_ONLIST;

#ifndef HN_USE_TXDESC_BUFRING
    mtx_lock_spin(&txr->hn_txlist_spin);
    KASSERT(txr->hn_txdesc_avail >= 0 &&
        txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
        ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
    txr->hn_txdesc_avail++;
    SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
    mtx_unlock_spin(&txr->hn_txlist_spin);
#else
    atomic_add_int(&txr->hn_txdesc_avail, 1);
    buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif

    return 1;
}
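/*
 * In the buf_ring case above, hn_txdesc_avail seems to be advisory
 * only (it feeds assertions and stats); the buf_ring itself is the
 * authoritative free list, so updating the counter w/ a plain atomic
 * before the enqueue is fine.
 */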
static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
    struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
    mtx_lock_spin(&txr->hn_txlist_spin);
    txd = SLIST_FIRST(&txr->hn_txlist);
    if (txd != NULL) {
        KASSERT(txr->hn_txdesc_avail > 0,
            ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
        txr->hn_txdesc_avail--;
        SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
    }
    mtx_unlock_spin(&txr->hn_txlist_spin);
#else
    txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

    if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
        atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
        KASSERT(txd->m == NULL && txd->refs == 0 &&
            (txd->flags & HN_TXD_FLAG_ONLIST), ("invalid txd"));
        txd->flags &= ~HN_TXD_FLAG_ONLIST;
        txd->refs = 1;
    }
    return txd;
}
static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

    /* 0->1 transition will never work */
    KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
    atomic_add_int(&txd->refs, 1);
}
static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
    bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
    mtx_lock_spin(&txr->hn_txlist_spin);
    if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
        pending = true;
    mtx_unlock_spin(&txr->hn_txlist_spin);
#else
    if (!buf_ring_full(txr->hn_txdesc_br))
        pending = true;
#endif
    return (pending);
}
static __inline void
hn_txeof(struct hn_tx_ring *txr)
{

    txr->hn_has_txeof = 0;
    txr->hn_txeof(txr);
}
static void
hn_tx_done(struct hn_send_ctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
    struct hn_txdesc *txd = sndc->hn_cbarg;
    struct hn_tx_ring *txr;

    if (sndc->hn_chim_idx != HN_NVS_CHIM_IDX_INVALID)
        hn_chim_free(sc, sndc->hn_chim_idx);

    txr = txd->txr;
    KASSERT(txr->hn_chan == chan,
        ("channel mismatch, on chan%u, should be chan%u",
         vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan)));

    txr->hn_has_txeof = 1;
    hn_txdesc_put(txr, txd);

    ++txr->hn_txdone_cnt;
    if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
        txr->hn_txdone_cnt = 0;
        if (txr->hn_oactive)
            hn_txeof(txr);
    }
}
void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
    struct lro_ctrl *lro = &rxr->hn_lro;
    struct lro_entry *queued;

    while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
        SLIST_REMOVE_HEAD(&lro->lro_active, next);
        tcp_lro_flush(lro, queued);
    }
#endif

    /*
     * NOTE:
     * 'txr' could be NULL, if multiple channels and
     * ifnet.if_start method are enabled.
     */
    if (txr == NULL || !txr->hn_has_txeof)
        return;

    txr->hn_txdone_cnt = 0;
    hn_txeof(txr);
}
static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

    KASSERT(ofs >= sizeof(struct rndis_packet_msg),
        ("invalid RNDIS packet msg offset %u", ofs));
    return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}
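/*
 * Per the RNDIS spec, per-message offsets are relative to the start
 * of the rm_dataoffset field, i.e. they exclude the 8 bytes taken by
 * rm_type and rm_len; the driver composes offsets from the start of
 * the whole message, hence the subtraction above.  E.g. data placed
 * right after the message header ends up with an on-wire offset of
 * sizeof(struct rndis_packet_msg) - 8.
 */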
/*
 * NOTE:
 * If this function fails, then both txd and m_head0 will be freed.
 */
static int
hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0)
{
    bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
    int error, nsegs, i;
    struct mbuf *m_head = *m_head0;
    struct rndis_packet_msg *pkt;
    uint32_t *pi_data;
    uint32_t send_buf_section_idx;
    int send_buf_section_size, pktlen;

    /*
     * extension points to the area reserved for the
     * rndis_filter_packet, which is placed just after
     * the netvsc_packet (and rppi struct, if present;
     * length is updated later).
     */
    pkt = txd->rndis_pkt;
    pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
    pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
    pkt->rm_dataoffset = sizeof(*pkt);
    pkt->rm_datalen = m_head->m_pkthdr.len;
    pkt->rm_pktinfooffset = sizeof(*pkt);
    pkt->rm_pktinfolen = 0;
    if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
        /*
         * Set the hash value for this packet, so that the host could
         * dispatch the TX done event for this packet back to this TX
         * ring's channel.
         */
        pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
            HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
        *pi_data = txr->hn_tx_idx;
    }

    if (m_head->m_flags & M_VLANTAG) {
        pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
            NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
        *pi_data = NDIS_VLAN_INFO_MAKE(
            EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
            EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
            EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
    }
    if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
        struct ether_vlan_header *eh;
        int ether_len;

        /*
         * XXX need m_pullup and use mtodo
         */
        eh = mtod(m_head, struct ether_vlan_header*);
        if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN))
            ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
        else
            ether_len = ETHER_HDR_LEN;

        pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
            NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
#ifdef INET
        if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
            struct ip *ip =
                (struct ip *)(m_head->m_data + ether_len);
            unsigned long iph_len = ip->ip_hl << 2;
            struct tcphdr *th =
                (struct tcphdr *)((caddr_t)ip + iph_len);

            ip->ip_len = 0;
            ip->ip_sum = 0;
            th->th_sum = in_pseudo(ip->ip_src.s_addr,
                ip->ip_dst.s_addr, htons(IPPROTO_TCP));
            *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
                m_head->m_pkthdr.tso_segsz);
        }
#endif
#if defined(INET6) && defined(INET)
        else
#endif
#ifdef INET6
        {
            struct ip6_hdr *ip6 = (struct ip6_hdr *)
                (m_head->m_data + ether_len);
            struct tcphdr *th = (struct tcphdr *)(ip6 + 1);

            ip6->ip6_plen = 0;
            th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
            *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
                m_head->m_pkthdr.tso_segsz);
        }
#endif
#endif	/* INET6 || INET */
    } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
        pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
            NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
        if (m_head->m_pkthdr.csum_flags &
            (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
            *pi_data = NDIS_TXCSUM_INFO_IPV6;
        } else {
            *pi_data = NDIS_TXCSUM_INFO_IPV4;
            if (m_head->m_pkthdr.csum_flags & CSUM_IP)
                *pi_data |= NDIS_TXCSUM_INFO_IPCS;
        }

        if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
            *pi_data |= NDIS_TXCSUM_INFO_TCPCS;
        else if (m_head->m_pkthdr.csum_flags &
            (CSUM_IP_UDP | CSUM_IP6_UDP))
            *pi_data |= NDIS_TXCSUM_INFO_UDPCS;
    }
    pktlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
    /* Convert RNDIS packet message offsets */
    pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
    pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
    /*
     * Chimney send, if the packet could fit into one chimney buffer.
     */
    if (pkt->rm_len < txr->hn_chim_size) {
        txr->hn_tx_chimney_tried++;
        send_buf_section_idx = hn_chim_alloc(txr->hn_sc);
        if (send_buf_section_idx != HN_NVS_CHIM_IDX_INVALID) {
            uint8_t *dest = txr->hn_sc->hn_chim +
                (send_buf_section_idx * txr->hn_sc->hn_chim_szmax);

            memcpy(dest, pkt, pktlen);
            dest += pktlen;
            m_copydata(m_head, 0, m_head->m_pkthdr.len, dest);

            send_buf_section_size = pkt->rm_len;
            txr->hn_gpa_cnt = 0;
            txr->hn_tx_chimney++;
            goto done;
        }
    }
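    /*
     * "Chimney" sending copies the whole RNDIS message plus payload
     * into a section of the pre-shared send buffer and transmits
     * only the section index, skipping the per-send GPA list built
     * below; it is a win for small packets only, hence the
     * hn_chim_size cutoff above.
     */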
    error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
    if (error) {
        int freed;

        /*
         * This mbuf is not linked w/ the txd yet, so free it now.
         */
        m_freem(m_head);
        *m_head0 = NULL;

        freed = hn_txdesc_put(txr, txd);
        KASSERT(freed != 0,
            ("fail to free txd upon txdma error"));

        txr->hn_txdma_failed++;
        if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1);
        return error;
    }
    *m_head0 = m_head;
    /* +1 RNDIS packet message */
    txr->hn_gpa_cnt = nsegs + 1;

    /* send packet with page buffer */
    txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
    txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
    txr->hn_gpa[0].gpa_len = pktlen;

    /*
     * Fill the page buffers with mbuf info after the page
     * buffer for RNDIS packet message.
     */
    for (i = 0; i < nsegs; ++i) {
        struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];

        gpa->gpa_page = atop(segs[i].ds_addr);
        gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
        gpa->gpa_len = segs[i].ds_len;
    }
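    /*
     * The resulting GPA (guest physical address) list thus reads:
     * entry 0 covers the RNDIS packet message itself, entries
     * 1..nsegs cover the mbuf data pages; the host consumes the
     * packet directly from these guest pages.
     */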
    send_buf_section_idx = HN_NVS_CHIM_IDX_INVALID;
    send_buf_section_size = 0;
done:
    /* Set the completion routine */
    hn_send_ctx_init(&txd->send_ctx, hn_tx_done, txd,
        send_buf_section_idx, send_buf_section_size);

    return 0;
}
/*
 * NOTE:
 * If this function fails, then txd will be freed, but the mbuf
 * associated w/ the txd will _not_ be freed.
 */
static int
hn_send_pkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
    int error, send_failed = 0;

again:
    /*
     * Make sure that txd is not freed before ETHER_BPF_MTAP.
     */
    hn_txdesc_hold(txd);
    error = hv_nv_on_send(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
        &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt);
    if (!error) {
        ETHER_BPF_MTAP(ifp, txd->m);
        if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
        if (!hn_use_if_start) {
            if_inc_counter(ifp, IFCOUNTER_OBYTES,
                txd->m->m_pkthdr.len);
            if (txd->m->m_flags & M_MCAST)
                if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
        }
        txr->hn_pkts++;
    }
    hn_txdesc_put(txr, txd);

    if (__predict_false(error)) {
        int freed;

        /*
         * This should "really rarely" happen.
         *
         * XXX Too many RX to be acked or too many sideband
         * commands to run?  Ask netvsc_channel_rollup()
         * to kick start later.
         */
        txr->hn_has_txeof = 1;
        if (!send_failed) {
            txr->hn_send_failed++;
            send_failed = 1;
            /*
             * Try sending again after set hn_has_txeof;
             * in case that we missed the last
             * netvsc_channel_rollup().
             */
            goto again;
        }
        if_printf(ifp, "send failed\n");

        /*
         * Caller will perform further processing on the
         * associated mbuf, so don't free it in hn_txdesc_put();
         * only unload it from the DMA map in hn_txdesc_put(),
         * if it was loaded.
         */
        txd->m = NULL;
        freed = hn_txdesc_put(txr, txd);
        KASSERT(freed != 0,
            ("fail to free txd upon send error"));

        txr->hn_send_failed++;
    }
    return error;
}
/*
 * Start a transmit of one or more packets
 */
static int
hn_start_locked(struct hn_tx_ring *txr, int len)
{
    struct hn_softc *sc = txr->hn_sc;
    struct ifnet *ifp = sc->hn_ifp;

    KASSERT(hn_use_if_start,
        ("hn_start_locked is called, when if_start is disabled"));
    KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
    mtx_assert(&txr->hn_tx_lock, MA_OWNED);

    if (__predict_false(txr->hn_suspended))
        return 0;

    if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
        IFF_DRV_RUNNING)
        return 0;

    while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
        struct hn_txdesc *txd;
        struct mbuf *m_head;
        int error;

        IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
        if (m_head == NULL)
            break;

        if (len > 0 && m_head->m_pkthdr.len > len) {
            /*
             * This sending could be time consuming; let callers
             * dispatch this packet sending (and sending of any
             * following up packets) to tx taskqueue.
             */
            IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
            return 1;
        }

        txd = hn_txdesc_get(txr);
        if (txd == NULL) {
            txr->hn_no_txdescs++;
            IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
            atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
            break;
        }

        error = hn_encap(txr, txd, &m_head);
        if (error) {
            /* Both txd and m_head are freed */
            continue;
        }

        error = hn_send_pkt(ifp, txr, txd);
        if (__predict_false(error)) {
            /* txd is freed, but m_head is not */
            IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
            atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
            break;
        }
    }
    return 0;
}
/*
 * Append the specified data to the indicated mbuf chain,
 * Extend the mbuf chain if the new data does not fit in
 * existing space.
 *
 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
 * There should be an equivalent in the kernel mbuf code,
 * but there does not appear to be one yet.
 *
 * Differs from m_append() in that additional mbufs are
 * allocated with cluster size MJUMPAGESIZE, and filled
 * as needed.
 *
 * Return 1 if able to complete the job; otherwise 0.
 */
static int
hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
{
    struct mbuf *m, *n;
    int remainder, space;

    for (m = m0; m->m_next != NULL; m = m->m_next)
        ;
    remainder = len;
    space = M_TRAILINGSPACE(m);
    if (space > 0) {
        /*
         * Copy into available space.
         */
        if (space > remainder)
            space = remainder;
        bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
        m->m_len += space;
        cp += space;
        remainder -= space;
    }
    while (remainder > 0) {
        /*
         * Allocate a new mbuf; could check space
         * and allocate a cluster instead.
         */
        n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
        if (n == NULL)
            break;
        n->m_len = min(MJUMPAGESIZE, remainder);
        bcopy(cp, mtod(n, caddr_t), n->m_len);
        cp += n->m_len;
        remainder -= n->m_len;
        m->m_next = n;
        m = n;
    }
    if (m0->m_flags & M_PKTHDR)
        m0->m_pkthdr.len += len - remainder;

    return (remainder == 0);
}
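/*
 * Minimal usage sketch for hv_m_append() (hypothetical sizes): to
 * copy a 6000 byte payload out of the RX buffer, hn_rxpkt() below
 * grabs one MJUMPAGESIZE cluster and lets hv_m_append() chain a
 * second one for the remainder:
 *
 *	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
 *	if (m != NULL && !hv_m_append(m, 6000, data))
 *		m_freem(m);	(mid-copy allocation failure)
 */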
#if defined(INET) || defined(INET6)
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
    if (hn_lro_mbufq_depth) {
        tcp_lro_queue_mbuf(lc, m);
        return 0;
    }
#endif
    return tcp_lro_rx(lc, m, 0);
}
#endif
/*
 * Called when we receive a data packet from the "wire" on the
 * specified device.
 *
 * Note: This is no longer used as a callback.
 */
int
hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
    const struct hn_recvinfo *info)
{
    struct ifnet *ifp = rxr->hn_ifp;
    struct mbuf *m_new;
    int size, do_lro = 0, do_csum = 1;
    int hash_type = M_HASHTYPE_OPAQUE;

    if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
        return (0);

    /*
     * Bail out if packet contains more data than configured MTU.
     */
    if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
        return (0);
    } else if (dlen <= MHLEN) {
        m_new = m_gethdr(M_NOWAIT, MT_DATA);
        if (m_new == NULL) {
            if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
            return (ENOBUFS);
        }
        memcpy(mtod(m_new, void *), data, dlen);
        m_new->m_pkthdr.len = m_new->m_len = dlen;
        rxr->hn_small_pkts++;
    } else {
        /*
         * Get an mbuf with a cluster.  For packets 2K or less,
         * get a standard 2K cluster.  For anything larger, get a
         * 4K cluster.  Any buffers larger than 4K can cause problems
         * if looped around to the Hyper-V TX channel, so avoid them.
         */
        size = MCLBYTES;
        if (dlen > MCLBYTES) {
            /* 4096 */
            size = MJUMPAGESIZE;
        }

        m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
        if (m_new == NULL) {
            if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
            return (ENOBUFS);
        }

        hv_m_append(m_new, dlen, data);
    }
    m_new->m_pkthdr.rcvif = ifp;
    if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
        do_csum = 0;

    /* receive side checksum offload */
    if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
        /* IP csum offload */
        if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
            m_new->m_pkthdr.csum_flags |=
                (CSUM_IP_CHECKED | CSUM_IP_VALID);
            rxr->hn_csum_ip++;
        }

        /* TCP/UDP csum offload */
        if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
             NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
            m_new->m_pkthdr.csum_flags |=
                (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
            m_new->m_pkthdr.csum_data = 0xffff;
            if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
                rxr->hn_csum_tcp++;
            else
                rxr->hn_csum_udp++;
        }

        if ((info->csum_info &
             (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
            (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
            do_lro = 1;
    } else {
        const struct ether_header *eh;
        uint16_t etype;
        int hoff;

        hoff = sizeof(*eh);
        if (m_new->m_len < hoff)
            goto skip;
        eh = mtod(m_new, struct ether_header *);
        etype = ntohs(eh->ether_type);
        if (etype == ETHERTYPE_VLAN) {
            const struct ether_vlan_header *evl;

            hoff = sizeof(*evl);
            if (m_new->m_len < hoff)
                goto skip;
            evl = mtod(m_new, struct ether_vlan_header *);
            etype = ntohs(evl->evl_proto);
        }

        if (etype == ETHERTYPE_IP) {
            int pr;

            pr = hn_check_iplen(m_new, hoff);
            if (pr == IPPROTO_TCP) {
                if (do_csum &&
                    (rxr->hn_trust_hcsum &
                     HN_TRUST_HCSUM_TCP)) {
                    rxr->hn_csum_trusted++;
                    m_new->m_pkthdr.csum_flags |=
                        (CSUM_IP_CHECKED | CSUM_IP_VALID |
                         CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
                    m_new->m_pkthdr.csum_data = 0xffff;
                }
                do_lro = 1;
            } else if (pr == IPPROTO_UDP) {
                if (do_csum &&
                    (rxr->hn_trust_hcsum &
                     HN_TRUST_HCSUM_UDP)) {
                    rxr->hn_csum_trusted++;
                    m_new->m_pkthdr.csum_flags |=
                        (CSUM_IP_CHECKED | CSUM_IP_VALID |
                         CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
                    m_new->m_pkthdr.csum_data = 0xffff;
                }
            } else if (pr != IPPROTO_DONE && do_csum &&
                (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
                rxr->hn_csum_trusted++;
                m_new->m_pkthdr.csum_flags |=
                    (CSUM_IP_CHECKED | CSUM_IP_VALID);
            }
        }
    }
skip:
    if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
        m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
            NDIS_VLAN_INFO_ID(info->vlan_info),
            NDIS_VLAN_INFO_PRI(info->vlan_info),
            NDIS_VLAN_INFO_CFI(info->vlan_info));
        m_new->m_flags |= M_VLANTAG;
    }
    if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
        rxr->hn_rss_pkts++;
        m_new->m_pkthdr.flowid = info->hash_value;
        if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
            NDIS_HASH_FUNCTION_TOEPLITZ) {
            uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);

            switch (type) {
            case NDIS_HASH_IPV4:
                hash_type = M_HASHTYPE_RSS_IPV4;
                break;

            case NDIS_HASH_TCP_IPV4:
                hash_type = M_HASHTYPE_RSS_TCP_IPV4;
                break;

            case NDIS_HASH_IPV6:
                hash_type = M_HASHTYPE_RSS_IPV6;
                break;

            case NDIS_HASH_IPV6_EX:
                hash_type = M_HASHTYPE_RSS_IPV6_EX;
                break;

            case NDIS_HASH_TCP_IPV6:
                hash_type = M_HASHTYPE_RSS_TCP_IPV6;
                break;

            case NDIS_HASH_TCP_IPV6_EX:
                hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
                break;
            }
        }
    } else {
        m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
    }
    M_HASHTYPE_SET(m_new, hash_type);
    /*
     * Note: Moved RX completion back to hv_nv_on_receive() so all
     * messages (not just data messages) will trigger a response.
     */

    if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
    rxr->hn_pkts++;

    if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
#if defined(INET) || defined(INET6)
        struct lro_ctrl *lro = &rxr->hn_lro;

        if (lro->lro_cnt) {
            rxr->hn_lro_tried++;
            if (hn_lro_rx(lro, m_new) == 0) {
                /* DONE! */
                return 0;
            }
        }
#endif
    }

    /* We're not holding the lock here, so don't release it */
    (*ifp->if_input)(ifp, m_new);

    return (0);
}
static int
hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
    struct hn_softc *sc = ifp->if_softc;
    struct ifreq *ifr = (struct ifreq *)data;
    int mask, error = 0;

    switch (cmd) {
    case SIOCSIFMTU:
        if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) {
            error = EINVAL;
            break;
        }

        HN_LOCK(sc);

        if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
            HN_UNLOCK(sc);
            break;
        }

        if ((sc->hn_caps & HN_CAP_MTU) == 0) {
            /* Can't change MTU */
            HN_UNLOCK(sc);
            error = EOPNOTSUPP;
            break;
        }

        if (ifp->if_mtu == ifr->ifr_mtu) {
            HN_UNLOCK(sc);
            break;
        }

        /* Obtain and record requested MTU */
        ifp->if_mtu = ifr->ifr_mtu;

#if __FreeBSD_version >= 1100099
        /*
         * Make sure that LRO aggregation length limit is still
         * valid, after the MTU change.
         */
        if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
            HN_LRO_LENLIM_MIN(ifp))
            hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
#endif

        /*
         * Suspend this interface before the synthetic parts
         * are ripped.
         */
        hn_suspend(sc);

        /*
         * Detach the synthetic parts, i.e. NVS and RNDIS.
         */
        hn_synth_detach(sc);

        /*
         * Reattach the synthetic parts, i.e. NVS and RNDIS,
         * with the new MTU setting.
         */
        error = hn_synth_attach(sc, ifr->ifr_mtu);

        if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
            hn_set_chim_size(sc, sc->hn_chim_szmax);
        hn_set_tso_maxsize(sc, hn_tso_maxlen, ifr->ifr_mtu);

        /*
         * All done!  Resume the interface now.
         */
        hn_resume(sc);

        HN_UNLOCK(sc);
        break;

    case SIOCSIFFLAGS:
        HN_LOCK(sc);
        if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
            HN_UNLOCK(sc);
            break;
        }

        if (ifp->if_flags & IFF_UP) {
            /*
             * If only the state of the PROMISC flag changed,
             * then just use the 'set promisc mode' command
             * instead of reinitializing the entire NIC.  Doing
             * a full re-init means reloading the firmware and
             * waiting for it to start up, which may take a
             * second or two.
             */
            /* Fixme:  Promiscuous mode? */
            if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
                ifp->if_flags & IFF_PROMISC &&
                !(sc->hn_if_flags & IFF_PROMISC)) {
                /* do something here for Hyper-V */
            } else if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
                !(ifp->if_flags & IFF_PROMISC) &&
                sc->hn_if_flags & IFF_PROMISC) {
                /* do something here for Hyper-V */
            } else {
                hn_init_locked(sc);
            }
        } else {
            if (ifp->if_drv_flags & IFF_DRV_RUNNING)
                hn_stop(sc);
        }
        sc->hn_if_flags = ifp->if_flags;

        HN_UNLOCK(sc);
        break;
    case SIOCSIFCAP:
        HN_LOCK(sc);
        mask = ifr->ifr_reqcap ^ ifp->if_capenable;

        if (mask & IFCAP_TXCSUM) {
            ifp->if_capenable ^= IFCAP_TXCSUM;
            if (ifp->if_capenable & IFCAP_TXCSUM)
                ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
            else
                ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
        }
        if (mask & IFCAP_TXCSUM_IPV6) {
            ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
            if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
                ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
            else
                ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
        }

        /* TODO: flip RNDIS offload parameters for RXCSUM. */
        if (mask & IFCAP_RXCSUM)
            ifp->if_capenable ^= IFCAP_RXCSUM;

        /* We can't diff IPv6 packets from IPv4 packets on RX path. */
        if (mask & IFCAP_RXCSUM_IPV6)
            ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;

        if (mask & IFCAP_LRO)
            ifp->if_capenable ^= IFCAP_LRO;

        if (mask & IFCAP_TSO4) {
            ifp->if_capenable ^= IFCAP_TSO4;
            if (ifp->if_capenable & IFCAP_TSO4)
                ifp->if_hwassist |= CSUM_IP_TSO;
            else
                ifp->if_hwassist &= ~CSUM_IP_TSO;
        }
        if (mask & IFCAP_TSO6) {
            ifp->if_capenable ^= IFCAP_TSO6;
            if (ifp->if_capenable & IFCAP_TSO6)
                ifp->if_hwassist |= CSUM_IP6_TSO;
            else
                ifp->if_hwassist &= ~CSUM_IP6_TSO;
        }

        HN_UNLOCK(sc);
        break;
    case SIOCADDMULTI:
    case SIOCDELMULTI:
        /* Always all-multi */
        /*
         * TODO:
         * Enable/disable all-multi according to the emptiness of
         * the mcast address list.
         */
        break;

    case SIOCSIFMEDIA:
    case SIOCGIFMEDIA:
        error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
        break;

    default:
        error = ether_ioctl(ifp, cmd, data);
        break;
    }
    return (error);
}
static void
hn_stop(struct hn_softc *sc)
{
    struct ifnet *ifp = sc->hn_ifp;
    int i;

    HN_LOCK_ASSERT(sc);

    KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
        ("synthetic parts were not attached"));

    /* Clear RUNNING bit _before_ hn_suspend_data() */
    atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
    hn_suspend_data(sc);

    /* Clear OACTIVE bit. */
    atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
    for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
        sc->hn_tx_ring[i].hn_oactive = 0;
}
/*
 * FreeBSD transmit entry point
 */
static void
hn_start(struct ifnet *ifp)
{
    struct hn_softc *sc = ifp->if_softc;
    struct hn_tx_ring *txr = &sc->hn_tx_ring[0];

    if (txr->hn_sched_tx)
        goto do_sched;

    if (mtx_trylock(&txr->hn_tx_lock)) {
        int sched;

        sched = hn_start_locked(txr, txr->hn_direct_tx_size);
        mtx_unlock(&txr->hn_tx_lock);
        if (!sched)
            return;
    }
do_sched:
    taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
}
static void
hn_start_txeof(struct hn_tx_ring *txr)
{
    struct hn_softc *sc = txr->hn_sc;
    struct ifnet *ifp = sc->hn_ifp;

    KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));

    if (txr->hn_sched_tx)
        goto do_sched;

    if (mtx_trylock(&txr->hn_tx_lock)) {
        int sched;

        atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
        sched = hn_start_locked(txr, txr->hn_direct_tx_size);
        mtx_unlock(&txr->hn_tx_lock);
        if (sched) {
            taskqueue_enqueue(txr->hn_tx_taskq,
                &txr->hn_tx_task);
        }
    } else {
do_sched:
        /*
         * Release the OACTIVE earlier, with the hope, that
         * others could catch up.  The task will clear the
         * flag again with the hn_tx_lock to avoid possible
         * races.
         */
        atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
        taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
    }
}
static void
hn_init_locked(struct hn_softc *sc)
{
    struct ifnet *ifp = sc->hn_ifp;
    int i;

    HN_LOCK_ASSERT(sc);

    if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
        return;

    if (ifp->if_drv_flags & IFF_DRV_RUNNING)
        return;

    /* TODO: add hn_rx_filter */
    hn_rndis_set_rxfilter(sc, NDIS_PACKET_TYPE_PROMISCUOUS);

    /* Clear OACTIVE bit. */
    atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
    for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
        sc->hn_tx_ring[i].hn_oactive = 0;

    /* Clear TX 'suspended' bit. */
    hn_tx_resume(sc, sc->hn_tx_ring_inuse);

    /* Everything is ready; unleash! */
    atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
}
static void
hn_init(void *xsc)
{
    struct hn_softc *sc = xsc;

    HN_LOCK(sc);
    hn_init_locked(sc);
    HN_UNLOCK(sc);
}
static void
hn_watchdog(struct ifnet *ifp)
{

    if_printf(ifp, "watchdog timeout -- resetting\n");
    hn_init(ifp->if_softc);	/* XXX */
}
#if __FreeBSD_version >= 1100099
static int
hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    unsigned int lenlim;
    int error;

    lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
    error = sysctl_handle_int(oidp, &lenlim, 0, req);
    if (error || req->newptr == NULL)
        return error;

    if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
        lenlim > TCP_LRO_LENGTH_MAX) {
        return EINVAL;
    }

    hn_set_lro_lenlim(sc, lenlim);
    return 0;
}
static int
hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ackcnt, error, i;

    /*
     * lro_ackcnt_lim is append count limit,
     * +1 to turn it into aggregation limit.
     */
    ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
    error = sysctl_handle_int(oidp, &ackcnt, 0, req);
    if (error || req->newptr == NULL)
        return error;

    if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
        return EINVAL;

    /*
     * Convert aggregation limit back to append
     * count limit.
     */
    --ackcnt;
    for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
        sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
    return 0;
}
#endif
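/*
 * Example for the ackcnt sysctl above: writing 2 means "flush an LRO
 * entry once it has aggregated 2 pure ACKs", which is stored as
 * lro_ackcnt_lim = 1 append (the first ACK creates the entry, the
 * second one is appended).
 */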
static int
hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int hcsum = arg2;
    int on, error, i;

    on = 0;
    if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
        on = 1;

    error = sysctl_handle_int(oidp, &on, 0, req);
    if (error || req->newptr == NULL)
        return error;

    for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
        struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];

        if (on)
            rxr->hn_trust_hcsum |= hcsum;
        else
            rxr->hn_trust_hcsum &= ~hcsum;
    }
    return 0;
}
static int
hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int chim_size, error;

    chim_size = sc->hn_tx_ring[0].hn_chim_size;
    error = sysctl_handle_int(oidp, &chim_size, 0, req);
    if (error || req->newptr == NULL)
        return error;

    if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
        return EINVAL;

    hn_set_chim_size(sc, chim_size);
    return 0;
}
#if __FreeBSD_version < 1100095
static int
hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ofs = arg2, i, error;
    struct hn_rx_ring *rxr;
    uint64_t stat = 0;

    for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
        rxr = &sc->hn_rx_ring[i];
        stat += *((int *)((uint8_t *)rxr + ofs));
    }

    error = sysctl_handle_64(oidp, &stat, 0, req);
    if (error || req->newptr == NULL)
        return error;

    /* Zero out this stat. */
    for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
        rxr = &sc->hn_rx_ring[i];
        *((int *)((uint8_t *)rxr + ofs)) = 0;
    }
    return 0;
}
#else
static int
hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ofs = arg2, i, error;
    struct hn_rx_ring *rxr;
    uint64_t stat = 0;

    for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
        rxr = &sc->hn_rx_ring[i];
        stat += *((uint64_t *)((uint8_t *)rxr + ofs));
    }

    error = sysctl_handle_64(oidp, &stat, 0, req);
    if (error || req->newptr == NULL)
        return error;

    /* Zero out this stat. */
    for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
        rxr = &sc->hn_rx_ring[i];
        *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
    }
    return 0;
}
#endif
static int
hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ofs = arg2, i, error;
    struct hn_rx_ring *rxr;
    u_long stat = 0;

    for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
        rxr = &sc->hn_rx_ring[i];
        stat += *((u_long *)((uint8_t *)rxr + ofs));
    }

    error = sysctl_handle_long(oidp, &stat, 0, req);
    if (error || req->newptr == NULL)
        return error;

    /* Zero out this stat. */
    for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
        rxr = &sc->hn_rx_ring[i];
        *((u_long *)((uint8_t *)rxr + ofs)) = 0;
    }
    return 0;
}
static int
hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ofs = arg2, i, error;
    struct hn_tx_ring *txr;
    u_long stat = 0;

    for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
        txr = &sc->hn_tx_ring[i];
        stat += *((u_long *)((uint8_t *)txr + ofs));
    }

    error = sysctl_handle_long(oidp, &stat, 0, req);
    if (error || req->newptr == NULL)
        return error;

    /* Zero out this stat. */
    for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
        txr = &sc->hn_tx_ring[i];
        *((u_long *)((uint8_t *)txr + ofs)) = 0;
    }
    return 0;
}
static int
hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ofs = arg2, i, error, conf;
    struct hn_tx_ring *txr;

    txr = &sc->hn_tx_ring[0];
    conf = *((int *)((uint8_t *)txr + ofs));

    error = sysctl_handle_int(oidp, &conf, 0, req);
    if (error || req->newptr == NULL)
        return error;

    for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
        txr = &sc->hn_tx_ring[i];
        *((int *)((uint8_t *)txr + ofs)) = conf;
    }
    return 0;
}
static int
hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    char verstr[16];

    snprintf(verstr, sizeof(verstr), "%u.%u",
        HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
        HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
    return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
}
static int
hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    char caps_str[128];
    uint32_t caps;

    HN_LOCK(sc);
    caps = sc->hn_caps;
    HN_UNLOCK(sc);
    snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
    return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
}
static int
hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    char assist_str[128];
    uint32_t hwassist;

    HN_LOCK(sc);
    hwassist = sc->hn_ifp->if_hwassist;
    HN_UNLOCK(sc);
    snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
    return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
}
static int
hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int error;

    HN_LOCK(sc);

    error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
    if (error || req->newptr == NULL)
        goto back;

    error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
    if (error)
        goto back;
    sc->hn_flags |= HN_FLAG_HAS_RSSKEY;

    if (sc->hn_rx_ring_inuse > 1) {
        error = hn_rss_reconfig(sc);
    } else {
        /* Not RSS capable, at least for now; just save the RSS key. */
        error = 0;
    }
back:
    HN_UNLOCK(sc);
    return (error);
}
static int
hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int error;

    HN_LOCK(sc);

    error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
    if (error || req->newptr == NULL)
        goto back;

    /*
     * Don't allow RSS indirect table change, if this interface is not
     * RSS capable currently.
     */
    if (sc->hn_rx_ring_inuse == 1) {
        error = EOPNOTSUPP;
        goto back;
    }

    error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
    if (error)
        goto back;
    sc->hn_flags |= HN_FLAG_HAS_RSSIND;

    hn_rss_ind_fixup(sc, sc->hn_rx_ring_inuse);
    error = hn_rss_reconfig(sc);
back:
    HN_UNLOCK(sc);
    return (error);
}
2336 hn_check_iplen(const struct mbuf *m, int hoff)
2338 const struct ip *ip;
2339 int len, iphlen, iplen;
2340 const struct tcphdr *th;
2341 int thoff; /* TCP data offset */
2343 len = hoff + sizeof(struct ip);
2345 /* The packet must be at least the size of an IP header. */
2346 if (m->m_pkthdr.len < len)
2347 return IPPROTO_DONE;
2349 /* The fixed IP header must reside completely in the first mbuf. */
2351 return IPPROTO_DONE;
2353 ip = mtodo(m, hoff);
2355 /* Bound check the packet's stated IP header length. */
2356 iphlen = ip->ip_hl << 2;
2357 if (iphlen < sizeof(struct ip)) /* minimum header length */
2358 return IPPROTO_DONE;
2360 /* The full IP header must reside completely in the one mbuf. */
2361 if (m->m_len < hoff + iphlen)
2362 return IPPROTO_DONE;
2364 iplen = ntohs(ip->ip_len);
2367 * Check that the amount of data in the buffers is as
2368 * at least much as the IP header would have us expect.
2370 if (m->m_pkthdr.len < hoff + iplen)
2371 return IPPROTO_DONE;
2374 * Ignore IP fragments.
2376 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
2377 return IPPROTO_DONE;
2380 * The TCP/IP or UDP/IP header must be entirely contained within
2381 * the first fragment of a packet.
2385 if (iplen < iphlen + sizeof(struct tcphdr))
2386 return IPPROTO_DONE;
2387 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
2388 return IPPROTO_DONE;
2389 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
2390 thoff = th->th_off << 2;
2391 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
2392 return IPPROTO_DONE;
2393 if (m->m_len < hoff + iphlen + thoff)
2394 return IPPROTO_DONE;
2397 if (iplen < iphlen + sizeof(struct udphdr))
2398 return IPPROTO_DONE;
2399 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
2400 return IPPROTO_DONE;
2404 return IPPROTO_DONE;
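/*
 * Caller sketch (illustrative): hn_check_iplen() vets a frame before
 * the host's checksum verdict is trusted.  Assuming `m` carries an
 * Ethernet frame and the usual return convention (the IP protocol on
 * success, IPPROTO_DONE on any failed check):
 *
 *	int hoff = sizeof(struct ether_header);
 *
 *	switch (hn_check_iplen(m, hoff)) {
 *	case IPPROTO_TCP:
 *	case IPPROTO_UDP:
 *		// headers are self-consistent; safe to mark verified
 *		m->m_pkthdr.csum_flags |=
 *		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 *		m->m_pkthdr.csum_data = 0xffff;
 *		break;
 *	default:
 *		// IPPROTO_DONE: leave csum_flags alone
 *		break;
 *	}
 */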
2411 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
2413 struct sysctl_oid_list *child;
2414 struct sysctl_ctx_list *ctx;
2415 device_t dev = sc->hn_dev;
2416 #if defined(INET) || defined(INET6)
2417 #if __FreeBSD_version >= 1100095
2424 * Create RXBUF for reception.
2427 * - It is shared by all channels.
2428	 * - A large enough buffer is allocated; certain versions of NVS
2429	 *   may further limit the usable space.
2431 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
2432 PAGE_SIZE, 0, NETVSC_RECEIVE_BUFFER_SIZE, &sc->hn_rxbuf_dma,
2433 BUS_DMA_WAITOK | BUS_DMA_ZERO);
2434 if (sc->hn_rxbuf == NULL) {
2435 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
2439 sc->hn_rx_ring_cnt = ring_cnt;
2440 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
2442 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
2443 M_NETVSC, M_WAITOK | M_ZERO);
2445 #if defined(INET) || defined(INET6)
2446 #if __FreeBSD_version >= 1100095
2447 lroent_cnt = hn_lro_entry_count;
2448 if (lroent_cnt < TCP_LRO_ENTRIES)
2449 lroent_cnt = TCP_LRO_ENTRIES;
2451 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
2453 #endif /* INET || INET6 */
2455 ctx = device_get_sysctl_ctx(dev);
2456 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2458 /* Create dev.hn.UNIT.rx sysctl tree */
2459 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
2460 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
2462 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2463 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2465 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
2467 NETVSC_DEVICE_RING_BUFFER_SIZE +
2468 NETVSC_DEVICE_RING_BUFFER_SIZE,
2469 &rxr->hn_br_dma, BUS_DMA_WAITOK);
2470 if (rxr->hn_br == NULL) {
2471 device_printf(dev, "allocate bufring failed\n");
2475 if (hn_trust_hosttcp)
2476 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
2477 if (hn_trust_hostudp)
2478 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
2479 if (hn_trust_hostip)
2480 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
2481 rxr->hn_ifp = sc->hn_ifp;
2482 if (i < sc->hn_tx_ring_cnt)
2483 rxr->hn_txr = &sc->hn_tx_ring[i];
2484 rxr->hn_rdbuf = malloc(NETVSC_PACKET_SIZE, M_NETVSC, M_WAITOK);
2486 rxr->hn_rxbuf = sc->hn_rxbuf;
2491 #if defined(INET) || defined(INET6)
2492 #if __FreeBSD_version >= 1100095
2493 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
2494 hn_lro_mbufq_depth);
2496 tcp_lro_init(&rxr->hn_lro);
2497 rxr->hn_lro.ifp = sc->hn_ifp;
2499 #if __FreeBSD_version >= 1100099
2500 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
2501 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
2503 #endif /* INET || INET6 */
2505 if (sc->hn_rx_sysctl_tree != NULL) {
2509 * Create per RX ring sysctl tree:
2510 * dev.hn.UNIT.rx.RINGID
2512 snprintf(name, sizeof(name), "%d", i);
2513 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
2514 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
2515 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
2517 if (rxr->hn_rx_sysctl_tree != NULL) {
2518 SYSCTL_ADD_ULONG(ctx,
2519 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
2520 OID_AUTO, "packets", CTLFLAG_RW,
2521 &rxr->hn_pkts, "# of packets received");
2522 SYSCTL_ADD_ULONG(ctx,
2523 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
2524 OID_AUTO, "rss_pkts", CTLFLAG_RW,
2526 "# of packets w/ RSS info received");
2531 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
2532 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2533 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
2534 #if __FreeBSD_version < 1100095
2535 hn_rx_stat_int_sysctl,
2537 hn_rx_stat_u64_sysctl,
2539 "LU", "LRO queued");
2540 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
2541 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2542 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
2543 #if __FreeBSD_version < 1100095
2544 hn_rx_stat_int_sysctl,
2546 hn_rx_stat_u64_sysctl,
2548 "LU", "LRO flushed");
2549 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
2550 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2551 __offsetof(struct hn_rx_ring, hn_lro_tried),
2552 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
2553 #if __FreeBSD_version >= 1100099
2554 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
2555 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2556 hn_lro_lenlim_sysctl, "IU",
2557 "Max # of data bytes to be aggregated by LRO");
2558 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
2559 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2560 hn_lro_ackcnt_sysctl, "I",
2561 "Max # of ACKs to be aggregated by LRO");
2563 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
2564 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
2565 hn_trust_hcsum_sysctl, "I",
2566	    "Trust tcp segment verification on host side, "
2567 "when csum info is missing");
2568 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
2569 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
2570 hn_trust_hcsum_sysctl, "I",
2571 "Trust udp datagram verification on host side, "
2572 "when csum info is missing");
2573 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
2574 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
2575 hn_trust_hcsum_sysctl, "I",
2576 "Trust ip packet verification on host side, "
2577 "when csum info is missing");
2578 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
2579 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2580 __offsetof(struct hn_rx_ring, hn_csum_ip),
2581 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
2582 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
2583 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2584 __offsetof(struct hn_rx_ring, hn_csum_tcp),
2585 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
2586 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
2587 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2588 __offsetof(struct hn_rx_ring, hn_csum_udp),
2589 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
2590 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
2591 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2592 __offsetof(struct hn_rx_ring, hn_csum_trusted),
2593 hn_rx_stat_ulong_sysctl, "LU",
2594 "# of packets that we trust host's csum verification");
2595 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
2596 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2597 __offsetof(struct hn_rx_ring, hn_small_pkts),
2598 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
2599 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
2600 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
2601 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
2602 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
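/*
 * For orientation, the sysctl plumbing above yields a tree along
 * these lines for unit 0 with two RX rings (names come from the
 * SYSCTL_ADD_* calls above; values are illustrative):
 *
 *	dev.hn.0.rx_ring_cnt: 2
 *	dev.hn.0.rx_ring_inuse: 2
 *	dev.hn.0.lro_queued: ...
 *	dev.hn.0.lro_flushed: ...
 *	dev.hn.0.rx.0.packets: ...
 *	dev.hn.0.rx.0.rss_pkts: ...
 *	dev.hn.0.rx.1.packets: ...
 */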
2608 hn_destroy_rx_data(struct hn_softc *sc)
2612 if (sc->hn_rxbuf != NULL) {
2613 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
2614 sc->hn_rxbuf = NULL;
2617 if (sc->hn_rx_ring_cnt == 0)
2620 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2621 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2623 if (rxr->hn_br == NULL)
2625 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
2628 #if defined(INET) || defined(INET6)
2629 tcp_lro_free(&rxr->hn_lro);
2631 free(rxr->hn_rdbuf, M_NETVSC);
2633 free(sc->hn_rx_ring, M_NETVSC);
2634 sc->hn_rx_ring = NULL;
2636 sc->hn_rx_ring_cnt = 0;
2637 sc->hn_rx_ring_inuse = 0;
2641 hn_create_tx_ring(struct hn_softc *sc, int id)
2643 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
2644 device_t dev = sc->hn_dev;
2645 bus_dma_tag_t parent_dtag;
2649 txr->hn_tx_idx = id;
2651 #ifndef HN_USE_TXDESC_BUFRING
2652 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
2654 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
2656 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
2657 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
2658 M_NETVSC, M_WAITOK | M_ZERO);
2659 #ifndef HN_USE_TXDESC_BUFRING
2660 SLIST_INIT(&txr->hn_txlist);
2662 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_NETVSC,
2663 M_WAITOK, &txr->hn_tx_lock);
2666 txr->hn_tx_taskq = sc->hn_tx_taskq;
2668 if (hn_use_if_start) {
2669 txr->hn_txeof = hn_start_txeof;
2670 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
2671 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
2675 txr->hn_txeof = hn_xmit_txeof;
2676 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
2677 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
2679 br_depth = hn_get_txswq_depth(txr);
2680 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_NETVSC,
2681 M_WAITOK, &txr->hn_tx_lock);
2684 txr->hn_direct_tx_size = hn_direct_tx_size;
2687	 * Always schedule transmission instead of trying to do direct
2688	 * transmission.  This gives the best performance so far.
2690 txr->hn_sched_tx = 1;
2692 parent_dtag = bus_get_dma_tag(dev);
2694 /* DMA tag for RNDIS packet messages. */
2695 error = bus_dma_tag_create(parent_dtag, /* parent */
2696 HN_RNDIS_PKT_ALIGN, /* alignment */
2697 HN_RNDIS_PKT_BOUNDARY, /* boundary */
2698 BUS_SPACE_MAXADDR, /* lowaddr */
2699 BUS_SPACE_MAXADDR, /* highaddr */
2700 NULL, NULL, /* filter, filterarg */
2701 HN_RNDIS_PKT_LEN, /* maxsize */
2703 HN_RNDIS_PKT_LEN, /* maxsegsize */
2705 NULL, /* lockfunc */
2706 NULL, /* lockfuncarg */
2707 &txr->hn_tx_rndis_dtag);
2709 device_printf(dev, "failed to create rndis dmatag\n");
2713 /* DMA tag for data. */
2714 error = bus_dma_tag_create(parent_dtag, /* parent */
2716 HN_TX_DATA_BOUNDARY, /* boundary */
2717 BUS_SPACE_MAXADDR, /* lowaddr */
2718 BUS_SPACE_MAXADDR, /* highaddr */
2719 NULL, NULL, /* filter, filterarg */
2720 HN_TX_DATA_MAXSIZE, /* maxsize */
2721 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
2722 HN_TX_DATA_SEGSIZE, /* maxsegsize */
2724 NULL, /* lockfunc */
2725 NULL, /* lockfuncarg */
2726 &txr->hn_tx_data_dtag);
2728 device_printf(dev, "failed to create data dmatag\n");
2732 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
2733 struct hn_txdesc *txd = &txr->hn_txdesc[i];
2738 * Allocate and load RNDIS packet message.
2740 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
2741 (void **)&txd->rndis_pkt,
2742 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
2743 &txd->rndis_pkt_dmap);
2746 "failed to allocate rndis_packet_msg, %d\n", i);
2750 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
2751 txd->rndis_pkt_dmap,
2752 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
2753 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
2757 "failed to load rndis_packet_msg, %d\n", i);
2758 bus_dmamem_free(txr->hn_tx_rndis_dtag,
2759 txd->rndis_pkt, txd->rndis_pkt_dmap);
2763 /* DMA map for TX data. */
2764 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
2768 "failed to allocate tx data dmamap\n");
2769 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
2770 txd->rndis_pkt_dmap);
2771 bus_dmamem_free(txr->hn_tx_rndis_dtag,
2772 txd->rndis_pkt, txd->rndis_pkt_dmap);
2776 /* All set, put it to list */
2777 txd->flags |= HN_TXD_FLAG_ONLIST;
2778 #ifndef HN_USE_TXDESC_BUFRING
2779 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2781 buf_ring_enqueue(txr->hn_txdesc_br, txd);
2784 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
2786 if (sc->hn_tx_sysctl_tree != NULL) {
2787 struct sysctl_oid_list *child;
2788 struct sysctl_ctx_list *ctx;
2792 * Create per TX ring sysctl tree:
2793 * dev.hn.UNIT.tx.RINGID
2795 ctx = device_get_sysctl_ctx(dev);
2796 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
2798 snprintf(name, sizeof(name), "%d", id);
2799 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
2800 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
2802 if (txr->hn_tx_sysctl_tree != NULL) {
2803 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
2805 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
2806 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
2807 "# of available TX descs");
2808 if (!hn_use_if_start) {
2809 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
2810 CTLFLAG_RD, &txr->hn_oactive, 0,
2813 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
2814 CTLFLAG_RW, &txr->hn_pkts,
2815 "# of packets transmitted");
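/*
 * Teardown must mirror the setup above.  Each TX descriptor was
 * built with bus_dmamem_alloc() + bus_dmamap_load() for the RNDIS
 * message and bus_dmamap_create() for the data map, so destruction
 * unwinds in reverse order:
 *
 *	bus_dmamap_unload(rndis_dtag, rndis_dmap);	// undo load
 *	bus_dmamem_free(rndis_dtag, pkt, rndis_dmap);	// undo alloc
 *	bus_dmamap_destroy(data_dtag, data_dmap);	// undo create
 *
 * hn_txdesc_dmamap_destroy() below is the per-descriptor form of
 * exactly this sequence.
 */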
2823 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
2825 struct hn_tx_ring *txr = txd->txr;
2827 KASSERT(txd->m == NULL, ("still has mbuf installed"));
2828 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
2830 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
2831 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
2832 txd->rndis_pkt_dmap);
2833 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
2837 hn_destroy_tx_ring(struct hn_tx_ring *txr)
2839 struct hn_txdesc *txd;
2841 if (txr->hn_txdesc == NULL)
2844 #ifndef HN_USE_TXDESC_BUFRING
2845 while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) {
2846 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2847 hn_txdesc_dmamap_destroy(txd);
2850 mtx_lock(&txr->hn_tx_lock);
2851 while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL)
2852 hn_txdesc_dmamap_destroy(txd);
2853 mtx_unlock(&txr->hn_tx_lock);
2856 if (txr->hn_tx_data_dtag != NULL)
2857 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
2858 if (txr->hn_tx_rndis_dtag != NULL)
2859 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
2861 #ifdef HN_USE_TXDESC_BUFRING
2862 buf_ring_free(txr->hn_txdesc_br, M_NETVSC);
2865 free(txr->hn_txdesc, M_NETVSC);
2866 txr->hn_txdesc = NULL;
2868 if (txr->hn_mbuf_br != NULL)
2869 buf_ring_free(txr->hn_mbuf_br, M_NETVSC);
2871 #ifndef HN_USE_TXDESC_BUFRING
2872 mtx_destroy(&txr->hn_txlist_spin);
2874 mtx_destroy(&txr->hn_tx_lock);
2878 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
2880 struct sysctl_oid_list *child;
2881 struct sysctl_ctx_list *ctx;
2885 * Create TXBUF for chimney sending.
2887 * NOTE: It is shared by all channels.
2889 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
2890 PAGE_SIZE, 0, NETVSC_SEND_BUFFER_SIZE, &sc->hn_chim_dma,
2891 BUS_DMA_WAITOK | BUS_DMA_ZERO);
2892 if (sc->hn_chim == NULL) {
2893 device_printf(sc->hn_dev, "allocate txbuf failed\n");
2897 sc->hn_tx_ring_cnt = ring_cnt;
2898 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
2900 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
2901 M_NETVSC, M_WAITOK | M_ZERO);
2903 ctx = device_get_sysctl_ctx(sc->hn_dev);
2904 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
2906 /* Create dev.hn.UNIT.tx sysctl tree */
2907 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
2908 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
2910 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2913 error = hn_create_tx_ring(sc, i);
2918 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
2919 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2920 __offsetof(struct hn_tx_ring, hn_no_txdescs),
2921 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
2922 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
2923 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2924 __offsetof(struct hn_tx_ring, hn_send_failed),
2925	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
2926 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
2927 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2928 __offsetof(struct hn_tx_ring, hn_txdma_failed),
2929	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
2930 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
2931 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2932 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
2933	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbufs collapsed");
2934 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
2935 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2936 __offsetof(struct hn_tx_ring, hn_tx_chimney),
2937 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
2938 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
2939 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2940 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
2941 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
2942 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
2943 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
2944 "# of total TX descs");
2945 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
2946 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
2947 "Chimney send packet size upper boundary");
2948 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
2949 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2950 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
2951 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
2952 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2953 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
2954 hn_tx_conf_int_sysctl, "I",
2955 "Size of the packet for direct transmission");
2956 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
2957 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2958 __offsetof(struct hn_tx_ring, hn_sched_tx),
2959 hn_tx_conf_int_sysctl, "I",
2960 "Always schedule transmission "
2961 "instead of doing direct transmission");
2962 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
2963 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
2964 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
2965 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
2971 hn_set_chim_size(struct hn_softc *sc, int chim_size)
2975 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2976 sc->hn_tx_ring[i].hn_chim_size = chim_size;
2980 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
2982 struct ifnet *ifp = sc->hn_ifp;
2985 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
2988 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
2989 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
2990 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
2992 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
2993 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
2994 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
2996 if (tso_maxlen < tso_minlen)
2997 tso_maxlen = tso_minlen;
2998 else if (tso_maxlen > IP_MAXPACKET)
2999 tso_maxlen = IP_MAXPACKET;
3000 if (tso_maxlen > sc->hn_ndis_tso_szmax)
3001 tso_maxlen = sc->hn_ndis_tso_szmax;
3002 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3004 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
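/*
 * Worked example (illustrative numbers): with mtu = 1500,
 * hn_ndis_tso_sgmin = 2 and hn_ndis_tso_szmax = 62780,
 * tso_minlen = 2 * 1500 = 3000.  A requested tso_maxlen of 65536 is
 * clamped first to IP_MAXPACKET (65535), then to the NDIS limit
 * (62780), so the advertised if_hw_tsomax becomes
 * 62780 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN) = 62780 - 18 = 62762.
 */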
3008 hn_fixup_tx_data(struct hn_softc *sc)
3010 uint64_t csum_assist;
3013 hn_set_chim_size(sc, sc->hn_chim_szmax);
3014 if (hn_tx_chimney_size > 0 &&
3015 hn_tx_chimney_size < sc->hn_chim_szmax)
3016 hn_set_chim_size(sc, hn_tx_chimney_size);
3019 if (sc->hn_caps & HN_CAP_IPCS)
3020 csum_assist |= CSUM_IP;
3021 if (sc->hn_caps & HN_CAP_TCP4CS)
3022 csum_assist |= CSUM_IP_TCP;
3023 if (sc->hn_caps & HN_CAP_UDP4CS)
3024 csum_assist |= CSUM_IP_UDP;
3026 if (sc->hn_caps & HN_CAP_TCP6CS)
3027 csum_assist |= CSUM_IP6_TCP;
3028 if (sc->hn_caps & HN_CAP_UDP6CS)
3029 csum_assist |= CSUM_IP6_UDP;
3032 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3033 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3035 if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_30) {
3036 /* Support HASHVAL pktinfo on TX path. */
3037 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3038 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3043 hn_destroy_tx_data(struct hn_softc *sc)
3047 if (sc->hn_chim != NULL) {
3048 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3052 if (sc->hn_tx_ring_cnt == 0)
3055 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3056 hn_destroy_tx_ring(&sc->hn_tx_ring[i]);
3058 free(sc->hn_tx_ring, M_NETVSC);
3059 sc->hn_tx_ring = NULL;
3061 sc->hn_tx_ring_cnt = 0;
3062 sc->hn_tx_ring_inuse = 0;
3066 hn_start_taskfunc(void *xtxr, int pending __unused)
3068 struct hn_tx_ring *txr = xtxr;
3070 mtx_lock(&txr->hn_tx_lock);
3071 hn_start_locked(txr, 0);
3072 mtx_unlock(&txr->hn_tx_lock);
3076 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
3078 struct hn_tx_ring *txr = xtxr;
3080 mtx_lock(&txr->hn_tx_lock);
3081 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
3082 hn_start_locked(txr, 0);
3083 mtx_unlock(&txr->hn_tx_lock);
3087 hn_xmit(struct hn_tx_ring *txr, int len)
3089 struct hn_softc *sc = txr->hn_sc;
3090 struct ifnet *ifp = sc->hn_ifp;
3091 struct mbuf *m_head;
3093 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3094 KASSERT(hn_use_if_start == 0,
3095 ("hn_xmit is called, when if_start is enabled"));
3097 if (__predict_false(txr->hn_suspended))
3100 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
3103 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
3104 struct hn_txdesc *txd;
3107 if (len > 0 && m_head->m_pkthdr.len > len) {
3109			 * This send could be time-consuming; let callers
3110			 * dispatch this packet (and any follow-up
3111			 * packets) to the tx taskqueue.
3113 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3117 txd = hn_txdesc_get(txr);
3119 txr->hn_no_txdescs++;
3120 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3121 txr->hn_oactive = 1;
3125 error = hn_encap(txr, txd, &m_head);
3127 /* Both txd and m_head are freed; discard */
3128 drbr_advance(ifp, txr->hn_mbuf_br);
3132 error = hn_send_pkt(ifp, txr, txd);
3133 if (__predict_false(error)) {
3134 /* txd is freed, but m_head is not */
3135 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3136 txr->hn_oactive = 1;
3141 drbr_advance(ifp, txr->hn_mbuf_br);
3147 hn_transmit(struct ifnet *ifp, struct mbuf *m)
3149 struct hn_softc *sc = ifp->if_softc;
3150 struct hn_tx_ring *txr;
3154 * Select the TX ring based on flowid
3156 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
3157 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
3158 txr = &sc->hn_tx_ring[idx];
3160 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
3162 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
3166 if (txr->hn_oactive)
3169 if (txr->hn_sched_tx)
3172 if (mtx_trylock(&txr->hn_tx_lock)) {
3175 sched = hn_xmit(txr, txr->hn_direct_tx_size);
3176 mtx_unlock(&txr->hn_tx_lock);
3181 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
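/*
 * Ring-selection sketch: with hn_tx_ring_inuse = 4, a flow hashed to
 * m_pkthdr.flowid = 43 lands on TX ring 43 % 4 = 3, keeping one
 * flow's packets ordered on one ring; mbufs without a flowid
 * (M_HASHTYPE_NONE) fall back to the first ring.
 */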
3186 hn_tx_ring_qflush(struct hn_tx_ring *txr)
3190 mtx_lock(&txr->hn_tx_lock);
3191 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
3193 mtx_unlock(&txr->hn_tx_lock);
3197 hn_xmit_qflush(struct ifnet *ifp)
3199 struct hn_softc *sc = ifp->if_softc;
3202 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3203 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
3208 hn_xmit_txeof(struct hn_tx_ring *txr)
3211 if (txr->hn_sched_tx)
3214 if (mtx_trylock(&txr->hn_tx_lock)) {
3217 txr->hn_oactive = 0;
3218 sched = hn_xmit(txr, txr->hn_direct_tx_size);
3219 mtx_unlock(&txr->hn_tx_lock);
3221 taskqueue_enqueue(txr->hn_tx_taskq,
3227		 * Release oactive earlier, in the hope that others
3228		 * can catch up.  The task will clear oactive again,
3229		 * holding hn_tx_lock, to avoid possible races.
3232 txr->hn_oactive = 0;
3233 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3238 hn_xmit_taskfunc(void *xtxr, int pending __unused)
3240 struct hn_tx_ring *txr = xtxr;
3242 mtx_lock(&txr->hn_tx_lock);
3244 mtx_unlock(&txr->hn_tx_lock);
3248 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
3250 struct hn_tx_ring *txr = xtxr;
3252 mtx_lock(&txr->hn_tx_lock);
3253 txr->hn_oactive = 0;
3255 mtx_unlock(&txr->hn_tx_lock);
3259 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
3261 struct vmbus_chan_br cbr;
3262 struct hn_rx_ring *rxr;
3263 struct hn_tx_ring *txr = NULL;
3266 idx = vmbus_chan_subidx(chan);
3269 * Link this channel to RX/TX ring.
3271 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
3272	    ("invalid channel index %d, should be >= 0 && < %d",
3273 idx, sc->hn_rx_ring_inuse));
3274 rxr = &sc->hn_rx_ring[idx];
3275 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
3276 ("RX ring %d already attached", idx));
3277 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
3280 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
3281 idx, vmbus_chan_id(chan));
3284 if (idx < sc->hn_tx_ring_inuse) {
3285 txr = &sc->hn_tx_ring[idx];
3286 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
3287 ("TX ring %d already attached", idx));
3288 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
3290 txr->hn_chan = chan;
3292 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
3293 idx, vmbus_chan_id(chan));
3297 /* Bind this channel to a proper CPU. */
3298 vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus);
3303 cbr.cbr = rxr->hn_br;
3304 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
3305 cbr.cbr_txsz = NETVSC_DEVICE_RING_BUFFER_SIZE;
3306 cbr.cbr_rxsz = NETVSC_DEVICE_RING_BUFFER_SIZE;
3307 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
3309 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
3310 vmbus_chan_id(chan), error);
3311 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
3313 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
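/*
 * CPU-binding sketch: vmbus_chan_cpu_set() above spreads channels
 * round-robin starting at sc->hn_cpu.  E.g. with hn_cpu = 2 and
 * mp_ncpus = 4: channel idx 0 -> CPU 2, idx 1 -> CPU 3,
 * idx 2 -> (2 + 2) % 4 = CPU 0, and so on.
 */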
3319 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
3321 struct hn_rx_ring *rxr;
3324 idx = vmbus_chan_subidx(chan);
3327 * Link this channel to RX/TX ring.
3329 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
3330	    ("invalid channel index %d, should be >= 0 && < %d",
3331 idx, sc->hn_rx_ring_inuse));
3332 rxr = &sc->hn_rx_ring[idx];
3333 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
3334 ("RX ring %d is not attached", idx));
3335 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
3337 if (idx < sc->hn_tx_ring_inuse) {
3338 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
3340 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
3341		    ("TX ring %d is not attached", idx));
3342 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
3346 * Close this channel.
3349 * Channel closing does _not_ destroy the target channel.
3351 vmbus_chan_close(chan);
3355 hn_attach_subchans(struct hn_softc *sc)
3357 struct vmbus_channel **subchans;
3358 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
3361 if (subchan_cnt == 0)
3364 /* Attach the sub-channels. */
3365 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
3366 for (i = 0; i < subchan_cnt; ++i) {
3367 error = hn_chan_attach(sc, subchans[i]);
3371 vmbus_subchan_rel(subchans, subchan_cnt);
3374 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
3377 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
3385 hn_detach_allchans(struct hn_softc *sc)
3387 struct vmbus_channel **subchans;
3388 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
3391 if (subchan_cnt == 0)
3394 /* Detach the sub-channels. */
3395 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
3396 for (i = 0; i < subchan_cnt; ++i)
3397 hn_chan_detach(sc, subchans[i]);
3398 vmbus_subchan_rel(subchans, subchan_cnt);
3402	 * Detach the primary channel, _after_ all sub-channels are detached.
3405 hn_chan_detach(sc, sc->hn_prichan);
3407 /* Wait for sub-channels to be destroyed, if any. */
3408 vmbus_subchan_drain(sc->hn_prichan);
3411 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3412 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
3413 HN_RX_FLAG_ATTACHED) == 0,
3414 ("%dth RX ring is still attached", i));
3416 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3417 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
3418 HN_TX_FLAG_ATTACHED) == 0,
3419 ("%dth TX ring is still attached", i));
3425 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
3427 struct vmbus_channel **subchans;
3428 int nchan, rxr_cnt, error;
3430 nchan = *nsubch + 1;
3431 if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_30 || nchan == 1) {
3433 * Either RSS is not supported, or multiple RX/TX rings
3434 * are not requested.
3441 * Get RSS capabilities, e.g. # of RX rings, and # of indirect
3444 error = hn_rndis_get_rsscaps(sc, &rxr_cnt);
3446 /* No RSS; this is benign. */
3451 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
3455 if (nchan > rxr_cnt)
3458 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
3464 * Allocate sub-channels from NVS.
3466 *nsubch = nchan - 1;
3467 error = hn_nvs_alloc_subchans(sc, nsubch);
3468 if (error || *nsubch == 0) {
3469 /* Failed to allocate sub-channels. */
3475 * Wait for all sub-channels to become ready before moving on.
3477 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
3478 vmbus_subchan_rel(subchans, *nsubch);
3483 hn_synth_attach(struct hn_softc *sc, int mtu)
3485 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
3486 int error, nsubch, nchan, i;
3489 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
3490 ("synthetic parts were attached"));
3492 /* Save capabilities for later verification. */
3493 old_caps = sc->hn_caps;
3497 * Attach the primary channel _before_ attaching NVS and RNDIS.
3499 error = hn_chan_attach(sc, sc->hn_prichan);
3506 error = hn_nvs_attach(sc, mtu);
3511 * Attach RNDIS _after_ NVS is attached.
3513 error = hn_rndis_attach(sc, mtu);
3518 * Make sure capabilities are not changed.
3520 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
3521 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
3522 old_caps, sc->hn_caps);
3523 /* Restore old capabilities and abort. */
3524 sc->hn_caps = old_caps;
3529 * Allocate sub-channels for multi-TX/RX rings.
3532 * The # of RX rings that can be used is equivalent to the # of
3533 * channels to be requested.
3535 nsubch = sc->hn_rx_ring_cnt - 1;
3536 error = hn_synth_alloc_subchans(sc, &nsubch);
3542 /* Only the primary channel can be used; done */
3547 * Configure RSS key and indirect table _after_ all sub-channels
3551 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
3553 * RSS key is not set yet; set it to the default RSS key.
3556 if_printf(sc->hn_ifp, "setup default RSS key\n");
3557 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
3558 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3561 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
3563		 * RSS indirect table is not set yet; set it up in round-robin
		 * fashion (e.g., with 4 channels the table reads 0,1,2,3,0,1,...).
3567 if_printf(sc->hn_ifp, "setup default RSS indirect "
3570 /* TODO: Take ndis_rss_caps.ndis_nind into account. */
3571 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
3572 rss->rss_ind[i] = i % nchan;
3573 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3576 * # of usable channels may be changed, so we have to
3577 * make sure that all entries in RSS indirect table
3580 hn_rss_ind_fixup(sc, nchan);
3583 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
3586 * Failed to configure RSS key or indirect table; only
3587 * the primary channel can be used.
3593 * Set the # of TX/RX rings that could be used according to
3594 * the # of channels that NVS offered.
3596 hn_set_ring_inuse(sc, nchan);
3599 * Attach the sub-channels, if any.
3601 error = hn_attach_subchans(sc);
3605 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
3611 * The interface must have been suspended through hn_suspend(), before
3612 * this function gets called.
3615 hn_synth_detach(struct hn_softc *sc)
3619 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
3620 ("synthetic parts were not attached"));
3622 /* Detach the RNDIS first. */
3623 hn_rndis_detach(sc);
3628 /* Detach all of the channels. */
3629 hn_detach_allchans(sc);
3631 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
3635 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
3637 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
3638 ("invalid ring count %d", ring_cnt));
3640 if (sc->hn_tx_ring_cnt > ring_cnt)
3641 sc->hn_tx_ring_inuse = ring_cnt;
3643 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3644 sc->hn_rx_ring_inuse = ring_cnt;
3647 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
3648 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
3653 hn_rx_drain(struct vmbus_channel *chan)
3656 while (!vmbus_chan_rx_empty(chan) || !vmbus_chan_tx_empty(chan))
3658 vmbus_chan_intr_drain(chan);
3662 hn_suspend_data(struct hn_softc *sc)
3664 struct vmbus_channel **subch = NULL;
3672 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
3673 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
3675 mtx_lock(&txr->hn_tx_lock);
3676 txr->hn_suspended = 1;
3677 mtx_unlock(&txr->hn_tx_lock);
3678		/* No one is able to send more packets now. */
3680 /* Wait for all pending sends to finish. */
3681 while (hn_tx_ring_pending(txr))
3682 pause("hnwtx", 1 /* 1 tick */);
3684 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
3685 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
3689 * Disable RX by clearing RX filter.
3691 hn_rndis_set_rxfilter(sc, 0);
3694 * Give RNDIS enough time to flush all pending data packets.
3696 pause("waitrx", (200 * hz) / 1000);
3699 * Drain RX/TX bufrings and interrupts.
3701 nsubch = sc->hn_rx_ring_inuse - 1;
3703 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
3705 if (subch != NULL) {
3706 for (i = 0; i < nsubch; ++i)
3707 hn_rx_drain(subch[i]);
3709 hn_rx_drain(sc->hn_prichan);
3712 vmbus_subchan_rel(subch, nsubch);
3716 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
3719 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
3723 hn_suspend_mgmt(struct hn_softc *sc)
3730	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
3731 * through hn_mgmt_taskq.
3733 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
3734 vmbus_chan_run_task(sc->hn_prichan, &task);
3737 * Make sure that all pending management tasks are completed.
3739 taskqueue_drain_all(sc->hn_mgmt_taskq0);
3743 hn_suspend(struct hn_softc *sc)
3746 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
3747 hn_suspend_data(sc);
3748 hn_suspend_mgmt(sc);
3752 hn_tx_resume(struct hn_softc *sc, int tx_ring_cnt)
3756 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
3757 ("invalid TX ring count %d", tx_ring_cnt));
3759 for (i = 0; i < tx_ring_cnt; ++i) {
3760 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
3762 mtx_lock(&txr->hn_tx_lock);
3763 txr->hn_suspended = 0;
3764 mtx_unlock(&txr->hn_tx_lock);
3769 hn_resume_data(struct hn_softc *sc)
3777 * TODO: add hn_rx_filter.
3779 hn_rndis_set_rxfilter(sc, NDIS_PACKET_TYPE_PROMISCUOUS);
3782 * Make sure to clear suspend status on "all" TX rings,
3783 * since hn_tx_ring_inuse can be changed after
3784 * hn_suspend_data().
3786 hn_tx_resume(sc, sc->hn_tx_ring_cnt);
3788 if (!hn_use_if_start) {
3790		 * Flush unused drbrs, since hn_tx_ring_inuse may be changed
		 * after hn_suspend_data().
3793 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
3794 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
3800 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
3801 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
3804		 * Use the txeof task, so that any pending oactive can be
		 * cleared properly.
3807 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3812 hn_resume_mgmt(struct hn_softc *sc)
3816 * Kick off link status check.
3818 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
3819 hn_link_status_update(sc);
3823 hn_resume(struct hn_softc *sc)
3826 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
3832 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
3834 const struct hn_nvs_hdr *hdr;
3836 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
3837 if_printf(sc->hn_ifp, "invalid nvs notify\n");
3840 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
3842 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
3843 /* Useless; ignore */
3846 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
3850 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
3851 const struct vmbus_chanpkt_hdr *pkt)
3853 struct hn_send_ctx *sndc;
3855 sndc = (struct hn_send_ctx *)(uintptr_t)pkt->cph_xactid;
3856 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
3857 VMBUS_CHANPKT_DATALEN(pkt));
3860 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
3866 hn_nvs_handle_rxbuf(struct hn_softc *sc, struct hn_rx_ring *rxr,
3867 struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr)
3869 const struct vmbus_chanpkt_rxbuf *pkt;
3870 const struct hn_nvs_hdr *nvs_hdr;
3873 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
3874 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
3877 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
3879 /* Make sure that this is a RNDIS message. */
3880 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
3881 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
3886 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
3887 if (__predict_false(hlen < sizeof(*pkt))) {
3888 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
3891 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
3893 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
3894 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
3899 count = pkt->cp_rxbuf_cnt;
3900 if (__predict_false(hlen <
3901 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
3902 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
3906 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
3907 for (i = 0; i < count; ++i) {
3910 ofs = pkt->cp_rxbuf[i].rb_ofs;
3911 len = pkt->cp_rxbuf[i].rb_len;
3912 if (__predict_false(ofs + len > NETVSC_RECEIVE_BUFFER_SIZE)) {
3913 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
3914 "ofs %d, len %d\n", i, ofs, len);
3917 hv_rf_on_receive(sc, rxr, rxr->hn_rxbuf + ofs, len);
3921 * Moved completion call back here so that all received
3922 * messages (not just data messages) will trigger a response
3923 * message back to the host.
3925 hn_nvs_ack_rxbuf(chan, pkt->cp_hdr.cph_xactid);
3929 * Net VSC on receive completion
3931 * Send a receive completion packet to the RNDIS device (i.e., NetVSP).
3934 hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid)
3936 struct hn_nvs_rndis_ack ack;
3940 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
3941 ack.nvs_status = HN_NVS_STATUS_OK;
3944 /* Send the completion */
3945 ret = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
3946 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
3950 } else if (ret == EAGAIN) {
3951 /* no more room... wait a bit and attempt to retry 3 times */
3956 goto retry_send_cmplt;
3962 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
3964 struct hn_rx_ring *rxr = xrxr;
3965 struct hn_softc *sc = rxr->hn_ifp->if_softc;
3967 int bufferlen = NETVSC_PACKET_SIZE;
3969 buffer = rxr->hn_rdbuf;
3971 struct vmbus_chanpkt_hdr *pkt = buffer;
3972 uint32_t bytes_rxed;
3975 bytes_rxed = bufferlen;
3976 ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed);
3978 switch (pkt->cph_type) {
3979 case VMBUS_CHANPKT_TYPE_COMP:
3980 hn_nvs_handle_comp(sc, chan, pkt);
3982 case VMBUS_CHANPKT_TYPE_RXBUF:
3983 hn_nvs_handle_rxbuf(sc, rxr, chan, pkt);
3985 case VMBUS_CHANPKT_TYPE_INBAND:
3986 hn_nvs_handle_notify(sc, pkt);
3989 if_printf(rxr->hn_ifp,
3990 "unknown chan pkt %u\n",
3994 } else if (ret == ENOBUFS) {
3995 /* Handle large packet */
3996 if (bufferlen > NETVSC_PACKET_SIZE) {
3997 free(buffer, M_NETVSC);
4001 /* alloc new buffer */
4002 buffer = malloc(bytes_rxed, M_NETVSC, M_NOWAIT);
4003 if (buffer == NULL) {
4004 if_printf(rxr->hn_ifp,
4005 "hv_cb malloc buffer failed, len=%u\n",
4010 bufferlen = bytes_rxed;
4012 /* No more packets */
4017 if (bufferlen > NETVSC_PACKET_SIZE)
4018 free(buffer, M_NETVSC);
4020 hv_rf_channel_rollup(rxr, rxr->hn_txr);
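/*
 * The ENOBUFS path above is a grow-on-demand receive loop.  The same
 * pattern in isolation (a sketch; as above, vmbus_chan_recv_pkt()
 * updates the in/out length to the size the pending packet needs):
 *
 *	void *buf = fixed_buf;
 *	uint32_t len = NETVSC_PACKET_SIZE;
 *
 *	for (;;) {
 *		int error = vmbus_chan_recv_pkt(chan, buf, &len);
 *		if (error == 0) {
 *			// dispatch one packet, reset len, loop again
 *		} else if (error == ENOBUFS) {
 *			if (buf != fixed_buf)
 *				free(buf, M_NETVSC);
 *			buf = malloc(len, M_NETVSC, M_NOWAIT);
 *			if (buf == NULL)
 *				break;	// drop; retry on next upcall
 *		} else {
 *			break;		// no more packets
 *		}
 *	}
 */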
4024 hn_tx_taskq_create(void *arg __unused)
4026 if (!hn_share_tx_taskq)
4029 hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
4030 taskqueue_thread_enqueue, &hn_tx_taskq);
4031 taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
4032 if (hn_bind_tx_taskq >= 0) {
4033 int cpu = hn_bind_tx_taskq;
4034 struct task cpuset_task;
4037 if (cpu > mp_ncpus - 1)
4039 CPU_SETOF(cpu, &cpu_set);
4040 TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, &cpu_set);
4041 taskqueue_enqueue(hn_tx_taskq, &cpuset_task);
4042 taskqueue_drain(hn_tx_taskq, &cpuset_task);
4045 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_FIRST,
4046 hn_tx_taskq_create, NULL);
4049 hn_tx_taskq_destroy(void *arg __unused)
4051 if (hn_tx_taskq != NULL)
4052 taskqueue_free(hn_tx_taskq);
4054 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_FIRST,
4055 hn_tx_taskq_destroy, NULL);
4057 static device_method_t netvsc_methods[] = {
4058 /* Device interface */
4059 DEVMETHOD(device_probe, netvsc_probe),
4060 DEVMETHOD(device_attach, netvsc_attach),
4061 DEVMETHOD(device_detach, netvsc_detach),
4062 DEVMETHOD(device_shutdown, netvsc_shutdown),
4067 static driver_t netvsc_driver = {
4070 sizeof(struct hn_softc)
4073 static devclass_t netvsc_devclass;
4075 DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0);
4076 MODULE_VERSION(hn, 1);
4077 MODULE_DEPEND(hn, vmbus, 1, 1, 1);