2 * Copyright (c) 2010-2012 Citrix Inc.
3 * Copyright (c) 2009-2012,2016 Microsoft Corp.
4 * Copyright (c) 2012 NetApp Inc.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice unmodified, this list of conditions, and the following
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * Copyright (c) 2004-2006 Kip Macy
31 * All rights reserved.
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
58 #include "opt_inet6.h"
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/sockio.h>
65 #include <sys/malloc.h>
66 #include <sys/module.h>
67 #include <sys/kernel.h>
68 #include <sys/socket.h>
70 #include <sys/queue.h>
74 #include <sys/sysctl.h>
75 #include <sys/buf_ring.h>
78 #include <net/if_arp.h>
79 #include <net/ethernet.h>
80 #include <net/if_dl.h>
81 #include <net/if_media.h>
82 #include <net/rndis.h>
85 #include <net/if_types.h>
86 #include <net/if_vlan_var.h>
89 #include <netinet/in_systm.h>
90 #include <netinet/in.h>
91 #include <netinet/ip.h>
92 #include <netinet/if_ether.h>
93 #include <netinet/tcp.h>
94 #include <netinet/udp.h>
95 #include <netinet/ip6.h>
98 #include <vm/vm_param.h>
99 #include <vm/vm_kern.h>
102 #include <machine/bus.h>
103 #include <machine/resource.h>
104 #include <machine/frame.h>
105 #include <machine/vmparam.h>
108 #include <sys/rman.h>
109 #include <sys/mutex.h>
110 #include <sys/errno.h>
111 #include <sys/types.h>
112 #include <machine/atomic.h>
114 #include <machine/intr_machdep.h>
116 #include <machine/in_cksum.h>
118 #include <dev/hyperv/include/hyperv.h>
119 #include <dev/hyperv/include/hyperv_busdma.h>
120 #include <dev/hyperv/include/vmbus_xact.h>
122 #include <dev/hyperv/netvsc/hv_net_vsc.h>
123 #include <dev/hyperv/netvsc/hv_rndis_filter.h>
124 #include <dev/hyperv/netvsc/ndis.h>
126 #include "vmbus_if.h"
128 /* Short for Hyper-V network interface */
129 #define NETVSC_DEVNAME "hn"
132 * It looks like offset 0 of buf is reserved to hold the softc pointer.
133 * The sc pointer evidently not needed, and is not presently populated.
134 * The packet offset is where the netvsc_packet starts in the buffer.
136 #define HV_NV_SC_PTR_OFFSET_IN_BUF 0
137 #define HV_NV_PACKET_OFFSET_IN_BUF 16
139 /* YYY should get it from the underlying channel */
140 #define HN_TX_DESC_CNT 512
142 #define HN_LROENT_CNT_DEF 128
144 #define HN_RING_CNT_DEF_MAX 8
146 #define HN_RNDIS_PKT_LEN \
147 (sizeof(struct rndis_packet_msg) + \
148 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \
149 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \
150 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \
151 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
152 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE
153 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE
155 #define HN_TX_DATA_BOUNDARY PAGE_SIZE
156 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET
157 #define HN_TX_DATA_SEGSIZE PAGE_SIZE
158 /* -1 for RNDIS packet message */
159 #define HN_TX_DATA_SEGCNT_MAX (NETVSC_PACKET_MAXPAGE - 1)
161 #define HN_DIRECT_TX_SIZE_DEF 128
163 #define HN_EARLY_TXEOF_THRESH 8
166 #ifndef HN_USE_TXDESC_BUFRING
167 SLIST_ENTRY(hn_txdesc) link;
170 struct hn_tx_ring *txr;
172 uint32_t flags; /* HN_TXD_FLAG_ */
173 struct hn_send_ctx send_ctx;
175 bus_dmamap_t data_dmap;
177 bus_addr_t rndis_pkt_paddr;
178 struct rndis_packet_msg *rndis_pkt;
179 bus_dmamap_t rndis_pkt_dmap;
182 #define HN_TXD_FLAG_ONLIST 0x1
183 #define HN_TXD_FLAG_DMAMAP 0x2
185 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU)
186 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU)
187 /* YYY 2*MTU is a bit rough, but should be good enough. */
188 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu)
190 #define HN_LRO_ACKCNT_DEF 1
192 #define HN_LOCK_INIT(sc) \
193 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
194 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED)
195 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock)
196 #define HN_LOCK(sc) sx_xlock(&(sc)->hn_lock)
197 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock)
199 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
200 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP)
201 #define HN_CSUM_IP_HWASSIST(sc) \
202 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
203 #define HN_CSUM_IP6_HWASSIST(sc) \
204 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
210 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
211 "Hyper-V network interface");
213 /* Trust tcp segements verification on host side. */
214 static int hn_trust_hosttcp = 1;
215 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
216 &hn_trust_hosttcp, 0,
217 "Trust tcp segement verification on host side, "
218 "when csum info is missing (global setting)");
220 /* Trust udp datagrams verification on host side. */
221 static int hn_trust_hostudp = 1;
222 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
223 &hn_trust_hostudp, 0,
224 "Trust udp datagram verification on host side, "
225 "when csum info is missing (global setting)");
227 /* Trust ip packets verification on host side. */
228 static int hn_trust_hostip = 1;
229 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
231 "Trust ip packet verification on host side, "
232 "when csum info is missing (global setting)");
234 /* Limit TSO burst size */
235 static int hn_tso_maxlen = 0;
236 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
237 &hn_tso_maxlen, 0, "TSO burst limit");
239 /* Limit chimney send size */
240 static int hn_tx_chimney_size = 0;
241 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
242 &hn_tx_chimney_size, 0, "Chimney send packet size limit");
244 /* Limit the size of packet for direct transmission */
245 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
246 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
247 &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
249 #if defined(INET) || defined(INET6)
250 #if __FreeBSD_version >= 1100095
251 static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
252 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
253 &hn_lro_entry_count, 0, "LRO entry count");
257 static int hn_share_tx_taskq = 0;
258 SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN,
259 &hn_share_tx_taskq, 0, "Enable shared TX taskqueue");
261 static struct taskqueue *hn_tx_taskq;
263 #ifndef HN_USE_TXDESC_BUFRING
264 static int hn_use_txdesc_bufring = 0;
266 static int hn_use_txdesc_bufring = 1;
268 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
269 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
271 static int hn_bind_tx_taskq = -1;
272 SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
273 &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");
275 static int hn_use_if_start = 0;
276 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
277 &hn_use_if_start, 0, "Use if_start TX method");
279 static int hn_chan_cnt = 0;
280 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
282 "# of channels to use; each channel has one RX ring and one TX ring");
284 static int hn_tx_ring_cnt = 0;
285 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
286 &hn_tx_ring_cnt, 0, "# of TX rings to use");
288 static int hn_tx_swq_depth = 0;
289 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
290 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
292 #if __FreeBSD_version >= 1100095
293 static u_int hn_lro_mbufq_depth = 0;
294 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
295 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
298 static u_int hn_cpu_index;
301 * Forward declarations
303 static void hn_stop(struct hn_softc *sc);
304 static void hn_init_locked(struct hn_softc *sc);
305 static void hn_init(void *xsc);
306 static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
307 static int hn_start_locked(struct hn_tx_ring *txr, int len);
308 static void hn_start(struct ifnet *ifp);
309 static void hn_start_txeof(struct hn_tx_ring *);
310 static int hn_ifmedia_upd(struct ifnet *ifp);
311 static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr);
312 #if __FreeBSD_version >= 1100099
313 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
314 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
316 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
317 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
318 #if __FreeBSD_version < 1100095
319 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
321 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
323 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
324 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
325 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
326 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
327 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
328 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
329 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
330 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
331 static int hn_check_iplen(const struct mbuf *, int);
332 static int hn_create_tx_ring(struct hn_softc *, int);
333 static void hn_destroy_tx_ring(struct hn_tx_ring *);
334 static int hn_create_tx_data(struct hn_softc *, int);
335 static void hn_fixup_tx_data(struct hn_softc *);
336 static void hn_destroy_tx_data(struct hn_softc *);
337 static void hn_start_taskfunc(void *, int);
338 static void hn_start_txeof_taskfunc(void *, int);
339 static int hn_encap(struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **);
340 static int hn_create_rx_data(struct hn_softc *sc, int);
341 static void hn_destroy_rx_data(struct hn_softc *sc);
342 static void hn_set_chim_size(struct hn_softc *, int);
343 static int hn_chan_attach(struct hn_softc *, struct vmbus_channel *);
344 static void hn_chan_detach(struct hn_softc *, struct vmbus_channel *);
345 static int hn_attach_subchans(struct hn_softc *);
346 static void hn_detach_allchans(struct hn_softc *);
347 static void hn_chan_callback(struct vmbus_channel *chan, void *xrxr);
348 static void hn_set_ring_inuse(struct hn_softc *, int);
349 static int hn_synth_attach(struct hn_softc *, int);
350 static void hn_synth_detach(struct hn_softc *);
351 static bool hn_tx_ring_pending(struct hn_tx_ring *);
352 static void hn_suspend(struct hn_softc *);
353 static void hn_resume(struct hn_softc *);
354 static void hn_rx_drain(struct vmbus_channel *);
355 static void hn_tx_resume(struct hn_softc *, int);
356 static void hn_tx_ring_qflush(struct hn_tx_ring *);
357 static int netvsc_detach(device_t dev);
359 static void hn_nvs_handle_notify(struct hn_softc *sc,
360 const struct vmbus_chanpkt_hdr *pkt);
361 static void hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
362 const struct vmbus_chanpkt_hdr *pkt);
363 static void hn_nvs_handle_rxbuf(struct hn_softc *sc, struct hn_rx_ring *rxr,
364 struct vmbus_channel *chan,
365 const struct vmbus_chanpkt_hdr *pkthdr);
366 static void hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid);
368 static int hn_transmit(struct ifnet *, struct mbuf *);
369 static void hn_xmit_qflush(struct ifnet *);
370 static int hn_xmit(struct hn_tx_ring *, int);
371 static void hn_xmit_txeof(struct hn_tx_ring *);
372 static void hn_xmit_taskfunc(void *, int);
373 static void hn_xmit_txeof_taskfunc(void *, int);
375 static const uint8_t hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
376 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
377 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
378 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
379 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
380 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
383 #if __FreeBSD_version >= 1100099
385 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
389 for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
390 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
395 hn_get_txswq_depth(const struct hn_tx_ring *txr)
398 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
399 if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
400 return txr->hn_txdesc_cnt;
401 return hn_tx_swq_depth;
405 hn_rss_reconfig(struct hn_softc *sc)
411 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
418 * Direct reconfiguration by setting the UNCHG flags does
419 * _not_ work properly.
422 if_printf(sc->hn_ifp, "disable RSS\n");
423 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
425 if_printf(sc->hn_ifp, "RSS disable failed\n");
430 * Reenable the RSS w/ the updated RSS key or indirect
434 if_printf(sc->hn_ifp, "reconfig RSS\n");
435 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
437 if_printf(sc->hn_ifp, "RSS reconfig failed\n");
444 hn_rss_ind_fixup(struct hn_softc *sc, int nchan)
446 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
449 KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
452 * Check indirect table to make sure that all channels in it
455 for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
456 if (rss->rss_ind[i] >= nchan) {
457 if_printf(sc->hn_ifp,
458 "RSS indirect table %d fixup: %u -> %d\n",
459 i, rss->rss_ind[i], nchan - 1);
460 rss->rss_ind[i] = nchan - 1;
466 hn_ifmedia_upd(struct ifnet *ifp __unused)
473 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
475 struct hn_softc *sc = ifp->if_softc;
477 ifmr->ifm_status = IFM_AVALID;
478 ifmr->ifm_active = IFM_ETHER;
480 if (!sc->hn_carrier) {
481 ifmr->ifm_active |= IFM_NONE;
484 ifmr->ifm_status |= IFM_ACTIVE;
485 ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
488 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
489 static const struct hyperv_guid g_net_vsc_device_type = {
490 .hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
491 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
495 * Standard probe entry point.
499 netvsc_probe(device_t dev)
501 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
502 &g_net_vsc_device_type) == 0) {
503 device_set_desc(dev, "Hyper-V Network Interface");
504 return BUS_PROBE_DEFAULT;
510 hn_cpuset_setthread_task(void *xmask, int pending __unused)
512 cpuset_t *mask = xmask;
515 error = cpuset_setthread(curthread->td_tid, mask);
517 panic("curthread=%ju: can't pin; error=%d",
518 (uintmax_t)curthread->td_tid, error);
523 * Standard attach entry point.
525 * Called when the driver is loaded. It allocates needed resources,
526 * and initializes the "hardware" and software.
529 netvsc_attach(device_t dev)
531 struct hn_softc *sc = device_get_softc(dev);
532 struct sysctl_oid_list *child;
533 struct sysctl_ctx_list *ctx;
534 uint8_t eaddr[ETHER_ADDR_LEN];
535 uint32_t link_status;
536 struct ifnet *ifp = NULL;
537 int error, ring_cnt, tx_ring_cnt;
541 sc->hn_prichan = vmbus_get_channel(dev);
545 * Setup taskqueue for transmission.
547 if (hn_tx_taskq == NULL) {
548 sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
549 taskqueue_thread_enqueue, &sc->hn_tx_taskq);
550 taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx",
551 device_get_nameunit(dev));
552 if (hn_bind_tx_taskq >= 0) {
553 int cpu = hn_bind_tx_taskq;
554 struct task cpuset_task;
557 if (cpu > mp_ncpus - 1)
559 CPU_SETOF(cpu, &cpu_set);
560 TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task,
562 taskqueue_enqueue(sc->hn_tx_taskq, &cpuset_task);
563 taskqueue_drain(sc->hn_tx_taskq, &cpuset_task);
566 sc->hn_tx_taskq = hn_tx_taskq;
570 * Allocate ifnet and setup its name earlier, so that if_printf
571 * can be used by functions, which will be called after
574 ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
576 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
579 * Initialize ifmedia earlier so that it can be unconditionally
580 * destroyed, if error happened later on.
582 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
585 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
586 * to use (tx_ring_cnt).
589 * The # of RX rings to use is same as the # of channels to use.
591 ring_cnt = hn_chan_cnt;
595 if (ring_cnt > HN_RING_CNT_DEF_MAX)
596 ring_cnt = HN_RING_CNT_DEF_MAX;
597 } else if (ring_cnt > mp_ncpus) {
601 tx_ring_cnt = hn_tx_ring_cnt;
602 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
603 tx_ring_cnt = ring_cnt;
604 if (hn_use_if_start) {
605 /* ifnet.if_start only needs one TX ring. */
610 * Set the leader CPU for channels.
612 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
615 * Create enough TX/RX rings, even if only limited number of
616 * channels can be allocated.
618 error = hn_create_tx_data(sc, tx_ring_cnt);
621 error = hn_create_rx_data(sc, ring_cnt);
626 * Create transaction context for NVS and RNDIS transactions.
628 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
629 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
630 if (sc->hn_xact == NULL)
634 * Attach the synthetic parts, i.e. NVS and RNDIS.
636 error = hn_synth_attach(sc, ETHERMTU);
640 error = hn_rndis_get_linkstatus(sc, &link_status);
643 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
646 error = hn_rndis_get_eaddr(sc, eaddr);
650 #if __FreeBSD_version >= 1100099
651 if (sc->hn_rx_ring_inuse > 1) {
653 * Reduce TCP segment aggregation limit for multiple
654 * RX rings to increase ACK timeliness.
656 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
661 * Fixup TX stuffs after synthetic parts are attached.
663 hn_fixup_tx_data(sc);
665 ctx = device_get_sysctl_ctx(dev);
666 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
667 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
668 &sc->hn_nvs_ver, 0, "NVS version");
669 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
670 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
671 hn_ndis_version_sysctl, "A", "NDIS version");
672 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
673 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
674 hn_caps_sysctl, "A", "capabilities");
675 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
676 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
677 hn_hwassist_sysctl, "A", "hwassist");
678 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
679 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
680 hn_rss_key_sysctl, "IU", "RSS key");
681 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
682 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
683 hn_rss_ind_sysctl, "IU", "RSS indirect table");
686 * Setup the ifmedia, which has been initialized earlier.
688 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
689 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
690 /* XXX ifmedia_set really should do this for us */
691 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
694 * Setup the ifnet for this interface.
697 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
698 ifp->if_ioctl = hn_ioctl;
699 ifp->if_init = hn_init;
700 if (hn_use_if_start) {
701 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
703 ifp->if_start = hn_start;
704 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
705 ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
706 IFQ_SET_READY(&ifp->if_snd);
708 ifp->if_transmit = hn_transmit;
709 ifp->if_qflush = hn_xmit_qflush;
712 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
714 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
715 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
717 if (sc->hn_caps & HN_CAP_VLAN) {
718 /* XXX not sure about VLAN_MTU. */
719 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
722 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
723 if (ifp->if_hwassist & HN_CSUM_IP_MASK)
724 ifp->if_capabilities |= IFCAP_TXCSUM;
725 if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
726 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
727 if (sc->hn_caps & HN_CAP_TSO4) {
728 ifp->if_capabilities |= IFCAP_TSO4;
729 ifp->if_hwassist |= CSUM_IP_TSO;
731 if (sc->hn_caps & HN_CAP_TSO6) {
732 ifp->if_capabilities |= IFCAP_TSO6;
733 ifp->if_hwassist |= CSUM_IP6_TSO;
736 /* Enable all available capabilities by default. */
737 ifp->if_capenable = ifp->if_capabilities;
739 tso_maxlen = hn_tso_maxlen;
740 if (tso_maxlen <= 0 || tso_maxlen > IP_MAXPACKET)
741 tso_maxlen = IP_MAXPACKET;
742 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
743 ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
744 ifp->if_hw_tsomax = tso_maxlen -
745 (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
747 ether_ifattach(ifp, eaddr);
750 if_printf(ifp, "TSO: %u/%u/%u\n", ifp->if_hw_tsomax,
751 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
754 /* Inform the upper layer about the long frame support. */
755 ifp->if_hdrlen = sizeof(struct ether_vlan_header);
759 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
766 netvsc_detach(device_t dev)
768 struct hn_softc *sc = device_get_softc(dev);
769 struct ifnet *ifp = sc->hn_ifp;
771 if (device_is_attached(dev)) {
773 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
774 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
782 ifmedia_removeall(&sc->hn_media);
783 hn_destroy_rx_data(sc);
784 hn_destroy_tx_data(sc);
786 if (sc->hn_tx_taskq != hn_tx_taskq)
787 taskqueue_free(sc->hn_tx_taskq);
789 if (sc->hn_xact != NULL)
790 vmbus_xact_ctx_destroy(sc->hn_xact);
799 * Standard shutdown entry point
802 netvsc_shutdown(device_t dev)
808 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
809 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
811 struct mbuf *m = *m_head;
814 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
815 m, segs, nsegs, BUS_DMA_NOWAIT);
816 if (error == EFBIG) {
819 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
824 txr->hn_tx_collapsed++;
826 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
827 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
830 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
831 BUS_DMASYNC_PREWRITE);
832 txd->flags |= HN_TXD_FLAG_DMAMAP;
838 hn_txdesc_dmamap_unload(struct hn_tx_ring *txr, struct hn_txdesc *txd)
841 if (txd->flags & HN_TXD_FLAG_DMAMAP) {
842 bus_dmamap_sync(txr->hn_tx_data_dtag,
843 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
844 bus_dmamap_unload(txr->hn_tx_data_dtag,
846 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
851 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
854 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
855 ("put an onlist txd %#x", txd->flags));
857 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
858 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
861 hn_txdesc_dmamap_unload(txr, txd);
862 if (txd->m != NULL) {
867 txd->flags |= HN_TXD_FLAG_ONLIST;
869 #ifndef HN_USE_TXDESC_BUFRING
870 mtx_lock_spin(&txr->hn_txlist_spin);
871 KASSERT(txr->hn_txdesc_avail >= 0 &&
872 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
873 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
874 txr->hn_txdesc_avail++;
875 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
876 mtx_unlock_spin(&txr->hn_txlist_spin);
878 atomic_add_int(&txr->hn_txdesc_avail, 1);
879 buf_ring_enqueue(txr->hn_txdesc_br, txd);
885 static __inline struct hn_txdesc *
886 hn_txdesc_get(struct hn_tx_ring *txr)
888 struct hn_txdesc *txd;
890 #ifndef HN_USE_TXDESC_BUFRING
891 mtx_lock_spin(&txr->hn_txlist_spin);
892 txd = SLIST_FIRST(&txr->hn_txlist);
894 KASSERT(txr->hn_txdesc_avail > 0,
895 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
896 txr->hn_txdesc_avail--;
897 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
899 mtx_unlock_spin(&txr->hn_txlist_spin);
901 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
905 #ifdef HN_USE_TXDESC_BUFRING
906 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
908 KASSERT(txd->m == NULL && txd->refs == 0 &&
909 (txd->flags & HN_TXD_FLAG_ONLIST), ("invalid txd"));
910 txd->flags &= ~HN_TXD_FLAG_ONLIST;
917 hn_txdesc_hold(struct hn_txdesc *txd)
920 /* 0->1 transition will never work */
921 KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
922 atomic_add_int(&txd->refs, 1);
926 hn_tx_ring_pending(struct hn_tx_ring *txr)
928 bool pending = false;
930 #ifndef HN_USE_TXDESC_BUFRING
931 mtx_lock_spin(&txr->hn_txlist_spin);
932 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
934 mtx_unlock_spin(&txr->hn_txlist_spin);
936 if (!buf_ring_full(txr->hn_txdesc_br))
943 hn_txeof(struct hn_tx_ring *txr)
945 txr->hn_has_txeof = 0;
950 hn_tx_done(struct hn_send_ctx *sndc, struct hn_softc *sc,
951 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
953 struct hn_txdesc *txd = sndc->hn_cbarg;
954 struct hn_tx_ring *txr;
956 if (sndc->hn_chim_idx != HN_NVS_CHIM_IDX_INVALID)
957 hn_chim_free(sc, sndc->hn_chim_idx);
960 KASSERT(txr->hn_chan == chan,
961 ("channel mismatch, on chan%u, should be chan%u",
962 vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan)));
964 txr->hn_has_txeof = 1;
965 hn_txdesc_put(txr, txd);
967 ++txr->hn_txdone_cnt;
968 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
969 txr->hn_txdone_cnt = 0;
976 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
978 #if defined(INET) || defined(INET6)
979 struct lro_ctrl *lro = &rxr->hn_lro;
980 struct lro_entry *queued;
982 while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
983 SLIST_REMOVE_HEAD(&lro->lro_active, next);
984 tcp_lro_flush(lro, queued);
990 * 'txr' could be NULL, if multiple channels and
991 * ifnet.if_start method are enabled.
993 if (txr == NULL || !txr->hn_has_txeof)
996 txr->hn_txdone_cnt = 0;
1000 static __inline uint32_t
1001 hn_rndis_pktmsg_offset(uint32_t ofs)
1004 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1005 ("invalid RNDIS packet msg offset %u", ofs));
1006 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
1011 * If this function fails, then both txd and m_head0 will be freed.
1014 hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0)
1016 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
1017 int error, nsegs, i;
1018 struct mbuf *m_head = *m_head0;
1019 struct rndis_packet_msg *pkt;
1020 uint32_t send_buf_section_idx;
1021 int send_buf_section_size, pktlen;
1025 * extension points to the area reserved for the
1026 * rndis_filter_packet, which is placed just after
1027 * the netvsc_packet (and rppi struct, if present;
1028 * length is updated later).
1030 pkt = txd->rndis_pkt;
1031 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1032 pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
1033 pkt->rm_dataoffset = sizeof(*pkt);
1034 pkt->rm_datalen = m_head->m_pkthdr.len;
1035 pkt->rm_pktinfooffset = sizeof(*pkt);
1036 pkt->rm_pktinfolen = 0;
1038 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
1040 * Set the hash value for this packet, so that the host could
1041 * dispatch the TX done event for this packet back to this TX
1044 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1045 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
1046 *pi_data = txr->hn_tx_idx;
1049 if (m_head->m_flags & M_VLANTAG) {
1050 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1051 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1052 *pi_data = NDIS_VLAN_INFO_MAKE(
1053 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
1054 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
1055 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
1058 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
1059 #if defined(INET6) || defined(INET)
1060 struct ether_vlan_header *eh;
1064 * XXX need m_pullup and use mtodo
1066 eh = mtod(m_head, struct ether_vlan_header*);
1067 if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN))
1068 ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1070 ether_len = ETHER_HDR_LEN;
1072 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1073 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
1075 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
1077 (struct ip *)(m_head->m_data + ether_len);
1078 unsigned long iph_len = ip->ip_hl << 2;
1080 (struct tcphdr *)((caddr_t)ip + iph_len);
1084 th->th_sum = in_pseudo(ip->ip_src.s_addr,
1085 ip->ip_dst.s_addr, htons(IPPROTO_TCP));
1086 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
1087 m_head->m_pkthdr.tso_segsz);
1090 #if defined(INET6) && defined(INET)
1095 struct ip6_hdr *ip6 = (struct ip6_hdr *)
1096 (m_head->m_data + ether_len);
1097 struct tcphdr *th = (struct tcphdr *)(ip6 + 1);
1100 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
1101 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
1102 m_head->m_pkthdr.tso_segsz);
1105 #endif /* INET6 || INET */
1106 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
1107 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1108 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1109 if (m_head->m_pkthdr.csum_flags &
1110 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
1111 *pi_data = NDIS_TXCSUM_INFO_IPV6;
1113 *pi_data = NDIS_TXCSUM_INFO_IPV4;
1114 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
1115 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
1118 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
1119 *pi_data |= NDIS_TXCSUM_INFO_TCPCS;
1120 else if (m_head->m_pkthdr.csum_flags &
1121 (CSUM_IP_UDP | CSUM_IP6_UDP))
1122 *pi_data |= NDIS_TXCSUM_INFO_UDPCS;
1125 pktlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1126 /* Convert RNDIS packet message offsets */
1127 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
1128 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
1131 * Chimney send, if the packet could fit into one chimney buffer.
1133 if (pkt->rm_len < txr->hn_chim_size) {
1134 txr->hn_tx_chimney_tried++;
1135 send_buf_section_idx = hn_chim_alloc(txr->hn_sc);
1136 if (send_buf_section_idx != HN_NVS_CHIM_IDX_INVALID) {
1137 uint8_t *dest = txr->hn_sc->hn_chim +
1138 (send_buf_section_idx * txr->hn_sc->hn_chim_szmax);
1140 memcpy(dest, pkt, pktlen);
1142 m_copydata(m_head, 0, m_head->m_pkthdr.len, dest);
1144 send_buf_section_size = pkt->rm_len;
1145 txr->hn_gpa_cnt = 0;
1146 txr->hn_tx_chimney++;
1151 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
1156 * This mbuf is not linked w/ the txd yet, so free it now.
1161 freed = hn_txdesc_put(txr, txd);
1163 ("fail to free txd upon txdma error"));
1165 txr->hn_txdma_failed++;
1166 if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1);
1171 /* +1 RNDIS packet message */
1172 txr->hn_gpa_cnt = nsegs + 1;
1174 /* send packet with page buffer */
1175 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
1176 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
1177 txr->hn_gpa[0].gpa_len = pktlen;
1180 * Fill the page buffers with mbuf info after the page
1181 * buffer for RNDIS packet message.
1183 for (i = 0; i < nsegs; ++i) {
1184 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
1186 gpa->gpa_page = atop(segs[i].ds_addr);
1187 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
1188 gpa->gpa_len = segs[i].ds_len;
1191 send_buf_section_idx = HN_NVS_CHIM_IDX_INVALID;
1192 send_buf_section_size = 0;
1196 /* Set the completion routine */
1197 hn_send_ctx_init(&txd->send_ctx, hn_tx_done, txd,
1198 send_buf_section_idx, send_buf_section_size);
1205 * If this function fails, then txd will be freed, but the mbuf
1206 * associated w/ the txd will _not_ be freed.
1209 hn_send_pkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
1211 int error, send_failed = 0;
1215 * Make sure that txd is not freed before ETHER_BPF_MTAP.
1217 hn_txdesc_hold(txd);
1218 error = hv_nv_on_send(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
1219 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt);
1221 ETHER_BPF_MTAP(ifp, txd->m);
1222 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
1223 if (!hn_use_if_start) {
1224 if_inc_counter(ifp, IFCOUNTER_OBYTES,
1225 txd->m->m_pkthdr.len);
1226 if (txd->m->m_flags & M_MCAST)
1227 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
1231 hn_txdesc_put(txr, txd);
1233 if (__predict_false(error)) {
1237 * This should "really rarely" happen.
1239 * XXX Too many RX to be acked or too many sideband
1240 * commands to run? Ask netvsc_channel_rollup()
1241 * to kick start later.
1243 txr->hn_has_txeof = 1;
1245 txr->hn_send_failed++;
1248 * Try sending again after set hn_has_txeof;
1249 * in case that we missed the last
1250 * netvsc_channel_rollup().
1254 if_printf(ifp, "send failed\n");
1257 * Caller will perform further processing on the
1258 * associated mbuf, so don't free it in hn_txdesc_put();
1259 * only unload it from the DMA map in hn_txdesc_put(),
1263 freed = hn_txdesc_put(txr, txd);
1265 ("fail to free txd upon send error"));
1267 txr->hn_send_failed++;
1273 * Start a transmit of one or more packets
1276 hn_start_locked(struct hn_tx_ring *txr, int len)
1278 struct hn_softc *sc = txr->hn_sc;
1279 struct ifnet *ifp = sc->hn_ifp;
1281 KASSERT(hn_use_if_start,
1282 ("hn_start_locked is called, when if_start is disabled"));
1283 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
1284 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
1286 if (__predict_false(txr->hn_suspended))
1289 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
1293 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
1294 struct hn_txdesc *txd;
1295 struct mbuf *m_head;
1298 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
1302 if (len > 0 && m_head->m_pkthdr.len > len) {
1304 * This sending could be time consuming; let callers
1305 * dispatch this packet sending (and sending of any
1306 * following up packets) to tx taskqueue.
1308 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
1312 txd = hn_txdesc_get(txr);
1314 txr->hn_no_txdescs++;
1315 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
1316 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1320 error = hn_encap(txr, txd, &m_head);
1322 /* Both txd and m_head are freed */
1326 error = hn_send_pkt(ifp, txr, txd);
1327 if (__predict_false(error)) {
1328 /* txd is freed, but m_head is not */
1329 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
1330 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1338 * Link up/down notification
1341 netvsc_linkstatus_callback(struct hn_softc *sc, uint32_t status)
1351 * Append the specified data to the indicated mbuf chain,
1352 * Extend the mbuf chain if the new data does not fit in
1355 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
1356 * There should be an equivalent in the kernel mbuf code,
1357 * but there does not appear to be one yet.
1359 * Differs from m_append() in that additional mbufs are
1360 * allocated with cluster size MJUMPAGESIZE, and filled
1363 * Return 1 if able to complete the job; otherwise 0.
1366 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
1369 int remainder, space;
1371 for (m = m0; m->m_next != NULL; m = m->m_next)
1374 space = M_TRAILINGSPACE(m);
1377 * Copy into available space.
1379 if (space > remainder)
1381 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
1386 while (remainder > 0) {
1388 * Allocate a new mbuf; could check space
1389 * and allocate a cluster instead.
1391 n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
1394 n->m_len = min(MJUMPAGESIZE, remainder);
1395 bcopy(cp, mtod(n, caddr_t), n->m_len);
1397 remainder -= n->m_len;
1401 if (m0->m_flags & M_PKTHDR)
1402 m0->m_pkthdr.len += len - remainder;
1404 return (remainder == 0);
1407 #if defined(INET) || defined(INET6)
1409 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
1411 #if __FreeBSD_version >= 1100095
1412 if (hn_lro_mbufq_depth) {
1413 tcp_lro_queue_mbuf(lc, m);
1417 return tcp_lro_rx(lc, m, 0);
1422 * Called when we receive a data packet from the "wire" on the
1425 * Note: This is no longer used as a callback
1428 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
1429 const struct hn_recvinfo *info)
1431 struct ifnet *ifp = rxr->hn_ifp;
1433 int size, do_lro = 0, do_csum = 1;
1434 int hash_type = M_HASHTYPE_OPAQUE;
1436 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
1440 * Bail out if packet contains more data than configured MTU.
1442 if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
1444 } else if (dlen <= MHLEN) {
1445 m_new = m_gethdr(M_NOWAIT, MT_DATA);
1446 if (m_new == NULL) {
1447 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
1450 memcpy(mtod(m_new, void *), data, dlen);
1451 m_new->m_pkthdr.len = m_new->m_len = dlen;
1452 rxr->hn_small_pkts++;
1455 * Get an mbuf with a cluster. For packets 2K or less,
1456 * get a standard 2K cluster. For anything larger, get a
1457 * 4K cluster. Any buffers larger than 4K can cause problems
1458 * if looped around to the Hyper-V TX channel, so avoid them.
1461 if (dlen > MCLBYTES) {
1463 size = MJUMPAGESIZE;
1466 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
1467 if (m_new == NULL) {
1468 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
1472 hv_m_append(m_new, dlen, data);
1474 m_new->m_pkthdr.rcvif = ifp;
1476 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
1479 /* receive side checksum offload */
1480 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
1481 /* IP csum offload */
1482 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
1483 m_new->m_pkthdr.csum_flags |=
1484 (CSUM_IP_CHECKED | CSUM_IP_VALID);
1488 /* TCP/UDP csum offload */
1489 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
1490 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
1491 m_new->m_pkthdr.csum_flags |=
1492 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1493 m_new->m_pkthdr.csum_data = 0xffff;
1494 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
1500 if ((info->csum_info &
1501 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
1502 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
1505 const struct ether_header *eh;
1510 if (m_new->m_len < hoff)
1512 eh = mtod(m_new, struct ether_header *);
1513 etype = ntohs(eh->ether_type);
1514 if (etype == ETHERTYPE_VLAN) {
1515 const struct ether_vlan_header *evl;
1517 hoff = sizeof(*evl);
1518 if (m_new->m_len < hoff)
1520 evl = mtod(m_new, struct ether_vlan_header *);
1521 etype = ntohs(evl->evl_proto);
1524 if (etype == ETHERTYPE_IP) {
1527 pr = hn_check_iplen(m_new, hoff);
1528 if (pr == IPPROTO_TCP) {
1530 (rxr->hn_trust_hcsum &
1531 HN_TRUST_HCSUM_TCP)) {
1532 rxr->hn_csum_trusted++;
1533 m_new->m_pkthdr.csum_flags |=
1534 (CSUM_IP_CHECKED | CSUM_IP_VALID |
1535 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1536 m_new->m_pkthdr.csum_data = 0xffff;
1539 } else if (pr == IPPROTO_UDP) {
1541 (rxr->hn_trust_hcsum &
1542 HN_TRUST_HCSUM_UDP)) {
1543 rxr->hn_csum_trusted++;
1544 m_new->m_pkthdr.csum_flags |=
1545 (CSUM_IP_CHECKED | CSUM_IP_VALID |
1546 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1547 m_new->m_pkthdr.csum_data = 0xffff;
1549 } else if (pr != IPPROTO_DONE && do_csum &&
1550 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
1551 rxr->hn_csum_trusted++;
1552 m_new->m_pkthdr.csum_flags |=
1553 (CSUM_IP_CHECKED | CSUM_IP_VALID);
1558 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
1559 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
1560 NDIS_VLAN_INFO_ID(info->vlan_info),
1561 NDIS_VLAN_INFO_PRI(info->vlan_info),
1562 NDIS_VLAN_INFO_CFI(info->vlan_info));
1563 m_new->m_flags |= M_VLANTAG;
1566 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
1568 m_new->m_pkthdr.flowid = info->hash_value;
1569 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
1570 NDIS_HASH_FUNCTION_TOEPLITZ) {
1571 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
1574 case NDIS_HASH_IPV4:
1575 hash_type = M_HASHTYPE_RSS_IPV4;
1578 case NDIS_HASH_TCP_IPV4:
1579 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
1582 case NDIS_HASH_IPV6:
1583 hash_type = M_HASHTYPE_RSS_IPV6;
1586 case NDIS_HASH_IPV6_EX:
1587 hash_type = M_HASHTYPE_RSS_IPV6_EX;
1590 case NDIS_HASH_TCP_IPV6:
1591 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
1594 case NDIS_HASH_TCP_IPV6_EX:
1595 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
1600 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
1602 M_HASHTYPE_SET(m_new, hash_type);
1605 * Note: Moved RX completion back to hv_nv_on_receive() so all
1606 * messages (not just data messages) will trigger a response.
1612 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
1613 #if defined(INET) || defined(INET6)
1614 struct lro_ctrl *lro = &rxr->hn_lro;
1617 rxr->hn_lro_tried++;
1618 if (hn_lro_rx(lro, m_new) == 0) {
1626 /* We're not holding the lock here, so don't release it */
1627 (*ifp->if_input)(ifp, m_new);
1633 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1635 struct hn_softc *sc = ifp->if_softc;
1636 struct ifreq *ifr = (struct ifreq *)data;
1637 int mask, error = 0;
1641 if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) {
1648 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
1653 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
1654 /* Can't change MTU */
1660 if (ifp->if_mtu == ifr->ifr_mtu) {
1665 /* Obtain and record requested MTU */
1666 ifp->if_mtu = ifr->ifr_mtu;
1668 #if __FreeBSD_version >= 1100099
1670 * Make sure that LRO aggregation length limit is still
1671 * valid, after the MTU change.
1673 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
1674 HN_LRO_LENLIM_MIN(ifp))
1675 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1678 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1682 * Detach the synthetics parts, i.e. NVS and RNDIS.
1684 hn_synth_detach(sc);
1687 * Reattach the synthetic parts, i.e. NVS and RNDIS,
1688 * with the new MTU setting.
1691 hn_synth_attach(sc, ifr->ifr_mtu);
1693 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
1694 hn_set_chim_size(sc, sc->hn_chim_szmax);
1696 /* All done! Resume now. */
1697 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1706 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
1711 if (ifp->if_flags & IFF_UP) {
1713 * If only the state of the PROMISC flag changed,
1714 * then just use the 'set promisc mode' command
1715 * instead of reinitializing the entire NIC. Doing
1716 * a full re-init means reloading the firmware and
1717 * waiting for it to start up, which may take a
1721 /* Fixme: Promiscuous mode? */
1722 if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
1723 ifp->if_flags & IFF_PROMISC &&
1724 !(sc->hn_if_flags & IFF_PROMISC)) {
1725 /* do something here for Hyper-V */
1726 } else if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
1727 !(ifp->if_flags & IFF_PROMISC) &&
1728 sc->hn_if_flags & IFF_PROMISC) {
1729 /* do something here for Hyper-V */
1734 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1738 sc->hn_if_flags = ifp->if_flags;
1745 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
1747 if (mask & IFCAP_TXCSUM) {
1748 ifp->if_capenable ^= IFCAP_TXCSUM;
1749 if (ifp->if_capenable & IFCAP_TXCSUM)
1750 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
1752 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
1754 if (mask & IFCAP_TXCSUM_IPV6) {
1755 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
1756 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1757 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
1759 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
1762 /* TODO: flip RNDIS offload parameters for RXCSUM. */
1763 if (mask & IFCAP_RXCSUM)
1764 ifp->if_capenable ^= IFCAP_RXCSUM;
1766 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
1767 if (mask & IFCAP_RXCSUM_IPV6)
1768 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
1771 if (mask & IFCAP_LRO)
1772 ifp->if_capenable ^= IFCAP_LRO;
1774 if (mask & IFCAP_TSO4) {
1775 ifp->if_capenable ^= IFCAP_TSO4;
1776 if (ifp->if_capenable & IFCAP_TSO4)
1777 ifp->if_hwassist |= CSUM_IP_TSO;
1779 ifp->if_hwassist &= ~CSUM_IP_TSO;
1781 if (mask & IFCAP_TSO6) {
1782 ifp->if_capenable ^= IFCAP_TSO6;
1783 if (ifp->if_capenable & IFCAP_TSO6)
1784 ifp->if_hwassist |= CSUM_IP6_TSO;
1786 ifp->if_hwassist &= ~CSUM_IP6_TSO;
1794 /* Always all-multi */
1797 * Enable/disable all-multi according to the emptiness of
1798 * the mcast address list.
1804 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
1808 error = ether_ioctl(ifp, cmd, data);
1815 hn_stop(struct hn_softc *sc)
1817 struct ifnet *ifp = sc->hn_ifp;
1822 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1823 ("synthetic parts were not attached"));
1825 /* Clear RUNNING bit _before_ hn_suspend() */
1826 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
1829 /* Clear OACTIVE bit. */
1830 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1831 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
1832 sc->hn_tx_ring[i].hn_oactive = 0;
1836 * FreeBSD transmit entry point
1839 hn_start(struct ifnet *ifp)
1841 struct hn_softc *sc = ifp->if_softc;
1842 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
1844 if (txr->hn_sched_tx)
1847 if (mtx_trylock(&txr->hn_tx_lock)) {
1850 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
1851 mtx_unlock(&txr->hn_tx_lock);
1856 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
1860 hn_start_txeof(struct hn_tx_ring *txr)
1862 struct hn_softc *sc = txr->hn_sc;
1863 struct ifnet *ifp = sc->hn_ifp;
1865 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
1867 if (txr->hn_sched_tx)
1870 if (mtx_trylock(&txr->hn_tx_lock)) {
1873 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1874 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
1875 mtx_unlock(&txr->hn_tx_lock);
1877 taskqueue_enqueue(txr->hn_tx_taskq,
1883 * Release the OACTIVE earlier, with the hope, that
1884 * others could catch up. The task will clear the
1885 * flag again with the hn_tx_lock to avoid possible
1888 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1889 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
1894 hn_init_locked(struct hn_softc *sc)
1896 struct ifnet *ifp = sc->hn_ifp;
1901 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1904 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1907 /* TODO: add hn_rx_filter */
1908 hn_rndis_set_rxfilter(sc, NDIS_PACKET_TYPE_PROMISCUOUS);
1910 /* Clear OACTIVE bit. */
1911 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1912 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
1913 sc->hn_tx_ring[i].hn_oactive = 0;
1915 /* Clear TX 'suspended' bit. */
1916 hn_tx_resume(sc, sc->hn_tx_ring_inuse);
1918 /* Everything is ready; unleash! */
1919 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
1921 /* TODO: check RNDIS link status. */
1922 if_link_state_change(ifp, LINK_STATE_UP);
1928 struct hn_softc *sc = xsc;
1940 hn_watchdog(struct ifnet *ifp)
1943 if_printf(ifp, "watchdog timeout -- resetting\n");
1944 hn_init(ifp->if_softc); /* XXX */
1949 #if __FreeBSD_version >= 1100099
1952 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
1954 struct hn_softc *sc = arg1;
1955 unsigned int lenlim;
1958 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
1959 error = sysctl_handle_int(oidp, &lenlim, 0, req);
1960 if (error || req->newptr == NULL)
1964 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
1965 lenlim > TCP_LRO_LENGTH_MAX) {
1969 hn_set_lro_lenlim(sc, lenlim);
1976 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
1978 struct hn_softc *sc = arg1;
1979 int ackcnt, error, i;
1982 * lro_ackcnt_lim is append count limit,
1983 * +1 to turn it into aggregation limit.
1985 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
1986 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
1987 if (error || req->newptr == NULL)
1990 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
1994 * Convert aggregation limit back to append
1999 for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
2000 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2008 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2010 struct hn_softc *sc = arg1;
2015 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2018 error = sysctl_handle_int(oidp, &on, 0, req);
2019 if (error || req->newptr == NULL)
2023 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2024 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2027 rxr->hn_trust_hcsum |= hcsum;
2029 rxr->hn_trust_hcsum &= ~hcsum;
2036 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2038 struct hn_softc *sc = arg1;
2039 int chim_size, error;
2041 chim_size = sc->hn_tx_ring[0].hn_chim_size;
2042 error = sysctl_handle_int(oidp, &chim_size, 0, req);
2043 if (error || req->newptr == NULL)
2046 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2050 hn_set_chim_size(sc, chim_size);
2055 #if __FreeBSD_version < 1100095
2057 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2059 struct hn_softc *sc = arg1;
2060 int ofs = arg2, i, error;
2061 struct hn_rx_ring *rxr;
2065 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2066 rxr = &sc->hn_rx_ring[i];
2067 stat += *((int *)((uint8_t *)rxr + ofs));
2070 error = sysctl_handle_64(oidp, &stat, 0, req);
2071 if (error || req->newptr == NULL)
2074 /* Zero out this stat. */
2075 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2076 rxr = &sc->hn_rx_ring[i];
2077 *((int *)((uint8_t *)rxr + ofs)) = 0;
2083 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2085 struct hn_softc *sc = arg1;
2086 int ofs = arg2, i, error;
2087 struct hn_rx_ring *rxr;
2091 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2092 rxr = &sc->hn_rx_ring[i];
2093 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2096 error = sysctl_handle_64(oidp, &stat, 0, req);
2097 if (error || req->newptr == NULL)
2100 /* Zero out this stat. */
2101 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2102 rxr = &sc->hn_rx_ring[i];
2103 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2111 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2113 struct hn_softc *sc = arg1;
2114 int ofs = arg2, i, error;
2115 struct hn_rx_ring *rxr;
2119 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2120 rxr = &sc->hn_rx_ring[i];
2121 stat += *((u_long *)((uint8_t *)rxr + ofs));
2124 error = sysctl_handle_long(oidp, &stat, 0, req);
2125 if (error || req->newptr == NULL)
2128 /* Zero out this stat. */
2129 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2130 rxr = &sc->hn_rx_ring[i];
2131 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
2137 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2139 struct hn_softc *sc = arg1;
2140 int ofs = arg2, i, error;
2141 struct hn_tx_ring *txr;
2145 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2146 txr = &sc->hn_tx_ring[i];
2147 stat += *((u_long *)((uint8_t *)txr + ofs));
2150 error = sysctl_handle_long(oidp, &stat, 0, req);
2151 if (error || req->newptr == NULL)
2154 /* Zero out this stat. */
2155 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2156 txr = &sc->hn_tx_ring[i];
2157 *((u_long *)((uint8_t *)txr + ofs)) = 0;
2163 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2165 struct hn_softc *sc = arg1;
2166 int ofs = arg2, i, error, conf;
2167 struct hn_tx_ring *txr;
2169 txr = &sc->hn_tx_ring[0];
2170 conf = *((int *)((uint8_t *)txr + ofs));
2172 error = sysctl_handle_int(oidp, &conf, 0, req);
2173 if (error || req->newptr == NULL)
2177 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2178 txr = &sc->hn_tx_ring[i];
2179 *((int *)((uint8_t *)txr + ofs)) = conf;
2187 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2189 struct hn_softc *sc = arg1;
2192 snprintf(verstr, sizeof(verstr), "%u.%u",
2193 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2194 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2195 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2199 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2201 struct hn_softc *sc = arg1;
2208 snprintf(caps_str, sizeof(caps_str), "%b", caps,
2219 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2223 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2225 struct hn_softc *sc = arg1;
2226 char assist_str[128];
2230 hwassist = sc->hn_ifp->if_hwassist;
2232 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2233 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2237 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2239 struct hn_softc *sc = arg1;
2244 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2245 if (error || req->newptr == NULL)
2248 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2251 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
2253 if (sc->hn_rx_ring_inuse > 1) {
2254 error = hn_rss_reconfig(sc);
2256 /* Not RSS capable, at least for now; just save the RSS key. */
2265 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
2267 struct hn_softc *sc = arg1;
2272 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2273 if (error || req->newptr == NULL)
2277 * Don't allow RSS indirect table change, if this interface is not
2278 * RSS capable currently.
2280 if (sc->hn_rx_ring_inuse == 1) {
2285 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2288 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
2290 hn_rss_ind_fixup(sc, sc->hn_rx_ring_inuse);
2291 error = hn_rss_reconfig(sc);
2298 hn_check_iplen(const struct mbuf *m, int hoff)
2300 const struct ip *ip;
2301 int len, iphlen, iplen;
2302 const struct tcphdr *th;
2303 int thoff; /* TCP data offset */
2305 len = hoff + sizeof(struct ip);
2307 /* The packet must be at least the size of an IP header. */
2308 if (m->m_pkthdr.len < len)
2309 return IPPROTO_DONE;
2311 /* The fixed IP header must reside completely in the first mbuf. */
2313 return IPPROTO_DONE;
2315 ip = mtodo(m, hoff);
2317 /* Bound check the packet's stated IP header length. */
2318 iphlen = ip->ip_hl << 2;
2319 if (iphlen < sizeof(struct ip)) /* minimum header length */
2320 return IPPROTO_DONE;
2322 /* The full IP header must reside completely in the one mbuf. */
2323 if (m->m_len < hoff + iphlen)
2324 return IPPROTO_DONE;
2326 iplen = ntohs(ip->ip_len);
2329 * Check that the amount of data in the buffers is as
2330 * at least much as the IP header would have us expect.
2332 if (m->m_pkthdr.len < hoff + iplen)
2333 return IPPROTO_DONE;
2336 * Ignore IP fragments.
2338 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
2339 return IPPROTO_DONE;
2342 * The TCP/IP or UDP/IP header must be entirely contained within
2343 * the first fragment of a packet.
2347 if (iplen < iphlen + sizeof(struct tcphdr))
2348 return IPPROTO_DONE;
2349 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
2350 return IPPROTO_DONE;
2351 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
2352 thoff = th->th_off << 2;
2353 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
2354 return IPPROTO_DONE;
2355 if (m->m_len < hoff + iphlen + thoff)
2356 return IPPROTO_DONE;
2359 if (iplen < iphlen + sizeof(struct udphdr))
2360 return IPPROTO_DONE;
2361 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
2362 return IPPROTO_DONE;
2366 return IPPROTO_DONE;
2373 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
2375 struct sysctl_oid_list *child;
2376 struct sysctl_ctx_list *ctx;
2377 device_t dev = sc->hn_dev;
2378 #if defined(INET) || defined(INET6)
2379 #if __FreeBSD_version >= 1100095
2386 * Create RXBUF for reception.
2389 * - It is shared by all channels.
2390 * - A large enough buffer is allocated, certain version of NVSes
2391 * may further limit the usable space.
2393 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
2394 PAGE_SIZE, 0, NETVSC_RECEIVE_BUFFER_SIZE, &sc->hn_rxbuf_dma,
2395 BUS_DMA_WAITOK | BUS_DMA_ZERO);
2396 if (sc->hn_rxbuf == NULL) {
2397 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
2401 sc->hn_rx_ring_cnt = ring_cnt;
2402 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
2404 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
2405 M_NETVSC, M_WAITOK | M_ZERO);
2407 #if defined(INET) || defined(INET6)
2408 #if __FreeBSD_version >= 1100095
2409 lroent_cnt = hn_lro_entry_count;
2410 if (lroent_cnt < TCP_LRO_ENTRIES)
2411 lroent_cnt = TCP_LRO_ENTRIES;
2413 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
2415 #endif /* INET || INET6 */
2417 ctx = device_get_sysctl_ctx(dev);
2418 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2420 /* Create dev.hn.UNIT.rx sysctl tree */
2421 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
2422 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
2424 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2425 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2427 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
2429 NETVSC_DEVICE_RING_BUFFER_SIZE +
2430 NETVSC_DEVICE_RING_BUFFER_SIZE,
2431 &rxr->hn_br_dma, BUS_DMA_WAITOK);
2432 if (rxr->hn_br == NULL) {
2433 device_printf(dev, "allocate bufring failed\n");
2437 if (hn_trust_hosttcp)
2438 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
2439 if (hn_trust_hostudp)
2440 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
2441 if (hn_trust_hostip)
2442 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
2443 rxr->hn_ifp = sc->hn_ifp;
2444 if (i < sc->hn_tx_ring_cnt)
2445 rxr->hn_txr = &sc->hn_tx_ring[i];
2446 rxr->hn_rdbuf = malloc(NETVSC_PACKET_SIZE, M_NETVSC, M_WAITOK);
2448 rxr->hn_rxbuf = sc->hn_rxbuf;
2453 #if defined(INET) || defined(INET6)
2454 #if __FreeBSD_version >= 1100095
2455 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
2456 hn_lro_mbufq_depth);
2458 tcp_lro_init(&rxr->hn_lro);
2459 rxr->hn_lro.ifp = sc->hn_ifp;
2461 #if __FreeBSD_version >= 1100099
2462 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
2463 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
2465 #endif /* INET || INET6 */
2467 if (sc->hn_rx_sysctl_tree != NULL) {
2471 * Create per RX ring sysctl tree:
2472 * dev.hn.UNIT.rx.RINGID
2474 snprintf(name, sizeof(name), "%d", i);
2475 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
2476 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
2477 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
2479 if (rxr->hn_rx_sysctl_tree != NULL) {
2480 SYSCTL_ADD_ULONG(ctx,
2481 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
2482 OID_AUTO, "packets", CTLFLAG_RW,
2483 &rxr->hn_pkts, "# of packets received");
2484 SYSCTL_ADD_ULONG(ctx,
2485 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
2486 OID_AUTO, "rss_pkts", CTLFLAG_RW,
2488 "# of packets w/ RSS info received");
2493 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
2494 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2495 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
2496 #if __FreeBSD_version < 1100095
2497 hn_rx_stat_int_sysctl,
2499 hn_rx_stat_u64_sysctl,
2501 "LU", "LRO queued");
2502 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
2503 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2504 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
2505 #if __FreeBSD_version < 1100095
2506 hn_rx_stat_int_sysctl,
2508 hn_rx_stat_u64_sysctl,
2510 "LU", "LRO flushed");
2511 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
2512 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2513 __offsetof(struct hn_rx_ring, hn_lro_tried),
2514 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
2515 #if __FreeBSD_version >= 1100099
2516 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
2517 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2518 hn_lro_lenlim_sysctl, "IU",
2519 "Max # of data bytes to be aggregated by LRO");
2520 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
2521 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2522 hn_lro_ackcnt_sysctl, "I",
2523 "Max # of ACKs to be aggregated by LRO");
2525 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
2526 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
2527 hn_trust_hcsum_sysctl, "I",
2528 "Trust tcp segement verification on host side, "
2529 "when csum info is missing");
2530 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
2531 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
2532 hn_trust_hcsum_sysctl, "I",
2533 "Trust udp datagram verification on host side, "
2534 "when csum info is missing");
2535 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
2536 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
2537 hn_trust_hcsum_sysctl, "I",
2538 "Trust ip packet verification on host side, "
2539 "when csum info is missing");
2540 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
2541 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2542 __offsetof(struct hn_rx_ring, hn_csum_ip),
2543 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
2544 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
2545 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2546 __offsetof(struct hn_rx_ring, hn_csum_tcp),
2547 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
2548 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
2549 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2550 __offsetof(struct hn_rx_ring, hn_csum_udp),
2551 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
2552 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
2553 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2554 __offsetof(struct hn_rx_ring, hn_csum_trusted),
2555 hn_rx_stat_ulong_sysctl, "LU",
2556 "# of packets that we trust host's csum verification");
2557 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
2558 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2559 __offsetof(struct hn_rx_ring, hn_small_pkts),
2560 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
2561 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
2562 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
2563 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
2564 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
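/*
 * Free the shared RXBUF and all per-ring resources created by
 * hn_create_rx_data().
 */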
2570 hn_destroy_rx_data(struct hn_softc *sc)
2574 if (sc->hn_rxbuf != NULL) {
2575 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
2576 sc->hn_rxbuf = NULL;
2579 if (sc->hn_rx_ring_cnt == 0)
2582 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2583 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2585 if (rxr->hn_br == NULL)
2587 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
2590 #if defined(INET) || defined(INET6)
2591 tcp_lro_free(&rxr->hn_lro);
2593 free(rxr->hn_rdbuf, M_NETVSC);
2595 free(sc->hn_rx_ring, M_NETVSC);
2596 sc->hn_rx_ring = NULL;
2598 sc->hn_rx_ring_cnt = 0;
2599 sc->hn_rx_ring_inuse = 0;
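/*
 * Set up one TX ring: its locks, TX descriptor array and free-descriptor
 * list/bufring, the DMA tags and maps for RNDIS packet messages and for
 * packet data, and the per-ring dev.hn.UNIT.tx.RINGID sysctl nodes.
 */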
2603 hn_create_tx_ring(struct hn_softc *sc, int id)
2605 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
2606 device_t dev = sc->hn_dev;
2607 bus_dma_tag_t parent_dtag;
2611 txr->hn_tx_idx = id;
2613 #ifndef HN_USE_TXDESC_BUFRING
2614 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
2616 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
2618 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
2619 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
2620 M_NETVSC, M_WAITOK | M_ZERO);
2621 #ifndef HN_USE_TXDESC_BUFRING
2622 SLIST_INIT(&txr->hn_txlist);
2624 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_NETVSC,
2625 M_WAITOK, &txr->hn_tx_lock);
2628 txr->hn_tx_taskq = sc->hn_tx_taskq;
2630 if (hn_use_if_start) {
2631 txr->hn_txeof = hn_start_txeof;
2632 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
2633 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
2637 txr->hn_txeof = hn_xmit_txeof;
2638 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
2639 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
2641 br_depth = hn_get_txswq_depth(txr);
2642 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_NETVSC,
2643 M_WAITOK, &txr->hn_tx_lock);
2646 txr->hn_direct_tx_size = hn_direct_tx_size;
2649 * Always schedule transmission instead of trying to do direct
2650 * transmission; this gives the best performance so far.
2652 txr->hn_sched_tx = 1;
2654 parent_dtag = bus_get_dma_tag(dev);
2656 /* DMA tag for RNDIS packet messages. */
2657 error = bus_dma_tag_create(parent_dtag, /* parent */
2658 HN_RNDIS_PKT_ALIGN, /* alignment */
2659 HN_RNDIS_PKT_BOUNDARY, /* boundary */
2660 BUS_SPACE_MAXADDR, /* lowaddr */
2661 BUS_SPACE_MAXADDR, /* highaddr */
2662 NULL, NULL, /* filter, filterarg */
2663 HN_RNDIS_PKT_LEN, /* maxsize */
2665 HN_RNDIS_PKT_LEN, /* maxsegsize */
2667 NULL, /* lockfunc */
2668 NULL, /* lockfuncarg */
2669 &txr->hn_tx_rndis_dtag);
2671 device_printf(dev, "failed to create rndis dmatag\n");
2675 /* DMA tag for data. */
2676 error = bus_dma_tag_create(parent_dtag, /* parent */
2678 HN_TX_DATA_BOUNDARY, /* boundary */
2679 BUS_SPACE_MAXADDR, /* lowaddr */
2680 BUS_SPACE_MAXADDR, /* highaddr */
2681 NULL, NULL, /* filter, filterarg */
2682 HN_TX_DATA_MAXSIZE, /* maxsize */
2683 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
2684 HN_TX_DATA_SEGSIZE, /* maxsegsize */
2686 NULL, /* lockfunc */
2687 NULL, /* lockfuncarg */
2688 &txr->hn_tx_data_dtag);
2690 device_printf(dev, "failed to create data dmatag\n");
2694 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
2695 struct hn_txdesc *txd = &txr->hn_txdesc[i];
2700 * Allocate and load RNDIS packet message.
2702 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
2703 (void **)&txd->rndis_pkt,
2704 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
2705 &txd->rndis_pkt_dmap);
2708 "failed to allocate rndis_packet_msg, %d\n", i);
2712 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
2713 txd->rndis_pkt_dmap,
2714 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
2715 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
2719 "failed to load rndis_packet_msg, %d\n", i);
2720 bus_dmamem_free(txr->hn_tx_rndis_dtag,
2721 txd->rndis_pkt, txd->rndis_pkt_dmap);
2725 /* DMA map for TX data. */
2726 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
2730 "failed to allocate tx data dmamap\n");
2731 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
2732 txd->rndis_pkt_dmap);
2733 bus_dmamem_free(txr->hn_tx_rndis_dtag,
2734 txd->rndis_pkt, txd->rndis_pkt_dmap);
2738 /* All set; put it on the list. */
2739 txd->flags |= HN_TXD_FLAG_ONLIST;
2740 #ifndef HN_USE_TXDESC_BUFRING
2741 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2743 buf_ring_enqueue(txr->hn_txdesc_br, txd);
2746 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
2748 if (sc->hn_tx_sysctl_tree != NULL) {
2749 struct sysctl_oid_list *child;
2750 struct sysctl_ctx_list *ctx;
2754 * Create per TX ring sysctl tree:
2755 * dev.hn.UNIT.tx.RINGID
2757 ctx = device_get_sysctl_ctx(dev);
2758 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
2760 snprintf(name, sizeof(name), "%d", id);
2761 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
2762 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
2764 if (txr->hn_tx_sysctl_tree != NULL) {
2765 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
2767 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
2768 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
2769 "# of available TX descs");
2770 if (!hn_use_if_start) {
2771 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
2772 CTLFLAG_RD, &txr->hn_oactive, 0,
2775 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
2776 CTLFLAG_RW, &txr->hn_pkts,
2777 "# of packets transmitted");
2785 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
2787 struct hn_tx_ring *txr = txd->txr;
2789 KASSERT(txd->m == NULL, ("still has mbuf installed"));
2790 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
2792 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
2793 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
2794 txd->rndis_pkt_dmap);
2795 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
2799 hn_destroy_tx_ring(struct hn_tx_ring *txr)
2801 struct hn_txdesc *txd;
2803 if (txr->hn_txdesc == NULL)
2806 #ifndef HN_USE_TXDESC_BUFRING
2807 while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) {
2808 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2809 hn_txdesc_dmamap_destroy(txd);
2812 mtx_lock(&txr->hn_tx_lock);
2813 while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL)
2814 hn_txdesc_dmamap_destroy(txd);
2815 mtx_unlock(&txr->hn_tx_lock);
2818 if (txr->hn_tx_data_dtag != NULL)
2819 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
2820 if (txr->hn_tx_rndis_dtag != NULL)
2821 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
2823 #ifdef HN_USE_TXDESC_BUFRING
2824 buf_ring_free(txr->hn_txdesc_br, M_NETVSC);
2827 free(txr->hn_txdesc, M_NETVSC);
2828 txr->hn_txdesc = NULL;
2830 if (txr->hn_mbuf_br != NULL)
2831 buf_ring_free(txr->hn_mbuf_br, M_NETVSC);
2833 #ifndef HN_USE_TXDESC_BUFRING
2834 mtx_destroy(&txr->hn_txlist_spin);
2836 mtx_destroy(&txr->hn_tx_lock);
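/*
 * Allocate all TX resources: the chimney sending buffer (TXBUF) shared
 * by all channels, each TX ring, and the dev.hn.UNIT.tx sysctl tree
 * with its TX statistics and tuning knobs.
 */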
2840 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
2842 struct sysctl_oid_list *child;
2843 struct sysctl_ctx_list *ctx;
2847 * Create TXBUF for chimney sending.
2849 * NOTE: It is shared by all channels.
2851 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
2852 PAGE_SIZE, 0, NETVSC_SEND_BUFFER_SIZE, &sc->hn_chim_dma,
2853 BUS_DMA_WAITOK | BUS_DMA_ZERO);
2854 if (sc->hn_chim == NULL) {
2855 device_printf(sc->hn_dev, "allocate txbuf failed\n");
2859 sc->hn_tx_ring_cnt = ring_cnt;
2860 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
2862 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
2863 M_NETVSC, M_WAITOK | M_ZERO);
2865 ctx = device_get_sysctl_ctx(sc->hn_dev);
2866 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
2868 /* Create dev.hn.UNIT.tx sysctl tree */
2869 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
2870 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
2872 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2875 error = hn_create_tx_ring(sc, i);
2880 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
2881 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2882 __offsetof(struct hn_tx_ring, hn_no_txdescs),
2883 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
2884 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
2885 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2886 __offsetof(struct hn_tx_ring, hn_send_failed),
2887 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
2888 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
2889 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2890 __offsetof(struct hn_tx_ring, hn_txdma_failed),
2891 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
2892 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
2893 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2894 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
2895 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapses");
2896 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
2897 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2898 __offsetof(struct hn_tx_ring, hn_tx_chimney),
2899 hn_tx_stat_ulong_sysctl, "LU", "# of chimney sends");
2900 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
2901 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2902 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
2903 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
2904 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
2905 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
2906 "# of total TX descs");
2907 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
2908 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
2909 "Chimney send packet size upper boundary");
2910 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
2911 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2912 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
2913 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
2914 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2915 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
2916 hn_tx_conf_int_sysctl, "I",
2917 "Size of the packet for direct transmission");
2918 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
2919 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2920 __offsetof(struct hn_tx_ring, hn_sched_tx),
2921 hn_tx_conf_int_sysctl, "I",
2922 "Always schedule transmission "
2923 "instead of doing direct transmission");
2924 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
2925 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
2926 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
2927 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
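/* Propagate the chimney send size limit to all TX rings in use. */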
2933 hn_set_chim_size(struct hn_softc *sc, int chim_size)
2937 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2938 sc->hn_tx_ring[i].hn_chim_size = chim_size;
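/*
 * Fix up TX settings after the synthetic parts are attached: clamp the
 * chimney send size to what the host offers, derive the checksum assist
 * flags from the negotiated capabilities, and enable HASHVAL packet
 * info on NDIS 6.30+.
 */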
2942 hn_fixup_tx_data(struct hn_softc *sc)
2944 uint64_t csum_assist;
2947 hn_set_chim_size(sc, sc->hn_chim_szmax);
2948 if (hn_tx_chimney_size > 0 &&
2949 hn_tx_chimney_size < sc->hn_chim_szmax)
2950 hn_set_chim_size(sc, hn_tx_chimney_size);
2953 if (sc->hn_caps & HN_CAP_IPCS)
2954 csum_assist |= CSUM_IP;
2955 if (sc->hn_caps & HN_CAP_TCP4CS)
2956 csum_assist |= CSUM_IP_TCP;
2957 if (sc->hn_caps & HN_CAP_UDP4CS)
2958 csum_assist |= CSUM_IP_UDP;
2960 if (sc->hn_caps & HN_CAP_TCP6CS)
2961 csum_assist |= CSUM_IP6_TCP;
2962 if (sc->hn_caps & HN_CAP_UDP6CS)
2963 csum_assist |= CSUM_IP6_UDP;
2966 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
2967 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
2969 if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_30) {
2970 /* Support HASHVAL pktinfo on TX path. */
2971 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
2972 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
2977 hn_destroy_tx_data(struct hn_softc *sc)
2981 if (sc->hn_chim != NULL) {
2982 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
2986 if (sc->hn_tx_ring_cnt == 0)
2989 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
2990 hn_destroy_tx_ring(&sc->hn_tx_ring[i]);
2992 free(sc->hn_tx_ring, M_NETVSC);
2993 sc->hn_tx_ring = NULL;
2995 sc->hn_tx_ring_cnt = 0;
2996 sc->hn_tx_ring_inuse = 0;
3000 hn_start_taskfunc(void *xtxr, int pending __unused)
3002 struct hn_tx_ring *txr = xtxr;
3004 mtx_lock(&txr->hn_tx_lock);
3005 hn_start_locked(txr, 0);
3006 mtx_unlock(&txr->hn_tx_lock);
3010 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
3012 struct hn_tx_ring *txr = xtxr;
3014 mtx_lock(&txr->hn_tx_lock);
3015 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
3016 hn_start_locked(txr, 0);
3017 mtx_unlock(&txr->hn_tx_lock);
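/*
 * Drain the TX ring's mbuf bufring: encapsulate each packet into an
 * RNDIS message and send it over the channel.  Packets larger than
 * 'len' (if len > 0) are put back, and a non-zero return tells the
 * caller to reschedule the remainder onto the TX taskqueue.
 */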
3021 hn_xmit(struct hn_tx_ring *txr, int len)
3023 struct hn_softc *sc = txr->hn_sc;
3024 struct ifnet *ifp = sc->hn_ifp;
3025 struct mbuf *m_head;
3027 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3028 KASSERT(hn_use_if_start == 0,
3029 ("hn_xmit is called, when if_start is enabled"));
3031 if (__predict_false(txr->hn_suspended))
3034 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
3037 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
3038 struct hn_txdesc *txd;
3041 if (len > 0 && m_head->m_pkthdr.len > len) {
3043 * This send could be time consuming; let callers
3044 * dispatch this packet (and any follow-up packets)
3045 * to the tx taskqueue.
3047 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3051 txd = hn_txdesc_get(txr);
3053 txr->hn_no_txdescs++;
3054 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3055 txr->hn_oactive = 1;
3059 error = hn_encap(txr, txd, &m_head);
3061 /* Both txd and m_head are freed; discard */
3062 drbr_advance(ifp, txr->hn_mbuf_br);
3066 error = hn_send_pkt(ifp, txr, txd);
3067 if (__predict_false(error)) {
3068 /* txd is freed, but m_head is not */
3069 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3070 txr->hn_oactive = 1;
3075 drbr_advance(ifp, txr->hn_mbuf_br);
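/*
 * if_transmit method: select a TX ring from the mbuf's RSS flowid,
 * enqueue the packet onto that ring's bufring, and then either send
 * directly (when the TX lock is uncontended) or defer to the ring's
 * TX taskqueue.
 */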
3081 hn_transmit(struct ifnet *ifp, struct mbuf *m)
3083 struct hn_softc *sc = ifp->if_softc;
3084 struct hn_tx_ring *txr;
3088 * Select the TX ring based on flowid
3090 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
3091 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
3092 txr = &sc->hn_tx_ring[idx];
3094 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
3096 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
3100 if (txr->hn_oactive)
3103 if (txr->hn_sched_tx)
3106 if (mtx_trylock(&txr->hn_tx_lock)) {
3109 sched = hn_xmit(txr, txr->hn_direct_tx_size);
3110 mtx_unlock(&txr->hn_tx_lock);
3115 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
3120 hn_tx_ring_qflush(struct hn_tx_ring *txr)
3124 mtx_lock(&txr->hn_tx_lock);
3125 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
3127 mtx_unlock(&txr->hn_tx_lock);
3131 hn_xmit_qflush(struct ifnet *ifp)
3133 struct hn_softc *sc = ifp->if_softc;
3136 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3137 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
3142 hn_xmit_txeof(struct hn_tx_ring *txr)
3145 if (txr->hn_sched_tx)
3148 if (mtx_trylock(&txr->hn_tx_lock)) {
3151 txr->hn_oactive = 0;
3152 sched = hn_xmit(txr, txr->hn_direct_tx_size);
3153 mtx_unlock(&txr->hn_tx_lock);
3155 taskqueue_enqueue(txr->hn_tx_taskq,
3161 * Release oactive earlier, in the hope that others
3162 * could catch up. The task will clear oactive again,
3163 * under the hn_tx_lock, to avoid possible races.
3166 txr->hn_oactive = 0;
3167 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3172 hn_xmit_taskfunc(void *xtxr, int pending __unused)
3174 struct hn_tx_ring *txr = xtxr;
3176 mtx_lock(&txr->hn_tx_lock);
3178 mtx_unlock(&txr->hn_tx_lock);
3182 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
3184 struct hn_tx_ring *txr = xtxr;
3186 mtx_lock(&txr->hn_tx_lock);
3187 txr->hn_oactive = 0;
3189 mtx_unlock(&txr->hn_tx_lock);
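/*
 * Associate a VMBus (sub-)channel with its RX/TX ring pair, bind the
 * channel to a CPU, and open it on the ring's bufring with
 * hn_chan_callback() as the receive handler.
 */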
3193 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
3195 struct vmbus_chan_br cbr;
3196 struct hn_rx_ring *rxr;
3197 struct hn_tx_ring *txr = NULL;
3200 idx = vmbus_chan_subidx(chan);
3203 * Link this channel to RX/TX ring.
3205 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
3206 ("invalid channel index %d, should > 0 && < %d",
3207 idx, sc->hn_rx_ring_inuse));
3208 rxr = &sc->hn_rx_ring[idx];
3209 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
3210 ("RX ring %d already attached", idx));
3211 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
3214 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
3215 idx, vmbus_chan_id(chan));
3218 if (idx < sc->hn_tx_ring_inuse) {
3219 txr = &sc->hn_tx_ring[idx];
3220 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
3221 ("TX ring %d already attached", idx));
3222 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
3224 txr->hn_chan = chan;
3226 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
3227 idx, vmbus_chan_id(chan));
3231 /* Bind this channel to a proper CPU. */
3232 vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus);
3237 cbr.cbr = rxr->hn_br;
3238 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
3239 cbr.cbr_txsz = NETVSC_DEVICE_RING_BUFFER_SIZE;
3240 cbr.cbr_rxsz = NETVSC_DEVICE_RING_BUFFER_SIZE;
3241 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
3243 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
3244 vmbus_chan_id(chan), error);
3245 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
3247 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
3253 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
3255 struct hn_rx_ring *rxr;
3258 idx = vmbus_chan_subidx(chan);
3261 * Link this channel to RX/TX ring.
3263 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
3264 ("invalid channel index %d, should > 0 && < %d",
3265 idx, sc->hn_rx_ring_inuse));
3266 rxr = &sc->hn_rx_ring[idx];
3267 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
3268 ("RX ring %d is not attached", idx));
3269 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
3271 if (idx < sc->hn_tx_ring_inuse) {
3272 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
3274 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
3275 ("TX ring %d is not attached attached", idx));
3276 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
3280 * Close this channel.
3283 * Channel closing does _not_ destroy the target channel.
3285 vmbus_chan_close(chan);
3289 hn_attach_subchans(struct hn_softc *sc)
3291 struct vmbus_channel **subchans;
3292 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
3295 if (subchan_cnt == 0)
3298 /* Attach the sub-channels. */
3299 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
3300 for (i = 0; i < subchan_cnt; ++i) {
3301 error = hn_chan_attach(sc, subchans[i]);
3305 vmbus_subchan_rel(subchans, subchan_cnt);
3308 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
3311 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
3319 hn_detach_allchans(struct hn_softc *sc)
3321 struct vmbus_channel **subchans;
3322 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
3325 if (subchan_cnt == 0)
3328 /* Detach the sub-channels. */
3329 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
3330 for (i = 0; i < subchan_cnt; ++i)
3331 hn_chan_detach(sc, subchans[i]);
3332 vmbus_subchan_rel(subchans, subchan_cnt);
3336 * Detach the primary channel, _after_ all sub-channels are detached.
3339 hn_chan_detach(sc, sc->hn_prichan);
3341 /* Wait for sub-channels to be destroyed, if any. */
3342 vmbus_subchan_drain(sc->hn_prichan);
3345 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3346 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
3347 HN_RX_FLAG_ATTACHED) == 0,
3348 ("%dth RX ring is still attached", i));
3350 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3351 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
3352 HN_TX_FLAG_ATTACHED) == 0,
3353 ("%dth TX ring is still attached", i));
3359 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
3361 struct vmbus_channel **subchans;
3362 int nchan, rxr_cnt, error;
3364 nchan = *nsubch + 1;
3365 if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_30 || nchan == 1) {
3367 * Either RSS is not supported, or multiple RX/TX rings
3368 * are not requested.
3375 * Get RSS capabilities, e.g. # of RX rings, and # of indirect table entries.
3378 error = hn_rndis_get_rsscaps(sc, &rxr_cnt);
3380 /* No RSS; this is benign. */
3385 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
3389 if (nchan > rxr_cnt)
3392 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
3398 * Allocate sub-channels from NVS.
3400 *nsubch = nchan - 1;
3401 error = hn_nvs_alloc_subchans(sc, nsubch);
3402 if (error || *nsubch == 0) {
3403 /* Failed to allocate sub-channels. */
3409 * Wait for all sub-channels to become ready before moving on.
3411 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
3412 vmbus_subchan_rel(subchans, *nsubch);
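/*
 * Attach the synthetic networking parts in order: primary channel,
 * then NVS, then RNDIS; afterwards allocate and attach sub-channels
 * and program the RSS key and indirect table, and finally record how
 * many RX/TX rings are actually in use.
 */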
3417 hn_synth_attach(struct hn_softc *sc, int mtu)
3419 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
3420 int error, nsubch, nchan, i;
3423 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
3424 ("synthetic parts were attached"));
3426 /* Save capabilities for later verification. */
3427 old_caps = sc->hn_caps;
3431 * Attach the primary channel _before_ attaching NVS and RNDIS.
3433 error = hn_chan_attach(sc, sc->hn_prichan);
3440 error = hn_nvs_attach(sc, mtu);
3445 * Attach RNDIS _after_ NVS is attached.
3447 error = hn_rndis_attach(sc);
3452 * Make sure capabilities are not changed.
3454 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
3455 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
3456 old_caps, sc->hn_caps);
3457 /* Restore old capabilities and abort. */
3458 sc->hn_caps = old_caps;
3463 * Allocate sub-channels for multi-TX/RX rings.
3466 * The # of RX rings that can be used is equivalent to the # of
3467 * channels to be requested.
3469 nsubch = sc->hn_rx_ring_cnt - 1;
3470 error = hn_synth_alloc_subchans(sc, &nsubch);
3476 /* Only the primary channel can be used; done */
3481 * Configure RSS key and indirect table _after_ all sub-channels are allocated.
3485 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
3487 * RSS key is not set yet; set it to the default RSS key.
3490 if_printf(sc->hn_ifp, "setup default RSS key\n");
3491 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
3492 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3495 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
3497 * RSS indirect table is not set yet; set it up in round-robin fashion.
3501 if_printf(sc->hn_ifp, "setup default RSS indirect table\n");
3504 /* TODO: Take ndis_rss_caps.ndis_nind into account. */
3505 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
3506 rss->rss_ind[i] = i % nchan;
3507 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3510 * # of usable channels may have changed, so we have to
3511 * make sure that all entries in the RSS indirect table are valid.
3514 hn_rss_ind_fixup(sc, nchan);
3517 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
3520 * Failed to configure RSS key or indirect table; only
3521 * the primary channel can be used.
3527 * Set the # of TX/RX rings that could be used according to
3528 * the # of channels that NVS offered.
3530 hn_set_ring_inuse(sc, nchan);
3533 * Attach the sub-channels, if any.
3535 error = hn_attach_subchans(sc);
3539 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
3545 * The interface must have been suspended through hn_suspend(), before
3546 * this function gets called.
3549 hn_synth_detach(struct hn_softc *sc)
3553 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
3554 ("synthetic parts were not attached"));
3556 /* Detach the RNDIS first. */
3557 hn_rndis_detach(sc);
3562 /* Detach all of the channels. */
3563 hn_detach_allchans(sc);
3565 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
3569 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
3571 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
3572 ("invalid ring count %d", ring_cnt));
3574 if (sc->hn_tx_ring_cnt > ring_cnt)
3575 sc->hn_tx_ring_inuse = ring_cnt;
3577 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3578 sc->hn_rx_ring_inuse = ring_cnt;
3581 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
3582 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
3587 hn_rx_drain(struct vmbus_channel *chan)
3590 while (!vmbus_chan_rx_empty(chan) || !vmbus_chan_tx_empty(chan))
3591 pause("waitch", 1);
3592 vmbus_chan_intr_drain(chan);
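/*
 * Quiesce the interface: mark all in-use TX rings suspended and wait
 * for pending sends to complete, clear the RX filter so the host stops
 * delivering packets, then drain the bufrings and channel interrupts
 * of the primary channel and all sub-channels.
 */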
3596 hn_suspend(struct hn_softc *sc)
3598 struct vmbus_channel **subch = NULL;
3606 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
3607 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
3609 mtx_lock(&txr->hn_tx_lock);
3610 txr->hn_suspended = 1;
3611 mtx_unlock(&txr->hn_tx_lock);
3612 /* No one is able to send more packets now. */
3614 /* Wait for all pending sends to finish. */
3615 while (hn_tx_ring_pending(txr))
3616 pause("hnwtx", 1 /* 1 tick */);
3618 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
3619 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
3623 * Disable RX by clearing RX filter.
3625 hn_rndis_set_rxfilter(sc, 0);
3628 * Give RNDIS enough time to flush all pending data packets.
3630 pause("waitrx", (200 * hz) / 1000);
3633 * Drain RX/TX bufrings and interrupts.
3635 nsubch = sc->hn_rx_ring_inuse - 1;
3637 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
3639 if (subch != NULL) {
3640 for (i = 0; i < nsubch; ++i)
3641 hn_rx_drain(subch[i]);
3643 hn_rx_drain(sc->hn_prichan);
3646 vmbus_subchan_rel(subch, nsubch);
3650 hn_tx_resume(struct hn_softc *sc, int tx_ring_cnt)
3654 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
3655 ("invalid TX ring count %d", tx_ring_cnt));
3657 for (i = 0; i < tx_ring_cnt; ++i) {
3658 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
3660 mtx_lock(&txr->hn_tx_lock);
3661 txr->hn_suspended = 0;
3662 mtx_unlock(&txr->hn_tx_lock);
3667 hn_resume(struct hn_softc *sc)
3675 * TODO: add hn_rx_filter.
3677 hn_rndis_set_rxfilter(sc, NDIS_PACKET_TYPE_PROMISCUOUS);
3680 * Make sure to clear suspend status on "all" TX rings,
3681 * since hn_tx_ring_inuse can be changed after hn_suspend().
3683 hn_tx_resume(sc, sc->hn_tx_ring_cnt);
3685 if (!hn_use_if_start) {
3687 * Flush unused drbrs, since hn_tx_ring_inuse may have been reduced.
3690 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
3691 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
3697 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
3698 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
3701 * Use the txeof task, so that any pending oactive can be cleared.
3704 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3709 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
3711 const struct hn_nvs_hdr *hdr;
3713 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
3714 if_printf(sc->hn_ifp, "invalid nvs notify\n");
3717 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
3719 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
3720 /* Useless; ignore */
3723 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
3727 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
3728 const struct vmbus_chanpkt_hdr *pkt)
3730 struct hn_send_ctx *sndc;
3732 sndc = (struct hn_send_ctx *)(uintptr_t)pkt->cph_xactid;
3733 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
3734 VMBUS_CHANPKT_DATALEN(pkt));
3737 * 'sndc' CAN NOT be accessed anymore, since it can be freed by its callback.
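/*
 * Handle an RXBUF channel packet: validate that it carries RNDIS data,
 * bound-check each receive range against the RXBUF, hand every range
 * to hv_rf_on_receive(), and finally ack the RXBUF back to the host.
 */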
3743 hn_nvs_handle_rxbuf(struct hn_softc *sc, struct hn_rx_ring *rxr,
3744 struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr)
3746 const struct vmbus_chanpkt_rxbuf *pkt;
3747 const struct hn_nvs_hdr *nvs_hdr;
3750 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
3751 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
3754 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
3756 /* Make sure that this is an RNDIS message. */
3757 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
3758 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
3763 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
3764 if (__predict_false(hlen < sizeof(*pkt))) {
3765 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
3768 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
3770 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
3771 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
3776 count = pkt->cp_rxbuf_cnt;
3777 if (__predict_false(hlen <
3778 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
3779 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
3783 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
3784 for (i = 0; i < count; ++i) {
3787 ofs = pkt->cp_rxbuf[i].rb_ofs;
3788 len = pkt->cp_rxbuf[i].rb_len;
3789 if (__predict_false(ofs + len > NETVSC_RECEIVE_BUFFER_SIZE)) {
3790 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
3791 "ofs %d, len %d\n", i, ofs, len);
3794 hv_rf_on_receive(sc, rxr, rxr->hn_rxbuf + ofs, len);
3798 * Moved completion call back here so that all received
3799 * messages (not just data messages) will trigger a response
3800 * message back to the host.
3802 hn_nvs_ack_rxbuf(chan, pkt->cp_hdr.cph_xactid);
3806 * Net VSC on receive completion
3808 * Send a receive completion packet to the RNDIS device (i.e. NetVSP).
3811 hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid)
3813 struct hn_nvs_rndis_ack ack;
3817 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
3818 ack.nvs_status = HN_NVS_STATUS_OK;
3821 /* Send the completion */
3822 ret = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
3823 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
3827 } else if (ret == EAGAIN) {
3828 /* No more room; wait a bit and retry, up to 3 times. */
3833 goto retry_send_cmplt;
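/*
 * Per-channel receive callback.  Repeatedly pulls channel packets into
 * the ring's read buffer, dispatching completions, RXBUF packets and
 * inband notifications; on ENOBUFS the buffer is grown to fit the
 * pending packet, and any oversized temporary buffer is freed before
 * the final LRO/receive rollup.
 */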
3839 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
3841 struct hn_rx_ring *rxr = xrxr;
3842 struct hn_softc *sc = rxr->hn_ifp->if_softc;
3844 int bufferlen = NETVSC_PACKET_SIZE;
3846 buffer = rxr->hn_rdbuf;
3848 struct vmbus_chanpkt_hdr *pkt = buffer;
3849 uint32_t bytes_rxed;
3852 bytes_rxed = bufferlen;
3853 ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed);
3855 switch (pkt->cph_type) {
3856 case VMBUS_CHANPKT_TYPE_COMP:
3857 hn_nvs_handle_comp(sc, chan, pkt);
3859 case VMBUS_CHANPKT_TYPE_RXBUF:
3860 hn_nvs_handle_rxbuf(sc, rxr, chan, pkt);
3862 case VMBUS_CHANPKT_TYPE_INBAND:
3863 hn_nvs_handle_notify(sc, pkt);
3866 if_printf(rxr->hn_ifp,
3867 "unknown chan pkt %u\n",
3871 } else if (ret == ENOBUFS) {
3872 /* Handle large packet */
3873 if (bufferlen > NETVSC_PACKET_SIZE) {
3874 free(buffer, M_NETVSC);
3878 /* alloc new buffer */
3879 buffer = malloc(bytes_rxed, M_NETVSC, M_NOWAIT);
3880 if (buffer == NULL) {
3881 if_printf(rxr->hn_ifp,
3882 "hv_cb malloc buffer failed, len=%u\n",
3887 bufferlen = bytes_rxed;
3889 /* No more packets */
3894 if (bufferlen > NETVSC_PACKET_SIZE)
3895 free(buffer, M_NETVSC);
3897 hv_rf_channel_rollup(rxr, rxr->hn_txr);
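/*
 * Create the TX taskqueue that is shared by all hn(4) instances when
 * hn_share_tx_taskq is set, optionally binding its thread to the CPU
 * given by hn_bind_tx_taskq; runs at SYSINIT time.
 */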
3901 hn_tx_taskq_create(void *arg __unused)
3903 if (!hn_share_tx_taskq)
3906 hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
3907 taskqueue_thread_enqueue, &hn_tx_taskq);
3908 taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
3909 if (hn_bind_tx_taskq >= 0) {
3910 int cpu = hn_bind_tx_taskq;
3911 struct task cpuset_task;
3914 if (cpu > mp_ncpus - 1)
3916 CPU_SETOF(cpu, &cpu_set);
3917 TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, &cpu_set);
3918 taskqueue_enqueue(hn_tx_taskq, &cpuset_task);
3919 taskqueue_drain(hn_tx_taskq, &cpuset_task);
3922 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_FIRST,
3923 hn_tx_taskq_create, NULL);
3926 hn_tx_taskq_destroy(void *arg __unused)
3928 if (hn_tx_taskq != NULL)
3929 taskqueue_free(hn_tx_taskq);
3931 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_FIRST,
3932 hn_tx_taskq_destroy, NULL);
3934 static device_method_t netvsc_methods[] = {
3935 /* Device interface */
3936 DEVMETHOD(device_probe, netvsc_probe),
3937 DEVMETHOD(device_attach, netvsc_attach),
3938 DEVMETHOD(device_detach, netvsc_detach),
3939 DEVMETHOD(device_shutdown, netvsc_shutdown),
3944 static driver_t netvsc_driver = {
3947 sizeof(struct hn_softc)
3950 static devclass_t netvsc_devclass;
3952 DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0);
3953 MODULE_VERSION(hn, 1);
3954 MODULE_DEPEND(hn, vmbus, 1, 1, 1);