2 * Copyright (c) 2010-2012 Citrix Inc.
3 * Copyright (c) 2009-2012,2016 Microsoft Corp.
4 * Copyright (c) 2012 NetApp Inc.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice unmodified, this list of conditions, and the following
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * Copyright (c) 2004-2006 Kip Macy
31 * All rights reserved.
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
58 #include "opt_inet6.h"
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/sockio.h>
65 #include <sys/malloc.h>
66 #include <sys/module.h>
67 #include <sys/kernel.h>
68 #include <sys/socket.h>
70 #include <sys/queue.h>
74 #include <sys/sysctl.h>
75 #include <sys/buf_ring.h>
78 #include <net/if_arp.h>
79 #include <net/ethernet.h>
80 #include <net/if_dl.h>
81 #include <net/if_media.h>
82 #include <net/rndis.h>
85 #include <net/if_types.h>
86 #include <net/if_vlan_var.h>
89 #include <netinet/in_systm.h>
90 #include <netinet/in.h>
91 #include <netinet/ip.h>
92 #include <netinet/if_ether.h>
93 #include <netinet/tcp.h>
94 #include <netinet/udp.h>
95 #include <netinet/ip6.h>
98 #include <vm/vm_param.h>
99 #include <vm/vm_kern.h>
102 #include <machine/bus.h>
103 #include <machine/resource.h>
104 #include <machine/frame.h>
105 #include <machine/vmparam.h>
108 #include <sys/rman.h>
109 #include <sys/mutex.h>
110 #include <sys/errno.h>
111 #include <sys/types.h>
112 #include <machine/atomic.h>
114 #include <machine/intr_machdep.h>
116 #include <machine/in_cksum.h>
118 #include <dev/hyperv/include/hyperv.h>
119 #include <dev/hyperv/include/hyperv_busdma.h>
120 #include <dev/hyperv/include/vmbus_xact.h>
122 #include <dev/hyperv/netvsc/hv_net_vsc.h>
123 #include <dev/hyperv/netvsc/hv_rndis_filter.h>
124 #include <dev/hyperv/netvsc/ndis.h>
126 #include "vmbus_if.h"
128 /* Short for Hyper-V network interface */
129 #define NETVSC_DEVNAME "hn"
132 * It looks like offset 0 of buf is reserved to hold the softc pointer.
133 * The sc pointer is evidently not needed, and is not presently populated.
134 * The packet offset is where the netvsc_packet starts in the buffer.
136 #define HV_NV_SC_PTR_OFFSET_IN_BUF 0
137 #define HV_NV_PACKET_OFFSET_IN_BUF 16
139 /* YYY should get it from the underlying channel */
140 #define HN_TX_DESC_CNT 512
142 #define HN_LROENT_CNT_DEF 128
144 #define HN_RING_CNT_DEF_MAX 8
146 #define HN_RNDIS_PKT_LEN \
147 (sizeof(struct rndis_packet_msg) + \
148 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \
149 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \
150 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \
151 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
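/*
 * A rough worked example of the reservation above (illustrative only;
 * exact values depend on the rndis.h/ndis.h definitions in this tree):
 * assuming a 44-byte rndis_packet_msg and four 4-byte per-packet-info
 * bodies, each behind a 12-byte rndis_pktinfo header, HN_RNDIS_PKT_LEN
 * comes to 44 + 4 * (12 + 4) = 108 bytes.  This is the worst case --
 * hash value, VLAN, LSO2 and TX csum info all present at once --
 * reserved per TX descriptor for the RNDIS packet message.
 */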
152 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE
153 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE
155 #define HN_TX_DATA_BOUNDARY PAGE_SIZE
156 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET
157 #define HN_TX_DATA_SEGSIZE PAGE_SIZE
158 /* -1 for RNDIS packet message */
159 #define HN_TX_DATA_SEGCNT_MAX (NETVSC_PACKET_MAXPAGE - 1)
161 #define HN_DIRECT_TX_SIZE_DEF 128
163 #define HN_EARLY_TXEOF_THRESH 8
166 #ifndef HN_USE_TXDESC_BUFRING
167 SLIST_ENTRY(hn_txdesc) link;
170 struct hn_tx_ring *txr;
172 uint32_t flags; /* HN_TXD_FLAG_ */
173 struct hn_send_ctx send_ctx;
175 bus_dmamap_t data_dmap;
177 bus_addr_t rndis_pkt_paddr;
178 struct rndis_packet_msg *rndis_pkt;
179 bus_dmamap_t rndis_pkt_dmap;
182 #define HN_TXD_FLAG_ONLIST 0x1
183 #define HN_TXD_FLAG_DMAMAP 0x2
185 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU)
186 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU)
187 /* YYY 2*MTU is a bit rough, but should be good enough. */
188 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu)
190 #define HN_LRO_ACKCNT_DEF 1
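/*
 * Worked defaults, assuming the standard 1500-byte ETHERMTU: LRO may
 * aggregate up to 25 * 1500 = 37500 bytes per entry on a single RX
 * ring, but only 12 * 1500 = 18000 bytes when multiple RX rings are in
 * use, so ACKs flow back to the sender more promptly.
 */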
192 #define HN_LOCK_INIT(sc) \
193 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
194 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED)
195 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock)
196 #define HN_LOCK(sc) sx_xlock(&(sc)->hn_lock)
197 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock)
199 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
200 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP)
201 #define HN_CSUM_IP_HWASSIST(sc) \
202 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
203 #define HN_CSUM_IP6_HWASSIST(sc) \
204 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
210 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
211 "Hyper-V network interface");
213 /* Trust TCP segment verification on host side. */
214 static int hn_trust_hosttcp = 1;
215 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
216 &hn_trust_hosttcp, 0,
217 "Trust tcp segement verification on host side, "
218 "when csum info is missing (global setting)");
220 /* Trust UDP datagram verification on host side. */
221 static int hn_trust_hostudp = 1;
222 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
223 &hn_trust_hostudp, 0,
224 "Trust udp datagram verification on host side, "
225 "when csum info is missing (global setting)");
227 /* Trust IP packet verification on host side. */
228 static int hn_trust_hostip = 1;
229 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
231 "Trust ip packet verification on host side, "
232 "when csum info is missing (global setting)");
234 /* Limit TSO burst size */
235 static int hn_tso_maxlen = 0;
236 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
237 &hn_tso_maxlen, 0, "TSO burst limit");
239 /* Limit chimney send size */
240 static int hn_tx_chimney_size = 0;
241 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
242 &hn_tx_chimney_size, 0, "Chimney send packet size limit");
244 /* Limit the size of packet for direct transmission */
245 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
246 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
247 &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
249 #if defined(INET) || defined(INET6)
250 #if __FreeBSD_version >= 1100095
251 static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
252 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
253 &hn_lro_entry_count, 0, "LRO entry count");
257 static int hn_share_tx_taskq = 0;
258 SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN,
259 &hn_share_tx_taskq, 0, "Enable shared TX taskqueue");
261 static struct taskqueue *hn_tx_taskq;
263 #ifndef HN_USE_TXDESC_BUFRING
264 static int hn_use_txdesc_bufring = 0;
266 static int hn_use_txdesc_bufring = 1;
268 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
269 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
271 static int hn_bind_tx_taskq = -1;
272 SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
273 &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");
275 static int hn_use_if_start = 0;
276 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
277 &hn_use_if_start, 0, "Use if_start TX method");
279 static int hn_chan_cnt = 0;
280 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
282 "# of channels to use; each channel has one RX ring and one TX ring");
284 static int hn_tx_ring_cnt = 0;
285 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
286 &hn_tx_ring_cnt, 0, "# of TX rings to use");
288 static int hn_tx_swq_depth = 0;
289 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
290 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
292 #if __FreeBSD_version >= 1100095
293 static u_int hn_lro_mbufq_depth = 0;
294 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
295 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
298 static u_int hn_cpu_index;
301 * Forward declarations
303 static void hn_stop(struct hn_softc *sc);
304 static void hn_init_locked(struct hn_softc *sc);
305 static void hn_init(void *xsc);
306 static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
307 static int hn_start_locked(struct hn_tx_ring *txr, int len);
308 static void hn_start(struct ifnet *ifp);
309 static void hn_start_txeof(struct hn_tx_ring *);
310 static int hn_ifmedia_upd(struct ifnet *ifp);
311 static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr);
312 #if __FreeBSD_version >= 1100099
313 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
314 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
316 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
317 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
318 #if __FreeBSD_version < 1100095
319 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
321 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
323 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
324 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
325 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
326 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
327 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
328 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
329 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
330 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
331 static int hn_check_iplen(const struct mbuf *, int);
332 static int hn_create_tx_ring(struct hn_softc *, int);
333 static void hn_destroy_tx_ring(struct hn_tx_ring *);
334 static int hn_create_tx_data(struct hn_softc *, int);
335 static void hn_fixup_tx_data(struct hn_softc *);
336 static void hn_destroy_tx_data(struct hn_softc *);
337 static void hn_start_taskfunc(void *, int);
338 static void hn_start_txeof_taskfunc(void *, int);
339 static int hn_encap(struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **);
340 static int hn_create_rx_data(struct hn_softc *sc, int);
341 static void hn_destroy_rx_data(struct hn_softc *sc);
342 static void hn_set_chim_size(struct hn_softc *, int);
343 static int hn_chan_attach(struct hn_softc *, struct vmbus_channel *);
344 static void hn_chan_detach(struct hn_softc *, struct vmbus_channel *);
345 static int hn_attach_subchans(struct hn_softc *);
346 static void hn_detach_allchans(struct hn_softc *);
347 static void hn_chan_callback(struct vmbus_channel *chan, void *xrxr);
348 static void hn_set_ring_inuse(struct hn_softc *, int);
349 static int hn_synth_attach(struct hn_softc *, int);
350 static void hn_synth_detach(struct hn_softc *);
351 static bool hn_tx_ring_pending(struct hn_tx_ring *);
352 static void hn_suspend(struct hn_softc *);
353 static void hn_resume(struct hn_softc *);
354 static void hn_rx_drain(struct vmbus_channel *);
355 static void hn_tx_resume(struct hn_softc *, int);
356 static void hn_tx_ring_qflush(struct hn_tx_ring *);
358 static void hn_nvs_handle_notify(struct hn_softc *sc,
359 const struct vmbus_chanpkt_hdr *pkt);
360 static void hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
361 const struct vmbus_chanpkt_hdr *pkt);
362 static void hn_nvs_handle_rxbuf(struct hn_softc *sc, struct hn_rx_ring *rxr,
363 struct vmbus_channel *chan,
364 const struct vmbus_chanpkt_hdr *pkthdr);
365 static void hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid);
367 static int hn_transmit(struct ifnet *, struct mbuf *);
368 static void hn_xmit_qflush(struct ifnet *);
369 static int hn_xmit(struct hn_tx_ring *, int);
370 static void hn_xmit_txeof(struct hn_tx_ring *);
371 static void hn_xmit_taskfunc(void *, int);
372 static void hn_xmit_txeof_taskfunc(void *, int);
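/*
 * Default 40-byte Toeplitz hash key.  This appears to be the sample
 * key published with Microsoft's RSS documentation, which a number of
 * other drivers also ship as their default.
 */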
374 static const uint8_t hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
375 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
376 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
377 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
378 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
379 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
382 #if __FreeBSD_version >= 1100099
384 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
388 for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
389 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
394 hn_get_txswq_depth(const struct hn_tx_ring *txr)
397 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
398 if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
399 return txr->hn_txdesc_cnt;
400 return hn_tx_swq_depth;
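/*
 * Example: with the hw.hn.tx_swq_depth tunable left at its default of
 * 0, the value is below hn_txdesc_cnt (512 by default), so the
 * software queue (IFQ or buf_ring) is sized to the TX descriptor
 * count; an explicitly larger tunable is honored as-is.
 */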
404 hn_rss_reconfig(struct hn_softc *sc)
414 * Direct reconfiguration by setting the UNCHG flags does
415 * _not_ work properly.
418 if_printf(sc->hn_ifp, "disable RSS\n");
419 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
421 if_printf(sc->hn_ifp, "RSS disable failed\n");
426 * Reenable the RSS w/ the updated RSS key or indirect table.
430 if_printf(sc->hn_ifp, "reconfig RSS\n");
431 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
433 if_printf(sc->hn_ifp, "RSS reconfig failed\n");
440 hn_rss_ind_fixup(struct hn_softc *sc, int nchan)
442 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
445 KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
448 * Check the indirect table to make sure that all channels in it can be used.
451 for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
452 if (rss->rss_ind[i] >= nchan) {
453 if_printf(sc->hn_ifp,
454 "RSS indirect table %d fixup: %u -> %d\n",
455 i, rss->rss_ind[i], nchan - 1);
456 rss->rss_ind[i] = nchan - 1;
462 hn_ifmedia_upd(struct ifnet *ifp __unused)
469 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
471 struct hn_softc *sc = ifp->if_softc;
473 ifmr->ifm_status = IFM_AVALID;
474 ifmr->ifm_active = IFM_ETHER;
476 if (!sc->hn_carrier) {
477 ifmr->ifm_active |= IFM_NONE;
480 ifmr->ifm_status |= IFM_ACTIVE;
481 ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
484 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
485 static const struct hyperv_guid g_net_vsc_device_type = {
486 .hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
487 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
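/*
 * Note the byte order above: the first three GUID fields are stored
 * little-endian, so {F8615163-DF3E-46c5-...} is laid out as 0x63,
 * 0x51, 0x61, 0xF8, then 0x3E, 0xDF, then 0xc5, 0x46, while the final
 * eight bytes are kept verbatim.
 */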
491 * Standard probe entry point.
495 netvsc_probe(device_t dev)
497 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
498 &g_net_vsc_device_type) == 0) {
499 device_set_desc(dev, "Hyper-V Network Interface");
500 return BUS_PROBE_DEFAULT;
506 hn_cpuset_setthread_task(void *xmask, int pending __unused)
508 cpuset_t *mask = xmask;
511 error = cpuset_setthread(curthread->td_tid, mask);
513 panic("curthread=%ju: can't pin; error=%d",
514 (uintmax_t)curthread->td_tid, error);
519 * Standard attach entry point.
521 * Called when the driver is loaded. It allocates needed resources,
522 * and initializes the "hardware" and software.
525 netvsc_attach(device_t dev)
527 struct hn_softc *sc = device_get_softc(dev);
528 struct sysctl_oid_list *child;
529 struct sysctl_ctx_list *ctx;
530 uint8_t eaddr[ETHER_ADDR_LEN];
531 uint32_t link_status;
532 struct ifnet *ifp = NULL;
533 int error, ring_cnt, tx_ring_cnt;
537 sc->hn_prichan = vmbus_get_channel(dev);
541 * Setup taskqueue for transmission.
543 if (hn_tx_taskq == NULL) {
544 sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
545 taskqueue_thread_enqueue, &sc->hn_tx_taskq);
546 taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx",
547 device_get_nameunit(dev));
548 if (hn_bind_tx_taskq >= 0) {
549 int cpu = hn_bind_tx_taskq;
550 struct task cpuset_task;
553 if (cpu > mp_ncpus - 1)
555 CPU_SETOF(cpu, &cpu_set);
556 TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task,
558 taskqueue_enqueue(sc->hn_tx_taskq, &cpuset_task);
559 taskqueue_drain(sc->hn_tx_taskq, &cpuset_task);
562 sc->hn_tx_taskq = hn_tx_taskq;
566 * Allocate the ifnet and set up its name early, so that if_printf
567 * can be used by functions which will be called after
570 ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
572 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
575 * Initialize ifmedia early so that it can be unconditionally
576 * destroyed if an error happens later on.
578 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
581 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
582 * to use (tx_ring_cnt).
585 * The # of RX rings to use is the same as the # of channels to use.
587 ring_cnt = hn_chan_cnt;
591 if (ring_cnt > HN_RING_CNT_DEF_MAX)
592 ring_cnt = HN_RING_CNT_DEF_MAX;
593 } else if (ring_cnt > mp_ncpus) {
597 tx_ring_cnt = hn_tx_ring_cnt;
598 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
599 tx_ring_cnt = ring_cnt;
600 if (hn_use_if_start) {
601 /* ifnet.if_start only needs one TX ring. */
606 * Set the leader CPU for channels.
608 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
611 * Create enough TX/RX rings, even if only a limited number of
612 * channels can be allocated.
614 error = hn_create_tx_data(sc, tx_ring_cnt);
617 error = hn_create_rx_data(sc, ring_cnt);
622 * Create transaction context for NVS and RNDIS transactions.
624 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
625 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
626 if (sc->hn_xact == NULL)
630 * Attach the synthetic parts, i.e. NVS and RNDIS.
632 error = hn_synth_attach(sc, ETHERMTU);
636 error = hn_rndis_get_linkstatus(sc, &link_status);
639 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
642 error = hn_rndis_get_eaddr(sc, eaddr);
646 #if __FreeBSD_version >= 1100099
647 if (sc->hn_rx_ring_inuse > 1) {
649 * Reduce TCP segment aggregation limit for multiple
650 * RX rings to increase ACK timeliness.
652 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
657 * Fix up TX settings after the synthetic parts are attached.
659 hn_fixup_tx_data(sc);
661 ctx = device_get_sysctl_ctx(dev);
662 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
663 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
664 &sc->hn_nvs_ver, 0, "NVS version");
665 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
666 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
667 hn_ndis_version_sysctl, "A", "NDIS version");
668 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
669 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
670 hn_caps_sysctl, "A", "capabilities");
671 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
672 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
673 hn_hwassist_sysctl, "A", "hwassist");
674 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
675 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
676 hn_rss_key_sysctl, "IU", "RSS key");
677 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
678 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
679 hn_rss_ind_sysctl, "IU", "RSS indirect table");
682 * Setup the ifmedia, which has been initialized earlier.
684 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
685 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
686 /* XXX ifmedia_set really should do this for us */
687 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
690 * Setup the ifnet for this interface.
693 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
694 ifp->if_ioctl = hn_ioctl;
695 ifp->if_init = hn_init;
696 if (hn_use_if_start) {
697 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
699 ifp->if_start = hn_start;
700 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
701 ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
702 IFQ_SET_READY(&ifp->if_snd);
704 ifp->if_transmit = hn_transmit;
705 ifp->if_qflush = hn_xmit_qflush;
708 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
710 /* We can't tell IPv6 packets from IPv4 packets on the RX path. */
711 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
713 if (sc->hn_caps & HN_CAP_VLAN) {
714 /* XXX not sure about VLAN_MTU. */
715 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
718 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
719 if (ifp->if_hwassist & HN_CSUM_IP_MASK)
720 ifp->if_capabilities |= IFCAP_TXCSUM;
721 if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
722 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
723 if (sc->hn_caps & HN_CAP_TSO4) {
724 ifp->if_capabilities |= IFCAP_TSO4;
725 ifp->if_hwassist |= CSUM_IP_TSO;
727 if (sc->hn_caps & HN_CAP_TSO6) {
728 ifp->if_capabilities |= IFCAP_TSO6;
729 ifp->if_hwassist |= CSUM_IP6_TSO;
732 /* Enable all available capabilities by default. */
733 ifp->if_capenable = ifp->if_capabilities;
735 tso_maxlen = hn_tso_maxlen;
736 if (tso_maxlen <= 0 || tso_maxlen > IP_MAXPACKET)
737 tso_maxlen = IP_MAXPACKET;
738 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
739 ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
740 ifp->if_hw_tsomax = tso_maxlen -
741 (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
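/*
 * Worked example with the defaults: hn_tso_maxlen == 0 selects
 * IP_MAXPACKET (65535), so after reserving room for an Ethernet +
 * VLAN header (14 + 4 = 18 bytes) if_hw_tsomax becomes 65517 bytes,
 * spread over at most HN_TX_DATA_SEGCNT_MAX page-sized segments.
 */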
743 ether_ifattach(ifp, eaddr);
746 if_printf(ifp, "TSO: %u/%u/%u\n", ifp->if_hw_tsomax,
747 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
750 /* Inform the upper layer about long frame support. */
751 ifp->if_hdrlen = sizeof(struct ether_vlan_header);
755 /* TODO: reuse netvsc_detach() */
756 hn_destroy_tx_data(sc);
763 * TODO: Use this for error handling on attach path.
766 netvsc_detach(device_t dev)
768 struct hn_softc *sc = device_get_softc(dev);
770 /* TODO: ether_ifdetach */
777 ifmedia_removeall(&sc->hn_media);
778 hn_destroy_rx_data(sc);
779 hn_destroy_tx_data(sc);
781 if (sc->hn_tx_taskq != hn_tx_taskq)
782 taskqueue_free(sc->hn_tx_taskq);
784 vmbus_xact_ctx_destroy(sc->hn_xact);
792 * Standard shutdown entry point
795 netvsc_shutdown(device_t dev)
801 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
802 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
804 struct mbuf *m = *m_head;
807 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
808 m, segs, nsegs, BUS_DMA_NOWAIT);
809 if (error == EFBIG) {
812 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
817 txr->hn_tx_collapsed++;
819 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
820 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
823 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
824 BUS_DMASYNC_PREWRITE);
825 txd->flags |= HN_TXD_FLAG_DMAMAP;
831 hn_txdesc_dmamap_unload(struct hn_tx_ring *txr, struct hn_txdesc *txd)
834 if (txd->flags & HN_TXD_FLAG_DMAMAP) {
835 bus_dmamap_sync(txr->hn_tx_data_dtag,
836 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
837 bus_dmamap_unload(txr->hn_tx_data_dtag,
839 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
844 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
847 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
848 ("put an onlist txd %#x", txd->flags));
850 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
851 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
854 hn_txdesc_dmamap_unload(txr, txd);
855 if (txd->m != NULL) {
860 txd->flags |= HN_TXD_FLAG_ONLIST;
862 #ifndef HN_USE_TXDESC_BUFRING
863 mtx_lock_spin(&txr->hn_txlist_spin);
864 KASSERT(txr->hn_txdesc_avail >= 0 &&
865 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
866 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
867 txr->hn_txdesc_avail++;
868 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
869 mtx_unlock_spin(&txr->hn_txlist_spin);
871 atomic_add_int(&txr->hn_txdesc_avail, 1);
872 buf_ring_enqueue(txr->hn_txdesc_br, txd);
878 static __inline struct hn_txdesc *
879 hn_txdesc_get(struct hn_tx_ring *txr)
881 struct hn_txdesc *txd;
883 #ifndef HN_USE_TXDESC_BUFRING
884 mtx_lock_spin(&txr->hn_txlist_spin);
885 txd = SLIST_FIRST(&txr->hn_txlist);
887 KASSERT(txr->hn_txdesc_avail > 0,
888 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
889 txr->hn_txdesc_avail--;
890 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
892 mtx_unlock_spin(&txr->hn_txlist_spin);
894 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
898 #ifdef HN_USE_TXDESC_BUFRING
899 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
901 KASSERT(txd->m == NULL && txd->refs == 0 &&
902 (txd->flags & HN_TXD_FLAG_ONLIST), ("invalid txd"));
903 txd->flags &= ~HN_TXD_FLAG_ONLIST;
910 hn_txdesc_hold(struct hn_txdesc *txd)
913 /* 0->1 transition will never work */
914 KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
915 atomic_add_int(&txd->refs, 1);
919 hn_tx_ring_pending(struct hn_tx_ring *txr)
921 bool pending = false;
923 #ifndef HN_USE_TXDESC_BUFRING
924 mtx_lock_spin(&txr->hn_txlist_spin);
925 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
927 mtx_unlock_spin(&txr->hn_txlist_spin);
929 if (!buf_ring_full(txr->hn_txdesc_br))
936 hn_txeof(struct hn_tx_ring *txr)
938 txr->hn_has_txeof = 0;
943 hn_tx_done(struct hn_send_ctx *sndc, struct hn_softc *sc,
944 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
946 struct hn_txdesc *txd = sndc->hn_cbarg;
947 struct hn_tx_ring *txr;
949 if (sndc->hn_chim_idx != HN_NVS_CHIM_IDX_INVALID)
950 hn_chim_free(sc, sndc->hn_chim_idx);
953 KASSERT(txr->hn_chan == chan,
954 ("channel mismatch, on chan%u, should be chan%u",
955 vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan)));
957 txr->hn_has_txeof = 1;
958 hn_txdesc_put(txr, txd);
960 ++txr->hn_txdone_cnt;
961 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
962 txr->hn_txdone_cnt = 0;
969 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
971 #if defined(INET) || defined(INET6)
972 struct lro_ctrl *lro = &rxr->hn_lro;
973 struct lro_entry *queued;
975 while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
976 SLIST_REMOVE_HEAD(&lro->lro_active, next);
977 tcp_lro_flush(lro, queued);
983 * 'txr' could be NULL, if multiple channels are used
984 * together with the ifnet.if_start method.
986 if (txr == NULL || !txr->hn_has_txeof)
989 txr->hn_txdone_cnt = 0;
993 static __inline uint32_t
994 hn_rndis_pktmsg_offset(uint32_t ofs)
997 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
998 ("invalid RNDIS packet msg offset %u", ofs));
999 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
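/*
 * Example, assuming rm_dataoffset sits at byte offset 8 of the message
 * (right after rm_type and rm_len): payload placed immediately after a
 * 44-byte rndis_packet_msg has absolute offset 44, which the host
 * expects to see as the relative value 44 - 8 = 36.
 */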
1004 * If this function fails, then both txd and m_head0 will be freed.
1007 hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0)
1009 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
1010 int error, nsegs, i;
1011 struct mbuf *m_head = *m_head0;
1012 struct rndis_packet_msg *pkt;
1013 uint32_t send_buf_section_idx;
1014 int send_buf_section_size, pktlen;
1018 * extension points to the area reserved for the
1019 * rndis_filter_packet, which is placed just after
1020 * the netvsc_packet (and rppi struct, if present;
1021 * length is updated later).
1023 pkt = txd->rndis_pkt;
1024 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1025 pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
1026 pkt->rm_dataoffset = sizeof(*pkt);
1027 pkt->rm_datalen = m_head->m_pkthdr.len;
1028 pkt->rm_pktinfooffset = sizeof(*pkt);
1029 pkt->rm_pktinfolen = 0;
1032 * Set the hash value for this packet, so that the host can
1033 * dispatch the TX done event for this packet back to this TX
1036 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1037 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
1038 *pi_data = txr->hn_tx_idx;
1040 if (m_head->m_flags & M_VLANTAG) {
1041 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1042 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1043 *pi_data = NDIS_VLAN_INFO_MAKE(
1044 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
1045 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
1046 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
1049 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
1050 #if defined(INET6) || defined(INET)
1051 struct ether_vlan_header *eh;
1055 * XXX need m_pullup and use mtodo
1057 eh = mtod(m_head, struct ether_vlan_header*);
1058 if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN))
1059 ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1061 ether_len = ETHER_HDR_LEN;
1063 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1064 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
1066 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
1068 (struct ip *)(m_head->m_data + ether_len);
1069 unsigned long iph_len = ip->ip_hl << 2;
1071 (struct tcphdr *)((caddr_t)ip + iph_len);
1075 th->th_sum = in_pseudo(ip->ip_src.s_addr,
1076 ip->ip_dst.s_addr, htons(IPPROTO_TCP));
1077 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
1078 m_head->m_pkthdr.tso_segsz);
1081 #if defined(INET6) && defined(INET)
1086 struct ip6_hdr *ip6 = (struct ip6_hdr *)
1087 (m_head->m_data + ether_len);
1088 struct tcphdr *th = (struct tcphdr *)(ip6 + 1);
1091 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
1092 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
1093 m_head->m_pkthdr.tso_segsz);
1096 #endif /* INET6 || INET */
1097 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
1098 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1099 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1100 if (m_head->m_pkthdr.csum_flags &
1101 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
1102 *pi_data = NDIS_TXCSUM_INFO_IPV6;
1104 *pi_data = NDIS_TXCSUM_INFO_IPV4;
1105 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
1106 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
1109 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
1110 *pi_data |= NDIS_TXCSUM_INFO_TCPCS;
1111 else if (m_head->m_pkthdr.csum_flags &
1112 (CSUM_IP_UDP | CSUM_IP6_UDP))
1113 *pi_data |= NDIS_TXCSUM_INFO_UDPCS;
1116 pktlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1117 /* Convert RNDIS packet message offsets */
1118 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
1119 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
1122 * Chimney send, if the packet fits into one chimney buffer.
1124 if (pkt->rm_len < txr->hn_chim_size) {
1125 txr->hn_tx_chimney_tried++;
1126 send_buf_section_idx = hn_chim_alloc(txr->hn_sc);
1127 if (send_buf_section_idx != HN_NVS_CHIM_IDX_INVALID) {
1128 uint8_t *dest = txr->hn_sc->hn_chim +
1129 (send_buf_section_idx * txr->hn_sc->hn_chim_szmax);
1131 memcpy(dest, pkt, pktlen);
1133 m_copydata(m_head, 0, m_head->m_pkthdr.len, dest);
1135 send_buf_section_size = pkt->rm_len;
1136 txr->hn_gpa_cnt = 0;
1137 txr->hn_tx_chimney++;
1142 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
1147 * This mbuf is not linked w/ the txd yet, so free it now.
1152 freed = hn_txdesc_put(txr, txd);
1154 ("fail to free txd upon txdma error"));
1156 txr->hn_txdma_failed++;
1157 if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1);
1162 /* +1 RNDIS packet message */
1163 txr->hn_gpa_cnt = nsegs + 1;
1165 /* send packet with page buffer */
1166 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
1167 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
1168 txr->hn_gpa[0].gpa_len = pktlen;
1171 * Fill the page buffers with mbuf info after the page
1172 * buffer for RNDIS packet message.
1174 for (i = 0; i < nsegs; ++i) {
1175 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
1177 gpa->gpa_page = atop(segs[i].ds_addr);
1178 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
1179 gpa->gpa_len = segs[i].ds_len;
1182 send_buf_section_idx = HN_NVS_CHIM_IDX_INVALID;
1183 send_buf_section_size = 0;
1187 /* Set the completion routine */
1188 hn_send_ctx_init(&txd->send_ctx, hn_tx_done, txd,
1189 send_buf_section_idx, send_buf_section_size);
1196 * If this function fails, then txd will be freed, but the mbuf
1197 * associated w/ the txd will _not_ be freed.
1200 hn_send_pkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
1202 int error, send_failed = 0;
1206 * Make sure that txd is not freed before ETHER_BPF_MTAP.
1208 hn_txdesc_hold(txd);
1209 error = hv_nv_on_send(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
1210 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt);
1212 ETHER_BPF_MTAP(ifp, txd->m);
1213 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
1214 if (!hn_use_if_start) {
1215 if_inc_counter(ifp, IFCOUNTER_OBYTES,
1216 txd->m->m_pkthdr.len);
1217 if (txd->m->m_flags & M_MCAST)
1218 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
1222 hn_txdesc_put(txr, txd);
1224 if (__predict_false(error)) {
1228 * This should happen only very rarely.
1230 * XXX Too many RX to be acked or too many sideband
1231 * commands to run? Ask netvsc_channel_rollup()
1232 * to kick start later.
1234 txr->hn_has_txeof = 1;
1236 txr->hn_send_failed++;
1239 * Try sending again after setting hn_has_txeof,
1240 * in case we missed the last
1241 * netvsc_channel_rollup().
1245 if_printf(ifp, "send failed\n");
1248 * Caller will perform further processing on the
1249 * associated mbuf, so don't free it in hn_txdesc_put();
1250 * only unload it from the DMA map in hn_txdesc_put(),
1254 freed = hn_txdesc_put(txr, txd);
1256 ("fail to free txd upon send error"));
1258 txr->hn_send_failed++;
1264 * Start a transmit of one or more packets
1267 hn_start_locked(struct hn_tx_ring *txr, int len)
1269 struct hn_softc *sc = txr->hn_sc;
1270 struct ifnet *ifp = sc->hn_ifp;
1272 KASSERT(hn_use_if_start,
1273 ("hn_start_locked is called, when if_start is disabled"));
1274 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
1275 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
1277 if (__predict_false(txr->hn_suspended))
1280 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
1284 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
1285 struct hn_txdesc *txd;
1286 struct mbuf *m_head;
1289 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
1293 if (len > 0 && m_head->m_pkthdr.len > len) {
1295 * This send could be time-consuming; let callers
1296 * dispatch the sending of this packet (and of any
1297 * follow-up packets) to the TX taskqueue.
1299 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
1303 txd = hn_txdesc_get(txr);
1305 txr->hn_no_txdescs++;
1306 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
1307 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1311 error = hn_encap(txr, txd, &m_head);
1313 /* Both txd and m_head are freed */
1317 error = hn_send_pkt(ifp, txr, txd);
1318 if (__predict_false(error)) {
1319 /* txd is freed, but m_head is not */
1320 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
1321 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1329 * Link up/down notification
1332 netvsc_linkstatus_callback(struct hn_softc *sc, uint32_t status)
1342 * Append the specified data to the indicated mbuf chain.
1343 * Extend the mbuf chain if the new data does not fit in the existing space.
1346 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
1347 * There should be an equivalent in the kernel mbuf code,
1348 * but there does not appear to be one yet.
1350 * Differs from m_append() in that additional mbufs are
1351 * allocated with cluster size MJUMPAGESIZE, and filled
1354 * Return 1 if able to complete the job; otherwise 0.
1357 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
1360 int remainder, space;
1362 for (m = m0; m->m_next != NULL; m = m->m_next)
1365 space = M_TRAILINGSPACE(m);
1368 * Copy into available space.
1370 if (space > remainder)
1372 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
1377 while (remainder > 0) {
1379 * Allocate a new mbuf; could check space
1380 * and allocate a cluster instead.
1382 n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
1385 n->m_len = min(MJUMPAGESIZE, remainder);
1386 bcopy(cp, mtod(n, caddr_t), n->m_len);
1388 remainder -= n->m_len;
1392 if (m0->m_flags & M_PKTHDR)
1393 m0->m_pkthdr.len += len - remainder;
1395 return (remainder == 0);
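/*
 * Usage sketch (hypothetical sizes): given a freshly allocated
 * MJUMPAGESIZE cluster mbuf and a 6000-byte RX payload, the first
 * ~4096 bytes fill the existing cluster and hv_m_append() chains one
 * more jumbo-page cluster for the remaining ~1904; it returns 1 on
 * success, or 0 if a cluster allocation fails partway, with
 * m_pkthdr.len reflecting only the bytes actually copied.
 */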
1398 #if defined(INET) || defined(INET6)
1400 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
1402 #if __FreeBSD_version >= 1100095
1403 if (hn_lro_mbufq_depth) {
1404 tcp_lro_queue_mbuf(lc, m);
1408 return tcp_lro_rx(lc, m, 0);
1413 * Called when we receive a data packet from the "wire" on the
1416 * Note: This is no longer used as a callback
1419 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
1420 const struct hn_recvinfo *info)
1422 struct ifnet *ifp = rxr->hn_ifp;
1424 int size, do_lro = 0, do_csum = 1;
1425 int hash_type = M_HASHTYPE_OPAQUE;
1427 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
1431 * Bail out if the packet contains more data than the configured MTU allows.
1433 if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
1435 } else if (dlen <= MHLEN) {
1436 m_new = m_gethdr(M_NOWAIT, MT_DATA);
1437 if (m_new == NULL) {
1438 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
1441 memcpy(mtod(m_new, void *), data, dlen);
1442 m_new->m_pkthdr.len = m_new->m_len = dlen;
1443 rxr->hn_small_pkts++;
1446 * Get an mbuf with a cluster. For packets 2K or less,
1447 * get a standard 2K cluster. For anything larger, get a
1448 * 4K cluster. Any buffers larger than 4K can cause problems
1449 * if looped around to the Hyper-V TX channel, so avoid them.
1452 if (dlen > MCLBYTES) {
1454 size = MJUMPAGESIZE;
1457 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
1458 if (m_new == NULL) {
1459 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
1463 hv_m_append(m_new, dlen, data);
1465 m_new->m_pkthdr.rcvif = ifp;
1467 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
1470 /* receive side checksum offload */
1471 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
1472 /* IP csum offload */
1473 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
1474 m_new->m_pkthdr.csum_flags |=
1475 (CSUM_IP_CHECKED | CSUM_IP_VALID);
1479 /* TCP/UDP csum offload */
1480 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
1481 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
1482 m_new->m_pkthdr.csum_flags |=
1483 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1484 m_new->m_pkthdr.csum_data = 0xffff;
1485 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
1491 if ((info->csum_info &
1492 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
1493 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
1496 const struct ether_header *eh;
1501 if (m_new->m_len < hoff)
1503 eh = mtod(m_new, struct ether_header *);
1504 etype = ntohs(eh->ether_type);
1505 if (etype == ETHERTYPE_VLAN) {
1506 const struct ether_vlan_header *evl;
1508 hoff = sizeof(*evl);
1509 if (m_new->m_len < hoff)
1511 evl = mtod(m_new, struct ether_vlan_header *);
1512 etype = ntohs(evl->evl_proto);
1515 if (etype == ETHERTYPE_IP) {
1518 pr = hn_check_iplen(m_new, hoff);
1519 if (pr == IPPROTO_TCP) {
1521 (rxr->hn_trust_hcsum &
1522 HN_TRUST_HCSUM_TCP)) {
1523 rxr->hn_csum_trusted++;
1524 m_new->m_pkthdr.csum_flags |=
1525 (CSUM_IP_CHECKED | CSUM_IP_VALID |
1526 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1527 m_new->m_pkthdr.csum_data = 0xffff;
1530 } else if (pr == IPPROTO_UDP) {
1532 (rxr->hn_trust_hcsum &
1533 HN_TRUST_HCSUM_UDP)) {
1534 rxr->hn_csum_trusted++;
1535 m_new->m_pkthdr.csum_flags |=
1536 (CSUM_IP_CHECKED | CSUM_IP_VALID |
1537 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1538 m_new->m_pkthdr.csum_data = 0xffff;
1540 } else if (pr != IPPROTO_DONE && do_csum &&
1541 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
1542 rxr->hn_csum_trusted++;
1543 m_new->m_pkthdr.csum_flags |=
1544 (CSUM_IP_CHECKED | CSUM_IP_VALID);
1549 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
1550 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
1551 NDIS_VLAN_INFO_ID(info->vlan_info),
1552 NDIS_VLAN_INFO_PRI(info->vlan_info),
1553 NDIS_VLAN_INFO_CFI(info->vlan_info));
1554 m_new->m_flags |= M_VLANTAG;
1557 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
1559 m_new->m_pkthdr.flowid = info->hash_value;
1560 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
1561 NDIS_HASH_FUNCTION_TOEPLITZ) {
1562 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
1565 case NDIS_HASH_IPV4:
1566 hash_type = M_HASHTYPE_RSS_IPV4;
1569 case NDIS_HASH_TCP_IPV4:
1570 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
1573 case NDIS_HASH_IPV6:
1574 hash_type = M_HASHTYPE_RSS_IPV6;
1577 case NDIS_HASH_IPV6_EX:
1578 hash_type = M_HASHTYPE_RSS_IPV6_EX;
1581 case NDIS_HASH_TCP_IPV6:
1582 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
1585 case NDIS_HASH_TCP_IPV6_EX:
1586 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
1591 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
1593 M_HASHTYPE_SET(m_new, hash_type);
1596 * Note: Moved RX completion back to hv_nv_on_receive() so all
1597 * messages (not just data messages) will trigger a response.
1603 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
1604 #if defined(INET) || defined(INET6)
1605 struct lro_ctrl *lro = &rxr->hn_lro;
1608 rxr->hn_lro_tried++;
1609 if (hn_lro_rx(lro, m_new) == 0) {
1617 /* We're not holding the lock here, so don't release it */
1618 (*ifp->if_input)(ifp, m_new);
1624 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1626 struct hn_softc *sc = ifp->if_softc;
1627 struct ifreq *ifr = (struct ifreq *)data;
1628 int mask, error = 0;
1632 if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) {
1639 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
1640 /* Can't change MTU */
1646 if (ifp->if_mtu == ifr->ifr_mtu) {
1651 /* Obtain and record requested MTU */
1652 ifp->if_mtu = ifr->ifr_mtu;
1654 #if __FreeBSD_version >= 1100099
1656 * Make sure that the LRO aggregation length limit is still
1657 * valid after the MTU change.
1659 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
1660 HN_LRO_LENLIM_MIN(ifp))
1661 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1664 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1668 * Detach the synthetic parts, i.e. NVS and RNDIS.
1670 hn_synth_detach(sc);
1673 * Reattach the synthetic parts, i.e. NVS and RNDIS,
1674 * with the new MTU setting.
1677 hn_synth_attach(sc, ifr->ifr_mtu);
1679 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
1680 hn_set_chim_size(sc, sc->hn_chim_szmax);
1682 /* All done! Resume now. */
1683 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1692 if (ifp->if_flags & IFF_UP) {
1694 * If only the state of the PROMISC flag changed,
1695 * then just use the 'set promisc mode' command
1696 * instead of reinitializing the entire NIC. Doing
1697 * a full re-init means reloading the firmware and
1698 * waiting for it to start up, which may take a
1702 /* Fixme: Promiscuous mode? */
1703 if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
1704 ifp->if_flags & IFF_PROMISC &&
1705 !(sc->hn_if_flags & IFF_PROMISC)) {
1706 /* do something here for Hyper-V */
1707 } else if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
1708 !(ifp->if_flags & IFF_PROMISC) &&
1709 sc->hn_if_flags & IFF_PROMISC) {
1710 /* do something here for Hyper-V */
1715 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1719 sc->hn_if_flags = ifp->if_flags;
1726 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
1728 if (mask & IFCAP_TXCSUM) {
1729 ifp->if_capenable ^= IFCAP_TXCSUM;
1730 if (ifp->if_capenable & IFCAP_TXCSUM)
1731 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
1733 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
1735 if (mask & IFCAP_TXCSUM_IPV6) {
1736 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
1737 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1738 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
1740 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
1743 /* TODO: flip RNDIS offload parameters for RXCSUM. */
1744 if (mask & IFCAP_RXCSUM)
1745 ifp->if_capenable ^= IFCAP_RXCSUM;
1747 /* We can't tell IPv6 packets from IPv4 packets on the RX path. */
1748 if (mask & IFCAP_RXCSUM_IPV6)
1749 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
1752 if (mask & IFCAP_LRO)
1753 ifp->if_capenable ^= IFCAP_LRO;
1755 if (mask & IFCAP_TSO4) {
1756 ifp->if_capenable ^= IFCAP_TSO4;
1757 if (ifp->if_capenable & IFCAP_TSO4)
1758 ifp->if_hwassist |= CSUM_IP_TSO;
1760 ifp->if_hwassist &= ~CSUM_IP_TSO;
1762 if (mask & IFCAP_TSO6) {
1763 ifp->if_capenable ^= IFCAP_TSO6;
1764 if (ifp->if_capenable & IFCAP_TSO6)
1765 ifp->if_hwassist |= CSUM_IP6_TSO;
1767 ifp->if_hwassist &= ~CSUM_IP6_TSO;
1775 /* Always all-multi */
1778 * Enable/disable all-multi according to the emptiness of
1779 * the mcast address list.
1785 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
1789 error = ether_ioctl(ifp, cmd, data);
1796 hn_stop(struct hn_softc *sc)
1798 struct ifnet *ifp = sc->hn_ifp;
1803 /* Clear RUNNING bit _before_ hn_suspend() */
1804 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
1807 /* Clear OACTIVE bit. */
1808 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1809 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
1810 sc->hn_tx_ring[i].hn_oactive = 0;
1814 * FreeBSD transmit entry point
1817 hn_start(struct ifnet *ifp)
1819 struct hn_softc *sc = ifp->if_softc;
1820 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
1822 if (txr->hn_sched_tx)
1825 if (mtx_trylock(&txr->hn_tx_lock)) {
1828 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
1829 mtx_unlock(&txr->hn_tx_lock);
1834 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
1838 hn_start_txeof(struct hn_tx_ring *txr)
1840 struct hn_softc *sc = txr->hn_sc;
1841 struct ifnet *ifp = sc->hn_ifp;
1843 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
1845 if (txr->hn_sched_tx)
1848 if (mtx_trylock(&txr->hn_tx_lock)) {
1851 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1852 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
1853 mtx_unlock(&txr->hn_tx_lock);
1855 taskqueue_enqueue(txr->hn_tx_taskq,
1861 * Release OACTIVE earlier, in the hope that
1862 * others can catch up. The task will clear the
1863 * flag again with the hn_tx_lock held to avoid possible
1866 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1867 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
1872 hn_init_locked(struct hn_softc *sc)
1874 struct ifnet *ifp = sc->hn_ifp;
1879 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1882 /* TODO: add hn_rx_filter */
1883 hn_rndis_set_rxfilter(sc, NDIS_PACKET_TYPE_PROMISCUOUS);
1885 /* Clear OACTIVE bit. */
1886 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1887 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
1888 sc->hn_tx_ring[i].hn_oactive = 0;
1890 /* Clear TX 'suspended' bit. */
1891 hn_tx_resume(sc, sc->hn_tx_ring_inuse);
1893 /* Everything is ready; unleash! */
1894 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
1896 /* TODO: check RNDIS link status. */
1897 if_link_state_change(ifp, LINK_STATE_UP);
1903 struct hn_softc *sc = xsc;
1915 hn_watchdog(struct ifnet *ifp)
1918 if_printf(ifp, "watchdog timeout -- resetting\n");
1919 hn_init(ifp->if_softc); /* XXX */
1924 #if __FreeBSD_version >= 1100099
1927 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
1929 struct hn_softc *sc = arg1;
1930 unsigned int lenlim;
1933 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
1934 error = sysctl_handle_int(oidp, &lenlim, 0, req);
1935 if (error || req->newptr == NULL)
1939 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
1940 lenlim > TCP_LRO_LENGTH_MAX) {
1944 hn_set_lro_lenlim(sc, lenlim);
1951 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
1953 struct hn_softc *sc = arg1;
1954 int ackcnt, error, i;
1957 * lro_ackcnt_lim is the append count limit;
1958 * +1 turns it into an aggregation limit.
1960 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
1961 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
1962 if (error || req->newptr == NULL)
1965 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
1969 * Convert the aggregation limit back to an append count limit.
1974 for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
1975 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
1983 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
1985 struct hn_softc *sc = arg1;
1990 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
1993 error = sysctl_handle_int(oidp, &on, 0, req);
1994 if (error || req->newptr == NULL)
1998 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
1999 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2002 rxr->hn_trust_hcsum |= hcsum;
2004 rxr->hn_trust_hcsum &= ~hcsum;
2011 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2013 struct hn_softc *sc = arg1;
2014 int chim_size, error;
2016 chim_size = sc->hn_tx_ring[0].hn_chim_size;
2017 error = sysctl_handle_int(oidp, &chim_size, 0, req);
2018 if (error || req->newptr == NULL)
2021 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2025 hn_set_chim_size(sc, chim_size);
2030 #if __FreeBSD_version < 1100095
2032 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2034 struct hn_softc *sc = arg1;
2035 int ofs = arg2, i, error;
2036 struct hn_rx_ring *rxr;
2040 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2041 rxr = &sc->hn_rx_ring[i];
2042 stat += *((int *)((uint8_t *)rxr + ofs));
2045 error = sysctl_handle_64(oidp, &stat, 0, req);
2046 if (error || req->newptr == NULL)
2049 /* Zero out this stat. */
2050 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2051 rxr = &sc->hn_rx_ring[i];
2052 *((int *)((uint8_t *)rxr + ofs)) = 0;
2058 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2060 struct hn_softc *sc = arg1;
2061 int ofs = arg2, i, error;
2062 struct hn_rx_ring *rxr;
2066 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2067 rxr = &sc->hn_rx_ring[i];
2068 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2071 error = sysctl_handle_64(oidp, &stat, 0, req);
2072 if (error || req->newptr == NULL)
2075 /* Zero out this stat. */
2076 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2077 rxr = &sc->hn_rx_ring[i];
2078 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2086 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2088 struct hn_softc *sc = arg1;
2089 int ofs = arg2, i, error;
2090 struct hn_rx_ring *rxr;
2094 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2095 rxr = &sc->hn_rx_ring[i];
2096 stat += *((u_long *)((uint8_t *)rxr + ofs));
2099 error = sysctl_handle_long(oidp, &stat, 0, req);
2100 if (error || req->newptr == NULL)
2103 /* Zero out this stat. */
2104 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2105 rxr = &sc->hn_rx_ring[i];
2106 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
2112 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2114 struct hn_softc *sc = arg1;
2115 int ofs = arg2, i, error;
2116 struct hn_tx_ring *txr;
2120 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2121 txr = &sc->hn_tx_ring[i];
2122 stat += *((u_long *)((uint8_t *)txr + ofs));
2125 error = sysctl_handle_long(oidp, &stat, 0, req);
2126 if (error || req->newptr == NULL)
2129 /* Zero out this stat. */
2130 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2131 txr = &sc->hn_tx_ring[i];
2132 *((u_long *)((uint8_t *)txr + ofs)) = 0;
2138 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2140 struct hn_softc *sc = arg1;
2141 int ofs = arg2, i, error, conf;
2142 struct hn_tx_ring *txr;
2144 txr = &sc->hn_tx_ring[0];
2145 conf = *((int *)((uint8_t *)txr + ofs));
2147 error = sysctl_handle_int(oidp, &conf, 0, req);
2148 if (error || req->newptr == NULL)
2152 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2153 txr = &sc->hn_tx_ring[i];
2154 *((int *)((uint8_t *)txr + ofs)) = conf;
2162 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2164 struct hn_softc *sc = arg1;
2167 snprintf(verstr, sizeof(verstr), "%u.%u",
2168 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2169 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2170 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2174 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2176 struct hn_softc *sc = arg1;
2183 snprintf(caps_str, sizeof(caps_str), "%b", caps,
2194 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2198 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2200 struct hn_softc *sc = arg1;
2201 char assist_str[128];
2205 hwassist = sc->hn_ifp->if_hwassist;
2207 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2208 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2212 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2214 struct hn_softc *sc = arg1;
2219 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2220 if (error || req->newptr == NULL)
2223 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2226 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
2228 if (sc->hn_rx_ring_inuse > 1) {
2229 error = hn_rss_reconfig(sc);
2231 /* Not RSS capable, at least for now; just save the RSS key. */
2240 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
2242 struct hn_softc *sc = arg1;
2247 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2248 if (error || req->newptr == NULL)
2252 * Don't allow RSS indirect table changes, if this interface is not
2253 * currently RSS capable.
2255 if (sc->hn_rx_ring_inuse == 1) {
2260 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2263 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
2265 hn_rss_ind_fixup(sc, sc->hn_rx_ring_inuse);
2266 error = hn_rss_reconfig(sc);
2273 hn_check_iplen(const struct mbuf *m, int hoff)
2275 const struct ip *ip;
2276 int len, iphlen, iplen;
2277 const struct tcphdr *th;
2278 int thoff; /* TCP data offset */
2280 len = hoff + sizeof(struct ip);
2282 /* The packet must be at least the size of an IP header. */
2283 if (m->m_pkthdr.len < len)
2284 return IPPROTO_DONE;
2286 /* The fixed IP header must reside completely in the first mbuf. */
2288 return IPPROTO_DONE;
2290 ip = mtodo(m, hoff);
2292 /* Bound check the packet's stated IP header length. */
2293 iphlen = ip->ip_hl << 2;
2294 if (iphlen < sizeof(struct ip)) /* minimum header length */
2295 return IPPROTO_DONE;
2297 /* The full IP header must reside completely in one mbuf. */
2298 if (m->m_len < hoff + iphlen)
2299 return IPPROTO_DONE;
2301 iplen = ntohs(ip->ip_len);
2304 * Check that the amount of data in the buffers is at
2305 * least as much as the IP header would have us expect.
2307 if (m->m_pkthdr.len < hoff + iplen)
2308 return IPPROTO_DONE;
2311 * Ignore IP fragments.
2313 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
2314 return IPPROTO_DONE;
2317 * The TCP/IP or UDP/IP header must be entirely contained within
2318 * the first fragment of a packet.
2322 if (iplen < iphlen + sizeof(struct tcphdr))
2323 return IPPROTO_DONE;
2324 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
2325 return IPPROTO_DONE;
2326 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
2327 thoff = th->th_off << 2;
2328 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
2329 return IPPROTO_DONE;
2330 if (m->m_len < hoff + iphlen + thoff)
2331 return IPPROTO_DONE;
2334 if (iplen < iphlen + sizeof(struct udphdr))
2335 return IPPROTO_DONE;
2336 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
2337 return IPPROTO_DONE;
2341 return IPPROTO_DONE;
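/*
 * Summary of the contract above: hn_check_iplen() returns the packet's
 * IP protocol number (e.g. IPPROTO_TCP or IPPROTO_UDP) when its sanity
 * checks pass, and IPPROTO_DONE otherwise; for TCP and UDP, passing
 * means the full L3 and L4 headers are present, unfragmented, and
 * contained in the first mbuf.  The RX path uses this to decide
 * whether a host-verified checksum may be trusted when a packet
 * arrives without checksum info.
 */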
2348 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
2350 struct sysctl_oid_list *child;
2351 struct sysctl_ctx_list *ctx;
2352 device_t dev = sc->hn_dev;
2353 #if defined(INET) || defined(INET6)
2354 #if __FreeBSD_version >= 1100095
2361 * Create RXBUF for reception.
2364 * - It is shared by all channels.
2365 * - A large enough buffer is allocated; certain versions of the NVS
2366 * may further limit the usable space.
2368 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
2369 PAGE_SIZE, 0, NETVSC_RECEIVE_BUFFER_SIZE, &sc->hn_rxbuf_dma,
2370 BUS_DMA_WAITOK | BUS_DMA_ZERO);
2371 if (sc->hn_rxbuf == NULL) {
2372 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
2376 sc->hn_rx_ring_cnt = ring_cnt;
2377 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
2379 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
2380 M_NETVSC, M_WAITOK | M_ZERO);
2382 #if defined(INET) || defined(INET6)
2383 #if __FreeBSD_version >= 1100095
2384 lroent_cnt = hn_lro_entry_count;
2385 if (lroent_cnt < TCP_LRO_ENTRIES)
2386 lroent_cnt = TCP_LRO_ENTRIES;
2388 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
2390 #endif /* INET || INET6 */
2392 ctx = device_get_sysctl_ctx(dev);
2393 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2395 /* Create dev.hn.UNIT.rx sysctl tree */
2396 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
2397 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];

		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
		    PAGE_SIZE, 0,
		    NETVSC_DEVICE_RING_BUFFER_SIZE +
		    NETVSC_DEVICE_RING_BUFFER_SIZE,
		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
		if (rxr->hn_br == NULL) {
			device_printf(dev, "allocate bufring failed\n");
			return (ENOMEM);
		}

		if (hn_trust_hosttcp)
			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
		if (hn_trust_hostudp)
			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
		if (hn_trust_hostip)
			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
		rxr->hn_ifp = sc->hn_ifp;
		if (i < sc->hn_tx_ring_cnt)
			rxr->hn_txr = &sc->hn_tx_ring[i];
		rxr->hn_rdbuf = malloc(NETVSC_PACKET_SIZE, M_NETVSC, M_WAITOK);
		rxr->hn_rxbuf = sc->hn_rxbuf;

		/*
		 * Initialize LRO.
		 */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
		    hn_lro_mbufq_depth);
#else
		tcp_lro_init(&rxr->hn_lro);
		rxr->hn_lro.ifp = sc->hn_ifp;
#endif
#if __FreeBSD_version >= 1100099
		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
#endif
#endif	/* INET || INET6 */
		if (sc->hn_rx_sysctl_tree != NULL) {
			char name[16];

			/*
			 * Create per RX ring sysctl tree:
			 * dev.hn.UNIT.rx.RINGID
			 */
			snprintf(name, sizeof(name), "%d", i);
			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

			if (rxr->hn_rx_sysctl_tree != NULL) {
				SYSCTL_ADD_ULONG(ctx,
				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
				    OID_AUTO, "packets", CTLFLAG_RW,
				    &rxr->hn_pkts, "# of packets received");
				SYSCTL_ADD_ULONG(ctx,
				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
				    &rxr->hn_rss_pkts,
				    "# of packets w/ RSS info received");
			}
		}
	}
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
#if __FreeBSD_version < 1100095
	    hn_rx_stat_int_sysctl,
#else
	    hn_rx_stat_u64_sysctl,
#endif
	    "LU", "LRO queued");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
#if __FreeBSD_version < 1100095
	    hn_rx_stat_int_sysctl,
#else
	    hn_rx_stat_u64_sysctl,
#endif
	    "LU", "LRO flushed");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_lro_tried),
	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
#if __FreeBSD_version >= 1100099
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_lro_lenlim_sysctl, "IU",
	    "Max # of data bytes to be aggregated by LRO");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_lro_ackcnt_sysctl, "I",
	    "Max # of ACKs to be aggregated by LRO");
#endif
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
	    hn_trust_hcsum_sysctl, "I",
	    "Trust tcp segment verification on host side, "
	    "when csum info is missing");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
	    hn_trust_hcsum_sysctl, "I",
	    "Trust udp datagram verification on host side, "
	    "when csum info is missing");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
	    hn_trust_hcsum_sysctl, "I",
	    "Trust ip packet verification on host side, "
	    "when csum info is missing");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_csum_ip),
	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_csum_udp),
	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
	    hn_rx_stat_ulong_sysctl, "LU",
	    "# of packets that we trust host's csum verification");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_small_pkts),
	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");

	return (0);
}
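
/*
 * Tear down everything hn_create_rx_data() set up: the shared RXBUF,
 * each ring's bufring, LRO state and read buffer, and finally the RX
 * ring array itself.
 */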
static void
hn_destroy_rx_data(struct hn_softc *sc)
{
	int i;

	if (sc->hn_rxbuf != NULL) {
		hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
		sc->hn_rxbuf = NULL;
	}

	if (sc->hn_rx_ring_cnt == 0)
		return;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];

		if (rxr->hn_br == NULL)
			continue;
		hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
		rxr->hn_br = NULL;

#if defined(INET) || defined(INET6)
		tcp_lro_free(&rxr->hn_lro);
#endif
		free(rxr->hn_rdbuf, M_NETVSC);
	}
	free(sc->hn_rx_ring, M_NETVSC);
	sc->hn_rx_ring = NULL;

	sc->hn_rx_ring_cnt = 0;
	sc->hn_rx_ring_inuse = 0;
}
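
/*
 * Set up a single TX ring: its locks, the TX descriptor pool, the
 * if_start or if_transmit task hooks, the DMA tags/maps for RNDIS
 * messages and packet data, and the per-ring dev.hn.UNIT.tx.RINGID
 * sysctl tree.
 */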
static int
hn_create_tx_ring(struct hn_softc *sc, int id)
{
	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
	device_t dev = sc->hn_dev;
	bus_dma_tag_t parent_dtag;
	int error, i;

	txr->hn_sc = sc;
	txr->hn_tx_idx = id;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
#endif
	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);

	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
	    M_NETVSC, M_WAITOK | M_ZERO);
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_INIT(&txr->hn_txlist);
#else
	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_NETVSC,
	    M_WAITOK, &txr->hn_tx_lock);
#endif

	txr->hn_tx_taskq = sc->hn_tx_taskq;

	if (hn_use_if_start) {
		txr->hn_txeof = hn_start_txeof;
		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
	} else {
		int br_depth;

		txr->hn_txeof = hn_xmit_txeof;
		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);

		br_depth = hn_get_txswq_depth(txr);
		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_NETVSC,
		    M_WAITOK, &txr->hn_tx_lock);
	}

	txr->hn_direct_tx_size = hn_direct_tx_size;

	/*
	 * Always schedule transmission instead of trying to do direct
	 * transmission.  This one gives the best performance so far.
	 */
	txr->hn_sched_tx = 1;
	parent_dtag = bus_get_dma_tag(dev);

	/* DMA tag for RNDIS packet messages. */
	error = bus_dma_tag_create(parent_dtag, /* parent */
	    HN_RNDIS_PKT_ALIGN,		/* alignment */
	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    HN_RNDIS_PKT_LEN,		/* maxsize */
	    1,				/* nsegments */
	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockfuncarg */
	    &txr->hn_tx_rndis_dtag);
	if (error) {
		device_printf(dev, "failed to create rndis dmatag\n");
		return error;
	}
	/* DMA tag for data. */
	error = bus_dma_tag_create(parent_dtag, /* parent */
	    1,				/* alignment */
	    HN_TX_DATA_BOUNDARY,	/* boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    HN_TX_DATA_MAXSIZE,		/* maxsize */
	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockfuncarg */
	    &txr->hn_tx_data_dtag);
	if (error) {
		device_printf(dev, "failed to create data dmatag\n");
		return error;
	}
	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
		struct hn_txdesc *txd = &txr->hn_txdesc[i];

		txd->txr = txr;

		/*
		 * Allocate and load RNDIS packet message.
		 */
		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
		    (void **)&txd->rndis_pkt,
		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
		    &txd->rndis_pkt_dmap);
		if (error) {
			device_printf(dev,
			    "failed to allocate rndis_packet_msg, %d\n", i);
			return error;
		}

		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
		    txd->rndis_pkt_dmap,
		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
		    BUS_DMA_NOWAIT);
		if (error) {
			device_printf(dev,
			    "failed to load rndis_packet_msg, %d\n", i);
			bus_dmamem_free(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt, txd->rndis_pkt_dmap);
			return error;
		}

		/* DMA map for TX data. */
		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
		    &txd->data_dmap);
		if (error) {
			device_printf(dev,
			    "failed to allocate tx data dmamap\n");
			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt_dmap);
			bus_dmamem_free(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt, txd->rndis_pkt_dmap);
			return error;
		}

		/* All set, put it to list */
		txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
#else
		buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif
	}
	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
	if (sc->hn_tx_sysctl_tree != NULL) {
		struct sysctl_oid_list *child;
		struct sysctl_ctx_list *ctx;
		char name[16];

		/*
		 * Create per TX ring sysctl tree:
		 * dev.hn.UNIT.tx.RINGID
		 */
		ctx = device_get_sysctl_ctx(dev);
		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);

		snprintf(name, sizeof(name), "%d", id);
		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

		if (txr->hn_tx_sysctl_tree != NULL) {
			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);

			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
			    "# of available TX descs");
			if (!hn_use_if_start) {
				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
				    CTLFLAG_RD, &txr->hn_oactive, 0,
				    "over active");
			}
			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
			    CTLFLAG_RW, &txr->hn_pkts,
			    "# of packets transmitted");
		}
	}

	return 0;
}
static void
hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
{
	struct hn_tx_ring *txr = txd->txr;

	KASSERT(txd->m == NULL, ("still has mbuf installed"));
	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));

	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
	    txd->rndis_pkt_dmap);
	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
}
static void
hn_destroy_tx_ring(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

	if (txr->hn_txdesc == NULL)
		return;

#ifndef HN_USE_TXDESC_BUFRING
	while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) {
		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
		hn_txdesc_dmamap_destroy(txd);
	}
#else
	mtx_lock(&txr->hn_tx_lock);
	while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL)
		hn_txdesc_dmamap_destroy(txd);
	mtx_unlock(&txr->hn_tx_lock);
#endif

	if (txr->hn_tx_data_dtag != NULL)
		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
	if (txr->hn_tx_rndis_dtag != NULL)
		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);

#ifdef HN_USE_TXDESC_BUFRING
	buf_ring_free(txr->hn_txdesc_br, M_NETVSC);
#endif

	free(txr->hn_txdesc, M_NETVSC);
	txr->hn_txdesc = NULL;

	if (txr->hn_mbuf_br != NULL)
		buf_ring_free(txr->hn_mbuf_br, M_NETVSC);

#ifndef HN_USE_TXDESC_BUFRING
	mtx_destroy(&txr->hn_txlist_spin);
#endif
	mtx_destroy(&txr->hn_tx_lock);
}
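
/*
 * Allocate the TX resources for the whole device: the chimney send
 * buffer (TXBUF) shared by all channels, all TX rings, and the
 * dev.hn.UNIT.tx sysctl tree.
 */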
static int
hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
{
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	int i;

	/*
	 * Create TXBUF for chimney sending.
	 *
	 * NOTE: It is shared by all channels.
	 */
	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
	    PAGE_SIZE, 0, NETVSC_SEND_BUFFER_SIZE, &sc->hn_chim_dma,
	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
	if (sc->hn_chim == NULL) {
		device_printf(sc->hn_dev, "allocate txbuf failed\n");
		return (ENOMEM);
	}

	sc->hn_tx_ring_cnt = ring_cnt;
	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;

	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
	    M_NETVSC, M_WAITOK | M_ZERO);

	ctx = device_get_sysctl_ctx(sc->hn_dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));

	/* Create dev.hn.UNIT.tx sysctl tree */
	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		int error;

		error = hn_create_tx_ring(sc, i);
		if (error)
			return error;
	}
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_send_failed),
	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
	    "# of total TX descs");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
	    "Chimney send packet size upper boundary");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
	    hn_tx_conf_int_sysctl, "I",
	    "Size of the packet for direct transmission");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_sched_tx),
	    hn_tx_conf_int_sysctl, "I",
	    "Always schedule transmission "
	    "instead of doing direct transmission");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");

	return 0;
}
static void
hn_set_chim_size(struct hn_softc *sc, int chim_size)
{
	int i;

	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
		sc->hn_tx_ring[i].hn_chim_size = chim_size;
}
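
/*
 * Propagate the negotiated limits to the TX rings: clamp the chimney
 * send size and turn the host capability bits into the CSUM_* assist
 * mask used for checksum offload on each ring.
 */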
static void
hn_fixup_tx_data(struct hn_softc *sc)
{
	uint64_t csum_assist;
	int i;

	hn_set_chim_size(sc, sc->hn_chim_szmax);
	if (hn_tx_chimney_size > 0 &&
	    hn_tx_chimney_size < sc->hn_chim_szmax)
		hn_set_chim_size(sc, hn_tx_chimney_size);

	csum_assist = 0;
	if (sc->hn_caps & HN_CAP_IPCS)
		csum_assist |= CSUM_IP;
	if (sc->hn_caps & HN_CAP_TCP4CS)
		csum_assist |= CSUM_IP_TCP;
	if (sc->hn_caps & HN_CAP_UDP4CS)
		csum_assist |= CSUM_IP_UDP;
#ifdef notyet
	if (sc->hn_caps & HN_CAP_TCP6CS)
		csum_assist |= CSUM_IP6_TCP;
	if (sc->hn_caps & HN_CAP_UDP6CS)
		csum_assist |= CSUM_IP6_UDP;
#endif

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
}
static void
hn_destroy_tx_data(struct hn_softc *sc)
{
	int i;

	if (sc->hn_chim != NULL) {
		hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
		sc->hn_chim = NULL;
	}

	if (sc->hn_tx_ring_cnt == 0)
		return;

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
		hn_destroy_tx_ring(&sc->hn_tx_ring[i]);

	free(sc->hn_tx_ring, M_NETVSC);
	sc->hn_tx_ring = NULL;

	sc->hn_tx_ring_cnt = 0;
	sc->hn_tx_ring_inuse = 0;
}
static void
hn_start_taskfunc(void *xtxr, int pending __unused)
{
	struct hn_tx_ring *txr = xtxr;

	mtx_lock(&txr->hn_tx_lock);
	hn_start_locked(txr, 0);
	mtx_unlock(&txr->hn_tx_lock);
}

static void
hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
{
	struct hn_tx_ring *txr = xtxr;

	mtx_lock(&txr->hn_tx_lock);
	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
	hn_start_locked(txr, 0);
	mtx_unlock(&txr->hn_tx_lock);
}
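
/*
 * Drain the ring's software queue (drbr) and hand the packets to the
 * host.  A 'len' > 0 caps the packet size sent from this context; a
 * larger packet is put back and 1 is returned, telling the caller to
 * defer the rest of the queue to the TX taskqueue.  Returns 0 once
 * the queue is drained or the ring becomes "oactive".
 */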
static int
hn_xmit(struct hn_tx_ring *txr, int len)
{
	struct hn_softc *sc = txr->hn_sc;
	struct ifnet *ifp = sc->hn_ifp;
	struct mbuf *m_head;

	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
	KASSERT(hn_use_if_start == 0,
	    ("hn_xmit is called, when if_start is enabled"));

	if (__predict_false(txr->hn_suspended))
		return 0;

	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
		return 0;

	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
		struct hn_txdesc *txd;
		int error;

		if (len > 0 && m_head->m_pkthdr.len > len) {
			/*
			 * This sending could be time consuming; let callers
			 * dispatch this packet sending (and sending of any
			 * follow-up packets) to the tx taskqueue.
			 */
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			return 1;
		}

		txd = hn_txdesc_get(txr);
		if (txd == NULL) {
			txr->hn_no_txdescs++;
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			txr->hn_oactive = 1;
			break;
		}

		error = hn_encap(txr, txd, &m_head);
		if (error) {
			/* Both txd and m_head are freed; discard */
			drbr_advance(ifp, txr->hn_mbuf_br);
			continue;
		}

		error = hn_send_pkt(ifp, txr, txd);
		if (__predict_false(error)) {
			/* txd is freed, but m_head is not */
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			txr->hn_oactive = 1;
			break;
		}

		/* Sent */
		drbr_advance(ifp, txr->hn_mbuf_br);
	}
	return 0;
}
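
/*
 * if_transmit method.
 *
 * The TX ring is selected by a modulo of the mbuf's RSS flowid; e.g.
 * with 4 rings in use, flowid 6 maps to ring 6 % 4 == 2, so packets
 * of one flow always share one ring.  The packet is enqueued to that
 * ring's drbr and either sent directly, when the ring lock can be
 * taken and direct transmission is configured, or from the TX task.
 */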
static int
hn_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct hn_softc *sc = ifp->if_softc;
	struct hn_tx_ring *txr;
	int error, idx = 0;

	/*
	 * Select the TX ring based on flowid
	 */
	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
		idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
	txr = &sc->hn_tx_ring[idx];

	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
	if (error) {
		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
		return error;
	}

	if (txr->hn_oactive)
		return 0;

	if (txr->hn_sched_tx)
		goto do_sched;

	if (mtx_trylock(&txr->hn_tx_lock)) {
		int sched;

		sched = hn_xmit(txr, txr->hn_direct_tx_size);
		mtx_unlock(&txr->hn_tx_lock);
		if (!sched)
			return 0;
	}
do_sched:
	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
	return 0;
}
static void
hn_tx_ring_qflush(struct hn_tx_ring *txr)
{
	struct mbuf *m;

	mtx_lock(&txr->hn_tx_lock);
	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
		m_freem(m);
	mtx_unlock(&txr->hn_tx_lock);
}

static void
hn_xmit_qflush(struct ifnet *ifp)
{
	struct hn_softc *sc = ifp->if_softc;
	int i;

	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
	if_qflush(ifp);
}
static void
hn_xmit_txeof(struct hn_tx_ring *txr)
{

	if (txr->hn_sched_tx)
		goto do_sched;

	if (mtx_trylock(&txr->hn_tx_lock)) {
		int sched;

		txr->hn_oactive = 0;
		sched = hn_xmit(txr, txr->hn_direct_tx_size);
		mtx_unlock(&txr->hn_tx_lock);
		if (sched) {
			taskqueue_enqueue(txr->hn_tx_taskq,
			    &txr->hn_tx_task);
		}
	} else {
do_sched:
		/*
		 * Release oactive early, in the hope that others can
		 * catch up.  The task will clear oactive again, with
		 * the hn_tx_lock held, to avoid possible races.
		 */
		txr->hn_oactive = 0;
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}
static void
hn_xmit_taskfunc(void *xtxr, int pending __unused)
{
	struct hn_tx_ring *txr = xtxr;

	mtx_lock(&txr->hn_tx_lock);
	hn_xmit(txr, 0);
	mtx_unlock(&txr->hn_tx_lock);
}

static void
hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
{
	struct hn_tx_ring *txr = xtxr;

	mtx_lock(&txr->hn_tx_lock);
	txr->hn_oactive = 0;
	hn_xmit(txr, 0);
	mtx_unlock(&txr->hn_tx_lock);
}
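
/*
 * Associate a VMBUS (sub-)channel with its RX/TX ring pair, bind the
 * channel to a CPU, and open it on the ring's bufring memory;
 * hn_chan_callback() will then run in this channel's interrupt
 * context.
 */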
static int
hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
{
	struct vmbus_chan_br cbr;
	struct hn_rx_ring *rxr;
	struct hn_tx_ring *txr = NULL;
	int idx, error;

	idx = vmbus_chan_subidx(chan);

	/*
	 * Link this channel to RX/TX ring.
	 */
	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
	    ("invalid channel index %d, should be >= 0 and < %d",
	     idx, sc->hn_rx_ring_inuse));
	rxr = &sc->hn_rx_ring[idx];
	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
	    ("RX ring %d already attached", idx));
	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;

	if (bootverbose) {
		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
		    idx, vmbus_chan_id(chan));
	}

	if (idx < sc->hn_tx_ring_inuse) {
		txr = &sc->hn_tx_ring[idx];
		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
		    ("TX ring %d already attached", idx));
		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;

		txr->hn_chan = chan;
		if (bootverbose) {
			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
			    idx, vmbus_chan_id(chan));
		}
	}

	/* Bind this channel to a proper CPU. */
	vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus);

	/*
	 * Open this channel
	 */
	cbr.cbr = rxr->hn_br;
	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
	cbr.cbr_txsz = NETVSC_DEVICE_RING_BUFFER_SIZE;
	cbr.cbr_rxsz = NETVSC_DEVICE_RING_BUFFER_SIZE;
	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
	if (error) {
		if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
		    vmbus_chan_id(chan), error);
		rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
		if (txr != NULL)
			txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
	}
	return (error);
}
static void
hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
{
	struct hn_rx_ring *rxr;
	int idx;

	idx = vmbus_chan_subidx(chan);

	/*
	 * Unlink this channel from its RX/TX ring.
	 */
	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
	    ("invalid channel index %d, should be >= 0 and < %d",
	     idx, sc->hn_rx_ring_inuse));
	rxr = &sc->hn_rx_ring[idx];
	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
	    ("RX ring %d is not attached", idx));
	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;

	if (idx < sc->hn_tx_ring_inuse) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];

		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
		    ("TX ring %d is not attached", idx));
		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
	}

	/*
	 * Close this channel.
	 *
	 * NOTE:
	 * Channel closing does _not_ destroy the target channel.
	 */
	vmbus_chan_close(chan);
}
static int
hn_attach_subchans(struct hn_softc *sc)
{
	struct vmbus_channel **subchans;
	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
	int i, error = 0;

	if (subchan_cnt == 0)
		return (0);

	/* Attach the sub-channels. */
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i) {
		error = hn_chan_attach(sc, subchans[i]);
		if (error)
			break;
	}
	vmbus_subchan_rel(subchans, subchan_cnt);

	if (error) {
		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
	} else if (bootverbose) {
		if_printf(sc->hn_ifp, "%d sub-channels attached\n",
		    subchan_cnt);
	}
	return (error);
}
static void
hn_detach_allchans(struct hn_softc *sc)
{
	struct vmbus_channel **subchans;
	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
	int i;

	if (subchan_cnt == 0)
		goto back;

	/* Detach the sub-channels. */
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i)
		hn_chan_detach(sc, subchans[i]);
	vmbus_subchan_rel(subchans, subchan_cnt);

back:
	/*
	 * Detach the primary channel, _after_ all sub-channels
	 * are detached.
	 */
	hn_chan_detach(sc, sc->hn_prichan);

	/* Wait for sub-channels to be destroyed, if any. */
	vmbus_subchan_drain(sc->hn_prichan);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
		    HN_RX_FLAG_ATTACHED) == 0,
		    ("%dth RX ring is still attached", i));
	}
	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
		    HN_TX_FLAG_ATTACHED) == 0,
		    ("%dth TX ring is still attached", i));
	}
}
static int
hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
{
	struct vmbus_channel **subchans;
	int nchan, rxr_cnt, error;

	nchan = *nsubch + 1;
	if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_30 || nchan == 1) {
		/*
		 * Either RSS is not supported, or multiple RX/TX rings
		 * are not requested.
		 */
		*nsubch = 0;
		return (0);
	}

	/*
	 * Get RSS capabilities, e.g. # of RX rings, and # of indirect
	 * table entries.
	 */
	error = hn_rndis_get_rsscaps(sc, &rxr_cnt);
	if (error) {
		/* No RSS; this is benign. */
		*nsubch = 0;
		return (0);
	}
	if (bootverbose) {
		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
		    rxr_cnt, nchan);
	}

	if (nchan > rxr_cnt)
		nchan = rxr_cnt;
	if (nchan == 1) {
		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
		*nsubch = 0;
		return (0);
	}

	/*
	 * Allocate sub-channels from NVS.
	 */
	*nsubch = nchan - 1;
	error = hn_nvs_alloc_subchans(sc, nsubch);
	if (error || *nsubch == 0) {
		/* Failed to allocate sub-channels. */
		*nsubch = 0;
		return (0);
	}

	/*
	 * Wait for all sub-channels to become ready before moving on.
	 */
	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
	vmbus_subchan_rel(subchans, *nsubch);
	return (0);
}
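
/*
 * Bring up the synthetic parts of the device: the primary channel,
 * NVS, RNDIS, the optional sub-channels, and the RSS key/indirect
 * table when more than one channel was granted.
 */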
static int
hn_synth_attach(struct hn_softc *sc, int mtu)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int error, nsubch, nchan, i;
	uint32_t old_caps;

	/* Save capabilities for later verification. */
	old_caps = sc->hn_caps;
	sc->hn_caps = 0;

	/*
	 * Attach the primary channel _before_ attaching NVS and RNDIS.
	 */
	error = hn_chan_attach(sc, sc->hn_prichan);
	if (error)
		return (error);

	/*
	 * Attach NVS.
	 */
	error = hn_nvs_attach(sc, mtu);
	if (error)
		return (error);

	/*
	 * Attach RNDIS _after_ NVS is attached.
	 */
	error = hn_rndis_attach(sc);
	if (error)
		return (error);

	/*
	 * Make sure capabilities are not changed.
	 */
	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
		    old_caps, sc->hn_caps);
		/* Restore old capabilities and abort. */
		sc->hn_caps = old_caps;
		return (ENXIO);
	}

	/*
	 * Allocate sub-channels for multi-TX/RX rings.
	 *
	 * NOTE:
	 * The # of RX rings that can be used is equivalent to the # of
	 * channels to be requested.
	 */
	nsubch = sc->hn_rx_ring_cnt - 1;
	error = hn_synth_alloc_subchans(sc, &nsubch);
	if (error)
		return (error);

	nchan = nsubch + 1;
	if (nchan == 1) {
		/* Only the primary channel can be used; done */
		goto back;
	}

	/*
	 * Configure RSS key and indirect table _after_ all sub-channels
	 * are allocated.
	 */
	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
		/*
		 * RSS key is not set yet; set it to the default RSS key.
		 */
		if (bootverbose)
			if_printf(sc->hn_ifp, "setup default RSS key\n");
		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
	}

	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
		/*
		 * RSS indirect table is not set yet; set it up in round-
		 * robin fashion.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "setup default RSS indirect "
			    "table\n");
		}
		/* TODO: Take ndis_rss_caps.ndis_nind into account. */
		for (i = 0; i < NDIS_HASH_INDCNT; ++i)
			rss->rss_ind[i] = i % nchan;
		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
	} else {
		/*
		 * # of usable channels may be changed, so we have to
		 * make sure that all entries in RSS indirect table
		 * are valid.
		 */
		hn_rss_ind_fixup(sc, nchan);
	}

	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		/*
		 * Failed to configure RSS key or indirect table; only
		 * the primary channel can be used.
		 */
		nchan = 1;
	}
back:
	/*
	 * Set the # of TX/RX rings that could be used according to
	 * the # of channels that NVS offered.
	 */
	hn_set_ring_inuse(sc, nchan);

	/*
	 * Attach the sub-channels, if any.
	 */
	error = hn_attach_subchans(sc);
	if (error)
		return (error);

	return (0);
}
/*
 * NOTE:
 * The interface must have been suspended through hn_suspend(), before
 * this function gets called.
 */
static void
hn_synth_detach(struct hn_softc *sc)
{
	/* Detach the RNDIS first. */
	hn_rndis_detach(sc);

	/* Detach NVS. */
	hn_nvs_detach(sc);

	/* Detach all of the channels. */
	hn_detach_allchans(sc);
}
static void
hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
{
	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
	    ("invalid ring count %d", ring_cnt));

	if (sc->hn_tx_ring_cnt > ring_cnt)
		sc->hn_tx_ring_inuse = ring_cnt;
	else
		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
	sc->hn_rx_ring_inuse = ring_cnt;

	if (bootverbose) {
		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
	}
}
static void
hn_rx_drain(struct vmbus_channel *chan)
{
	/* Wait for the inflight packets to be drained. */
	while (!vmbus_chan_rx_empty(chan) || !vmbus_chan_tx_empty(chan))
		pause("waitch", 1);
	vmbus_chan_intr_drain(chan);
}
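
/*
 * Quiesce the device: mark all TX rings suspended and wait for the
 * pending sends to complete, clear the RX filter, then drain every
 * channel's bufring and interrupts, so that no callback is running
 * or will run afterwards.
 */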
static void
hn_suspend(struct hn_softc *sc)
{
	struct vmbus_channel **subch = NULL;
	int i, nsubch;

	/*
	 * Suspend TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 1;
		mtx_unlock(&txr->hn_tx_lock);
		/* No one is able to send more packets now. */

		/* Wait for all pending sends to finish. */
		while (hn_tx_ring_pending(txr))
			pause("hnwtx", 1 /* 1 tick */);

		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}

	/*
	 * Disable RX by clearing RX filter.
	 */
	hn_rndis_set_rxfilter(sc, 0);

	/*
	 * Give RNDIS enough time to flush all pending data packets.
	 */
	pause("waitrx", (200 * hz) / 1000);

	/*
	 * Drain RX/TX bufrings and interrupts.
	 */
	nsubch = sc->hn_rx_ring_inuse - 1;
	if (nsubch > 0)
		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);

	if (subch != NULL) {
		for (i = 0; i < nsubch; ++i)
			hn_rx_drain(subch[i]);
	}
	hn_rx_drain(sc->hn_prichan);

	if (subch != NULL)
		vmbus_subchan_rel(subch, nsubch);
}
static void
hn_tx_resume(struct hn_softc *sc, int tx_ring_cnt)
{
	int i;

	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
	    ("invalid TX ring count %d", tx_ring_cnt));

	for (i = 0; i < tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 0;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
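
/*
 * Undo hn_suspend(): reprogram the RX filter, clear the suspend flag
 * on all TX rings, flush the drbrs of rings that fell out of use, and
 * kick the txeof tasks to restart transmission.
 */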
static void
hn_resume(struct hn_softc *sc)
{
	int i;

	/*
	 * Re-enable RX.
	 *
	 * TODO: add hn_rx_filter.
	 */
	hn_rndis_set_rxfilter(sc, NDIS_PACKET_TYPE_PROMISCUOUS);

	/*
	 * Make sure to clear suspend status on "all" TX rings,
	 * since hn_tx_ring_inuse can be changed after hn_suspend().
	 */
	hn_tx_resume(sc, sc->hn_tx_ring_cnt);

	if (!hn_use_if_start) {
		/*
		 * Flush unused drbrs, since hn_tx_ring_inuse may be
		 * changed after hn_suspend().
		 */
		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
	}

	/*
	 * Kick start TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		/*
		 * Use txeof task, so that any pending oactive can be
		 * cleared properly.
		 */
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}
static void
hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
{
	const struct hn_nvs_hdr *hdr;

	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
		if_printf(sc->hn_ifp, "invalid nvs notify\n");
		return;
	}
	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);

	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
		/* Useless; ignore */
		return;
	}
	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
}
static void
hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt)
{
	struct hn_send_ctx *sndc;

	sndc = (struct hn_send_ctx *)(uintptr_t)pkt->cph_xactid;
	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
	    VMBUS_CHANPKT_DATALEN(pkt));
	/*
	 * NOTE:
	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
	 * its callback.
	 */
}
static void
hn_nvs_handle_rxbuf(struct hn_softc *sc, struct hn_rx_ring *rxr,
    struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr)
{
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

	/* Make sure that this is a RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}

	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}

	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > NETVSC_RECEIVE_BUFFER_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}
		hv_rf_on_receive(sc, rxr, rxr->hn_rxbuf + ofs, len);
	}

	/*
	 * Moved completion call back here so that all received
	 * messages (not just data messages) will trigger a response
	 * message back to the host.
	 */
	hn_nvs_ack_rxbuf(chan, pkt->cp_hdr.cph_xactid);
}
/*
 * Net VSC on receive completion
 *
 * Send a receive completion packet to RNDIS device (ie NetVsp)
 */
static void
hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid)
{
	struct hn_nvs_rndis_ack ack;
	int retries = 0;
	int ret;

	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
	ack.nvs_status = HN_NVS_STATUS_OK;

retry_send_cmplt:
	/* Send the completion */
	ret = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
	if (ret == 0) {
		/* success */
	} else if (ret == EAGAIN) {
		/* no more room... wait a bit and attempt to retry 3 times */
		retries++;
		if (retries < 4) {
			DELAY(100);
			goto retry_send_cmplt;
		}
	}
}
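
/*
 * Per-channel interrupt callback.  Pulls packets out of the channel's
 * bufring and dispatches on the packet type: send completions, RXBUF
 * data packets and inband notifications.  The receive staging buffer
 * is grown on ENOBUFS to fit oversized packets.
 */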
static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
	struct hn_rx_ring *rxr = xrxr;
	struct hn_softc *sc = rxr->hn_ifp->if_softc;
	void *buffer;
	int bufferlen = NETVSC_PACKET_SIZE;

	buffer = rxr->hn_rdbuf;
	do {
		struct vmbus_chanpkt_hdr *pkt = buffer;
		uint32_t bytes_rxed;
		int ret;

		bytes_rxed = bufferlen;
		ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed);
		if (ret == 0) {
			switch (pkt->cph_type) {
			case VMBUS_CHANPKT_TYPE_COMP:
				hn_nvs_handle_comp(sc, chan, pkt);
				break;
			case VMBUS_CHANPKT_TYPE_RXBUF:
				hn_nvs_handle_rxbuf(sc, rxr, chan, pkt);
				break;
			case VMBUS_CHANPKT_TYPE_INBAND:
				hn_nvs_handle_notify(sc, pkt);
				break;
			default:
				if_printf(rxr->hn_ifp,
				    "unknown chan pkt %u\n",
				    pkt->cph_type);
				break;
			}
		} else if (ret == ENOBUFS) {
			/* Handle large packet */
			if (bufferlen > NETVSC_PACKET_SIZE) {
				free(buffer, M_NETVSC);
				buffer = NULL;
			}

			/* alloc new buffer */
			buffer = malloc(bytes_rxed, M_NETVSC, M_NOWAIT);
			if (buffer == NULL) {
				if_printf(rxr->hn_ifp,
				    "hv_cb malloc buffer failed, len=%u\n",
				    bytes_rxed);
				bufferlen = 0;
				break;
			}
			bufferlen = bytes_rxed;
		} else {
			/* No more packets */
			break;
		}
	} while (1);

	if (bufferlen > NETVSC_PACKET_SIZE)
		free(buffer, M_NETVSC);

	hv_rf_channel_rollup(rxr, rxr->hn_txr);
}
static void
hn_tx_taskq_create(void *arg __unused)
{
	if (!hn_share_tx_taskq)
		return;

	hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
	    taskqueue_thread_enqueue, &hn_tx_taskq);
	taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
	if (hn_bind_tx_taskq >= 0) {
		int cpu = hn_bind_tx_taskq;
		struct task cpuset_task;
		cpuset_t cpu_set;

		if (cpu > mp_ncpus - 1)
			cpu = mp_ncpus - 1;
		CPU_SETOF(cpu, &cpu_set);
		TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, &cpu_set);
		taskqueue_enqueue(hn_tx_taskq, &cpuset_task);
		taskqueue_drain(hn_tx_taskq, &cpuset_task);
	}
}
SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_FIRST,
    hn_tx_taskq_create, NULL);
static void
hn_tx_taskq_destroy(void *arg __unused)
{
	if (hn_tx_taskq != NULL)
		taskqueue_free(hn_tx_taskq);
}
SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_FIRST,
    hn_tx_taskq_destroy, NULL);
static device_method_t netvsc_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		netvsc_probe),
	DEVMETHOD(device_attach,	netvsc_attach),
	DEVMETHOD(device_detach,	netvsc_detach),
	DEVMETHOD(device_shutdown,	netvsc_shutdown),

	DEVMETHOD_END
};

static driver_t netvsc_driver = {
	NETVSC_DEVNAME,
	netvsc_methods,
	sizeof(struct hn_softc)
};

static devclass_t netvsc_devclass;

DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);