2 * Copyright (c) 2010-2012 Citrix Inc.
3 * Copyright (c) 2009-2012,2016 Microsoft Corp.
4 * Copyright (c) 2012 NetApp Inc.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice unmodified, this list of conditions, and the following
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * Copyright (c) 2004-2006 Kip Macy
31 * All rights reserved.
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
58 #include "opt_inet6.h"
61 #include <sys/param.h>
63 #include <sys/kernel.h>
64 #include <sys/limits.h>
65 #include <sys/malloc.h>
67 #include <sys/module.h>
69 #include <sys/queue.h>
72 #include <sys/socket.h>
73 #include <sys/sockio.h>
75 #include <sys/sysctl.h>
76 #include <sys/systm.h>
77 #include <sys/taskqueue.h>
78 #include <sys/buf_ring.h>
80 #include <machine/atomic.h>
81 #include <machine/in_cksum.h>
84 #include <net/ethernet.h>
86 #include <net/if_arp.h>
87 #include <net/if_media.h>
88 #include <net/if_types.h>
89 #include <net/if_var.h>
90 #include <net/if_vlan_var.h>
91 #include <net/rndis.h>
93 #include <netinet/in_systm.h>
94 #include <netinet/in.h>
95 #include <netinet/ip.h>
96 #include <netinet/ip6.h>
97 #include <netinet/tcp.h>
98 #include <netinet/tcp_lro.h>
99 #include <netinet/udp.h>
101 #include <dev/hyperv/include/hyperv.h>
102 #include <dev/hyperv/include/hyperv_busdma.h>
103 #include <dev/hyperv/include/vmbus.h>
104 #include <dev/hyperv/include/vmbus_xact.h>
106 #include <dev/hyperv/netvsc/ndis.h>
107 #include <dev/hyperv/netvsc/if_hnreg.h>
108 #include <dev/hyperv/netvsc/if_hnvar.h>
109 #include <dev/hyperv/netvsc/hn_nvs.h>
110 #include <dev/hyperv/netvsc/hn_rndis.h>
112 #include "vmbus_if.h"
114 #define HN_RING_CNT_DEF_MAX 8
116 /* YYY should get it from the underlying channel */
117 #define HN_TX_DESC_CNT 512
119 #define HN_RNDIS_PKT_LEN \
120 (sizeof(struct rndis_packet_msg) + \
121 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \
122 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \
123 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \
124 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
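/*
 * HN_RNDIS_PKT_LEN is thus the worst-case RNDIS packet message size:
 * the fixed rndis_packet_msg header plus the four per-packet-info
 * records (hash value, VLAN, LSOv2, and TX checksum) that hn_encap()
 * may append.
 */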
125 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE
126 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE
128 #define HN_TX_DATA_BOUNDARY PAGE_SIZE
129 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET
130 #define HN_TX_DATA_SEGSIZE PAGE_SIZE
131 /* -1 for RNDIS packet message */
132 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1)
134 #define HN_DIRECT_TX_SIZE_DEF 128
136 #define HN_EARLY_TXEOF_THRESH 8
138 #define HN_PKTBUF_LEN_DEF (16 * 1024)
140 #define HN_LROENT_CNT_DEF 128
142 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU)
143 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU)
144 /* YYY 2*MTU is a bit rough, but should be good enough. */
145 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu)
147 #define HN_LRO_ACKCNT_DEF 1
149 #define HN_LOCK_INIT(sc) \
150 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
151 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock)
152 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED)
153 #define HN_LOCK(sc) sx_xlock(&(sc)->hn_lock)
154 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock)
156 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
157 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP)
158 #define HN_CSUM_IP_HWASSIST(sc) \
159 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
160 #define HN_CSUM_IP6_HWASSIST(sc) \
161 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
164 #ifndef HN_USE_TXDESC_BUFRING
165 SLIST_ENTRY(hn_txdesc) link;
168 struct hn_tx_ring *txr;
170 uint32_t flags; /* HN_TXD_FLAG_ */
171 struct hn_nvs_sendctx send_ctx;
175 bus_dmamap_t data_dmap;
177 bus_addr_t rndis_pkt_paddr;
178 struct rndis_packet_msg *rndis_pkt;
179 bus_dmamap_t rndis_pkt_dmap;
182 #define HN_TXD_FLAG_ONLIST 0x0001
183 #define HN_TXD_FLAG_DMAMAP 0x0002
192 #define HN_RXINFO_VLAN 0x0001
193 #define HN_RXINFO_CSUM 0x0002
194 #define HN_RXINFO_HASHINF 0x0004
195 #define HN_RXINFO_HASHVAL 0x0008
196 #define HN_RXINFO_ALL \
199 HN_RXINFO_HASHINF | \
202 #define HN_NDIS_VLAN_INFO_INVALID 0xffffffff
203 #define HN_NDIS_RXCSUM_INFO_INVALID 0
204 #define HN_NDIS_HASH_INFO_INVALID 0
206 static int hn_probe(device_t);
207 static int hn_attach(device_t);
208 static int hn_detach(device_t);
209 static int hn_shutdown(device_t);
210 static void hn_chan_callback(struct vmbus_channel *,
213 static void hn_init(void *);
214 static int hn_ioctl(struct ifnet *, u_long, caddr_t);
215 static void hn_start(struct ifnet *);
216 static int hn_transmit(struct ifnet *, struct mbuf *);
217 static void hn_xmit_qflush(struct ifnet *);
218 static int hn_ifmedia_upd(struct ifnet *);
219 static void hn_ifmedia_sts(struct ifnet *,
220 struct ifmediareq *);
222 static int hn_rndis_rxinfo(const void *, int,
224 static void hn_rndis_rx_data(struct hn_rx_ring *,
226 static void hn_rndis_rx_status(struct hn_softc *,
229 static void hn_nvs_handle_notify(struct hn_softc *,
230 const struct vmbus_chanpkt_hdr *);
231 static void hn_nvs_handle_comp(struct hn_softc *,
232 struct vmbus_channel *,
233 const struct vmbus_chanpkt_hdr *);
234 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *,
235 struct vmbus_channel *,
236 const struct vmbus_chanpkt_hdr *);
237 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *,
238 struct vmbus_channel *, uint64_t);
240 #if __FreeBSD_version >= 1100099
241 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
242 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
244 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
245 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
246 #if __FreeBSD_version < 1100095
247 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
249 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
251 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
252 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
253 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
254 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
255 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
256 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
257 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
258 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
259 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
260 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
262 static void hn_stop(struct hn_softc *);
263 static void hn_init_locked(struct hn_softc *);
264 static int hn_chan_attach(struct hn_softc *,
265 struct vmbus_channel *);
266 static void hn_chan_detach(struct hn_softc *,
267 struct vmbus_channel *);
268 static int hn_attach_subchans(struct hn_softc *);
269 static void hn_detach_allchans(struct hn_softc *);
270 static void hn_chan_rollup(struct hn_rx_ring *,
271 struct hn_tx_ring *);
272 static void hn_set_ring_inuse(struct hn_softc *, int);
273 static int hn_synth_attach(struct hn_softc *, int);
274 static void hn_synth_detach(struct hn_softc *);
275 static int hn_synth_alloc_subchans(struct hn_softc *,
277 static void hn_suspend(struct hn_softc *);
278 static void hn_suspend_data(struct hn_softc *);
279 static void hn_suspend_mgmt(struct hn_softc *);
280 static void hn_resume(struct hn_softc *);
281 static void hn_resume_data(struct hn_softc *);
282 static void hn_resume_mgmt(struct hn_softc *);
283 static void hn_suspend_mgmt_taskfunc(void *, int);
284 static void hn_chan_drain(struct vmbus_channel *);
286 static void hn_update_link_status(struct hn_softc *);
287 static void hn_change_network(struct hn_softc *);
288 static void hn_link_taskfunc(void *, int);
289 static void hn_netchg_init_taskfunc(void *, int);
290 static void hn_netchg_status_taskfunc(void *, int);
291 static void hn_link_status(struct hn_softc *);
293 static int hn_create_rx_data(struct hn_softc *, int);
294 static void hn_destroy_rx_data(struct hn_softc *);
295 static int hn_check_iplen(const struct mbuf *, int);
296 static int hn_set_rxfilter(struct hn_softc *);
297 static int hn_rss_reconfig(struct hn_softc *);
298 static void hn_rss_ind_fixup(struct hn_softc *, int);
299 static int hn_rxpkt(struct hn_rx_ring *, const void *,
300 int, const struct hn_rxinfo *);
302 static int hn_tx_ring_create(struct hn_softc *, int);
303 static void hn_tx_ring_destroy(struct hn_tx_ring *);
304 static int hn_create_tx_data(struct hn_softc *, int);
305 static void hn_fixup_tx_data(struct hn_softc *);
306 static void hn_destroy_tx_data(struct hn_softc *);
307 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *);
308 static int hn_encap(struct hn_tx_ring *,
309 struct hn_txdesc *, struct mbuf **);
310 static int hn_txpkt(struct ifnet *, struct hn_tx_ring *,
312 static void hn_set_chim_size(struct hn_softc *, int);
313 static void hn_set_tso_maxsize(struct hn_softc *, int, int);
314 static bool hn_tx_ring_pending(struct hn_tx_ring *);
315 static void hn_tx_ring_qflush(struct hn_tx_ring *);
316 static void hn_resume_tx(struct hn_softc *, int);
317 static int hn_get_txswq_depth(const struct hn_tx_ring *);
318 static void hn_txpkt_done(struct hn_nvs_sendctx *,
319 struct hn_softc *, struct vmbus_channel *,
321 static int hn_txpkt_sglist(struct hn_tx_ring *,
323 static int hn_txpkt_chim(struct hn_tx_ring *,
325 static int hn_xmit(struct hn_tx_ring *, int);
326 static void hn_xmit_taskfunc(void *, int);
327 static void hn_xmit_txeof(struct hn_tx_ring *);
328 static void hn_xmit_txeof_taskfunc(void *, int);
329 static int hn_start_locked(struct hn_tx_ring *, int);
330 static void hn_start_taskfunc(void *, int);
331 static void hn_start_txeof(struct hn_tx_ring *);
332 static void hn_start_txeof_taskfunc(void *, int);
334 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
335 "Hyper-V network interface");
337 /* Trust TCP segment verification on the host side. */
338 static int hn_trust_hosttcp = 1;
339 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
340 &hn_trust_hosttcp, 0,
341 "Trust tcp segement verification on host side, "
342 "when csum info is missing (global setting)");
344 /* Trust UDP datagram verification on the host side. */
345 static int hn_trust_hostudp = 1;
346 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
347 &hn_trust_hostudp, 0,
348 "Trust udp datagram verification on host side, "
349 "when csum info is missing (global setting)");
351 /* Trust IP packet verification on the host side. */
352 static int hn_trust_hostip = 1;
353 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
355 "Trust ip packet verification on host side, "
356 "when csum info is missing (global setting)");
358 /* Limit TSO burst size */
359 static int hn_tso_maxlen = IP_MAXPACKET;
360 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
361 &hn_tso_maxlen, 0, "TSO burst limit");
363 /* Limit chimney send size */
364 static int hn_tx_chimney_size = 0;
365 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
366 &hn_tx_chimney_size, 0, "Chimney send packet size limit");
368 /* Limit the packet size for direct transmission */
369 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
370 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
371 &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
373 /* # of LRO entries per RX ring */
374 #if defined(INET) || defined(INET6)
375 #if __FreeBSD_version >= 1100095
376 static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
377 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
378 &hn_lro_entry_count, 0, "LRO entry count");
382 /* Use shared TX taskqueue */
383 static int hn_share_tx_taskq = 0;
384 SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN,
385 &hn_share_tx_taskq, 0, "Enable shared TX taskqueue");
387 #ifndef HN_USE_TXDESC_BUFRING
388 static int hn_use_txdesc_bufring = 0;
390 static int hn_use_txdesc_bufring = 1;
392 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
393 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
395 /* Bind TX taskqueue to the target CPU */
396 static int hn_bind_tx_taskq = -1;
397 SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
398 &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");
400 /* Use ifnet.if_start instead of ifnet.if_transmit */
401 static int hn_use_if_start = 0;
402 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
403 &hn_use_if_start, 0, "Use if_start TX method");
405 /* # of channels to use */
406 static int hn_chan_cnt = 0;
407 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
409 "# of channels to use; each channel has one RX ring and one TX ring");
411 /* # of transmit rings to use */
412 static int hn_tx_ring_cnt = 0;
413 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
414 &hn_tx_ring_cnt, 0, "# of TX rings to use");
416 /* Software TX ring depth */
417 static int hn_tx_swq_depth = 0;
418 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
419 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
421 /* Enable sorted LRO, and set the depth of the per-channel mbuf queue */
422 #if __FreeBSD_version >= 1100095
423 static u_int hn_lro_mbufq_depth = 0;
424 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
425 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
428 static u_int hn_cpu_index; /* next CPU for channel */
429 static struct taskqueue *hn_tx_taskq; /* shared TX taskqueue */
432 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
433 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
434 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
435 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
436 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
437 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
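/*
 * This appears to be the well-known 40-byte default Toeplitz key from
 * Microsoft's RSS documentation; using the standard key keeps hash
 * results predictable across hosts.
 */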
440 static device_method_t hn_methods[] = {
441 /* Device interface */
442 DEVMETHOD(device_probe, hn_probe),
443 DEVMETHOD(device_attach, hn_attach),
444 DEVMETHOD(device_detach, hn_detach),
445 DEVMETHOD(device_shutdown, hn_shutdown),
449 static driver_t hn_driver = {
452 sizeof(struct hn_softc)
455 static devclass_t hn_devclass;
457 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
458 MODULE_VERSION(hn, 1);
459 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
461 #if __FreeBSD_version >= 1100099
463 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
467 for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
468 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
473 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
476 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
477 txd->chim_size == 0, ("invalid rndis sglist txd"));
478 return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
479 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
483 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
485 struct hn_nvs_rndis rndis;
487 KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
488 txd->chim_size > 0, ("invalid rndis chim txd"));
490 rndis.nvs_type = HN_NVS_TYPE_RNDIS;
491 rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
492 rndis.nvs_chim_idx = txd->chim_index;
493 rndis.nvs_chim_sz = txd->chim_size;
495 return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
496 &rndis, sizeof(rndis), &txd->send_ctx));
499 static __inline uint32_t
500 hn_chim_alloc(struct hn_softc *sc)
502 int i, bmap_cnt = sc->hn_chim_bmap_cnt;
503 u_long *bmap = sc->hn_chim_bmap;
504 uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
506 for (i = 0; i < bmap_cnt; ++i) {
509 idx = ffsl(~bmap[i]);
513 --idx; /* ffsl is 1-based */
514 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
515 ("invalid i %d and idx %d", i, idx));
517 if (atomic_testandset_long(&bmap[i], idx))
520 ret = i * LONG_BIT + idx;
527 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
532 idx = chim_idx / LONG_BIT;
533 KASSERT(idx < sc->hn_chim_bmap_cnt,
534 ("invalid chimney index 0x%x", chim_idx));
536 mask = 1UL << (chim_idx % LONG_BIT);
537 KASSERT(sc->hn_chim_bmap[idx] & mask,
538 ("index bitmap 0x%lx, chimney index %u, "
539 "bitmap idx %d, bitmask 0x%lx",
540 sc->hn_chim_bmap[idx], chim_idx, idx, mask));
542 atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
546 hn_set_rxfilter(struct hn_softc *sc)
548 struct ifnet *ifp = sc->hn_ifp;
554 if (ifp->if_flags & IFF_PROMISC) {
555 filter = NDIS_PACKET_TYPE_PROMISCUOUS;
557 filter = NDIS_PACKET_TYPE_DIRECTED;
558 if (ifp->if_flags & IFF_BROADCAST)
559 filter |= NDIS_PACKET_TYPE_BROADCAST;
562 * See the comment in SIOCADDMULTI/SIOCDELMULTI.
564 /* TODO: support multicast list */
565 if ((ifp->if_flags & IFF_ALLMULTI) ||
566 !TAILQ_EMPTY(&ifp->if_multiaddrs))
567 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
569 /* Always enable ALLMULTI */
570 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
574 if (sc->hn_rx_filter != filter) {
575 error = hn_rndis_set_rxfilter(sc, filter);
577 sc->hn_rx_filter = filter;
583 hn_get_txswq_depth(const struct hn_tx_ring *txr)
586 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
587 if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
588 return txr->hn_txdesc_cnt;
589 return hn_tx_swq_depth;
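/*
 * I.e. the software TX queue depth is max(hn_tx_swq_depth,
 * hn_txdesc_cnt); with the default tunable of 0 this is simply the
 * TX descriptor count (HN_TX_DESC_CNT, 512).
 */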
593 hn_rss_reconfig(struct hn_softc *sc)
599 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
606 * Direct reconfiguration by setting the UNCHG flags does
607 * _not_ work properly.
610 if_printf(sc->hn_ifp, "disable RSS\n");
611 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
613 if_printf(sc->hn_ifp, "RSS disable failed\n");
618 * Reenable the RSS w/ the updated RSS key or indirect
622 if_printf(sc->hn_ifp, "reconfig RSS\n");
623 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
625 if_printf(sc->hn_ifp, "RSS reconfig failed\n");
632 hn_rss_ind_fixup(struct hn_softc *sc, int nchan)
634 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
637 KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
640 * Check indirect table to make sure that all channels in it
643 for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
644 if (rss->rss_ind[i] >= nchan) {
645 if_printf(sc->hn_ifp,
646 "RSS indirect table %d fixup: %u -> %d\n",
647 i, rss->rss_ind[i], nchan - 1);
648 rss->rss_ind[i] = nchan - 1;
654 hn_ifmedia_upd(struct ifnet *ifp __unused)
661 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
663 struct hn_softc *sc = ifp->if_softc;
665 ifmr->ifm_status = IFM_AVALID;
666 ifmr->ifm_active = IFM_ETHER;
668 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
669 ifmr->ifm_active |= IFM_NONE;
672 ifmr->ifm_status |= IFM_ACTIVE;
673 ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
676 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
677 static const struct hyperv_guid g_net_vsc_device_type = {
678 .hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
679 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
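/*
 * Note the hv_guid byte order: the first three GUID fields are stored
 * little-endian, so F8615163 becomes 63 51 61 F8 and so on, while the
 * last eight bytes are kept as-is.
 */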
683 hn_probe(device_t dev)
686 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
687 &g_net_vsc_device_type) == 0) {
688 device_set_desc(dev, "Hyper-V Network Interface");
689 return BUS_PROBE_DEFAULT;
695 hn_cpuset_setthread_task(void *xmask, int pending __unused)
697 cpuset_t *mask = xmask;
700 error = cpuset_setthread(curthread->td_tid, mask);
702 panic("curthread=%ju: can't pin; error=%d",
703 (uintmax_t)curthread->td_tid, error);
708 hn_attach(device_t dev)
710 struct hn_softc *sc = device_get_softc(dev);
711 struct sysctl_oid_list *child;
712 struct sysctl_ctx_list *ctx;
713 uint8_t eaddr[ETHER_ADDR_LEN];
714 struct ifnet *ifp = NULL;
715 int error, ring_cnt, tx_ring_cnt;
718 sc->hn_prichan = vmbus_get_channel(dev);
722 * Set up the taskqueue for transmission.
724 if (hn_tx_taskq == NULL) {
725 sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
726 taskqueue_thread_enqueue, &sc->hn_tx_taskq);
727 taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx",
728 device_get_nameunit(dev));
729 if (hn_bind_tx_taskq >= 0) {
730 int cpu = hn_bind_tx_taskq;
731 struct task cpuset_task;
734 if (cpu > mp_ncpus - 1)
736 CPU_SETOF(cpu, &cpu_set);
737 TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task,
739 taskqueue_enqueue(sc->hn_tx_taskq, &cpuset_task);
740 taskqueue_drain(sc->hn_tx_taskq, &cpuset_task);
743 sc->hn_tx_taskq = hn_tx_taskq;
747 * Set up the taskqueue for management tasks, e.g. link status.
749 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
750 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
751 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
752 device_get_nameunit(dev));
753 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
754 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
755 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
756 hn_netchg_status_taskfunc, sc);
759 * Allocate the ifnet and set up its name early, so that if_printf
760 * can be used by functions that will be called after
763 ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
765 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
768 * Initialize ifmedia early so that it can be unconditionally
769 * destroyed if an error happens later on.
771 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
774 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
775 * to use (tx_ring_cnt).
778 * The # of RX rings to use is the same as the # of channels to use.
780 ring_cnt = hn_chan_cnt;
784 if (ring_cnt > HN_RING_CNT_DEF_MAX)
785 ring_cnt = HN_RING_CNT_DEF_MAX;
786 } else if (ring_cnt > mp_ncpus) {
790 tx_ring_cnt = hn_tx_ring_cnt;
791 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
792 tx_ring_cnt = ring_cnt;
793 if (hn_use_if_start) {
794 /* ifnet.if_start only needs one TX ring. */
799 * Set the leader CPU for channels.
801 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
804 * Create enough TX/RX rings, even if only a limited number of
805 * channels can be allocated.
807 error = hn_create_tx_data(sc, tx_ring_cnt);
810 error = hn_create_rx_data(sc, ring_cnt);
815 * Create transaction context for NVS and RNDIS transactions.
817 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
818 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
819 if (sc->hn_xact == NULL)
823 * Attach the synthetic parts, i.e. NVS and RNDIS.
825 error = hn_synth_attach(sc, ETHERMTU);
829 error = hn_rndis_get_eaddr(sc, eaddr);
833 #if __FreeBSD_version >= 1100099
834 if (sc->hn_rx_ring_inuse > 1) {
836 * Reduce TCP segment aggregation limit for multiple
837 * RX rings to increase ACK timeliness.
839 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
844 * Fix up the TX data path after the synthetic parts are attached.
846 hn_fixup_tx_data(sc);
848 ctx = device_get_sysctl_ctx(dev);
849 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
850 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
851 &sc->hn_nvs_ver, 0, "NVS version");
852 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
853 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
854 hn_ndis_version_sysctl, "A", "NDIS version");
855 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
856 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
857 hn_caps_sysctl, "A", "capabilities");
858 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
859 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
860 hn_hwassist_sysctl, "A", "hwassist");
861 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
862 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
863 hn_rxfilter_sysctl, "A", "rxfilter");
864 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
865 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
866 hn_rss_hash_sysctl, "A", "RSS hash");
867 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
868 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
869 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
870 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
871 hn_rss_key_sysctl, "IU", "RSS key");
872 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
873 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
874 hn_rss_ind_sysctl, "IU", "RSS indirect table");
877 * Set up the ifmedia, which was initialized earlier.
879 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
880 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
881 /* XXX ifmedia_set really should do this for us */
882 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
885 * Set up the ifnet for this interface.
889 ifp->if_baudrate = IF_Gbps(10);
891 /* if_baudrate is 32 bits on 32-bit systems. */
892 ifp->if_baudrate = IF_Gbps(1);
894 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
895 ifp->if_ioctl = hn_ioctl;
896 ifp->if_init = hn_init;
897 if (hn_use_if_start) {
898 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
900 ifp->if_start = hn_start;
901 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
902 ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
903 IFQ_SET_READY(&ifp->if_snd);
905 ifp->if_transmit = hn_transmit;
906 ifp->if_qflush = hn_xmit_qflush;
909 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
911 /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
912 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
914 if (sc->hn_caps & HN_CAP_VLAN) {
915 /* XXX not sure about VLAN_MTU. */
916 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
919 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
920 if (ifp->if_hwassist & HN_CSUM_IP_MASK)
921 ifp->if_capabilities |= IFCAP_TXCSUM;
922 if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
923 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
924 if (sc->hn_caps & HN_CAP_TSO4) {
925 ifp->if_capabilities |= IFCAP_TSO4;
926 ifp->if_hwassist |= CSUM_IP_TSO;
928 if (sc->hn_caps & HN_CAP_TSO6) {
929 ifp->if_capabilities |= IFCAP_TSO6;
930 ifp->if_hwassist |= CSUM_IP6_TSO;
933 /* Enable all available capabilities by default. */
934 ifp->if_capenable = ifp->if_capabilities;
936 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
937 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
938 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
939 ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
942 ether_ifattach(ifp, eaddr);
944 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
945 if_printf(ifp, "TSO segcnt %u segsz %u\n",
946 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
949 /* Inform the upper layer about long frame support. */
950 ifp->if_hdrlen = sizeof(struct ether_vlan_header);
953 * Kick off link status check.
955 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
956 hn_update_link_status(sc);
960 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
967 hn_detach(device_t dev)
969 struct hn_softc *sc = device_get_softc(dev);
970 struct ifnet *ifp = sc->hn_ifp;
972 if (device_is_attached(dev)) {
974 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
975 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
979 * hn_stop() only suspends the data path, so the management
980 * parts have to be suspended manually here.
989 ifmedia_removeall(&sc->hn_media);
990 hn_destroy_rx_data(sc);
991 hn_destroy_tx_data(sc);
993 if (sc->hn_tx_taskq != hn_tx_taskq)
994 taskqueue_free(sc->hn_tx_taskq);
995 taskqueue_free(sc->hn_mgmt_taskq0);
997 if (sc->hn_xact != NULL)
998 vmbus_xact_ctx_destroy(sc->hn_xact);
1002 HN_LOCK_DESTROY(sc);
1007 hn_shutdown(device_t dev)
1014 hn_link_status(struct hn_softc *sc)
1016 uint32_t link_status;
1019 error = hn_rndis_get_linkstatus(sc, &link_status);
1021 /* XXX what to do? */
1025 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
1026 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
1028 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1029 if_link_state_change(sc->hn_ifp,
1030 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
1031 LINK_STATE_UP : LINK_STATE_DOWN);
1035 hn_link_taskfunc(void *xsc, int pending __unused)
1037 struct hn_softc *sc = xsc;
1039 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
1045 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
1047 struct hn_softc *sc = xsc;
1049 /* Prevent any link status checks from running. */
1050 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
1053 * Fake up a [link down --> link up] state change; a 5 second
1054 * delay is used, which closely simulates the miibus reaction
1055 * to a link down event.
1057 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1058 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1059 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
1060 &sc->hn_netchg_status, 5 * hz);
1064 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
1066 struct hn_softc *sc = xsc;
1068 /* Re-allow link status checks. */
1069 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
1074 hn_update_link_status(struct hn_softc *sc)
1077 if (sc->hn_mgmt_taskq != NULL)
1078 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
1082 hn_change_network(struct hn_softc *sc)
1085 if (sc->hn_mgmt_taskq != NULL)
1086 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
1090 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
1091 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
1093 struct mbuf *m = *m_head;
1096 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
1098 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
1099 m, segs, nsegs, BUS_DMA_NOWAIT);
1100 if (error == EFBIG) {
1103 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
1107 *m_head = m = m_new;
1108 txr->hn_tx_collapsed++;
1110 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
1111 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
1114 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
1115 BUS_DMASYNC_PREWRITE);
1116 txd->flags |= HN_TXD_FLAG_DMAMAP;
1122 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
1125 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
1126 ("put an onlist txd %#x", txd->flags));
1128 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1129 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
1132 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1133 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1134 ("chim txd uses dmamap"));
1135 hn_chim_free(txr->hn_sc, txd->chim_index);
1136 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1137 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
1138 bus_dmamap_sync(txr->hn_tx_data_dtag,
1139 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
1140 bus_dmamap_unload(txr->hn_tx_data_dtag,
1142 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
1145 if (txd->m != NULL) {
1150 txd->flags |= HN_TXD_FLAG_ONLIST;
1151 #ifndef HN_USE_TXDESC_BUFRING
1152 mtx_lock_spin(&txr->hn_txlist_spin);
1153 KASSERT(txr->hn_txdesc_avail >= 0 &&
1154 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
1155 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
1156 txr->hn_txdesc_avail++;
1157 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
1158 mtx_unlock_spin(&txr->hn_txlist_spin);
1160 atomic_add_int(&txr->hn_txdesc_avail, 1);
1161 buf_ring_enqueue(txr->hn_txdesc_br, txd);
1167 static __inline struct hn_txdesc *
1168 hn_txdesc_get(struct hn_tx_ring *txr)
1170 struct hn_txdesc *txd;
1172 #ifndef HN_USE_TXDESC_BUFRING
1173 mtx_lock_spin(&txr->hn_txlist_spin);
1174 txd = SLIST_FIRST(&txr->hn_txlist);
1176 KASSERT(txr->hn_txdesc_avail > 0,
1177 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
1178 txr->hn_txdesc_avail--;
1179 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
1181 mtx_unlock_spin(&txr->hn_txlist_spin);
1183 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
1187 #ifdef HN_USE_TXDESC_BUFRING
1188 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
1190 KASSERT(txd->m == NULL && txd->refs == 0 &&
1191 txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
1192 (txd->flags & HN_TXD_FLAG_ONLIST) &&
1193 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
1194 txd->flags &= ~HN_TXD_FLAG_ONLIST;
1200 static __inline void
1201 hn_txdesc_hold(struct hn_txdesc *txd)
1204 /* 0->1 transition will never work */
1205 KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
1206 atomic_add_int(&txd->refs, 1);
1210 hn_tx_ring_pending(struct hn_tx_ring *txr)
1212 bool pending = false;
1214 #ifndef HN_USE_TXDESC_BUFRING
1215 mtx_lock_spin(&txr->hn_txlist_spin);
1216 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
1218 mtx_unlock_spin(&txr->hn_txlist_spin);
1220 if (!buf_ring_full(txr->hn_txdesc_br))
1226 static __inline void
1227 hn_txeof(struct hn_tx_ring *txr)
1229 txr->hn_has_txeof = 0;
1234 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
1235 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
1237 struct hn_txdesc *txd = sndc->hn_cbarg;
1238 struct hn_tx_ring *txr;
1241 KASSERT(txr->hn_chan == chan,
1242 ("channel mismatch, on chan%u, should be chan%u",
1243 vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan)));
1245 txr->hn_has_txeof = 1;
1246 hn_txdesc_put(txr, txd);
1248 ++txr->hn_txdone_cnt;
1249 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
1250 txr->hn_txdone_cnt = 0;
1251 if (txr->hn_oactive)
1257 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
1259 #if defined(INET) || defined(INET6)
1260 struct lro_ctrl *lro = &rxr->hn_lro;
1261 struct lro_entry *queued;
1263 while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
1264 SLIST_REMOVE_HEAD(&lro->lro_active, next);
1265 tcp_lro_flush(lro, queued);
1271 * 'txr' may be NULL, if multiple channels and the
1272 * ifnet.if_start method are enabled.
1274 if (txr == NULL || !txr->hn_has_txeof)
1277 txr->hn_txdone_cnt = 0;
1281 static __inline uint32_t
1282 hn_rndis_pktmsg_offset(uint32_t ofs)
1285 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1286 ("invalid RNDIS packet msg offset %u", ofs));
1287 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
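/*
 * I.e. offsets in the finalized RNDIS packet message are measured from
 * the rm_dataoffset field, not from the start of the message, so the
 * fixed rm_type/rm_len header preceding it is subtracted out here.
 */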
1290 static __inline void *
1291 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1292 size_t pi_dlen, uint32_t pi_type)
1294 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
1295 struct rndis_pktinfo *pi;
1297 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
1298 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
1301 * Per-packet-info does not move; it only grows.
1304 * rm_pktinfooffset in this phase counts from the beginning
1305 * of rndis_packet_msg.
1307 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
1308 ("%u pktinfo overflows RNDIS packet msg", pi_type));
1309 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
1310 pkt->rm_pktinfolen);
1311 pkt->rm_pktinfolen += pi_size;
1313 pi->rm_size = pi_size;
1314 pi->rm_type = pi_type;
1315 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
1317 /* Data immediately follow per-packet-info. */
1318 pkt->rm_dataoffset += pi_size;
1320 /* Update RNDIS packet msg length */
1321 pkt->rm_len += pi_size;
1323 return (pi->rm_data);
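/*
 * A sketch of the message layout that hn_rndis_pktinfo_append()
 * maintains; offsets here are from the start of the message, before
 * the final conversion by hn_rndis_pktmsg_offset():
 *
 *   +---------------------+ 0
 *   | rndis_packet_msg    |
 *   +---------------------+ rm_pktinfooffset (== sizeof(*pkt))
 *   | per-packet-info(s)  | rm_pktinfolen bytes, grows by pi_size
 *   +---------------------+ rm_dataoffset (bumped by each append)
 *   | packet data         |
 *   +---------------------+
 */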
1328 * If this function fails, then both txd and m_head0 will be freed.
1331 hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0)
1333 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
1334 int error, nsegs, i;
1335 struct mbuf *m_head = *m_head0;
1336 struct rndis_packet_msg *pkt;
1341 * extension points to the area reserved for the
1342 * rndis_filter_packet, which is placed just after
1343 * the netvsc_packet (and rppi struct, if present;
1344 * length is updated later).
1346 pkt = txd->rndis_pkt;
1347 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1348 pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
1349 pkt->rm_dataoffset = sizeof(*pkt);
1350 pkt->rm_datalen = m_head->m_pkthdr.len;
1351 pkt->rm_pktinfooffset = sizeof(*pkt);
1352 pkt->rm_pktinfolen = 0;
1354 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
1356 * Set the hash value for this packet, so that the host can
1357 * dispatch the TX done event for this packet back to this TX
1360 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1361 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
1362 *pi_data = txr->hn_tx_idx;
1365 if (m_head->m_flags & M_VLANTAG) {
1366 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1367 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1368 *pi_data = NDIS_VLAN_INFO_MAKE(
1369 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
1370 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
1371 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
1374 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
1375 #if defined(INET6) || defined(INET)
1376 struct ether_vlan_header *eh;
1380 * XXX need m_pullup and use mtodo
1382 eh = mtod(m_head, struct ether_vlan_header*);
1383 if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN))
1384 ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1386 ether_len = ETHER_HDR_LEN;
1388 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1389 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
1391 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
1393 (struct ip *)(m_head->m_data + ether_len);
1394 unsigned long iph_len = ip->ip_hl << 2;
1396 (struct tcphdr *)((caddr_t)ip + iph_len);
1400 th->th_sum = in_pseudo(ip->ip_src.s_addr,
1401 ip->ip_dst.s_addr, htons(IPPROTO_TCP));
1402 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
1403 m_head->m_pkthdr.tso_segsz);
1406 #if defined(INET6) && defined(INET)
1411 struct ip6_hdr *ip6 = (struct ip6_hdr *)
1412 (m_head->m_data + ether_len);
1413 struct tcphdr *th = (struct tcphdr *)(ip6 + 1);
1416 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
1417 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
1418 m_head->m_pkthdr.tso_segsz);
1421 #endif /* INET6 || INET */
1422 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
1423 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1424 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1425 if (m_head->m_pkthdr.csum_flags &
1426 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
1427 *pi_data = NDIS_TXCSUM_INFO_IPV6;
1429 *pi_data = NDIS_TXCSUM_INFO_IPV4;
1430 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
1431 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
1434 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
1435 *pi_data |= NDIS_TXCSUM_INFO_TCPCS;
1436 else if (m_head->m_pkthdr.csum_flags &
1437 (CSUM_IP_UDP | CSUM_IP6_UDP))
1438 *pi_data |= NDIS_TXCSUM_INFO_UDPCS;
1441 pktlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1442 /* Convert RNDIS packet message offsets */
1443 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
1444 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
1447 * Chimney send, if the packet fits into one chimney buffer.
1449 if (pkt->rm_len < txr->hn_chim_size) {
1450 txr->hn_tx_chimney_tried++;
1451 txd->chim_index = hn_chim_alloc(txr->hn_sc);
1452 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1453 uint8_t *dest = txr->hn_sc->hn_chim +
1454 (txd->chim_index * txr->hn_sc->hn_chim_szmax);
1456 memcpy(dest, pkt, pktlen);
1458 m_copydata(m_head, 0, m_head->m_pkthdr.len, dest);
1460 txd->chim_size = pkt->rm_len;
1461 txr->hn_gpa_cnt = 0;
1462 txr->hn_tx_chimney++;
1463 txr->hn_sendpkt = hn_txpkt_chim;
1468 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
1473 * This mbuf is not linked w/ the txd yet, so free it now.
1478 freed = hn_txdesc_put(txr, txd);
1480 ("fail to free txd upon txdma error"));
1482 txr->hn_txdma_failed++;
1483 if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1);
1488 /* +1 RNDIS packet message */
1489 txr->hn_gpa_cnt = nsegs + 1;
1491 /* send packet with page buffer */
1492 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
1493 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
1494 txr->hn_gpa[0].gpa_len = pktlen;
1497 * Fill the page buffers with mbuf info after the page
1498 * buffer for RNDIS packet message.
1500 for (i = 0; i < nsegs; ++i) {
1501 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
1503 gpa->gpa_page = atop(segs[i].ds_addr);
1504 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
1505 gpa->gpa_len = segs[i].ds_len;
1508 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1510 txr->hn_sendpkt = hn_txpkt_sglist;
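/*
 * At this point txr->hn_gpa[] is laid out as: slot 0 covers the RNDIS
 * packet message itself (pktlen bytes), and slots 1..nsegs cover the
 * mbuf data segments loaded above.
 */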
1514 /* Set the completion routine */
1515 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
1522 * If this function fails, then txd will be freed, but the mbuf
1523 * associated w/ the txd will _not_ be freed.
1526 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
1528 int error, send_failed = 0;
1532 * Make sure that txd is not freed before ETHER_BPF_MTAP.
1534 hn_txdesc_hold(txd);
1535 error = txr->hn_sendpkt(txr, txd);
1537 ETHER_BPF_MTAP(ifp, txd->m);
1538 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
1539 if (!hn_use_if_start) {
1540 if_inc_counter(ifp, IFCOUNTER_OBYTES,
1541 txd->m->m_pkthdr.len);
1542 if (txd->m->m_flags & M_MCAST)
1543 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
1547 hn_txdesc_put(txr, txd);
1549 if (__predict_false(error)) {
1553 * This should "really rarely" happen.
1555 * XXX Too many RX to be acked or too many sideband
1556 * commands to run? Ask netvsc_channel_rollup()
1557 * to kick start later.
1559 txr->hn_has_txeof = 1;
1561 txr->hn_send_failed++;
1564 * Try sending again after setting hn_has_txeof,
1565 * in case we missed the last
1566 * netvsc_channel_rollup().
1570 if_printf(ifp, "send failed\n");
1573 * Caller will perform further processing on the
1574 * associated mbuf, so don't free it in hn_txdesc_put();
1575 * only unload it from the DMA map in hn_txdesc_put(),
1579 freed = hn_txdesc_put(txr, txd);
1581 ("fail to free txd upon send error"));
1583 txr->hn_send_failed++;
1589 * Start a transmit of one or more packets
1592 hn_start_locked(struct hn_tx_ring *txr, int len)
1594 struct hn_softc *sc = txr->hn_sc;
1595 struct ifnet *ifp = sc->hn_ifp;
1597 KASSERT(hn_use_if_start,
1598 ("hn_start_locked is called, when if_start is disabled"));
1599 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
1600 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
1602 if (__predict_false(txr->hn_suspended))
1605 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
1609 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
1610 struct hn_txdesc *txd;
1611 struct mbuf *m_head;
1614 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
1618 if (len > 0 && m_head->m_pkthdr.len > len) {
1620 * Sending this packet could be time consuming; let the
1621 * callers dispatch it (and the sending of any
1622 * follow-up packets) to the TX taskqueue.
1624 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
1628 txd = hn_txdesc_get(txr);
1630 txr->hn_no_txdescs++;
1631 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
1632 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1636 error = hn_encap(txr, txd, &m_head);
1638 /* Both txd and m_head are freed */
1642 error = hn_txpkt(ifp, txr, txd);
1643 if (__predict_false(error)) {
1644 /* txd is freed, but m_head is not */
1645 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
1646 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1654 * Append the specified data to the indicated mbuf chain.
1655 * Extend the mbuf chain if the new data does not fit in
1658 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
1659 * There should be an equivalent in the kernel mbuf code,
1660 * but there does not appear to be one yet.
1662 * Differs from m_append() in that additional mbufs are
1663 * allocated with cluster size MJUMPAGESIZE, and filled
1666 * Return 1 if able to complete the job; otherwise 0.
1669 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
1672 int remainder, space;
1674 for (m = m0; m->m_next != NULL; m = m->m_next)
1677 space = M_TRAILINGSPACE(m);
1680 * Copy into available space.
1682 if (space > remainder)
1684 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
1689 while (remainder > 0) {
1691 * Allocate a new mbuf; could check space
1692 * and allocate a cluster instead.
1694 n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
1697 n->m_len = min(MJUMPAGESIZE, remainder);
1698 bcopy(cp, mtod(n, caddr_t), n->m_len);
1700 remainder -= n->m_len;
1704 if (m0->m_flags & M_PKTHDR)
1705 m0->m_pkthdr.len += len - remainder;
1707 return (remainder == 0);
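/*
 * Typical use (see hn_rxpkt() below): allocate a cluster mbuf with
 * m_getjcl(), then let hv_m_append() copy the received data into it,
 * chaining additional MJUMPAGESIZE clusters if the data does not fit.
 */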
1710 #if defined(INET) || defined(INET6)
1712 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
1714 #if __FreeBSD_version >= 1100095
1715 if (hn_lro_mbufq_depth) {
1716 tcp_lro_queue_mbuf(lc, m);
1720 return tcp_lro_rx(lc, m, 0);
1725 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
1726 const struct hn_rxinfo *info)
1728 struct ifnet *ifp = rxr->hn_ifp;
1730 int size, do_lro = 0, do_csum = 1;
1731 int hash_type = M_HASHTYPE_OPAQUE;
1733 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
1737 * Bail out if the packet contains more data than the configured MTU.
1739 if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
1741 } else if (dlen <= MHLEN) {
1742 m_new = m_gethdr(M_NOWAIT, MT_DATA);
1743 if (m_new == NULL) {
1744 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
1747 memcpy(mtod(m_new, void *), data, dlen);
1748 m_new->m_pkthdr.len = m_new->m_len = dlen;
1749 rxr->hn_small_pkts++;
1752 * Get an mbuf with a cluster. For packets 2K or less,
1753 * get a standard 2K cluster. For anything larger, get a
1754 * 4K cluster. Any buffers larger than 4K can cause problems
1755 * if looped around to the Hyper-V TX channel, so avoid them.
1758 if (dlen > MCLBYTES) {
1760 size = MJUMPAGESIZE;
1763 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
1764 if (m_new == NULL) {
1765 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
1769 hv_m_append(m_new, dlen, data);
1771 m_new->m_pkthdr.rcvif = ifp;
1773 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
1776 /* receive side checksum offload */
1777 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
1778 /* IP csum offload */
1779 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
1780 m_new->m_pkthdr.csum_flags |=
1781 (CSUM_IP_CHECKED | CSUM_IP_VALID);
1785 /* TCP/UDP csum offload */
1786 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
1787 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
1788 m_new->m_pkthdr.csum_flags |=
1789 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1790 m_new->m_pkthdr.csum_data = 0xffff;
1791 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
1799 * As of this writing (Oct 28th, 2016), the host side will turn
1800 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
1801 * the do_lro setting here is actually _not_ accurate. We
1802 * depend on the RSS hash type check to reset do_lro.
1804 if ((info->csum_info &
1805 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
1806 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
1809 const struct ether_header *eh;
1814 if (m_new->m_len < hoff)
1816 eh = mtod(m_new, struct ether_header *);
1817 etype = ntohs(eh->ether_type);
1818 if (etype == ETHERTYPE_VLAN) {
1819 const struct ether_vlan_header *evl;
1821 hoff = sizeof(*evl);
1822 if (m_new->m_len < hoff)
1824 evl = mtod(m_new, struct ether_vlan_header *);
1825 etype = ntohs(evl->evl_proto);
1828 if (etype == ETHERTYPE_IP) {
1831 pr = hn_check_iplen(m_new, hoff);
1832 if (pr == IPPROTO_TCP) {
1834 (rxr->hn_trust_hcsum &
1835 HN_TRUST_HCSUM_TCP)) {
1836 rxr->hn_csum_trusted++;
1837 m_new->m_pkthdr.csum_flags |=
1838 (CSUM_IP_CHECKED | CSUM_IP_VALID |
1839 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1840 m_new->m_pkthdr.csum_data = 0xffff;
1843 } else if (pr == IPPROTO_UDP) {
1845 (rxr->hn_trust_hcsum &
1846 HN_TRUST_HCSUM_UDP)) {
1847 rxr->hn_csum_trusted++;
1848 m_new->m_pkthdr.csum_flags |=
1849 (CSUM_IP_CHECKED | CSUM_IP_VALID |
1850 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1851 m_new->m_pkthdr.csum_data = 0xffff;
1853 } else if (pr != IPPROTO_DONE && do_csum &&
1854 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
1855 rxr->hn_csum_trusted++;
1856 m_new->m_pkthdr.csum_flags |=
1857 (CSUM_IP_CHECKED | CSUM_IP_VALID);
1862 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
1863 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
1864 NDIS_VLAN_INFO_ID(info->vlan_info),
1865 NDIS_VLAN_INFO_PRI(info->vlan_info),
1866 NDIS_VLAN_INFO_CFI(info->vlan_info));
1867 m_new->m_flags |= M_VLANTAG;
1870 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
1872 m_new->m_pkthdr.flowid = info->hash_value;
1873 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
1874 NDIS_HASH_FUNCTION_TOEPLITZ) {
1875 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
1879 * do_lro is reset if the hash type is not TCP
1880 * related. See the comment in the above csum_flags
1884 case NDIS_HASH_IPV4:
1885 hash_type = M_HASHTYPE_RSS_IPV4;
1889 case NDIS_HASH_TCP_IPV4:
1890 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
1893 case NDIS_HASH_IPV6:
1894 hash_type = M_HASHTYPE_RSS_IPV6;
1898 case NDIS_HASH_IPV6_EX:
1899 hash_type = M_HASHTYPE_RSS_IPV6_EX;
1903 case NDIS_HASH_TCP_IPV6:
1904 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
1907 case NDIS_HASH_TCP_IPV6_EX:
1908 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
1913 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
1915 M_HASHTYPE_SET(m_new, hash_type);
1918 * Note: Moved RX completion back to hv_nv_on_receive() so all
1919 * messages (not just data messages) will trigger a response.
1925 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
1926 #if defined(INET) || defined(INET6)
1927 struct lro_ctrl *lro = &rxr->hn_lro;
1930 rxr->hn_lro_tried++;
1931 if (hn_lro_rx(lro, m_new) == 0) {
1939 /* We're not holding the lock here, so don't release it */
1940 (*ifp->if_input)(ifp, m_new);
1946 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1948 struct hn_softc *sc = ifp->if_softc;
1949 struct ifreq *ifr = (struct ifreq *)data;
1950 int mask, error = 0;
1954 if (ifr->ifr_mtu > HN_MTU_MAX) {
1961 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
1966 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
1967 /* Can't change MTU */
1973 if (ifp->if_mtu == ifr->ifr_mtu) {
1979 * Suspend this interface before the synthetic parts
1985 * Detach the synthetic parts, i.e. NVS and RNDIS.
1987 hn_synth_detach(sc);
1990 * Reattach the synthetic parts, i.e. NVS and RNDIS,
1991 * with the new MTU setting.
1993 error = hn_synth_attach(sc, ifr->ifr_mtu);
2000 * Commit the requested MTU after the synthetic parts
2001 * have been successfully attached.
2003 ifp->if_mtu = ifr->ifr_mtu;
2006 * Make sure that various parameters based on MTU are
2007 * still valid after the MTU change.
2009 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2010 hn_set_chim_size(sc, sc->hn_chim_szmax);
2011 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2012 #if __FreeBSD_version >= 1100099
2013 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2014 HN_LRO_LENLIM_MIN(ifp))
2015 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2019 * All done! Resume the interface now.
2029 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2034 if (ifp->if_flags & IFF_UP) {
2035 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2036 hn_set_rxfilter(sc);
2040 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2043 sc->hn_if_flags = ifp->if_flags;
2050 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2052 if (mask & IFCAP_TXCSUM) {
2053 ifp->if_capenable ^= IFCAP_TXCSUM;
2054 if (ifp->if_capenable & IFCAP_TXCSUM)
2055 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2057 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2059 if (mask & IFCAP_TXCSUM_IPV6) {
2060 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2061 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2062 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2064 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2067 /* TODO: flip RNDIS offload parameters for RXCSUM. */
2068 if (mask & IFCAP_RXCSUM)
2069 ifp->if_capenable ^= IFCAP_RXCSUM;
2071 /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2072 if (mask & IFCAP_RXCSUM_IPV6)
2073 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2076 if (mask & IFCAP_LRO)
2077 ifp->if_capenable ^= IFCAP_LRO;
2079 if (mask & IFCAP_TSO4) {
2080 ifp->if_capenable ^= IFCAP_TSO4;
2081 if (ifp->if_capenable & IFCAP_TSO4)
2082 ifp->if_hwassist |= CSUM_IP_TSO;
2084 ifp->if_hwassist &= ~CSUM_IP_TSO;
2086 if (mask & IFCAP_TSO6) {
2087 ifp->if_capenable ^= IFCAP_TSO6;
2088 if (ifp->if_capenable & IFCAP_TSO6)
2089 ifp->if_hwassist |= CSUM_IP6_TSO;
2091 ifp->if_hwassist &= ~CSUM_IP6_TSO;
2102 * Multicast updates are done while a mutex is held, but
2103 * setting the RNDIS RX filter sleeps. We work around this by
2104 * always enabling ALLMULTI. ALLMULTI would actually always be on, even
2105 * if we supported the SIOCADDMULTI/SIOCDELMULTI, since
2106 * we don't support multicast address list configuration
2111 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2115 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2116 hn_set_rxfilter(sc);
2124 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2128 error = ether_ioctl(ifp, cmd, data);
2135 hn_stop(struct hn_softc *sc)
2137 struct ifnet *ifp = sc->hn_ifp;
2142 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2143 ("synthetic parts were not attached"));
2145 /* Clear RUNNING bit _before_ hn_suspend_data() */
2146 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2147 hn_suspend_data(sc);
2149 /* Clear OACTIVE bit. */
2150 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2151 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2152 sc->hn_tx_ring[i].hn_oactive = 0;
2156 hn_start(struct ifnet *ifp)
2158 struct hn_softc *sc = ifp->if_softc;
2159 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
2161 if (txr->hn_sched_tx)
2164 if (mtx_trylock(&txr->hn_tx_lock)) {
2167 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
2168 mtx_unlock(&txr->hn_tx_lock);
2173 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
2177 hn_start_txeof(struct hn_tx_ring *txr)
2179 struct hn_softc *sc = txr->hn_sc;
2180 struct ifnet *ifp = sc->hn_ifp;
2182 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
2184 if (txr->hn_sched_tx)
2187 if (mtx_trylock(&txr->hn_tx_lock)) {
2190 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2191 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
2192 mtx_unlock(&txr->hn_tx_lock);
2194 taskqueue_enqueue(txr->hn_tx_taskq,
2200 * Release the OACTIVE earlier, in the hope that
2201 * others can catch up. The task will clear the
2202 * flag again with the hn_tx_lock held to avoid possible
2205 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2206 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
2211 hn_init_locked(struct hn_softc *sc)
2213 struct ifnet *ifp = sc->hn_ifp;
2218 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2221 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2224 /* Configure RX filter */
2225 hn_set_rxfilter(sc);
2227 /* Clear OACTIVE bit. */
2228 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2229 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2230 sc->hn_tx_ring[i].hn_oactive = 0;
2232 /* Clear TX 'suspended' bit. */
2233 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2235 /* Everything is ready; unleash! */
2236 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2242 struct hn_softc *sc = xsc;
2249 #if __FreeBSD_version >= 1100099
2252 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2254 struct hn_softc *sc = arg1;
2255 unsigned int lenlim;
2258 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2259 error = sysctl_handle_int(oidp, &lenlim, 0, req);
2260 if (error || req->newptr == NULL)
2264 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2265 lenlim > TCP_LRO_LENGTH_MAX) {
2269 hn_set_lro_lenlim(sc, lenlim);
2276 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2278 struct hn_softc *sc = arg1;
2279 int ackcnt, error, i;
2282 * lro_ackcnt_lim is the append count limit;
2283 * +1 turns it into an aggregation limit.
2285 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2286 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2287 if (error || req->newptr == NULL)
2290 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2294 * Convert the aggregation limit back to the append count limit.
2299 for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
2300 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
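/*
 * Worked example (illustrative): an aggregation limit of 3 allows up
 * to 3 ACKs to be aggregated into one, i.e. at most 2 appends to the
 * first ACK, so it is stored as lro_ackcnt_lim = 3 - 1 = 2.
 */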
2308 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2310 struct hn_softc *sc = arg1;
2315 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2318 error = sysctl_handle_int(oidp, &on, 0, req);
2319 if (error || req->newptr == NULL)
2323 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2324 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2327 rxr->hn_trust_hcsum |= hcsum;
2329 rxr->hn_trust_hcsum &= ~hcsum;
2336 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2338 struct hn_softc *sc = arg1;
2339 int chim_size, error;
2341 chim_size = sc->hn_tx_ring[0].hn_chim_size;
2342 error = sysctl_handle_int(oidp, &chim_size, 0, req);
2343 if (error || req->newptr == NULL)
2346 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2350 hn_set_chim_size(sc, chim_size);
2355 #if __FreeBSD_version < 1100095
2357 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2359 struct hn_softc *sc = arg1;
2360 int ofs = arg2, i, error;
2361 struct hn_rx_ring *rxr;
2365 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2366 rxr = &sc->hn_rx_ring[i];
2367 stat += *((int *)((uint8_t *)rxr + ofs));
2370 error = sysctl_handle_64(oidp, &stat, 0, req);
2371 if (error || req->newptr == NULL)
2374 /* Zero out this stat. */
2375 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2376 rxr = &sc->hn_rx_ring[i];
2377 *((int *)((uint8_t *)rxr + ofs)) = 0;
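/*
 * Editor's note: in this handler and the u64/ulong variants below,
 * arg2 carries __offsetof(struct hn_rx_ring, FIELD), so a single
 * handler can sum and reset any per-ring statistic by reading the
 * field through a byte offset from the ring pointer.
 */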
2383 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2385 struct hn_softc *sc = arg1;
2386 int ofs = arg2, i, error;
2387 struct hn_rx_ring *rxr;
2391 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2392 rxr = &sc->hn_rx_ring[i];
2393 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2396 error = sysctl_handle_64(oidp, &stat, 0, req);
2397 if (error || req->newptr == NULL)
2400 /* Zero out this stat. */
2401 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2402 rxr = &sc->hn_rx_ring[i];
2403 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2411 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2413 struct hn_softc *sc = arg1;
2414 int ofs = arg2, i, error;
2415 struct hn_rx_ring *rxr;
2419 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2420 rxr = &sc->hn_rx_ring[i];
2421 stat += *((u_long *)((uint8_t *)rxr + ofs));
2424 error = sysctl_handle_long(oidp, &stat, 0, req);
2425 if (error || req->newptr == NULL)
2428 /* Zero out this stat. */
2429 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2430 rxr = &sc->hn_rx_ring[i];
2431 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
2437 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2439 struct hn_softc *sc = arg1;
2440 int ofs = arg2, i, error;
2441 struct hn_tx_ring *txr;
2445 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2446 txr = &sc->hn_tx_ring[i];
2447 stat += *((u_long *)((uint8_t *)txr + ofs));
2450 error = sysctl_handle_long(oidp, &stat, 0, req);
2451 if (error || req->newptr == NULL)
2454 /* Zero out this stat. */
2455 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2456 txr = &sc->hn_tx_ring[i];
2457 *((u_long *)((uint8_t *)txr + ofs)) = 0;
2463 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2465 struct hn_softc *sc = arg1;
2466 int ofs = arg2, i, error, conf;
2467 struct hn_tx_ring *txr;
2469 txr = &sc->hn_tx_ring[0];
2470 conf = *((int *)((uint8_t *)txr + ofs));
2472 error = sysctl_handle_int(oidp, &conf, 0, req);
2473 if (error || req->newptr == NULL)
2477 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2478 txr = &sc->hn_tx_ring[i];
2479 *((int *)((uint8_t *)txr + ofs)) = conf;
2487 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2489 struct hn_softc *sc = arg1;
2492 snprintf(verstr, sizeof(verstr), "%u.%u",
2493 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2494 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2495 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2499 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2501 struct hn_softc *sc = arg1;
2508 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2509 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2513 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2515 struct hn_softc *sc = arg1;
2516 char assist_str[128];
2520 hwassist = sc->hn_ifp->if_hwassist;
2522 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2523 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2527 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2529 struct hn_softc *sc = arg1;
2530 char filter_str[128];
2534 filter = sc->hn_rx_filter;
2536 snprintf(filter_str, sizeof(filter_str), "%b", filter,
2538 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
2542 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2544 struct hn_softc *sc = arg1;
2549 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2550 if (error || req->newptr == NULL)
2553 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2556 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
2558 if (sc->hn_rx_ring_inuse > 1) {
2559 error = hn_rss_reconfig(sc);
2561 /* Not RSS capable, at least for now; just save the RSS key. */
2570 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
2572 struct hn_softc *sc = arg1;
2577 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2578 if (error || req->newptr == NULL)
2582 * Don't allow RSS indirect table changes if this interface is not
2583 * currently RSS capable.
2585 if (sc->hn_rx_ring_inuse == 1) {
2590 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2593 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
2595 hn_rss_ind_fixup(sc, sc->hn_rx_ring_inuse);
2596 error = hn_rss_reconfig(sc);
2603 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
2605 struct hn_softc *sc = arg1;
2610 hash = sc->hn_rss_hash;
2612 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
2613 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
2617 hn_check_iplen(const struct mbuf *m, int hoff)
2619 const struct ip *ip;
2620 int len, iphlen, iplen;
2621 const struct tcphdr *th;
2622 int thoff; /* TCP data offset */
2624 len = hoff + sizeof(struct ip);
2626 /* The packet must be at least the size of an IP header. */
2627 if (m->m_pkthdr.len < len)
2628 return IPPROTO_DONE;
2630 /* The fixed IP header must reside completely in the first mbuf. */
2632 return IPPROTO_DONE;
2634 ip = mtodo(m, hoff);
2636 /* Bound check the packet's stated IP header length. */
2637 iphlen = ip->ip_hl << 2;
2638 if (iphlen < sizeof(struct ip)) /* minimum header length */
2639 return IPPROTO_DONE;
2641 /* The full IP header must reside completely in the one mbuf. */
2642 if (m->m_len < hoff + iphlen)
2643 return IPPROTO_DONE;
2645 iplen = ntohs(ip->ip_len);
2648 * Check that the amount of data in the buffers is
2649 * at least as much as the IP header would have us expect.
2651 if (m->m_pkthdr.len < hoff + iplen)
2652 return IPPROTO_DONE;
2655 * Ignore IP fragments.
2657 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
2658 return IPPROTO_DONE;
2661 * The TCP/IP or UDP/IP header must be entirely contained within
2662 * the first fragment of a packet.
2666 if (iplen < iphlen + sizeof(struct tcphdr))
2667 return IPPROTO_DONE;
2668 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
2669 return IPPROTO_DONE;
2670 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
2671 thoff = th->th_off << 2;
2672 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
2673 return IPPROTO_DONE;
2674 if (m->m_len < hoff + iphlen + thoff)
2675 return IPPROTO_DONE;
2678 if (iplen < iphlen + sizeof(struct udphdr))
2679 return IPPROTO_DONE;
2680 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
2681 return IPPROTO_DONE;
2685 return IPPROTO_DONE;
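/*
 * Illustrative caller sketch (hypothetical helper, not part of this
 * driver): an RX path could use hn_check_iplen() like this to decide
 * whether host-side checksum verification may be trusted for an
 * untagged Ethernet frame.
 */
#if 0
static bool
hn_rx_can_trust_hcsum(const struct mbuf *m)
{
	/* The IP header follows the 14 byte Ethernet header. */
	return (hn_check_iplen(m, ETHER_HDR_LEN) != IPPROTO_DONE);
}
#endif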
2692 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
2694 struct sysctl_oid_list *child;
2695 struct sysctl_ctx_list *ctx;
2696 device_t dev = sc->hn_dev;
2697 #if defined(INET) || defined(INET6)
2698 #if __FreeBSD_version >= 1100095
2705 * Create RXBUF for reception.
2708 * - It is shared by all channels.
2709 * - A large enough buffer is allocated; certain versions of NVS
2710 * may further limit the usable space.
2712 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
2713 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
2714 BUS_DMA_WAITOK | BUS_DMA_ZERO);
2715 if (sc->hn_rxbuf == NULL) {
2716 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
2720 sc->hn_rx_ring_cnt = ring_cnt;
2721 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
2723 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
2724 M_DEVBUF, M_WAITOK | M_ZERO);
2726 #if defined(INET) || defined(INET6)
2727 #if __FreeBSD_version >= 1100095
2728 lroent_cnt = hn_lro_entry_count;
2729 if (lroent_cnt < TCP_LRO_ENTRIES)
2730 lroent_cnt = TCP_LRO_ENTRIES;
2732 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
2734 #endif /* INET || INET6 */
2736 ctx = device_get_sysctl_ctx(dev);
2737 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2739 /* Create dev.hn.UNIT.rx sysctl tree */
2740 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
2741 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
2743 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2744 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2746 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
2747 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
2748 &rxr->hn_br_dma, BUS_DMA_WAITOK);
2749 if (rxr->hn_br == NULL) {
2750 device_printf(dev, "allocate bufring failed\n");
2754 if (hn_trust_hosttcp)
2755 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
2756 if (hn_trust_hostudp)
2757 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
2758 if (hn_trust_hostip)
2759 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
2760 rxr->hn_ifp = sc->hn_ifp;
2761 if (i < sc->hn_tx_ring_cnt)
2762 rxr->hn_txr = &sc->hn_tx_ring[i];
2763 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
2764 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
2766 rxr->hn_rxbuf = sc->hn_rxbuf;
2771 #if defined(INET) || defined(INET6)
2772 #if __FreeBSD_version >= 1100095
2773 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
2774 hn_lro_mbufq_depth);
2776 tcp_lro_init(&rxr->hn_lro);
2777 rxr->hn_lro.ifp = sc->hn_ifp;
2779 #if __FreeBSD_version >= 1100099
2780 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
2781 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
2783 #endif /* INET || INET6 */
2785 if (sc->hn_rx_sysctl_tree != NULL) {
2789 * Create per RX ring sysctl tree:
2790 * dev.hn.UNIT.rx.RINGID
2792 snprintf(name, sizeof(name), "%d", i);
2793 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
2794 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
2795 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
2797 if (rxr->hn_rx_sysctl_tree != NULL) {
2798 SYSCTL_ADD_ULONG(ctx,
2799 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
2800 OID_AUTO, "packets", CTLFLAG_RW,
2801 &rxr->hn_pkts, "# of packets received");
2802 SYSCTL_ADD_ULONG(ctx,
2803 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
2804 OID_AUTO, "rss_pkts", CTLFLAG_RW,
2806 "# of packets w/ RSS info received");
2808 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
2809 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
2810 &rxr->hn_pktbuf_len, 0,
2811 "Temporary channel packet buffer length");
2816 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
2817 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2818 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
2819 #if __FreeBSD_version < 1100095
2820 hn_rx_stat_int_sysctl,
2822 hn_rx_stat_u64_sysctl,
2824 "LU", "LRO queued");
2825 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
2826 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2827 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
2828 #if __FreeBSD_version < 1100095
2829 hn_rx_stat_int_sysctl,
2831 hn_rx_stat_u64_sysctl,
2833 "LU", "LRO flushed");
2834 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
2835 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2836 __offsetof(struct hn_rx_ring, hn_lro_tried),
2837 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
2838 #if __FreeBSD_version >= 1100099
2839 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
2840 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2841 hn_lro_lenlim_sysctl, "IU",
2842 "Max # of data bytes to be aggregated by LRO");
2843 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
2844 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2845 hn_lro_ackcnt_sysctl, "I",
2846 "Max # of ACKs to be aggregated by LRO");
2848 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
2849 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
2850 hn_trust_hcsum_sysctl, "I",
2851 "Trust TCP segment verification on host side, "
2852 "when csum info is missing");
2853 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
2854 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
2855 hn_trust_hcsum_sysctl, "I",
2856 "Trust UDP datagram verification on host side, "
2857 "when csum info is missing");
2858 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
2859 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
2860 hn_trust_hcsum_sysctl, "I",
2861 "Trust IP packet verification on host side, "
2862 "when csum info is missing");
2863 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
2864 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2865 __offsetof(struct hn_rx_ring, hn_csum_ip),
2866 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
2867 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
2868 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2869 __offsetof(struct hn_rx_ring, hn_csum_tcp),
2870 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
2871 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
2872 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2873 __offsetof(struct hn_rx_ring, hn_csum_udp),
2874 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
2875 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
2876 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2877 __offsetof(struct hn_rx_ring, hn_csum_trusted),
2878 hn_rx_stat_ulong_sysctl, "LU",
2879 "# of packets that we trust host's csum verification");
2880 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
2881 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2882 __offsetof(struct hn_rx_ring, hn_small_pkts),
2883 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
2884 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
2885 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2886 __offsetof(struct hn_rx_ring, hn_ack_failed),
2887 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
2888 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
2889 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
2890 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
2891 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
2897 hn_destroy_rx_data(struct hn_softc *sc)
2901 if (sc->hn_rxbuf != NULL) {
2902 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
2903 sc->hn_rxbuf = NULL;
2906 if (sc->hn_rx_ring_cnt == 0)
2909 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2910 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2912 if (rxr->hn_br == NULL)
2914 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
2917 #if defined(INET) || defined(INET6)
2918 tcp_lro_free(&rxr->hn_lro);
2920 free(rxr->hn_pktbuf, M_DEVBUF);
2922 free(sc->hn_rx_ring, M_DEVBUF);
2923 sc->hn_rx_ring = NULL;
2925 sc->hn_rx_ring_cnt = 0;
2926 sc->hn_rx_ring_inuse = 0;
2930 hn_tx_ring_create(struct hn_softc *sc, int id)
2932 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
2933 device_t dev = sc->hn_dev;
2934 bus_dma_tag_t parent_dtag;
2938 txr->hn_tx_idx = id;
2940 #ifndef HN_USE_TXDESC_BUFRING
2941 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
2943 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
2945 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
2946 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
2947 M_DEVBUF, M_WAITOK | M_ZERO);
2948 #ifndef HN_USE_TXDESC_BUFRING
2949 SLIST_INIT(&txr->hn_txlist);
2951 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
2952 M_WAITOK, &txr->hn_tx_lock);
2955 txr->hn_tx_taskq = sc->hn_tx_taskq;
2957 if (hn_use_if_start) {
2958 txr->hn_txeof = hn_start_txeof;
2959 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
2960 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
2964 txr->hn_txeof = hn_xmit_txeof;
2965 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
2966 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
2968 br_depth = hn_get_txswq_depth(txr);
2969 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
2970 M_WAITOK, &txr->hn_tx_lock);
2973 txr->hn_direct_tx_size = hn_direct_tx_size;
2976 * Always schedule transmission instead of trying to do direct
2977 * transmission.  This gives the best performance so far.
2979 txr->hn_sched_tx = 1;
2981 parent_dtag = bus_get_dma_tag(dev);
2983 /* DMA tag for RNDIS packet messages. */
2984 error = bus_dma_tag_create(parent_dtag, /* parent */
2985 HN_RNDIS_PKT_ALIGN, /* alignment */
2986 HN_RNDIS_PKT_BOUNDARY, /* boundary */
2987 BUS_SPACE_MAXADDR, /* lowaddr */
2988 BUS_SPACE_MAXADDR, /* highaddr */
2989 NULL, NULL, /* filter, filterarg */
2990 HN_RNDIS_PKT_LEN, /* maxsize */
2992 HN_RNDIS_PKT_LEN, /* maxsegsize */
2994 NULL, /* lockfunc */
2995 NULL, /* lockfuncarg */
2996 &txr->hn_tx_rndis_dtag);
2998 device_printf(dev, "failed to create rndis dmatag\n");
3002 /* DMA tag for data. */
3003 error = bus_dma_tag_create(parent_dtag, /* parent */
3005 HN_TX_DATA_BOUNDARY, /* boundary */
3006 BUS_SPACE_MAXADDR, /* lowaddr */
3007 BUS_SPACE_MAXADDR, /* highaddr */
3008 NULL, NULL, /* filter, filterarg */
3009 HN_TX_DATA_MAXSIZE, /* maxsize */
3010 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
3011 HN_TX_DATA_SEGSIZE, /* maxsegsize */
3013 NULL, /* lockfunc */
3014 NULL, /* lockfuncarg */
3015 &txr->hn_tx_data_dtag);
3017 device_printf(dev, "failed to create data dmatag\n");
3021 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3022 struct hn_txdesc *txd = &txr->hn_txdesc[i];
3025 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3028 * Allocate and load RNDIS packet message.
3030 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3031 (void **)&txd->rndis_pkt,
3032 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3033 &txd->rndis_pkt_dmap);
3036 "failed to allocate rndis_packet_msg, %d\n", i);
3040 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3041 txd->rndis_pkt_dmap,
3042 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3043 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3047 "failed to load rndis_packet_msg, %d\n", i);
3048 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3049 txd->rndis_pkt, txd->rndis_pkt_dmap);
3053 /* DMA map for TX data. */
3054 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3058 "failed to allocate tx data dmamap\n");
3059 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3060 txd->rndis_pkt_dmap);
3061 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3062 txd->rndis_pkt, txd->rndis_pkt_dmap);
3066 /* All set; put it on the list. */
3067 txd->flags |= HN_TXD_FLAG_ONLIST;
3068 #ifndef HN_USE_TXDESC_BUFRING
3069 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3071 buf_ring_enqueue(txr->hn_txdesc_br, txd);
3074 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3076 if (sc->hn_tx_sysctl_tree != NULL) {
3077 struct sysctl_oid_list *child;
3078 struct sysctl_ctx_list *ctx;
3082 * Create per TX ring sysctl tree:
3083 * dev.hn.UNIT.tx.RINGID
3085 ctx = device_get_sysctl_ctx(dev);
3086 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3088 snprintf(name, sizeof(name), "%d", id);
3089 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3090 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3092 if (txr->hn_tx_sysctl_tree != NULL) {
3093 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3095 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3096 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3097 "# of available TX descs");
3098 if (!hn_use_if_start) {
3099 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3100 CTLFLAG_RD, &txr->hn_oactive, 0,
3103 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3104 CTLFLAG_RW, &txr->hn_pkts,
3105 "# of packets transmitted");
3113 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3115 struct hn_tx_ring *txr = txd->txr;
3117 KASSERT(txd->m == NULL, ("still has mbuf installed"));
3118 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3120 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3121 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3122 txd->rndis_pkt_dmap);
3123 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3127 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3129 struct hn_txdesc *txd;
3131 if (txr->hn_txdesc == NULL)
3134 #ifndef HN_USE_TXDESC_BUFRING
3135 while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) {
3136 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
3137 hn_txdesc_dmamap_destroy(txd);
3140 mtx_lock(&txr->hn_tx_lock);
3141 while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL)
3142 hn_txdesc_dmamap_destroy(txd);
3143 mtx_unlock(&txr->hn_tx_lock);
3146 if (txr->hn_tx_data_dtag != NULL)
3147 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3148 if (txr->hn_tx_rndis_dtag != NULL)
3149 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3151 #ifdef HN_USE_TXDESC_BUFRING
3152 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3155 free(txr->hn_txdesc, M_DEVBUF);
3156 txr->hn_txdesc = NULL;
3158 if (txr->hn_mbuf_br != NULL)
3159 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3161 #ifndef HN_USE_TXDESC_BUFRING
3162 mtx_destroy(&txr->hn_txlist_spin);
3164 mtx_destroy(&txr->hn_tx_lock);
3168 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3170 struct sysctl_oid_list *child;
3171 struct sysctl_ctx_list *ctx;
3175 * Create TXBUF for chimney sending.
3177 * NOTE: It is shared by all channels.
3179 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3180 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3181 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3182 if (sc->hn_chim == NULL) {
3183 device_printf(sc->hn_dev, "allocate txbuf failed\n");
3187 sc->hn_tx_ring_cnt = ring_cnt;
3188 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3190 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3191 M_DEVBUF, M_WAITOK | M_ZERO);
3193 ctx = device_get_sysctl_ctx(sc->hn_dev);
3194 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3196 /* Create dev.hn.UNIT.tx sysctl tree */
3197 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3198 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3200 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3203 error = hn_tx_ring_create(sc, i);
3208 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3209 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3210 __offsetof(struct hn_tx_ring, hn_no_txdescs),
3211 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3212 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3213 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3214 __offsetof(struct hn_tx_ring, hn_send_failed),
3215 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
3216 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3217 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3218 __offsetof(struct hn_tx_ring, hn_txdma_failed),
3219 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
3220 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3221 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3222 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3223 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3224 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3225 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3226 __offsetof(struct hn_tx_ring, hn_tx_chimney),
3227 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3228 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3229 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3230 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3231 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3232 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3233 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3234 "# of total TX descs");
3235 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3236 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3237 "Chimney send packet size upper boundary");
3238 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3239 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3240 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3241 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3242 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3243 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3244 hn_tx_conf_int_sysctl, "I",
3245 "Max packet size for direct transmission");
3246 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3247 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3248 __offsetof(struct hn_tx_ring, hn_sched_tx),
3249 hn_tx_conf_int_sysctl, "I",
3250 "Always schedule transmission "
3251 "instead of doing direct transmission");
3252 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3253 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3254 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3255 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3261 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3265 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3266 sc->hn_tx_ring[i].hn_chim_size = chim_size;
3270 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3272 struct ifnet *ifp = sc->hn_ifp;
3275 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3278 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3279 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3280 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3282 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3283 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3284 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3286 if (tso_maxlen < tso_minlen)
3287 tso_maxlen = tso_minlen;
3288 else if (tso_maxlen > IP_MAXPACKET)
3289 tso_maxlen = IP_MAXPACKET;
3290 if (tso_maxlen > sc->hn_ndis_tso_szmax)
3291 tso_maxlen = sc->hn_ndis_tso_szmax;
3292 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3294 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
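/*
 * Worked example (illustrative numbers): with mtu = 1500 and
 * hn_ndis_tso_sgmin = 2, tso_minlen is 3000.  A tso_maxlen of 70000
 * would first be clamped to IP_MAXPACKET (65535), then to
 * hn_ndis_tso_szmax, and 18 bytes (Ethernet + VLAN header) are
 * subtracted before publishing if_hw_tsomax.
 */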
3298 hn_fixup_tx_data(struct hn_softc *sc)
3300 uint64_t csum_assist;
3303 hn_set_chim_size(sc, sc->hn_chim_szmax);
3304 if (hn_tx_chimney_size > 0 &&
3305 hn_tx_chimney_size < sc->hn_chim_szmax)
3306 hn_set_chim_size(sc, hn_tx_chimney_size);
3309 if (sc->hn_caps & HN_CAP_IPCS)
3310 csum_assist |= CSUM_IP;
3311 if (sc->hn_caps & HN_CAP_TCP4CS)
3312 csum_assist |= CSUM_IP_TCP;
3313 if (sc->hn_caps & HN_CAP_UDP4CS)
3314 csum_assist |= CSUM_IP_UDP;
3316 if (sc->hn_caps & HN_CAP_TCP6CS)
3317 csum_assist |= CSUM_IP6_TCP;
3318 if (sc->hn_caps & HN_CAP_UDP6CS)
3319 csum_assist |= CSUM_IP6_UDP;
3321 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3322 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3324 if (sc->hn_caps & HN_CAP_HASHVAL) {
3326 * Support HASHVAL pktinfo on TX path.
3329 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3330 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3331 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3336 hn_destroy_tx_data(struct hn_softc *sc)
3340 if (sc->hn_chim != NULL) {
3341 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3345 if (sc->hn_tx_ring_cnt == 0)
3348 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3349 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3351 free(sc->hn_tx_ring, M_DEVBUF);
3352 sc->hn_tx_ring = NULL;
3354 sc->hn_tx_ring_cnt = 0;
3355 sc->hn_tx_ring_inuse = 0;
3359 hn_start_taskfunc(void *xtxr, int pending __unused)
3361 struct hn_tx_ring *txr = xtxr;
3363 mtx_lock(&txr->hn_tx_lock);
3364 hn_start_locked(txr, 0);
3365 mtx_unlock(&txr->hn_tx_lock);
3369 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
3371 struct hn_tx_ring *txr = xtxr;
3373 mtx_lock(&txr->hn_tx_lock);
3374 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
3375 hn_start_locked(txr, 0);
3376 mtx_unlock(&txr->hn_tx_lock);
3380 hn_xmit(struct hn_tx_ring *txr, int len)
3382 struct hn_softc *sc = txr->hn_sc;
3383 struct ifnet *ifp = sc->hn_ifp;
3384 struct mbuf *m_head;
3386 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3387 KASSERT(hn_use_if_start == 0,
3388 ("hn_xmit is called, when if_start is enabled"));
3390 if (__predict_false(txr->hn_suspended))
3393 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
3396 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
3397 struct hn_txdesc *txd;
3400 if (len > 0 && m_head->m_pkthdr.len > len) {
3402 * This send could be time consuming; let the caller
3403 * dispatch this packet send (and the sends of any
3404 * follow-up packets) to the TX taskqueue.
3406 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3410 txd = hn_txdesc_get(txr);
3412 txr->hn_no_txdescs++;
3413 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3414 txr->hn_oactive = 1;
3418 error = hn_encap(txr, txd, &m_head);
3420 /* Both txd and m_head are freed; discard */
3421 drbr_advance(ifp, txr->hn_mbuf_br);
3425 error = hn_txpkt(ifp, txr, txd);
3426 if (__predict_false(error)) {
3427 /* txd is freed, but m_head is not */
3428 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3429 txr->hn_oactive = 1;
3434 drbr_advance(ifp, txr->hn_mbuf_br);
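/*
 * Editor's note on the drbr protocol above: drbr_peek() leaves the
 * mbuf on the ring, so transient failures may drbr_putback() it for a
 * later retry, while drbr_advance() is called only once the mbuf has
 * been consumed (sent or freed).  Each peek is paired with exactly
 * one of the two.
 */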
3440 hn_transmit(struct ifnet *ifp, struct mbuf *m)
3442 struct hn_softc *sc = ifp->if_softc;
3443 struct hn_tx_ring *txr;
3447 * Select the TX ring based on flowid
3449 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
3450 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
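/* E.g. flowid 13 with 4 TX rings in use selects ring 13 % 4 = 1. */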
3451 txr = &sc->hn_tx_ring[idx];
3453 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
3455 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
3459 if (txr->hn_oactive)
3462 if (txr->hn_sched_tx)
3465 if (mtx_trylock(&txr->hn_tx_lock)) {
3468 sched = hn_xmit(txr, txr->hn_direct_tx_size);
3469 mtx_unlock(&txr->hn_tx_lock);
3474 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
3479 hn_tx_ring_qflush(struct hn_tx_ring *txr)
3483 mtx_lock(&txr->hn_tx_lock);
3484 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
3486 mtx_unlock(&txr->hn_tx_lock);
3490 hn_xmit_qflush(struct ifnet *ifp)
3492 struct hn_softc *sc = ifp->if_softc;
3495 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3496 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
3501 hn_xmit_txeof(struct hn_tx_ring *txr)
3504 if (txr->hn_sched_tx)
3507 if (mtx_trylock(&txr->hn_tx_lock)) {
3510 txr->hn_oactive = 0;
3511 sched = hn_xmit(txr, txr->hn_direct_tx_size);
3512 mtx_unlock(&txr->hn_tx_lock);
3514 taskqueue_enqueue(txr->hn_tx_taskq,
3520 * Release oactive early, in the hope that others
3521 * can catch up.  The task will clear oactive again
3522 * with the hn_tx_lock held to avoid possible races.
3525 txr->hn_oactive = 0;
3526 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3531 hn_xmit_taskfunc(void *xtxr, int pending __unused)
3533 struct hn_tx_ring *txr = xtxr;
3535 mtx_lock(&txr->hn_tx_lock);
3537 mtx_unlock(&txr->hn_tx_lock);
3541 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
3543 struct hn_tx_ring *txr = xtxr;
3545 mtx_lock(&txr->hn_tx_lock);
3546 txr->hn_oactive = 0;
3548 mtx_unlock(&txr->hn_tx_lock);
3552 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
3554 struct vmbus_chan_br cbr;
3555 struct hn_rx_ring *rxr;
3556 struct hn_tx_ring *txr = NULL;
3559 idx = vmbus_chan_subidx(chan);
3562 * Link this channel to RX/TX ring.
3564 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
3565 ("invalid channel index %d, should be >= 0 && < %d",
3566 idx, sc->hn_rx_ring_inuse));
3567 rxr = &sc->hn_rx_ring[idx];
3568 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
3569 ("RX ring %d already attached", idx));
3570 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
3573 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
3574 idx, vmbus_chan_id(chan));
3577 if (idx < sc->hn_tx_ring_inuse) {
3578 txr = &sc->hn_tx_ring[idx];
3579 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
3580 ("TX ring %d already attached", idx));
3581 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
3583 txr->hn_chan = chan;
3585 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
3586 idx, vmbus_chan_id(chan));
3590 /* Bind this channel to a proper CPU. */
3591 vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus);
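/* E.g. hn_cpu = 2, sub-channel index 1 and 4 CPUs binds the channel to CPU 3. */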
3596 cbr.cbr = rxr->hn_br;
3597 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
3598 cbr.cbr_txsz = HN_TXBR_SIZE;
3599 cbr.cbr_rxsz = HN_RXBR_SIZE;
3600 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
3602 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
3603 vmbus_chan_id(chan), error);
3604 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
3606 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
3612 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
3614 struct hn_rx_ring *rxr;
3617 idx = vmbus_chan_subidx(chan);
3620 * Link this channel to RX/TX ring.
3622 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
3623 ("invalid channel index %d, should be >= 0 && < %d",
3624 idx, sc->hn_rx_ring_inuse));
3625 rxr = &sc->hn_rx_ring[idx];
3626 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
3627 ("RX ring %d is not attached", idx));
3628 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
3630 if (idx < sc->hn_tx_ring_inuse) {
3631 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
3633 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
3634 ("TX ring %d is not attached", idx));
3635 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
3639 * Close this channel.
3642 * Channel closing does _not_ destroy the target channel.
3644 vmbus_chan_close(chan);
3648 hn_attach_subchans(struct hn_softc *sc)
3650 struct vmbus_channel **subchans;
3651 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
3654 if (subchan_cnt == 0)
3657 /* Attach the sub-channels. */
3658 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
3659 for (i = 0; i < subchan_cnt; ++i) {
3660 error = hn_chan_attach(sc, subchans[i]);
3664 vmbus_subchan_rel(subchans, subchan_cnt);
3667 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
3670 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
3678 hn_detach_allchans(struct hn_softc *sc)
3680 struct vmbus_channel **subchans;
3681 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
3684 if (subchan_cnt == 0)
3687 /* Detach the sub-channels. */
3688 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
3689 for (i = 0; i < subchan_cnt; ++i)
3690 hn_chan_detach(sc, subchans[i]);
3691 vmbus_subchan_rel(subchans, subchan_cnt);
3695 * Detach the primary channel, _after_ all sub-channels are detached.
3698 hn_chan_detach(sc, sc->hn_prichan);
3700 /* Wait for sub-channels to be destroyed, if any. */
3701 vmbus_subchan_drain(sc->hn_prichan);
3704 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3705 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
3706 HN_RX_FLAG_ATTACHED) == 0,
3707 ("%dth RX ring is still attached", i));
3709 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3710 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
3711 HN_TX_FLAG_ATTACHED) == 0,
3712 ("%dth TX ring is still attached", i));
3718 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
3720 struct vmbus_channel **subchans;
3721 int nchan, rxr_cnt, error;
3723 nchan = *nsubch + 1;
3726 * Multiple RX/TX rings are not requested.
3733 * Query RSS capabilities, e.g. # of RX rings and # of indirect table entries.
3736 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
3738 /* No RSS; this is benign. */
3743 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
3747 if (nchan > rxr_cnt)
3750 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
3756 * Allocate sub-channels from NVS.
3758 *nsubch = nchan - 1;
3759 error = hn_nvs_alloc_subchans(sc, nsubch);
3760 if (error || *nsubch == 0) {
3761 /* Failed to allocate sub-channels. */
3767 * Wait for all sub-channels to become ready before moving on.
3769 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
3770 vmbus_subchan_rel(subchans, *nsubch);
3775 hn_synth_attach(struct hn_softc *sc, int mtu)
3777 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
3778 int error, nsubch, nchan, i;
3781 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
3782 ("synthetic parts were attached"));
3784 /* Save capabilities for later verification. */
3785 old_caps = sc->hn_caps;
3788 /* Clear RSS stuffs. */
3789 sc->hn_rss_ind_size = 0;
3790 sc->hn_rss_hash = 0;
3793 * Attach the primary channel _before_ attaching NVS and RNDIS.
3795 error = hn_chan_attach(sc, sc->hn_prichan);
3802 error = hn_nvs_attach(sc, mtu);
3807 * Attach RNDIS _after_ NVS is attached.
3809 error = hn_rndis_attach(sc, mtu);
3814 * Make sure capabilities are not changed.
3816 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
3817 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
3818 old_caps, sc->hn_caps);
3819 /* Restore old capabilities and abort. */
3820 sc->hn_caps = old_caps;
3825 * Allocate sub-channels for multi-TX/RX rings.
3828 * The # of RX rings that can be used is equivalent to the # of
3829 * channels to be requested.
3831 nsubch = sc->hn_rx_ring_cnt - 1;
3832 error = hn_synth_alloc_subchans(sc, &nsubch);
3838 /* Only the primary channel can be used; done */
3843 * Configure RSS key and indirect table _after_ all sub-channels are allocated.
3847 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
3849 * RSS key is not set yet; set it to the default RSS key.
3852 if_printf(sc->hn_ifp, "setup default RSS key\n");
3853 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
3854 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3857 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
3859 * RSS indirect table is not set yet; set it up in round-robin fashion.
3863 if_printf(sc->hn_ifp, "setup default RSS indirect "
3866 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
3867 rss->rss_ind[i] = i % nchan;
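/* E.g. with nchan = 4 the indirect table reads 0, 1, 2, 3, 0, 1, ... */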
3868 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3871 * # of usable channels may be changed, so we have to
3872 * make sure that all entries in the RSS indirect table are valid.
3875 hn_rss_ind_fixup(sc, nchan);
3878 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
3881 * Failed to configure RSS key or indirect table; only
3882 * the primary channel can be used.
3888 * Set the # of TX/RX rings that could be used according to
3889 * the # of channels that NVS offered.
3891 hn_set_ring_inuse(sc, nchan);
3894 * Attach the sub-channels, if any.
3896 error = hn_attach_subchans(sc);
3900 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
3906 * The interface must have been suspended through hn_suspend() before
3907 * this function is called.
3910 hn_synth_detach(struct hn_softc *sc)
3914 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
3915 ("synthetic parts were not attached"));
3917 /* Detach the RNDIS first. */
3918 hn_rndis_detach(sc);
3923 /* Detach all of the channels. */
3924 hn_detach_allchans(sc);
3926 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
3930 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
3932 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
3933 ("invalid ring count %d", ring_cnt));
3935 if (sc->hn_tx_ring_cnt > ring_cnt)
3936 sc->hn_tx_ring_inuse = ring_cnt;
3938 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3939 sc->hn_rx_ring_inuse = ring_cnt;
3942 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
3943 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
3948 hn_chan_drain(struct vmbus_channel *chan)
3951 while (!vmbus_chan_rx_empty(chan) || !vmbus_chan_tx_empty(chan))
3953 vmbus_chan_intr_drain(chan);
3957 hn_suspend_data(struct hn_softc *sc)
3959 struct vmbus_channel **subch = NULL;
3967 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
3968 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
3970 mtx_lock(&txr->hn_tx_lock);
3971 txr->hn_suspended = 1;
3972 mtx_unlock(&txr->hn_tx_lock);
3973 /* No one is able to send more packets now. */
3975 /* Wait for all pending sends to finish. */
3976 while (hn_tx_ring_pending(txr))
3977 pause("hnwtx", 1 /* 1 tick */);
3979 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
3980 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
3984 * Disable RX by clearing RX filter.
3986 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
3987 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);
3990 * Give RNDIS enough time to flush all pending data packets.
3992 pause("waitrx", (200 * hz) / 1000);
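/* (200 * hz) / 1000 converts 200ms into scheduler ticks. */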
3995 * Drain RX/TX bufrings and interrupts.
3997 nsubch = sc->hn_rx_ring_inuse - 1;
3999 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4001 if (subch != NULL) {
4002 for (i = 0; i < nsubch; ++i)
4003 hn_chan_drain(subch[i]);
4005 hn_chan_drain(sc->hn_prichan);
4008 vmbus_subchan_rel(subch, nsubch);
4012 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
4015 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
4019 hn_suspend_mgmt(struct hn_softc *sc)
4026 * Make sure that hn_mgmt_taskq0 can no longer be accessed
4027 * through hn_mgmt_taskq.
4029 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
4030 vmbus_chan_run_task(sc->hn_prichan, &task);
4033 * Make sure that all pending management tasks are completed.
4035 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
4036 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
4037 taskqueue_drain_all(sc->hn_mgmt_taskq0);
4041 hn_suspend(struct hn_softc *sc)
4044 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4045 hn_suspend_data(sc);
4046 hn_suspend_mgmt(sc);
4050 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
4054 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
4055 ("invalid TX ring count %d", tx_ring_cnt));
4057 for (i = 0; i < tx_ring_cnt; ++i) {
4058 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4060 mtx_lock(&txr->hn_tx_lock);
4061 txr->hn_suspended = 0;
4062 mtx_unlock(&txr->hn_tx_lock);
4067 hn_resume_data(struct hn_softc *sc)
4076 hn_set_rxfilter(sc);
4079 * Make sure to clear suspend status on "all" TX rings,
4080 * since hn_tx_ring_inuse can be changed after
4081 * hn_suspend_data().
4083 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
4085 if (!hn_use_if_start) {
4087 * Flush unused drbrs, since hn_tx_ring_inuse may have been changed.
4090 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
4091 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4097 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4098 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4101 * Use the txeof task, so that any pending oactive can be cleared.
4104 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4109 hn_resume_mgmt(struct hn_softc *sc)
4112 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
4115 * Kick off network change detection, if it was pending.
4116 * If no network change was pending, start link status
4117 * checks, which are more lightweight than network change detection.
4120 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
4121 hn_change_network(sc);
4123 hn_update_link_status(sc);
4127 hn_resume(struct hn_softc *sc)
4130 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4136 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
4138 const struct rndis_status_msg *msg;
4141 if (dlen < sizeof(*msg)) {
4142 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
4147 switch (msg->rm_status) {
4148 case RNDIS_STATUS_MEDIA_CONNECT:
4149 case RNDIS_STATUS_MEDIA_DISCONNECT:
4150 hn_update_link_status(sc);
4153 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
4154 /* Not really useful; ignore. */
4157 case RNDIS_STATUS_NETWORK_CHANGE:
4158 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
4159 if (dlen < ofs + msg->rm_stbuflen ||
4160 msg->rm_stbuflen < sizeof(uint32_t)) {
4161 if_printf(sc->hn_ifp, "network changed\n");
4165 memcpy(&change, ((const uint8_t *)msg) + ofs,
4167 if_printf(sc->hn_ifp, "network changed, change %u\n",
4170 hn_change_network(sc);
4174 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
4181 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
4183 const struct rndis_pktinfo *pi = info_data;
4186 while (info_dlen != 0) {
4190 if (__predict_false(info_dlen < sizeof(*pi)))
4192 if (__predict_false(info_dlen < pi->rm_size))
4194 info_dlen -= pi->rm_size;
4196 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
4198 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
4200 dlen = pi->rm_size - pi->rm_pktinfooffset;
4203 switch (pi->rm_type) {
4204 case NDIS_PKTINFO_TYPE_VLAN:
4205 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
4207 info->vlan_info = *((const uint32_t *)data);
4208 mask |= HN_RXINFO_VLAN;
4211 case NDIS_PKTINFO_TYPE_CSUM:
4212 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
4214 info->csum_info = *((const uint32_t *)data);
4215 mask |= HN_RXINFO_CSUM;
4218 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
4219 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
4221 info->hash_value = *((const uint32_t *)data);
4222 mask |= HN_RXINFO_HASHVAL;
4225 case HN_NDIS_PKTINFO_TYPE_HASHINF:
4226 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
4228 info->hash_info = *((const uint32_t *)data);
4229 mask |= HN_RXINFO_HASHINF;
4236 if (mask == HN_RXINFO_ALL) {
4237 /* All found; done */
4241 pi = (const struct rndis_pktinfo *)
4242 ((const uint8_t *)pi + pi->rm_size);
4247 * - If there is no hash value, invalidate the hash info.
4249 if ((mask & HN_RXINFO_HASHVAL) == 0)
4250 info->hash_info = HN_NDIS_HASH_INFO_INVALID;
4254 static __inline bool
4255 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
4258 if (off < check_off) {
4259 if (__predict_true(off + len <= check_off))
4261 } else if (off > check_off) {
4262 if (__predict_true(check_off + check_len <= off))
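/*
 * I.e. [off, off + len) and [check_off, check_off + check_len)
 * overlap unless one range ends at or before the start of the other;
 * e.g. (0, 8) vs (8, 4) do not overlap, while (0, 8) vs (4, 4) do.
 */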
4269 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
4271 const struct rndis_packet_msg *pkt;
4272 struct hn_rxinfo info;
4273 int data_off, pktinfo_off, data_len, pktinfo_len;
4278 if (__predict_false(dlen < sizeof(*pkt))) {
4279 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
4284 if (__predict_false(dlen < pkt->rm_len)) {
4285 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
4286 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
4289 if (__predict_false(pkt->rm_len <
4290 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
4291 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
4292 "msglen %u, data %u, oob %u, pktinfo %u\n",
4293 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
4294 pkt->rm_pktinfolen);
4297 if (__predict_false(pkt->rm_datalen == 0)) {
4298 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
4305 #define IS_OFFSET_INVALID(ofs) \
4306 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
4307 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
4309 /* XXX Hyper-V does not meet data offset alignment requirement */
4310 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
4311 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4312 "data offset %u\n", pkt->rm_dataoffset);
4315 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
4316 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
4317 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4318 "oob offset %u\n", pkt->rm_oobdataoffset);
4321 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
4322 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
4323 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4324 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
4328 #undef IS_OFFSET_INVALID
4330 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
4331 data_len = pkt->rm_datalen;
4332 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
4333 pktinfo_len = pkt->rm_pktinfolen;
4336 * Check OOB coverage.
4338 if (__predict_false(pkt->rm_oobdatalen != 0)) {
4339 int oob_off, oob_len;
4341 if_printf(rxr->hn_ifp, "got oobdata\n");
4342 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
4343 oob_len = pkt->rm_oobdatalen;
4345 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
4346 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4347 "oob overflow, msglen %u, oob abs %d len %d\n",
4348 pkt->rm_len, oob_off, oob_len);
4353 * Check against data.
4355 if (hn_rndis_check_overlap(oob_off, oob_len,
4356 data_off, data_len)) {
4357 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4358 "oob overlaps data, oob abs %d len %d, "
4359 "data abs %d len %d\n",
4360 oob_off, oob_len, data_off, data_len);
4365 * Check against pktinfo.
4367 if (pktinfo_len != 0 &&
4368 hn_rndis_check_overlap(oob_off, oob_len,
4369 pktinfo_off, pktinfo_len)) {
4370 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4371 "oob overlaps pktinfo, oob abs %d len %d, "
4372 "pktinfo abs %d len %d\n",
4373 oob_off, oob_len, pktinfo_off, pktinfo_len);
4379 * Check per-packet-info coverage and find useful per-packet-info.
4381 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
4382 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
4383 info.hash_info = HN_NDIS_HASH_INFO_INVALID;
4384 if (__predict_true(pktinfo_len != 0)) {
4388 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
4389 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4390 "pktinfo overflow, msglen %u, "
4391 "pktinfo abs %d len %d\n",
4392 pkt->rm_len, pktinfo_off, pktinfo_len);
4397 * Check packet info coverage.
4399 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
4400 data_off, data_len);
4401 if (__predict_false(overlap)) {
4402 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4403 "pktinfo overlap data, pktinfo abs %d len %d, "
4404 "data abs %d len %d\n",
4405 pktinfo_off, pktinfo_len, data_off, data_len);
4410 * Find useful per-packet-info.
4412 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
4413 pktinfo_len, &info);
4414 if (__predict_false(error)) {
4415 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
4421 if (__predict_false(data_off + data_len > pkt->rm_len)) {
4422 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4423 "data overflow, msglen %u, data abs %d len %d\n",
4424 pkt->rm_len, data_off, data_len);
4427 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
4430 static __inline void
4431 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
4433 const struct rndis_msghdr *hdr;
4435 if (__predict_false(dlen < sizeof(*hdr))) {
4436 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
4441 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
4442 /* Hot data path. */
4443 hn_rndis_rx_data(rxr, data, dlen);
4448 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
4449 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
4451 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
4455 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
4457 const struct hn_nvs_hdr *hdr;
4459 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
4460 if_printf(sc->hn_ifp, "invalid nvs notify\n");
4463 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
4465 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
4466 /* Useless; ignore */
4469 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
4473 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
4474 const struct vmbus_chanpkt_hdr *pkt)
4476 struct hn_nvs_sendctx *sndc;
4478 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
4479 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
4480 VMBUS_CHANPKT_DATALEN(pkt));
4483 * 'sndc' CAN NOT be accessed anymore, since it may have been freed by its callback.
4489 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
4490 const struct vmbus_chanpkt_hdr *pkthdr)
4492 const struct vmbus_chanpkt_rxbuf *pkt;
4493 const struct hn_nvs_hdr *nvs_hdr;
4496 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
4497 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
4500 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
4502 /* Make sure that this is a RNDIS message. */
4503 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
4504 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
4509 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
4510 if (__predict_false(hlen < sizeof(*pkt))) {
4511 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
4514 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
4516 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
4517 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
4522 count = pkt->cp_rxbuf_cnt;
4523 if (__predict_false(hlen <
4524 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
4525 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
4529 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
4530 for (i = 0; i < count; ++i) {
4533 ofs = pkt->cp_rxbuf[i].rb_ofs;
4534 len = pkt->cp_rxbuf[i].rb_len;
4535 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
4536 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
4537 "ofs %d, len %d\n", i, ofs, len);
4540 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
4544 * Ack the consumed RXBUF associated w/ this channel packet,
4545 * so that this RXBUF can be recycled by the hypervisor.
4547 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
4551 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
4554 struct hn_nvs_rndis_ack ack;
4557 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
4558 ack.nvs_status = HN_NVS_STATUS_OK;
4562 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
4563 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
4564 if (__predict_false(error == EAGAIN)) {
4567 * This should _not_ happen in the real world, since the
4568 * consumption of the TX bufring from the TX path is controlled.
4571 if (rxr->hn_ack_failed == 0)
4572 if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
4573 rxr->hn_ack_failed++;
4580 if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
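/*
 * Per-channel receive callback: drains the channel of packets,
 * growing the temporary packet buffer on ENOBUFS, and dispatches each
 * packet by type (completion, RXBUF, or inband notify).
 */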
4585 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
4587 struct hn_rx_ring *rxr = xrxr;
4588 struct hn_softc *sc = rxr->hn_ifp->if_softc;
4591 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
4594 pktlen = rxr->hn_pktbuf_len;
4595 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
4596 if (__predict_false(error == ENOBUFS)) {
4601 * Expand channel packet buffer.
4604 * Use M_WAITOK here, since an allocation failure cannot be handled at this point.
4607 nlen = rxr->hn_pktbuf_len * 2;
4608 while (nlen < pktlen)
4610 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
4612 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
4613 rxr->hn_pktbuf_len, nlen);
4615 free(rxr->hn_pktbuf, M_DEVBUF);
4616 rxr->hn_pktbuf = nbuf;
4617 rxr->hn_pktbuf_len = nlen;
4620 } else if (__predict_false(error == EAGAIN)) {
4621 /* No more channel packets; done! */
4624 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
4626 switch (pkt->cph_type) {
4627 case VMBUS_CHANPKT_TYPE_COMP:
4628 hn_nvs_handle_comp(sc, chan, pkt);
4631 case VMBUS_CHANPKT_TYPE_RXBUF:
4632 hn_nvs_handle_rxbuf(rxr, chan, pkt);
4635 case VMBUS_CHANPKT_TYPE_INBAND:
4636 hn_nvs_handle_notify(sc, pkt);
4640 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
4645 hn_chan_rollup(rxr, rxr->hn_txr);
4649 hn_tx_taskq_create(void *arg __unused)
4652 if (vm_guest != VM_GUEST_HV)
4655 if (!hn_share_tx_taskq)
4658 hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
4659 taskqueue_thread_enqueue, &hn_tx_taskq);
4660 taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
4661 if (hn_bind_tx_taskq >= 0) {
4662 int cpu = hn_bind_tx_taskq;
4663 struct task cpuset_task;
4666 if (cpu > mp_ncpus - 1)
4668 CPU_SETOF(cpu, &cpu_set);
4669 TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, &cpu_set);
4670 taskqueue_enqueue(hn_tx_taskq, &cpuset_task);
4671 taskqueue_drain(hn_tx_taskq, &cpuset_task);
4674 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
4675 hn_tx_taskq_create, NULL);
4678 hn_tx_taskq_destroy(void *arg __unused)
4681 if (hn_tx_taskq != NULL)
4682 taskqueue_free(hn_tx_taskq);
4684 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
4685 hn_tx_taskq_destroy, NULL);