/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet6.h"
#include "opt_inet.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"
#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)					\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
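
/*
 * A TX descriptor tracks one in-flight RNDIS packet message: the mbuf
 * chain being transmitted, the DMA map or chimney sending buffer slot
 * backing it, and any other txdescs that were aggregated into it
 * (agg_list, kept in sending order).
 */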
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);

static void			hn_stop(struct hn_softc *);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static int			hn_set_rxfilter(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust tcp segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust udp datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

/* # of TX taskqueues */
static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

static u_int hn_cpu_index;	/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */
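
/*
 * Default RSS key, fed to the Toeplitz hash function when the
 * administrator does not override it through the per-device rss_key
 * sysctl.
 */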
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),

	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
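
/*
 * Allocate a chimney sending buffer slot.  The per-softc bitmap is
 * scanned for a clear bit, and atomic test-and-set makes the claim safe
 * against concurrent TX rings without holding a lock.  Returns
 * HN_NVS_CHIM_IDX_INVALID if all slots are in use.
 */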
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
#if defined(INET6) || defined(INET)
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);

#undef PULLUP_HDR
}
#endif	/* INET6 || INET */
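
/*
 * Translate the ifnet flags into an NDIS RX filter and program it into
 * the host through RNDIS.  The RNDIS round trip is skipped if the
 * filter did not change.
 */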
static int
hn_set_rxfilter(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (ifp->if_flags & IFF_PROMISC) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}
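
/*
 * Compute the TX aggregation limits as the minimum of the
 * administrator-requested values, the limits offered by RNDIS, and the
 * chimney sending buffer size, then propagate the results to all TX
 * rings under their locks.
 */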
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}

static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}

static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
	    0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};
static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
	    &g_net_vsc_device_type) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}

static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions, which will be called before
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if an error happens later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;
#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fixup TX stuff after synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);

	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */

#if __FreeBSD_version >= 1100045
	ifp->if_baudrate = IF_Gbps(10);
#else
	/* if_baudrate is 32bits on 32bit system. */
	ifp->if_baudrate = IF_Gbps(1);
#endif
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;

	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default, they still can
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	return (0);
failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}
static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp;

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		/*
		 * In case that the vmbus missed the orphan handler
		 * installation.
		 */
		vmbus_xact_ctx_orphan(sc->hn_xact);
	}

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * tasks have to be suspended manually here.
			 */
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);
		}
		HN_UNLOCK(sc);
		ether_ifdetach(ifp);
	}

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(sc->hn_tx_taskqs[i]);
		free(sc->hn_tx_taskqs, M_DEVBUF);
	}
	taskqueue_free(sc->hn_mgmt_taskq0);

	if (sc->hn_xact != NULL) {
		/*
		 * Uninstall the orphan handler _before_ the xact is
		 * destructed.
		 */
		vmbus_chan_unset_orphan(sc->hn_prichan);
		vmbus_xact_ctx_destroy(sc->hn_xact);
	}

	if_free(ifp);

	HN_LOCK_DESTROY(sc);
	return (0);
}
static int
hn_shutdown(device_t dev)
{

	return (0);
}

static void
hn_link_status(struct hn_softc *sc)
{
	uint32_t link_status;
	int error;

	error = hn_rndis_get_linkstatus(sc, &link_status);
	if (error) {
		/* XXX what to do? */
		return;
	}

	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
	else
		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp,
	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
	    LINK_STATE_UP : LINK_STATE_DOWN);
}
static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		return;
	hn_link_status(sc);
}

static void
hn_netchg_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Prevent any link status checks from running. */
	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;

	/*
	 * Fake up a [link down --> link up] state change; a 5 second
	 * delay is used, which closely simulates the miibus reaction
	 * to a link down event.
	 */
	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
	    &sc->hn_netchg_status, 5 * hz);
}

static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Re-allow link status checks. */
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
	hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}
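
/*
 * Load the mbuf chain into the txdesc's data DMA map.  If the chain
 * needs more segments than HN_TX_DATA_SEGCNT_MAX, it is collapsed once
 * and the load is retried.
 */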
static int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m = *m_head;
	int error;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		struct mbuf *m_new;

		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		if (m_new == NULL)
			return ENOBUFS;
		else
			*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	}
	if (!error) {
		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
		    BUS_DMASYNC_PREWRITE);
		txd->flags |= HN_TXD_FLAG_DMAMAP;
	}
	return error;
}
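
/*
 * Drop one reference on the txdesc.  On the last reference any
 * aggregated txdescs are freed recursively, the chimney buffer slot or
 * DMA map is released, the mbuf is freed, and the txdesc goes back to
 * the free list.  Returns 1 if the txdesc was actually freed.
 */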
static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return 0;

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("recursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
	txr->hn_txdesc_avail++;
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	atomic_add_int(&txr->hn_txdesc_avail, 1);
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif

	return 1;
}
static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	txd = SLIST_FIRST(&txr->hn_txlist);
	if (txd != NULL) {
		KASSERT(txr->hn_txdesc_avail > 0,
		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
		txr->hn_txdesc_avail--;
		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
	}
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

	if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
		KASSERT(txd->m == NULL && txd->refs == 0 &&
		    STAILQ_EMPTY(&txd->agg_list) &&
		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
		    txd->chim_size == 0 &&
		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
		txd->flags &= ~HN_TXD_FLAG_ONLIST;
		txd->refs = 1;
	}
	return txd;
}

static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

	/* 0->1 transition will never work */
	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	atomic_add_int(&txd->refs, 1);
}

static __inline void
hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
{

	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("recursive aggregation on aggregating txdesc"));

	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("already aggregated"));
	KASSERT(STAILQ_EMPTY(&txd->agg_list),
	    ("recursive aggregation on to-be-aggregated txdesc"));

	txd->flags |= HN_TXD_FLAG_ONAGG;
	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
}

static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
	bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
		pending = true;
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	if (!buf_ring_full(txr->hn_txdesc_br))
		pending = true;
#endif
	return (pending);
}

static __inline void
hn_txeof(struct hn_tx_ring *txr)
{
	txr->hn_has_txeof = 0;
	txr->hn_txeof(txr);
}
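
/*
 * NVS send-completion callback.  Releases the txdesc and, after
 * HN_EARLY_TXEOF_THRESH completions on an output-active ring, kicks the
 * TX EOF processing early instead of waiting for the channel rollup.
 */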
static void
hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
	struct hn_txdesc *txd = sndc->hn_cbarg;
	struct hn_tx_ring *txr;

	txr = txd->txr;
	KASSERT(txr->hn_chan == chan,
	    ("channel mismatch, on chan%u, should be chan%u",
	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));

	txr->hn_has_txeof = 1;
	hn_txdesc_put(txr, txd);

	++txr->hn_txdone_cnt;
	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
		txr->hn_txdone_cnt = 0;
		if (txr->hn_oactive)
			hn_txeof(txr);
	}
}

static void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
	struct lro_ctrl *lro = &rxr->hn_lro;
	struct lro_entry *queued;

	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
		SLIST_REMOVE_HEAD(&lro->lro_active, next);
		tcp_lro_flush(lro, queued);
	}
#endif

	/*
	 * NOTE:
	 * 'txr' could be NULL, if multiple channels and
	 * the ifnet.if_start method are enabled.
	 */
	if (txr == NULL || !txr->hn_has_txeof)
		return;

	txr->hn_txdone_cnt = 0;
	hn_txeof(txr);
}

static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
	    ("invalid RNDIS packet msg offset %u", ofs));
	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}
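
/*
 * Append a per-packet-info record to the RNDIS packet message under
 * construction and return a pointer to the record's data area, which
 * the caller fills in (hash value, VLAN tag, LSO or checksum request).
 */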
static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
{
	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
	struct rndis_pktinfo *pi;

	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

	/*
	 * Per-packet-info does not move; it only grows.
	 *
	 * NOTE:
	 * rm_pktinfooffset in this phase counts from the beginning
	 * of rndis_packet_msg.
	 */
	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
	    pkt->rm_pktinfolen);
	pkt->rm_pktinfolen += pi_size;

	pi->rm_size = pi_size;
	pi->rm_type = pi_type;
	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

	/* Data immediately follow per-packet-info. */
	pkt->rm_dataoffset += pi_size;

	/* Update RNDIS packet msg length */
	pkt->rm_len += pi_size;

	return (pi->rm_data);
}
static void
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;
	struct mbuf *m;
	int error, pkts;

	txd = txr->hn_agg_txd;
	KASSERT(txd != NULL, ("no aggregate txdesc"));

	/*
	 * Since hn_txpkt() will reset this temporary stat, save
	 * it now, so that oerrors can be updated properly, if
	 * hn_txpkt() ever fails.
	 */
	pkts = txr->hn_stat_pkts;

	/*
	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
	 * failure, save it for later freeing, if hn_txpkt() ever
	 * fails.
	 */
	m = txd->m;
	error = hn_txpkt(ifp, txr, txd);
	if (__predict_false(error)) {
		/* txd is freed, but m is not. */
		m_freem(m);

		txr->hn_flush_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
	}

	/* Reset all aggregation states. */
	txr->hn_agg_txd = NULL;
	txr->hn_agg_szleft = 0;
	txr->hn_agg_pktleft = 0;
	txr->hn_agg_prevpkt = NULL;
}
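
/*
 * Reserve chimney sending buffer space for a packet of 'pktsize' bytes.
 * If an open aggregating txdesc has room, the packet is appended to it;
 * otherwise any pending aggregate is flushed and a fresh chimney slot
 * is claimed, possibly opening a new aggregate.  Returns a pointer into
 * the chimney buffer, or NULL if the caller must fall back to the
 * scatter-gather path.
 */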
static void *
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    int pktsize)
{
	void *chim;

	if (txr->hn_agg_txd != NULL) {
		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
			int olen;

			/*
			 * Update the previous RNDIS packet's total length,
			 * it can be increased due to the mandatory alignment
			 * padding for this RNDIS packet.  And update the
			 * aggregating txdesc's chimney sending buffer size
			 * accordingly.
			 *
			 * NOTE:
			 * Zero-out the padding, as required by the RNDIS spec.
			 */
			olen = pkt->rm_len;
			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
			agg_txd->chim_size += pkt->rm_len - olen;

			/* Link this txdesc to the parent. */
			hn_txdesc_agg(agg_txd, txd);

			chim = (uint8_t *)pkt + pkt->rm_len;
			/* Save the current packet for later fixup. */
			txr->hn_agg_prevpkt = chim;

			txr->hn_agg_pktleft--;
			txr->hn_agg_szleft -= pktsize;
			if (txr->hn_agg_szleft <=
			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
				/*
				 * Probably can't aggregate more packets,
				 * flush this aggregating txdesc proactively.
				 */
				txr->hn_agg_pktleft = 0;
			}
			return (chim);
		}
		hn_flush_txagg(ifp, txr);
	}
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	txr->hn_tx_chimney_tried++;
	txd->chim_index = hn_chim_alloc(txr->hn_sc);
	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
		return (NULL);
	txr->hn_tx_chimney++;

	chim = txr->hn_sc->hn_chim +
	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);

	if (txr->hn_agg_pktmax > 1 &&
	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
		txr->hn_agg_txd = txd;
		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
		txr->hn_agg_prevpkt = chim;
	}
	return (chim);
}
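
/*
 * Build the RNDIS packet message for the mbuf: fill in per-packet-info
 * (hash value, VLAN tag, LSO or checksum requests), then either copy
 * everything into a chimney sending buffer or set up the GPA list for
 * scatter-gather transmission.
 */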
/*
 * NOTE:
 * If this function fails, then both txd and m_head0 will be freed.
 */
static int
hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head0)
{
	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
	int error, nsegs, i;
	struct mbuf *m_head = *m_head0;
	struct rndis_packet_msg *pkt;
	uint32_t *pi_data;
	void *chim = NULL;
	int pkt_hlen, pkt_size;

	pkt = txd->rndis_pkt;
	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
	if (pkt_size < txr->hn_chim_size) {
		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
		if (chim != NULL)
			pkt = chim;
	} else {
		if (txr->hn_agg_txd != NULL)
			hn_flush_txagg(ifp, txr);
	}

	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
	pkt->rm_dataoffset = sizeof(*pkt);
	pkt->rm_datalen = m_head->m_pkthdr.len;
	pkt->rm_oobdataoffset = 0;
	pkt->rm_oobdatalen = 0;
	pkt->rm_oobdataelements = 0;
	pkt->rm_pktinfooffset = sizeof(*pkt);
	pkt->rm_pktinfolen = 0;
	pkt->rm_vchandle = 0;
	pkt->rm_reserved = 0;

	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
		/*
		 * Set the hash value for this packet, so that the host could
		 * dispatch the TX done event for this packet back to this TX
		 * ring's channel.
		 */
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
		*pi_data = txr->hn_tx_idx;
	}

	if (m_head->m_flags & M_VLANTAG) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
		*pi_data = NDIS_VLAN_INFO_MAKE(
		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
	}

	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
#ifdef INET
		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET6
		{
			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#endif	/* INET6 || INET */
	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
		if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
			*pi_data = NDIS_TXCSUM_INFO_IPV6;
		} else {
			*pi_data = NDIS_TXCSUM_INFO_IPV4;
			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
		}

		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
		else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP6_UDP))
			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
	}

	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
	/* Convert RNDIS packet message offsets */
	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);

	/*
	 * Fast path: Chimney sending.
	 */
	if (chim != NULL) {
		struct hn_txdesc *tgt_txd = txd;

		if (txr->hn_agg_txd != NULL) {
			tgt_txd = txr->hn_agg_txd;
		}

		KASSERT(pkt == chim,
		    ("RNDIS pkt not in chimney sending buffer"));
		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
		    ("chimney sending buffer is not used"));
		tgt_txd->chim_size += pkt->rm_len;

		m_copydata(m_head, 0, m_head->m_pkthdr.len,
		    ((uint8_t *)chim) + pkt_hlen);

		txr->hn_gpa_cnt = 0;
		txr->hn_sendpkt = hn_txpkt_chim;
		goto done;
	}

	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
	    ("chimney buffer is used"));
	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));

	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
	if (__predict_false(error)) {
		int freed;

		/*
		 * This mbuf is not linked w/ the txd yet, so free it now.
		 */
		m_freem(m_head);
		*m_head0 = NULL;

		freed = hn_txdesc_put(txr, txd);
		KASSERT(freed != 0,
		    ("fail to free txd upon txdma error"));

		txr->hn_txdma_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
		return error;
	}
	*m_head0 = m_head;

	/* +1 RNDIS packet message */
	txr->hn_gpa_cnt = nsegs + 1;

	/* send packet with page buffer */
	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
	txr->hn_gpa[0].gpa_len = pkt_hlen;

	/*
	 * Fill the page buffers with mbuf info after the page
	 * buffer for RNDIS packet message.
	 */
	for (i = 0; i < nsegs; ++i) {
		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];

		gpa->gpa_page = atop(segs[i].ds_addr);
		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
		gpa->gpa_len = segs[i].ds_len;
	}

	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
	txd->chim_size = 0;
	txr->hn_sendpkt = hn_txpkt_sglist;
done:
	txd->m = m_head;

	/* Set the completion routine */
	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);

	/* Update temporary stats for later use. */
	txr->hn_stat_pkts++;
	txr->hn_stat_size += m_head->m_pkthdr.len;
	if (m_head->m_flags & M_MCAST)
		txr->hn_stat_mcasts++;

	return 0;
}
/*
 * NOTE:
 * If this function fails, then txd will be freed, but the mbuf
 * associated w/ the txd will _not_ be freed.
 */
static int
hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	int error, send_failed = 0;

again:
	/*
	 * Make sure that this txd and any aggregated txds are not freed
	 * before ETHER_BPF_MTAP.
	 */
	hn_txdesc_hold(txd);
	error = txr->hn_sendpkt(txr, txd);
	if (!error) {
		if (bpf_peers_present(ifp->if_bpf)) {
			const struct hn_txdesc *tmp_txd;

			ETHER_BPF_MTAP(ifp, txd->m);
			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
				ETHER_BPF_MTAP(ifp, tmp_txd->m);
		}

		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
#ifdef HN_IFSTART_SUPPORT
		if (!hn_use_if_start)
#endif
		{
			if_inc_counter(ifp, IFCOUNTER_OBYTES,
			    txr->hn_stat_size);
			if (txr->hn_stat_mcasts != 0) {
				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
				    txr->hn_stat_mcasts);
			}
		}
		txr->hn_pkts += txr->hn_stat_pkts;
	}
	hn_txdesc_put(txr, txd);

	if (__predict_false(error)) {
		int freed;

		/*
		 * This should "really rarely" happen.
		 *
		 * XXX Too many RX to be acked or too many sideband
		 * commands to run?  Ask netvsc_channel_rollup()
		 * to kick start later.
		 */
		txr->hn_has_txeof = 1;
		if (!send_failed) {
			txr->hn_send_failed++;
			send_failed = 1;
			/*
			 * Try sending again after setting hn_has_txeof,
			 * in case that we missed the last
			 * netvsc_channel_rollup().
			 */
			goto again;
		}
		if_printf(ifp, "send failed\n");

		/*
		 * Caller will perform further processing on the
		 * associated mbuf, so don't free it in hn_txdesc_put();
		 * only unload it from the DMA map in hn_txdesc_put(),
		 * if it was loaded.
		 */
		txd->m = NULL;
		freed = hn_txdesc_put(txr, txd);
		KASSERT(freed,
		    ("fail to free txd upon send error"));

		txr->hn_send_failed++;
	}

	/* Reset temporary stats, after this sending is done. */
	txr->hn_stat_size = 0;
	txr->hn_stat_pkts = 0;
	txr->hn_stat_mcasts = 0;

	return (error);
}
/*
 * Append the specified data to the indicated mbuf chain; extend the
 * mbuf chain if the new data does not fit in existing space.
 *
 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
 * There should be an equivalent in the kernel mbuf code,
 * but there does not appear to be one yet.
 *
 * Differs from m_append() in that additional mbufs are
 * allocated with cluster size MJUMPAGESIZE, and filled
 * accordingly.
 *
 * Return 1 if able to complete the job; otherwise 0.
 */
static int
hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
{
	struct mbuf *m, *n;
	int remainder, space;

	for (m = m0; m->m_next != NULL; m = m->m_next)
		;
	remainder = len;
	space = M_TRAILINGSPACE(m);
	if (space > 0) {
		/*
		 * Copy into available space.
		 */
		if (space > remainder)
			space = remainder;
		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
		m->m_len += space;
		cp += space;
		remainder -= space;
	}
	while (remainder > 0) {
		/*
		 * Allocate a new mbuf; could check space
		 * and allocate a cluster instead.
		 */
		n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
		if (n == NULL)
			break;
		n->m_len = min(MJUMPAGESIZE, remainder);
		bcopy(cp, mtod(n, caddr_t), n->m_len);
		cp += n->m_len;
		remainder -= n->m_len;
		m->m_next = n;
		m = n;
	}
	if (m0->m_flags & M_PKTHDR)
		m0->m_pkthdr.len += len - remainder;

	return (remainder == 0);
}
#if defined(INET) || defined(INET6)
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
	if (hn_lro_mbufq_depth) {
		tcp_lro_queue_mbuf(lc, m);
		return 0;
	}
#endif
	return tcp_lro_rx(lc, m, 0);
}
#endif
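
/*
 * Deliver one received packet up the stack: copy it into an mbuf, apply
 * the host-provided (or trusted) checksum results, VLAN tag and RSS
 * hash, then hand the mbuf to LRO or if_input.
 */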
2088 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2089 const struct hn_rxinfo *info)
2091 struct ifnet *ifp = rxr->hn_ifp;
2093 int size, do_lro = 0, do_csum = 1;
2094 int hash_type = M_HASHTYPE_OPAQUE;
2096 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
2100 * Bail out if packet contains more data than configured MTU.
2102 if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
2104 } else if (dlen <= MHLEN) {
2105 m_new = m_gethdr(M_NOWAIT, MT_DATA);
2106 if (m_new == NULL) {
2107 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2110 memcpy(mtod(m_new, void *), data, dlen);
2111 m_new->m_pkthdr.len = m_new->m_len = dlen;
2112 rxr->hn_small_pkts++;
2115 * Get an mbuf with a cluster. For packets 2K or less,
2116 * get a standard 2K cluster. For anything larger, get a
2117 * 4K cluster. Any buffers larger than 4K can cause problems
2118 * if looped around to the Hyper-V TX channel, so avoid them.
2121 if (dlen > MCLBYTES) {
2123 size = MJUMPAGESIZE;
2126 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2127 if (m_new == NULL) {
2128 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2132 hv_m_append(m_new, dlen, data);
2134 m_new->m_pkthdr.rcvif = ifp;
2136 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2139 /* receive side checksum offload */
2140 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2141 /* IP csum offload */
2142 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2143 m_new->m_pkthdr.csum_flags |=
2144 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2148 /* TCP/UDP csum offload */
2149 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2150 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2151 m_new->m_pkthdr.csum_flags |=
2152 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2153 m_new->m_pkthdr.csum_data = 0xffff;
2154 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2162 * As of this write (Oct 28th, 2016), host side will turn
2163 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2164 * the do_lro setting here is actually _not_ accurate. We
2165 * depend on the RSS hash type check to reset do_lro.
2167 if ((info->csum_info &
2168 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2169 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2172 const struct ether_header *eh;
2177 if (m_new->m_len < hoff)
2179 eh = mtod(m_new, struct ether_header *);
2180 etype = ntohs(eh->ether_type);
2181 if (etype == ETHERTYPE_VLAN) {
2182 const struct ether_vlan_header *evl;
2184 hoff = sizeof(*evl);
2185 if (m_new->m_len < hoff)
2187 evl = mtod(m_new, struct ether_vlan_header *);
2188 etype = ntohs(evl->evl_proto);
2191 if (etype == ETHERTYPE_IP) {
2194 pr = hn_check_iplen(m_new, hoff);
2195 if (pr == IPPROTO_TCP) {
2197 (rxr->hn_trust_hcsum &
2198 HN_TRUST_HCSUM_TCP)) {
2199 rxr->hn_csum_trusted++;
2200 m_new->m_pkthdr.csum_flags |=
2201 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2202 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2203 m_new->m_pkthdr.csum_data = 0xffff;
2206 } else if (pr == IPPROTO_UDP) {
2208 (rxr->hn_trust_hcsum &
2209 HN_TRUST_HCSUM_UDP)) {
2210 rxr->hn_csum_trusted++;
2211 m_new->m_pkthdr.csum_flags |=
2212 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2213 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2214 m_new->m_pkthdr.csum_data = 0xffff;
2216 } else if (pr != IPPROTO_DONE && do_csum &&
2217 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2218 rxr->hn_csum_trusted++;
2219 m_new->m_pkthdr.csum_flags |=
2220 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2225 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2226 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2227 NDIS_VLAN_INFO_ID(info->vlan_info),
2228 NDIS_VLAN_INFO_PRI(info->vlan_info),
2229 NDIS_VLAN_INFO_CFI(info->vlan_info));
2230 m_new->m_flags |= M_VLANTAG;
2233 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2235 m_new->m_pkthdr.flowid = info->hash_value;
2236 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2237 NDIS_HASH_FUNCTION_TOEPLITZ) {
2238 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2242 * do_lro is reset if the hash types are not TCP
2243 * related. See the comment in the above csum_flags
2244 * setup section.
2247 case NDIS_HASH_IPV4:
2248 hash_type = M_HASHTYPE_RSS_IPV4;
2252 case NDIS_HASH_TCP_IPV4:
2253 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2256 case NDIS_HASH_IPV6:
2257 hash_type = M_HASHTYPE_RSS_IPV6;
2261 case NDIS_HASH_IPV6_EX:
2262 hash_type = M_HASHTYPE_RSS_IPV6_EX;
2266 case NDIS_HASH_TCP_IPV6:
2267 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2270 case NDIS_HASH_TCP_IPV6_EX:
2271 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2276 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2278 M_HASHTYPE_SET(m_new, hash_type);
2281 * Note: Moved RX completion back to hv_nv_on_receive() so all
2282 * messages (not just data messages) will trigger a response.
2288 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2289 #if defined(INET) || defined(INET6)
2290 struct lro_ctrl *lro = &rxr->hn_lro;
2293 rxr->hn_lro_tried++;
2294 if (hn_lro_rx(lro, m_new) == 0) {
2302 /* We're not holding the lock here, so don't release it */
2303 (*ifp->if_input)(ifp, m_new);
2309 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2311 struct hn_softc *sc = ifp->if_softc;
2312 struct ifreq *ifr = (struct ifreq *)data;
2313 int mask, error = 0;
2317 if (ifr->ifr_mtu > HN_MTU_MAX) {
2324 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2329 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2330 /* Can't change MTU */
2336 if (ifp->if_mtu == ifr->ifr_mtu) {
2342 * Suspend this interface before the synthetic parts
2348 * Detach the synthetic parts, i.e. NVS and RNDIS.
2350 hn_synth_detach(sc);
2353 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2354 * with the new MTU setting.
2356 error = hn_synth_attach(sc, ifr->ifr_mtu);
2363 * Commit the requested MTU after the synthetic parts
2364 * have been successfully attached.
2366 ifp->if_mtu = ifr->ifr_mtu;
2369 * Make sure that various parameters based on MTU are
2370 * still valid after the MTU change.
2372 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2373 hn_set_chim_size(sc, sc->hn_chim_szmax);
2374 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2375 #if __FreeBSD_version >= 1100099
2376 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2377 HN_LRO_LENLIM_MIN(ifp))
2378 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2382 * All done! Resume the interface now.
2392 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2397 if (ifp->if_flags & IFF_UP) {
2398 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2400 * Caller might hold a mutex, e.g.
2401 * bpf; use busy-wait for the RNDIS
2402 * filter update.
2405 hn_set_rxfilter(sc);
2411 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2414 sc->hn_if_flags = ifp->if_flags;
2421 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2423 if (mask & IFCAP_TXCSUM) {
2424 ifp->if_capenable ^= IFCAP_TXCSUM;
2425 if (ifp->if_capenable & IFCAP_TXCSUM)
2426 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2428 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2430 if (mask & IFCAP_TXCSUM_IPV6) {
2431 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2432 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2433 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2435 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2438 /* TODO: flip RNDIS offload parameters for RXCSUM. */
2439 if (mask & IFCAP_RXCSUM)
2440 ifp->if_capenable ^= IFCAP_RXCSUM;
2442 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
2443 if (mask & IFCAP_RXCSUM_IPV6)
2444 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2447 if (mask & IFCAP_LRO)
2448 ifp->if_capenable ^= IFCAP_LRO;
2450 if (mask & IFCAP_TSO4) {
2451 ifp->if_capenable ^= IFCAP_TSO4;
2452 if (ifp->if_capenable & IFCAP_TSO4)
2453 ifp->if_hwassist |= CSUM_IP_TSO;
2455 ifp->if_hwassist &= ~CSUM_IP_TSO;
2457 if (mask & IFCAP_TSO6) {
2458 ifp->if_capenable ^= IFCAP_TSO6;
2459 if (ifp->if_capenable & IFCAP_TSO6)
2460 ifp->if_hwassist |= CSUM_IP6_TSO;
2462 ifp->if_hwassist &= ~CSUM_IP6_TSO;
2472 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2476 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2478 * Multicast uses a mutex; use busy-wait for
2479 * the RNDIS filter update.
2482 hn_set_rxfilter(sc);
2491 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2495 error = ether_ioctl(ifp, cmd, data);
2502 hn_stop(struct hn_softc *sc)
2504 struct ifnet *ifp = sc->hn_ifp;
2509 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2510 ("synthetic parts were not attached"));
2512 /* Clear RUNNING bit _before_ hn_suspend_data() */
2513 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2514 hn_suspend_data(sc);
2516 /* Clear OACTIVE bit. */
2517 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2518 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2519 sc->hn_tx_ring[i].hn_oactive = 0;
2523 hn_init_locked(struct hn_softc *sc)
2525 struct ifnet *ifp = sc->hn_ifp;
2530 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2533 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2536 /* Configure RX filter */
2537 hn_set_rxfilter(sc);
2539 /* Clear OACTIVE bit. */
2540 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2541 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2542 sc->hn_tx_ring[i].hn_oactive = 0;
2544 /* Clear TX 'suspended' bit. */
2545 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2547 /* Everything is ready; unleash! */
2548 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2554 struct hn_softc *sc = xsc;
2561 #if __FreeBSD_version >= 1100099
2564 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2566 struct hn_softc *sc = arg1;
2567 unsigned int lenlim;
2570 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2571 error = sysctl_handle_int(oidp, &lenlim, 0, req);
2572 if (error || req->newptr == NULL)
2576 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2577 lenlim > TCP_LRO_LENGTH_MAX) {
2581 hn_set_lro_lenlim(sc, lenlim);
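/*
 * Example (a sketch, assuming unit 0): this handler backs
 * dev.hn.0.lro_length_lim, e.g.
 *	sysctl dev.hn.0.lro_length_lim=65535
 * Values outside [HN_LRO_LENLIM_MIN(ifp), TCP_LRO_LENGTH_MAX] are
 * rejected above.
 */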
2588 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2590 struct hn_softc *sc = arg1;
2591 int ackcnt, error, i;
2594 * lro_ackcnt_lim is the append count limit;
2595 * +1 turns it into the aggregation limit.
2597 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2598 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2599 if (error || req->newptr == NULL)
2602 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2606 * Convert the aggregation limit back to the append
2607 * count limit.
2611 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2612 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
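/*
 * Example: writing 2 to dev.hn.UNIT.lro_ackcnt_lim permits at most
 * two pure ACKs to be aggregated; per the +1/-1 conversion above, it
 * is stored internally as the append count limit 1.
 */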
2620 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2622 struct hn_softc *sc = arg1;
2627 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2630 error = sysctl_handle_int(oidp, &on, 0, req);
2631 if (error || req->newptr == NULL)
2635 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2636 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2639 rxr->hn_trust_hcsum |= hcsum;
2641 rxr->hn_trust_hcsum &= ~hcsum;
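/*
 * NOTE: hn_trust_hcsum_sysctl() backs the boolean
 * dev.hn.UNIT.trust_host{tcp,udp,ip} knobs created in
 * hn_create_rx_data(); the HN_TRUST_HCSUM_* bit passed in as arg2 is
 * set or cleared on every RX ring.
 */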
2648 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2650 struct hn_softc *sc = arg1;
2651 int chim_size, error;
2653 chim_size = sc->hn_tx_ring[0].hn_chim_size;
2654 error = sysctl_handle_int(oidp, &chim_size, 0, req);
2655 if (error || req->newptr == NULL)
2658 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2662 hn_set_chim_size(sc, chim_size);
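/*
 * Example (a sketch, assuming unit 0):
 *	sysctl dev.hn.0.tx_chimney_size=4096
 * The new size is applied to every TX ring and must stay within
 * (0, hn_chim_szmax].
 */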
2667 #if __FreeBSD_version < 1100095
2669 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2671 struct hn_softc *sc = arg1;
2672 int ofs = arg2, i, error;
2673 struct hn_rx_ring *rxr;
2677 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2678 rxr = &sc->hn_rx_ring[i];
2679 stat += *((int *)((uint8_t *)rxr + ofs));
2682 error = sysctl_handle_64(oidp, &stat, 0, req);
2683 if (error || req->newptr == NULL)
2686 /* Zero out this stat. */
2687 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2688 rxr = &sc->hn_rx_ring[i];
2689 *((int *)((uint8_t *)rxr + ofs)) = 0;
2695 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2697 struct hn_softc *sc = arg1;
2698 int ofs = arg2, i, error;
2699 struct hn_rx_ring *rxr;
2703 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2704 rxr = &sc->hn_rx_ring[i];
2705 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2708 error = sysctl_handle_64(oidp, &stat, 0, req);
2709 if (error || req->newptr == NULL)
2712 /* Zero out this stat. */
2713 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2714 rxr = &sc->hn_rx_ring[i];
2715 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2723 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2725 struct hn_softc *sc = arg1;
2726 int ofs = arg2, i, error;
2727 struct hn_rx_ring *rxr;
2731 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2732 rxr = &sc->hn_rx_ring[i];
2733 stat += *((u_long *)((uint8_t *)rxr + ofs));
2736 error = sysctl_handle_long(oidp, &stat, 0, req);
2737 if (error || req->newptr == NULL)
2740 /* Zero out this stat. */
2741 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2742 rxr = &sc->hn_rx_ring[i];
2743 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
2749 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2751 struct hn_softc *sc = arg1;
2752 int ofs = arg2, i, error;
2753 struct hn_tx_ring *txr;
2757 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2758 txr = &sc->hn_tx_ring[i];
2759 stat += *((u_long *)((uint8_t *)txr + ofs));
2762 error = sysctl_handle_long(oidp, &stat, 0, req);
2763 if (error || req->newptr == NULL)
2766 /* Zero out this stat. */
2767 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2768 txr = &sc->hn_tx_ring[i];
2769 *((u_long *)((uint8_t *)txr + ofs)) = 0;
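/*
 * NOTE: The hn_{rx,tx}_stat_*_sysctl handlers above share one
 * pattern: a read returns the sum of the per-ring counters located
 * at the byte offset passed in as arg2, and any write resets those
 * counters to zero.
 */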
2775 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2777 struct hn_softc *sc = arg1;
2778 int ofs = arg2, i, error, conf;
2779 struct hn_tx_ring *txr;
2781 txr = &sc->hn_tx_ring[0];
2782 conf = *((int *)((uint8_t *)txr + ofs));
2784 error = sysctl_handle_int(oidp, &conf, 0, req);
2785 if (error || req->newptr == NULL)
2789 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2790 txr = &sc->hn_tx_ring[i];
2791 *((int *)((uint8_t *)txr + ofs)) = conf;
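/*
 * NOTE: hn_tx_conf_int_sysctl() reads a per-ring setting from TX
 * ring 0 and, on write, propagates the new value to every TX ring;
 * it backs knobs such as direct_tx_size and sched_tx.
 */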
2799 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2801 struct hn_softc *sc = arg1;
2804 size = sc->hn_agg_size;
2805 error = sysctl_handle_int(oidp, &size, 0, req);
2806 if (error || req->newptr == NULL)
2810 sc->hn_agg_size = size;
2818 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
2820 struct hn_softc *sc = arg1;
2823 pkts = sc->hn_agg_pkts;
2824 error = sysctl_handle_int(oidp, &pkts, 0, req);
2825 if (error || req->newptr == NULL)
2829 sc->hn_agg_pkts = pkts;
2837 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
2839 struct hn_softc *sc = arg1;
2842 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
2843 return (sysctl_handle_int(oidp, &pkts, 0, req));
2847 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
2849 struct hn_softc *sc = arg1;
2852 align = sc->hn_tx_ring[0].hn_agg_align;
2853 return (sysctl_handle_int(oidp, &align, 0, req));
2857 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2859 struct hn_softc *sc = arg1;
2862 snprintf(verstr, sizeof(verstr), "%u.%u",
2863 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2864 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2865 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2869 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2871 struct hn_softc *sc = arg1;
2878 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2879 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2883 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2885 struct hn_softc *sc = arg1;
2886 char assist_str[128];
2890 hwassist = sc->hn_ifp->if_hwassist;
2892 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2893 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2897 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2899 struct hn_softc *sc = arg1;
2900 char filter_str[128];
2904 filter = sc->hn_rx_filter;
2906 snprintf(filter_str, sizeof(filter_str), "%b", filter,
2908 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
2912 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2914 struct hn_softc *sc = arg1;
2919 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2920 if (error || req->newptr == NULL)
2923 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2926 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
2928 if (sc->hn_rx_ring_inuse > 1) {
2929 error = hn_rss_reconfig(sc);
2931 /* Not RSS capable, at least for now; just save the RSS key. */
2940 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
2942 struct hn_softc *sc = arg1;
2947 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2948 if (error || req->newptr == NULL)
2952 * Don't allow RSS indirect table changes if this interface is not
2953 * currently RSS capable.
2955 if (sc->hn_rx_ring_inuse == 1) {
2960 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2963 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
2965 hn_rss_ind_fixup(sc);
2966 error = hn_rss_reconfig(sc);
2973 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
2975 struct hn_softc *sc = arg1;
2980 hash = sc->hn_rss_hash;
2982 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
2983 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
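/*
 * Sanity-check an IPv4 frame for the host-checksum-trust path:
 * return IPPROTO_TCP or IPPROTO_UDP if the mbuf holds a complete,
 * unfragmented TCP or UDP packet whose headers reside entirely in
 * the first mbuf, and IPPROTO_DONE otherwise.
 */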
2987 hn_check_iplen(const struct mbuf *m, int hoff)
2989 const struct ip *ip;
2990 int len, iphlen, iplen;
2991 const struct tcphdr *th;
2992 int thoff; /* TCP data offset */
2994 len = hoff + sizeof(struct ip);
2996 /* The packet must be at least the size of an IP header. */
2997 if (m->m_pkthdr.len < len)
2998 return IPPROTO_DONE;
3000 /* The fixed IP header must reside completely in the first mbuf. */
3002 return IPPROTO_DONE;
3004 ip = mtodo(m, hoff);
3006 /* Bound check the packet's stated IP header length. */
3007 iphlen = ip->ip_hl << 2;
3008 if (iphlen < sizeof(struct ip)) /* minimum header length */
3009 return IPPROTO_DONE;
3011 /* The full IP header must reside completely in the one mbuf. */
3012 if (m->m_len < hoff + iphlen)
3013 return IPPROTO_DONE;
3015 iplen = ntohs(ip->ip_len);
3018 * Check that the amount of data in the buffers is at
3019 * least as much as the IP header would have us expect.
3021 if (m->m_pkthdr.len < hoff + iplen)
3022 return IPPROTO_DONE;
3025 * Ignore IP fragments.
3027 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3028 return IPPROTO_DONE;
3031 * The TCP/IP or UDP/IP header must be entirely contained within
3032 * the first fragment of a packet.
3036 if (iplen < iphlen + sizeof(struct tcphdr))
3037 return IPPROTO_DONE;
3038 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3039 return IPPROTO_DONE;
3040 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3041 thoff = th->th_off << 2;
3042 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3043 return IPPROTO_DONE;
3044 if (m->m_len < hoff + iphlen + thoff)
3045 return IPPROTO_DONE;
3048 if (iplen < iphlen + sizeof(struct udphdr))
3049 return IPPROTO_DONE;
3050 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3051 return IPPROTO_DONE;
3055 return IPPROTO_DONE;
3062 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3064 struct sysctl_oid_list *child;
3065 struct sysctl_ctx_list *ctx;
3066 device_t dev = sc->hn_dev;
3067 #if defined(INET) || defined(INET6)
3068 #if __FreeBSD_version >= 1100095
3075 * Create RXBUF for reception.
3078 * - It is shared by all channels.
3079 * - A large enough buffer is allocated; certain versions of NVS
3080 * may further limit the usable space.
3082 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3083 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3084 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3085 if (sc->hn_rxbuf == NULL) {
3086 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3090 sc->hn_rx_ring_cnt = ring_cnt;
3091 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3093 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3094 M_DEVBUF, M_WAITOK | M_ZERO);
3096 #if defined(INET) || defined(INET6)
3097 #if __FreeBSD_version >= 1100095
3098 lroent_cnt = hn_lro_entry_count;
3099 if (lroent_cnt < TCP_LRO_ENTRIES)
3100 lroent_cnt = TCP_LRO_ENTRIES;
3102 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3104 #endif /* INET || INET6 */
3106 ctx = device_get_sysctl_ctx(dev);
3107 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3109 /* Create dev.hn.UNIT.rx sysctl tree */
3110 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3111 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3113 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3114 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3116 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3117 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3118 &rxr->hn_br_dma, BUS_DMA_WAITOK);
3119 if (rxr->hn_br == NULL) {
3120 device_printf(dev, "allocate bufring failed\n");
3124 if (hn_trust_hosttcp)
3125 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3126 if (hn_trust_hostudp)
3127 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3128 if (hn_trust_hostip)
3129 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3130 rxr->hn_ifp = sc->hn_ifp;
3131 if (i < sc->hn_tx_ring_cnt)
3132 rxr->hn_txr = &sc->hn_tx_ring[i];
3133 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3134 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3136 rxr->hn_rxbuf = sc->hn_rxbuf;
3141 #if defined(INET) || defined(INET6)
3142 #if __FreeBSD_version >= 1100095
3143 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3144 hn_lro_mbufq_depth);
3146 tcp_lro_init(&rxr->hn_lro);
3147 rxr->hn_lro.ifp = sc->hn_ifp;
3149 #if __FreeBSD_version >= 1100099
3150 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3151 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3153 #endif /* INET || INET6 */
3155 if (sc->hn_rx_sysctl_tree != NULL) {
3159 * Create per RX ring sysctl tree:
3160 * dev.hn.UNIT.rx.RINGID
3162 snprintf(name, sizeof(name), "%d", i);
3163 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3164 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3165 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3167 if (rxr->hn_rx_sysctl_tree != NULL) {
3168 SYSCTL_ADD_ULONG(ctx,
3169 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3170 OID_AUTO, "packets", CTLFLAG_RW,
3171 &rxr->hn_pkts, "# of packets received");
3172 SYSCTL_ADD_ULONG(ctx,
3173 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3174 OID_AUTO, "rss_pkts", CTLFLAG_RW,
3176 "# of packets w/ RSS info received");
3178 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3179 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3180 &rxr->hn_pktbuf_len, 0,
3181 "Temporary channel packet buffer length");
3186 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3187 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3188 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3189 #if __FreeBSD_version < 1100095
3190 hn_rx_stat_int_sysctl,
3192 hn_rx_stat_u64_sysctl,
3194 "LU", "LRO queued");
3195 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3196 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3197 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3198 #if __FreeBSD_version < 1100095
3199 hn_rx_stat_int_sysctl,
3201 hn_rx_stat_u64_sysctl,
3203 "LU", "LRO flushed");
3204 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3205 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3206 __offsetof(struct hn_rx_ring, hn_lro_tried),
3207 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3208 #if __FreeBSD_version >= 1100099
3209 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3210 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3211 hn_lro_lenlim_sysctl, "IU",
3212 "Max # of data bytes to be aggregated by LRO");
3213 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3214 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3215 hn_lro_ackcnt_sysctl, "I",
3216 "Max # of ACKs to be aggregated by LRO");
3218 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3219 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3220 hn_trust_hcsum_sysctl, "I",
3221 "Trust tcp segement verification on host side, "
3222 "when csum info is missing");
3223 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3224 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3225 hn_trust_hcsum_sysctl, "I",
3226 "Trust udp datagram verification on host side, "
3227 "when csum info is missing");
3228 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3229 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3230 hn_trust_hcsum_sysctl, "I",
3231 "Trust ip packet verification on host side, "
3232 "when csum info is missing");
3233 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3234 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3235 __offsetof(struct hn_rx_ring, hn_csum_ip),
3236 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3237 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3238 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3239 __offsetof(struct hn_rx_ring, hn_csum_tcp),
3240 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3241 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3242 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3243 __offsetof(struct hn_rx_ring, hn_csum_udp),
3244 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3245 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3246 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3247 __offsetof(struct hn_rx_ring, hn_csum_trusted),
3248 hn_rx_stat_ulong_sysctl, "LU",
3249 "# of packets that we trust host's csum verification");
3250 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3251 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3252 __offsetof(struct hn_rx_ring, hn_small_pkts),
3253 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3254 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3255 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3256 __offsetof(struct hn_rx_ring, hn_ack_failed),
3257 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3258 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3259 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3260 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3261 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3267 hn_destroy_rx_data(struct hn_softc *sc)
3271 if (sc->hn_rxbuf != NULL) {
3272 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3273 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3275 device_printf(sc->hn_dev, "RXBUF is referenced\n");
3276 sc->hn_rxbuf = NULL;
3279 if (sc->hn_rx_ring_cnt == 0)
3282 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3283 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3285 if (rxr->hn_br == NULL)
3287 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3288 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3290 device_printf(sc->hn_dev,
3291 "%dth channel bufring is referenced", i);
3295 #if defined(INET) || defined(INET6)
3296 tcp_lro_free(&rxr->hn_lro);
3298 free(rxr->hn_pktbuf, M_DEVBUF);
3300 free(sc->hn_rx_ring, M_DEVBUF);
3301 sc->hn_rx_ring = NULL;
3303 sc->hn_rx_ring_cnt = 0;
3304 sc->hn_rx_ring_inuse = 0;
3308 hn_tx_ring_create(struct hn_softc *sc, int id)
3310 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3311 device_t dev = sc->hn_dev;
3312 bus_dma_tag_t parent_dtag;
3316 txr->hn_tx_idx = id;
3318 #ifndef HN_USE_TXDESC_BUFRING
3319 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3321 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3323 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3324 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3325 M_DEVBUF, M_WAITOK | M_ZERO);
3326 #ifndef HN_USE_TXDESC_BUFRING
3327 SLIST_INIT(&txr->hn_txlist);
3329 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3330 M_WAITOK, &txr->hn_tx_lock);
3333 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3334 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3335 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3337 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3340 #ifdef HN_IFSTART_SUPPORT
3341 if (hn_use_if_start) {
3342 txr->hn_txeof = hn_start_txeof;
3343 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3344 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3350 txr->hn_txeof = hn_xmit_txeof;
3351 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3352 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3354 br_depth = hn_get_txswq_depth(txr);
3355 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3356 M_WAITOK, &txr->hn_tx_lock);
3359 txr->hn_direct_tx_size = hn_direct_tx_size;
3362 * Always schedule transmission instead of trying to do direct
3363 * transmission. This one gives the best performance so far.
3365 txr->hn_sched_tx = 1;
3367 parent_dtag = bus_get_dma_tag(dev);
3369 /* DMA tag for RNDIS packet messages. */
3370 error = bus_dma_tag_create(parent_dtag, /* parent */
3371 HN_RNDIS_PKT_ALIGN, /* alignment */
3372 HN_RNDIS_PKT_BOUNDARY, /* boundary */
3373 BUS_SPACE_MAXADDR, /* lowaddr */
3374 BUS_SPACE_MAXADDR, /* highaddr */
3375 NULL, NULL, /* filter, filterarg */
3376 HN_RNDIS_PKT_LEN, /* maxsize */
3378 HN_RNDIS_PKT_LEN, /* maxsegsize */
3380 NULL, /* lockfunc */
3381 NULL, /* lockfuncarg */
3382 &txr->hn_tx_rndis_dtag);
3384 device_printf(dev, "failed to create rndis dmatag\n");
3388 /* DMA tag for data. */
3389 error = bus_dma_tag_create(parent_dtag, /* parent */
3391 HN_TX_DATA_BOUNDARY, /* boundary */
3392 BUS_SPACE_MAXADDR, /* lowaddr */
3393 BUS_SPACE_MAXADDR, /* highaddr */
3394 NULL, NULL, /* filter, filterarg */
3395 HN_TX_DATA_MAXSIZE, /* maxsize */
3396 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
3397 HN_TX_DATA_SEGSIZE, /* maxsegsize */
3399 NULL, /* lockfunc */
3400 NULL, /* lockfuncarg */
3401 &txr->hn_tx_data_dtag);
3403 device_printf(dev, "failed to create data dmatag\n");
3407 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3408 struct hn_txdesc *txd = &txr->hn_txdesc[i];
3411 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3412 STAILQ_INIT(&txd->agg_list);
3415 * Allocate and load RNDIS packet message.
3417 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3418 (void **)&txd->rndis_pkt,
3419 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3420 &txd->rndis_pkt_dmap);
3423 "failed to allocate rndis_packet_msg, %d\n", i);
3427 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3428 txd->rndis_pkt_dmap,
3429 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3430 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3434 "failed to load rndis_packet_msg, %d\n", i);
3435 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3436 txd->rndis_pkt, txd->rndis_pkt_dmap);
3440 /* DMA map for TX data. */
3441 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3445 "failed to allocate tx data dmamap\n");
3446 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3447 txd->rndis_pkt_dmap);
3448 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3449 txd->rndis_pkt, txd->rndis_pkt_dmap);
3453 /* All set, put it on the list */
3454 txd->flags |= HN_TXD_FLAG_ONLIST;
3455 #ifndef HN_USE_TXDESC_BUFRING
3456 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3458 buf_ring_enqueue(txr->hn_txdesc_br, txd);
3461 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3463 if (sc->hn_tx_sysctl_tree != NULL) {
3464 struct sysctl_oid_list *child;
3465 struct sysctl_ctx_list *ctx;
3469 * Create per TX ring sysctl tree:
3470 * dev.hn.UNIT.tx.RINGID
3472 ctx = device_get_sysctl_ctx(dev);
3473 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3475 snprintf(name, sizeof(name), "%d", id);
3476 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3477 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3479 if (txr->hn_tx_sysctl_tree != NULL) {
3480 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3482 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3483 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3484 "# of available TX descs");
3485 #ifdef HN_IFSTART_SUPPORT
3486 if (!hn_use_if_start)
3489 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3490 CTLFLAG_RD, &txr->hn_oactive, 0,
3493 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3494 CTLFLAG_RW, &txr->hn_pkts,
3495 "# of packets transmitted");
3496 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3497 CTLFLAG_RW, &txr->hn_sends, "# of sends");
3505 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3507 struct hn_tx_ring *txr = txd->txr;
3509 KASSERT(txd->m == NULL, ("still has mbuf installed"));
3510 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3512 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3513 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3514 txd->rndis_pkt_dmap);
3515 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3519 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3522 KASSERT(txd->refs == 0 || txd->refs == 1,
3523 ("invalid txd refs %d", txd->refs));
3525 /* Aggregated txds will be freed by their aggregating txd. */
3526 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3529 freed = hn_txdesc_put(txr, txd);
3530 KASSERT(freed, ("can't free txdesc"));
3535 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3539 if (txr->hn_txdesc == NULL)
3544 * Because the freeing of aggregated txds will be deferred
3545 * to the aggregating txd, two passes are used here:
3546 * - The first pass GCes any pending txds. This GC is necessary,
3547 * since if the channels are revoked, the hypervisor will not
3548 * deliver send-done for all pending txds.
3549 * - The second pass frees the busdma resources, i.e. after all
3550 * txds have been freed.
3552 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3553 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3554 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3555 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3557 if (txr->hn_tx_data_dtag != NULL)
3558 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3559 if (txr->hn_tx_rndis_dtag != NULL)
3560 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3562 #ifdef HN_USE_TXDESC_BUFRING
3563 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3566 free(txr->hn_txdesc, M_DEVBUF);
3567 txr->hn_txdesc = NULL;
3569 if (txr->hn_mbuf_br != NULL)
3570 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3572 #ifndef HN_USE_TXDESC_BUFRING
3573 mtx_destroy(&txr->hn_txlist_spin);
3575 mtx_destroy(&txr->hn_tx_lock);
3579 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3581 struct sysctl_oid_list *child;
3582 struct sysctl_ctx_list *ctx;
3586 * Create TXBUF for chimney sending.
3588 * NOTE: It is shared by all channels.
3590 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3591 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3592 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3593 if (sc->hn_chim == NULL) {
3594 device_printf(sc->hn_dev, "allocate txbuf failed\n");
3598 sc->hn_tx_ring_cnt = ring_cnt;
3599 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3601 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3602 M_DEVBUF, M_WAITOK | M_ZERO);
3604 ctx = device_get_sysctl_ctx(sc->hn_dev);
3605 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3607 /* Create dev.hn.UNIT.tx sysctl tree */
3608 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3609 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3611 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3614 error = hn_tx_ring_create(sc, i);
3619 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3620 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3621 __offsetof(struct hn_tx_ring, hn_no_txdescs),
3622 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3623 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3624 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3625 __offsetof(struct hn_tx_ring, hn_send_failed),
3626 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
3627 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3628 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3629 __offsetof(struct hn_tx_ring, hn_txdma_failed),
3630 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
3631 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3632 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3633 __offsetof(struct hn_tx_ring, hn_flush_failed),
3634 hn_tx_stat_ulong_sysctl, "LU",
3635 "# of packet transmission aggregation flush failure");
3636 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3637 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3638 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3639 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3640 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3641 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3642 __offsetof(struct hn_tx_ring, hn_tx_chimney),
3643 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3644 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3645 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3646 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3647 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3648 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3649 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3650 "# of total TX descs");
3651 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3652 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3653 "Chimney send packet size upper boundary");
3654 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3655 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3656 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3657 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3658 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3659 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3660 hn_tx_conf_int_sysctl, "I",
3661 "Size of the packet for direct transmission");
3662 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3663 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3664 __offsetof(struct hn_tx_ring, hn_sched_tx),
3665 hn_tx_conf_int_sysctl, "I",
3666 "Always schedule transmission "
3667 "instead of doing direct transmission");
3668 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3669 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3670 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3671 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3672 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3673 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3674 "Applied packet transmission aggregation size");
3675 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3676 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3677 hn_txagg_pktmax_sysctl, "I",
3678 "Applied packet transmission aggregation packets");
3679 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3680 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3681 hn_txagg_align_sysctl, "I",
3682 "Applied packet transmission aggregation alignment");
3688 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3692 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3693 sc->hn_tx_ring[i].hn_chim_size = chim_size;
3697 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3699 struct ifnet *ifp = sc->hn_ifp;
3702 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3705 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3706 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3707 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3709 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3710 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3711 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3713 if (tso_maxlen < tso_minlen)
3714 tso_maxlen = tso_minlen;
3715 else if (tso_maxlen > IP_MAXPACKET)
3716 tso_maxlen = IP_MAXPACKET;
3717 if (tso_maxlen > sc->hn_ndis_tso_szmax)
3718 tso_maxlen = sc->hn_ndis_tso_szmax;
3719 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3721 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
3725 hn_fixup_tx_data(struct hn_softc *sc)
3727 uint64_t csum_assist;
3730 hn_set_chim_size(sc, sc->hn_chim_szmax);
3731 if (hn_tx_chimney_size > 0 &&
3732 hn_tx_chimney_size < sc->hn_chim_szmax)
3733 hn_set_chim_size(sc, hn_tx_chimney_size);
3736 if (sc->hn_caps & HN_CAP_IPCS)
3737 csum_assist |= CSUM_IP;
3738 if (sc->hn_caps & HN_CAP_TCP4CS)
3739 csum_assist |= CSUM_IP_TCP;
3740 if (sc->hn_caps & HN_CAP_UDP4CS)
3741 csum_assist |= CSUM_IP_UDP;
3742 if (sc->hn_caps & HN_CAP_TCP6CS)
3743 csum_assist |= CSUM_IP6_TCP;
3744 if (sc->hn_caps & HN_CAP_UDP6CS)
3745 csum_assist |= CSUM_IP6_UDP;
3746 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3747 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3749 if (sc->hn_caps & HN_CAP_HASHVAL) {
3751 * Support HASHVAL pktinfo on TX path.
3754 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3755 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3756 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3761 hn_destroy_tx_data(struct hn_softc *sc)
3765 if (sc->hn_chim != NULL) {
3766 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
3767 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3769 device_printf(sc->hn_dev,
3770 "chimney sending buffer is referenced");
3775 if (sc->hn_tx_ring_cnt == 0)
3778 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3779 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3781 free(sc->hn_tx_ring, M_DEVBUF);
3782 sc->hn_tx_ring = NULL;
3784 sc->hn_tx_ring_cnt = 0;
3785 sc->hn_tx_ring_inuse = 0;
3788 #ifdef HN_IFSTART_SUPPORT
3791 hn_start_taskfunc(void *xtxr, int pending __unused)
3793 struct hn_tx_ring *txr = xtxr;
3795 mtx_lock(&txr->hn_tx_lock);
3796 hn_start_locked(txr, 0);
3797 mtx_unlock(&txr->hn_tx_lock);
3801 hn_start_locked(struct hn_tx_ring *txr, int len)
3803 struct hn_softc *sc = txr->hn_sc;
3804 struct ifnet *ifp = sc->hn_ifp;
3807 KASSERT(hn_use_if_start,
3808 ("hn_start_locked is called, when if_start is disabled"));
3809 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3810 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3811 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3813 if (__predict_false(txr->hn_suspended))
3816 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
3820 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
3821 struct hn_txdesc *txd;
3822 struct mbuf *m_head;
3825 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
3829 if (len > 0 && m_head->m_pkthdr.len > len) {
3831 * This send could be time-consuming; let callers
3832 * dispatch this packet (and any follow-up packets)
3833 * to the tx taskqueue.
3835 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3840 #if defined(INET6) || defined(INET)
3841 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3842 m_head = hn_tso_fixup(m_head);
3843 if (__predict_false(m_head == NULL)) {
3844 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3850 txd = hn_txdesc_get(txr);
3852 txr->hn_no_txdescs++;
3853 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3854 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3858 error = hn_encap(ifp, txr, txd, &m_head);
3860 /* Both txd and m_head are freed */
3861 KASSERT(txr->hn_agg_txd == NULL,
3862 ("encap failed w/ pending aggregating txdesc"));
3866 if (txr->hn_agg_pktleft == 0) {
3867 if (txr->hn_agg_txd != NULL) {
3868 KASSERT(m_head == NULL,
3869 ("pending mbuf for aggregating txdesc"));
3870 error = hn_flush_txagg(ifp, txr);
3871 if (__predict_false(error)) {
3872 atomic_set_int(&ifp->if_drv_flags,
3877 KASSERT(m_head != NULL, ("mbuf was freed"));
3878 error = hn_txpkt(ifp, txr, txd);
3879 if (__predict_false(error)) {
3880 /* txd is freed, but m_head is not */
3881 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3882 atomic_set_int(&ifp->if_drv_flags,
3890 KASSERT(txr->hn_agg_txd != NULL,
3891 ("no aggregating txdesc"));
3892 KASSERT(m_head == NULL,
3893 ("pending mbuf for aggregating txdesc"));
3898 /* Flush pending aggregated transmission. */
3899 if (txr->hn_agg_txd != NULL)
3900 hn_flush_txagg(ifp, txr);
3905 hn_start(struct ifnet *ifp)
3907 struct hn_softc *sc = ifp->if_softc;
3908 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
3910 if (txr->hn_sched_tx)
3913 if (mtx_trylock(&txr->hn_tx_lock)) {
3916 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3917 mtx_unlock(&txr->hn_tx_lock);
3922 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
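/*
 * NOTE: hn_start() above attempts direct transmission under a
 * trylock; if the lock is contended, or hn_start_locked() defers an
 * oversized packet, the remaining work is handed to the per-ring TX
 * taskqueue.
 */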
3926 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
3928 struct hn_tx_ring *txr = xtxr;
3930 mtx_lock(&txr->hn_tx_lock);
3931 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
3932 hn_start_locked(txr, 0);
3933 mtx_unlock(&txr->hn_tx_lock);
3937 hn_start_txeof(struct hn_tx_ring *txr)
3939 struct hn_softc *sc = txr->hn_sc;
3940 struct ifnet *ifp = sc->hn_ifp;
3942 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3944 if (txr->hn_sched_tx)
3947 if (mtx_trylock(&txr->hn_tx_lock)) {
3950 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3951 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3952 mtx_unlock(&txr->hn_tx_lock);
3954 taskqueue_enqueue(txr->hn_tx_taskq,
3960 * Release OACTIVE earlier, in the hope that
3961 * others can catch up. The task will clear the
3962 * flag again under the hn_tx_lock to avoid possible
3963 * races.
3965 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3966 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3970 #endif /* HN_IFSTART_SUPPORT */
3973 hn_xmit(struct hn_tx_ring *txr, int len)
3975 struct hn_softc *sc = txr->hn_sc;
3976 struct ifnet *ifp = sc->hn_ifp;
3977 struct mbuf *m_head;
3980 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3981 #ifdef HN_IFSTART_SUPPORT
3982 KASSERT(hn_use_if_start == 0,
3983 ("hn_xmit is called, when if_start is enabled"));
3985 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3987 if (__predict_false(txr->hn_suspended))
3990 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
3993 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
3994 struct hn_txdesc *txd;
3997 if (len > 0 && m_head->m_pkthdr.len > len) {
3999 * This send could be time-consuming; let callers
4000 * dispatch this packet (and any follow-up packets)
4001 * to the tx taskqueue.
4003 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4008 txd = hn_txdesc_get(txr);
4010 txr->hn_no_txdescs++;
4011 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4012 txr->hn_oactive = 1;
4016 error = hn_encap(ifp, txr, txd, &m_head);
4018 /* Both txd and m_head are freed; discard */
4019 KASSERT(txr->hn_agg_txd == NULL,
4020 ("encap failed w/ pending aggregating txdesc"));
4021 drbr_advance(ifp, txr->hn_mbuf_br);
4025 if (txr->hn_agg_pktleft == 0) {
4026 if (txr->hn_agg_txd != NULL) {
4027 KASSERT(m_head == NULL,
4028 ("pending mbuf for aggregating txdesc"));
4029 error = hn_flush_txagg(ifp, txr);
4030 if (__predict_false(error)) {
4031 txr->hn_oactive = 1;
4035 KASSERT(m_head != NULL, ("mbuf was freed"));
4036 error = hn_txpkt(ifp, txr, txd);
4037 if (__predict_false(error)) {
4038 /* txd is freed, but m_head is not */
4039 drbr_putback(ifp, txr->hn_mbuf_br,
4041 txr->hn_oactive = 1;
4048 KASSERT(txr->hn_agg_txd != NULL,
4049 ("no aggregating txdesc"));
4050 KASSERT(m_head == NULL,
4051 ("pending mbuf for aggregating txdesc"));
4056 drbr_advance(ifp, txr->hn_mbuf_br);
4059 /* Flush pending aggregated transmission. */
4060 if (txr->hn_agg_txd != NULL)
4061 hn_flush_txagg(ifp, txr);
4066 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4068 struct hn_softc *sc = ifp->if_softc;
4069 struct hn_tx_ring *txr;
4072 #if defined(INET6) || defined(INET)
4074 * Perform TSO packet header fixup now, since the TSO
4075 * packet header should be cache-hot.
4077 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4078 m = hn_tso_fixup(m);
4079 if (__predict_false(m == NULL)) {
4080 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4087 * Select the TX ring based on flowid
4089 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
4090 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4091 txr = &sc->hn_tx_ring[idx];
4093 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4095 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4099 if (txr->hn_oactive)
4102 if (txr->hn_sched_tx)
4105 if (mtx_trylock(&txr->hn_tx_lock)) {
4108 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4109 mtx_unlock(&txr->hn_tx_lock);
4114 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
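/*
 * NOTE: The flowid-based ring selection above keeps a flow on a
 * single TX ring whenever the stack supplies a consistent flowid,
 * e.g. the RSS hash recorded on the RX path by hn_rxpkt().
 */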
4119 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4123 mtx_lock(&txr->hn_tx_lock);
4124 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4126 mtx_unlock(&txr->hn_tx_lock);
4130 hn_xmit_qflush(struct ifnet *ifp)
4132 struct hn_softc *sc = ifp->if_softc;
4135 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4136 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4141 hn_xmit_txeof(struct hn_tx_ring *txr)
4144 if (txr->hn_sched_tx)
4147 if (mtx_trylock(&txr->hn_tx_lock)) {
4150 txr->hn_oactive = 0;
4151 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4152 mtx_unlock(&txr->hn_tx_lock);
4154 taskqueue_enqueue(txr->hn_tx_taskq,
4160 * Release oactive earlier, in the hope that
4161 * others can catch up. The task will clear
4162 * oactive again under the hn_tx_lock to avoid possible
4163 * races.
4165 txr->hn_oactive = 0;
4166 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4171 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4173 struct hn_tx_ring *txr = xtxr;
4175 mtx_lock(&txr->hn_tx_lock);
4177 mtx_unlock(&txr->hn_tx_lock);
4181 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4183 struct hn_tx_ring *txr = xtxr;
4185 mtx_lock(&txr->hn_tx_lock);
4186 txr->hn_oactive = 0;
4188 mtx_unlock(&txr->hn_tx_lock);
4192 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4194 struct vmbus_chan_br cbr;
4195 struct hn_rx_ring *rxr;
4196 struct hn_tx_ring *txr = NULL;
4199 idx = vmbus_chan_subidx(chan);
4202 * Link this channel to RX/TX ring.
4204 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4205 ("invalid channel index %d, should > 0 && < %d",
4206 idx, sc->hn_rx_ring_inuse));
4207 rxr = &sc->hn_rx_ring[idx];
4208 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4209 ("RX ring %d already attached", idx));
4210 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4213 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4214 idx, vmbus_chan_id(chan));
4217 if (idx < sc->hn_tx_ring_inuse) {
4218 txr = &sc->hn_tx_ring[idx];
4219 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4220 ("TX ring %d already attached", idx));
4221 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4223 txr->hn_chan = chan;
4225 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4226 idx, vmbus_chan_id(chan));
4230 /* Bind this channel to a proper CPU. */
4231 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4236 cbr.cbr = rxr->hn_br;
4237 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4238 cbr.cbr_txsz = HN_TXBR_SIZE;
4239 cbr.cbr_rxsz = HN_RXBR_SIZE;
4240 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4242 if (error == EISCONN) {
4243 if_printf(sc->hn_ifp, "bufring is connected after "
4244 "chan%u open failure\n", vmbus_chan_id(chan));
4245 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4247 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4248 vmbus_chan_id(chan), error);
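/*
 * NOTE: Each channel's bufring memory (rxr->hn_br, allocated in
 * hn_create_rx_data()) is carved into a HN_TXBR_SIZE TX bufring
 * followed by a HN_RXBR_SIZE RX bufring, matching the cbr_txsz and
 * cbr_rxsz setup in hn_chan_attach() above.
 */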
4255 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4257 struct hn_rx_ring *rxr;
4260 idx = vmbus_chan_subidx(chan);
4263 * Link this channel to RX/TX ring.
4265 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4266 ("invalid channel index %d, should > 0 && < %d",
4267 idx, sc->hn_rx_ring_inuse));
4268 rxr = &sc->hn_rx_ring[idx];
4269 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4270 ("RX ring %d is not attached", idx));
4271 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4273 if (idx < sc->hn_tx_ring_inuse) {
4274 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4276 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4277 ("TX ring %d is not attached attached", idx));
4278 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4282 * Close this channel.
4285 * Channel closing does _not_ destroy the target channel.
4287 error = vmbus_chan_close_direct(chan);
4288 if (error == EISCONN) {
4289 if_printf(sc->hn_ifp, "chan%u bufring is connected "
4290 "after being closed\n", vmbus_chan_id(chan));
4291 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4293 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4294 vmbus_chan_id(chan), error);
4299 hn_attach_subchans(struct hn_softc *sc)
4301 struct vmbus_channel **subchans;
4302 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4305 KASSERT(subchan_cnt > 0, ("no sub-channels"));
4307 /* Attach the sub-channels. */
4308 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4309 for (i = 0; i < subchan_cnt; ++i) {
4312 error1 = hn_chan_attach(sc, subchans[i]);
4315 /* Move on; all channels will be detached later. */
4318 vmbus_subchan_rel(subchans, subchan_cnt);
4321 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4324 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4332 hn_detach_allchans(struct hn_softc *sc)
4334 struct vmbus_channel **subchans;
4335 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4338 if (subchan_cnt == 0)
4341 /* Detach the sub-channels. */
4342 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4343 for (i = 0; i < subchan_cnt; ++i)
4344 hn_chan_detach(sc, subchans[i]);
4345 vmbus_subchan_rel(subchans, subchan_cnt);
4349 * Detach the primary channel, _after_ all sub-channels
4352 hn_chan_detach(sc, sc->hn_prichan);
4354 /* Wait for sub-channels to be destroyed, if any. */
4355 vmbus_subchan_drain(sc->hn_prichan);
4358 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4359 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4360 HN_RX_FLAG_ATTACHED) == 0,
4361 ("%dth RX ring is still attached", i));
4363 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4364 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4365 HN_TX_FLAG_ATTACHED) == 0,
4366 ("%dth TX ring is still attached", i));
4372 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4374 struct vmbus_channel **subchans;
4375 int nchan, rxr_cnt, error;
4377 nchan = *nsubch + 1;
4380 * Multiple RX/TX rings are not requested.
4387 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4390 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4392 /* No RSS; this is benign. */
4397 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4401 if (nchan > rxr_cnt)
4404 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4410 * Allocate sub-channels from NVS.
4412 *nsubch = nchan - 1;
4413 error = hn_nvs_alloc_subchans(sc, nsubch);
4414 if (error || *nsubch == 0) {
4415 /* Failed to allocate sub-channels. */
4421 * Wait for all sub-channels to become ready before moving on.
4423 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4424 vmbus_subchan_rel(subchans, *nsubch);
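/*
 * In short: the requested channel count is capped by the RX ring
 * count offered through the RNDIS RSS capabilities, and NVS may
 * grant even fewer sub-channels; on return, *nsubch reflects what
 * was actually allocated.
 */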
4429 hn_synth_attachable(const struct hn_softc *sc)
4433 if (sc->hn_flags & HN_FLAG_ERRORS)
4436 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4437 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4439 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
4446 hn_synth_attach(struct hn_softc *sc, int mtu)
4448 #define ATTACHED_NVS 0x0002
4449 #define ATTACHED_RNDIS 0x0004
4451 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4452 int error, nsubch, nchan, i;
4453 uint32_t old_caps, attached = 0;
4455 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4456 ("synthetic parts were attached"));
4458 if (!hn_synth_attachable(sc))
4461 /* Save capabilities for later verification. */
4462 old_caps = sc->hn_caps;
4465 /* Clear RSS state. */
4466 sc->hn_rss_ind_size = 0;
4467 sc->hn_rss_hash = 0;
4470 * Attach the primary channel _before_ attaching NVS and RNDIS.
4472 error = hn_chan_attach(sc, sc->hn_prichan);
4479 error = hn_nvs_attach(sc, mtu);
4482 attached |= ATTACHED_NVS;
4485 * Attach RNDIS _after_ NVS is attached.
4487 error = hn_rndis_attach(sc, mtu);
4490 attached |= ATTACHED_RNDIS;
4493 * Make sure capabilities are not changed.
4495 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4496 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4497 old_caps, sc->hn_caps);
4503 * Allocate sub-channels for multi-TX/RX rings.
4506 * The # of RX rings that can be used is equivalent to the # of
4507 * channels to be requested.
4509 nsubch = sc->hn_rx_ring_cnt - 1;
4510 error = hn_synth_alloc_subchans(sc, &nsubch);
4513 /* NOTE: _Full_ synthetic parts detach is required now. */
4514 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4517 * Set the # of TX/RX rings that could be used according to
4518 * the # of channels that NVS offered.
4521 hn_set_ring_inuse(sc, nchan);
4523 /* Only the primary channel can be used; done */
4528 * Attach the sub-channels.
4530 * NOTE: hn_set_ring_inuse() _must_ have been called.
4532 error = hn_attach_subchans(sc);
4537 * Configure RSS key and indirect table _after_ all sub-channels
4540 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4542 * RSS key is not set yet; set it to the default RSS key.
4545 if_printf(sc->hn_ifp, "setup default RSS key\n");
4546 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4547 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4550 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4552 * The RSS indirect table is not set yet; set it up in
4553 * round-robin fashion.
4556 if_printf(sc->hn_ifp, "setup default RSS indirect "
4559 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
4560 rss->rss_ind[i] = i % nchan;
4561 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4564 * The # of usable channels may have changed, so we have to
4565 * make sure that all entries in the RSS indirect table
4566 * are valid.
4568 * NOTE: hn_set_ring_inuse() _must_ have been called.
4570 hn_rss_ind_fixup(sc);
4573 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4578 * Fixup transmission aggregation setup.
4584 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
4585 hn_synth_detach(sc);
4587 if (attached & ATTACHED_RNDIS)
4588 hn_rndis_detach(sc);
4589 if (attached & ATTACHED_NVS)
4591 hn_chan_detach(sc, sc->hn_prichan);
4592 /* Restore old capabilities. */
4593 sc->hn_caps = old_caps;
4597 #undef ATTACHED_RNDIS
4603 * The interface must have been suspended through hn_suspend(),
4604 * before this function gets called.
4607 hn_synth_detach(struct hn_softc *sc)
4610 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4611 ("synthetic parts were not attached"));
4613 /* Detach the RNDIS first. */
4614 hn_rndis_detach(sc);
4619 /* Detach all of the channels. */
4620 hn_detach_allchans(sc);
4622 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4626 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4628 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4629 ("invalid ring count %d", ring_cnt));
4631 if (sc->hn_tx_ring_cnt > ring_cnt)
4632 sc->hn_tx_ring_inuse = ring_cnt;
4634 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4635 sc->hn_rx_ring_inuse = ring_cnt;
4638 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4639 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
4644 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
4649 * The TX bufring will not be drained by the hypervisor
4650 * if the primary channel is revoked.
4652 while (!vmbus_chan_rx_empty(chan) ||
4653 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
4654 !vmbus_chan_tx_empty(chan)))
4656 vmbus_chan_intr_drain(chan);
4660 hn_suspend_data(struct hn_softc *sc)
4662 struct vmbus_channel **subch = NULL;
4663 struct hn_tx_ring *txr;
4671 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4672 txr = &sc->hn_tx_ring[i];
4674 mtx_lock(&txr->hn_tx_lock);
4675 txr->hn_suspended = 1;
4676 mtx_unlock(&txr->hn_tx_lock);
4677 /* No one can send more packets now. */
4680 * Wait for all pending sends to finish.
4683 * We will _not_ receive all pending send-done if the
4684 * primary channel is revoked.
4686 while (hn_tx_ring_pending(txr) &&
4687 !vmbus_chan_is_revoked(sc->hn_prichan))
4688 pause("hnwtx", 1 /* 1 tick */);
4692 * Disable RX by clearing RX filter.
4694 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
4695 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);
4698 * Give RNDIS enough time to flush all pending data packets.
4700 pause("waitrx", (200 * hz) / 1000);
4703 * Drain RX/TX bufrings and interrupts.
4705 nsubch = sc->hn_rx_ring_inuse - 1;
4707 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4709 if (subch != NULL) {
4710 for (i = 0; i < nsubch; ++i)
4711 hn_chan_drain(sc, subch[i]);
4713 hn_chan_drain(sc, sc->hn_prichan);
4716 vmbus_subchan_rel(subch, nsubch);
4719 * Drain any pending TX tasks.
4722 * The above hn_chan_drain() can dispatch TX tasks, so the TX
4723 * tasks will have to be drained _after_ the above hn_chan_drain()
4726 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4727 txr = &sc->hn_tx_ring[i];
4729 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
4730 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
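/*
 * Ordering recap for the suspend path above: TX rings are marked
 * suspended first, in-flight sends are waited out, RX is muted via
 * the packet filter, the VMBus bufrings are drained, and only then
 * are the TX taskqueues drained, since the bufring drain itself may
 * have scheduled more TX tasks.
 */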
static void
hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
{

	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
}

static void
hn_suspend_mgmt(struct hn_softc *sc)
{
	struct task task;

	HN_LOCK_ASSERT(sc);

	/*
	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
	 * through hn_mgmt_taskq.
	 */
	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
	vmbus_chan_run_task(sc->hn_prichan, &task);

	/*
	 * Make sure that all pending management tasks are completed.
	 */
	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
	taskqueue_drain_all(sc->hn_mgmt_taskq0);
}
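/*
 * The TASK_INIT/vmbus_chan_run_task() pair above acts as a fence:
 * the task runs in the primary channel's processing context, so once
 * vmbus_chan_run_task() returns, no channel callback can still
 * observe the old hn_mgmt_taskq value, and the subsequent drains only
 * have to wait for whatever was queued before the fence.
 */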
static void
hn_suspend(struct hn_softc *sc)
{

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
		hn_suspend_data(sc);
	hn_suspend_mgmt(sc);
}
static void
hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
{
	int i;

	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
	    ("invalid TX ring count %d", tx_ring_cnt));

	for (i = 0; i < tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 0;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
static void
hn_resume_data(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/*
	 * Re-enable RX.
	 */
	hn_set_rxfilter(sc);

	/*
	 * Make sure to clear suspend status on "all" TX rings,
	 * since hn_tx_ring_inuse can be changed after
	 * hn_suspend_data().
	 */
	hn_resume_tx(sc, sc->hn_tx_ring_cnt);

#ifdef HN_IFSTART_SUPPORT
	if (!hn_use_if_start)
#endif
	{
		/*
		 * Flush unused drbrs, since hn_tx_ring_inuse may be
		 * changed.
		 */
		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
	}

	/*
	 * Kick start TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		/*
		 * Use txeof task, so that any pending oactive can be
		 * cleared properly.
		 */
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}
static void
hn_resume_mgmt(struct hn_softc *sc)
{

	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;

	/*
	 * Kick off network change detection, if it was pending.
	 * If no network change was pending, start link status
	 * checks, which are more lightweight than network change
	 * detection.
	 */
	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		hn_change_network(sc);
	else
		hn_update_link_status(sc);
}

static void
hn_resume(struct hn_softc *sc)
{

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
		hn_resume_data(sc);
	hn_resume_mgmt(sc);
}
static void
hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
{
	const struct rndis_status_msg *msg;
	int ofs;

	if (dlen < sizeof(*msg)) {
		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
		return;
	}
	msg = data;

	switch (msg->rm_status) {
	case RNDIS_STATUS_MEDIA_CONNECT:
	case RNDIS_STATUS_MEDIA_DISCONNECT:
		hn_update_link_status(sc);
		break;

	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
		/* Not really useful; ignore. */
		break;

	case RNDIS_STATUS_NETWORK_CHANGE:
		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
		if (dlen < ofs + msg->rm_stbuflen ||
		    msg->rm_stbuflen < sizeof(uint32_t)) {
			if_printf(sc->hn_ifp, "network changed\n");
		} else {
			uint32_t change;

			memcpy(&change, ((const uint8_t *)msg) + ofs,
			    sizeof(change));
			if_printf(sc->hn_ifp, "network changed, change %u\n",
			    change);
		}
		hn_change_network(sc);
		break;

	default:
		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
		    msg->rm_status);
		break;
	}
}
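/*
 * Worked example of the NETWORK_CHANGE bounds check above, with
 * hypothetical numbers: for dlen = 32, an absolute status-buffer
 * offset ofs = 28 and rm_stbuflen = 8, 28 + 8 > 32, so the 'change'
 * word is never read; only the short "network changed" note is logged
 * and hn_change_network() still runs.
 */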
static int
hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
{
	const struct rndis_pktinfo *pi = info_data;
	uint32_t mask = 0;

	while (info_dlen != 0) {
		const void *data;
		uint32_t dlen;

		if (__predict_false(info_dlen < sizeof(*pi)))
			return (EINVAL);
		if (__predict_false(info_dlen < pi->rm_size))
			return (EINVAL);
		info_dlen -= pi->rm_size;

		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
			return (EINVAL);
		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
			return (EINVAL);
		dlen = pi->rm_size - pi->rm_pktinfooffset;
		data = pi->rm_data;

		switch (pi->rm_type) {
		case NDIS_PKTINFO_TYPE_VLAN:
			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
				return (EINVAL);
			info->vlan_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_VLAN;
			break;

		case NDIS_PKTINFO_TYPE_CSUM:
			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
				return (EINVAL);
			info->csum_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_CSUM;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
				return (EINVAL);
			info->hash_value = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHVAL;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHINF:
			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
				return (EINVAL);
			info->hash_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHINF;
			break;

		default:
			goto next;
		}

		if (mask == HN_RXINFO_ALL) {
			/* All found; done */
			break;
		}
next:
		pi = (const struct rndis_pktinfo *)
		    ((const uint8_t *)pi + pi->rm_size);
	}

	/*
	 * Final fixup.
	 * - If there is no hash value, invalidate the hash info.
	 */
	if ((mask & HN_RXINFO_HASHVAL) == 0)
		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
	return (0);
}
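/*
 * Each rndis_pktinfo record walked above is a small TLV: rm_size
 * covers the whole record, rm_pktinfooffset locates the payload
 * inside it, and rm_type selects how the payload is interpreted.
 * Schematically:
 *
 *	+------------------------------------+
 *	| rm_size | rm_type | rm_pktinfooffset
 *	+------------------------------------+ <- rm_pktinfooffset
 *	| payload, dlen = rm_size - rm_pktinfooffset bytes
 *	+------------------------------------+ <- pi + rm_size (next record)
 *
 * The walk terminates early once HN_RXINFO_ALL has been collected,
 * since no further record could add anything.
 */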
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off < check_off) {
		if (__predict_true(off + len <= check_off))
			return (false);
	} else if (off > check_off) {
		if (__predict_true(check_off + check_len <= off))
			return (false);
	}
	return (true);
}
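/*
 * Half-open interval semantics of the helper above, with hypothetical
 * values:
 *
 *	hn_rndis_check_overlap(0, 10, 10, 4)	-> false (ranges only touch)
 *	hn_rndis_check_overlap(0, 11, 10, 4)	-> true  (byte 10 is shared)
 *	hn_rndis_check_overlap(8, 1, 8, 1)	-> true  (equal start offsets)
 */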
static void
hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_packet_msg *pkt;
	struct hn_rxinfo info;
	int data_off, pktinfo_off, data_len, pktinfo_len;

	/*
	 * Check length.
	 */
	if (__predict_false(dlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
		return;
	}
	pkt = data;

	if (__predict_false(dlen < pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
		return;
	}
	if (__predict_false(pkt->rm_len <
	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
		    "msglen %u, data %u, oob %u, pktinfo %u\n",
		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
		    pkt->rm_pktinfolen);
		return;
	}
	if (__predict_false(pkt->rm_datalen == 0)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
		return;
	}

	/*
	 * Check offsets.
	 */
#define IS_OFFSET_INVALID(ofs)			\
	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

	/* XXX Hyper-V does not meet data offset alignment requirement */
	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data offset %u\n", pkt->rm_dataoffset);
		return;
	}
	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "oob offset %u\n", pkt->rm_oobdataoffset);
		return;
	}
	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
		return;
	}

#undef IS_OFFSET_INVALID

	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
	data_len = pkt->rm_datalen;
	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
	pktinfo_len = pkt->rm_pktinfolen;

	/*
	 * Check OOB coverage.
	 */
	if (__predict_false(pkt->rm_oobdatalen != 0)) {
		int oob_off, oob_len;

		if_printf(rxr->hn_ifp, "got oobdata\n");
		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
		oob_len = pkt->rm_oobdatalen;

		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overflow, msglen %u, oob abs %d len %d\n",
			    pkt->rm_len, oob_off, oob_len);
			return;
		}

		/*
		 * Check against data.
		 */
		if (hn_rndis_check_overlap(oob_off, oob_len,
		    data_off, data_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps data, oob abs %d len %d, "
			    "data abs %d len %d\n",
			    oob_off, oob_len, data_off, data_len);
			return;
		}

		/*
		 * Check against pktinfo.
		 */
		if (pktinfo_len != 0 &&
		    hn_rndis_check_overlap(oob_off, oob_len,
		    pktinfo_off, pktinfo_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps pktinfo, oob abs %d len %d, "
			    "pktinfo abs %d len %d\n",
			    oob_off, oob_len, pktinfo_off, pktinfo_len);
			return;
		}
	}

	/*
	 * Check per-packet-info coverage and find useful per-packet-info.
	 */
	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
	if (__predict_true(pktinfo_len != 0)) {
		bool overlap;
		int error;

		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overflow, msglen %u, "
			    "pktinfo abs %d len %d\n",
			    pkt->rm_len, pktinfo_off, pktinfo_len);
			return;
		}

		/*
		 * Check packet info coverage.
		 */
		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
		    data_off, data_len);
		if (__predict_false(overlap)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overlaps data, pktinfo abs %d len %d, "
			    "data abs %d len %d\n",
			    pktinfo_off, pktinfo_len, data_off, data_len);
			return;
		}

		/*
		 * Find useful per-packet-info.
		 */
		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
		    pktinfo_len, &info);
		if (__predict_false(error)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
			    "pktinfo\n");
			return;
		}
	}

	if (__predict_false(data_off + data_len > pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data overflow, msglen %u, data abs %d len %d\n",
		    pkt->rm_len, data_off, data_len);
		return;
	}
	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
}
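/*
 * Region layout validated by hn_rndis_rx_data() above (offsets shown
 * after the RNDIS_PACKET_MSG_OFFSET_ABS() conversion; the ordering of
 * the three regions is only one possibility, since the checks merely
 * require containment within rm_len and pairwise non-overlap):
 *
 *	+----------------------+ 0
 *	| rndis_packet_msg     |
 *	+----------------------+ data_off
 *	| data (rm_datalen)    |
 *	+----------------------+ oob_off (only if rm_oobdatalen != 0)
 *	| oob data             |
 *	+----------------------+ pktinfo_off (only if rm_pktinfolen != 0)
 *	| per-packet-info      |
 *	+----------------------+ <= rm_len
 */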
static __inline void
hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_msghdr *hdr;

	if (__predict_false(dlen < sizeof(*hdr))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
		return;
	}
	hdr = data;

	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
		/* Hot data path. */
		hn_rndis_rx_data(rxr, data, dlen);
		/* Done! */
		return;
	}

	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
	else
		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
}
static void
hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
{
	const struct hn_nvs_hdr *hdr;

	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
		if_printf(sc->hn_ifp, "invalid nvs notify\n");
		return;
	}
	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);

	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
		/* Useless; ignore. */
		return;
	}
	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
}
static void
hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt)
{
	struct hn_nvs_sendctx *sndc;

	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
	    VMBUS_CHANPKT_DATALEN(pkt));
	/*
	 * NOTE:
	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
	 * its callback.
	 */
}
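/*
 * The cast in hn_nvs_handle_comp() works because the TX path stuffs
 * the address of its hn_nvs_sendctx into the 64-bit VMBus transaction
 * id when it sends, roughly (sketch only; the exact send-side call
 * site lives in the TX path and is not shown here):
 *
 *	vmbus_chan_send(chan, type, flags, &msg, sizeof(msg),
 *	    (uint64_t)(uintptr_t)sndc);
 *
 * The host echoes the transaction id back in the completion packet,
 * which is how the callback and its argument are recovered above.
 */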
static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

	/* Make sure that this is a RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}

	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}

	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}
		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
	}

	/*
	 * Ack the consumed RXBUF associated w/ this channel packet,
	 * so that this RXBUF can be recycled by the hypervisor.
	 */
	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}
static void
hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    uint64_t tid)
{
	struct hn_nvs_rndis_ack ack;
	int retries, error;

	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
	ack.nvs_status = HN_NVS_STATUS_OK;

	retries = 0;
again:
	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
	if (__predict_false(error == EAGAIN)) {
		/*
		 * NOTE:
		 * This should _not_ happen in the real world, since the
		 * consumption of the TX bufring from the TX path is
		 * controlled.
		 */
		if (rxr->hn_ack_failed == 0)
			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
		rxr->hn_ack_failed++;
		retries++;
		if (retries < 10) {
			DELAY(100);
			goto again;
		}
		/* RXBUF leaks! */
		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
	}
}
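/*
 * The ack above is best-effort with a small bounded retry budget (per
 * the code above, a handful of attempts a short busy-wait apart); if
 * the completion path stays congested past that budget, the RXBUF
 * slot is simply leaked until the channel is torn down, which the
 * NOTE flags as a should-never-happen condition.
 */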
static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
	struct hn_rx_ring *rxr = xrxr;
	struct hn_softc *sc = rxr->hn_ifp->if_softc;

	for (;;) {
		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
		int error, pktlen;

		pktlen = rxr->hn_pktbuf_len;
		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
		if (__predict_false(error == ENOBUFS)) {
			void *nbuf;
			int nlen;

			/*
			 * Expand channel packet buffer.
			 *
			 * XXX
			 * Use M_WAITOK here, since allocation failure
			 * is fatal.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);

			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;

		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}
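/*
 * hn_pktbuf grows geometrically in the ENOBUFS path above: the new
 * length starts at twice the old one and keeps doubling until it
 * covers the reported packet length, so repeated expansion costs
 * O(log n) allocations.  For example (hypothetical numbers), a 4KB
 * buffer confronted with a 9KB packet grows 4KB -> 8KB -> 16KB and
 * the receive is then retried.
 */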
static void
hn_tx_taskq_create(void *arg __unused)
{
	int i;

	/*
	 * Fix the # of TX taskqueues.
	 */
	if (hn_tx_taskq_cnt <= 0)
		hn_tx_taskq_cnt = 1;
	else if (hn_tx_taskq_cnt > mp_ncpus)
		hn_tx_taskq_cnt = mp_ncpus;

	/*
	 * Fix the TX taskqueue mode.
	 */
	switch (hn_tx_taskq_mode) {
	case HN_TX_TASKQ_M_INDEP:
	case HN_TX_TASKQ_M_GLOBAL:
	case HN_TX_TASKQ_M_EVTTQ:
		break;
	default:
		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
		break;
	}

	if (vm_guest != VM_GUEST_HV)
		return;

	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
		return;

	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
	    M_DEVBUF, M_WAITOK);
	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
		    "hn tx%d", i);
	}
}
SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_create, NULL);
static void
hn_tx_taskq_destroy(void *arg __unused)
{

	if (hn_tx_taskque != NULL) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(hn_tx_taskque[i]);
		free(hn_tx_taskque, M_DEVBUF);
	}
}
SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_destroy, NULL);
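/*
 * NOTE:
 * hn_tx_taskq_create() and hn_tx_taskq_destroy() are paired through
 * SYSINIT/SYSUNINIT at SI_SUB_DRIVERS/SI_ORDER_SECOND, so the shared
 * taskqueues (when the global mode is selected) exist for the whole
 * time the driver can be attached and are torn down in reverse order
 * on module unload.
 */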