/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/ethernet.h>
#include <net/if_arp.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"
#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
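
/*
 * Illustrative note (not part of the driver): HN_RNDIS_PKT_LEN is the
 * worst-case size of the RNDIS packet message built for transmission:
 * the fixed message header plus one per-packet-info record for each
 * kind of metadata this driver may attach (hash value, VLAN, LSOv2,
 * TX checksum).  Assuming HN_RNDIS_PKTINFO_SIZE() already accounts for
 * the per-record header and alignment, reserving this much up front,
 * e.g.
 *
 *	struct rndis_packet_msg *pkt = txd->rndis_pkt;
 *	uint32_t *pi = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
 *	    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
 *
 * guarantees that hn_rndis_pktinfo_append() below can never overflow
 * the reservation, no matter which subset of records gets appended.
 */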
#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)					\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)	sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)	sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)						\
do {								\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)		\
		DELAY(1000);					\
} while (0)
#define HN_UNLOCK(sc)		sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK		(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK	(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)					\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)				\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)					\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)					\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
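
/*
 * Worked example (illustrative only): with a 32-byte aggregation
 * alignment and a 1514-byte frame, HN_PKTSIZE() evaluates to
 * roundup2(1514 + HN_RNDIS_PKT_LEN, 32); if HN_RNDIS_PKT_LEN were,
 * say, 102 bytes, that is roundup2(1616, 32) == 1632.  HN_PKTSIZE_MIN()
 * applies the same computation to the smallest valid Ethernet frame
 * and is used by the aggregation code below to decide whether another
 * packet can still fit into an aggregating chimney buffer.
 */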
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static void			hn_stop(struct hn_softc *);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_polling(struct hn_softc *, u_int);
static void			hn_chan_polling(struct vmbus_channel *, u_int);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);
static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static int			hn_set_rxfilter(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust TCP segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust UDP datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust IP packet verification on host side, "
    "when csum info is missing (global setting)");
/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");
/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque; /* shared TX taskqueues */

static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
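
/*
 * Worked example (illustrative only, assuming LONG_BIT == 64): chimney
 * slot 67 lives in bitmap word 67 / 64 == 1, bit 67 % 64 == 3, so
 * hn_chim_free(sc, 67) effectively performs:
 *
 *	idx  = 67 / LONG_BIT;			// 1
 *	mask = 1UL << (67 % LONG_BIT);		// 0x8
 *	atomic_clear_long(&sc->hn_chim_bmap[1], 0x8);
 *
 * hn_chim_alloc() is the inverse: ffsl(~bmap[i]) locates the first
 * clear bit and atomic_testandset_long() claims it without any lock,
 * retrying if another CPU won the race.
 */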
#if defined(INET6) || defined(INET)
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
#undef PULLUP_HDR
}
#endif	/* INET6 || INET */
static int
hn_set_rxfilter(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (ifp->if_flags & IFF_PROMISC) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
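
/*
 * Summary sketch (illustrative only): the effective aggregation limits
 * computed above are the minimum of several sources,
 *
 *	size = min(tunable hn_tx_agg_size (if >= 0),
 *	    sc->hn_rndis_agg_size,	// offered by the host RNDIS
 *	    sc->hn_chim_szmax);		// chimney sending buffer slot
 *	pkts = min(tunable hn_tx_agg_pkts (if >= 0),
 *	    sc->hn_rndis_agg_pkts);
 *
 * and aggregation is disabled outright when the resulting size cannot
 * hold at least two minimally-sized packets.
 */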
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}
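
/*
 * Example (illustrative only): if the indirect table was generated for
 * 8 channels but only 4 RX rings are now in use, every entry >= 4 is
 * clamped to the last usable ring:
 *
 *	before: rss_ind[] = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, ... }
 *	after:  rss_ind[] = { 0, 1, 2, 3, 3, 3, 3, 3, 0, 1, ... }
 */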
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
	0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};
static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
	    &g_net_vsc_device_type) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}
static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);
	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions, which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if an error happens later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only a limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fix up TX settings after the synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);
	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disable polling");
	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */

#if __FreeBSD_version >= 1100045
	/* 64bits baudrate. */
	ifp->if_baudrate = IF_Gbps(10);
#else
	/* if_baudrate is 32bits on 32bit system. */
	ifp->if_baudrate = IF_Gbps(1);
#endif
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
#ifdef foo
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default; they can still
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	return (0);
failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}
static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp;

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		/*
		 * In case that the vmbus missed the orphan handler
		 * installation.
		 */
		vmbus_xact_ctx_orphan(sc->hn_xact);
	}

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * tasks have to be suspended manually here.
			 */
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);
		}
		HN_UNLOCK(sc);
		ether_ifdetach(ifp);
	}

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(sc->hn_tx_taskqs[i]);
		free(sc->hn_tx_taskqs, M_DEVBUF);
	}
	taskqueue_free(sc->hn_mgmt_taskq0);

	if (sc->hn_xact != NULL) {
		/*
		 * Uninstall the orphan handler _before_ the xact is
		 * destructed.
		 */
		vmbus_chan_unset_orphan(sc->hn_prichan);
		vmbus_xact_ctx_destroy(sc->hn_xact);
	}

	if_free(ifp);

	HN_LOCK_DESTROY(sc);
	return (0);
}
static int
hn_shutdown(device_t dev)
{

	return (0);
}

static void
hn_link_status(struct hn_softc *sc)
{
	uint32_t link_status;
	int error;

	error = hn_rndis_get_linkstatus(sc, &link_status);
	if (error) {
		/* XXX what to do? */
		return;
	}

	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
	else
		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp,
	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
	    LINK_STATE_UP : LINK_STATE_DOWN);
}

static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		return;
	hn_link_status(sc);
}

static void
hn_netchg_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Prevent any link status checks from running. */
	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;

	/*
	 * Fake up a [link down --> link up] state change; 5 seconds
	 * delay is used, which closely simulates miibus reaction
	 * upon link down event.
	 */
	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
	    &sc->hn_netchg_status, 5 * hz);
}

static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Re-allow link status checks. */
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
	hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}
static int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m = *m_head;
	int error;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		struct mbuf *m_new;

		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		if (m_new == NULL)
			return ENOBUFS;
		else
			*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	}
	if (!error) {
		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
		    BUS_DMASYNC_PREWRITE);
		txd->flags |= HN_TXD_FLAG_DMAMAP;
	}
	return error;
}
static int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return 0;

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("recursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
	txr->hn_txdesc_avail++;
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else	/* HN_USE_TXDESC_BUFRING */
#ifdef HN_DEBUG
	atomic_add_int(&txr->hn_txdesc_avail, 1);
#endif
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif	/* !HN_USE_TXDESC_BUFRING */

	return 1;
}
static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	txd = SLIST_FIRST(&txr->hn_txlist);
	if (txd != NULL) {
		KASSERT(txr->hn_txdesc_avail > 0,
		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
		txr->hn_txdesc_avail--;
		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
	}
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

	if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
#ifdef HN_DEBUG
		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
#endif	/* HN_USE_TXDESC_BUFRING */
		KASSERT(txd->m == NULL && txd->refs == 0 &&
		    STAILQ_EMPTY(&txd->agg_list) &&
		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
		    txd->chim_size == 0 &&
		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
		txd->flags &= ~HN_TXD_FLAG_ONLIST;
		txd->refs = 1;
	}
	return txd;
}
static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

	/* 0->1 transition will never work */
	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	atomic_add_int(&txd->refs, 1);
}

static __inline void
hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
{

	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("recursive aggregation on aggregating txdesc"));

	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("already aggregated"));
	KASSERT(STAILQ_EMPTY(&txd->agg_list),
	    ("recursive aggregation on to-be-aggregated txdesc"));

	txd->flags |= HN_TXD_FLAG_ONAGG;
	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
}
static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
	bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
		pending = true;
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	if (!buf_ring_full(txr->hn_txdesc_br))
		pending = true;
#endif
	return (pending);
}

static __inline void
hn_txeof(struct hn_tx_ring *txr)
{
	txr->hn_has_txeof = 0;
	txr->hn_txeof(txr);
}

static void
hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
	struct hn_txdesc *txd = sndc->hn_cbarg;
	struct hn_tx_ring *txr;

	txr = txd->txr;
	KASSERT(txr->hn_chan == chan,
	    ("channel mismatch, on chan%u, should be chan%u",
	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));

	txr->hn_has_txeof = 1;
	hn_txdesc_put(txr, txd);

	++txr->hn_txdone_cnt;
	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
		txr->hn_txdone_cnt = 0;
		if (txr->hn_oactive)
			hn_txeof(txr);
	}
}

static void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
	struct lro_ctrl *lro = &rxr->hn_lro;
	struct lro_entry *queued;

	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
		SLIST_REMOVE_HEAD(&lro->lro_active, next);
		tcp_lro_flush(lro, queued);
	}
#endif

	/*
	 * NOTE:
	 * 'txr' could be NULL, if multiple channels and
	 * ifnet.if_start method are enabled.
	 */
	if (txr == NULL || !txr->hn_has_txeof)
		return;

	txr->hn_txdone_cnt = 0;
	hn_txeof(txr);
}
static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
	    ("invalid RNDIS packet msg offset %u", ofs));
	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}
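
/*
 * Worked example (illustrative only): while an RNDIS packet message is
 * being composed, rm_dataoffset and rm_pktinfooffset count from the
 * beginning of struct rndis_packet_msg; on the wire they must count
 * from the rm_dataoffset field instead.  Assuming the fixed header is
 * 44 bytes and rm_dataoffset sits at byte offset 8,
 *
 *	hn_rndis_pktmsg_offset(44) == 44 - 8 == 36
 *
 * which is why hn_encap() converts both offsets right before sending.
 */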
static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
{
	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
	struct rndis_pktinfo *pi;

	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

	/*
	 * Per-packet-info does not move; it only grows.
	 *
	 * NOTE:
	 * rm_pktinfooffset in this phase counts from the beginning
	 * of rndis_packet_msg.
	 */
	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
	    pkt->rm_pktinfolen);
	pkt->rm_pktinfolen += pi_size;

	pi->rm_size = pi_size;
	pi->rm_type = pi_type;
	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

	/* Data immediately follow per-packet-info. */
	pkt->rm_dataoffset += pi_size;

	/* Update RNDIS packet msg length */
	pkt->rm_len += pi_size;

	return (pi->rm_data);
}
static int
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;
	struct mbuf *m;
	int error, pkts;

	txd = txr->hn_agg_txd;
	KASSERT(txd != NULL, ("no aggregate txdesc"));

	/*
	 * Since hn_txpkt() will reset this temporary stat, save
	 * it now, so that oerrors can be updated properly, if
	 * hn_txpkt() ever fails.
	 */
	pkts = txr->hn_stat_pkts;

	/*
	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
	 * failure, save it for later freeing, if hn_txpkt() ever
	 * fails.
	 */
	m = txd->m;
	error = hn_txpkt(ifp, txr, txd);
	if (__predict_false(error)) {
		/* txd is freed, but m is not. */
		m_freem(m);

		txr->hn_flush_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
	}

	/* Reset all aggregation states. */
	txr->hn_agg_txd = NULL;
	txr->hn_agg_szleft = 0;
	txr->hn_agg_pktleft = 0;
	txr->hn_agg_prevpkt = NULL;

	return (error);
}
static void *
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    int pktsize)
{
	void *chim;

	if (txr->hn_agg_txd != NULL) {
		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
			int olen;

			/*
			 * Update the previous RNDIS packet's total length,
			 * it can be increased due to the mandatory alignment
			 * padding for this RNDIS packet.  And update the
			 * aggregating txdesc's chimney sending buffer size
			 * accordingly.
			 *
			 * XXX
			 * Zero-out the padding, as required by the RNDIS spec.
			 */
			olen = pkt->rm_len;
			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
			agg_txd->chim_size += pkt->rm_len - olen;

			/* Link this txdesc to the parent. */
			hn_txdesc_agg(agg_txd, txd);

			chim = (uint8_t *)pkt + pkt->rm_len;
			/* Save the current packet for later fixup. */
			txr->hn_agg_prevpkt = chim;

			txr->hn_agg_pktleft--;
			txr->hn_agg_szleft -= pktsize;
			if (txr->hn_agg_szleft <=
			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
				/*
				 * Probably can't aggregate more packets,
				 * flush this aggregating txdesc proactively.
				 */
				txr->hn_agg_pktleft = 0;
			}
			/* Done! */
			return (chim);
		}
		hn_flush_txagg(ifp, txr);
	}
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	txr->hn_tx_chimney_tried++;
	txd->chim_index = hn_chim_alloc(txr->hn_sc);
	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
		return (NULL);
	txr->hn_tx_chimney++;

	chim = txr->hn_sc->hn_chim +
	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);

	if (txr->hn_agg_pktmax > 1 &&
	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
		txr->hn_agg_txd = txd;
		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
		txr->hn_agg_prevpkt = chim;
	}
	return (chim);
}
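
/*
 * Flow sketch (illustrative only) of the three outcomes above:
 *
 * 1. An aggregating txdesc exists and still has size/count budget:
 *    link txd to it and return a pointer into the shared chimney
 *    buffer, right after the previous (padded) RNDIS packet.
 * 2. Budget exhausted: flush the pending aggregate first, then fall
 *    through to allocate a fresh chimney slot, possibly making txd
 *    the head of a new aggregate.
 * 3. No chimney slot available: return NULL, and hn_encap() falls
 *    back to the SG-list (page buffer) transmission path.
 */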
/*
 * NOTE:
 * If this function fails, then both txd and m_head0 will be freed.
 */
static int
hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head0)
{
	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
	int error, nsegs, i;
	struct mbuf *m_head = *m_head0;
	struct rndis_packet_msg *pkt;
	uint32_t *pi_data;
	void *chim = NULL;
	int pkt_hlen, pkt_size;

	pkt = txd->rndis_pkt;
	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
	if (pkt_size < txr->hn_chim_size) {
		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
		if (chim != NULL)
			pkt = chim;
	} else {
		if (txr->hn_agg_txd != NULL)
			hn_flush_txagg(ifp, txr);
	}

	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
	pkt->rm_dataoffset = sizeof(*pkt);
	pkt->rm_datalen = m_head->m_pkthdr.len;
	pkt->rm_oobdataoffset = 0;
	pkt->rm_oobdatalen = 0;
	pkt->rm_oobdataelements = 0;
	pkt->rm_pktinfooffset = sizeof(*pkt);
	pkt->rm_pktinfolen = 0;
	pkt->rm_vchandle = 0;
	pkt->rm_reserved = 0;

	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
		/*
		 * Set the hash value for this packet, so that the host could
		 * dispatch the TX done event for this packet back to this TX
		 * ring's channel.
		 */
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
		*pi_data = txr->hn_tx_idx;
	}

	if (m_head->m_flags & M_VLANTAG) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
		*pi_data = NDIS_VLAN_INFO_MAKE(
		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
	}

	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
#ifdef INET
		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET6
		{
			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#endif	/* INET6 || INET */
	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
		if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
			*pi_data = NDIS_TXCSUM_INFO_IPV6;
		} else {
			*pi_data = NDIS_TXCSUM_INFO_IPV4;
			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
		}

		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
		else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP6_UDP))
			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
	}

	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
	/* Convert RNDIS packet message offsets */
	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);

	/*
	 * Fast path: Chimney sending.
	 */
	if (chim != NULL) {
		struct hn_txdesc *tgt_txd = txd;

		if (txr->hn_agg_txd != NULL) {
			tgt_txd = txr->hn_agg_txd;
#ifdef INVARIANTS
			*m_head0 = NULL;
#endif
		}

		KASSERT(pkt == chim,
		    ("RNDIS pkt not in chimney sending buffer"));
		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
		    ("chimney sending buffer is not used"));
		tgt_txd->chim_size += pkt->rm_len;

		m_copydata(m_head, 0, m_head->m_pkthdr.len,
		    ((uint8_t *)chim) + pkt_hlen);

		txr->hn_gpa_cnt = 0;
		txr->hn_sendpkt = hn_txpkt_chim;
		goto done;
	}

	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
	    ("chimney buffer is used"));
	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));

	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
	if (__predict_false(error)) {
		int freed;

		/*
		 * This mbuf is not linked w/ the txd yet, so free it now.
		 */
		m_freem(m_head);
		*m_head0 = NULL;

		freed = hn_txdesc_put(txr, txd);
		KASSERT(freed != 0,
		    ("fail to free txd upon txdma error"));

		txr->hn_txdma_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
		return error;
	}
	*m_head0 = m_head;

	/* +1 RNDIS packet message */
	txr->hn_gpa_cnt = nsegs + 1;

	/* send packet with page buffer */
	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
	txr->hn_gpa[0].gpa_len = pkt_hlen;

	/*
	 * Fill the page buffers with mbuf info after the page
	 * buffer for RNDIS packet message.
	 */
	for (i = 0; i < nsegs; ++i) {
		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];

		gpa->gpa_page = atop(segs[i].ds_addr);
		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
		gpa->gpa_len = segs[i].ds_len;
	}

	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
	txd->chim_size = 0;

	txr->hn_sendpkt = hn_txpkt_sglist;
done:
	txd->m = m_head;

	/* Set the completion routine */
	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);

	/* Update temporary stats for later use. */
	txr->hn_stat_pkts++;
	txr->hn_stat_size += m_head->m_pkthdr.len;
	if (m_head->m_flags & M_MCAST)
		txr->hn_stat_mcasts++;

	return 0;
}
/*
 * NOTE:
 * If this function fails, then txd will be freed, but the mbuf
 * associated w/ the txd will _not_ be freed.
 */
static int
hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	int error, send_failed = 0, has_bpf;

again:
	has_bpf = bpf_peers_present(ifp->if_bpf);
	if (has_bpf) {
		/*
		 * Make sure that this txd and any aggregated txds are not
		 * freed before ETHER_BPF_MTAP.
		 */
		hn_txdesc_hold(txd);
	}
	error = txr->hn_sendpkt(txr, txd);
	if (!error) {
		if (has_bpf) {
			const struct hn_txdesc *tmp_txd;

			ETHER_BPF_MTAP(ifp, txd->m);
			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
				ETHER_BPF_MTAP(ifp, tmp_txd->m);
		}

		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
#ifdef HN_IFSTART_SUPPORT
		if (!hn_use_if_start)
#endif
		{
			if_inc_counter(ifp, IFCOUNTER_OBYTES,
			    txr->hn_stat_size);
			if (txr->hn_stat_mcasts != 0) {
				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
				    txr->hn_stat_mcasts);
			}
		}
		txr->hn_pkts += txr->hn_stat_pkts;
	}
	if (has_bpf)
		hn_txdesc_put(txr, txd);

	if (__predict_false(error)) {
		int freed;

		/*
		 * This should "really rarely" happen.
		 *
		 * XXX Too many RX to be acked or too many sideband
		 * commands to run?  Ask netvsc_channel_rollup()
		 * to kick start later.
		 */
		txr->hn_has_txeof = 1;
		if (!send_failed) {
			txr->hn_send_failed++;
			send_failed = 1;
			/*
			 * Try sending again after set hn_has_txeof;
			 * in case that we missed the last
			 * netvsc_channel_rollup().
			 */
			goto again;
		}
		if_printf(ifp, "send failed\n");

		/*
		 * Caller will perform further processing on the
		 * associated mbuf, so don't free it in hn_txdesc_put();
		 * only unload it from the DMA map in hn_txdesc_put(),
		 * if it was loaded.
		 */
		txd->m = NULL;
		freed = hn_txdesc_put(txr, txd);
		KASSERT(freed != 0,
		    ("fail to free txd upon send error"));

		txr->hn_send_failed++;
	}

	/* Reset temporary stats, after this sending is done. */
	txr->hn_stat_size = 0;
	txr->hn_stat_pkts = 0;
	txr->hn_stat_mcasts = 0;

	return (error);
}
/*
 * Append the specified data to the indicated mbuf chain,
 * extending the mbuf chain if the new data does not fit in
 * existing space.
 *
 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
 * There should be an equivalent in the kernel mbuf code,
 * but there does not appear to be one yet.
 *
 * Differs from m_append() in that additional mbufs are
 * allocated with cluster size MJUMPAGESIZE, and filled
 * accordingly.
 *
 * Return 1 if able to complete the job; otherwise 0.
 */
static int
hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
{
	struct mbuf *m, *n;
	int remainder, space;

	for (m = m0; m->m_next != NULL; m = m->m_next)
		;
	remainder = len;
	space = M_TRAILINGSPACE(m);
	if (space > 0) {
		/*
		 * Copy into available space.
		 */
		if (space > remainder)
			space = remainder;
		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
		m->m_len += space;
		cp += space;
		remainder -= space;
	}
	while (remainder > 0) {
		/*
		 * Allocate a new mbuf; could check space
		 * and allocate a cluster instead.
		 */
		n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
		if (n == NULL)
			break;
		n->m_len = min(MJUMPAGESIZE, remainder);
		bcopy(cp, mtod(n, caddr_t), n->m_len);
		cp += n->m_len;
		remainder -= n->m_len;
		m->m_next = n;
		m = n;
	}
	if (m0->m_flags & M_PKTHDR)
		m0->m_pkthdr.len += len - remainder;

	return (remainder == 0);
}
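
/*
 * Usage sketch (illustrative only): hn_rxpkt() below relies on
 * hv_m_append() to copy a received frame into a cluster mbuf,
 * extending the chain with MJUMPAGESIZE clusters on demand:
 *
 *	m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
 *	if (m_new != NULL && !hv_m_append(m_new, dlen, data))
 *		m_freem(m_new);	// ran out of mbufs mid-copy
 *
 * hn_rxpkt() itself does not check the return value; the check above
 * is added only to show the 1-on-success/0-on-failure contract.
 */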
#if defined(INET) || defined(INET6)
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
	if (hn_lro_mbufq_depth) {
		tcp_lro_queue_mbuf(lc, m);
		return 0;
	}
#endif
	return tcp_lro_rx(lc, m, 0);
}
#endif
static int
hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
    const struct hn_rxinfo *info)
{
	struct ifnet *ifp = rxr->hn_ifp;
	struct mbuf *m_new;
	int size, do_lro = 0, do_csum = 1;
	int hash_type = M_HASHTYPE_OPAQUE;

	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
		return (0);

	/*
	 * Bail out if packet contains more data than configured MTU.
	 */
	if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
		return (0);
	} else if (dlen <= MHLEN) {
		m_new = m_gethdr(M_NOWAIT, MT_DATA);
		if (m_new == NULL) {
			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
			return (0);
		}
		memcpy(mtod(m_new, void *), data, dlen);
		m_new->m_pkthdr.len = m_new->m_len = dlen;
		rxr->hn_small_pkts++;
	} else {
		/*
		 * Get an mbuf with a cluster.  For packets 2K or less,
		 * get a standard 2K cluster.  For anything larger, get a
		 * 4K cluster.  Any buffers larger than 4K can cause problems
		 * if looped around to the Hyper-V TX channel, so avoid them.
		 */
		size = MCLBYTES;
		if (dlen > MCLBYTES) {
			/* 4096 */
			size = MJUMPAGESIZE;
		}

		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
		if (m_new == NULL) {
			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
			return (0);
		}

		hv_m_append(m_new, dlen, data);
	}
	m_new->m_pkthdr.rcvif = ifp;

	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
		do_csum = 0;

	/* receive side checksum offload */
	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
		/* IP csum offload */
		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
			m_new->m_pkthdr.csum_flags |=
			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
			rxr->hn_csum_ip++;
		}

		/* TCP/UDP csum offload */
		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
			m_new->m_pkthdr.csum_flags |=
			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
			m_new->m_pkthdr.csum_data = 0xffff;
			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
				rxr->hn_csum_tcp++;
			else
				rxr->hn_csum_udp++;
		}
2178 * As of this writing (Oct 28th, 2016), the host side turns
2179 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2180 * the do_lro setting here is actually _not_ accurate. We
2181 * depend on the RSS hash type check to reset do_lro.
2183 if ((info->csum_info &
2184 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2185 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2188 const struct ether_header *eh;
2193 if (m_new->m_len < hoff)
2195 eh = mtod(m_new, struct ether_header *);
2196 etype = ntohs(eh->ether_type);
2197 if (etype == ETHERTYPE_VLAN) {
2198 const struct ether_vlan_header *evl;
2200 hoff = sizeof(*evl);
2201 if (m_new->m_len < hoff)
2203 evl = mtod(m_new, struct ether_vlan_header *);
2204 etype = ntohs(evl->evl_proto);
2207 if (etype == ETHERTYPE_IP) {
2210 pr = hn_check_iplen(m_new, hoff);
2211 if (pr == IPPROTO_TCP) {
2213 (rxr->hn_trust_hcsum &
2214 HN_TRUST_HCSUM_TCP)) {
2215 rxr->hn_csum_trusted++;
2216 m_new->m_pkthdr.csum_flags |=
2217 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2218 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2219 m_new->m_pkthdr.csum_data = 0xffff;
2222 } else if (pr == IPPROTO_UDP) {
2224 (rxr->hn_trust_hcsum &
2225 HN_TRUST_HCSUM_UDP)) {
2226 rxr->hn_csum_trusted++;
2227 m_new->m_pkthdr.csum_flags |=
2228 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2229 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2230 m_new->m_pkthdr.csum_data = 0xffff;
2232 } else if (pr != IPPROTO_DONE && do_csum &&
2233 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2234 rxr->hn_csum_trusted++;
2235 m_new->m_pkthdr.csum_flags |=
2236 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2241 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2242 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2243 NDIS_VLAN_INFO_ID(info->vlan_info),
2244 NDIS_VLAN_INFO_PRI(info->vlan_info),
2245 NDIS_VLAN_INFO_CFI(info->vlan_info));
2246 m_new->m_flags |= M_VLANTAG;
2249 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2251 m_new->m_pkthdr.flowid = info->hash_value;
2252 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2253 NDIS_HASH_FUNCTION_TOEPLITZ) {
2254 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2258 * do_lro is reset if the hash types are not TCP
2259 * related. See the comment in the above csum_flags
2263 case NDIS_HASH_IPV4:
2264 hash_type = M_HASHTYPE_RSS_IPV4;
2268 case NDIS_HASH_TCP_IPV4:
2269 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2272 case NDIS_HASH_IPV6:
2273 hash_type = M_HASHTYPE_RSS_IPV6;
2277 case NDIS_HASH_IPV6_EX:
2278 hash_type = M_HASHTYPE_RSS_IPV6_EX;
2282 case NDIS_HASH_TCP_IPV6:
2283 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2286 case NDIS_HASH_TCP_IPV6_EX:
2287 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2292 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2294 M_HASHTYPE_SET(m_new, hash_type);
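/*
 * When the host supplies no hash, the RX ring index set as the
 * flowid above keeps a flow on the same (paired) TX ring in
 * hn_transmit(), since M_HASHTYPE_OPAQUE still counts as a valid
 * hash type there.
 */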
2297 * Note: Moved RX completion back to hv_nv_on_receive() so all
2298 * messages (not just data messages) will trigger a response.
2304 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2305 #if defined(INET) || defined(INET6)
2306 struct lro_ctrl *lro = &rxr->hn_lro;
2309 rxr->hn_lro_tried++;
2310 if (hn_lro_rx(lro, m_new) == 0) {
2318 /* We're not holding the lock here, so don't release it */
2319 (*ifp->if_input)(ifp, m_new);
2325 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2327 struct hn_softc *sc = ifp->if_softc;
2328 struct ifreq *ifr = (struct ifreq *)data;
2329 int mask, error = 0;
2333 if (ifr->ifr_mtu > HN_MTU_MAX) {
2340 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2345 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2346 /* Can't change MTU */
2352 if (ifp->if_mtu == ifr->ifr_mtu) {
2357 /* Disable polling. */
2361 * Suspend this interface before the synthetic parts
2367 * Detach the synthetic parts, i.e. NVS and RNDIS.
2369 hn_synth_detach(sc);
2372 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2373 * with the new MTU setting.
2375 error = hn_synth_attach(sc, ifr->ifr_mtu);
2382 * Commit the requested MTU, after the synthetic parts
2383 * have been successfully attached.
2385 ifp->if_mtu = ifr->ifr_mtu;
2388 * Make sure that various parameters based on MTU are
2389 * still valid, after the MTU change.
2391 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2392 hn_set_chim_size(sc, sc->hn_chim_szmax);
2393 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2394 #if __FreeBSD_version >= 1100099
2395 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2396 HN_LRO_LENLIM_MIN(ifp))
2397 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2401 * All done! Resume the interface now.
2406 * Re-enable polling if this interface is running and
2407 * the polling is requested.
2409 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
2410 hn_polling(sc, sc->hn_pollhz);
2418 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2423 if (ifp->if_flags & IFF_UP) {
2424 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2426 * Caller might hold a mutex, e.g.
2427 * bpf; use busy-wait for the RNDIS
2431 hn_set_rxfilter(sc);
2437 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2440 sc->hn_if_flags = ifp->if_flags;
2447 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2449 if (mask & IFCAP_TXCSUM) {
2450 ifp->if_capenable ^= IFCAP_TXCSUM;
2451 if (ifp->if_capenable & IFCAP_TXCSUM)
2452 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2454 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2456 if (mask & IFCAP_TXCSUM_IPV6) {
2457 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2458 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2459 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2461 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2464 /* TODO: flip RNDIS offload parameters for RXCSUM. */
2465 if (mask & IFCAP_RXCSUM)
2466 ifp->if_capenable ^= IFCAP_RXCSUM;
2468 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
2469 if (mask & IFCAP_RXCSUM_IPV6)
2470 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2473 if (mask & IFCAP_LRO)
2474 ifp->if_capenable ^= IFCAP_LRO;
2476 if (mask & IFCAP_TSO4) {
2477 ifp->if_capenable ^= IFCAP_TSO4;
2478 if (ifp->if_capenable & IFCAP_TSO4)
2479 ifp->if_hwassist |= CSUM_IP_TSO;
2481 ifp->if_hwassist &= ~CSUM_IP_TSO;
2483 if (mask & IFCAP_TSO6) {
2484 ifp->if_capenable ^= IFCAP_TSO6;
2485 if (ifp->if_capenable & IFCAP_TSO6)
2486 ifp->if_hwassist |= CSUM_IP6_TSO;
2488 ifp->if_hwassist &= ~CSUM_IP6_TSO;
2498 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2502 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2504 * Multicast uses mutex; use busy-wait for
2508 hn_set_rxfilter(sc);
2517 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2521 error = ether_ioctl(ifp, cmd, data);
2528 hn_stop(struct hn_softc *sc)
2530 struct ifnet *ifp = sc->hn_ifp;
2535 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2536 ("synthetic parts were not attached"));
2538 /* Disable polling. */
2541 /* Clear RUNNING bit _before_ hn_suspend_data() */
2542 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2543 hn_suspend_data(sc);
2545 /* Clear OACTIVE bit. */
2546 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2547 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2548 sc->hn_tx_ring[i].hn_oactive = 0;
2552 hn_init_locked(struct hn_softc *sc)
2554 struct ifnet *ifp = sc->hn_ifp;
2559 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2562 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2565 /* Configure RX filter */
2566 hn_set_rxfilter(sc);
2568 /* Clear OACTIVE bit. */
2569 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2570 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2571 sc->hn_tx_ring[i].hn_oactive = 0;
2573 /* Clear TX 'suspended' bit. */
2574 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2576 /* Everything is ready; unleash! */
2577 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2579 /* Re-enable polling if requested. */
2580 if (sc->hn_pollhz > 0)
2581 hn_polling(sc, sc->hn_pollhz);
2587 struct hn_softc *sc = xsc;
2594 #if __FreeBSD_version >= 1100099
2597 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2599 struct hn_softc *sc = arg1;
2600 unsigned int lenlim;
2603 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2604 error = sysctl_handle_int(oidp, &lenlim, 0, req);
2605 if (error || req->newptr == NULL)
2609 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2610 lenlim > TCP_LRO_LENGTH_MAX) {
2614 hn_set_lro_lenlim(sc, lenlim);
2621 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2623 struct hn_softc *sc = arg1;
2624 int ackcnt, error, i;
2627 * lro_ackcnt_lim is the append count limit;
2628 * +1 turns it into the aggregation limit.
2630 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2631 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2632 if (error || req->newptr == NULL)
2635 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2639 * Convert the aggregation limit back to the append count limit.
2644 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2645 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2653 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2655 struct hn_softc *sc = arg1;
2660 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2663 error = sysctl_handle_int(oidp, &on, 0, req);
2664 if (error || req->newptr == NULL)
2668 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2669 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2672 rxr->hn_trust_hcsum |= hcsum;
2674 rxr->hn_trust_hcsum &= ~hcsum;
2681 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2683 struct hn_softc *sc = arg1;
2684 int chim_size, error;
2686 chim_size = sc->hn_tx_ring[0].hn_chim_size;
2687 error = sysctl_handle_int(oidp, &chim_size, 0, req);
2688 if (error || req->newptr == NULL)
2691 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2695 hn_set_chim_size(sc, chim_size);
2700 #if __FreeBSD_version < 1100095
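/*
 * The hn_rx_stat_{int,u64,ulong}_sysctl() and hn_tx_stat_ulong_sysctl()
 * handlers below all follow the same pattern: `arg2' carries the byte
 * offset of a per-ring statistic field, a read sums that field across
 * the rings, and writing any value through the sysctl zeroes the field
 * on every ring.
 */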
2702 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2704 struct hn_softc *sc = arg1;
2705 int ofs = arg2, i, error;
2706 struct hn_rx_ring *rxr;
2710 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2711 rxr = &sc->hn_rx_ring[i];
2712 stat += *((int *)((uint8_t *)rxr + ofs));
2715 error = sysctl_handle_64(oidp, &stat, 0, req);
2716 if (error || req->newptr == NULL)
2719 /* Zero out this stat. */
2720 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2721 rxr = &sc->hn_rx_ring[i];
2722 *((int *)((uint8_t *)rxr + ofs)) = 0;
2728 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2730 struct hn_softc *sc = arg1;
2731 int ofs = arg2, i, error;
2732 struct hn_rx_ring *rxr;
2736 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2737 rxr = &sc->hn_rx_ring[i];
2738 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2741 error = sysctl_handle_64(oidp, &stat, 0, req);
2742 if (error || req->newptr == NULL)
2745 /* Zero out this stat. */
2746 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2747 rxr = &sc->hn_rx_ring[i];
2748 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2756 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2758 struct hn_softc *sc = arg1;
2759 int ofs = arg2, i, error;
2760 struct hn_rx_ring *rxr;
2764 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2765 rxr = &sc->hn_rx_ring[i];
2766 stat += *((u_long *)((uint8_t *)rxr + ofs));
2769 error = sysctl_handle_long(oidp, &stat, 0, req);
2770 if (error || req->newptr == NULL)
2773 /* Zero out this stat. */
2774 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2775 rxr = &sc->hn_rx_ring[i];
2776 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
2782 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2784 struct hn_softc *sc = arg1;
2785 int ofs = arg2, i, error;
2786 struct hn_tx_ring *txr;
2790 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2791 txr = &sc->hn_tx_ring[i];
2792 stat += *((u_long *)((uint8_t *)txr + ofs));
2795 error = sysctl_handle_long(oidp, &stat, 0, req);
2796 if (error || req->newptr == NULL)
2799 /* Zero out this stat. */
2800 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2801 txr = &sc->hn_tx_ring[i];
2802 *((u_long *)((uint8_t *)txr + ofs)) = 0;
2808 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2810 struct hn_softc *sc = arg1;
2811 int ofs = arg2, i, error, conf;
2812 struct hn_tx_ring *txr;
2814 txr = &sc->hn_tx_ring[0];
2815 conf = *((int *)((uint8_t *)txr + ofs));
2817 error = sysctl_handle_int(oidp, &conf, 0, req);
2818 if (error || req->newptr == NULL)
2822 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2823 txr = &sc->hn_tx_ring[i];
2824 *((int *)((uint8_t *)txr + ofs)) = conf;
2832 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2834 struct hn_softc *sc = arg1;
2837 size = sc->hn_agg_size;
2838 error = sysctl_handle_int(oidp, &size, 0, req);
2839 if (error || req->newptr == NULL)
2843 sc->hn_agg_size = size;
2851 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
2853 struct hn_softc *sc = arg1;
2856 pkts = sc->hn_agg_pkts;
2857 error = sysctl_handle_int(oidp, &pkts, 0, req);
2858 if (error || req->newptr == NULL)
2862 sc->hn_agg_pkts = pkts;
2870 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
2872 struct hn_softc *sc = arg1;
2875 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
2876 return (sysctl_handle_int(oidp, &pkts, 0, req));
2880 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
2882 struct hn_softc *sc = arg1;
2885 align = sc->hn_tx_ring[0].hn_agg_align;
2886 return (sysctl_handle_int(oidp, &align, 0, req));
2890 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
2893 vmbus_chan_poll_disable(chan);
2895 vmbus_chan_poll_enable(chan, pollhz);
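/*
 * A pollhz of 0 switches the channel back to interrupt mode; any other
 * value enables VMBus channel polling at that frequency.  hn_polling()
 * below applies the setting to all sub-channels first and to the
 * primary channel last.
 */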
2899 hn_polling(struct hn_softc *sc, u_int pollhz)
2901 int nsubch = sc->hn_rx_ring_inuse - 1;
2906 struct vmbus_channel **subch;
2909 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
2910 for (i = 0; i < nsubch; ++i)
2911 hn_chan_polling(subch[i], pollhz);
2912 vmbus_subchan_rel(subch, nsubch);
2914 hn_chan_polling(sc->hn_prichan, pollhz);
2918 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
2920 struct hn_softc *sc = arg1;
2923 pollhz = sc->hn_pollhz;
2924 error = sysctl_handle_int(oidp, &pollhz, 0, req);
2925 if (error || req->newptr == NULL)
2929 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
2933 if (sc->hn_pollhz != pollhz) {
2934 sc->hn_pollhz = pollhz;
2935 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
2936 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
2937 hn_polling(sc, sc->hn_pollhz);
2945 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2947 struct hn_softc *sc = arg1;
2950 snprintf(verstr, sizeof(verstr), "%u.%u",
2951 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2952 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2953 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2957 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2959 struct hn_softc *sc = arg1;
2966 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
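/*
 * Note: the kernel-private "%b" conversion above expands `caps'
 * against the HN_CAP_BITS bit-description string, printing the raw
 * value followed by a <...> list of the names of the set bits.
 */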
2967 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2971 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2973 struct hn_softc *sc = arg1;
2974 char assist_str[128];
2978 hwassist = sc->hn_ifp->if_hwassist;
2980 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2981 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2985 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2987 struct hn_softc *sc = arg1;
2988 char filter_str[128];
2992 filter = sc->hn_rx_filter;
2994 snprintf(filter_str, sizeof(filter_str), "%b", filter,
2996 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
3000 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
3002 struct hn_softc *sc = arg1;
3007 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3008 if (error || req->newptr == NULL)
3011 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3014 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3016 if (sc->hn_rx_ring_inuse > 1) {
3017 error = hn_rss_reconfig(sc);
3019 /* Not RSS capable, at least for now; just save the RSS key. */
3028 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
3030 struct hn_softc *sc = arg1;
3035 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3036 if (error || req->newptr == NULL)
3040 * Don't allow RSS indirect table changes, if this interface is not
3041 * currently RSS capable.
3043 if (sc->hn_rx_ring_inuse == 1) {
3048 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3051 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3053 hn_rss_ind_fixup(sc);
3054 error = hn_rss_reconfig(sc);
3061 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
3063 struct hn_softc *sc = arg1;
3068 hash = sc->hn_rss_hash;
3070 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
3071 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
3075 hn_check_iplen(const struct mbuf *m, int hoff)
3077 const struct ip *ip;
3078 int len, iphlen, iplen;
3079 const struct tcphdr *th;
3080 int thoff; /* TCP data offset */
3082 len = hoff + sizeof(struct ip);
3084 /* The packet must be at least the size of an IP header. */
3085 if (m->m_pkthdr.len < len)
3086 return IPPROTO_DONE;
3088 /* The fixed IP header must reside completely in the first mbuf. */
3090 return IPPROTO_DONE;
3092 ip = mtodo(m, hoff);
3094 /* Bound check the packet's stated IP header length. */
3095 iphlen = ip->ip_hl << 2;
3096 if (iphlen < sizeof(struct ip)) /* minimum header length */
3097 return IPPROTO_DONE;
3099 /* The full IP header must reside completely in the one mbuf. */
3100 if (m->m_len < hoff + iphlen)
3101 return IPPROTO_DONE;
3103 iplen = ntohs(ip->ip_len);
3106 * Check that the amount of data in the buffers is at
3107 * least as much as the IP header would have us expect.
3109 if (m->m_pkthdr.len < hoff + iplen)
3110 return IPPROTO_DONE;
3113 * Ignore IP fragments.
3115 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3116 return IPPROTO_DONE;
3119 * The TCP/IP or UDP/IP header must be entirely contained within
3120 * the first fragment of a packet.
3124 if (iplen < iphlen + sizeof(struct tcphdr))
3125 return IPPROTO_DONE;
3126 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3127 return IPPROTO_DONE;
3128 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3129 thoff = th->th_off << 2;
3130 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3131 return IPPROTO_DONE;
3132 if (m->m_len < hoff + iphlen + thoff)
3133 return IPPROTO_DONE;
3136 if (iplen < iphlen + sizeof(struct udphdr))
3137 return IPPROTO_DONE;
3138 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3139 return IPPROTO_DONE;
3143 return IPPROTO_DONE;
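/*
 * To summarize hn_check_iplen(): it yields the packet's IP protocol
 * (e.g. IPPROTO_TCP or IPPROTO_UDP) only when the complete L3/L4
 * header has been validated within the first mbuf, and IPPROTO_DONE
 * for anything that should not be considered for host-checksum trust
 * or LRO.
 */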
3150 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3152 struct sysctl_oid_list *child;
3153 struct sysctl_ctx_list *ctx;
3154 device_t dev = sc->hn_dev;
3155 #if defined(INET) || defined(INET6)
3156 #if __FreeBSD_version >= 1100095
3163 * Create RXBUF for reception.
3166 * - It is shared by all channels.
3167 * - A large enough buffer is allocated; certain versions of the NVS
3168 * may further limit the usable space.
3170 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3171 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3172 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3173 if (sc->hn_rxbuf == NULL) {
3174 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3178 sc->hn_rx_ring_cnt = ring_cnt;
3179 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3181 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3182 M_DEVBUF, M_WAITOK | M_ZERO);
3184 #if defined(INET) || defined(INET6)
3185 #if __FreeBSD_version >= 1100095
3186 lroent_cnt = hn_lro_entry_count;
3187 if (lroent_cnt < TCP_LRO_ENTRIES)
3188 lroent_cnt = TCP_LRO_ENTRIES;
3190 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3192 #endif /* INET || INET6 */
3194 ctx = device_get_sysctl_ctx(dev);
3195 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3197 /* Create dev.hn.UNIT.rx sysctl tree */
3198 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3199 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3201 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3202 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3204 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3205 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3206 &rxr->hn_br_dma, BUS_DMA_WAITOK);
3207 if (rxr->hn_br == NULL) {
3208 device_printf(dev, "allocate bufring failed\n");
3212 if (hn_trust_hosttcp)
3213 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3214 if (hn_trust_hostudp)
3215 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3216 if (hn_trust_hostip)
3217 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3218 rxr->hn_ifp = sc->hn_ifp;
3219 if (i < sc->hn_tx_ring_cnt)
3220 rxr->hn_txr = &sc->hn_tx_ring[i];
3221 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3222 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3224 rxr->hn_rxbuf = sc->hn_rxbuf;
3229 #if defined(INET) || defined(INET6)
3230 #if __FreeBSD_version >= 1100095
3231 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3232 hn_lro_mbufq_depth);
3234 tcp_lro_init(&rxr->hn_lro);
3235 rxr->hn_lro.ifp = sc->hn_ifp;
3237 #if __FreeBSD_version >= 1100099
3238 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3239 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3241 #endif /* INET || INET6 */
3243 if (sc->hn_rx_sysctl_tree != NULL) {
3247 * Create per RX ring sysctl tree:
3248 * dev.hn.UNIT.rx.RINGID
3250 snprintf(name, sizeof(name), "%d", i);
3251 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3252 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3253 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3255 if (rxr->hn_rx_sysctl_tree != NULL) {
3256 SYSCTL_ADD_ULONG(ctx,
3257 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3258 OID_AUTO, "packets", CTLFLAG_RW,
3259 &rxr->hn_pkts, "# of packets received");
3260 SYSCTL_ADD_ULONG(ctx,
3261 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3262 OID_AUTO, "rss_pkts", CTLFLAG_RW,
3264 "# of packets w/ RSS info received");
3266 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3267 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3268 &rxr->hn_pktbuf_len, 0,
3269 "Temporary channel packet buffer length");
3274 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3275 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3276 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3277 #if __FreeBSD_version < 1100095
3278 hn_rx_stat_int_sysctl,
3280 hn_rx_stat_u64_sysctl,
3282 "LU", "LRO queued");
3283 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3284 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3285 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3286 #if __FreeBSD_version < 1100095
3287 hn_rx_stat_int_sysctl,
3289 hn_rx_stat_u64_sysctl,
3291 "LU", "LRO flushed");
3292 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3293 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3294 __offsetof(struct hn_rx_ring, hn_lro_tried),
3295 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3296 #if __FreeBSD_version >= 1100099
3297 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3298 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3299 hn_lro_lenlim_sysctl, "IU",
3300 "Max # of data bytes to be aggregated by LRO");
3301 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3302 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3303 hn_lro_ackcnt_sysctl, "I",
3304 "Max # of ACKs to be aggregated by LRO");
3306 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3307 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3308 hn_trust_hcsum_sysctl, "I",
3309 "Trust tcp segement verification on host side, "
3310 "when csum info is missing");
3311 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3312 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3313 hn_trust_hcsum_sysctl, "I",
3314 "Trust udp datagram verification on host side, "
3315 "when csum info is missing");
3316 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3317 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3318 hn_trust_hcsum_sysctl, "I",
3319 "Trust ip packet verification on host side, "
3320 "when csum info is missing");
3321 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3322 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3323 __offsetof(struct hn_rx_ring, hn_csum_ip),
3324 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3325 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3326 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3327 __offsetof(struct hn_rx_ring, hn_csum_tcp),
3328 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3329 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3330 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3331 __offsetof(struct hn_rx_ring, hn_csum_udp),
3332 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3333 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3334 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3335 __offsetof(struct hn_rx_ring, hn_csum_trusted),
3336 hn_rx_stat_ulong_sysctl, "LU",
3337 "# of packets that we trust host's csum verification");
3338 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3339 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3340 __offsetof(struct hn_rx_ring, hn_small_pkts),
3341 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3342 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3343 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3344 __offsetof(struct hn_rx_ring, hn_ack_failed),
3345 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3346 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3347 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3348 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3349 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3355 hn_destroy_rx_data(struct hn_softc *sc)
3359 if (sc->hn_rxbuf != NULL) {
3360 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3361 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3363 device_printf(sc->hn_dev, "RXBUF is referenced\n");
3364 sc->hn_rxbuf = NULL;
3367 if (sc->hn_rx_ring_cnt == 0)
3370 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3371 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3373 if (rxr->hn_br == NULL)
3375 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3376 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3378 device_printf(sc->hn_dev,
3379 "%dth channel bufring is referenced", i);
3383 #if defined(INET) || defined(INET6)
3384 tcp_lro_free(&rxr->hn_lro);
3386 free(rxr->hn_pktbuf, M_DEVBUF);
3388 free(sc->hn_rx_ring, M_DEVBUF);
3389 sc->hn_rx_ring = NULL;
3391 sc->hn_rx_ring_cnt = 0;
3392 sc->hn_rx_ring_inuse = 0;
3396 hn_tx_ring_create(struct hn_softc *sc, int id)
3398 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3399 device_t dev = sc->hn_dev;
3400 bus_dma_tag_t parent_dtag;
3404 txr->hn_tx_idx = id;
3406 #ifndef HN_USE_TXDESC_BUFRING
3407 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3409 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3411 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3412 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3413 M_DEVBUF, M_WAITOK | M_ZERO);
3414 #ifndef HN_USE_TXDESC_BUFRING
3415 SLIST_INIT(&txr->hn_txlist);
3417 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3418 M_WAITOK, &txr->hn_tx_lock);
3421 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3422 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3423 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3425 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3428 #ifdef HN_IFSTART_SUPPORT
3429 if (hn_use_if_start) {
3430 txr->hn_txeof = hn_start_txeof;
3431 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3432 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3438 txr->hn_txeof = hn_xmit_txeof;
3439 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3440 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3442 br_depth = hn_get_txswq_depth(txr);
3443 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3444 M_WAITOK, &txr->hn_tx_lock);
3447 txr->hn_direct_tx_size = hn_direct_tx_size;
3450 * Always schedule transmission instead of trying to do direct
3451 * transmission. This one gives the best performance so far.
3453 txr->hn_sched_tx = 1;
3455 parent_dtag = bus_get_dma_tag(dev);
3457 /* DMA tag for RNDIS packet messages. */
3458 error = bus_dma_tag_create(parent_dtag, /* parent */
3459 HN_RNDIS_PKT_ALIGN, /* alignment */
3460 HN_RNDIS_PKT_BOUNDARY, /* boundary */
3461 BUS_SPACE_MAXADDR, /* lowaddr */
3462 BUS_SPACE_MAXADDR, /* highaddr */
3463 NULL, NULL, /* filter, filterarg */
3464 HN_RNDIS_PKT_LEN, /* maxsize */
3466 HN_RNDIS_PKT_LEN, /* maxsegsize */
3468 NULL, /* lockfunc */
3469 NULL, /* lockfuncarg */
3470 &txr->hn_tx_rndis_dtag);
3472 device_printf(dev, "failed to create rndis dmatag\n");
3476 /* DMA tag for data. */
3477 error = bus_dma_tag_create(parent_dtag, /* parent */
3479 HN_TX_DATA_BOUNDARY, /* boundary */
3480 BUS_SPACE_MAXADDR, /* lowaddr */
3481 BUS_SPACE_MAXADDR, /* highaddr */
3482 NULL, NULL, /* filter, filterarg */
3483 HN_TX_DATA_MAXSIZE, /* maxsize */
3484 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
3485 HN_TX_DATA_SEGSIZE, /* maxsegsize */
3487 NULL, /* lockfunc */
3488 NULL, /* lockfuncarg */
3489 &txr->hn_tx_data_dtag);
3491 device_printf(dev, "failed to create data dmatag\n");
3495 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3496 struct hn_txdesc *txd = &txr->hn_txdesc[i];
3499 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3500 STAILQ_INIT(&txd->agg_list);
3503 * Allocate and load RNDIS packet message.
3505 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3506 (void **)&txd->rndis_pkt,
3507 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3508 &txd->rndis_pkt_dmap);
3511 "failed to allocate rndis_packet_msg, %d\n", i);
3515 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3516 txd->rndis_pkt_dmap,
3517 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3518 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3522 "failed to load rndis_packet_msg, %d\n", i);
3523 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3524 txd->rndis_pkt, txd->rndis_pkt_dmap);
3528 /* DMA map for TX data. */
3529 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3533 "failed to allocate tx data dmamap\n");
3534 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3535 txd->rndis_pkt_dmap);
3536 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3537 txd->rndis_pkt, txd->rndis_pkt_dmap);
3541 /* All set, put it to list */
3542 txd->flags |= HN_TXD_FLAG_ONLIST;
3543 #ifndef HN_USE_TXDESC_BUFRING
3544 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3546 buf_ring_enqueue(txr->hn_txdesc_br, txd);
3549 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3551 if (sc->hn_tx_sysctl_tree != NULL) {
3552 struct sysctl_oid_list *child;
3553 struct sysctl_ctx_list *ctx;
3557 * Create per TX ring sysctl tree:
3558 * dev.hn.UNIT.tx.RINGID
3560 ctx = device_get_sysctl_ctx(dev);
3561 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3563 snprintf(name, sizeof(name), "%d", id);
3564 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3565 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3567 if (txr->hn_tx_sysctl_tree != NULL) {
3568 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3571 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3572 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3573 "# of available TX descs");
3575 #ifdef HN_IFSTART_SUPPORT
3576 if (!hn_use_if_start)
3579 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3580 CTLFLAG_RD, &txr->hn_oactive, 0,
3583 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3584 CTLFLAG_RW, &txr->hn_pkts,
3585 "# of packets transmitted");
3586 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3587 CTLFLAG_RW, &txr->hn_sends, "# of sends");
3595 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3597 struct hn_tx_ring *txr = txd->txr;
3599 KASSERT(txd->m == NULL, ("still has mbuf installed"));
3600 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3602 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3603 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3604 txd->rndis_pkt_dmap);
3605 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3609 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3612 KASSERT(txd->refs == 0 || txd->refs == 1,
3613 ("invalid txd refs %d", txd->refs));
3615 /* Aggregated txds will be freed by their aggregating txd. */
3616 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3619 freed = hn_txdesc_put(txr, txd);
3620 KASSERT(freed, ("can't free txdesc"));
3625 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3629 if (txr->hn_txdesc == NULL)
3634 * Because the freeing of aggregated txds will be deferred
3635 * to the aggregating txd, two passes are used here:
3636 * - The first pass GCes any pending txds. This GC is necessary,
3637 * since if the channels are revoked, hypervisor will not
3638 * deliver send-done for all pending txds.
3639 * - The second pass frees the busdma stuffs, i.e. after all txds
3642 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3643 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3644 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3645 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3647 if (txr->hn_tx_data_dtag != NULL)
3648 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3649 if (txr->hn_tx_rndis_dtag != NULL)
3650 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3652 #ifdef HN_USE_TXDESC_BUFRING
3653 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3656 free(txr->hn_txdesc, M_DEVBUF);
3657 txr->hn_txdesc = NULL;
3659 if (txr->hn_mbuf_br != NULL)
3660 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3662 #ifndef HN_USE_TXDESC_BUFRING
3663 mtx_destroy(&txr->hn_txlist_spin);
3665 mtx_destroy(&txr->hn_tx_lock);
3669 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3671 struct sysctl_oid_list *child;
3672 struct sysctl_ctx_list *ctx;
3676 * Create TXBUF for chimney sending.
3678 * NOTE: It is shared by all channels.
3680 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3681 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3682 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3683 if (sc->hn_chim == NULL) {
3684 device_printf(sc->hn_dev, "allocate txbuf failed\n");
3688 sc->hn_tx_ring_cnt = ring_cnt;
3689 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3691 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3692 M_DEVBUF, M_WAITOK | M_ZERO);
3694 ctx = device_get_sysctl_ctx(sc->hn_dev);
3695 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3697 /* Create dev.hn.UNIT.tx sysctl tree */
3698 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3699 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3701 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3704 error = hn_tx_ring_create(sc, i);
3709 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3710 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3711 __offsetof(struct hn_tx_ring, hn_no_txdescs),
3712 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3713 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3714 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3715 __offsetof(struct hn_tx_ring, hn_send_failed),
3716 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
3717 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3718 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3719 __offsetof(struct hn_tx_ring, hn_txdma_failed),
3720 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
3721 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3722 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3723 __offsetof(struct hn_tx_ring, hn_flush_failed),
3724 hn_tx_stat_ulong_sysctl, "LU",
3725 "# of packet transmission aggregation flush failure");
3726 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3727 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3728 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3729 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3730 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3731 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3732 __offsetof(struct hn_tx_ring, hn_tx_chimney),
3733 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3734 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3735 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3736 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3737 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3738 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3739 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3740 "# of total TX descs");
3741 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3742 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3743 "Chimney send packet size upper boundary");
3744 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3745 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3746 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3747 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3748 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3749 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3750 hn_tx_conf_int_sysctl, "I",
3751 "Size of the packet for direct transmission");
3752 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3753 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3754 __offsetof(struct hn_tx_ring, hn_sched_tx),
3755 hn_tx_conf_int_sysctl, "I",
3756 "Always schedule transmission "
3757 "instead of doing direct transmission");
3758 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3759 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3760 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3761 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3762 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3763 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3764 "Applied packet transmission aggregation size");
3765 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3766 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3767 hn_txagg_pktmax_sysctl, "I",
3768 "Applied packet transmission aggregation packets");
3769 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3770 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3771 hn_txagg_align_sysctl, "I",
3772 "Applied packet transmission aggregation alignment");
3778 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3782 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3783 sc->hn_tx_ring[i].hn_chim_size = chim_size;
3787 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3789 struct ifnet *ifp = sc->hn_ifp;
3792 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3795 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3796 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3797 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3799 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3800 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3801 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3803 if (tso_maxlen < tso_minlen)
3804 tso_maxlen = tso_minlen;
3805 else if (tso_maxlen > IP_MAXPACKET)
3806 tso_maxlen = IP_MAXPACKET;
3807 if (tso_maxlen > sc->hn_ndis_tso_szmax)
3808 tso_maxlen = sc->hn_ndis_tso_szmax;
3809 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
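/*
 * Worked example: with the IP_MAXPACKET (65535) cap and no tighter
 * NDIS limit, if_hw_tsomax ends up as 65535 - (14 + 4) = 65517 bytes
 * of post-Ethernet payload per TSO burst, since the Ethernet header
 * and a possible VLAN tag are excluded above.
 */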
3811 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
3815 hn_fixup_tx_data(struct hn_softc *sc)
3817 uint64_t csum_assist;
3820 hn_set_chim_size(sc, sc->hn_chim_szmax);
3821 if (hn_tx_chimney_size > 0 &&
3822 hn_tx_chimney_size < sc->hn_chim_szmax)
3823 hn_set_chim_size(sc, hn_tx_chimney_size);
3826 if (sc->hn_caps & HN_CAP_IPCS)
3827 csum_assist |= CSUM_IP;
3828 if (sc->hn_caps & HN_CAP_TCP4CS)
3829 csum_assist |= CSUM_IP_TCP;
3830 if (sc->hn_caps & HN_CAP_UDP4CS)
3831 csum_assist |= CSUM_IP_UDP;
3832 if (sc->hn_caps & HN_CAP_TCP6CS)
3833 csum_assist |= CSUM_IP6_TCP;
3834 if (sc->hn_caps & HN_CAP_UDP6CS)
3835 csum_assist |= CSUM_IP6_UDP;
3836 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3837 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3839 if (sc->hn_caps & HN_CAP_HASHVAL) {
3841 * Support HASHVAL pktinfo on TX path.
3844 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3845 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3846 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3851 hn_destroy_tx_data(struct hn_softc *sc)
3855 if (sc->hn_chim != NULL) {
3856 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
3857 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3859 device_printf(sc->hn_dev,
3860 "chimney sending buffer is referenced");
3865 if (sc->hn_tx_ring_cnt == 0)
3868 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3869 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3871 free(sc->hn_tx_ring, M_DEVBUF);
3872 sc->hn_tx_ring = NULL;
3874 sc->hn_tx_ring_cnt = 0;
3875 sc->hn_tx_ring_inuse = 0;
3878 #ifdef HN_IFSTART_SUPPORT
3881 hn_start_taskfunc(void *xtxr, int pending __unused)
3883 struct hn_tx_ring *txr = xtxr;
3885 mtx_lock(&txr->hn_tx_lock);
3886 hn_start_locked(txr, 0);
3887 mtx_unlock(&txr->hn_tx_lock);
3891 hn_start_locked(struct hn_tx_ring *txr, int len)
3893 struct hn_softc *sc = txr->hn_sc;
3894 struct ifnet *ifp = sc->hn_ifp;
3897 KASSERT(hn_use_if_start,
3898 ("hn_start_locked is called, when if_start is disabled"));
3899 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3900 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3901 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3903 if (__predict_false(txr->hn_suspended))
3906 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
3910 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
3911 struct hn_txdesc *txd;
3912 struct mbuf *m_head;
3915 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
3919 if (len > 0 && m_head->m_pkthdr.len > len) {
3921 * This send could be time consuming; let callers
3922 * dispatch this packet (and any follow-up
3923 * packets) to the TX taskqueue.
3925 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3930 #if defined(INET6) || defined(INET)
3931 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3932 m_head = hn_tso_fixup(m_head);
3933 if (__predict_false(m_head == NULL)) {
3934 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3940 txd = hn_txdesc_get(txr);
3942 txr->hn_no_txdescs++;
3943 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3944 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3948 error = hn_encap(ifp, txr, txd, &m_head);
3950 /* Both txd and m_head are freed */
3951 KASSERT(txr->hn_agg_txd == NULL,
3952 ("encap failed w/ pending aggregating txdesc"));
3956 if (txr->hn_agg_pktleft == 0) {
3957 if (txr->hn_agg_txd != NULL) {
3958 KASSERT(m_head == NULL,
3959 ("pending mbuf for aggregating txdesc"));
3960 error = hn_flush_txagg(ifp, txr);
3961 if (__predict_false(error)) {
3962 atomic_set_int(&ifp->if_drv_flags,
3967 KASSERT(m_head != NULL, ("mbuf was freed"));
3968 error = hn_txpkt(ifp, txr, txd);
3969 if (__predict_false(error)) {
3970 /* txd is freed, but m_head is not */
3971 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3972 atomic_set_int(&ifp->if_drv_flags,
3980 KASSERT(txr->hn_agg_txd != NULL,
3981 ("no aggregating txdesc"));
3982 KASSERT(m_head == NULL,
3983 ("pending mbuf for aggregating txdesc"));
3988 /* Flush pending aggregated transmission. */
3989 if (txr->hn_agg_txd != NULL)
3990 hn_flush_txagg(ifp, txr);
3995 hn_start(struct ifnet *ifp)
3997 struct hn_softc *sc = ifp->if_softc;
3998 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
4000 if (txr->hn_sched_tx)
4003 if (mtx_trylock(&txr->hn_tx_lock)) {
4006 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4007 mtx_unlock(&txr->hn_tx_lock);
4012 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4016 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
4018 struct hn_tx_ring *txr = xtxr;
4020 mtx_lock(&txr->hn_tx_lock);
4021 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
4022 hn_start_locked(txr, 0);
4023 mtx_unlock(&txr->hn_tx_lock);
4027 hn_start_txeof(struct hn_tx_ring *txr)
4029 struct hn_softc *sc = txr->hn_sc;
4030 struct ifnet *ifp = sc->hn_ifp;
4032 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4034 if (txr->hn_sched_tx)
4037 if (mtx_trylock(&txr->hn_tx_lock)) {
4040 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4041 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4042 mtx_unlock(&txr->hn_tx_lock);
4044 taskqueue_enqueue(txr->hn_tx_taskq,
4050 * Release OACTIVE earlier, in the hope that others
4051 * could catch up. The task will clear the flag again,
4052 * with the hn_tx_lock held, to avoid possible races.
4055 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4056 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4060 #endif /* HN_IFSTART_SUPPORT */
4063 hn_xmit(struct hn_tx_ring *txr, int len)
4065 struct hn_softc *sc = txr->hn_sc;
4066 struct ifnet *ifp = sc->hn_ifp;
4067 struct mbuf *m_head;
4070 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4071 #ifdef HN_IFSTART_SUPPORT
4072 KASSERT(hn_use_if_start == 0,
4073 ("hn_xmit is called, when if_start is enabled"));
4075 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4077 if (__predict_false(txr->hn_suspended))
4080 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4083 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4084 struct hn_txdesc *txd;
4087 if (len > 0 && m_head->m_pkthdr.len > len) {
4089 * This send could be time consuming; let callers
4090 * dispatch this packet (and any follow-up
4091 * packets) to the TX taskqueue.
4093 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4098 txd = hn_txdesc_get(txr);
4100 txr->hn_no_txdescs++;
4101 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4102 txr->hn_oactive = 1;
4106 error = hn_encap(ifp, txr, txd, &m_head);
4108 /* Both txd and m_head are freed; discard */
4109 KASSERT(txr->hn_agg_txd == NULL,
4110 ("encap failed w/ pending aggregating txdesc"));
4111 drbr_advance(ifp, txr->hn_mbuf_br);
4115 if (txr->hn_agg_pktleft == 0) {
4116 if (txr->hn_agg_txd != NULL) {
4117 KASSERT(m_head == NULL,
4118 ("pending mbuf for aggregating txdesc"));
4119 error = hn_flush_txagg(ifp, txr);
4120 if (__predict_false(error)) {
4121 txr->hn_oactive = 1;
4125 KASSERT(m_head != NULL, ("mbuf was freed"));
4126 error = hn_txpkt(ifp, txr, txd);
4127 if (__predict_false(error)) {
4128 /* txd is freed, but m_head is not */
4129 drbr_putback(ifp, txr->hn_mbuf_br,
4131 txr->hn_oactive = 1;
4138 KASSERT(txr->hn_agg_txd != NULL,
4139 ("no aggregating txdesc"));
4140 KASSERT(m_head == NULL,
4141 ("pending mbuf for aggregating txdesc"));
4146 drbr_advance(ifp, txr->hn_mbuf_br);
4149 /* Flush pending aggregated transmission. */
4150 if (txr->hn_agg_txd != NULL)
4151 hn_flush_txagg(ifp, txr);
4156 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4158 struct hn_softc *sc = ifp->if_softc;
4159 struct hn_tx_ring *txr;
4162 #if defined(INET6) || defined(INET)
4164 * Perform TSO packet header fixup now, since the TSO
4165 * packet header should be cache-hot.
4167 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4168 m = hn_tso_fixup(m);
4169 if (__predict_false(m == NULL)) {
4170 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4177 * Select the TX ring based on flowid
4179 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
4180 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4181 txr = &sc->hn_tx_ring[idx];
4183 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4185 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4189 if (txr->hn_oactive)
4192 if (txr->hn_sched_tx)
4195 if (mtx_trylock(&txr->hn_tx_lock)) {
4198 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4199 mtx_unlock(&txr->hn_tx_lock);
4204 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4209 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4213 mtx_lock(&txr->hn_tx_lock);
4214 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4216 mtx_unlock(&txr->hn_tx_lock);
4220 hn_xmit_qflush(struct ifnet *ifp)
4222 struct hn_softc *sc = ifp->if_softc;
4225 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4226 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4231 hn_xmit_txeof(struct hn_tx_ring *txr)
4234 if (txr->hn_sched_tx)
4237 if (mtx_trylock(&txr->hn_tx_lock)) {
4240 txr->hn_oactive = 0;
4241 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4242 mtx_unlock(&txr->hn_tx_lock);
4244 taskqueue_enqueue(txr->hn_tx_taskq,
4250 * Release oactive earlier, in the hope that others
4251 * could catch up. The task will clear oactive again,
4252 * with the hn_tx_lock held, to avoid possible races.
4255 txr->hn_oactive = 0;
4256 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4261 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4263 struct hn_tx_ring *txr = xtxr;
4265 mtx_lock(&txr->hn_tx_lock);
4267 mtx_unlock(&txr->hn_tx_lock);
4271 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4273 struct hn_tx_ring *txr = xtxr;
4275 mtx_lock(&txr->hn_tx_lock);
4276 txr->hn_oactive = 0;
4278 mtx_unlock(&txr->hn_tx_lock);
4282 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4284 struct vmbus_chan_br cbr;
4285 struct hn_rx_ring *rxr;
4286 struct hn_tx_ring *txr = NULL;
4289 idx = vmbus_chan_subidx(chan);
4292 * Link this channel to RX/TX ring.
4294 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4295 ("invalid channel index %d, should > 0 && < %d",
4296 idx, sc->hn_rx_ring_inuse));
4297 rxr = &sc->hn_rx_ring[idx];
4298 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4299 ("RX ring %d already attached", idx));
4300 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4303 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4304 idx, vmbus_chan_id(chan));
4307 if (idx < sc->hn_tx_ring_inuse) {
4308 txr = &sc->hn_tx_ring[idx];
4309 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4310 ("TX ring %d already attached", idx));
4311 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4313 txr->hn_chan = chan;
4315 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4316 idx, vmbus_chan_id(chan));
4320 /* Bind this channel to a proper CPU. */
4321 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
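/*
 * hn_br is a single DMA block of HN_TXBR_SIZE + HN_RXBR_SIZE bytes
 * (allocated in hn_create_rx_data()); per the cbr_txsz/cbr_rxsz
 * split below, the TX bufring occupies the front of the block and
 * the RX bufring the remainder.
 */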
4326 cbr.cbr = rxr->hn_br;
4327 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4328 cbr.cbr_txsz = HN_TXBR_SIZE;
4329 cbr.cbr_rxsz = HN_RXBR_SIZE;
4330 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4332 if (error == EISCONN) {
4333 if_printf(sc->hn_ifp, "bufring is connected after "
4334 "chan%u open failure\n", vmbus_chan_id(chan));
4335 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4337 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4338 vmbus_chan_id(chan), error);
4345 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4347 struct hn_rx_ring *rxr;
4350 idx = vmbus_chan_subidx(chan);
4353 * Link this channel to RX/TX ring.
4355 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4356 ("invalid channel index %d, should > 0 && < %d",
4357 idx, sc->hn_rx_ring_inuse));
4358 rxr = &sc->hn_rx_ring[idx];
4359 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4360 ("RX ring %d is not attached", idx));
4361 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4363 if (idx < sc->hn_tx_ring_inuse) {
4364 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4366 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4367 ("TX ring %d is not attached attached", idx));
4368 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4372 * Close this channel.
4375 * Channel closing does _not_ destroy the target channel.
4377 error = vmbus_chan_close_direct(chan);
4378 if (error == EISCONN) {
4379 if_printf(sc->hn_ifp, "chan%u bufring is connected "
4380 "after being closed\n", vmbus_chan_id(chan));
4381 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4383 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4384 vmbus_chan_id(chan), error);
4389 hn_attach_subchans(struct hn_softc *sc)
4391 struct vmbus_channel **subchans;
4392 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4395 KASSERT(subchan_cnt > 0, ("no sub-channels"));
4397 /* Attach the sub-channels. */
4398 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4399 for (i = 0; i < subchan_cnt; ++i) {
4402 error1 = hn_chan_attach(sc, subchans[i]);
4405 /* Move on; all channels will be detached later. */
4408 vmbus_subchan_rel(subchans, subchan_cnt);
4411 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4414 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4422 hn_detach_allchans(struct hn_softc *sc)
4424 struct vmbus_channel **subchans;
4425 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4428 if (subchan_cnt == 0)
4431 /* Detach the sub-channels. */
4432 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4433 for (i = 0; i < subchan_cnt; ++i)
4434 hn_chan_detach(sc, subchans[i]);
4435 vmbus_subchan_rel(subchans, subchan_cnt);
4439 * Detach the primary channel, _after_ all sub-channels
4442 hn_chan_detach(sc, sc->hn_prichan);
4444 /* Wait for sub-channels to be destroyed, if any. */
4445 vmbus_subchan_drain(sc->hn_prichan);
4448 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4449 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4450 HN_RX_FLAG_ATTACHED) == 0,
4451 ("%dth RX ring is still attached", i));
4453 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4454 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4455 HN_TX_FLAG_ATTACHED) == 0,
4456 ("%dth TX ring is still attached", i));
4462 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4464 struct vmbus_channel **subchans;
4465 int nchan, rxr_cnt, error;
4467 nchan = *nsubch + 1;
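/*
 * NOTE: `nsubch' is an in/out parameter: on entry it holds the
 * number of sub-channels the caller wants; on return it holds the
 * number NVS actually granted (0 if the allocation failed).
 */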
4470 * Multiple RX/TX rings are not requested.
4477 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4480 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4482 /* No RSS; this is benign. */
4487 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4491 if (nchan > rxr_cnt)
4494 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4500 * Allocate sub-channels from NVS.
4502 *nsubch = nchan - 1;
4503 error = hn_nvs_alloc_subchans(sc, nsubch);
4504 if (error || *nsubch == 0) {
4505 /* Failed to allocate sub-channels. */
4511 * Wait for all sub-channels to become ready before moving on.
4513 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4514 vmbus_subchan_rel(subchans, *nsubch);
4519 hn_synth_attachable(const struct hn_softc *sc)
4523 if (sc->hn_flags & HN_FLAG_ERRORS)
4526 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4527 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4529 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
4536 hn_synth_attach(struct hn_softc *sc, int mtu)
4538 #define ATTACHED_NVS 0x0002
4539 #define ATTACHED_RNDIS 0x0004
4541 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4542 int error, nsubch, nchan, i;
4543 uint32_t old_caps, attached = 0;
4545 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4546 ("synthetic parts were attached"));
4548 if (!hn_synth_attachable(sc))
4551 /* Save capabilities for later verification. */
4552 old_caps = sc->hn_caps;
4555 /* Clear RSS stuffs. */
4556 sc->hn_rss_ind_size = 0;
4557 sc->hn_rss_hash = 0;
4560 * Attach the primary channel _before_ attaching NVS and RNDIS.
4562 error = hn_chan_attach(sc, sc->hn_prichan);
4569 error = hn_nvs_attach(sc, mtu);
4572 attached |= ATTACHED_NVS;
4575 * Attach RNDIS _after_ NVS is attached.
4577 error = hn_rndis_attach(sc, mtu);
4580 attached |= ATTACHED_RNDIS;
4583 * Make sure capabilities are not changed.
4585 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4586 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4587 old_caps, sc->hn_caps);
4593 * Allocate sub-channels for multi-TX/RX rings.
4596 * The # of RX rings that can be used is equivalent to the # of
4597 * channels to be requested.
4599 nsubch = sc->hn_rx_ring_cnt - 1;
4600 error = hn_synth_alloc_subchans(sc, &nsubch);
4603 /* NOTE: A _full_ detach of the synthetic parts is required from now on. */
4604 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4607 * Set the # of TX/RX rings that could be used according to
4608 * the # of channels that NVS offered.
4611 hn_set_ring_inuse(sc, nchan);
4613 /* Only the primary channel can be used; done */
4618 * Attach the sub-channels.
4620 * NOTE: hn_set_ring_inuse() _must_ have been called.
4622 error = hn_attach_subchans(sc);
4627 * Configure RSS key and indirect table _after_ all sub-channels
4630 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4632 * RSS key is not set yet; set it to the default RSS key.
4635 if_printf(sc->hn_ifp, "setup default RSS key\n");
4636 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4637 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4640 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4642 * RSS indirect table is not set yet; set it up in round-
4646 if_printf(sc->hn_ifp, "setup default RSS indirect "
4649 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
4650 rss->rss_ind[i] = i % nchan;
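/*
 * E.g. with 4 usable channels the indirect table reads
 * 0, 1, 2, 3, 0, 1, 2, 3, ... for all NDIS_HASH_INDCNT entries.
 */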
4651 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4654 * # of usable channels may be changed, so we have to
4655 * make sure that all entries in RSS indirect table
4658 * NOTE: hn_set_ring_inuse() _must_ have been called.
4660 hn_rss_ind_fixup(sc);
4663 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4668 * Fixup transmission aggregation setup.
4674 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
4675 hn_synth_detach(sc);
4677 if (attached & ATTACHED_RNDIS)
4678 hn_rndis_detach(sc);
4679 if (attached & ATTACHED_NVS)
4681 hn_chan_detach(sc, sc->hn_prichan);
4682 /* Restore old capabilities. */
4683 sc->hn_caps = old_caps;
4687 #undef ATTACHED_RNDIS
4693 * The interface must have been suspended through hn_suspend(), before
4694 * this function gets called.
4697 hn_synth_detach(struct hn_softc *sc)
4700 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4701 ("synthetic parts were not attached"));
4703 /* Detach the RNDIS first. */
4704 hn_rndis_detach(sc);
4709 /* Detach all of the channels. */
4710 hn_detach_allchans(sc);
4712 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
static void
hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
{

	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
	    ("invalid ring count %d", ring_cnt));

	if (sc->hn_tx_ring_cnt > ring_cnt)
		sc->hn_tx_ring_inuse = ring_cnt;
	else
		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
	sc->hn_rx_ring_inuse = ring_cnt;

	if (bootverbose) {
		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
	}
}
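
/*
 * Busy-wait until both the RX and TX bufrings of the channel are
 * empty, then drain any interrupts still pending on the channel.
 */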
static void
hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
{

	/*
	 * NOTE:
	 * The TX bufring will not be drained by the hypervisor,
	 * if the primary channel is revoked.
	 */
	while (!vmbus_chan_rx_empty(chan) ||
	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
	     !vmbus_chan_tx_empty(chan)))
		pause("waitch", 1);
	vmbus_chan_intr_drain(chan);
}
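
/*
 * Quiesce the data path: stop TX, clear the RX filter, drain the
 * channel bufrings, and finally drain any TX tasks still in flight.
 */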
static void
hn_suspend_data(struct hn_softc *sc)
{
	struct vmbus_channel **subch = NULL;
	struct hn_tx_ring *txr;
	int i, nsubch;

	HN_LOCK_ASSERT(sc);

	/*
	 * Disable TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 1;
		mtx_unlock(&txr->hn_tx_lock);
		/* No one is able to send more packets now. */

		/*
		 * Wait for all pending sends to finish.
		 *
		 * NOTE:
		 * We will _not_ receive all pending send-done, if the
		 * primary channel is revoked.
		 */
		while (hn_tx_ring_pending(txr) &&
		    !vmbus_chan_is_revoked(sc->hn_prichan))
			pause("hnwtx", 1 /* 1 tick */);
	}

	/*
	 * Disable RX by clearing RX filter.
	 */
	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);

	/*
	 * Give RNDIS enough time to flush all pending data packets.
	 */
	pause("waitrx", (200 * hz) / 1000);

	/*
	 * Drain RX/TX bufrings and interrupts.
	 */
	nsubch = sc->hn_rx_ring_inuse - 1;
	if (nsubch > 0)
		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);

	if (subch != NULL) {
		for (i = 0; i < nsubch; ++i)
			hn_chan_drain(sc, subch[i]);
	}
	hn_chan_drain(sc, sc->hn_prichan);

	if (subch != NULL)
		vmbus_subchan_rel(subch, nsubch);

	/*
	 * Drain any pending TX tasks.
	 *
	 * NOTE:
	 * The above hn_chan_drain() can dispatch TX tasks, so the TX
	 * tasks will have to be drained _after_ the above hn_chan_drain()
	 * calls.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		txr = &sc->hn_tx_ring[i];

		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}
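
/*
 * Executed on the primary channel's task path, so that later
 * management requests can no longer reach hn_mgmt_taskq0 through
 * hn_mgmt_taskq.
 */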
static void
hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
{

	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
}

static void
hn_suspend_mgmt(struct hn_softc *sc)
{
	struct task task;

	HN_LOCK_ASSERT(sc);

	/*
	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
	 * through hn_mgmt_taskq.
	 */
	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
	vmbus_chan_run_task(sc->hn_prichan, &task);

	/*
	 * Make sure that all pending management tasks are completed.
	 */
	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
	taskqueue_drain_all(sc->hn_mgmt_taskq0);
}

static void
hn_suspend(struct hn_softc *sc)
{

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
		hn_suspend_data(sc);
	hn_suspend_mgmt(sc);
}

static void
hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
{
	int i;

	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
	    ("invalid TX ring count %d", tx_ring_cnt));

	for (i = 0; i < tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 0;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static void
hn_resume_data(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/*
	 * Re-enable RX.
	 */
	hn_set_rxfilter(sc);

	/*
	 * Make sure to clear suspend status on "all" TX rings,
	 * since hn_tx_ring_inuse can be changed after
	 * hn_suspend_data().
	 */
	hn_resume_tx(sc, sc->hn_tx_ring_cnt);

#ifdef HN_IFSTART_SUPPORT
	if (!hn_use_if_start)
#endif
	{
		/*
		 * Flush unused drbrs, since hn_tx_ring_inuse may be
		 * changed.
		 */
		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
	}

	/*
	 * Kick start TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		/*
		 * Use the txeof task, so that any pending oactive can be
		 * cleared properly.
		 */
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}

static void
hn_resume_mgmt(struct hn_softc *sc)
{

	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;

	/*
	 * Kick off network change detection, if it was pending.
	 * If no network change was pending, start link status
	 * checks, which are lighter-weight than network change
	 * detection.
	 */
	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		hn_change_network(sc);
	else
		hn_update_link_status(sc);
}

static void
hn_resume(struct hn_softc *sc)
{

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
		hn_resume_data(sc);
	hn_resume_mgmt(sc);
}
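
/*
 * Handle an unsolicited RNDIS status message: media connect and
 * disconnect drive link-state updates, while a network-change
 * indication kicks off network change detection.
 */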
static void
hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
{
	const struct rndis_status_msg *msg;
	int ofs;

	if (dlen < sizeof(*msg)) {
		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
		return;
	}
	msg = data;

	switch (msg->rm_status) {
	case RNDIS_STATUS_MEDIA_CONNECT:
	case RNDIS_STATUS_MEDIA_DISCONNECT:
		hn_update_link_status(sc);
		break;

	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
		/* Not really useful; ignore. */
		break;

	case RNDIS_STATUS_NETWORK_CHANGE:
		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
		if (dlen < ofs + msg->rm_stbuflen ||
		    msg->rm_stbuflen < sizeof(uint32_t)) {
			if_printf(sc->hn_ifp, "network changed\n");
		} else {
			uint32_t change;

			memcpy(&change, ((const uint8_t *)msg) + ofs,
			    sizeof(change));
			if_printf(sc->hn_ifp, "network changed, change %u\n",
			    change);
		}
		hn_change_network(sc);
		break;

	default:
		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
		    msg->rm_status);
		break;
	}
}
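
/*
 * Walk the RNDIS per-packet-info chain of a data message, extracting
 * the VLAN, checksum and RSS hash information into 'info'.  Returns
 * EINVAL if any element is malformed.
 */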
static int
hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
{
	const struct rndis_pktinfo *pi = info_data;
	uint32_t mask = 0;

	while (info_dlen != 0) {
		const void *data;
		uint32_t dlen;

		if (__predict_false(info_dlen < sizeof(*pi)))
			return (EINVAL);
		if (__predict_false(info_dlen < pi->rm_size))
			return (EINVAL);
		info_dlen -= pi->rm_size;

		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
			return (EINVAL);
		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
			return (EINVAL);
		dlen = pi->rm_size - pi->rm_pktinfooffset;
		data = pi->rm_data;

		switch (pi->rm_type) {
		case NDIS_PKTINFO_TYPE_VLAN:
			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
				return (EINVAL);
			info->vlan_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_VLAN;
			break;

		case NDIS_PKTINFO_TYPE_CSUM:
			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
				return (EINVAL);
			info->csum_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_CSUM;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
				return (EINVAL);
			info->hash_value = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHVAL;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHINF:
			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
				return (EINVAL);
			info->hash_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHINF;
			break;

		default:
			goto next;
		}

		if (mask == HN_RXINFO_ALL) {
			/* All found; done */
			break;
		}
next:
		pi = (const struct rndis_pktinfo *)
		    ((const uint8_t *)pi + pi->rm_size);
	}

	/*
	 * Final fixup.
	 * - If there is no hash value, invalidate the hash info.
	 */
	if ((mask & HN_RXINFO_HASHVAL) == 0)
		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
	return (0);
}
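
/*
 * Return true if the byte ranges [off, off + len) and
 * [check_off, check_off + check_len) overlap.
 */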
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off < check_off) {
		if (__predict_true(off + len <= check_off))
			return (false);
	} else if (off > check_off) {
		if (__predict_true(check_off + check_len <= off))
			return (false);
	}
	return (true);
}
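
/*
 * Validate an RNDIS data message: check the message length, the
 * data/OOB/pktinfo offsets, and that the three regions do not
 * overlap, before handing the payload to hn_rxpkt().
 */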
static void
hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_packet_msg *pkt;
	struct hn_rxinfo info;
	int data_off, pktinfo_off, data_len, pktinfo_len;

	/*
	 * Check length.
	 */
	if (__predict_false(dlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
		return;
	}
	pkt = data;

	if (__predict_false(dlen < pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
		return;
	}
	if (__predict_false(pkt->rm_len <
	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
		    "msglen %u, data %u, oob %u, pktinfo %u\n",
		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
		    pkt->rm_pktinfolen);
		return;
	}
	if (__predict_false(pkt->rm_datalen == 0)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
		return;
	}

	/*
	 * Check offsets.
	 */
#define IS_OFFSET_INVALID(ofs)			\
	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

	/* XXX Hyper-V does not meet data offset alignment requirement */
	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data offset %u\n", pkt->rm_dataoffset);
		return;
	}
	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "oob offset %u\n", pkt->rm_oobdataoffset);
		return;
	}
	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
		return;
	}

#undef IS_OFFSET_INVALID

	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
	data_len = pkt->rm_datalen;
	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
	pktinfo_len = pkt->rm_pktinfolen;

	/*
	 * Check OOB coverage.
	 */
	if (__predict_false(pkt->rm_oobdatalen != 0)) {
		int oob_off, oob_len;

		if_printf(rxr->hn_ifp, "got oobdata\n");
		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
		oob_len = pkt->rm_oobdatalen;

		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overflow, msglen %u, oob abs %d len %d\n",
			    pkt->rm_len, oob_off, oob_len);
			return;
		}

		/*
		 * Check against data.
		 */
		if (hn_rndis_check_overlap(oob_off, oob_len,
		    data_off, data_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps data, oob abs %d len %d, "
			    "data abs %d len %d\n",
			    oob_off, oob_len, data_off, data_len);
			return;
		}

		/*
		 * Check against pktinfo.
		 */
		if (pktinfo_len != 0 &&
		    hn_rndis_check_overlap(oob_off, oob_len,
		    pktinfo_off, pktinfo_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps pktinfo, oob abs %d len %d, "
			    "pktinfo abs %d len %d\n",
			    oob_off, oob_len, pktinfo_off, pktinfo_len);
			return;
		}
	}

	/*
	 * Check per-packet-info coverage and find useful per-packet-info.
	 */
	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
	if (__predict_true(pktinfo_len != 0)) {
		bool overlap;
		int error;

		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overflow, msglen %u, "
			    "pktinfo abs %d len %d\n",
			    pkt->rm_len, pktinfo_off, pktinfo_len);
			return;
		}

		/*
		 * Check packet info coverage.
		 */
		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
		    data_off, data_len);
		if (__predict_false(overlap)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overlaps data, pktinfo abs %d len %d, "
			    "data abs %d len %d\n",
			    pktinfo_off, pktinfo_len, data_off, data_len);
			return;
		}

		/*
		 * Find useful per-packet-info.
		 */
		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
		    pktinfo_len, &info);
		if (__predict_false(error)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
			    "pktinfo\n");
			return;
		}
	}

	if (__predict_false(data_off + data_len > pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data overflow, msglen %u, data abs %d len %d\n",
		    pkt->rm_len, data_off, data_len);
		return;
	}
	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
}
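
/*
 * Dispatch an incoming RNDIS message: data messages take the hot RX
 * path, status indications and control completions are handed to
 * their respective handlers.
 */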
static __inline void
hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_msghdr *hdr;

	if (__predict_false(dlen < sizeof(*hdr))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
		return;
	}
	hdr = data;

	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
		/* Hot data path. */
		hn_rndis_rx_data(rxr, data, dlen);
		/* Done! */
		return;
	}

	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
	else
		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
}
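
/*
 * Handle an in-band NVS notification.  The TX table note is the
 * only expected type and carries nothing we use, so it is ignored.
 */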
static void
hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
{
	const struct hn_nvs_hdr *hdr;

	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
		if_printf(sc->hn_ifp, "invalid nvs notify\n");
		return;
	}
	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);

	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
		/* Useless; ignore */
		return;
	}
	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
}
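
/*
 * Handle a send-completion packet by invoking the callback of the
 * send context stashed in the transaction id; the callback may free
 * the context.
 */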
static void
hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt)
{
	struct hn_nvs_sendctx *sndc;

	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
	    VMBUS_CHANPKT_DATALEN(pkt));
	/*
	 * NOTE:
	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
	 * its callback.
	 */
}
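
/*
 * Handle an RXBUF channel packet: validate the NVS/RNDIS headers
 * and the receive ranges, feed each range to the RNDIS dispatcher,
 * then ack the RXBUF back to the hypervisor.
 */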
static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

	/* Make sure that this is an RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}

	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}

	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflows rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}
		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
	}

	/*
	 * Ack the consumed RXBUF associated w/ this channel packet,
	 * so that this RXBUF can be recycled by the hypervisor.
	 */
	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}
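
/*
 * Send the NVS RNDIS ack for a consumed RXBUF back on the channel.
 * A full TX bufring (EAGAIN) is retried a bounded number of times;
 * if the ack still cannot be sent, the RXBUF is leaked.
 */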
static void
hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    uint64_t tid)
{
	struct hn_nvs_rndis_ack ack;
	int retries, error;

	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
	ack.nvs_status = HN_NVS_STATUS_OK;

	retries = 0;
again:
	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
	if (__predict_false(error == EAGAIN)) {
		/*
		 * NOTE:
		 * This should _not_ happen in real world, since the
		 * consumption of the TX bufring from the TX path is
		 * controlled.
		 */
		if (rxr->hn_ack_failed == 0)
			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
		rxr->hn_ack_failed++;
		retries++;
		if (retries < 10) {
			DELAY(100);
			goto again;
		}
		/* RXBUF leaks! */
		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
	}
}
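
/*
 * Per-channel callback: pull channel packets off the bufring until
 * it is empty, expanding the packet buffer on ENOBUFS, and dispatch
 * each packet by type.
 */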
static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
	struct hn_rx_ring *rxr = xrxr;
	struct hn_softc *sc = rxr->hn_ifp->if_softc;

	for (;;) {
		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
		int error, pktlen;

		pktlen = rxr->hn_pktbuf_len;
		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
		if (__predict_false(error == ENOBUFS)) {
			void *nbuf;
			int nlen;

			/*
			 * Expand channel packet buffer.
			 *
			 * XXX
			 * Use M_WAITOK here, since allocation failure
			 * is fatal.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);

			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;

		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}
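
/*
 * Sanitize the TX taskqueue tunables at boot and, in the global
 * mode, create the shared pool of TX taskqueues.
 */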
static void
hn_tx_taskq_create(void *arg __unused)
{
	int i;

	/*
	 * Fix the # of TX taskqueues.
	 */
	if (hn_tx_taskq_cnt <= 0)
		hn_tx_taskq_cnt = 1;
	else if (hn_tx_taskq_cnt > mp_ncpus)
		hn_tx_taskq_cnt = mp_ncpus;

	/*
	 * Fix the TX taskqueue mode.
	 */
	switch (hn_tx_taskq_mode) {
	case HN_TX_TASKQ_M_INDEP:
	case HN_TX_TASKQ_M_GLOBAL:
	case HN_TX_TASKQ_M_EVTTQ:
		break;
	default:
		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
		break;
	}

	if (vm_guest != VM_GUEST_HV)
		return;

	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
		return;

	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
	    M_DEVBUF, M_WAITOK);
	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
		    "hn tx%d", i);
	}
}
SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_create, NULL);

static void
hn_tx_taskq_destroy(void *arg __unused)
{

	if (hn_tx_taskque != NULL) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(hn_tx_taskque[i]);
		free(hn_tx_taskque, M_DEVBUF);
	}
}
SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_destroy, NULL);